Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-24 07:27:32 +08:00

Compare commits: remove_pyi...no_distrib (58 commits)
| SHA1 |
|---|
| e105c47187 |
| bb9d89d32b |
| 259bb6f535 |
| 9c85b9f05b |
| ff61b90cde |
| d1a640b146 |
| 074af6f329 |
| cb0624f175 |
| da969718ad |
| c94cca7b59 |
| fbdb9ba538 |
| 607f616b3e |
| eca1bf31b7 |
| 606c46aee0 |
| d093a0afd4 |
| 5dc4e78047 |
| c9e57d7e9f |
| 70337a066f |
| 7d1bcd9aea |
| 09cbf34e93 |
| 456fbeaa6d |
| a8c80f3fa9 |
| bf6b40da3e |
| 814ba34fa6 |
| 06bb32d55e |
| b3ad8f4a9c |
| 755cf90672 |
| 76e5df3866 |
| 7fe1f5ea49 |
| e156a07171 |
| ba5ca31676 |
| 8e1db46493 |
| aff2438554 |
| 3f8a2e62ea |
| 6d64bc3990 |
| 972140b7e9 |
| 84186c39ed |
| 74a35c6344 |
| d2f6daf6a7 |
| e74b21d66a |
| f01bf0f64b |
| 886699bc5c |
| 72b5159782 |
| f37eaebed1 |
| 5b9114bf19 |
| deb7ebe0a3 |
| 9c93dc8123 |
| 31040b6357 |
| aa41d3e49c |
| f0fcf436c5 |
| 5663910472 |
| da669d51bf |
| 783985e9fe |
| 49d30f9a23 |
| 66133b1ab7 |
| 543d50db2b |
| 7dd5f7b125 |
| a956c4ab1c |
@@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
.github/actionlint.yaml (1 changed line, vendored)
@@ -21,6 +21,7 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
.github/ci_commit_pins/audio.txt (2 changed lines, vendored)
@@ -1 +1 @@
caba63f0fa29ef9e3d566699f32f11c07c8bda4e
87ff22e49ed0e92576c4935ccb8c143daac4a3cd

.github/ci_commit_pins/vllm.txt (2 changed lines, vendored)
@@ -1 +1 @@
f510715882304796a96e33028b4f6de1b026c2c7
973c9d01da863cac9c51e8a5c0d390fc84b84fbc

.github/ci_commit_pins/xla.txt (2 changed lines, vendored)
@@ -1 +1 @@
6c5478ff7c3d50dd1e3047d72ec5909bea474073
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
.github/ci_configs/vllm/Dockerfile.tmp_vllm (29 changed lines, vendored)
@@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \

@@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \

@@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version

@@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
.github/scripts/prepare_vllm_wheels.sh (3 changed lines, vendored)
@@ -84,6 +84,9 @@ repackage_wheel() {
rm -rf $package
}

# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1

pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
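The hunk only shows the tail of repackage_wheel(), but the wheel==0.45.1 install suggests the helper leans on the wheel CLI. A minimal sketch of that unpack/edit/pack cycle (the wheel filename and the metadata edit here are hypothetical, not taken from the script):

```python
import subprocess

# Hypothetical wheel name; the real script loops over the wheels it just built.
whl = "xformers-0.0.32-cp312-cp312-linux_x86_64.whl"

# Unpack to "<name>-<version>/", optionally rewrite METADATA (for example the
# torch version pin), then pack it back into a .whl with a regenerated RECORD.
subprocess.check_call(["python", "-m", "wheel", "unpack", whl])
# ... edit xformers-0.0.32/xformers-0.0.32.dist-info/METADATA here ...
subprocess.check_call(["python", "-m", "wheel", "pack", "xformers-0.0.32"])
```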
.github/workflows/build-vllm-wheel.yml (41 changed lines, vendored)
@@ -12,6 +12,9 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

@@ -24,21 +27,33 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- device: cu128
- platform: manylinux_2_28_x86_64
device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
- device: cu129
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
name: "Build ${{ matrix.device }} vLLM wheel"
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: 'manylinux_2_28_x86_64'
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)

@@ -136,7 +151,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl

@@ -146,15 +161,17 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel"
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write

@@ -190,15 +207,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"

- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

- name: Set UPLOAD_CHANNEL (only for tagged pushes)
- name: Set UPLOAD_CHANNEL
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
.github/workflows/inductor-periodic.yml (4 changed lines, vendored)
@@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },

@@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
.github/workflows/pull.yml (2 changed lines, vendored)
@@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.github/workflows/slow.yml (2 changed lines, vendored)
@@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.gitignore (2 changed lines, vendored)
@@ -389,3 +389,5 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py
@@ -874,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"USE_CUDA OR USE_ROCM"
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)

cmake_dependent_option(

@@ -909,7 +909,7 @@ cmake_dependent_option(
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
include(cmake/External/aotriton.cmake)
endif()
endif()
@@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (const auto d : c10::irange(index.dim())) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
if (d == dim) {
continue;
}
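The replacement loop walks dimensions from last to first. A small Python analogue (illustrative only, not the kernel) shows why the order matters when squeezing every size-1 dimension except `dim`: going backwards keeps the positions of the dimensions still to be visited stable.

```python
import torch

index = torch.zeros(1, 5, 1, dtype=torch.long)
dim = 1
# Squeeze from the last dimension backwards so that removing a trailing
# size-1 dimension does not shift the indices of earlier dimensions.
for d in range(index.dim() - 1, -1, -1):
    if d == dim:
        continue
    index = index.squeeze(d)
print(index.shape)  # torch.Size([5])
```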
@@ -317,6 +317,17 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();
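From the Python side, the early return above covers calls like the sketch below (it assumes a CUDA build; for a zero-element input the result is a (size, ndim) tensor made entirely of fill_value):

```python
import torch

# Zero-element input: previously the kernel's grid math could hit a zero
# divisor; with the guard the call just fills the output with fill_value.
x = torch.empty(0, 3, device="cuda")
out = torch.nonzero_static(x, size=4, fill_value=-1)
print(out.shape)          # torch.Size([4, 2])
print((out == -1).all())  # True
```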
@@ -1,48 +0,0 @@
#pragma once

#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))

typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};

@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end

@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;

@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end

// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))

// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif
@@ -1,196 +0,0 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>

@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;

@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;

@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;

@property(readwrite, nonatomic) NSUInteger groups;

@end

@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;

#endif

@interface MPSGraph (VenturaOps)

#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))

typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};

// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif

- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

@end
@@ -9,8 +9,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>

@@ -8,8 +8,6 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>

#ifndef AT_PER_OPERATOR_HEADERS
@@ -1,23 +1,12 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>

#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}

@end

#endif
#include <fmt/format.h>

namespace at::native {

@@ -50,11 +39,9 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;

// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;

// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;

descriptor_.groups = groups; // not yet tested in Xcode/C++
}

@@ -186,18 +173,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();

std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}

std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);

@@ -205,20 +180,16 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}

std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;

} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);

MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);

@@ -400,33 +371,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();

std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}

MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});

} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);

@@ -551,19 +504,13 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();

MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =
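The hand-built string keys above collapse into single fmt::format calls keyed on whole stride/dilation/padding arrays. A rough Python analogue of the new key layout (purely illustrative; the helper name and values are placeholders, not PyTorch code):

```python
def conv_cache_key(is_3d, stride, dilation, padding, groups,
                   is_channels_last, tensors_key, bias_defined, bias_key):
    # Mirrors fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}", ...)
    return (f"mps_{'3d_' if is_3d else ''}convolution:"
            f"{stride}:{dilation}:{padding}:{groups}:"
            f"{is_channels_last}:{tensors_key}:{bias_defined}:{bias_key}")

print(conv_cache_key(False, [1, 1], [1, 1], [0, 0], 1,
                     False, "f32[1,3,8,8]", True, "16"))
```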
@@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

@@ -5,8 +5,6 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -1,6 +1,4 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -12,20 +10,6 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}

- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif

namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

@@ -2,7 +2,6 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

@@ -17,7 +17,6 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

@@ -6,9 +6,7 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -4,7 +4,6 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

@@ -4,7 +4,6 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -5,7 +5,6 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -2,8 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -1,7 +1,6 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

@@ -4,8 +4,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
@@ -2517,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out

@@ -6492,7 +6492,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA: unsqueeze_sparse
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core

@@ -10259,7 +10259,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA: any_sparse
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: core

- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
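The three dispatch additions register the existing sparse COO kernels for the MPS backend. A hedged sketch of what they enable from Python (it assumes a Mac build where MPS is available; behaviour on other builds is unchanged):

```python
import torch

if torch.backends.mps.is_available():
    indices = torch.tensor([[0, 1], [1, 0]])
    values = torch.tensor([1.0, 2.0])
    s = torch.sparse_coo_tensor(indices, values, (2, 2), device="mps")
    # empty_like and unsqueeze now dispatch to the sparse COO kernels on MPS.
    print(torch.empty_like(s).layout)  # torch.sparse_coo
    print(s.unsqueeze(0).shape)        # torch.Size([1, 2, 2])
```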
@@ -95,72 +95,6 @@
#endif
#endif

#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
namespace pytorch_flash
{
std::tuple<
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor>
mha_fwd(
const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
std::optional<at::Tensor>&
out_, // batch_size x seqlen_q x num_heads x head_size
std::optional<at::Tensor>&
alibi_slopes_, // num_heads or batch_size x num_heads
const float p_dropout,
const float softmax_scale,
bool is_causal,
std::optional<int64_t> window_size_left,
std::optional<int64_t> window_size_right,
const float softcap,
const bool return_softmax,
std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
if (at::globalContext().getROCmFAPreferredBackend() ==
at::ROCmFABackend::Ck) {
const int non_null_window_left = window_size_left.value_or(-1);
const int non_null_window_right = window_size_right.value_or(-1);
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
return mha_fwd_ck(
q,
k,
v,
out_,
p_dropout,
softmax_scale,
is_causal,
non_null_window_left,
non_null_window_right,
return_softmax,
gen_,
dummy_attn_bias); // Not used in flash attention
}
#endif
return mha_fwd_aot(
q,
k,
v,
out_,
alibi_slopes_,
p_dropout,
softmax_scale,
is_causal,
window_size_left,
window_size_right,
return_softmax,
gen_);
}
}
#endif

namespace at {

namespace cuda::philox {
@@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
#endif

TORCH_API
std::tuple<
inline std::tuple<
at::Tensor,
at::Tensor,
at::Tensor,

@@ -294,7 +294,42 @@ mha_fwd(
std::optional<int64_t> window_size_right,
const float softcap,
const bool return_softmax,
std::optional<at::Generator> gen_);
std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
if (at::globalContext().getROCmFAPreferredBackend() ==
at::ROCmFABackend::Ck) {
const int non_null_window_left = window_size_left.value_or(-1);
const int non_null_window_right = window_size_right.value_or(-1);
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
return mha_fwd_ck(
q,
k,
v,
out_,
p_dropout,
softmax_scale,
is_causal,
non_null_window_left,
non_null_window_right,
return_softmax,
gen_,
dummy_attn_bias); // Not used in flash attention
}
#endif
return mha_fwd_aot(
q,
k,
v,
out_,
alibi_slopes_,
p_dropout,
softmax_scale,
is_causal,
window_size_left,
window_size_right,
return_softmax,
gen_);
}

inline std::tuple<
at::Tensor,
@@ -72,6 +72,12 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

@@ -55,6 +55,12 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)
@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -167,3 +167,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0
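Every expected-results hunk above uses the same three-column CSV row that the checkers consume: model name, expected status, and (for the graph-break check) the expected dynamo graph-break count. A small parsing sketch (the column labels are mine, not the harness's):

```python
import csv
import io

sample = """meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
openai/whisper-tiny,pass,6
"""
for name, status, graph_breaks in csv.reader(io.StringIO(sample)):
    print(f"{name}: expected {status} with {graph_breaks} graph breaks")
```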
@@ -733,7 +733,7 @@ def timed(
time_total = 0
# Dont collect outputs to correctly measure timing
for _ in range(times):
for i in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:

@@ -1106,7 +1106,13 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model.forward = torch._dynamo.run(model)
frozen_model_iter_fn = model_iter_fn
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)

for rep in trange(args.repeat, desc="running benchmark"):
inputs = (

@@ -1120,7 +1126,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)

# interleave the runs to handle frequency scaling and load changes
with maybe_mark_profile(p=p, mark="expected"):
with (
maybe_mark_profile(p=p, mark="expected"),
torch.compiler.set_stance("force_eager"),
):
timings[rep, 0], expected_output = timed(
model,
model_iter_fn,

@@ -2233,11 +2242,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"

@@ -2254,11 +2264,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"

@@ -2542,7 +2553,11 @@ class BenchmarkRunner:
)

baseline_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)

if self.args.export_aot_inductor:

@@ -2610,7 +2625,11 @@ class BenchmarkRunner:
)

backend_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(

@@ -2629,9 +2648,17 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
# If we're benchmarking an llm, we want to use the generate function
self.model_iter_fn = self.generate
niters = 1

if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(*self.maybe_cast(model, example_inputs))
return experiment(
self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)

def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()

@@ -2696,17 +2723,22 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
self.args.snapshot_memory, f"eager_{self.args.only}"
):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
with torch.compiler.set_stance("force_eager"):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
niters=niters,
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
)

if (
self.args.export_aot_inductor

@@ -2715,7 +2747,13 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
if getattr(self, "hf_llm", False):
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model = optimize_ctx(model)
optimized_model_iter_fn = self.model_iter_fn
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)

with maybe_snapshot_memory(
self.args.snapshot_memory, f"compiled_{self.args.only}"

@@ -2793,7 +2831,13 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)

results.append(experiment(model, example_inputs, **experiment_kwargs))
experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)

results.append(
experiment(
self.model_iter_fn, model, example_inputs, **experiment_kwargs
)
)
return " ".join(map(str, results))

def minify_model(

@@ -4084,7 +4128,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False

experiment = functools.partial(experiment, args, runner.model_iter_fn)
experiment = functools.partial(experiment, args)

if args.only and should_diff_branch(args):
import git
@ -7,6 +7,7 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import types
|
||||
import warnings
|
||||
|
||||
|
||||
@ -128,6 +129,12 @@ with open(MODELS_FILENAME) as fh:
|
||||
assert len(BATCH_SIZE_KNOWN_MODELS)
|
||||
|
||||
|
||||
try:
|
||||
from .huggingface_llm_models import HF_LLM_MODELS
|
||||
except ImportError:
|
||||
from huggingface_llm_models import HF_LLM_MODELS
|
||||
|
||||
|
||||
def get_module_cls_by_model_name(model_cls_name):
|
||||
_module_by_model_name = {
|
||||
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
|
||||
@ -418,11 +425,8 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
use_eval_mode = self.args.use_eval_mode
|
||||
dtype = torch.float32
|
||||
reset_rng_state()
|
||||
model_cls, config = self._get_model_cls_and_config(model_name)
|
||||
model = self._download_model(model_name)
|
||||
model = model.to(device, dtype=dtype)
|
||||
if self.args.enable_activation_checkpointing:
|
||||
model.gradient_checkpointing_enable()
|
||||
|
||||
# Get batch size
|
||||
if model_name in BATCH_SIZE_KNOWN_MODELS:
|
||||
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
|
||||
elif batch_size is None:
|
||||
@ -440,14 +444,46 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}" # noqa: G004
|
||||
)
|
||||
|
||||
example_inputs = generate_inputs_for_model(
|
||||
model_cls, model, model_name, batch_size, device, include_loss_args=True
|
||||
)
|
||||
# Get model and example inputs
|
||||
if model_name in HF_LLM_MODELS:
|
||||
benchmark_cls = HF_LLM_MODELS[model_name]
|
||||
model, example_inputs = benchmark_cls.get_model_and_inputs(
|
||||
model_name, device
|
||||
)
|
||||
|
||||
# So we can check for correct gradients without eliminating the dropout computation
|
||||
for attr in dir(config):
|
||||
if "drop" in attr and isinstance(getattr(config, attr), float):
|
||||
setattr(config, attr, 1e-30)
|
||||
# Set this flag so that when we test for speedup, we use
|
||||
# model.generate instead of using model.forward
|
||||
self.hf_llm = True
|
||||
|
||||
def generate(self, _, example_inputs, collect_outputs=True):
|
||||
return model.generate(**example_inputs)
|
||||
|
||||
self.generate = types.MethodType(generate, self)
|
||||
|
||||
else:
|
||||
self.hf_llm = False
|
||||
|
||||
model_cls, config = self._get_model_cls_and_config(model_name)
|
||||
model = self._download_model(model_name)
|
||||
model = model.to(device, dtype=dtype)
|
||||
|
||||
example_inputs = generate_inputs_for_model(
|
||||
model_cls, model, model_name, batch_size, device, include_loss_args=True
|
||||
)
|
||||
|
||||
# So we can check for correct gradients without eliminating the dropout computation
|
||||
for attr in dir(config):
|
||||
if "drop" in attr and isinstance(getattr(config, attr), float):
|
||||
setattr(config, attr, 1e-30)
|
||||
|
||||
# Turning off kv cache for torchbench models. This is not the right
|
||||
# thing to do, but the pt2 dashboard is outdated. Real transformers
|
||||
# benchmarks will be added soon using a different infra.
|
||||
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
|
||||
model.config.use_cache = False
|
||||
|
||||
if self.args.enable_activation_checkpointing:
|
||||
model.gradient_checkpointing_enable()
|
||||
|
||||
if (
|
||||
is_training
|
||||
@ -460,12 +496,6 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
else:
|
||||
model.eval()
|
||||
|
||||
# Turning off kv cache for torchbench models. This is not the right
|
||||
# thing to do, but the pt2 dashboard is outdated. Real transformers
|
||||
# benchmarks will be added soon using a different infra.
|
||||
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
|
||||
model.config.use_cache = False
|
||||
|
||||
self.validate_model(model, example_inputs)
|
||||
return device, model_name, model, example_inputs, batch_size
|
||||
|
||||
@ -530,7 +560,8 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
|
||||
def forward_pass(self, mod, inputs, collect_outputs=True):
|
||||
with self.autocast(**self.autocast_arg):
|
||||
return mod(**inputs)
|
||||
res = mod(**inputs)
|
||||
return res.logits if self.hf_llm else res
|
||||
|
||||
def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
|
||||
cloned_inputs = clone_inputs(inputs)
|
||||
|
||||
@ -9,9 +9,16 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it

device:
cpu: []
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B

control_flow:
- AllenaiLongformerBase
@ -67,6 +74,11 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8


tolerance:

benchmarks/dynamo/huggingface_llm_models.py (new file, 102 lines)
@ -0,0 +1,102 @@
import subprocess
import sys

import torch


def pip_install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
except ModuleNotFoundError:
    print("Installing HuggingFace Transformers...")
    pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )


class Benchmark:
    @staticmethod
    def get_model_and_inputs(model_name, device):
        raise NotImplementedError("get_model_and_inputs() not implemented")


class WhisperBenchmark(Benchmark):
    SAMPLE_RATE = 16000
    DURATION = 30.0  # seconds

    @staticmethod
    def get_model_and_inputs(model_name, device):
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        model.config.forced_decoder_ids = None

        model.generation_config.do_sample = False
        model.generation_config.temperature = 0.0

        num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
        audio = torch.randn(num_samples) * 0.1
        inputs = dict(
            processor(
                audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
            )
        )
        inputs["input_features"] = inputs["input_features"].to(device)

        decoder_start_token = model.config.decoder_start_token_id
        inputs["decoder_input_ids"] = torch.tensor(
            [[decoder_start_token]], device=device
        )

        return model, inputs


class TextGenerationBenchmark(Benchmark):
    INPUT_LENGTH = 1000
    OUTPUT_LENGTH = 2000

    @staticmethod
    def get_model_and_inputs(model_name, device):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
        model.eval()

        model.generation_config.do_sample = False
        model.generation_config.use_cache = True
        model.generation_config.cache_implementation = "static"
        model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
        model.generation_config.pad_token_id = tokenizer.eos_token_id
        model.generation_config.temperature = 0.0

        vocab_size = tokenizer.vocab_size
        input_ids = torch.randint(
            low=0,
            high=vocab_size,
            size=(1, TextGenerationBenchmark.INPUT_LENGTH),
            device=device,
            dtype=torch.long,
        )
        example_inputs = {"input_ids": input_ids}

        return model, example_inputs


HF_LLM_MODELS: dict[str, Benchmark] = {
    "meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
    "google/gemma-2-2b": TextGenerationBenchmark,
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
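For orientation, a minimal sketch (not part of the diff) of how the benchmark harness consumes this registry; the model name and device string below are placeholders.

# Illustration only: mirrors how the HuggingFace runner's load_model uses HF_LLM_MODELS.
from huggingface_llm_models import HF_LLM_MODELS

model_name = "Qwen/Qwen3-0.6B"   # any key registered above
device = "cuda"                  # assumed; could be "cpu" or another accelerator

if model_name in HF_LLM_MODELS:
    benchmark_cls = HF_LLM_MODELS[model_name]
    model, example_inputs = benchmark_cls.get_model_and_inputs(model_name, device)
    # For these models the speedup experiment times model.generate(**example_inputs)
    # instead of a single forward pass.
    output_ids = model.generate(**example_inputs)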
@ -46,3 +46,8 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8

@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
|
||||
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
|
||||
@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {

  // Get the PyInterpreter instance
  // Stub implementation throws error when Python is not available
  // We return nullptr rather than throwing an error since there are bits of c10
  // that expect an empty PyObjectSlot when python is not available.
  virtual PyInterpreter* getPyInterpreter() const {
    return nullptr;
    TORCH_CHECK(
        false,
        "PyTorch was compiled without Python support. "
        "Cannot access Python interpreter from C++.");
  }
};

@ -2,7 +2,7 @@
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*getGlobalPyInterpreter())
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return getGlobalPyInterpreter();
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = getGlobalPyInterpreter();
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
|
||||
@ -6,17 +6,10 @@
|
||||
#include <c10/util/python_stub.h>
|
||||
#include <optional>
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Function pointer type for getting the global interpreter
|
||||
using GetPyInterpreterFn = PyInterpreter* (*)();
|
||||
|
||||
// Global function pointer (set by csrc initialization)
|
||||
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
|
||||
|
||||
// Helper function to get the global interpreter
|
||||
C10_API PyInterpreter* getGlobalPyInterpreter();
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot();
|
||||
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj() const {
|
||||
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
|
||||
if (interpreter == nullptr || pyobj_ == nullptr) {
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
private:
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
|
||||
@ -10,9 +10,9 @@ namespace c10::cuda {
|
||||
|
||||
void c10_cuda_check_implementation(
|
||||
const int32_t err,
|
||||
const char* /*filename*/,
|
||||
const char* /*function_name*/,
|
||||
const int /*line_number*/,
|
||||
const char* filename,
|
||||
const char* function_name,
|
||||
const uint32_t line_number,
|
||||
const bool include_device_assertions) {
|
||||
const auto cuda_error = static_cast<cudaError_t>(err);
|
||||
const auto cuda_kernel_failure = include_device_assertions
|
||||
@ -41,7 +41,7 @@ void c10_cuda_check_implementation(
|
||||
}
|
||||
#endif
|
||||
throw c10::AcceleratorError(
|
||||
{__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
|
||||
{function_name, filename, line_number}, err, check_message);
|
||||
}
|
||||
|
||||
} // namespace c10::cuda
|
||||
|
||||
@ -91,7 +91,7 @@ C10_CUDA_API void c10_cuda_check_implementation(
|
||||
const int32_t err,
|
||||
const char* filename,
|
||||
const char* function_name,
|
||||
const int line_number,
|
||||
const uint32_t line_number,
|
||||
const bool include_device_assertions);
|
||||
|
||||
} // namespace c10::cuda
|
||||
|
||||
c10/util/FileSystem.h (new file, 22 lines)
@ -0,0 +1,22 @@
// Shim header for filesystem for compilers that are too old to have it not
// in the experimental namespace

#if __has_include(<filesystem>)
#include <filesystem>
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "Neither <filesystem> nor <experimental/filesystem> is available."
#endif

namespace c10 {

#if __has_include(<filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::filesystem;
#elif __has_include(<experimental/filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::experimental::filesystem;
#endif

} // namespace c10
@ -283,23 +283,55 @@ class intrusive_ptr final {
|
||||
}
|
||||
|
||||
void reset_() noexcept {
|
||||
if (target_ != NullType::singleton() &&
|
||||
detail::atomic_refcount_decrement(target_->refcount_) == 0) {
|
||||
// See comment above about weakcount. As long as refcount>0,
|
||||
// weakcount is one larger than the actual number of weak references.
|
||||
// So we need to decrement it here.
|
||||
bool should_delete =
|
||||
target_->weakcount_.load(std::memory_order_acquire) == 1;
|
||||
if (!should_delete) {
|
||||
// justification for const_cast: release_resources is basically a
|
||||
// destructor and a destructor always mutates the object, even for const
|
||||
// objects. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
|
||||
const_cast<std::remove_const_t<TTarget>*>(target_)->release_resources();
|
||||
should_delete =
|
||||
detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
|
||||
if (target_ != NullType::singleton()) {
|
||||
#if defined(__linux__) && (defined(__aarch64__) || defined(__x86_64__))
|
||||
if constexpr (
|
||||
std::atomic<uint64_t>::is_always_lock_free &&
|
||||
std::atomic<uint32_t>::is_always_lock_free &&
|
||||
sizeof(std::atomic<uint64_t>) == 8 &&
|
||||
sizeof(std::atomic<uint32_t>) == 4) {
|
||||
auto both_counts_ =
|
||||
reinterpret_cast<std::atomic<uint64_t>*>(&target_->refcount_);
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
(reinterpret_cast<std::uintptr_t>(both_counts_) %
|
||||
sizeof(std::atomic<uint64_t>)) == 0 &&
|
||||
(reinterpret_cast<std::uintptr_t>(&target_->weakcount_) -
|
||||
reinterpret_cast<std::uintptr_t>(both_counts_)) ==
|
||||
sizeof(std::atomic<uint32_t>));
|
||||
// 0x100000001ULL is a 64-bit number combination of both the refcount_
|
||||
// and weakcount_ being 1.
|
||||
constexpr uint64_t unique_ref_ = 0x100000001ULL;
|
||||
if (both_counts_->load(std::memory_order_acquire) == unique_ref_) {
|
||||
// Both counts are 1, so there are no weak references and
|
||||
// we are releasing the last strong reference. No other
|
||||
// threads can observe the effects of this target_ deletion
|
||||
// call (e.g. calling use_count()) without a data race.
|
||||
target_->refcount_.store(0, std::memory_order_relaxed);
|
||||
delete target_;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (should_delete) {
|
||||
delete target_;
|
||||
#endif
|
||||
|
||||
if (detail::atomic_refcount_decrement(target_->refcount_) == 0) {
|
||||
// See comment above about weakcount. As long as refcount>0,
|
||||
// weakcount is one larger than the actual number of weak references.
|
||||
// So we need to decrement it here.
|
||||
bool should_delete =
|
||||
target_->weakcount_.load(std::memory_order_acquire) == 1;
|
||||
if (!should_delete) {
|
||||
// justification for const_cast: release_resources is basically a
|
||||
// destructor and a destructor always mutates the object, even for
|
||||
// const objects.
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
|
||||
const_cast<std::remove_const_t<TTarget>*>(target_)
|
||||
->release_resources();
|
||||
should_delete =
|
||||
detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
|
||||
}
|
||||
if (should_delete) {
|
||||
delete target_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -73,6 +73,19 @@ void box_cox_zero_lambda(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
|
||||
at::vec::Vectorized<T> data,
|
||||
at::vec::Vectorized<T> lambda1,
|
||||
at::vec::Vectorized<T> lambda2,
|
||||
at::vec::Vectorized<T> k_eps) {
|
||||
auto sum = data + lambda2;
|
||||
auto max = at::vec::max(sum, k_eps);
|
||||
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
|
||||
auto pow = max.pow(lambda1);
|
||||
return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void box_cox_nonzero_lambda(
|
||||
int64_t D,
|
||||
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
|
||||
auto k_eps_vec = Vec(k_eps);
|
||||
for(; j + VLEN < D; j += VLEN) {
|
||||
auto data = Vec::loadu(data_ptr + j);
|
||||
auto lambda2 = Vec::loadu(lambda2_ptr + j);
|
||||
auto sum = data + lambda2;
|
||||
auto max = at::vec::max(sum, k_eps_vec);
|
||||
auto lambda1 = Vec::loadu(lambda1_ptr + j);
|
||||
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
|
||||
auto pow = max.pow(lambda1);
|
||||
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
|
||||
auto lambda2 = Vec::loadu(lambda2_ptr + j);
|
||||
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
|
||||
res.store(out + j);
|
||||
}
|
||||
for ( ;j < D; ++j) {
|
||||
auto sum = data_ptr[j] + lambda2_ptr[j];
|
||||
auto max = std::max(sum, k_eps);
|
||||
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
|
||||
auto pow = std::pow(max, lambda1_ptr[j]);
|
||||
out[j] = pow * lambda_over_1 - lambda_over_1;
|
||||
if (j < D) {
|
||||
auto remaining = D - j;
|
||||
auto data = Vec::loadu(data_ptr + j, remaining);
|
||||
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
|
||||
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
|
||||
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
|
||||
res.store(out + j, remaining);
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
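For reference, a scalar sketch (not part of the diff; NumPy stands in for the SIMD types) of the non-zero-lambda Box-Cox step that box_cox_nonzero_lambda_impl vectorizes above.

# Computes ((max(data + lambda2, k_eps)) ** lambda1 - 1) / lambda1, the same value
# the vectorized helper produces via max, pow, fast reciprocal, and fmsub.
import numpy as np

def box_cox_nonzero_lambda_ref(data, lambda1, lambda2, k_eps=1e-6):
    shifted = np.maximum(data + lambda2, k_eps)   # sum, then clamp against k_eps
    inv_lambda1 = 1.0 / lambda1                   # the fast-reciprocal path
    return shifted ** lambda1 * inv_lambda1 - inv_lambda1   # fmsub(pow, 1/l1, 1/l1)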
cmake/External/aotriton.cmake (vendored, 113 changed lines)
@ -45,88 +45,13 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
)
|
||||
set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore
|
||||
set(__AOTRITON_Z "gz")
|
||||
# Set the default __AOTRITON_LIB path
|
||||
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
|
||||
if(WIN32)
|
||||
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/aotriton_v2.lib")
|
||||
endif()
|
||||
|
||||
function(aotriton_build_windows_dependencies dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
|
||||
# Windows-specific dependencies - build these first
|
||||
if(NOT noimage)
|
||||
message(FATAL_ERROR "noimage must be ON for Windows builds")
|
||||
endif()
|
||||
# Build dlfcn-win32
|
||||
set(__DLFCN_WIN32_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32")
|
||||
set(__DLFCN_WIN32_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32-install")
|
||||
|
||||
ExternalProject_Add(${dlfcn-win32_external}
|
||||
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
|
||||
GIT_TAG v1.4.2
|
||||
PREFIX ${__DLFCN_WIN32_PREFIX}
|
||||
INSTALL_DIR ${__DLFCN_WIN32_INSTALL_DIR}
|
||||
CMAKE_ARGS
|
||||
-DCMAKE_INSTALL_PREFIX=${__DLFCN_WIN32_INSTALL_DIR}
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_C_COMPILER=cl
|
||||
-DCMAKE_CXX_COMPILER=cl
|
||||
-DBUILD_SHARED_LIBS=ON
|
||||
-DBUILD_TESTS=OFF
|
||||
BUILD_BYPRODUCTS
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/lib/dl.lib"
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
|
||||
)
|
||||
ExternalProject_Add_Step(${dlfcn-win32_external} copy_to_aotriton
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/"
|
||||
DEPENDEES install
|
||||
)
|
||||
set(${dlfcn-win32_DIR} "${__DLFCN_WIN32_INSTALL_DIR}/share/dlfcn-win32" CACHE PATH "Path to dlfcn-win32 CMake config" FORCE)
|
||||
|
||||
# Build xz/liblzma
|
||||
set(__XZ_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xz")
|
||||
set(__XZ_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/xz-install")
|
||||
|
||||
ExternalProject_Add(${xz_external}
|
||||
GIT_REPOSITORY https://github.com/tukaani-project/xz.git
|
||||
GIT_TAG v5.8.1
|
||||
PREFIX ${__XZ_PREFIX}
|
||||
INSTALL_DIR ${__XZ_INSTALL_DIR}
|
||||
CMAKE_ARGS
|
||||
-DCMAKE_INSTALL_PREFIX=${__XZ_INSTALL_DIR}
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DBUILD_SHARED_LIBS=ON
|
||||
-DENABLE_NLS=OFF
|
||||
-DXZ_TOOL_LZMAINFO=OFF
|
||||
-DXZ_TOOL_XZ=OFF
|
||||
-DXZ_TOOL_XZDEC=OFF
|
||||
-DXZ_TOOL_LZMADEC=OFF
|
||||
BUILD_BYPRODUCTS
|
||||
"${__XZ_INSTALL_DIR}/lib/lzma.lib"
|
||||
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
|
||||
)
|
||||
ExternalProject_Add_Step(${xz_external} copy_to_aotriton
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/"
|
||||
DEPENDEES install
|
||||
)
|
||||
set(${liblzma_DIR} "${__XZ_INSTALL_DIR}/lib/cmake/liblzma" CACHE PATH "Path to xz/liblzma CMake config" FORCE)
|
||||
endfunction()
|
||||
|
||||
function(aotriton_build_from_source noimage project)
|
||||
if(noimage)
|
||||
SET(RECURSIVE "OFF")
|
||||
else()
|
||||
SET(RECURSIVE "ON")
|
||||
endif()
|
||||
if(WIN32)
|
||||
message(STATUS "Building AOTriton Windows dependencies")
|
||||
aotriton_build_windows_dependencies(dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
|
||||
endif()
|
||||
message(STATUS "PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}")
|
||||
|
||||
ExternalProject_Add(${project}
|
||||
GIT_REPOSITORY https://github.com/ROCm/aotriton.git
|
||||
GIT_SUBMODULES_RECURSE ${RECURSIVE}
|
||||
@ -140,19 +65,12 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
-DAOTRITON_GPU_BUILD_TIMEOUT=0
|
||||
-DAOTRITON_NO_PYTHON=ON
|
||||
-DAOTRITON_NOIMAGE_MODE=${noimage}
|
||||
-DHIP_PLATFORM=amd
|
||||
$<$<BOOL:${WIN32}>:-Ddlfcn-win32_DIR=${dlfcn-win32_DIR}>
|
||||
$<$<BOOL:${WIN32}>:-Dliblzma_DIR=${liblzma_DIR}>
|
||||
BUILD_BYPRODUCTS
|
||||
"${__AOTRITON_LIB}"
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
|
||||
USES_TERMINAL_DOWNLOAD TRUE
|
||||
USES_TERMINAL_CONFIGURE TRUE
|
||||
USES_TERMINAL_BUILD TRUE
|
||||
USES_TERMINAL_INSTALL TRUE
|
||||
)
|
||||
if(WIN32)
|
||||
add_dependencies(${project} dlfcn-win32_external xz_external)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
|
||||
@ -177,7 +95,7 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime"
|
||||
"${__AOTRITON_INSTALL_DIR}"
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_LIB}"
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
|
||||
)
|
||||
message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\
|
||||
Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
|
||||
@ -193,35 +111,14 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
string(CONCAT __AOTRITON_URL
|
||||
"${__AOTRITON_BASE_URL}"
|
||||
"${__AOTRITON_VER}/${__AOTRITON_FILE}")
|
||||
|
||||
# Set up directories
|
||||
set(__AOTRITON_DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_download-${image})
|
||||
set(__AOTRITON_EXTRACT_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image})
|
||||
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR})
|
||||
set(__DOWNLOAD_NO_EXTRACT "")
|
||||
set(__BUILD_COMMANDS "")
|
||||
|
||||
# On Windows, we need custom tar extraction with UTF-8 support
|
||||
if(WIN32)
|
||||
set(__DOWNLOAD_NO_EXTRACT "DOWNLOAD_NO_EXTRACT;TRUE")
|
||||
set(__BUILD_COMMANDS
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory "${__AOTRITON_EXTRACT_DIR}"
|
||||
COMMAND tar --options hdrcharset=UTF-8 -xf "${__AOTRITON_DOWNLOAD_DIR}/${__AOTRITON_FILE}" -C "${__AOTRITON_EXTRACT_DIR}"
|
||||
)
|
||||
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}/aotriton)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(${project}
|
||||
URL "${__AOTRITON_URL}"
|
||||
URL_HASH SHA256=${__AOTRITON_SHA256}
|
||||
DOWNLOAD_DIR ${__AOTRITON_DOWNLOAD_DIR}
|
||||
${__DOWNLOAD_NO_EXTRACT}
|
||||
SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}
|
||||
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
${__BUILD_COMMANDS}
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
"${__AOTRITON_INSTALL_SOURCE_DIR}"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}"
|
||||
"${__AOTRITON_INSTALL_DIR}"
|
||||
BUILD_BYPRODUCTS
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/aotriton.images/${image}/__signature__"
|
||||
@ -267,7 +164,7 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endif()
|
||||
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_LIB})
|
||||
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so)
|
||||
target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
|
||||
set(AOTRITON_FOUND TRUE)
|
||||
endif() # __AOTRITON_INCLUDED
|
||||
|
||||
demo_no_spew.py (new file, 38 lines)
@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""
Demo: No log spew with distributed logging patch.
Run with: torchrun --nproc_per_node=2 demo_no_spew.py
"""
import os
import warnings
import logging
import torch
import torch.distributed as dist

# Initialize distributed
if 'RANK' in os.environ:
    dist.init_process_group('gloo')
    rank = dist.get_rank()
    world_size = dist.get_world_size()
else:
    rank = 0
    world_size = 1

print(f"=== Process {rank}/{world_size} ===")

# Test warnings
warnings.warn("This warning should only appear ONCE (from rank 0)")

# Test logging
logging.warning("This logging should only appear ONCE (from rank 0)")

# Test the original cpp_extension case
logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)
from torch.utils.cpp_extension import _get_cuda_arch_flags
_get_cuda_arch_flags()


print(f"Process {rank} completed")

if 'RANK' in os.environ:
    dist.destroy_process_group()
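The demo assumes warnings and log records are suppressed on non-zero ranks; a minimal sketch of such a filter (an assumption about the mechanism, not the actual patch):

# Drop log records on non-zero ranks; warnings are routed through logging so the
# same filter covers warnings.warn(). Single-process runs are unaffected.
import logging
import torch.distributed as dist

class _Rank0OnlyFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank() == 0
        return True  # not distributed: log normally

handler = logging.StreamHandler()
handler.addFilter(_Rank0OnlyFilter())          # handler-level filter sees propagated records
logging.basicConfig(level=logging.WARNING, handlers=[handler])
logging.captureWarnings(True)                  # send warnings.warn() through the py.warnings logger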
@ -1187,7 +1187,8 @@ int64_t _Tensor_ndim(mpy::handle h) {
|
||||
mpy::handle handle_from_tensor(Arena& A, TensorRef t) {
|
||||
// fast case: tensor is live in python
|
||||
std::optional<PyObject*> mb_obj =
|
||||
t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
|
||||
t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (mb_obj.has_value() &&
|
||||
!t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) {
|
||||
return *mb_obj;
|
||||
|
||||
@ -8,3 +8,4 @@ pyyaml
requests
six # dependency chain: NNPACK -> PeachPy -> six
typing-extensions>=4.10.0
pip # not technically needed, but this makes setup.py invocation work

@ -4,6 +4,7 @@
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <c10/util/Enumerate.h>
|
||||
#include <torch/custom_class.h>
|
||||
#include <torch/nativert/detail/ITree.h>
|
||||
|
||||
namespace torch::nativert::detail {
|
||||
@ -1147,4 +1148,200 @@ TEST(ITreeTest, ToAtenType) {
|
||||
c10::TypeKind::AnyType);
|
||||
}
|
||||
|
||||
TEST(ITreeTest, KeyedJaggedTensorUnflatten) {
|
||||
// Test KeyedJaggedTensor pytree node registration
|
||||
// KeyedJaggedTensor has 6 tensor fields: _values, _weights, _lengths,
|
||||
// _offsets, _stride_per_key_per_rank, _inverse_indices
|
||||
auto jsonSpec = R"(
|
||||
[
|
||||
1,
|
||||
{
|
||||
"type": "torchrec.sparse.jagged_tensor.KeyedJaggedTensor",
|
||||
"context": "[\"key1\", \"key2\"]",
|
||||
"children_spec": [
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
)";
|
||||
|
||||
auto [graph, valuePtrs] = makeValues(6);
|
||||
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
|
||||
|
||||
// Create mock tensor values for the 6 fields
|
||||
std::vector<c10::IValue> flats = {
|
||||
c10::IValue(1), // _values
|
||||
c10::IValue(2), // _weights
|
||||
c10::IValue(3), // _lengths
|
||||
c10::IValue(4), // _offsets
|
||||
c10::IValue(5), // _stride_per_key_per_rank
|
||||
c10::IValue(6), // _inverse_indices tensor part
|
||||
};
|
||||
|
||||
// Test unflatten - this will create a generic tuple since we don't have
|
||||
// the actual KeyedJaggedTensor constructor available in tests
|
||||
auto itree = itreeUnflatten(flats, spec);
|
||||
EXPECT_TRUE(itree.isTuple());
|
||||
EXPECT_EQ(itree.toTupleRef().elements().size(), 6);
|
||||
|
||||
// Verify the values match what we put in
|
||||
for (size_t i = 0; i < 6; i++) {
|
||||
EXPECT_EQ(itree.toTupleRef().elements()[i], flats[i]);
|
||||
}
|
||||
|
||||
// Verify spec has correct number of children and structure
|
||||
EXPECT_EQ(spec.children().size(), 6);
|
||||
EXPECT_EQ(spec.numIValues(), 6);
|
||||
EXPECT_FALSE(spec.isIValue());
|
||||
EXPECT_EQ(
|
||||
spec.uniformName(), "torchrec.sparse.jagged_tensor.KeyedJaggedTensor");
|
||||
}
|
||||
|
||||
TEST(ITreeTest, KeyedJaggedTensorNodeRegistration) {
|
||||
// Test that KeyedJaggedTensor pytree node is properly registered
|
||||
|
||||
// Verify the KeyedJaggedTensor node is in the registry by attempting
|
||||
// to load a spec that references it
|
||||
auto jsonSpec = R"(
|
||||
[
|
||||
1,
|
||||
{
|
||||
"type": "torchrec.sparse.jagged_tensor.KeyedJaggedTensor",
|
||||
"context": "[\"key1\", \"key2\"]",
|
||||
"children_spec": [
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
)";
|
||||
|
||||
auto [graph, valuePtrs] = makeValues(6);
|
||||
|
||||
// This should not throw - if KeyedJaggedTensor wasn't registered,
|
||||
// we'd get an exception about "Unknown pytree node type"
|
||||
EXPECT_NO_THROW({
|
||||
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
|
||||
|
||||
// Verify the spec loaded correctly
|
||||
EXPECT_FALSE(spec.isIValue());
|
||||
EXPECT_EQ(
|
||||
spec.uniformName(), "torchrec.sparse.jagged_tensor.KeyedJaggedTensor");
|
||||
EXPECT_EQ(spec.children().size(), 6);
|
||||
EXPECT_EQ(spec.numIValues(), 6);
|
||||
|
||||
// Verify context is parsed correctly
|
||||
EXPECT_FALSE(spec.context().is_null());
|
||||
EXPECT_TRUE(spec.context().is_array());
|
||||
EXPECT_EQ(spec.context().size(), 2);
|
||||
});
|
||||
}
|
||||
|
||||
TEST(ITreeTest, JaggedTensorNodeRegistration) {
|
||||
// Test that JaggedTensor pytree node is also properly registered
|
||||
|
||||
auto jsonSpec = R"(
|
||||
[
|
||||
1,
|
||||
{
|
||||
"type": "torchrec.sparse.jagged_tensor.JaggedTensor",
|
||||
"context": "null",
|
||||
"children_spec": [
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"context": null,
|
||||
"children_spec": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
)";
|
||||
|
||||
auto [graph, valuePtrs] = makeValues(4);
|
||||
|
||||
// This should not throw - if JaggedTensor wasn't registered,
|
||||
// we'd get an exception about "Unknown pytree node type"
|
||||
EXPECT_NO_THROW({
|
||||
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
|
||||
|
||||
// Verify the spec loaded correctly
|
||||
EXPECT_FALSE(spec.isIValue());
|
||||
EXPECT_EQ(spec.uniformName(), "torchrec.sparse.jagged_tensor.JaggedTensor");
|
||||
EXPECT_EQ(spec.children().size(), 4);
|
||||
EXPECT_EQ(spec.numIValues(), 4);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace torch::nativert::detail
|
||||
|
||||
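The specs exercised by these tests come from Python-side pytree registration of torchrec's jagged tensor types; a hypothetical registration sketch follows (field order, context handling, and constructor shape are assumptions, torchrec's actual registration may differ):

# Hypothetical sketch only: how KeyedJaggedTensor could be registered as a pytree
# node so treespec serialization emits the type name used in the C++ tests above.
import torch.utils._pytree as pytree
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor  # external dependency

def _kjt_flatten(kjt):
    # Six tensor children, matching the six children_spec entries in the JSON spec.
    children = [
        kjt._values,
        kjt._weights,
        kjt._lengths,
        kjt._offsets,
        kjt._stride_per_key_per_rank,
        kjt._inverse_indices,
    ]
    return children, kjt.keys()  # keys travel in the context, e.g. ["key1", "key2"]

def _kjt_unflatten(children, context):
    return KeyedJaggedTensor(keys=context, values=children[0])  # constructor shape assumed

pytree.register_pytree_node(
    KeyedJaggedTensor,
    _kjt_flatten,
    _kjt_unflatten,
    serialized_type_name="torchrec.sparse.jagged_tensor.KeyedJaggedTensor",
)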
@ -10,10 +10,13 @@ import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributed._composable import checkpoint
|
||||
from torch.testing._internal.common_cuda import TEST_CUDA
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import run_tests, TEST_XPU, TestCase
|
||||
from torch.utils.checkpoint import CheckpointError
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
class MemoryDelta(ContextDecorator):
|
||||
def __init__(self, device: torch.device):
|
||||
self.device: torch.device = device
|
||||
@ -22,16 +25,16 @@ class MemoryDelta(ContextDecorator):
|
||||
|
||||
def __enter__(self):
|
||||
self.active_memory_enter = (
|
||||
torch.cuda.memory_stats()["active_bytes.all.current"]
|
||||
if self.device.type == "cuda"
|
||||
torch.accelerator.memory_stats()["active_bytes.all.current"]
|
||||
if self.device.type == "cuda" or self.device.type == "xpu"
|
||||
else 0
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc):
|
||||
self.active_memory_exit = (
|
||||
torch.cuda.memory_stats()["active_bytes.all.current"]
|
||||
if self.device.type == "cuda"
|
||||
torch.accelerator.memory_stats()["active_bytes.all.current"]
|
||||
if self.device.type == "cuda" or self.device.type == "xpu"
|
||||
else 0
|
||||
)
|
||||
|
||||
@ -126,7 +129,7 @@ class TestCheckpoint(TestCase):
|
||||
loss2 = net2(x2).sum()
|
||||
loss2.backward()
|
||||
|
||||
if x.is_cuda:
|
||||
if x.is_cuda or x.is_xpu:
|
||||
self.assertTrue(mem2.delta() < mem1.delta())
|
||||
|
||||
for p1, p2 in zip(net1.parameters(), net2.parameters()):
|
||||
@ -137,10 +140,10 @@ class TestCheckpoint(TestCase):
|
||||
net = ToyModel()
|
||||
self._test_tensor_only(net, x)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "no cuda")
|
||||
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "no cuda/xpu")
|
||||
def test_tensor_only_gpu(self):
|
||||
x = torch.randn(20, 100, device="cuda:0")
|
||||
net = ToyModel().to("cuda:0")
|
||||
x = torch.randn(20, 100, device=f"{device_type}:0")
|
||||
net = ToyModel().to(f"{device_type}:0")
|
||||
self._test_tensor_only(net, x)
|
||||
|
||||
def test_random_cpu(self):
|
||||
|
||||
@ -47,6 +47,8 @@ from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
run_tests,
|
||||
TEST_XPU,
|
||||
xfailIf,
|
||||
)
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
DTensorTestBase,
|
||||
@ -58,6 +60,9 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
class SimpleModel(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@ -73,7 +78,7 @@ class SimpleModel(nn.Module):
|
||||
return x
|
||||
|
||||
def get_input(self):
|
||||
return torch.rand(4, 5, device="cuda")
|
||||
return torch.rand(4, 5, device=device_type)
|
||||
|
||||
|
||||
class SimpleModelUneven(nn.Module):
|
||||
@ -94,7 +99,7 @@ class SimpleModelUneven(nn.Module):
|
||||
return x
|
||||
|
||||
def get_input(self):
|
||||
return torch.rand(4, 5, device="cuda")
|
||||
return torch.rand(4, 5, device=device_type)
|
||||
|
||||
|
||||
class TestFullyShard2DTraining(FSDPTest):
|
||||
@ -105,13 +110,15 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return min(4, torch.cuda.device_count())
|
||||
return min(4, torch.accelerator.device_count())
|
||||
|
||||
def init_global_mesh(self) -> DeviceMesh:
|
||||
# Prefer to test with >=4 GPUs, but for 2 GPUs, use 2-way TP
|
||||
dp_size = 2 if self.world_size > 2 else 1
|
||||
return init_device_mesh(
|
||||
"cuda", (dp_size, self.world_size // dp_size), mesh_dim_names=("dp", "tp")
|
||||
device_type,
|
||||
(dp_size, self.world_size // dp_size),
|
||||
mesh_dim_names=("dp", "tp"),
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@ -138,7 +145,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
|
||||
torch.manual_seed(42)
|
||||
model = MLPStack(mlp_dim)
|
||||
ref_model = copy.deepcopy(model).cuda()
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
replicate(ref_model, device_ids=[self.rank], process_group=dp_pg)
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False)
|
||||
model.parallelize(
|
||||
@ -150,9 +157,8 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False)
|
||||
|
||||
torch.manual_seed(42 + dp_pg.rank() + 1)
|
||||
device = torch.device("cuda")
|
||||
for iter_idx in range(10):
|
||||
inp = torch.randn((8, mlp_dim), device=device)
|
||||
inp = torch.randn((8, mlp_dim), device=device_type)
|
||||
losses: list[torch.Tensor] = []
|
||||
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
|
||||
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
|
||||
@ -162,6 +168,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1881
|
||||
def test_train_parity_2d_transformer(self):
|
||||
self.run_subtests(
|
||||
{"use_shard_placement_fn": [False, True]},
|
||||
@ -172,12 +179,12 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
torch.manual_seed(42)
|
||||
model_args = ModelArgs(n_layers=3, dropout_p=0.0)
|
||||
model = Transformer(model_args)
|
||||
ref_model = copy.deepcopy(model).cuda()
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
|
||||
|
||||
dp_size, tp_size = self.world_size // 2, 2
|
||||
global_mesh = init_device_mesh(
|
||||
"cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp")
|
||||
device_type, (dp_size, tp_size), mesh_dim_names=("dp", "tp")
|
||||
)
|
||||
model = Transformer.parallelize(model, global_mesh["tp"], use_seq_parallel=True)
|
||||
|
||||
@ -205,7 +212,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
self.assertEqual(full_param, ref_param)
|
||||
|
||||
torch.manual_seed(42 + global_mesh.get_local_rank("dp"))
|
||||
inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
|
||||
inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type)
|
||||
for iter_idx in range(5):
|
||||
ref_loss = ref_model(inp).sum()
|
||||
loss = model(inp).sum()
|
||||
@ -242,15 +249,16 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
self.assertEqual(full_param, ref_param)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
|
||||
def test_tp_with_fsdp_offloading(self):
|
||||
global_mesh = init_device_mesh(
|
||||
"cuda", (1, self.world_size), mesh_dim_names=("dp", "tp")
|
||||
device_type, (1, self.world_size), mesh_dim_names=("dp", "tp")
|
||||
)
|
||||
dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"]
|
||||
torch.manual_seed(42)
|
||||
mlp_dim = 16
|
||||
model = MLPStack(mlp_dim)
|
||||
ref_model = copy.deepcopy(model).cuda()
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False)
|
||||
# Parallelize with N-way TP and 1-way FSDP
|
||||
model.parallelize(
|
||||
@ -268,7 +276,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
# NOTE: We still see the FSDP all-gather/reduce-scatter c10d ops
|
||||
# called, but they will just be no-ops without issuing any kernels.
|
||||
# We prefer to keep the no-op check at the c10d level, not in FSDP.
|
||||
inp = torch.randn((4, mlp_dim), device="cuda") # same on all ranks
|
||||
inp = torch.randn((4, mlp_dim), device=device_type) # same on all ranks
|
||||
for _ in range(10):
|
||||
ref_optim.zero_grad()
|
||||
optim.zero_grad()
|
||||
@ -297,6 +305,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
ref_optim.step()
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1881
|
||||
@with_temp_dir
|
||||
def test_train_parity_2d_transformer_checkpoint_resume(self):
|
||||
"""
|
||||
@ -352,7 +361,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
)
|
||||
|
||||
torch.manual_seed(42 + global_mesh["dp"].get_local_rank() + 1)
|
||||
inp = torch.randint(0, model_args.vocab_size, (3, 16), device="cuda")
|
||||
inp = torch.randint(0, model_args.vocab_size, (3, 16), device=device_type)
|
||||
loss_no_cp1 = train_step(model_no_cp, optim_no_cp, inp)
|
||||
loss_no_cp2 = train_step(model_no_cp, optim_no_cp, inp)
|
||||
|
||||
@ -410,14 +419,14 @@ class TestFullyShard2DStateDict(DTensorTestBase):
|
||||
@property
|
||||
def backend(self):
|
||||
# need to specify gloo backend for testing cpu offload
|
||||
return "cpu:gloo,cuda:nccl"
|
||||
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_fully_shard_tp_2d_set_full_state_dict(self):
|
||||
dummy_model = SimpleModel().cuda()
|
||||
dummy_model = SimpleModel().to(device_type)
|
||||
mesh_2d = init_device_mesh(
|
||||
"cuda",
|
||||
device_type,
|
||||
(2, self.world_size // 2),
|
||||
mesh_dim_names=("dp", "tp"),
|
||||
)
|
||||
@ -561,7 +570,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
||||
self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
|
||||
)
|
||||
model = FSDP(
|
||||
SimpleModel().cuda(),
|
||||
SimpleModel().to(device_type),
|
||||
device_mesh=mesh_2d["dp"],
|
||||
)
|
||||
fsdp_state = _get_module_fsdp_state(model)
|
||||
@ -573,7 +582,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
||||
recompute_activation=False,
|
||||
) -> None:
|
||||
torch.manual_seed(0)
|
||||
model = SimpleModel().cuda(self.rank)
|
||||
model = SimpleModel().to(f"{device_type}:{self.rank}")
|
||||
model = FSDP(model, use_orig_params=use_orig_params)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=0.01)
|
||||
|
||||
@ -587,7 +596,9 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
||||
"net1": ColwiseParallel(),
|
||||
"net2": RowwiseParallel(),
|
||||
}
|
||||
model_2d = parallelize_module(SimpleModel().cuda(), tp_mesh, parallelize_plan)
|
||||
model_2d = parallelize_module(
|
||||
SimpleModel().to(device_type), tp_mesh, parallelize_plan
|
||||
)
|
||||
model_2d = FSDP(
|
||||
model_2d,
|
||||
device_mesh=dp_mesh,
|
||||
@ -615,7 +626,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
||||
# Ensure all input across TP ranks are same.
|
||||
# TODO: add a get_group_rank() to DeviceMesh.
|
||||
torch.manual_seed(i + dist.get_rank(dp_mesh.get_group(mesh_dim=0)))
|
||||
input = torch.rand(4, 5).cuda(self.rank)
|
||||
input = torch.rand(4, 5).to(f"{device_type}:{self.rank}")
|
||||
output = model(input)
|
||||
output_2d = model_2d(input)
|
||||
self.assertEqual(output, output_2d)
|
||||
@ -652,7 +663,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
@property
|
||||
def backend(self):
|
||||
# need to specify gloo backend for testing cpu offload
|
||||
return "cpu:gloo,cuda:nccl"
|
||||
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@ -669,7 +680,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
"net3": ColwiseParallel(),
|
||||
}
|
||||
model_2d = parallelize_module(
|
||||
SimpleModel().cuda(),
|
||||
SimpleModel().to(device_type),
|
||||
mesh_2d["tp"],
|
||||
parallelize_plan=parallelize_plan,
|
||||
)
|
||||
@ -679,8 +690,10 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
isinstance(model_2d_fsdp_state._fsdp_extension, DTensorExtensions)
|
||||
)
|
||||
|
||||
mesh_1d = init_device_mesh("cuda", (self.world_size,))
|
||||
model_1d = FSDP(SimpleModel().cuda(), device_mesh=mesh_1d, use_orig_params=True)
|
||||
mesh_1d = init_device_mesh(device_type, (self.world_size,))
|
||||
model_1d = FSDP(
|
||||
SimpleModel().to(device_type), device_mesh=mesh_1d, use_orig_params=True
|
||||
)
|
||||
model_1d_fsdp_state = _get_module_fsdp_state(model_1d)
|
||||
self.assertEqual(model_1d_fsdp_state._fsdp_extension, None)
|
||||
|
||||
@ -692,7 +705,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
|
||||
# Create a model without wrapper
|
||||
torch.manual_seed(0)
|
||||
no_wrap_model = simple_model().cuda(self.rank)
|
||||
no_wrap_model = simple_model().to(f"{device_type}:{self.rank}")
|
||||
no_wrap_state_dict = no_wrap_model.state_dict()
|
||||
|
||||
# Create a model and sharded it with 2D FSDP + TP
|
||||
@ -706,7 +719,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
"net1": ColwiseParallel(),
|
||||
"net2": RowwiseParallel(),
|
||||
}
|
||||
model_2d = parallelize_module(simple_model().cuda(), tp_mesh, parallelize_plan)
|
||||
model_2d = parallelize_module(
|
||||
simple_model().to(device_type), tp_mesh, parallelize_plan
|
||||
)
|
||||
model_2d = FSDP(model_2d, device_mesh=dp_mesh, use_orig_params=True)
|
||||
|
||||
FSDP.set_state_dict_type(
|
||||
@ -754,7 +769,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
"net1": ColwiseParallel(),
|
||||
"net2": RowwiseParallel(),
|
||||
}
|
||||
model_2d = parallelize_module(simple_model().cuda(), tp_mesh, parallelize_plan)
|
||||
model_2d = parallelize_module(
|
||||
simple_model().to(device_type), tp_mesh, parallelize_plan
|
||||
)
|
||||
model_2d = FSDP(model_2d, device_mesh=dp_mesh, use_orig_params=True)
optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.01)

@ -768,7 +785,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
ref_state_dict = deepcopy(model_2d.state_dict())

# Update the parameters so model.state_dict() will be different from ref_dtensor_sd.
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()

# Load ref_state_dict back.
@ -799,9 +816,11 @@ class TestNew2dParallelStateDict(DTensorTestBase):

# Create a model without wrapper
torch.manual_seed(0)
no_wrap_model = simple_model().cuda(self.rank)
no_wrap_model = simple_model().to(f"{device_type}:{self.rank}")
no_wrap_optim = torch.optim.Adam(no_wrap_model.parameters(), lr=0.01)
no_wrap_model(no_wrap_model.get_input().cuda(self.rank)).sum().backward()
no_wrap_model(
no_wrap_model.get_input().to(f"{device_type}:{self.rank}")
).sum().backward()
no_wrap_optim.step()
no_wrap_osd = get_optimizer_state_dict(no_wrap_model, optimizers=no_wrap_optim)

@ -815,7 +834,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(
simple_model().cuda(), mesh_2d["tp"], parallelize_plan
simple_model().to(device_type), mesh_2d["tp"], parallelize_plan
)
model_2d = FSDP(model_2d, device_mesh=mesh_2d["dp"], use_orig_params=True)
FSDP.set_state_dict_type(
@ -823,7 +842,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
StateDictType.SHARDED_STATE_DICT,
)
optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.01)
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()
optim_2d_osd = get_optimizer_state_dict(model_2d, optimizers=optim_2d)
ref_optim_2d_osd = deepcopy(optim_2d_osd)
@ -842,7 +861,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
# compare with no_wrap state.
if isinstance(dist_state, DTensor):
dist_state = (
dist_state.cuda()
dist_state.to(device_type)
.redistribute(placements=(Replicate(), Replicate()))
.to_local()
)
@ -850,7 +869,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
self.assertTrue(torch.allclose(state, dist_state))

# Update the parameters 2d optim states will be different from ref_optim_state_dict.
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()

set_optimizer_state_dict(
@ -892,8 +911,8 @@ class TestNew2dParallelStateDict(DTensorTestBase):
5) dcp.load the state dict from storage
6) load the state dict into the 2D model
"""
dummy_model = SimpleModel().cuda()
mesh_1d = init_device_mesh("cuda", (self.world_size,))
dummy_model = SimpleModel().to(device_type)
mesh_1d = init_device_mesh(device_type, (self.world_size,))
model = FSDP(dummy_model, device_mesh=mesh_1d)
optim = torch.optim.Adam(model.parameters(), lr=0.01)
model(model.get_input()).sum().backward()
@ -911,9 +930,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
dcp.save(state_dict, checkpoint_id=self.temp_dir)

# initialize 2d model
dummy_model = SimpleModel().cuda()
dummy_model = SimpleModel().to(device_type)
mesh_2d = init_device_mesh(
"cuda",
device_type,
(2, self.world_size // 2),
mesh_dim_names=("dp", "tp"),
)

@ -30,7 +30,7 @@ from torch.distributed.tensor.parallel import (
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
@ -38,6 +38,7 @@ from torch.testing._internal.common_utils import (
parametrize,
run_tests,
skip_but_pass_in_sandcastle_if,
TEST_XPU,
)
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir

@ -46,6 +47,10 @@ if TYPE_CHECKING:
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE


device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = torch.distributed.get_default_backend_for_device(device_type)


# MLP Layer
class MLPModule(torch.nn.Module):
def __init__(self, d_hid: int):
@ -79,7 +84,7 @@ class ComposabilityTest(MultiProcessTestCase):
@classmethod
def backend_str(cls) -> str:
# Testing with NCCL backend
return "nccl"
return backend

def setUp(self):
super().setUp()
@ -100,9 +105,11 @@ class ComposabilityTest(MultiProcessTestCase):
def device(self):
return self.rank

@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(4)
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "Test requires 4+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
)
def test_pp_and_dcp(self):
"""
Test that pipeline parallelism and distributed checkpointing can be used together and
@ -143,11 +150,11 @@ class ComposabilityTest(MultiProcessTestCase):
x = layer(x)
return x

device = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
device = torch.device(device_type, self.device)
torch.accelerator.set_device_index(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(
backend="nccl",
backend=backend,
store=store,
rank=self.rank,
world_size=self.world_size,
@ -192,9 +199,11 @@ class ComposabilityTest(MultiProcessTestCase):

_dcp_test(self)

@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
)
@parametrize(
"ScheduleClass",
[
@ -213,11 +222,11 @@ class ComposabilityTest(MultiProcessTestCase):
],
)
def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam):
_device_raii = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
_device_raii = torch.device(device_type, self.device)
torch.accelerator.set_device_index(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(
backend="nccl",
backend=backend,
store=store,
rank=self.rank,
world_size=self.world_size,
@ -228,7 +237,7 @@ class ComposabilityTest(MultiProcessTestCase):
num_microbatches = 8
dp_size = self.world_size // (tp_size * pp_size)
device_mesh = init_device_mesh(
"cuda",
device_type,
mesh_shape=(dp_size, pp_size, tp_size),
mesh_dim_names=("dp", "pp", "tp"),
)

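Note: the hunks above repeatedly swap hard-coded "cuda"/"nccl" literals for a device-agnostic lookup. A minimal sketch of that pattern, assuming a PyTorch build that exposes torch.accelerator and torch.distributed.get_default_backend_for_device (both appear verbatim in this diff):

import torch
import torch.distributed as dist

# Resolve the active accelerator type ("cuda", "xpu", ...) or fall back to CPU,
# then pick the matching process-group backend ("nccl", "xccl", "gloo", ...).
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)

# Tests can then stay free of device-specific literals, e.g.:
#   tensor = torch.randn(4, device=device_type)
#   dist.init_process_group(backend=backend, ...)
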
@ -1,6 +1,7 @@
# Owner(s): ["oncall: distributed"]

import os
import unittest
from copy import deepcopy

import torch
@ -14,7 +15,11 @@ from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU


device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
device_module = torch.get_device_module(device_type)


class Net(nn.Module):
@ -154,6 +159,7 @@ class ReplicateTest(MultiProcessTestCase):
self._compare_module(model, replicate_model)

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_move_args_kwargs_to_device(self):
class MyNet(nn.Module):
def __init__(self) -> None:
@ -166,24 +172,25 @@ class ReplicateTest(MultiProcessTestCase):
return self.a(inp)

self._init_pg()
torch.cuda.set_device(self.rank)
model = MyNet().cuda()
replicate(model, device_id=torch.cuda.current_device())
torch.accelerator.set_device_index(self.rank)
model = MyNet().to(device_type)
replicate(model, device_id=torch.accelerator.current_device_index())
# CPU input ensures replicate can move arg and kwargs to device.
a, b = torch.randn(2, 2), torch.randn(2, 2)
model(a, kwarg=b).sum().backward()

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_ignore_module(self):
self._init_pg()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
# Seed ensures diff input and thus different local grads across ranks.
torch.manual_seed(self.rank)
torch.cuda.manual_seed(self.rank)
model = Net().cuda()
device_module.manual_seed(self.rank)
model = Net().to(device_type)
replicate(model, ignored_modules=[model.fc1])
# CPU input ensures that replicate can move input to GPU as DDP does.
inp = torch.randn(5, 2, device="cuda") * (self.rank + 1)
inp = torch.randn(5, 2, device=device_type) * (self.rank + 1)
out = model(inp) * 10
out.sum().backward()
# FC1 grads should not be synchronized, FC2 and 3 should be.
@ -221,10 +228,11 @@ class ReplicateTest(MultiProcessTestCase):
self._compare_module(model, replicate_model)

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_device_id(self):
self._init_pg()
model = Net()
model_cuda = deepcopy(model).cuda()
model_cuda = deepcopy(model).to(device_type)
model_cuda2 = deepcopy(model_cuda)
replicate(model, device_id=torch.device("cpu"))
# DDP instance is attached in first pre forward
@ -233,13 +241,15 @@ class ReplicateTest(MultiProcessTestCase):
# Should be None for CPU training
self.assertEqual(None, replicate_ddp_weakref.device_ids)

replicate(model_cuda, device_id=torch.device(torch.cuda.current_device()))
replicate(
model_cuda, device_id=torch.device(torch.accelerator.current_device_index())
)
# DDP instance is attached in first pre forward
model_cuda(torch.randn(2, 2))
replicate_ddp_weakref = replicate.state(model_cuda)._ddp_weakref()
self.assertEqual([0], replicate_ddp_weakref.device_ids)
# Pass in int as device_id
replicate(model_cuda2, device_id=int(torch.cuda.current_device()))
replicate(model_cuda2, device_id=int(torch.accelerator.current_device_index()))
# DDP instance is attached in first pre forward
model_cuda2(torch.randn(2, 2))
replicate_ddp_weakref = replicate.state(model_cuda2)._ddp_weakref()
@ -256,6 +266,7 @@ class ReplicateTest(MultiProcessTestCase):

class ReplicateFullyShardInit(ReplicateTest):
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_fully_shard_init(self):
class ToyModel(nn.Module):
def __init__(self, dim: int):
@ -273,14 +284,14 @@ class ReplicateFullyShardInit(ReplicateTest):
return y

self._init_pg()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
dim = 3
bz = 2
model = ToyModel(dim).cuda()
model = ToyModel(dim).to(device_type)
for linear in model.linears:
fully_shard(linear)
fully_shard(model.linears)
replicate(model, device_id=torch.cuda.current_device())
replicate(model, device_id=torch.accelerator.current_device_index())
for linear in model.linears:
self.assertTrue(isinstance(linear.weight, DTensor))
inp = torch.rand(bz, dim)

@ -98,6 +98,8 @@ class ReplicateTest(MultiProcessInductorTestCase):
self.create_pg(device)
torch._dynamo.config.optimize_ddp = "python_reducer"
torch.manual_seed(123)
if device_type == "xpu":
torch.use_deterministic_algorithms(True, warn_only=True)
model = Net(checkpoint=checkpoint).to(device)
input = torch.randn([1, DIM], device=device)


@ -6,7 +6,7 @@ import torch.distributed._functional_collectives as funcol
import torch.nn as nn
from torch.distributed.tensor import DeviceMesh, DTensor, Shard
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_distributed import requires_accelerator_dist_backend
from torch.testing._internal.common_distributed import requires_nccl
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
from torch.testing._internal.distributed.fake_pg import FakeStore
@ -14,7 +14,6 @@ from torch.testing._internal.distributed.fake_pg import FakeStore

c10d_functional = torch.ops.c10d_functional
c10d_ops = torch.ops.c10d
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


class TestCommMode(TestCase):
@ -29,7 +28,7 @@ class TestCommMode(TestCase):
dist.init_process_group(
backend="fake", rank=1, world_size=self.world_size, store=store
)
self.device_type = device_type
self.device_type = "cuda" if torch.cuda.is_available() else "cpu"
self.world_pg = dist.distributed_c10d._get_default_group()

def checksAssert(self, comm_mode, key, expected_value, expected_total_value):
@ -112,12 +111,12 @@ class TestCommMode(TestCase):
self.assertEqual(comm_counts[c10d_functional.all_gather_into_tensor], 1)
self.assertEqual(comm_counts[c10d_functional.reduce_scatter_tensor], 0)

@requires_accelerator_dist_backend(["nccl", "xccl"])
@requires_nccl()
def test_comm_mode_with_c10d(self):
if not torch.accelerator.is_available():
if not torch.cuda.is_available():
return

inp = torch.rand(2, 8, 16).to(device_type)
inp = torch.rand(2, 8, 16).cuda()
all_gather_out = inp.new_empty(self.world_size * 2, 8, 16)

comm_mode = CommDebugMode()

@ -616,11 +616,11 @@ class DTensorMeshTest(DTensorTestBase):

@with_comms
def test_dtensor_device_mesh_device_conversion(self):
# construct a gpu device mesh
# construct a cuda device mesh
mesh = self.build_device_mesh()

# construct from a cpu local tensor with gpu device mesh
# should automatically convert the dist tensor to gpu
# construct from a cpu local tensor with cuda device mesh
# should automatically convert the dist tensor to cuda
placements = [Shard(0)]
local_tensor = torch.randn(3, 3)
dist_tensor = DTensor.from_local(local_tensor, mesh, placements)
@ -669,7 +669,7 @@ class DTensorMeshTest(DTensorTestBase):
@with_comms
def test_dtensor_2d_mesh(self):
mesh_tensor = torch.arange(self.world_size).reshape(2, 4)
# construct a gpu device mesh
# construct a cuda device mesh
mesh = DeviceMesh(self.device_type, mesh_tensor)

# construct a dist tensor on 2d device mesh and test if works
@ -691,7 +691,7 @@ class DTensorMeshTest(DTensorTestBase):

@with_comms
def test_device_mesh_nd(self):
# construct a gpu device mesh
# construct a cuda device mesh
mesh_tensor = torch.arange(self.world_size).reshape(2, 2, 2)
mesh = DeviceMesh(self.device_type, mesh_tensor)
# construct a dist tensor on 3d device mesh and test if works
@ -953,8 +953,8 @@ class TestDTensorPlacementTypes(DTensorTestBase):
# Keep everything deterministic.
torch.manual_seed(0)
tensor = torch.rand(size)
if self.device_type != "cpu":
return tensor.to(self.device_type)
if self.device_type == "cuda":
return tensor.cuda()
else:
return tensor


@ -39,7 +39,6 @@ from torch.distributed.tensor.parallel import (
RowwiseParallel,
)
from torch.distributed.tensor.placement_types import _StridedShard
from torch.testing._internal.common_device_type import skipXPUIf
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import get_devtype
from torch.testing._internal.common_utils import (
@ -48,6 +47,8 @@ from torch.testing._internal.common_utils import (
run_tests,
skipIfHpu,
skipIfTorchDynamo,
TEST_CUDA,
TEST_HPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@ -94,8 +95,6 @@ aot_eager_graph = aot_autograd(
partition_fn=min_cut_rematerialization_partition,
)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


def _apply_sharding(mod: nn.Module, shard_dim: int, device_mesh: DeviceMesh):
"""
@ -142,7 +141,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):

@property
def device_type(self) -> str:
return device_type
return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"

@property
def world_size(self) -> int:
@ -161,9 +160,9 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
res = fn(x)
res.to_local().sum().backward()

@unittest.skipIf(not torch.accelerator.is_available(), "accelerator not available")
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
def test_dtensor_basic_export(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
mesh = DeviceMesh("cuda", torch.arange(self.world_size))

param = torch.randn(4, 4)
param_x = DTensor.from_local(param, mesh, [Shard(0)], run_check=False)
@ -189,10 +188,10 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
)
self.assertExpectedInline(
str(ep.graph_module.code).strip(),
f"""\
"""\
def forward(self, b_buffer, x):
_assert_tensor_metadata_default = torch.ops.aten._assert_tensor_metadata.default(x, dtype = torch.float64, device = device(type='cpu'), layout = torch.strided); _assert_tensor_metadata_default = None
to = torch.ops.aten.to.dtype_layout(x, dtype = torch.float64, layout = torch.strided, device = device(type='{self.device_type}')); x = None
to = torch.ops.aten.to.dtype_layout(x, dtype = torch.float64, layout = torch.strided, device = device(type='cuda')); x = None
view_as = torch.ops.aten.view_as.default(to, to); to = None
dtensor___init__0 = self.dtensor___init__0
dtensor_const_func_spec0 = self.dtensor_const_func_spec0
@ -207,10 +206,10 @@ def forward(self, b_buffer, x):
# add is performed in _propagate_tensor_meta_non_cached, hence add_1 instead of add
self.assertExpectedInline(
str(ep.run_decompositions({}).graph_module.code).strip(),
f"""\
"""\
def forward(self, b_parametrizations_buffer_original0, x):
_assert_tensor_metadata = torch.ops.aten._assert_tensor_metadata.default(x, None, None, torch.float64, device = device(type='cpu'), layout = torch.strided); _assert_tensor_metadata = None
_to_copy = torch.ops.aten._to_copy.default(x, dtype = torch.float64, layout = torch.strided, device = device(type='{self.device_type}', index=0)); x = None
_to_copy = torch.ops.aten._to_copy.default(x, dtype = torch.float64, layout = torch.strided, device = device(type='cuda', index=0)); x = None
view = torch.ops.aten.view.default(_to_copy, [4, 4]); _to_copy = None
add_1 = torch.ops.aten.add.Tensor(b_parametrizations_buffer_original0, view); b_parametrizations_buffer_original0 = view = None
view_1 = torch.ops.aten.view.default(add_1, [4, 4]); add_1 = None
@ -340,7 +339,6 @@ def forward(self, b_parametrizations_buffer_original0, x):
self.assertEqual(res, ref)

@skipIfHpu
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1981")
def test_dtensor_dynamic_loss_parallel_log_softmax(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))

@ -716,13 +714,13 @@ def forward(self, b_parametrizations_buffer_original0, x):
out = layer_norm.permute(0, 2, 1)
return out

x = torch.randn(4, 2, 4, requires_grad=True, device=self.device_type)
x = torch.randn(4, 2, 4, requires_grad=True, device="cuda")
x_dt = DTensor.from_local(x, mesh, [Shard(1)], run_check=False)

y = torch.randn(4, requires_grad=True, device=self.device_type)
y = torch.randn(4, requires_grad=True, device="cuda")
y_dt = DTensor.from_local(y, mesh, [Replicate()], run_check=False)

z = torch.randn(4, requires_grad=True, device=self.device_type)
z = torch.randn(4, requires_grad=True, device="cuda")
z_dt = DTensor.from_local(z, mesh, [Replicate()], run_check=False)

opt_fn = torch.compile(fn, backend="inductor", fullgraph=True)
@ -820,7 +818,7 @@ def forward(self, b_parametrizations_buffer_original0, x):
# pass in tensor as inputs/outputs, create DTensor and run redistribute
# (allgather collective) inside the fn
def fn(x_dt):
if x_dt.device_mesh.device_type == f"{self.device_type}":
if x_dt.device_mesh.device_type == "cuda":
return x_dt + 1
else:
return x_dt + 2
@ -949,7 +947,7 @@ def forward(self, primals_1):

model = FakeTransformer().to(self.device_type)

tp_mesh = init_device_mesh(self.device_type, (2,), mesh_dim_names=("tp",))
tp_mesh = init_device_mesh("cuda", (2,), mesh_dim_names=("tp",))

# apply sequence parallel
parallel_plan = {

@ -126,10 +126,6 @@ dtensor_fails = {
xfail("cummin"),
xfail("diagonal_scatter"),
xfail("dist"),
xfail("empty"),
xfail("empty_strided"),
xfail("empty_like"),
xfail("empty_permuted"),
xfail("expand_copy"),
xfail("exponential"),
xfail("equal"),
@ -482,6 +478,11 @@ dtensor_fails = {
skip("_segment_reduce", "offsets"),
# TODO: fix the following ops
skip("squeeze"),
# These must be skipped as their contents are nondeterministic
skip("empty"),
skip("empty_strided"),
skip("empty_like"),
skip("empty_permuted"),
}



@ -19,6 +19,8 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
TEST_CUDA,
TEST_HPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@ -517,7 +519,7 @@ class RedistributeTest(DTensorTestBase):
local_out_dt = out_dt.to_local()
local_expected_dt = expected_dt.to_local()
self.assertEqual(out_dt.to_local(), expected_dt.to_local())
if torch.accelerator.is_available():
if TEST_HPU or TEST_CUDA:
self.assertEqual(
comm_mode.get_comm_counts()[
torch.ops._dtensor.shard_dim_alltoall

@ -295,8 +295,8 @@ class DistTensorOpsTest(DTensorTestBase):
self.assertEqual(dist_tensor.dtype, torch.float32)
self.assertEqual(zeros_like_dt.dtype, torch.bfloat16)

@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_stack(self):
mesh_2d = DeviceMesh(
self.device_type, torch.arange(self.world_size).reshape(2, 2)

@ -2,6 +2,7 @@

import os
import pickle
from contextlib import contextmanager

import torch
import torch._dynamo.testing
@ -9,6 +10,7 @@ import torch._inductor.config
import torch._inductor.test_case
import torch.onnx.operators
import torch.utils.cpp_extension
from torch._dynamo.aot_compile import ModelInput
from torch._dynamo.exc import PackageError, Unsupported
from torch._dynamo.package import DynamoCache
from torch._dynamo.precompile_context import PrecompileContext
@ -226,6 +228,85 @@ from user code:
actual = compiled_fn(*inputs)
self.assertEqual(expected, actual)

def test_aot_compile_module(self):
mod = SimpleLinearModule()

model = torch.compile(
mod,
fullgraph=True,
backend="inductor",
options={
"guard_filter_fn": torch.compiler.skip_guard_on_globals_unsafe,
},
)

@contextmanager
def train_mode(model):
"""
Context manager that sets the model to training mode before entering the context.
"""
model.train()
yield

@contextmanager
def eval_mode(model):
"""
Context manager that sets the model to evaluation mode before entering the context.
"""
model.eval()
yield

inputs = [
ModelInput(
args=(torch.randn(3, 3),),
kwargs={},
contexts=[torch.no_grad(), eval_mode(model)],
),
ModelInput(
args=(torch.randn(3, 3),), kwargs={}, contexts=[train_mode(model)]
),
]
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
model._aot_compile(
inputs,
)
with torch.compiler.set_stance("fail_on_recompile"):
model.eval()
inputs = (torch.randn(3, 3),)
expected = mod(*inputs)
actual = model(*inputs)
self.assertEqual(expected, actual)

# Shouldn't recompile
model.train()
expected.sum().backward()

model._save_aot_compiled_module(self.path())
torch._dynamo.reset()
model = torch.compile(
mod,
fullgraph=True,
backend="inductor",
options={
"guard_filter_fn": torch.compiler.skip_guard_on_globals_unsafe,
},
)
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
with open(self.path(), "rb") as f:
data = f.read()
model._load_aot_compiled_module(data)

with torch.compiler.set_stance("fail_on_recompile"):
model.eval()
inputs = (torch.randn(3, 3),)
expected = mod(*inputs)
actual = model(*inputs)
self.assertEqual(expected, actual)

# Shouldn't recompile
model.train()
expected.sum().backward()


if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

@ -16605,6 +16605,37 @@ def forward(self, x):
wrapper = Wrapper(pyt_model, example_inputs)
wrapper.forward()

def test_export_with_dict_input_nested_in_args(self):
"""Test export with dictionary input nested in args."""

class MyModel(torch.nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.linear = torch.nn.Linear(10, 1)

def forward(self, data_batch):
h1 = self.linear(data_batch["a1"])
h2 = self.linear(data_batch["a2"])
return h1 + h2

# Create model and example inputs
model = MyModel()
a1 = torch.randn(10)
a2 = torch.randn(10)
original_input = {"a1": a1, "a2": a2}
example_args_forward = (original_input,)

# Export the model
exported_model = export(model, example_args_forward)

# Run both models and compare results
reordered_input = {"a2": a2, "a1": a1}
original_output = exported_model.module()(reordered_input)
loaded_output = model(original_input)

# Verify outputs are close (allowing for floating point differences)
torch.testing.assert_close(original_output, loaded_output)

def test_strict_export_with_shared_parameters(self):
"""Test that parameter names are preserved when there are shared parameters with the same name."""


@ -129,11 +129,13 @@ class TestMaxAutotune(TestCase):
@parametrize("a_transposed", (False, True))
@parametrize("b_transposed", (False, True))
@parametrize("dynamic", (False, True))
@parametrize("tma_store", (False, True))
def test_max_autotune_regular_mm_persistent_tma(
self,
a_transposed: bool,
b_transposed: bool,
dynamic: bool,
tma_store: bool,
):
def mm(a, b):
# TMA requires 16-byte alignment: here we repeat the dims
@ -165,12 +167,35 @@ class TestMaxAutotune(TestCase):
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": tma_store,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
):
c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
c_actual, code = run_and_get_code(torch.compile(mm, dynamic=dynamic), a, b)
c_expected = mm(a, b)

if has_triton_stable_tma_api():
make_desc_api = "triton.language.make_tensor_descriptor"
read_api = "tl.load_tensor_descriptor"
if tma_store:
# Note: The tma_descriptor0 is generated by the kernel. If the
# code generation process changes this could change.
write_api = "tma_descriptor0.store"
else:
write_api = "tl.store"
else:
make_desc_api = (
"triton.language.extra.cuda.experimental_device_tensormap_create2d"
)
read_api = "tl._experimental_descriptor_load"
# TMA store is not supported with the experimental API
write_api = "tl.store"

# Verify that we are using a TMA implementation
FileCheck().check("triton_tem_fused_mm").check(make_desc_api).check(
read_api
).check(write_api).run(code[0])

torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=1e-2)

@unittest.skipIf(
@ -264,6 +289,42 @@ class TestMaxAutotune(TestCase):
# given the config flags above, we should have no choices left.
self.assertIn("NoValidChoicesError", str(context.exception))

@unittest.skipIf(
not has_triton_tma_device(), "Need device-side TMA support in Triton"
)
@parametrize("dynamic", (False, True))
def test_max_autotune_regular_mm_persistent_tma_illegal_output_alignment(
self, dynamic
):
def mm(a, b, out):
torch.mm(a, b, out=out)
return out

M, N, K = 21, 31, 32
a = torch.empty_strided((M, K), (K, 1), dtype=torch.float16, device=GPU_TYPE)
a[:] = torch.randn((M, K), dtype=torch.float16)
b = torch.empty_strided((K, N), (1, K), dtype=torch.float16, device=GPU_TYPE)
b[:] = torch.randn((K, N), dtype=torch.float16)
# allocate an output with a stride not divisble by 16, so it can't satisfy TMA alignment checks.
out = torch.empty_strided((M, N), (N, 1), dtype=torch.float16, device=GPU_TYPE)

with (
self.assertRaises(BackendCompilerFailed) as context,
config.patch(
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": True,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
),
):
torch.compile(mm, dynamic=dynamic)(a, b, out)

# Lowering to the persistent+TMA Triton template should be skipped
# since the output doesn't have a stride of 1 in any dim
self.assertIn("NoValidChoicesError", str(context.exception))

@unittest.skipIf(
not has_triton_tma_device(), "Need device-side TMA support in Triton"
)
@ -317,11 +378,13 @@ class TestMaxAutotune(TestCase):
@parametrize("a_transposed", (False, True))
@parametrize("b_transposed", (False, True))
@parametrize("dynamic", (False, True))
@parametrize("tma_store", (False, True))
def test_max_autotune_addmm_persistent_tma(
self,
a_transposed: bool,
b_transposed: bool,
dynamic: bool,
tma_store: bool,
):
def addmm(x, a, b):
# TMA requires 16-byte alignment: here we repeat the dims
@ -355,12 +418,37 @@ class TestMaxAutotune(TestCase):
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": tma_store,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
):
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
c_actual, code = run_and_get_code(
torch.compile(addmm, dynamic=dynamic), x, a, b
)
c_expected = addmm(x, a, b)

if has_triton_stable_tma_api():
make_desc_api = "triton.language.make_tensor_descriptor"
read_api = "tl.load_tensor_descriptor"
if tma_store:
# Note: The tma_descriptor0 is generated by the kernel. If the
# code generation process changes this could change.
write_api = "tma_descriptor0.store"
else:
write_api = "tl.store"
else:
make_desc_api = (
"triton.language.extra.cuda.experimental_device_tensormap_create2d"
)
read_api = "tl._experimental_descriptor_load"
# TMA store is not supported with the experimental API
write_api = "tl.store"

# Verify that we are using a TMA implementation
FileCheck().check("triton_tem_fused_addmm").check(make_desc_api).check(
read_api
).check(write_api).run(code[0])

torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=1e-2)

@unittest.skipIf(
@ -1508,7 +1596,7 @@ class TestMaxAutotune(TestCase):
# Make sure all args of generate_and_load_args are passed to make_key_args (Except generate_with_caching)
# update this function each time new arg added to generate_and_load and make sure arg is added to make_key
self.assertEqual(generate_and_load_args - 1, make_key_args)
self.assertEqual(generate_and_load_args, 17)
self.assertEqual(generate_and_load_args, 18)

@fresh_cache()
@config.patch(
@ -1594,7 +1682,7 @@ class TestMaxAutotune(TestCase):
"[[22,30],[30,1],torch.float32,device(type='cuda',index=0),0]"],
'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[10,30],
'layout':"[[10,30],[30,1],torch.float32,device(type='cuda',index=0),0]",
'num_consumer_groups':0,'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity',
'num_consumer_groups':0,'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','tma_store':False,
'kwargs':{'EVEN_K':False,'ALLOW_TF32':True,'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32',
'BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}"""

@ -1634,7 +1722,7 @@ class TestMaxAutotune(TestCase):
"[[s27,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]"],
'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[s77,s94],
'layout':"[[s77,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]",'num_consumer_groups':0,
'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','kwargs':{'EVEN_K':False,'ALLOW_TF32':True,
'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','tma_store':False,'kwargs':{'EVEN_K':False,'ALLOW_TF32':True,
'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32','BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}"""
expected = expected.replace("cuda", GPU_TYPE)
self.assertExpectedInline(

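Note: the max-autotune hunks above switch from calling the compiled function directly to run_and_get_code, so the generated source can be scanned with FileCheck for the TMA descriptor APIs. A minimal sketch of that inspect-the-generated-code pattern, assuming a CUDA device is available; the checked string here is only the wrapper's "def call(" entry point, not the TMA-specific names the diff itself asserts:

import torch
from torch._inductor.utils import run_and_get_code
from torch.testing import FileCheck

def mm(a, b):
    return a @ b

a = torch.randn(64, 64, device="cuda", dtype=torch.float16)
b = torch.randn(64, 64, device="cuda", dtype=torch.float16)

# run_and_get_code returns the compiled result plus the generated source files;
# FileCheck then asserts that the expected strings appear, in order, in that source.
result, code = run_and_get_code(torch.compile(mm), a, b)
FileCheck().check("def call(").run(code[0])
# Under the persistent-TMA max-autotune config patched in the test, the diff instead
# checks for strings such as "triton.language.make_tensor_descriptor".
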
@ -353,6 +353,33 @@ class TestOperatorReorderForPeakMemory(TestCase):
y = torch.rand(N, N, dtype=torch.float32, device=GPU_TYPE)
z = torch.rand(N, N, dtype=torch.float32, device=GPU_TYPE)

from torch._inductor.choices import InductorChoices
from torch._inductor.scheduler import BaseSchedulerNode, Scheduler

class CustomInductorChoices(InductorChoices):
@staticmethod
def can_fuse(
scheduler: Scheduler,
node1: BaseSchedulerNode,
node2: BaseSchedulerNode,
shared_data_score: int,
) -> bool:
can_fuse_default = InductorChoices.can_fuse(
scheduler, node1, node2, shared_data_score
)
if (not can_fuse_default) or (
not config.realize_acc_reads_size_threshold
):
return can_fuse_default

all_reads = (node1.read_writes.reads | node2.read_writes.reads) - (
node1.read_writes.writes | node2.read_writes.writes
)
size_of_reads = [scheduler.dep_size_hint(dep) for dep in all_reads]
return sum(size_of_reads) < config.realize_acc_reads_size_threshold

torch._inductor.virtualized.V.set_choices_handler(CustomInductorChoices())

# CASE 1: no restriction on the amount of accumulation
with config.patch({"realize_acc_reads_size_threshold": float("inf")}):
f_compiled = torch.compile(f)

@ -1,244 +1,249 @@
{
"EndToEndLSTM (__main__.RNNTest)": 194.9510040283203,
"MultiheadAttention (__main__.ModulesTest)": 140.13499959309897,
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 89.57710986667209,
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 64.31833351982965,
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 66.09833272298177,
"test_aot_autograd_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.02314267839704,
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 72.13800048828125,
"test_aot_autograd_symbolic_exhaustive_masked_norm_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 63.19166692097982,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 153.9259999593099,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 214.78533426920572,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 158.7769978841146,
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 60.201476414998375,
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 75.8566665649414,
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 158.88999938964844,
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 600.0303955078125,
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 143.89337348937988,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 494.34210883246527,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 504.5401102701823,
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 135.9231694539388,
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 71.03799947102864,
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 73.23316764831543,
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 214.73055691189236,
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 150.5653305053711,
"test_cat_2k_args (__main__.TestTEFuserDynamic)": 121.138150700114,
"test_cat_2k_args (__main__.TestTEFuserStatic)": 117.27021219874874,
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 332.1435546875,
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 413.1364440917969,
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 322.539549085829,
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 109.46066538492839,
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 110.44916661580403,
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 77.25650024414062,
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 75.41433461507161,
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 111.43533325195312,
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 113.98733520507812,
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 485.4573465983073,
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 464.56699625651044,
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 265.6348292032878,
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 314.0461654663086,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1546.3898315429688,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 69.4828332265218,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1384.938496907552,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 73.32633463541667,
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.70183436075847,
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 76.88016764322917,
"test_comprehensive_linalg_pinv_singular_cuda_complex128 (__main__.TestDecompCUDA)": 60.60533459981283,
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 83.5096664428711,
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 78.69066619873047,
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 92.91299947102864,
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 73.34999974568684,
"test_comprehensive_linalg_vector_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 70.28683344523112,
"test_comprehensive_linalg_vector_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 69.44366518656413,
"test_comprehensive_logspace_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 77.09783299763997,
"test_comprehensive_logspace_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 70.4760004679362,
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 142.64183044433594,
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 137.7250010172526,
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 138.17566553751627,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 69.95266660054524,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 60.835333506266274,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 66.94753379821778,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 138.8831672668457,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 157.37983194986978,
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 148.48499552408853,
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 142.54666646321616,
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 66.76000086466472,
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 70.30716641743977,
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 340.98316701253253,
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 314.614995320638,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 88.2018330891927,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 85.09549967447917,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 88.72550201416016,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 85.59499867757161,
"test_comprehensive_nn_functional_interpolate_trilinear_cpu_float32 (__main__.TestDecompCPU)": 61.82139994303385,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 141.1143341064453,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 142.72383499145508,
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1356.413838704427,
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1347.1215209960938,
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1366.5043131510417,
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 593.5763346354166,
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 549.9474945068359,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 74.53666687011719,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 75.8316650390625,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 74.80666669209798,
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 67.3658332824707,
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 67.6716677347819,
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 120.74283218383789,
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 117.90700022379558,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 74.16149965922038,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 74.09249877929688,
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 68.72949981689453,
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 76.05216598510742,
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 79.25549952189128,
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 124.02233123779297,
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 130.15816497802734,
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 114.52783139546712,
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 94.13066546122234,
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 243.25878143310547,
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 560.9872216118706,
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 85.30400085449219,
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 60.0622667948405,
"test_conv2d_unary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 60.94093297322591,
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 164.94733174641928,
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 67.41599782307942,
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 80.62599987453885,
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 77.90822347005208,
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 88.02899932861328,
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 83.99416732788086,
"test_count_nonzero_all (__main__.TestBool)": 625.3162163628472,
"test_custom_module_lstm (__main__.TestQuantizedOps)": 691.5127597384983,
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 86.18333435058594,
"test_eager_sequence_nr_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 146.76594623766448,
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 341.765677134196,
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 68.25488874647353,
"test_fail_random.py (__main__.TestTyping)": 69.70459224559643,
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 99.30016708374023,
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 90.32933298746745,
"test_fuse_large_params_cpu (__main__.CpuTests)": 100.9027509689331,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 156.06466674804688,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 154.44311014811197,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 140.33400217692056,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 108.87950007120769,
"test_grad_nn_Transformer_cpu_float64 (__main__.TestModuleCPU)": 78.21525671543219,
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 95.37383270263672,
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 124.23833465576172,
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 130.07466634114584,
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 228.14850107828775,
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 141.07866414388022,
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 155.69166564941406,
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 638.5084838867188,
"test_group_norm (__main__.TestQuantizedOps)": 235.64022382100424,
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 328.87933349609375,
"test_inductor_dynamic_shapes_broadcasting_dynamic_shapes (__main__.DynamicShapesReproTests)": 116.18105255930047,
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 70.07888836330838,
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 89.06283315022786,
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 131.60088857014975,
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.61966451009114,
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 131.74433390299478,
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 101.52466583251953,
"test_linear (__main__.TestStaticQuantizedModule)": 219.97832912868924,
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 111.1229985555013,
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 139.29833475748697,
"test_linear_relu (__main__.TestStaticQuantizedModule)": 222.60332700941296,
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 137.30917072296143,
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.62766689724393,
"test_low_memory_max_pool_dilation_1_dim_3_cpu_halide (__main__.HalideCpuTests)": 585.4219970703125,
"test_low_memory_max_pool_dilation_2_dim_3_cpu_halide (__main__.HalideCpuTests)": 504.6419982910156,
"test_lstm_cpu (__main__.TestMkldnnCPU)": 69.61133321126302,
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 127.47244517008464,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 63.23977788289388,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 63.10499954223633,
"test_nan_assert_float16 (__main__.ProcessGroupNCCLGroupTest)": 105.55233224232991,
"test_pattern_matcher_multi_user_cpu (__main__.CpuTritonTests)": 148.99966939290366,
"test_proper_exit (__main__.TestDataLoader)": 195.07049942016602,
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 238.3838322957357,
"test_qat_conv2d_unary (__main__.TestQuantizePT2EX86Inductor)": 180.44411044650607,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn1d)": 64.31058961917192,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn2d)": 62.13955030441284,
"test_qat_mobilenet_v2 (__main__.TestQuantizePT2EQATModels)": 141.32811228434244,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 92.34100087483723,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 84.88599904378255,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 77.63999938964844,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.23133341471355,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.41600036621094,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 75.7643305460612,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 85.55433400472005,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 86.17699940999348,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 76.47133382161458,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 98.72666676839192,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 102.08499908447266,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 79.43900044759114,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 87.4413324991862,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.52833302815755,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.18200174967448,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 91.71099853515625,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 75.84733327229817,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 89.47599792480469,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 89.17300160725911,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 96.56466674804688,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.08200073242188,
"test_qrnncell (__main__.TestDynamicQuantizedOps)": 200.46322377522787,
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 637.5349934895834,
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1213.9888509114583,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 759.4036661783854,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1672.4736735026042,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 76.77566528320312,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 292.51483662923175,
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 129.11066691080728,
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 260.64366658528644,
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 73.24966684977214,
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 157.60366821289062,
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 78.70783360799153,
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 89.36199951171875,
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 193.34283447265625,
"test_quick_core_backward_std_cpu_float64 (__main__.TestDecompCPU)": 64.08739941914877,
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 126.64083353678386,
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 106.82166735331218,
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 64.22033437093098,
"test_rosenbrock_sparse_with_lrsched_False_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 65.57016626993816,
"test_rosenbrock_sparse_with_lrsched_True_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 76.09683354695638,
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 71.15816752115886,
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 74.32677883572049,
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 157.43183390299478,
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 131.13233439127603,
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 160.5550011528863,
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 117.62710995144315,
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 114.96744452582465,
"test_std (__main__.TestQuantizedOps)": 275.08810419506494,
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 150.82900087038675,
"test_terminate_handler_on_crash (__main__.TestTorch)": 110.43555479579501,
"test_terminate_signal (__main__.ForkTest)": 130.07055732442274,
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 129.6981106830968,
"test_terminate_signal (__main__.SpawnTest)": 133.48411263359918,
"test_torchvision_smoke (__main__.TestTensorBoardPytorchGraph)": 90.4521090189616,
"test_train_parity_multi_group (__main__.TestFullyShard1DTrainingCore)": 164.04612350463867,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 77.9958324432373,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 78.84283447265625,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 79.08466720581055,
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 127.43616739908855,
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 129.390500386556,
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 104.55349795023601,
"test_unary_ops (__main__.TestTEFuserDynamic)": 84.59466772609287,
"test_unary_ops (__main__.TestTEFuserStatic)": 87.30733429061041,
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 82.17999776204427,
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 79.73050053914388,
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 87.70950190226237,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 96.42566680908203,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 78.90966542561848,
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 62.53285598754883,
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 91.11416816711426,
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 86.59666760762532,
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 93.32300059000652,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 100.57566833496094,
|
||||
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 116.00733248392741,
|
||||
"test_vmapjvpvjp_nn_functional_conv2d_cpu_float32 (__main__.TestOperatorsCPU)": 62.26690483093262,
|
||||
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 87.44200134277344,
|
||||
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 133.6548334757487,
|
||||
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 114.57983334859212,
|
||||
"test_vmapjvpvjp_unbind_cpu_float32 (__main__.TestOperatorsCPU)": 69.25033442179362,
|
||||
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 124.68766911824544,
|
||||
"test_vmapvjpvjp_linalg_lstsq_cuda_float32 (__main__.TestOperatorsCUDA)": 76.81024932861328,
|
||||
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 140.70899963378906,
|
||||
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 118.22750091552734,
|
||||
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 181.27366256713867
|
||||
"EndToEndLSTM (__main__.RNNTest)": 197.77900187174478,
|
||||
"MultiheadAttention (__main__.ModulesTest)": 137.42000325520834,
|
||||
"test_AllenaiLongformerBase_repro_cpu_halide (__main__.HalideCpuTests)": 214.1816660563151,
|
||||
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 91.37688869900174,
|
||||
"test_adaptive_max_pool2d1_cpu_halide (__main__.HalideCpuTests)": 116.57933298746745,
|
||||
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 66.92922253078885,
|
||||
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 65.68500010172527,
|
||||
"test_alexnet_prefix_cpu_halide (__main__.HalideCpuTests)": 177.91966756184897,
|
||||
"test_aot_autograd_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 87.69499969482422,
|
||||
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 74.02233378092448,
|
||||
"test_aot_autograd_symbolic_exhaustive_masked_norm_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.45699946085612,
|
||||
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 136.27599589029947,
|
||||
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 259.30466715494794,
|
||||
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 135.36400095621744,
|
||||
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 61.07166544596354,
|
||||
"test_aot_autograd_symbolic_exhaustive_ormqr_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.8491905757359,
|
||||
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 90.34733327229817,
|
||||
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 140.09266916910806,
|
||||
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_False_cpu (__main__.AssociativeScanTests)": 65.17999935150146,
|
||||
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_True_cpu (__main__.AssociativeScanTests)": 73.75112533569336,
|
||||
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 646.9324035644531,
|
||||
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 142.86450004577637,
|
||||
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 493.49299791124133,
|
||||
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 498.72944810655383,
|
||||
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 133.2033322652181,
|
||||
"test_avg_pool3d_backward_cpu_halide (__main__.HalideCpuTests)": 61.788333892822266,
|
||||
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 69.57333119710286,
|
||||
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 81.06516774495442,
|
||||
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 215.5933346218533,
|
||||
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 135.41816584269205,
|
||||
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 338.17533026801215,
|
||||
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 423.4767761230469,
|
||||
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 325.6485578748915,
|
||||
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 111.10633341471355,
|
||||
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 104.33766555786133,
|
||||
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 69.72683334350586,
|
||||
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 71.48199971516927,
|
||||
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 96.58033243815105,
|
||||
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 96.65433247884114,
|
||||
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 464.92467244466144,
|
||||
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 460.3839925130208,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 263.58483632405597,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 298.0318349202474,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1310.3350016276042,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 66.3976656595866,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1316.084981282552,
|
||||
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 69.58183288574219,
|
||||
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.05749893188477,
|
||||
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 72.31333287556966,
|
||||
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 74.53133392333984,
|
||||
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 82.40500005086263,
|
||||
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 69.91749890645345,
|
||||
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 70.98916562398274,
|
||||
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 126.90333302815755,
|
||||
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 112.40283330281575,
|
||||
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 114.09550094604492,
|
||||
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex128 (__main__.TestDecompCUDA)": 63.223000049591064,
|
||||
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 67.44083213806152,
|
||||
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 62.70066706339518,
|
||||
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 60.468666076660156,
|
||||
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 116.34999974568684,
|
||||
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 116.57566579182942,
|
||||
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 115.4306640625,
|
||||
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 114.67599741617839,
|
||||
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 78.96566772460938,
|
||||
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 60.72616704305013,
|
||||
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 270.3598327636719,
|
||||
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 260.6623306274414,
|
||||
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 88.48316701253255,
|
||||
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.13166681925456,
|
||||
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 83.55450057983398,
|
||||
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 80.67749913533528,
|
||||
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 136.17766698201498,
|
||||
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 157.4010009765625,
|
||||
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1222.983662923177,
|
||||
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1228.281494140625,
|
||||
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1216.2643432617188,
|
||||
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 503.51465861002606,
|
||||
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 523.0736694335938,
|
||||
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 68.91749954223633,
|
||||
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 61.947166442871094,
|
||||
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 63.17983309427897,
|
||||
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 77.92383321126302,
|
||||
"test_comprehensive_nn_functional_unfold_cuda_complex64 (__main__.TestDecompCUDA)": 69.46137571334839,
|
||||
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 62.2076670328776,
|
||||
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 139.3495012919108,
|
||||
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 124.99983469645183,
|
||||
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 73.96983273824056,
|
||||
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 73.27383422851562,
|
||||
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 80.94216791788737,
|
||||
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 73.65583419799805,
|
||||
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 74.30566660563152,
|
||||
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 112.75583267211914,
|
||||
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 106.72283299763997,
|
||||
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 102.85349909464519,
|
||||
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 73.14683278401692,
|
||||
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 137.8197758992513,
|
||||
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 437.60955386691626,
|
||||
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 75.4076665242513,
|
||||
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 62.40233357747396,
|
||||
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 149.36666870117188,
|
||||
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 72.90299987792969,
|
||||
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 81.56499862670898,
|
||||
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 75.13744566175673,
|
||||
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 82.20433298746745,
|
||||
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 76.78600056966145,
|
||||
"test_count_nonzero_all (__main__.TestBool)": 655.6186726888021,
|
||||
"test_cpu_gpu_parity_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 80.43400009940652,
|
||||
"test_custom_module_lstm (__main__.TestQuantizedOps)": 798.5362040201823,
|
||||
"test_ddp_uneven_inputs (__main__.TestDistBackendWithSpawn)": 360.75275349617004,
|
||||
"test_diff_hyperparams_sharding_strategy_str_no_shard (__main__.TestFSDPUseOrigParamsMultipleParamGroups)": 60.4433339436849,
|
||||
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 85.3961664835612,
|
||||
"test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDTensorOpsCPU)": 93.10799916585286,
|
||||
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 215.1919957002004,
|
||||
"test_error_detection_and_propagation (__main__.NcclErrorHandlingTest)": 67.04866790771484,
|
||||
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 64.6271112230089,
|
||||
"test_fail_creation_ops.py (__main__.TestTyping)": 71.04431086573108,
|
||||
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 88.46849950154622,
|
||||
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 107.12216822306316,
|
||||
"test_fuse_large_params_cpu (__main__.CpuTests)": 80.30040054321289,
|
||||
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 162.87633260091147,
|
||||
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 160.84833441840277,
|
||||
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 153.62799580891928,
|
||||
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 120.26516850789388,
|
||||
"test_grad_nn_Transformer_cpu_float64 (__main__.TestModuleCPU)": 62.87366739908854,
|
||||
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 104.12133407592773,
|
||||
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 117.95999908447266,
|
||||
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 113.97000122070312,
|
||||
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 248.1183293660482,
|
||||
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 180.4351666768392,
|
||||
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 160.81400299072266,
|
||||
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 694.055165608724,
|
||||
"test_grid_sampler_2d_cpu_halide (__main__.HalideCpuTests)": 194.28900146484375,
|
||||
"test_group_norm (__main__.TestQuantizedOps)": 207.3484410179986,
|
||||
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 329.52866617838544,
|
||||
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 67.15944459703233,
|
||||
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 84.40099970499675,
|
||||
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 132.7371097140842,
|
||||
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.91166687011719,
|
||||
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 130.4806671142578,
|
||||
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 101.25733184814453,
|
||||
"test_linear (__main__.TestStaticQuantizedModule)": 131.34678183661566,
|
||||
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 124.32133229573567,
|
||||
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 126.89633433024089,
|
||||
"test_linear_relu (__main__.TestStaticQuantizedModule)": 128.11266708374023,
|
||||
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 75.69916741053264,
|
||||
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.60366736518012,
|
||||
"test_lstm_cpu (__main__.TestMkldnnCPU)": 66.15800094604492,
|
||||
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 130.17633226182727,
|
||||
"test_max_autotune_addmm_max_autotune_gemm_backends_CK_x_shape2 (__main__.TestCKBackend)": 60.61724901199341,
|
||||
"test_max_autotune_addmm_search_space_EXHAUSTIVE_dynamic_True (__main__.TestMaxAutotuneSubproc)": 82.76533508300781,
|
||||
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_False_use_aoti_False (__main__.TestCKBackend)": 84.80249977111816,
|
||||
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_True_use_aoti_False (__main__.TestCKBackend)": 82.48874931409955,
|
||||
"test_max_pool2d2_cpu_halide (__main__.HalideCpuTests)": 421.6166585286458,
|
||||
"test_max_pool2d3_cpu_halide (__main__.HalideCpuTests)": 133.6796671549479,
|
||||
"test_max_pool2d5_cpu_halide (__main__.HalideCpuTests)": 357.6593322753906,
|
||||
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 63.8608890109592,
|
||||
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 64.60900031195746,
|
||||
"test_proper_exit (__main__.TestDataLoader)": 223.7907740275065,
|
||||
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 213.6155548095703,
|
||||
"test_qat_conv2d_unary (__main__.TestQuantizePT2EX86Inductor)": 168.48199971516928,
|
||||
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn1d)": 68.48926869834342,
|
||||
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn2d)": 68.39782928838963,
|
||||
"test_qat_mobilenet_v2 (__main__.TestQuantizePT2EQATModels)": 99.70321994357639,
|
||||
"test_qat_resnet18 (__main__.TestQuantizePT2EQATModels)": 61.103378822063576,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 99.00533294677734,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.10599772135417,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 75.0443344116211,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.9883321126302,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.07866668701172,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 68.79566701253255,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.1106669108073,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.92966969807942,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 75.10766855875652,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 103.41666666666667,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 96.1106669108073,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 77.91766866048177,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 92.16766611735027,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.9856669108073,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 93.22266642252605,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.57533264160156,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 70.04799906412761,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.56433359781902,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.017333984375,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 94.46166737874348,
|
||||
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.06233215332031,
|
||||
"test_qrnncell (__main__.TestDynamicQuantizedOps)": 204.8830050362481,
|
||||
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 584.1243489583334,
|
||||
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1194.274678548177,
|
||||
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 842.1573282877604,
|
||||
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1500.2438354492188,
|
||||
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 80.01266479492188,
|
||||
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 304.8406728108724,
|
||||
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 123.26833089192708,
|
||||
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 289.4941685994466,
|
||||
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 78.4913330078125,
|
||||
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 160.19433085123697,
|
||||
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 76.93316650390625,
|
||||
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 95.25599924723308,
|
||||
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 190.9510014851888,
|
||||
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 115.96716562906902,
|
||||
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 85.82816696166992,
|
||||
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 64.81233215332031,
|
||||
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 73.0594991048177,
|
||||
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 78.28866704305013,
|
||||
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 203.66749827067056,
|
||||
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 118.92166392008464,
|
||||
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 161.21966722276477,
|
||||
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 119.33677842881944,
|
||||
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 122.50711229112413,
|
||||
"test_sort_stable_cpu (__main__.CpuTritonTests)": 77.22933451334636,
|
||||
"test_split_cumsum_cpu (__main__.CpuTritonTests)": 89.92000071207683,
|
||||
"test_std (__main__.TestQuantizedOps)": 118.49511219395532,
|
||||
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 149.61699732144675,
|
||||
"test_tensor_split (__main__.TestVmapOperators)": 83.01314294423376,
|
||||
"test_terminate_handler_on_crash (__main__.TestTorch)": 111.18021970325046,
|
||||
"test_terminate_signal (__main__.ForkTest)": 131.81088901807865,
|
||||
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 131.90911058253712,
|
||||
"test_terminate_signal (__main__.SpawnTest)": 135.51344219843546,
|
||||
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 71.71866671244304,
|
||||
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 69.4015007019043,
|
||||
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 75.85683250427246,
|
||||
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 144.25,
|
||||
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 142.70416514078775,
|
||||
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 105.90866597493489,
|
||||
"test_unary_ops (__main__.TestTEFuserDynamic)": 83.01277730200026,
|
||||
"test_unary_ops (__main__.TestTEFuserStatic)": 84.06699878639645,
|
||||
"test_upsample_bicubic2d_cpu_halide (__main__.HalideCpuTests)": 97.28433227539062,
|
||||
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 96.625,
|
||||
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 78.01066716512044,
|
||||
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 82.23649978637695,
|
||||
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 100.44966379801433,
|
||||
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 78.67900085449219,
|
||||
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 75.2140007019043,
|
||||
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 100.80166753133138,
|
||||
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 96.56916745503743,
|
||||
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 99.54433314005534,
|
||||
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 69.86966705322266,
|
||||
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 103.45650100708008,
|
||||
"test_vmapjvpvjp_nn_functional_conv2d_cpu_float32 (__main__.TestOperatorsCPU)": 69.28766759236653,
|
||||
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 70.02966690063477,
|
||||
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 100.93566703796387,
|
||||
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 94.60433260599773,
|
||||
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 98.65516599019368,
|
||||
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 105.35816828409831,
|
||||
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 74.68983332316081,
|
||||
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 152.76449966430664
|
||||
}
|
||||
@ -7160,6 +7160,67 @@ class TestCompileKernel(TestCase):
|
||||
expected = torch.full((n,), test_value, device="cuda", dtype=torch.float16)
|
||||
torch.testing.assert_close(output, expected, rtol=1e-3, atol=1e-3)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "No CUDA")
|
||||
def test_compile_kernel_template(self):
|
||||
kernel_source = """
|
||||
template<typename T>
|
||||
__global__ void add_tensors(const T* a, const T* b, T* c, int n) {
|
||||
int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i < n)
|
||||
c[i] = a[i] + b[i];
|
||||
}
|
||||
"""
|
||||
|
||||
# Compile the kernel
|
||||
from torch.cuda import _compile_kernel
|
||||
|
||||
add_kernel_float = _compile_kernel(kernel_source, "add_tensors<float>")
|
||||
|
||||
# Prepare data
|
||||
N = 1024
|
||||
a = torch.rand(N, device="cuda")
|
||||
b = torch.rand(N, device="cuda")
|
||||
c = torch.empty_like(a)
|
||||
|
||||
# Calculate grid and block dimensions
|
||||
threads_per_block = 256
|
||||
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
|
||||
|
||||
# Launch kernel
|
||||
add_kernel_float(
|
||||
grid=(blocks_per_grid, 1, 1),
|
||||
block=(threads_per_block, 1, 1),
|
||||
args=[a, b, c, N],
|
||||
)
|
||||
|
||||
# Verify results
|
||||
expected = a + b
|
||||
self.assertEqual(c, expected)
|
||||
|
||||
# Do the same again with a different dtype
|
||||
add_kernel_int = _compile_kernel(kernel_source, "add_tensors<int>")
|
||||
|
||||
# Prepare data
|
||||
N = 1024
|
||||
a = torch.randint(-1000, 1000, size=(N,), dtype=torch.int, device="cuda")
|
||||
b = torch.randint(-1000, 1000, size=(N,), dtype=torch.int, device="cuda")
|
||||
c = torch.empty_like(a)
|
||||
|
||||
# Calculate grid and block dimensions
|
||||
threads_per_block = 256
|
||||
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
|
||||
|
||||
# Launch kernel
|
||||
add_kernel_int(
|
||||
grid=(blocks_per_grid, 1, 1),
|
||||
block=(threads_per_block, 1, 1),
|
||||
args=[a, b, c, N],
|
||||
)
|
||||
|
||||
# Verify results
|
||||
expected = a + b
|
||||
self.assertEqual(c, expected)
|
||||
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
|
||||
class TestCudaDeviceParametrized(TestCase):
|
||||
|
||||
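Editorial aside, not part of the diff: the new test above calls torch.cuda._compile_kernel once per template instantiation and repeats the launch boilerplate each time. A minimal sketch of one way to keep that boilerplate in one place is below; it assumes the same kernel_source string, launch convention (grid/block/args keyword arguments), and an available CUDA device as in the test, and is illustrative only.

# Sketch only -- reuses kernel_source from the test above; assumes torch is imported.
from torch.cuda import _compile_kernel

kernels = {
    torch.float32: _compile_kernel(kernel_source, "add_tensors<float>"),
    torch.int32: _compile_kernel(kernel_source, "add_tensors<int>"),
}

def add(a, b):
    # Dispatch to the instantiation matching the input dtype.
    c = torch.empty_like(a)
    n = a.numel()
    threads_per_block = 256
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block  # ceiling division
    kernels[a.dtype](
        grid=(blocks_per_grid, 1, 1),
        block=(threads_per_block, 1, 1),
        args=[a, b, c, n],
    )
    return c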
@ -3685,6 +3685,21 @@ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)",
|
||||
out = torch.compile(f)(idx, x)
|
||||
self.assertEqual(out, f(idx, x))
|
||||
|
||||
def test_trunc_int_div_true(self):
|
||||
@torch.compile(backend="inductor", dynamic=True, fullgraph=True)
|
||||
def f(x, s13, s57, s77):
|
||||
torch._check(s13 >= 0)
|
||||
torch._check(s57 >= 0)
|
||||
torch._check(s77 >= 0)
|
||||
if int(s13 * ((s57 // s13) + (s77 // s13)) / s13) >= 1:
|
||||
return x * 2
|
||||
else:
|
||||
return x * 100
|
||||
|
||||
# ensure we compile this with no errors.
|
||||
x = torch.rand(10)
|
||||
f(x, 4, 4096, 3920)
|
||||
|
||||
|
||||
instantiate_parametrized_tests(TestUnbacked)
|
||||
|
||||
|
||||
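For reference (an editorial note, not part of the diff), the concrete values passed in the new test make the branch condition true, so the `x * 2` branch is the one that must compile:

# s13 = 4, s57 = 4096, s77 = 3920
# s57 // s13 = 1024 and s77 // s13 = 980
# s13 * (1024 + 980) = 8016
# int(8016 / 4) = 2004, and 2004 >= 1, so f returns x * 2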
@ -940,6 +940,8 @@ def generate_tensor_like_override_tests(cls):
|
||||
return None
|
||||
elif arg_type == "ScalarType":
|
||||
return torch.float32
|
||||
elif arg_type == "c10::string_view":
|
||||
return ""
|
||||
elif arg_type in ("std::string_view", "::std::string_view"):
|
||||
return ""
|
||||
elif arg_type == "SymInt":
|
||||
|
||||
@ -383,13 +383,14 @@ class TestScatterGather(TestCase):
|
||||
@dtypes(torch.float32)
|
||||
def test_scatter_add_broadcasted_index_deterministic(self, device, dtype):
|
||||
for d in (0, 1):
|
||||
inp = torch.randn(3, 4, device=device, dtype=dtype)
|
||||
inp = torch.randn(3, 4, 5, device=device, dtype=dtype)
|
||||
idx_1d = torch.randint(3, (10,), device=device)
|
||||
src_shape = list(inp.shape)
|
||||
src_shape[d] = 10
|
||||
src = torch.randn(src_shape, device=device, dtype=dtype)
|
||||
idx = idx_1d.unsqueeze(1 - d).expand(src_shape)
|
||||
print(idx.stride())
|
||||
idx_view_shape = [1] * inp.ndim
|
||||
idx_view_shape[d] = 10
|
||||
idx = idx_1d.view(idx_view_shape).expand(src_shape)
|
||||
ref = inp.clone().scatter_add_(d, idx, src)
|
||||
with DeterministicGuard(True):
|
||||
res = inp.clone().scatter_add_(d, idx, src)
|
||||
|
||||
@ -1164,9 +1164,9 @@ class TestSparse(TestSparseBase):
|
||||
"Concatenating sparse tensors, but a dense tensor was found at position 1."):
|
||||
torch.cat((sp, dn))
|
||||
|
||||
@expectedFailureMPS
|
||||
@coalescedonoff
|
||||
@dtypes(torch.double, torch.cdouble)
|
||||
@dtypesIfMPS(torch.float32, torch.complex64)
|
||||
def test_unsqueeze(self, device, dtype, coalesced):
|
||||
def test_shape(sparse_dims, nnz, sizes, unsqueeze_dim, fail_message=None):
|
||||
x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes, dtype, device, coalesced)
|
||||
@ -2353,14 +2353,14 @@ class TestSparse(TestSparseBase):
|
||||
self.assertTrue(result.layout == torch.strided)
|
||||
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Could not run 'aten::empty_strided' with arguments from the 'Sparse(CPU|CUDA)' backend"
|
||||
RuntimeError, r"Could not run 'aten::empty_strided' with arguments from the 'Sparse(CPU|CUDA|MPS)' backend"
|
||||
):
|
||||
dense_tensor = sparse_tensor.to_dense()
|
||||
result = torch.empty_like(dense_tensor, layout=torch.sparse_coo)
|
||||
|
||||
@coalescedonoff
|
||||
@expectedFailureMPS
|
||||
@dtypes(torch.double, torch.cdouble)
|
||||
@dtypesIfMPS(torch.float32, torch.complex64)
|
||||
def test_empty_like(self, device, dtype, coalesced):
|
||||
# tests https://github.com/pytorch/pytorch/issues/43699
|
||||
|
||||
@ -3310,7 +3310,6 @@ class TestSparse(TestSparseBase):
|
||||
sp_tensor_loaded = pickle.loads(serialized)
|
||||
self.assertEqual(sp_tensor, sp_tensor_loaded)
|
||||
|
||||
@expectedFailureMPS
|
||||
def test_any(self, device):
|
||||
t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, False]), device=device)
|
||||
t_any = torch.tensor(False)
|
||||
|
||||
@ -1654,6 +1654,15 @@ class TestUnaryUfuncs(TestCase):
|
||||
),
|
||||
)
|
||||
|
||||
# empty input
|
||||
# https://github.com/pytorch/pytorch/issues/162473
|
||||
input_tensor = torch.tensor([], device=device)
|
||||
static_size = 1
|
||||
self.assertEqual(
|
||||
torch.nonzero_static(input_tensor, size=static_size),
|
||||
torch.tensor([[-1]], device=device),
|
||||
)
|
||||
|
||||
# 1D input
|
||||
input_tensor = torch.tensor([0, 8], device=device)
|
||||
static_size = 1
|
||||
|
||||
@ -969,7 +969,7 @@ def saved_variables(
|
||||
if nctype.type == OptionalCType(BaseCType(stringT)):
|
||||
formula = re.sub(
|
||||
rf"\b{name}\b",
|
||||
f"{name}.has_value() ? std::optional<::std::string_view>({name}.value()) : std::nullopt",
|
||||
f"{name}.has_value() ? std::optional<std::string_view>({name}.value()) : std::nullopt",
|
||||
formula,
|
||||
)
|
||||
|
||||
|
||||
@ -12,7 +12,6 @@ BU
|
||||
contiguities
|
||||
contiguity
|
||||
coo
|
||||
DEPENDEES
|
||||
deser
|
||||
din
|
||||
dout
|
||||
|
||||
@ -6,6 +6,7 @@ import inspect
|
||||
import logging
|
||||
import pickle
|
||||
import types
|
||||
from contextlib import AbstractContextManager, ExitStack
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
@ -264,48 +265,141 @@ def aot_compile_fullgraph(
|
||||
|
||||
assert check_fn.guards_state is not None
|
||||
|
||||
backend_input = capture_output.backend_input
|
||||
assert backend_input is not None
|
||||
backend_input.graph_module._backend_id = backend_input.backend_id # type: ignore[assignment]
|
||||
output_graph = dynamo_output.tracer_output.output_graph
|
||||
assert output_graph is not None
|
||||
use_cuda = _graph_uses_non_cpu(output_graph.current_tracer.graph)
|
||||
backend_input = capture_output.backend_input
|
||||
assert backend_input is not None
|
||||
backend_input.graph_module._backend_id = backend_input.backend_id # type: ignore[assignment]
|
||||
output_graph = dynamo_output.tracer_output.output_graph
|
||||
assert output_graph is not None
|
||||
use_cuda = _graph_uses_non_cpu(output_graph.current_tracer.graph)
|
||||
import_sources = output_graph.import_sources
|
||||
with (
|
||||
torch._guards.tracing(TracingContext(backend_input.fake_mode)),
|
||||
torch._functorch.config.patch(
|
||||
{
|
||||
"bundled_autograd_cache": True,
|
||||
"force_non_lazy_backward_lowering": True,
|
||||
}
|
||||
),
|
||||
):
|
||||
compiled_fn = backend(
|
||||
backend_input.graph_module, backend_input.example_inputs
|
||||
)
|
||||
|
||||
import_sources = output_graph.import_sources
|
||||
with (
|
||||
torch._guards.tracing(TracingContext(backend_input.fake_mode)),
|
||||
torch._functorch.config.patch("bundled_autograd_cache", True),
|
||||
):
|
||||
compiled_fn = backend(backend_input.graph_module, backend_input.example_inputs)
|
||||
# If Inductor backend is used, grab the compiled_fn from PrecompileContext
|
||||
# TODO: this should be replaced once we make the backend return the SerializableCallable directly.
|
||||
if isinstance(backend, torch._TorchCompileInductorWrapper):
|
||||
compiled_fn = BundledAOTAutogradSerializableCallable.from_backend_id(
|
||||
backend_input.backend_id
|
||||
)
|
||||
|
||||
# If Inductor backend is used, grab the compiled_fn from PrecompileContext
|
||||
# TODO: this should be replaced once we make the backend return the SerializableCallable directly.
|
||||
if isinstance(backend, torch._TorchCompileInductorWrapper):
|
||||
compiled_fn = BundledAOTAutogradSerializableCallable.from_backend_id(
|
||||
backend_input.backend_id
|
||||
if not isinstance(compiled_fn, SerializableCallable):
|
||||
if hasattr(backend, "compiler_fn"):
|
||||
compiler_fn = backend.compiler_fn
|
||||
else:
|
||||
compiler_fn = backend
|
||||
raise RuntimeError(
|
||||
f"Compiled function type {type(compiled_fn)} (produced "
|
||||
+ f"from backend {compiler_fn}) does not implement SerializableCallable."
|
||||
)
|
||||
|
||||
artifacts = CompileArtifacts(
|
||||
signature=signature,
|
||||
bytecode=dynamo_output.bytecode,
|
||||
guard_manager=check_fn.guard_manager,
|
||||
guards_state=check_fn.guards_state,
|
||||
import_sources=import_sources,
|
||||
backend_id=backend_input.backend_id,
|
||||
compiled_fn=compiled_fn,
|
||||
original_code=fn.__code__,
|
||||
closure=fn.__closure__,
|
||||
use_cuda=use_cuda,
|
||||
)
|
||||
aot_compiled_fn = AOTCompiledFunction(_artifacts=artifacts)
|
||||
|
||||
if not isinstance(compiled_fn, SerializableCallable):
|
||||
if hasattr(backend, "compiler_fn"):
|
||||
compiler_fn = backend.compiler_fn
|
||||
else:
|
||||
compiler_fn = backend
|
||||
raise RuntimeError(
|
||||
f"Compiled function type {type(compiled_fn)} (produced "
|
||||
+ f"from backend {compiler_fn}) does not implement SerializableCallable."
|
||||
)
|
||||
|
||||
artifacts = CompileArtifacts(
|
||||
signature=signature,
|
||||
bytecode=dynamo_output.bytecode,
|
||||
guard_manager=check_fn.guard_manager,
|
||||
guards_state=check_fn.guards_state,
|
||||
import_sources=import_sources,
|
||||
backend_id=backend_input.backend_id,
|
||||
compiled_fn=compiled_fn,
|
||||
original_code=fn.__code__,
|
||||
closure=fn.__closure__,
|
||||
use_cuda=use_cuda,
|
||||
)
|
||||
aot_compiled_fn = AOTCompiledFunction(_artifacts=artifacts)
|
||||
return aot_compiled_fn
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelInput:
|
||||
"""
|
||||
WIP type: represents a single model input
|
||||
which consists of a tuple of arguments and a set of contexts in which to run the model.
|
||||
|
||||
For each ModelInput, we'll compile one full graph of the model, and then use the guards generated
|
||||
to dispatch between the compiled graphs.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
args: tuple[Any]
|
||||
kwargs: dict[str, Any]
|
||||
contexts: list[AbstractContextManager[Any]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AOTCompiledModel:
|
||||
# Represents a single forward function of a model along with dispatch logic across its compiled results.
|
||||
# compiled_results is serializable; the model itself must be provided again to deserialize.
|
||||
model: torch.nn.Module
|
||||
compiled_results: list[AOTCompiledFunction]
|
||||
|
||||
def __call__(self, *args: Any, **kwargs: Any) -> Any:
|
||||
for result in self.compiled_results:
|
||||
if result.guard_check(self.model, *args, **kwargs):
|
||||
return result(self.model, *args, **kwargs)
|
||||
# All guards failed, just run one of them and throw the guard check error.
|
||||
return self.compiled_results[0](self.model, *args, **kwargs)
|
||||
|
||||
def serialize(self) -> bytes:
|
||||
data: list[bytes] = []
|
||||
for result in self.compiled_results:
|
||||
data.append(AOTCompiledFunction.serialize(result))
|
||||
return pickle.dumps(data)
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls, model: torch.nn.Module, data: bytes) -> "AOTCompiledModel":
|
||||
from torch._dynamo.utils import get_metrics_context
|
||||
from torch._guards import compile_context, CompileContext
|
||||
|
||||
results: list[bytes] = pickle.loads(data)
|
||||
compiled_results = []
|
||||
for result in results:
|
||||
with (
|
||||
compile_context(CompileContext(convert_frame.get_compile_id({}))),
|
||||
get_metrics_context(),
|
||||
):
|
||||
compiled_results.append(AOTCompiledFunction.deserialize(result))
|
||||
return cls(model, compiled_results)
|
||||
|
||||
|
||||
def aot_compile_module(
|
||||
model: torch.nn.Module,
|
||||
inputs: list[ModelInput],
|
||||
hooks: Hooks,
|
||||
backend: Callable[[torch.fx.GraphModule, list[torch.Tensor]], SerializableCallable],
|
||||
) -> AOTCompiledModel:
|
||||
"""
|
||||
Compiles a single nn.Module with any number of inputs, and returns a compiled forward function.
|
||||
"""
|
||||
|
||||
def compile_single_graph(model_input: ModelInput) -> AOTCompiledFunction:
|
||||
example_inputs = (model_input.args, model_input.kwargs)
|
||||
orig_forward = model.forward
|
||||
with ExitStack() as stack:
|
||||
for ctx in model_input.contexts:
|
||||
stack.enter_context(ctx)
|
||||
return aot_compile_fullgraph(
|
||||
orig_forward,
|
||||
example_inputs,
|
||||
hooks=hooks,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
compiled_results = []
|
||||
for model_input in inputs:
|
||||
log.info("Compiling input %s..", model_input)
|
||||
compiled_results.append(compile_single_graph(model_input))
|
||||
|
||||
assert len(compiled_results) > 0
|
||||
|
||||
return AOTCompiledModel(model, compiled_results)
|
||||
|
||||
@ -413,6 +413,57 @@ class OptimizedModule(torch.nn.Module):
|
||||
)
|
||||
return super().__call__(*args, **kwargs)
|
||||
|
||||
def _aot_compile(self, inputs: list[torch._dynamo.aot_compile.ModelInput]) -> None:
|
||||
"""
|
||||
Experimental: AOT Compile a set of inputs and use that as the forward function
|
||||
"""
|
||||
model = self._orig_mod
|
||||
hooks = self.dynamo_ctx._hooks
|
||||
assert hooks is not None
|
||||
if not config.enable_aot_compile:
|
||||
raise RuntimeError(
|
||||
"AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_config=True"
|
||||
)
|
||||
if not self.dynamo_ctx.fullgraph:
|
||||
raise RuntimeError(
|
||||
"Graph breaks are not supported with aot compile. Please use torch.compile(fullgraph=True)."
|
||||
)
|
||||
|
||||
if not callable(self.dynamo_ctx.callback):
|
||||
raise RuntimeError("aot compile requires a callable dynamo callback.")
|
||||
|
||||
backend = innermost_fn(
|
||||
self.dynamo_ctx.callback, unaltered_fn_attr="_torchdynamo_orig_backend"
|
||||
)
|
||||
from torch._dynamo.aot_compile import aot_compile_module
|
||||
|
||||
self.forward = aot_compile_module(model, inputs, hooks, backend)
|
||||
|
||||
def _save_aot_compiled_module(self, path: Optional[str] = None) -> bytes:
|
||||
if not config.enable_aot_compile:
|
||||
raise RuntimeError(
|
||||
"AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_config=True"
|
||||
)
|
||||
from torch._dynamo.aot_compile import AOTCompiledModel
|
||||
|
||||
assert isinstance(self.forward, AOTCompiledModel)
|
||||
result: bytes = self.forward.serialize()
|
||||
if path is not None:
|
||||
with open(path, "wb") as f:
|
||||
f.write(result)
|
||||
return result
|
||||
|
||||
def _load_aot_compiled_module(self, data: bytes) -> None:
|
||||
if not config.enable_aot_compile:
|
||||
raise RuntimeError(
|
||||
"AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_config=True"
|
||||
)
|
||||
from torch._dynamo.aot_compile import AOTCompiledModel
|
||||
|
||||
compiled_forward = AOTCompiledModel.deserialize(self._orig_mod, data)
|
||||
assert isinstance(compiled_forward, AOTCompiledModel)
|
||||
self.forward = compiled_forward
|
||||
|
||||
def __reduce__(
|
||||
self,
|
||||
) -> tuple[type[OptimizedModule], tuple[torch.nn.Module, _TorchDynamoContext]]:
|
||||
|
||||
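Taken together, the OptimizedModule methods above give an experimental save/load path for AOT-compiled modules. The sketch below is illustrative only and is based on the signatures added in this diff; the Linear module and example tensor are placeholders, the private methods may change, and the surrounding config checks shown above must pass.

# Sketch only -- assumes the code added in this diff is available.
import torch
import torch._dynamo.config as dynamo_config
from torch._dynamo.aot_compile import ModelInput

dynamo_config.enable_aot_compile = True  # required by the checks above

model = torch.nn.Linear(8, 8)
opt_model = torch.compile(model, fullgraph=True)  # OptimizedModule; fullgraph is required

example = ModelInput(args=(torch.randn(2, 8),), kwargs={}, contexts=[])
opt_model._aot_compile([example])              # forward becomes an AOTCompiledModel
blob = opt_model._save_aot_compiled_module()   # bytes; pass path= to also write a file
opt_model._load_aot_compiled_module(blob)      # restore forward from the serialized bytes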
@ -306,6 +306,8 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
|
||||
self,
|
||||
gm: torch.fx.GraphModule,
|
||||
):
|
||||
assert has_triton_package(), "Triton is not available"
|
||||
|
||||
triton_kernels = []
|
||||
for module in gm.modules():
|
||||
if not isinstance(module, torch.fx.GraphModule):
|
||||
@ -331,6 +333,11 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
|
||||
)
|
||||
|
||||
for kernel in triton_kernels:
|
||||
from triton.runtime.autotuner import Autotuner
|
||||
|
||||
if isinstance(kernel, Autotuner):
|
||||
# Grab the Inner JITFunction
|
||||
kernel = kernel.fn
|
||||
source_codes = user_defined_triton_kernel_transitive_closure_source_code(
|
||||
kernel
|
||||
)
|
||||
@ -355,7 +362,8 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
|
||||
[],
|
||||
[],
|
||||
)
|
||||
self.triton_kernel_source_codes = self.get_triton_source_codes_from_gm(gm)
|
||||
if has_triton_package():
|
||||
self.triton_kernel_source_codes = self.get_triton_source_codes_from_gm(gm)
|
||||
|
||||
if hasattr(gm, "saved_tensors_hooks_pack_0"):
|
||||
|
||||
|
||||
@ -1072,6 +1072,7 @@ def aot_module_simplified(
|
||||
boxed_forward_device_index,
|
||||
ignore_shape_env,
|
||||
flatten=False,
|
||||
force_non_lazy_backward_lowering=config.force_non_lazy_backward_lowering,
|
||||
)
|
||||
|
||||
compiled_fn = None
|
||||
|
||||
@ -296,6 +296,11 @@ fake_tensor_prefer_device_type: Optional[str] = None
|
||||
# TODO: turn on by default
|
||||
graphsafe_rng_functionalization = True
|
||||
|
||||
# Whether or not to eagerly compile the backward
|
||||
# used by AOT compile and other settings
|
||||
# TODO: once AOT compile calls aot autograd directly instead of
|
||||
# through compile_fx, we can remove this
|
||||
force_non_lazy_backward_lowering = False
|
||||
|
||||
# Error on BypassAOTAutogradCache instead of just a warning
|
||||
# Used for tests
|
||||
|
||||