mirror of https://github.com/pytorch/pytorch.git
synced 2025-11-12 14:54:55 +08:00
Compare commits: csl/remove ... ciflow/b20 (73 Commits)
| SHA1 | Author | Date |
|---|---|---|
| 8530256a41 | |||
| 598483e82e | |||
| cb9d989787 | |||
| da4e8e0673 | |||
| b4f6a78145 | |||
| 07ea62f600 | |||
| 95d4abebb4 | |||
| 13e5dadef2 | |||
| a4c7856112 | |||
| afb014541b | |||
| b91a2ab892 | |||
| 14a845a4ec | |||
| 5135ace3a3 | |||
| e7c1905837 | |||
| 9cf623a209 | |||
| 06aa3ef3d3 | |||
| 0384104e23 | |||
| 325ec98009 | |||
| 47acdea74a | |||
| 71606b289c | |||
| e342a7509a | |||
| 27ac58bd70 | |||
| 406719c3da | |||
| 957570e4a3 | |||
| eeb6c96a89 | |||
| 0b12e49795 | |||
| 87646e5db4 | |||
| 29d6bb79e1 | |||
| c2924bbafa | |||
| a2f109dcc3 | |||
| ba5ffa2dca | |||
| c131e4b390 | |||
| 7fd15aa2bd | |||
| c45c966031 | |||
| d18c742779 | |||
| 4957ae5838 | |||
| 31d6d3ef5c | |||
| 2325c511e7 | |||
| d865156967 | |||
| fbc0bd2e90 | |||
| 70f5f55abf | |||
| 69ecb562e7 | |||
| 5062abe4e7 | |||
| c7007e7584 | |||
| 09705ca9b2 | |||
| ea6b0b5d0f | |||
| bbf852d87f | |||
| 6392b986e7 | |||
| 32d30d96cf | |||
| 46516efa85 | |||
| 84b2147b85 | |||
| 1727a71cb6 | |||
| fb9e10fe25 | |||
| 4e277e6323 | |||
| ba327b7a5c | |||
| 8eb21304ab | |||
| b83a3f6e87 | |||
| 289b47e657 | |||
| c20308b79e | |||
| 4c41e9bde7 | |||
| 2f5223564e | |||
| 28615a765d | |||
| d1446ad75c | |||
| e401a56b96 | |||
| 22650c89fb | |||
| c62a17a2fb | |||
| 713e289ae7 | |||
| 69784a0dbe | |||
| 3c2409c465 | |||
| 724cd32b0c | |||
| b62935d1a5 | |||
| ccc8c117dc | |||
| 86db4de10f |
@@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)
@@ -260,6 +260,12 @@ case "$tag" in
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-py3.12-pallas)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
PALLAS=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
@@ -381,6 +387,7 @@ docker build \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "PALLAS=${PALLAS}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
--build-arg "ACL=${ACL:-}" \
1 .ci/docker/ci_commit_pins/jax.txt Normal file
@@ -0,0 +1 @@
0.8.0
40 .ci/docker/common/install_jax.sh Executable file
@@ -0,0 +1,40 @@
#!/bin/bash

set -ex

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

# Get the pinned JAX version (same for all CUDA versions)
JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)

function install_jax_12() {
  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

  # Verify installation
  python -c "import jax" # check for errors
  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
}

function install_jax_13() {
  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

  # Verify installation
  python -c "import jax" # check for errors
  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
}

# idiomatic parameter and option handling in sh
while test $# -gt 0
do
  case "$1" in
    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
      ;;
    13.0|13.0.*) install_jax_13;
      ;;
    *) echo "bad argument $1"; exit 1
      ;;
  esac
  shift
done
@@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)
@@ -87,11 +87,7 @@ case ${image} in
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
manylinux2_28-builder:xpu)
@@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt

ARG PALLAS
ARG CUDA_VERSION
# Install JAX with CUDA support (for Pallas)
COPY ./common/install_jax.sh install_jax.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt

ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
@@ -8,9 +8,11 @@ from abc import ABC, abstractmethod

try:
from typing import Any, Callable, Required, TypedDict  # Python 3.11+
from collections.abc import Callable  # Python 3.11+
from typing import Any, Required, TypedDict
except ImportError:
from typing import Any, Callable, TypedDict
from collections.abc import Callable
from typing import Any, TypedDict

from typing_extensions import Required  # Fallback for Python <3.11
@@ -337,7 +337,7 @@ test_python() {

test_python_smoke() {
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
@@ -824,6 +824,11 @@ test_inductor_halide() {
assert_git_not_dirty
}

test_inductor_pallas() {
python test/run_test.py --include inductor/test_pallas.py --verbose
assert_git_not_dirty
}

test_inductor_triton_cpu() {
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
assert_git_not_dirty
@@ -1724,6 +1729,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
test_inductor_pallas
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
2 .github/ci_commit_pins/vision.txt vendored
@@ -1 +1 @@
ca2212438fdd8ce29b66999ed70ed54b0f9372d1
ccb801b88af136454798b945175c4c87e636ac33
9 .github/labeler.yml vendored
@@ -138,7 +138,8 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@@ -148,7 +149,8 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@@ -158,7 +160,8 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
1 .github/nitpicks.yml vendored
@@ -10,3 +10,4 @@
pathFilter:
- 'torch/csrc/inductor/aoti_torch/c/*'
- 'torch/csrc/inductor/aoti_torch/generated/*'
- 'torch/csrc/stable/c/*'
3 .github/scripts/delete_old_branches.py vendored
@@ -1,10 +1,11 @@
# Delete old branches
import os
import re
from collections.abc import Callable
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable
from typing import Any

from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
3 .github/scripts/filter_test_configs.py vendored
@@ -8,10 +8,11 @@ import re
import subprocess
import sys
import warnings
from collections.abc import Callable
from enum import Enum
from functools import cache
from logging import info
from typing import Any, Callable, Optional
from typing import Any, Optional
from urllib.request import Request, urlopen

import yaml
3 .github/scripts/get_workflow_job_id.py vendored
@@ -11,7 +11,8 @@ import sys
import time
import urllib
import urllib.parse
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Optional
from urllib.request import Request, urlopen
3 .github/scripts/github_utils.py vendored
@@ -3,8 +3,9 @@
import json
import os
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Callable, cast, Optional, Union
from typing import Any, cast, Optional, Union
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import Request, urlopen
4 .github/scripts/gitutils.py vendored
@@ -4,10 +4,10 @@ import os
import re
import tempfile
from collections import defaultdict
from collections.abc import Iterator
from collections.abc import Callable, Iterator
from datetime import datetime
from functools import wraps
from typing import Any, Callable, cast, Optional, TypeVar, Union
from typing import Any, cast, Optional, TypeVar, Union

T = TypeVar("T")
4 .github/scripts/trymerge.py vendored
@@ -17,12 +17,12 @@ import re
import time
import urllib.parse
from collections import defaultdict
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from functools import cache
from pathlib import Path
from re import Pattern
from typing import Any, Callable, cast, NamedTuple, Optional
from typing import Any, cast, NamedTuple, Optional
from warnings import warn

import yaml
1 .github/workflows/docker-builds.yml vendored
@@ -67,6 +67,7 @@ jobs:
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-cuda13.0-py3.12-pallas,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-noble-xpu-n-py3,
pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
26 .github/workflows/inductor-unittest.yml vendored
@@ -81,6 +81,32 @@ jobs:
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
secrets: inherit

inductor-pallas-build:
name: inductor-pallas-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-py3.12-pallas
cuda-arch-list: '8.9'
runner: linux.8xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
]}
secrets: inherit

inductor-pallas-test:
name: inductor-pallas-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-pallas-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
secrets: inherit

inductor-triton-cpu-build:
name: inductor-triton-cpu-build
uses: ./.github/workflows/_linux-build.yml
1 .gitignore vendored
@@ -127,6 +127,7 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
torch/_inductor/kernel/vendored_templates/*
minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
@@ -1402,7 +1402,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.13.1', # sync with RUFF
'ruff==0.14.4', # sync with RUFF
]
is_formatter = true

@@ -1537,7 +1537,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.13.1', # sync with PYFMT
'ruff==0.14.4', # sync with PYFMT
]
is_formatter = true
@@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg

# Low Precision GEMMs
# Low Precision & Grouped GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58
@@ -226,8 +226,8 @@ template <
typename B = HostBlock<S>>
struct CachingHostAllocatorImpl {
virtual ~CachingHostAllocatorImpl() {
active_ = false;
if (pinned_use_background_threads()) {
if (active_) {
active_ = false;
getBackgroundThreadPool()->waitWorkComplete();
}
}
@@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl {
if (pinned_use_background_threads()) {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
active_ = true;
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
@@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl {
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
std::deque<std::pair<E, B*>> events_; // event queue paired with block

// Indicates whether the object is active.
// Indicates whether the event-processing thread pool is active.
// Set to false in the destructor to signal background threads to stop.
std::atomic<bool> active_{true};
std::atomic<bool> active_{false};
protected:
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
};
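The hunk above changes `active_` to start out `false` and only raises it at the point where the background thread pool is actually launched, so the destructor can tell "never started" apart from "running" before it signals a stop and waits. Below is a minimal standalone sketch of that start-once / signal-stop handshake; `Worker` and `ensure_started` are hypothetical names, a plain `std::thread` stands in for the c10 thread pool, and a single long-lived instance is assumed (as with the allocator singleton).

```cpp
#include <atomic>
#include <chrono>
#include <thread>

class Worker {
 public:
  ~Worker() {
    if (active_) {      // only signal and join if the thread was ever launched
      active_ = false;  // tell the loop to exit
      thread_.join();   // wait for outstanding work to drain
    }
  }

  void ensure_started() {
    // A function-local static runs its initializer exactly once, even when
    // called concurrently; note it is shared across instances, which matches
    // the singleton-style allocator in the hunk above.
    static bool started [[maybe_unused]] = [this] {
      active_ = true;  // flip the flag before the loop starts polling it
      thread_ = std::thread([this] {
        while (active_) {
          process_events();
        }
      });
      return true;
    }();
  }

 private:
  void process_events() {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }

  std::atomic<bool> active_{false};  // false until the worker really exists
  std::thread thread_;
};

int main() {
  Worker w;
  w.ensure_started();
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
}  // ~Worker() signals the loop and joins
```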
@@ -141,6 +141,9 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
};

MPSStream* stream = at::mps::getCurrentMPSStream();
if (result.numel() == 0) {
return result;
}
Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

@autoreleasepool {
@@ -2803,7 +2803,7 @@
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
CPU, CUDA, MPS, MTIA: floor_divide_out
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@@ -4383,7 +4383,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: mv
SparseCPU, SparseCUDA: mv_sparse
SparseCPU, SparseCUDA, SparseMPS: mv_sparse

- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
@@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
start.record()
coo.matmul(mat)
stop.record()

times.append(start.elapsed_time(stop))

coo_mean_time = sum(times) / len(times)
coo_mean_time = sum(times) / len(times)

times = []
for _ in range(test_count):
start.record()
csr.matmul(mat)
stop.record()
times.append(start.elapsed_time(stop))
times = []
for _ in range(test_count):
start.record()
csr.matmul(mat)
stop.record()
times.append(start.elapsed_time(stop))

csr_mean_time = sum(times) / len(times)
csr_mean_time = sum(times) / len(times)

return coo_mean_time, csr_mean_time
@@ -1,6 +1,8 @@
#pragma once

#include <c10/core/SafePyObject.h>
#include <c10/macros/Export.h>
#include <optional>

namespace c10 {

@@ -15,7 +17,8 @@ struct C10_API AutogradState {
bool inference_mode,
bool fw_grad_mode,
bool multithreading_enabled)
: grad_mode_(grad_mode),
: graph_exec_group_(std::nullopt),
grad_mode_(grad_mode),
inference_mode_(inference_mode),
fw_grad_mode_(fw_grad_mode),
multithreading_enabled_(multithreading_enabled),
@@ -41,6 +44,10 @@ struct C10_API AutogradState {
view_replay_enabled_ = view_replay_enabled;
}

void set_graph_exec_group(std::optional<SafePyObject> group) {
graph_exec_group_ = std::move(group);
}

bool get_grad_mode() const {
return grad_mode_;
}
@@ -61,7 +68,12 @@ struct C10_API AutogradState {
return view_replay_enabled_;
}

const std::optional<SafePyObject>& get_graph_exec_group() const {
return graph_exec_group_;
}

private:
std::optional<SafePyObject> graph_exec_group_;
bool grad_mode_ : 1;
bool inference_mode_ : 1;
bool fw_grad_mode_ : 1;
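For context, the new `graph_exec_group_` member above follows a common shape: an `std::optional` stored by value, initialized to `std::nullopt` first in the constructor's init list (it is declared first among the members), with a setter that takes the optional by value and moves it in, so one call can either set or clear the group. A generic sketch of that shape, with a hypothetical `ExecGroupHandle` standing in for `SafePyObject`:

```cpp
#include <optional>
#include <string>
#include <utility>

struct ExecGroupHandle {
  std::string name;
};

class State {
 public:
  State() : graph_exec_group_(std::nullopt) {}

  // Takes the optional by value and moves it in: passing a handle sets the
  // group, passing std::nullopt clears it, through the same entry point.
  void set_graph_exec_group(std::optional<ExecGroupHandle> group) {
    graph_exec_group_ = std::move(group);
  }

  const std::optional<ExecGroupHandle>& get_graph_exec_group() const {
    return graph_exec_group_;
  }

 private:
  std::optional<ExecGroupHandle> graph_exec_group_;
};

int main() {
  State s;
  s.set_graph_exec_group(ExecGroupHandle{"group0"});  // set
  s.set_graph_exec_group(std::nullopt);               // clear
  return s.get_graph_exec_group().has_value() ? 1 : 0;
}
```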
@@ -66,6 +66,15 @@ def define_targets(rules):
],
)

rules.cc_test(
name = "util/nofatal_test",
srcs = ["util/nofatal_test.cpp"],
deps = [
"//c10/util:base",
"@com_google_googletest//:gtest_main",
],
)

rules.cc_test(
name = "util/ssize_test",
srcs = ["util/ssize_test.cpp"],
53 c10/test/util/nofatal_test.cpp Normal file
@@ -0,0 +1,53 @@
#include <gtest/gtest.h>

#include <c10/util/Exception.h>
#include <c10/util/Logging.h>

namespace {
template <typename T>
inline void expectThrowsEq(T&& fn, const char* expected_msg) {
  try {
    std::forward<T>(fn)();
  } catch (const c10::Error& e) {
    EXPECT_TRUE(
        std::string(e.what_without_backtrace()).find(expected_msg) !=
        std::string::npos);
    return;
  }
  ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
                << "\" but didn't throw";
}
} // namespace

TEST(NofatalTest, TorchCheckComparisons) {
  // quick make sure that no-op works as expected
  TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
  expectThrowsEq(
      []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
      "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
  expectThrowsEq(
      []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
  expectThrowsEq(
      []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
  expectThrowsEq(
      []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
  expectThrowsEq(
      []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
  expectThrowsEq(
      []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
  expectThrowsEq(
      []() {
        void* p = nullptr;
        TORCH_CHECK_NOTNULL(p);
      },
      "Check failed: 'p' must be non NULL.");

#if GTEST_HAS_DEATH_TEST
#ifndef NDEBUG
  // in a debug build, DCHECK should result in death
  EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
#else
  TORCH_DCHECK_EQ(1, 2); // no-op
#endif
#endif // GTEST_HAS_DEATH_TEST
}
@@ -702,6 +702,98 @@ namespace c10::detail {
#define TORCH_CHECK_ARG(cond, argN, ...) \
TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)

#ifndef FATAL_IF
#ifdef C10_USE_GLOG
#define FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
.stream()
#else
#define FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
#endif
#endif

#ifndef NON_FATAL_IF
#ifdef C10_USE_GLOG
#define NON_FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger( \
__FILE__, __LINE__, ::google::GLOG_FATAL, false) \
.stream()
#else
#define NON_FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
.stream()
#endif
#endif

// Binary comparison check macros
#define TORCH_CHECK_OP(val1, val2, op) \
NON_FATAL_IF(((val1)op(val2))) \
<< "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
<< (val2) << "). "

#define TORCH_DCHECK_OP(val1, val2, op) \
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
<< (val1) << " vs. " << (val2) << "). "

#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)

// Debug versions of TORCH_CHECK_OP macros
#ifndef NDEBUG
#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
#else // !NDEBUG
// Optimized versions - generate no code
#define TORCH_DCHECK_EQ(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, >)
#endif // NDEBUG

// Null pointer check macro
#define TORCH_CHECK_NOTNULL(val) \
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)

#ifndef NDEBUG
#define TORCH_DCHECK_NOTNULL(val) \
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
#else // !NDEBUG
#define TORCH_DCHECK_NOTNULL(val) \
while (false) \
TORCH_CHECK_NOTNULL(val)
#endif // NDEBUG

// ----------------------------------------------------------------------------
// Deprecated macros
// ----------------------------------------------------------------------------
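The `FATAL_IF`/`NON_FATAL_IF` macros above use the classic glog voidify idiom: `operator&` binds more loosely than `<<` but more tightly than `?:`, so the entire streamed message attaches to the logger expression in the false branch and is then swallowed by `LoggerVoidify`, while a passing check short-circuits to `(void)0` and evaluates nothing. A self-contained sketch of the same idiom follows; `Voidify` and `CheckSink` are illustrative names, not the c10 classes.

```cpp
#include <iostream>
#include <sstream>
#include <stdexcept>

struct Voidify {
  // Lower precedence than <<, higher than ?: -- swallows the whole stream chain.
  void operator&(const std::ostream&) {}
};

struct CheckSink {
  std::ostringstream buf;
  // Throwing here is safe for the illustration: the sink only ever exists on
  // the failing branch, so the destructor turns the message into an exception.
  ~CheckSink() noexcept(false) { throw std::runtime_error(buf.str()); }
  std::ostream& stream() { return buf; }
};

#define MY_CHECK(condition) \
  (condition) ? (void)0     \
              : Voidify() & CheckSink().stream() << "Check failed: " #condition ". "

int main() {
  MY_CHECK(1 == 1) << "never evaluated";  // condition holds: no sink, no throw
  try {
    MY_CHECK(1 == 2) << "extra context " << 42;
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << "\n";  // "Check failed: 1 == 2. extra context 42"
  }
  return 0;
}
```

The only extra piece in the hunk above is the trailing `false` argument to `MessageLogger`: per the accompanying logging changes, the logger's destructor throws a `c10::Error` instead of aborting when `exit_on_fatal` is false, which is what makes `TORCH_CHECK_EQ` and friends catchable.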
@ -291,6 +291,32 @@ namespace c10 {
|
||||
using fLB::FLAGS_logtostderr;
|
||||
using fLI::FLAGS_minloglevel;
|
||||
using fLI::FLAGS_v;
|
||||
|
||||
MessageLogger::MessageLogger(
|
||||
const char* file,
|
||||
int line,
|
||||
int severity,
|
||||
bool exit_on_fatal)
|
||||
: stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
|
||||
|
||||
MessageLogger::~MessageLogger() noexcept(false) {
|
||||
if (severity_ == ::google::GLOG_FATAL) {
|
||||
DealWithFatal();
|
||||
}
|
||||
}
|
||||
|
||||
std::stringstream& MessageLogger::stream() {
|
||||
return stream_;
|
||||
}
|
||||
|
||||
void MessageLogger::DealWithFatal() {
|
||||
if (exit_on_fatal_) {
|
||||
LOG(FATAL) << stream_.str();
|
||||
} else {
|
||||
throw c10::Error(stream_.str(), nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
C10_DEFINE_int(
|
||||
@ -412,17 +438,16 @@ void ShowLogInfoToStderr() {
|
||||
FLAGS_caffe2_log_level = GLOG_INFO;
|
||||
}
|
||||
|
||||
MessageLogger::MessageLogger(const char* file, int line, int severity)
|
||||
: severity_(severity) {
|
||||
MessageLogger::MessageLogger(
|
||||
const char* file,
|
||||
int line,
|
||||
int severity,
|
||||
bool exit_on_fatal)
|
||||
: severity_(severity), exit_on_fatal_(exit_on_fatal) {
|
||||
if (severity_ < FLAGS_caffe2_log_level) {
|
||||
// Nothing needs to be logged.
|
||||
return;
|
||||
}
|
||||
#ifdef ANDROID
|
||||
tag_ = "native";
|
||||
#else // !ANDROID
|
||||
tag_ = "";
|
||||
#endif // ANDROID
|
||||
|
||||
time_t rawtime = 0;
|
||||
time(&rawtime);
|
||||
@ -458,7 +483,7 @@ MessageLogger::MessageLogger(const char* file, int line, int severity)
|
||||
}
|
||||
|
||||
// Output the contents of the stream to the proper channel on destruction.
|
||||
MessageLogger::~MessageLogger() {
|
||||
MessageLogger::~MessageLogger() noexcept(false) {
|
||||
if (severity_ < FLAGS_caffe2_log_level) {
|
||||
// Nothing needs to be logged.
|
||||
return;
|
||||
@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() {
|
||||
}
|
||||
}
|
||||
|
||||
std::stringstream& MessageLogger::stream() {
|
||||
return stream_;
|
||||
}
|
||||
|
||||
void MessageLogger::DealWithFatal() {
|
||||
if (exit_on_fatal_) {
|
||||
abort();
|
||||
} else {
|
||||
throw c10::Error(stream_.str(), nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
#endif // !C10_USE_GLOG
|
||||
|
||||
74 c10/util/logging_common.h Normal file
@@ -0,0 +1,74 @@
#ifndef C10_UTIL_LOGGING_COMMON_H_
#define C10_UTIL_LOGGING_COMMON_H_

#include <c10/macros/Export.h>
#include <sstream>

namespace c10 {

// MessageLogger that throws exceptions instead of aborting (glog version)
// or logs and may abort (non-glog version).
class C10_API MessageLogger {
 public:
  MessageLogger(
      const char* file,
      int line,
      int severity,
      bool exit_on_fatal = true);
  ~MessageLogger() noexcept(false);

  // Return the stream associated with the logger object.
  std::stringstream& stream();

 private:
  // When there is a fatal log, and fatal == true, we abort
  // otherwise, we throw.
  void DealWithFatal();

#if defined(ANDROID) && !defined(C10_USE_GLOG)
  const char* tag_{"native"};
#endif
  std::stringstream stream_;
  int severity_;
  bool exit_on_fatal_;
};

// This class is used to explicitly ignore values in the conditional
// logging macros. This avoids compiler warnings like "value computed
// is not used" and "statement has no effect".
class C10_API LoggerVoidify {
 public:
  LoggerVoidify() = default;
  // This has to be an operator with a precedence lower than << but
  // higher than ?:
  void operator&(const std::ostream& s [[maybe_unused]]) {}
};

// Forward declarations for CheckNotNull functions
template <typename T>
T& CheckNotNullCommon(
    const char* file,
    int line,
    const char* names,
    T& t,
    bool fatal = true);

template <typename T>
T* CheckNotNull(
    const char* file,
    int line,
    const char* names,
    T* t,
    bool fatal = true);

template <typename T>
T& CheckNotNull(
    const char* file,
    int line,
    const char* names,
    T& t,
    bool fatal = true);

} // namespace c10

#endif // C10_UTIL_LOGGING_COMMON_H_
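`logging_common.h` above declares `CheckNotNull` for both `T*` and `T&`, each with a `fatal` flag. Two overloads exist so the same check can wrap raw pointers and smart pointers (anything comparable to `nullptr`) and hand the argument back unchanged. A generic sketch of that shape is below; the lower-case names and the `std::runtime_error`/`std::abort` behaviour are stand-ins, not the c10 implementation.

```cpp
#include <cstdlib>
#include <memory>
#include <stdexcept>
#include <string>

// Works for any T comparable to nullptr, which is what lets one template
// back both the raw-pointer and the reference (smart-pointer) overloads.
template <typename T>
T& check_not_null_common(const char* expr, T& t, bool fatal) {
  if (t == nullptr) {
    if (fatal) {
      std::abort();  // "fatal" flavour: give up immediately
    }
    // non-fatal flavour: surface a recoverable error instead
    throw std::runtime_error(std::string("Check failed: '") + expr +
                             "' must be non NULL.");
  }
  return t;
}

template <typename T>
T* check_not_null(const char* expr, T* t, bool fatal = false) {
  return check_not_null_common(expr, t, fatal);
}

template <typename T>
T& check_not_null(const char* expr, T& t, bool fatal = false) {
  return check_not_null_common(expr, t, fatal);
}

int main() {
  auto sp = std::make_unique<int>(7);
  int* raw = sp.get();
  check_not_null("raw", raw);  // T* overload, raw pointer
  check_not_null("sp", sp);    // T& overload, smart pointer
  try {
    int* null_ptr = nullptr;
    check_not_null("null_ptr", null_ptr);
  } catch (const std::runtime_error&) {
    return 0;  // the non-fatal check surfaced as an exception
  }
  return 1;
}
```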
@ -47,57 +47,53 @@ INSTANTIATE_FOR_CONTAINER(set)
|
||||
|
||||
#endif
|
||||
|
||||
#include <c10/util/logging_common.h>
|
||||
#include <glog/logging.h>
|
||||
|
||||
// Additional macros on top of glog
|
||||
#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
|
||||
#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
|
||||
#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
|
||||
#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
|
||||
#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
|
||||
#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)
|
||||
namespace c10 {
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
|
||||
#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
|
||||
#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
|
||||
#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
|
||||
#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
|
||||
#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
|
||||
#else // !NDEBUG
|
||||
// These versions generate no code in optimized mode.
|
||||
#define TORCH_DCHECK_EQ(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_EQ(val1, val2)
|
||||
#define TORCH_DCHECK_NE(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_NE(val1, val2)
|
||||
#define TORCH_DCHECK_LE(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_LE(val1, val2)
|
||||
#define TORCH_DCHECK_LT(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_LT(val1, val2)
|
||||
#define TORCH_DCHECK_GE(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_GE(val1, val2)
|
||||
#define TORCH_DCHECK_GT(val1, val2) \
|
||||
while (false) \
|
||||
DCHECK_GT(val1, val2)
|
||||
#endif // NDEBUG
|
||||
[[noreturn]] void ThrowEnforceNotMet(
|
||||
const char* file,
|
||||
const int line,
|
||||
const char* condition,
|
||||
const std::string& msg,
|
||||
const void* caller);
|
||||
|
||||
// Check that a pointer is not null.
|
||||
#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)
|
||||
template <typename T>
|
||||
T& CheckNotNullCommon(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T& t,
|
||||
bool fatal) {
|
||||
if (t == nullptr) {
|
||||
MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
|
||||
<< "Check failed: '" << names << "' must be non NULL. ";
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Debug only version of TORCH_CHECK_NOTNULL
|
||||
#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
|
||||
#else // !NDEBUG
|
||||
// Optimized version - generates no code.
|
||||
#define TORCH_DCHECK_NOTNULL(val) \
|
||||
while (false) \
|
||||
DCHECK_NOTNULL(val)
|
||||
#endif // NDEBUG
|
||||
template <typename T>
|
||||
T* CheckNotNull(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T* t,
|
||||
bool fatal) {
|
||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T& CheckNotNull(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T& t,
|
||||
bool fatal) {
|
||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
// Log with source location information override (to be used in generic
|
||||
// warning/error handlers implemented as functions, not macros)
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include <c10/util/Flags.h>
|
||||
#include <c10/util/logging_common.h>
|
||||
|
||||
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";
|
||||
|
||||
@ -24,61 +25,40 @@ const int GLOG_ERROR = 2;
|
||||
const int GLOG_WARNING = 1;
|
||||
const int GLOG_INFO = 0;
|
||||
|
||||
class C10_API MessageLogger {
|
||||
public:
|
||||
MessageLogger(const char* file, int line, int severity);
|
||||
~MessageLogger();
|
||||
// Return the stream associated with the logger object.
|
||||
std::stringstream& stream() {
|
||||
return stream_;
|
||||
}
|
||||
|
||||
private:
|
||||
// When there is a fatal log, we simply abort.
|
||||
void DealWithFatal() {
|
||||
abort();
|
||||
}
|
||||
|
||||
const char* tag_;
|
||||
std::stringstream stream_;
|
||||
int severity_;
|
||||
};
|
||||
|
||||
// This class is used to explicitly ignore values in the conditional
|
||||
// logging macros. This avoids compiler warnings like "value computed
|
||||
// is not used" and "statement has no effect".
|
||||
class C10_API LoggerVoidify {
|
||||
public:
|
||||
LoggerVoidify() = default;
|
||||
// This has to be an operator with a precedence lower than << but
|
||||
// higher than ?:
|
||||
void operator&(const std::ostream& s [[maybe_unused]]) {}
|
||||
};
|
||||
|
||||
// Log a message and terminate.
|
||||
template <class T>
|
||||
void LogMessageFatal(const char* file, int line, const T& message) {
|
||||
MessageLogger(file, line, GLOG_FATAL).stream() << message;
|
||||
}
|
||||
|
||||
// Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
|
||||
// pointers and smart pointers.
|
||||
template <typename T>
|
||||
T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
|
||||
T& CheckNotNullCommon(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T& t,
|
||||
bool fatal) {
|
||||
if (t == nullptr) {
|
||||
LogMessageFatal(file, line, std::string(names));
|
||||
MessageLogger(file, line, GLOG_FATAL, fatal).stream()
|
||||
<< "Check failed: '" << names << "' must be non NULL. ";
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* CheckNotNull(const char* file, int line, const char* names, T* t) {
|
||||
return CheckNotNullCommon(file, line, names, t);
|
||||
T* CheckNotNull(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T* t,
|
||||
bool fatal) {
|
||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T& CheckNotNull(const char* file, int line, const char* names, T& t) {
|
||||
return CheckNotNullCommon(file, line, names, t);
|
||||
T& CheckNotNull(
|
||||
const char* file,
|
||||
int line,
|
||||
const char* names,
|
||||
T& t,
|
||||
bool fatal) {
|
||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
||||
}
|
||||
} // namespace c10
|
||||
|
||||
@ -136,65 +116,6 @@ static_assert(
|
||||
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
|
||||
#endif // NDEBUG
|
||||
|
||||
#define TORCH_CHECK_OP(val1, val2, op) \
|
||||
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
|
||||
<< (val1) << " vs. " << (val2) << ") "
|
||||
|
||||
// TORCH_CHECK_OP macro definitions
|
||||
#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
|
||||
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
|
||||
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
|
||||
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
|
||||
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
|
||||
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Debug only versions of TORCH_CHECK_OP macros.
|
||||
#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
|
||||
#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
|
||||
#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
|
||||
#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
|
||||
#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
|
||||
#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
|
||||
#else // !NDEBUG
|
||||
// These versions generate no code in optimized mode.
|
||||
#define TORCH_DCHECK_EQ(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, ==)
|
||||
#define TORCH_DCHECK_NE(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, !=)
|
||||
#define TORCH_DCHECK_LE(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, <=)
|
||||
#define TORCH_DCHECK_LT(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, <)
|
||||
#define TORCH_DCHECK_GE(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, >=)
|
||||
#define TORCH_DCHECK_GT(val1, val2) \
|
||||
while (false) \
|
||||
TORCH_CHECK_OP(val1, val2, >)
|
||||
#endif // NDEBUG
|
||||
|
||||
// Check that a pointer is not null.
|
||||
#define TORCH_CHECK_NOTNULL(val) \
|
||||
::c10::CheckNotNull( \
|
||||
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Debug only version of TORCH_CHECK_NOTNULL
|
||||
#define TORCH_DCHECK_NOTNULL(val) \
|
||||
::c10::CheckNotNull( \
|
||||
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
|
||||
#else // !NDEBUG
|
||||
// Optimized version - generates no code.
|
||||
#define TORCH_DCHECK_NOTNULL(val) \
|
||||
while (false) \
|
||||
TORCH_CHECK_NOTNULL(val)
|
||||
#endif // NDEBUG
|
||||
|
||||
// ---------------------- Support for std objects --------------------------
|
||||
// These are adapted from glog to support a limited set of logging capability
|
||||
// for STL objects.
|
||||
|
||||
@ -172,9 +172,9 @@ ignore = [
|
||||
"SIM102", "SIM103", "SIM112", # flake8-simplify code styles
|
||||
"SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason
|
||||
"SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression
|
||||
"SIM110",
|
||||
"SIM110", # Checks for for loops that can be replaced with a builtin function, like any or all.
|
||||
"SIM114", # Combine `if` branches using logical `or` operator
|
||||
"SIM115",
|
||||
"SIM115", # Checks for cases where files are opened without using a context manager.
|
||||
"SIM116", # Disable Use a dictionary instead of consecutive `if` statements
|
||||
"SIM117",
|
||||
"SIM118",
|
||||
@ -184,7 +184,6 @@ ignore = [
|
||||
"TC006",
|
||||
# TODO: Remove Python-3.10 specific suppressions
|
||||
"B905",
|
||||
"UP035",
|
||||
]
|
||||
select = [
|
||||
"B",
|
||||
|
||||
33
setup.py
33
setup.py
@ -630,6 +630,37 @@ def mirror_files_into_torchgen() -> None:
|
||||
raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")
|
||||
|
||||
|
||||
def mirror_inductor_external_kernels() -> None:
|
||||
"""
|
||||
Copy external kernels into Inductor so they are importable.
|
||||
"""
|
||||
paths = [
|
||||
(
|
||||
CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
|
||||
CWD
|
||||
/ "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
|
||||
),
|
||||
]
|
||||
for new_path, orig_path in paths:
|
||||
# Create the dirs involved in new_path if they don't exist
|
||||
if not new_path.exists():
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy the files from the orig location to the new location
|
||||
if orig_path.is_file():
|
||||
shutil.copyfile(orig_path, new_path)
|
||||
continue
|
||||
if orig_path.is_dir():
|
||||
if new_path.exists():
|
||||
# copytree fails if the tree exists already, so remove it.
|
||||
shutil.rmtree(new_path)
|
||||
shutil.copytree(orig_path, new_path)
|
||||
continue
|
||||
raise RuntimeError(
|
||||
"Check the file paths in `mirror_inductor_external_kernels()`"
|
||||
)
|
||||
|
||||
|
||||
# ATTENTION: THIS IS AI SLOP
|
||||
def extract_variant_from_version(version: str) -> str:
|
||||
"""Extract variant from version string, defaulting to 'cpu'."""
|
||||
@ -1615,6 +1646,7 @@ def main() -> None:
|
||||
mirror_files_into_torchgen()
|
||||
if RUN_BUILD_DEPS:
|
||||
build_deps()
|
||||
mirror_inductor_external_kernels()
|
||||
|
||||
(
|
||||
ext_modules,
|
||||
@ -1649,6 +1681,7 @@ def main() -> None:
|
||||
"_inductor/codegen/aoti_runtime/*.cpp",
|
||||
"_inductor/script.ld",
|
||||
"_inductor/kernel/flex/templates/*.jinja",
|
||||
"_inductor/kernel/templates/*.jinja",
|
||||
"_export/serde/*.yaml",
|
||||
"_export/serde/*.thrift",
|
||||
"share/cmake/ATen/*.cmake",
|
||||
|
||||
@ -208,7 +208,7 @@ class _BaseDataSparsiferTestCase(TestCase):
|
||||
assert len(sparsifier1.data_groups) == len(sparsifier2.data_groups)
|
||||
|
||||
state1 = state_dict1["state"]
|
||||
for name in state1.keys():
|
||||
for name in state1:
|
||||
# compare mask
|
||||
assert name in sparsifier2.state
|
||||
assert "mask" in sparsifier2.state[name]
|
||||
|
||||
@ -119,7 +119,7 @@ class TestBaseSparsifier(TestCase):
|
||||
for idx in range(len(sparsifier0.groups)):
|
||||
mg0 = sparsifier0.groups[idx]
|
||||
mg1 = sparsifier1.groups[idx]
|
||||
for key in mg0.keys():
|
||||
for key in mg0:
|
||||
assert key in mg1
|
||||
if key == "module":
|
||||
# We cannot compare modules as they are different
|
||||
|
||||
@ -67,13 +67,13 @@ Tensor sgd_out_of_place(
|
||||
|
||||
void boxed_sgd_out_of_place(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor res = sgd_out_of_place(
|
||||
to<Tensor>(stack[0]),
|
||||
to<Tensor>(stack[1]),
|
||||
float(to<double>(stack[2])),
|
||||
to<double>(stack[3]),
|
||||
to<bool>(stack[4]));
|
||||
torch::stable::detail::to<Tensor>(stack[0]),
|
||||
torch::stable::detail::to<Tensor>(stack[1]),
|
||||
float(torch::stable::detail::to<double>(stack[2])),
|
||||
torch::stable::detail::to<double>(stack[3]),
|
||||
torch::stable::detail::to<bool>(stack[4]));
|
||||
|
||||
stack[0] = from(res);
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
|
||||
@ -89,8 +89,8 @@ Tensor identity(Tensor t) {
|
||||
}
|
||||
|
||||
void boxed_identity(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor res = identity(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
Tensor res = identity(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -108,14 +108,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
Tensor my_abs(Tensor t) {
|
||||
const auto num_args = 1;
|
||||
StableIValue stack[num_args];
|
||||
stack[0] = from(t);
|
||||
stack[0] = torch::stable::detail::from(t);
|
||||
aoti_torch_call_dispatcher("aten::abs", "", stack);
|
||||
return to<Tensor>(stack[0]);
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
void boxed_my_abs(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor tensor_res = my_abs(to<Tensor>(stack[0]));
|
||||
stack[0] = from(tensor_res);
|
||||
Tensor tensor_res = my_abs(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(tensor_res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -132,21 +132,21 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
|
||||
|
||||
auto mf = aoti_torch_memory_format_contiguous_format();
|
||||
|
||||
stack[0] = from(t);
|
||||
stack[1] = from(std::optional(t.scalar_type())); // dtype
|
||||
stack[2] = from(std::nullopt); // layout
|
||||
stack[3] = from(std::optional(device)); // device
|
||||
stack[4] = from(std::optional(false)); // pin_memory
|
||||
stack[5] = from(std::optional(mf)); // memory_format
|
||||
stack[0] = torch::stable::detail::from(t);
|
||||
stack[1] = torch::stable::detail::from(std::optional(t.scalar_type())); // dtype
|
||||
stack[2] = torch::stable::detail::from(std::nullopt); // layout
|
||||
stack[3] = torch::stable::detail::from(std::optional(device)); // device
|
||||
stack[4] = torch::stable::detail::from(std::optional(false)); // pin_memory
|
||||
stack[5] = torch::stable::detail::from(std::optional(mf)); // memory_format
|
||||
|
||||
aoti_torch_call_dispatcher("aten::ones_like", "", stack);
|
||||
|
||||
return to<Tensor>(stack[0]);
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
void boxed_my_ones_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor res = my_ones_like(to<Tensor>(stack[0]), stack[1]);
|
||||
stack[0] = from(res);
|
||||
Tensor res = my_ones_like(torch::stable::detail::to<Tensor>(stack[0]), stack[1]);
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -159,28 +159,28 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
|
||||
std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) {
|
||||
StableIValue stack_exp[1];
|
||||
stack_exp[0] = from(t1);
|
||||
stack_exp[0] = torch::stable::detail::from(t1);
|
||||
aoti_torch_call_dispatcher("aten::exp", "", stack_exp);
|
||||
|
||||
StableIValue stack_neg[1];
|
||||
stack_neg[0] = from(t2);
|
||||
stack_neg[0] = torch::stable::detail::from(t2);
|
||||
aoti_torch_call_dispatcher("aten::neg", "", stack_neg);
|
||||
|
||||
StableIValue stack_is_leaf[1];
|
||||
stack_is_leaf[0] = from(t3);
|
||||
stack_is_leaf[0] = torch::stable::detail::from(t3);
|
||||
aoti_torch_call_dispatcher("aten::is_leaf", "", stack_is_leaf);
|
||||
|
||||
return std::make_tuple(
|
||||
to<Tensor>(stack_exp[0]),
|
||||
to<Tensor>(stack_neg[0]),
|
||||
to<bool>(stack_is_leaf[0]));
|
||||
torch::stable::detail::to<Tensor>(stack_exp[0]),
|
||||
torch::stable::detail::to<Tensor>(stack_neg[0]),
|
||||
torch::stable::detail::to<bool>(stack_is_leaf[0]));
|
||||
}
|
||||
|
||||
void boxed_exp_neg_is_leaf(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto tuple = exp_neg_is_leaf(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<Tensor>(stack[2]));
|
||||
stack[0] = from(std::get<0>(tuple));
|
||||
stack[1] = from(std::get<1>(tuple));
|
||||
stack[2] = from(std::get<2>(tuple));
|
||||
auto tuple = exp_neg_is_leaf(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<Tensor>(stack[2]));
|
||||
stack[0] = torch::stable::detail::from(std::get<0>(tuple));
|
||||
stack[1] = torch::stable::detail::from(std::get<1>(tuple));
|
||||
stack[2] = torch::stable::detail::from(std::get<2>(tuple));
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -193,15 +193,15 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
|
||||
Tensor neg_exp(Tensor t) {
|
||||
StableIValue stack[1];
|
||||
stack[0] = from(t);
|
||||
stack[0] = torch::stable::detail::from(t);
|
||||
aoti_torch_call_dispatcher("aten::exp", "", stack);
|
||||
aoti_torch_call_dispatcher("aten::neg", "", stack);
|
||||
return to<Tensor>(stack[0]);
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
void boxed_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor res = neg_exp(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
Tensor res = neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -214,10 +214,10 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
|
||||
Tensor divide_neg_exp(Tensor t) {
|
||||
StableIValue stack_neg[1];
|
||||
stack_neg[0] = from(t);
|
||||
stack_neg[0] = torch::stable::detail::from(t);
|
||||
|
||||
StableIValue stack_exp[1];
|
||||
stack_exp[0] = from(t);
|
||||
stack_exp[0] = torch::stable::detail::from(t);
|
||||
aoti_torch_call_dispatcher("aten::exp", "", stack_exp);
|
||||
aoti_torch_call_dispatcher("aten::neg", "", stack_neg);
|
||||
|
||||
@ -225,12 +225,12 @@ Tensor divide_neg_exp(Tensor t) {
|
||||
stack_div[0] = stack_neg[0];
|
||||
stack_div[1] = stack_exp[0];
|
||||
aoti_torch_call_dispatcher("aten::divide", "Tensor", stack_div);
|
||||
return to<Tensor>(stack_div[0]);
|
||||
return torch::stable::detail::to<Tensor>(stack_div[0]);
|
||||
}
|
||||
|
||||
void boxed_divide_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
Tensor res = divide_neg_exp(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
Tensor res = divide_neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -246,8 +246,8 @@ bool is_contiguous(Tensor t) {
|
||||
}
|
||||
|
||||
void boxed_is_contiguous(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
bool res = is_contiguous(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
bool res = is_contiguous(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
@ -263,9 +263,9 @@ Tensor my_transpose(Tensor t, int64_t dim0, int64_t dim1) {
|
||||
}
|
||||
|
||||
void boxed_my_transpose(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_transpose(to<Tensor>(stack[0]), to<int64_t>(stack[1]), to<int64_t>(stack[2]));
|
||||
auto res = my_transpose(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<int64_t>(stack[1]), torch::stable::detail::to<int64_t>(stack[2]));
|
||||
|
||||
stack[0] = from(res);
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
Tensor my_empty_like(Tensor t) {
|
||||
@ -273,8 +273,8 @@ Tensor my_empty_like(Tensor t) {
|
||||
}
|
||||
|
||||
void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_empty_like(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
auto res = my_empty_like(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool my_is_cpu(Tensor t) {
|
||||
@ -283,8 +283,8 @@ bool my_is_cpu(Tensor t) {
|
||||
|
||||
|
||||
void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_is_cpu(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
auto res = my_is_cpu(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
Tensor fill_infinity(Tensor t) {
|
||||
@ -296,8 +296,8 @@ void boxed_fill_infinity(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
auto res = fill_infinity(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
auto res = fill_infinity(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
Tensor my_pad(Tensor t) {
|
||||
@ -310,8 +310,8 @@ void boxed_my_pad(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
auto res = my_pad(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
auto res = my_pad(torch::stable::detail::to<Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
Tensor my_narrow(Tensor t, int64_t dim, int64_t start, int64_t length) {
|
||||
@ -323,11 +323,11 @@ void boxed_my_narrow(
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
auto res = my_narrow(
|
||||
to<Tensor>(stack[0]),
|
||||
to<int64_t>(stack[1]),
|
||||
to<int64_t>(stack[2]),
|
||||
to<int64_t>(stack[3]));
|
||||
stack[0] = from(res);
|
||||
torch::stable::detail::to<Tensor>(stack[0]),
|
||||
torch::stable::detail::to<int64_t>(stack[1]),
|
||||
torch::stable::detail::to<int64_t>(stack[2]),
|
||||
torch::stable::detail::to<int64_t>(stack[3]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}

Tensor my_new_empty_dtype_variant(Tensor t) {
@@ -342,8 +342,8 @@ Tensor my_new_empty_dtype_variant(Tensor t) {
}

void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_new_empty_dtype_variant(to<Tensor>(stack[0]));
stack[0] = from(res);
auto res = my_new_empty_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

Tensor my_new_zeros_dtype_variant(Tensor t) {
@@ -352,8 +352,8 @@ Tensor my_new_zeros_dtype_variant(Tensor t) {
}

void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_new_zeros_dtype_variant(to<Tensor>(stack[0]));
stack[0] = from(res);
auto res = my_new_zeros_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
@@ -361,8 +361,8 @@ Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
}

void boxed_my_copy_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
Tensor tensor_res = my_copy_(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<bool>(stack[2]));
stack[0] = from(tensor_res);
Tensor tensor_res = my_copy_(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<bool>(stack[2]));
stack[0] = torch::stable::detail::from(tensor_res);
}

Tensor my_clone(Tensor t) {
@@ -370,8 +370,8 @@ Tensor my_clone(Tensor t) {
}

void boxed_my_clone(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
Tensor tensor_res = my_clone(to<Tensor>(stack[0]));
stack[0] = from(tensor_res);
Tensor tensor_res = my_clone(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(tensor_res);
}


@@ -408,8 +408,8 @@ Tensor my_zero_(Tensor t) {
}

void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_zero_(to<Tensor>(stack[0]));
stack[0] = from(res);
auto res = my_zero_(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

Tensor my_amax(Tensor t) {
@@ -417,8 +417,8 @@ Tensor my_amax(Tensor t) {
}

void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_amax(to<Tensor>(stack[0]));
stack[0] = from(res);
auto res = my_amax(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

Tensor my_amax_vec(Tensor t) {
@@ -426,8 +426,8 @@ Tensor my_amax_vec(Tensor t) {
}

void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_amax_vec(to<Tensor>(stack[0]));
stack[0] = from(res);
auto res = my_amax_vec(torch::stable::detail::to<Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@@ -464,8 +464,8 @@ void boxed_test_default_constructor(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
bool res = test_default_constructor(to<bool>(stack[0]));
stack[0] = from(res);
bool res = test_default_constructor(torch::stable::detail::to<bool>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@@ -478,6 +478,56 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_amax_vec", &boxed_my_amax_vec);
}

std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
}

void boxed_my__foreach_mul(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
// Why is the following NOT torch::stable::detail::to<HeaderOnlyArrayRef<Tensor>>(stack[0])? Because calling `to`
// on a StableIValue means that the result is owning its underlying data now! HeaderOnlyArrayRef
// is not owning, so it cannot safely steward the result of the torch::stable::detail::to<>.
auto res = my__foreach_mul(torch::stable::detail::to<std::vector<Tensor>>(stack[0]), torch::stable::detail::to<std::vector<Tensor>>(stack[1]));
stack[0] = torch::stable::detail::from(res);
}

void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
}

void boxed_my__foreach_mul_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
my__foreach_mul_(torch::stable::detail::to<std::vector<Tensor>>(stack[0]), torch::stable::detail::to<std::vector<Tensor>>(stack[1]));
}

std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
// This function tests that my__foreach_mul can take in std::initializer_lists
// in addition to std::vectors.
Tensor t1_1 = my_clone(t1);
Tensor t1_2 = my_clone(t1);
Tensor t2_1 = my_clone(t2);
Tensor t2_2 = my_clone(t2);
return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
}

void boxed_make_tensor_clones_and_call_foreach(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = make_tensor_clones_and_call_foreach(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]));
stack[0] = torch::stable::detail::from(res);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my__foreach_mul", &boxed_my__foreach_mul);
m.impl("my__foreach_mul_", &boxed_my__foreach_mul_);
m.impl("make_tensor_clones_and_call_foreach", &boxed_make_tensor_clones_and_call_foreach);
}
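The new foreach ops above route through `aoti_torch_call_dispatcher` to the aten foreach kernels, so their results should match `torch._foreach_mul` exactly. A small sketch of that round trip from Python, assuming the extension is importable (the import name is an assumption; the op names come from the schemas registered above):

```python
import torch
import libtorch_agnostic  # hypothetical import path for the test extension

tensors = [torch.rand(4, 4) for _ in range(3)]
others = [torch.rand(4, 4) for _ in range(3)]

# Out-of-place variant returns a new list of tensors.
out = torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
ref = torch._foreach_mul(tensors, others)
for o, r in zip(out, ref):
    torch.testing.assert_close(o, r)

# In-place variant mutates `tensors` and returns nothing.
torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
```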

// Test functions for torch::stable::accelerator APIs

#ifdef LAE_USE_CUDA
@@ -500,8 +550,8 @@ void boxed_test_device_guard(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
int res = test_device_guard(static_cast<int64_t>(to<int64_t>(stack[0])));
stack[0] = from(res);
int res = test_device_guard(static_cast<int64_t>(torch::stable::detail::to<int64_t>(stack[0])));
stack[0] = torch::stable::detail::from(res);
}

int64_t test_device_guard_set_index() {
@@ -520,7 +570,7 @@ void boxed_test_device_guard_set_index(
uint64_t num_args,
uint64_t num_outputs) {
int64_t res = test_device_guard_set_index();
stack[0] = from(res);
stack[0] = torch::stable::detail::from(res);
}

int64_t test_stream(int32_t device_index) {
@@ -536,8 +586,8 @@ void boxed_test_stream(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
int64_t res = test_stream(static_cast<int64_t>(to<int64_t>(stack[0])));
stack[0] = from(res);
int64_t res = test_stream(static_cast<int64_t>(torch::stable::detail::to<int64_t>(stack[0])));
stack[0] = torch::stable::detail::from(res);
}

int64_t test_get_current_device_index() {
@@ -549,7 +599,7 @@ void boxed_test_get_current_device_index(
uint64_t num_args,
uint64_t num_outputs) {
int64_t res = test_get_current_device_index();
stack[0] = from(res);
stack[0] = torch::stable::detail::from(res);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@@ -565,4 +615,5 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("test_stream", &boxed_test_stream);
m.impl("test_get_current_device_index", &boxed_test_get_current_device_index);
}

#endif // LAE_USE_CUDA
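For orientation, the accelerator tests above mirror familiar eager-PyTorch behavior: a device guard temporarily switches the current device, and streams are queryable per device. A rough Python analog of what they check (this is an illustration, not the stable-ABI API itself; it assumes at least one CUDA device):

```python
import torch

if torch.cuda.is_available():
    before = torch.cuda.current_device()
    with torch.cuda.device(0):                 # analogous to the DeviceGuard test
        assert torch.cuda.current_device() == 0
        stream = torch.cuda.current_stream(0)  # analogous to test_stream
        assert stream.device.index == 0
    assert torch.cuda.current_device() == before
```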

@@ -333,3 +333,45 @@ def my_new_zeros_dtype_variant(t) -> Tensor:
Returns: New zeros tensor
"""
return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)


def my__foreach_mul_(tensors, others) -> ():
"""
Updates tensors to be the result of pointwise multiplying with others.

Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)

Returns: nothing, tensors is updated in place.
"""
torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)


def my__foreach_mul(tensors, others) -> list[Tensor]:
"""
Returns a list of tensors that are the results of pointwise multiplying
tensors and others.

Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)

Returns: list of multiplied tensors
"""
return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)


def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
"""
Returns a list of 2 tensors corresponding to the square of the inputs.

Args:
t1: Tensor
t2: Tensor

Returns: list of [t1^2, t2^2]
"""
return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
t1, t2
)

@@ -367,6 +367,57 @@ if not IS_WINDOWS:
self.assertNotEqual(result.data_ptr(), expected.data_ptr())
self.assertEqual(result.stride(), expected.stride())

def test_my__foreach_mul_(self, device):
import libtorch_agnostic

N = 5
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
tensors_c = [t.clone() for t in tensors]
others = [torch.rand(32, 16, device=device) for _ in range(N)]

libtorch_agnostic.ops.my__foreach_mul_(tensors, others)
expected_values = torch._foreach_mul(tensors_c, others)

for tensor_t, expected_t in zip(tensors, expected_values):
self.assertEqual(tensor_t, expected_t)

def test_my__foreach_mul(self, device):
import libtorch_agnostic

N = 5
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
others = [torch.rand(32, 16, device=device) for _ in range(N)]

result = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
expected = torch._foreach_mul(tensors, others)

for result_t, expected_t in zip(result, expected):
self.assertEqual(result_t, expected_t)

def _make_cuda_tensors(prior_mem):
cuda_res = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
self.assertGreater(torch.cuda.memory_allocated(device), prior_mem)

expected = torch._foreach_mul(tensors, others)
for result_t, expected_t in zip(cuda_res, expected):
self.assertEqual(result_t, expected_t)

if tensors[0].is_cuda:
init_mem = torch.cuda.memory_allocated(device)
for _ in range(3):
_make_cuda_tensors(init_mem)
curr_mem = torch.cuda.memory_allocated(device)
self.assertEqual(curr_mem, init_mem)

def test_make_tensor_clones_and_call_foreach(self, device):
import libtorch_agnostic

t1 = torch.rand(2, 5, device=device)
t2 = torch.rand(3, 4, device=device)
result = libtorch_agnostic.ops.make_tensor_clones_and_call_foreach(t1, t2)
self.assertEqual(result[0], t1 * t1)
self.assertEqual(result[1], t2 * t2)

instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)

if __name__ == "__main__":

@@ -1,6 +1,5 @@
# Owner(s): ["module: unknown"]

import os
import tempfile

from backend import get_custom_backend_library_path, Model, to_custom_backend
@@ -41,14 +40,11 @@ class TestCustomBackend(TestCase):
self.test_execute()

# Save and load.
f = tempfile.NamedTemporaryFile(delete=False)
try:
with tempfile.NamedTemporaryFile() as f:
f.close()
torch.jit.save(self.model, f.name)
loaded = torch.jit.load(f.name)
finally:
os.unlink(f.name)
self.model = loaded
self.model = loaded

# Test execution again.
self.test_execute()
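This hunk replaces the manual `delete=False` plus `try/finally os.unlink` cleanup with a context manager. The underlying issue is the same in both versions: `NamedTemporaryFile` keeps the file open, and Windows will not let `torch.jit.save`/`torch.jit.load` reopen it, so the handle is closed before the path is reused. A standalone sketch of the older, explicitly portable pattern being reworked here:

```python
import os
import tempfile

import torch

model = torch.jit.script(torch.nn.Linear(2, 2))

f = tempfile.NamedTemporaryFile(suffix=".pt", delete=False)
try:
    f.close()                      # release the handle so Windows can reopen the path
    torch.jit.save(model, f.name)
    loaded = torch.jit.load(f.name)
finally:
    os.unlink(f.name)              # manual cleanup because delete=False was used
```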
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# Owner(s): ["module: unknown"]
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
@ -144,16 +143,13 @@ def forward(self, arg0_1):
|
||||
# Ideally we would like to not have to manually delete the file, but NamedTemporaryFile
|
||||
# opens the file, and it cannot be opened multiple times in Windows. To support Windows,
|
||||
# close the file after creation and try to remove it manually.
|
||||
file = tempfile.NamedTemporaryFile(delete=False)
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile() as file:
|
||||
file.close()
|
||||
model.save(file.name)
|
||||
loaded = torch.jit.load(file.name)
|
||||
finally:
|
||||
os.unlink(file.name)
|
||||
|
||||
output = loaded.forward(torch.ones(5))
|
||||
self.assertTrue(output.allclose(torch.ones(5) + 1))
|
||||
output = loaded.forward(torch.ones(5))
|
||||
self.assertTrue(output.allclose(torch.ones(5) + 1))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Owner(s): ["module: fsdp"]
|
||||
import functools
|
||||
import os
|
||||
import unittest.mock
|
||||
import unittest
|
||||
|
||||
import torch.distributed as dist
|
||||
from torch._dynamo.test_case import run_tests
|
||||
@ -37,9 +37,9 @@ import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from torch.distributed.fsdp import fully_shard
|
||||
logger = logging.getLogger("torch.distributed._composable.fsdp")
|
||||
logger = logging.getLogger("torch.distributed.fsdp.fully_shard")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
device = {device_type.type}
|
||||
device = '{device_type.type}'
|
||||
torch.manual_seed(0)
|
||||
model = nn.Sequential(*[nn.Linear(4, 4, device=device, bias=False) for _ in range(2)])
|
||||
for layer in model:
|
||||
|
||||
@ -76,7 +76,7 @@ class ReplicateTest(MultiProcessTestCase):
|
||||
store=dist.FileStore(self.file_name, self.world_size),
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_replicate_transformer(self):
|
||||
"""
|
||||
This tests that replicate works on a transformer model with fully_shard and replicate layers
|
||||
@ -126,7 +126,7 @@ class ReplicateTest(MultiProcessTestCase):
|
||||
for parameter in layer.parameters():
|
||||
self.assertEqual(parameter.placements, (Shard(dim=0),))
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_replicate_transformer_managed_modules(self):
|
||||
"""
|
||||
This tests that replicate managed modules works properly. In this test we use a Transformer Module with 3 layers,
|
||||
@ -178,7 +178,7 @@ class ReplicateTest(MultiProcessTestCase):
|
||||
replicate_model = replicate(replicate_model)
|
||||
self.assertEqual(len(_get_managed_modules((replicate_model,))), 21)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_replicate_tp_device_mesh(self):
|
||||
"""
|
||||
This tests that a user can pass in a device mesh to replicate a module
|
||||
@ -206,7 +206,7 @@ class ReplicateTest(MultiProcessTestCase):
|
||||
self.assertEqual(parameter.device_mesh.shape, (2,))
|
||||
self.assertEqual(parameter.placements, (Replicate(),))
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_train_replicate_fsdp(self):
|
||||
"""
|
||||
Tests that replicate_model has the same behavior as original model when training
|
||||
@ -253,7 +253,7 @@ class ReplicateTest(MultiProcessTestCase):
|
||||
self.assertEqual(replicate_loss, loss)
|
||||
check_sharded_parity(self, model, replicate_model)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_train_parity_2d_mlp(self):
|
||||
"""
|
||||
Verifies when a device mesh is passed in, the model has the same behavior as the original model when training
|
||||
|
||||
@ -80,7 +80,7 @@ class TestSACILP(TestCase):
|
||||
# postprocessing due to the fact that for ModTracker, the post backward hook
|
||||
# is not being called for modules whose inputs don't require gradients
|
||||
# TODO: fix this in ModTracker and ensure it does not lead to any perf regression
|
||||
if _ModState.POST_BW not in mod_stats.snapshots.keys():
|
||||
if _ModState.POST_BW not in mod_stats.snapshots:
|
||||
mod_stats.snapshots.setdefault(_ModState.POST_BW, []).append(
|
||||
copy.deepcopy(last_snapshot)
|
||||
)
|
||||
|
||||
@@ -16,7 +16,7 @@ from torch.distributed.argparse_util import check_env, env
class ArgParseUtilTest(unittest.TestCase):
def setUp(self):
# remove any lingering environment variables
for e in os.environ.keys():
for e in os.environ.keys():  # noqa: SIM118
if e.startswith("PET_"):
del os.environ[e]
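The `# noqa: SIM118` kept here (and in the elastic launch tests further down) silences the "iterate the dict, not `.keys()`" lint for a loop that deletes entries while iterating. A defensive variant that sidesteps both the lint and any reliance on `os.environ`'s iteration behavior is to snapshot the names first, as in this sketch:

```python
import os

# Copying the names up front makes it safe to delete while looping.
for name in list(os.environ):
    if name.startswith("PET_"):
        del os.environ[name]
```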
|
||||
|
||||
|
||||
@ -207,7 +207,7 @@ class TestDefaultStager(TestCase):
|
||||
for i, result in enumerate(staged_results):
|
||||
self.assertIsInstance(result, dict)
|
||||
# Verify the result contains the expected keys
|
||||
for key in state_dicts[i].keys():
|
||||
for key in state_dicts[i]:
|
||||
self.assertIn(key, result)
|
||||
|
||||
stager.close()
|
||||
|
||||
@ -299,7 +299,7 @@ class TestDTensorReshardMeshChange(DTensorTestBase):
|
||||
|
||||
@with_comms
|
||||
@with_temp_dir
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_dtensor_checkpoint_with_uneven_shards(self) -> None:
|
||||
"""
|
||||
Saving a dtensor with uneven shards.
|
||||
@ -436,6 +436,7 @@ class TestCheckpointableReshard(DTensorTestBase):
|
||||
|
||||
@with_comms
|
||||
@with_temp_dir
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_uneven_reshard_with_checkpointable_api(self) -> None:
|
||||
"""
|
||||
Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
|
||||
@ -498,6 +499,7 @@ class TestCheckpointableReshard(DTensorTestBase):
|
||||
|
||||
@with_comms
|
||||
@with_temp_dir
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_uneven_reshard_with_dtensor_shards_wrapper_api(self) -> None:
|
||||
"""
|
||||
Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
|
||||
|
||||
@ -60,7 +60,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
self.assertEqual(
|
||||
sorted(state_dict_to_save.keys()), sorted(state_dict_loaded.keys())
|
||||
)
|
||||
for key in state_dict_to_save.keys():
|
||||
for key in state_dict_to_save:
|
||||
self.assertTrue(
|
||||
torch.equal(state_dict_to_save[key], state_dict_loaded[key])
|
||||
)
|
||||
@ -89,7 +89,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
self.assertEqual(
|
||||
sorted(state_dict_to_save.keys()), sorted(state_dict_to_load.keys())
|
||||
)
|
||||
for key in state_dict_to_save.keys():
|
||||
for key in state_dict_to_save:
|
||||
self.assertTrue(
|
||||
torch.equal(state_dict_to_save[key], state_dict_to_load[key])
|
||||
)
|
||||
@ -116,7 +116,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
self.assertEqual(
|
||||
sorted(state_dict_to_save.keys()), sorted(state_dict_loaded.keys())
|
||||
)
|
||||
for key in state_dict_to_save.keys():
|
||||
for key in state_dict_to_save:
|
||||
self.assertTrue(
|
||||
torch.equal(state_dict_to_save[key], state_dict_loaded[key])
|
||||
)
|
||||
@ -156,7 +156,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
self.assertEqual(
|
||||
sorted(state_dict_to_save.keys()), sorted(state_dict_to_load.keys())
|
||||
)
|
||||
for key in state_dict_to_save.keys():
|
||||
for key in state_dict_to_save:
|
||||
self.assertTrue(
|
||||
torch.equal(state_dict_to_save[key], state_dict_to_load[key])
|
||||
)
|
||||
|
||||
@ -18,6 +18,7 @@ from torch.distributed.checkpoint._dedup_save_plans import dedup_save_plans
|
||||
from torch.distributed.checkpoint.api import CheckpointException
|
||||
from torch.distributed.checkpoint.default_planner import (
|
||||
_create_default_local_metadata,
|
||||
_validate_global_plan,
|
||||
create_default_global_save_plan,
|
||||
create_default_local_load_plan,
|
||||
create_default_local_save_plan,
|
||||
@ -28,6 +29,7 @@ from torch.distributed.checkpoint.filesystem import CURRENT_DCP_VERSION
|
||||
from torch.distributed.checkpoint.metadata import (
|
||||
BytesStorageMetadata,
|
||||
ChunkStorageMetadata,
|
||||
Metadata,
|
||||
MetadataIndex,
|
||||
TensorProperties,
|
||||
TensorStorageMetadata,
|
||||
@@ -560,6 +562,32 @@ class TestPlannerHelpers(TestCase):
self.assertTrue(_compare_save_plans(plan2, plan2))


class TestValidateGlobalPlan(TestCase):
def _make_metadata(self, chunks, size):
storage = TensorStorageMetadata(
properties=TensorProperties(dtype=torch.float32),
size=torch.Size(size),
chunks=chunks,
)
return Metadata(state_dict_metadata={"param": storage})

def test_non_overlapping_chunks(self):
chunks = [
ChunkStorageMetadata(offsets=torch.Size([i]), sizes=torch.Size([1]))
for i in range(4)
]
metadata = self._make_metadata(chunks, [4])
self.assertTrue(_validate_global_plan([SavePlan([])], metadata))

def test_detect_overlapping_chunks(self):
chunks = [
ChunkStorageMetadata(offsets=torch.Size([0]), sizes=torch.Size([2])),
ChunkStorageMetadata(offsets=torch.Size([1]), sizes=torch.Size([2])),
]
metadata = self._make_metadata(chunks, [4])
self.assertFalse(_validate_global_plan([SavePlan([])], metadata))
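For the 1-D chunks used in these tests, "overlapping" simply means the half-open intervals [offset, offset + size) intersect. An illustrative check of that condition (not the actual `_validate_global_plan` implementation), mirroring the two cases above:

```python
def chunks_overlap(a_offset: int, a_size: int, b_offset: int, b_size: int) -> bool:
    # Two 1-D chunks overlap when their [offset, offset + size) ranges intersect.
    return a_offset < b_offset + b_size and b_offset < a_offset + a_size

# Four unit-size chunks at offsets 0..3 never overlap.
assert not any(
    chunks_overlap(i, 1, j, 1) for i in range(4) for j in range(4) if i != j
)
# Chunks [0, 2) and [1, 3) do overlap.
assert chunks_overlap(0, 2, 1, 2)
```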
|
||||
|
||||
|
||||
class TestLoadPlanner(TestCase):
|
||||
@with_temp_dir
|
||||
def test_strict(self):
|
||||
|
||||
@ -769,7 +769,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
model_state_dict3 = copy.deepcopy(model_state_dict3)
|
||||
self.assertEqual(len(model_state_dict2), 2)
|
||||
self.assertEqual(len(model_state_dict3), 2)
|
||||
for key in model_state_dict3.keys():
|
||||
for key in model_state_dict3:
|
||||
full_fqn = f"l.{key}"
|
||||
value1 = model_state_dict1[full_fqn]
|
||||
value2 = model_state_dict2[full_fqn]
|
||||
@ -886,7 +886,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
self.assertEqual(cpu_model_value, meta_model_value)
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_setting_meta_device_model_broadcasting_and_memory(self) -> None:
|
||||
# This test verifies that we can set model state dict by a meta device model
|
||||
# With the correlated changes in state_dict, meta device model should be accepted
|
||||
|
||||
@ -587,9 +587,7 @@ class TestFSDPStateDict(FSDPTest):
|
||||
model, cpu_offload.offload_params, fp16
|
||||
)
|
||||
|
||||
ignore_keys = [
|
||||
k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k
|
||||
]
|
||||
ignore_keys = [k for k in fsdp_state_dict if NON_ROOT_FSDP_PREFIX in k]
|
||||
|
||||
self._validate_state_dict_contents(
|
||||
model,
|
||||
@ -910,7 +908,7 @@ class TestFSDPStateDict(FSDPTest):
|
||||
with sd_mgr:
|
||||
fsdp_state_dict = model.state_dict()
|
||||
|
||||
ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k]
|
||||
ignore_keys = [k for k in fsdp_state_dict if NON_ROOT_FSDP_PREFIX in k]
|
||||
self._validate_state_dict_contents(
|
||||
model,
|
||||
fsdp_state_dict,
|
||||
@ -959,9 +957,7 @@ class TestFSDPStateDict(FSDPTest):
|
||||
# Full name of linear_skip param tensors in SkipModel, as would be
|
||||
# stored in checkpoint.
|
||||
linear_skip_tensor_names = [
|
||||
k
|
||||
for k in dict(module.named_parameters()).keys()
|
||||
if LINEAR_SKIP in k
|
||||
k for k in dict(module.named_parameters()) if LINEAR_SKIP in k
|
||||
]
|
||||
# skip SkipModule
|
||||
linear_skip = getattr(module, LINEAR_SKIP)
|
||||
|
||||
@ -137,7 +137,7 @@ class ElasticLaunchTest(unittest.TestCase):
|
||||
self.test_dir = tempfile.mkdtemp()
|
||||
|
||||
# remove any lingering environment variables.
|
||||
for env in os.environ.keys():
|
||||
for env in os.environ.keys(): # noqa: SIM118
|
||||
if env.startswith("PET_"):
|
||||
del os.environ[env]
|
||||
|
||||
|
||||
@ -69,7 +69,7 @@ class ElasticLaunchTest(TestCase):
|
||||
self.test_dir = tempfile.mkdtemp()
|
||||
|
||||
# remove any lingering environment variables
|
||||
for env in os.environ.keys():
|
||||
for env in os.environ.keys(): # noqa: SIM118
|
||||
if env.startswith("PET_"):
|
||||
del os.environ[env]
|
||||
|
||||
|
||||
@ -39,6 +39,7 @@ from torch.nn.modules.loss import MSELoss
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinuousTest,
|
||||
requires_accelerator_dist_backend,
|
||||
skip_if_lt_x_gpu,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
check_leaked_tensors,
|
||||
@ -231,6 +232,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [_ScheduleForwardOnly])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_forward_only(self, ScheduleClass):
|
||||
mod, mod_ref, x, _, _ = setup_models_and_data(self.config)
|
||||
x_clone = x.clone()
|
||||
@ -274,6 +276,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
ScheduleInterleavedZeroBubble,
|
||||
],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_eval_inference_mode(self, ScheduleClass):
|
||||
num_microbatches = 4
|
||||
if ScheduleClass in [
|
||||
@ -351,6 +354,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
ScheduleInterleavedZeroBubble,
|
||||
],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_return_output(self, ScheduleClass):
|
||||
num_microbatches = 4
|
||||
if ScheduleClass in [
|
||||
@ -406,6 +410,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_multi_iter(self, ScheduleClass):
|
||||
mod, _, x, target, loss_fn = setup_models_and_data(self.config)
|
||||
chunks = 4
|
||||
@ -429,6 +434,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_kwargs_with_tracer(self, ScheduleClass):
|
||||
mod = ModelWithKwargs(d_hid, splits=self.world_size)
|
||||
mod.to(self.device)
|
||||
@ -481,6 +487,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_grad_with_tracer(self, ScheduleClass):
|
||||
mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)
|
||||
|
||||
@ -523,6 +530,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
|
||||
@parametrize("shape_inference", [True, False])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_grad_with_manual(self, ScheduleClass, shape_inference):
|
||||
mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)
|
||||
|
||||
@ -586,6 +594,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
ScheduleInterleavedZeroBubble,
|
||||
],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_grad_with_manual_interleaved(self, ScheduleClass):
|
||||
stages_per_rank = 2
|
||||
n_stages = stages_per_rank * self.world_size
|
||||
@ -650,6 +659,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_schedule_with_weight_update_mlp_e2e(self, ScheduleClass):
|
||||
stages_per_rank = 2
|
||||
n_stages = stages_per_rank * self.world_size
|
||||
@ -736,6 +746,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
"schedule_class",
|
||||
[ScheduleZBVZeroBubble, ScheduleDualPipeV],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_v_shape_schedules(self, schedule_class):
|
||||
n_stages = 8
|
||||
rank_stages = {0: [0, 7], 1: [1, 6], 2: [2, 5], 3: [3, 4]}
|
||||
@ -780,6 +791,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_custom_function_callback(self):
|
||||
"""Test the custom function callback functionality with _PipelineScheduleRuntime."""
|
||||
n_stages = 8
|
||||
@ -979,6 +991,7 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
"ScheduleClass",
|
||||
[ScheduleInterleavedZeroBubble, ScheduleInterleaved1F1B],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_zero_bubble_with_model_kwargs(self, ScheduleClass):
|
||||
stages_per_rank = 2
|
||||
n_stages = stages_per_rank * self.world_size
|
||||
@ -1072,6 +1085,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
|
||||
"schedule_class",
|
||||
[ScheduleVShaped, ScheduleUnbalanced],
|
||||
)
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_non_symmetric_stage_ids(self, schedule_class):
|
||||
n_stages = schedule_class.n_stages
|
||||
rank_stages = schedule_class.rank_stages
|
||||
@ -1121,6 +1135,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleWithReorderedB])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_pipeline_schedule_runtime_custom_sched(self, ScheduleClass):
|
||||
n_stages = 2
|
||||
stages_per_rank = 1
|
||||
@ -1181,6 +1196,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize("ScheduleClass", [ScheduleWithW])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_schedule_with_native_zero_bubble(self, ScheduleClass):
|
||||
n_stages = ScheduleClass.n_stages
|
||||
num_microbatches = ScheduleClass.num_microbatches
|
||||
|
||||
@ -464,6 +464,25 @@ def forward(self, b_parametrizations_buffer_original0, x):
|
||||
run(g, 64, 8)
|
||||
self.assertEqual(cnt.frame_count, 2)
|
||||
|
||||
def test_dtensor_requires_grad_recompile(self):
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))

@torch.compile(backend=cnt, fullgraph=True)
def f(x):
y = x * x
return y.to_local()

full_x = torch.randn(8, 8, requires_grad=False)
x = distribute_tensor(full_x, mesh, [Shard(0)])
f(x)

full_x = torch.randn(8, 8, requires_grad=True)
x = distribute_tensor(full_x, mesh, [Shard(0)])
f(x)

self.assertEqual(cnt.frame_count, 2)
|
||||
|
||||
def test_dtensor_attribute_access_on_intermediate(self):
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
|
||||
|
||||
|
||||
@ -535,6 +535,19 @@ class DTensorExportTest(TestCase):
|
||||
|
||||
self.assertEqual(fn(z), gm(z)[0])
|
||||
|
||||
def test_dtensor_data_dependent_index(self):
|
||||
device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
|
||||
|
||||
class Foo(torch.nn.Module):
|
||||
def forward(self, x, y):
|
||||
return x[y]
|
||||
|
||||
x = torch.randn(10)
|
||||
y = torch.randint(1, (10,)).bool()
|
||||
x_dt = distribute_tensor(x, device_mesh, placements=[Replicate()])
|
||||
y_dt = distribute_tensor(y, device_mesh, placements=[Replicate()])
|
||||
_dynamo_graph_capture_for_export(Foo())(x_dt, y_dt)
|
||||
|
||||
|
||||
instantiate_parametrized_tests(DTensorExportTest)
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ from torch.distributed.tensor.parallel import (
|
||||
RowwiseParallel,
|
||||
SequenceParallel,
|
||||
)
|
||||
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
@ -764,6 +765,7 @@ class DistMathOpsTest(DTensorTestBase):
|
||||
self.assertEqual(grad1_norm.device_mesh, mesh_y)
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_foreach_add_different_mesh(self):
|
||||
mesh_shape = (2, self.world_size // 2)
|
||||
mesh_2d = init_device_mesh(
|
||||
|
||||
@ -577,7 +577,7 @@ class DistTensorReplicateStrategyRegistrationTest(DTensorTestBase):
|
||||
self.assertEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
torch.ops.c10d_functional.all_gather_into_tensor: 4,
|
||||
torch.ops.c10d_functional.all_gather_into_tensor: self.world_size,
|
||||
},
|
||||
)
|
||||
expected_cost = [
|
||||
|
||||
@ -54,6 +54,7 @@ def apply_reordering_and_get_graph(graph, out_li) -> None:
|
||||
"max_compute_pre_fetch",
|
||||
"custom_runtime_estimation",
|
||||
"insert_overlap_deps",
|
||||
"collective_estimator",
|
||||
)
|
||||
for key in config_keys:
|
||||
if (val := getattr(dist_opts, key)) is not None:
|
||||
@ -943,6 +944,50 @@ class TestComputeCommReorderingBucketing(TestComputeCommReorderingMultiProc):
|
||||
correct = func(inputs_a, inputs_b, ranks=ranks)
|
||||
self.assertTrue(same(out, correct))
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
def test_collective_benchmarking_with_real_pg(self):
|
||||
"""Test collective benchmarking with real process group (falls back on fake)."""
|
||||
|
||||
def func(a):
|
||||
# Test all three collective types with 8x8 (power of 2 size = 256 elements = 1024 bytes for fp32)
|
||||
ar = _functional_collectives.all_reduce(a, "sum", "0")
|
||||
ag = _functional_collectives.all_gather_tensor(
|
||||
a, 0, list(range(self.world_size))
|
||||
)
|
||||
rs = _functional_collectives.reduce_scatter_tensor(a, "sum", 0, "0")
|
||||
|
||||
b = torch.matmul(a, a)
|
||||
c = torch.matmul(ar, b)
|
||||
return c.sum() + ag.sum() + rs.sum()
|
||||
|
||||
patches = {
|
||||
**get_patches(),
|
||||
"aten_distributed_optimizations.collective_estimator": "benchmark",
|
||||
"aten_distributed_optimizations.custom_runtime_estimation": None, # Remove custom estimation so benchmarking happens
|
||||
}
|
||||
|
||||
with _dynamo_dist_per_rank_init(
|
||||
self.rank,
|
||||
self.world_size,
|
||||
self.backend(device_type),
|
||||
fake_pg=not at_least_x_gpu(2),
|
||||
):
|
||||
inputs = torch.ones(8, 8, dtype=torch.float, device=device_type) + self.rank
|
||||
|
||||
with torch._inductor.config.patch(patches):
|
||||
compiled = torch.compile(func)
|
||||
out, aten_graph_str = run_and_get_aten_graph(compiled, inputs)
|
||||
|
||||
# Verify all three collective types are present
|
||||
FileCheck().check("all_reduce").check("all_gather").check(
|
||||
"reduce_scatter"
|
||||
).run(aten_graph_str)
|
||||
|
||||
# Test passes if compilation succeeded with benchmarking enabled
|
||||
# Cache verification is tricky due to multiprocess test setup
|
||||
correct = func(inputs)
|
||||
self.assertTrue(same(out, correct))
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@torch._inductor.config.patch(get_bucket_patches())
|
||||
def test_multidtype_bucketing(self):
|
||||
|
||||
@@ -485,7 +485,7 @@ elif TEST_XPU:
def exit_if_lt_x_accelerators(x):
if torch.accelerator.is_available():
if torch.accelerator.device_count() < x:
sys.exit(TEST_SKIPS[f"multi-accelerator-{x}"].exit_code)
sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
|
||||
|
||||
|
||||
def with_comms(func=None):
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
# Owner(s): ["module: dynamo"]
|
||||
# flake8: noqa: B950
|
||||
# flake8: noqa: E731
|
||||
import contextlib
|
||||
import copy
|
||||
import functools
|
||||
@ -15,7 +17,11 @@ import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
from functorch.compile import min_cut_rematerialization_partition
|
||||
from torch._dynamo.backends.common import aot_autograd
|
||||
from torch._dynamo.testing import CompileCounterWithBackend
|
||||
from torch._dynamo.testing import (
|
||||
AotEagerAndRecordGraphs,
|
||||
CompileCounterWithBackend,
|
||||
normalize_gm,
|
||||
)
|
||||
from torch._higher_order_ops.wrap import tag_activation_checkpoint
|
||||
from torch.testing._internal.common_device_type import instantiate_device_type_tests
|
||||
from torch.testing._internal.common_utils import IS_WINDOWS, skipIfHpu
|
||||
@ -1649,6 +1655,43 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
|
||||
|
||||
self.assertEqual(opt_fn(x), fn(x))
|
||||
|
||||
def test_return_same_element_twice(self):
|
||||
def gn(x):
|
||||
y = torch.sin(x)
|
||||
return y, y
|
||||
|
||||
def fn(x):
|
||||
return torch.utils.checkpoint.checkpoint(gn, x, use_reentrant=True)
|
||||
|
||||
x = torch.randn(4, 4, requires_grad=True)
|
||||
ref = fn(x)
|
||||
|
||||
backend = AotEagerAndRecordGraphs()
|
||||
opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
|
||||
res = opt_fn(x)
|
||||
self.assertEqual(ref[0], res[0])
|
||||
self.assertEqual(ref[1], res[1])
|
||||
|
||||
self.assertExpectedInline(
|
||||
normalize_gm(backend.graphs[0].print_readable(print_output=False)),
|
||||
"""\
|
||||
class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[4, 4]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
wrap_body_0 = self.wrap_body_0
|
||||
tag_activation_checkpoint = torch.ops.higher_order.tag_activation_checkpoint(wrap_body_0, l_x_, use_reentrant = True); wrap_body_0 = l_x_ = None
|
||||
getitem: "f32[4, 4]" = tag_activation_checkpoint[0]
|
||||
getitem_1: "f32[4, 4]" = tag_activation_checkpoint[1]; tag_activation_checkpoint = None
|
||||
return (getitem, getitem_1)
|
||||
|
||||
class wrap_body_0(torch.nn.Module):
|
||||
def forward(self, l_x_: "f32[4, 4]"):
|
||||
y: "f32[4, 4]" = torch.sin(l_x_); l_x_ = None
|
||||
return (y, y)
|
||||
""",
|
||||
)
|
||||
|
||||
@torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
|
||||
def test_nonlocal_mutation(self):
|
||||
counter = 0
|
||||
@ -1672,6 +1715,114 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
|
||||
# The mutation is not reapplied in the backward because the flag was on.
|
||||
self.assertEqual(counter, 1)
|
||||
|
||||
@torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
|
||||
def test_nonlocal_list_mutation(self):
|
||||
def gn(x, z):
|
||||
out = x.sin()
|
||||
z.append(out)
|
||||
return torch.cos(torch.sin(torch.matmul(x, x) @ x)), out
|
||||
|
||||
def fn(x):
|
||||
z = []
|
||||
|
||||
out1, out2 = torch.utils.checkpoint.checkpoint(
|
||||
gn,
|
||||
x,
|
||||
z,
|
||||
use_reentrant=False,
|
||||
)
|
||||
|
||||
return out1, z[0]
|
||||
|
||||
x = torch.randn(4, 4, requires_grad=True)
|
||||
ref = fn(x)
|
||||
|
||||
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
|
||||
res = opt_fn(x)
|
||||
self.assertEqual(ref[0], res[0])
|
||||
self.assertEqual(ref[1], res[1])
|
||||
|
||||
@torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
|
||||
def test_nonlocal_list_mutation_hidden(self):
|
||||
def gn(x, z):
|
||||
o = torch.matmul(x, x) @ x
|
||||
out = x.sin()
|
||||
z.append(out)
|
||||
return torch.cos(torch.sin(o)), torch.sin(x)
|
||||
|
||||
def fn(x):
|
||||
z = []
|
||||
|
||||
outs = torch.utils.checkpoint.checkpoint(
|
||||
gn,
|
||||
x,
|
||||
z,
|
||||
use_reentrant=False,
|
||||
)
|
||||
out1 = outs[0]
|
||||
# Check that the extra output pytree handling is done properly
|
||||
out2 = outs[-1]
|
||||
|
||||
return out1 + out2, z[0]
|
||||
|
||||
x = torch.randn(4, 4, requires_grad=True)
|
||||
ref = fn(x)
|
||||
|
||||
backend = AotEagerAndRecordGraphs()
|
||||
opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
|
||||
res = opt_fn(x)
|
||||
self.assertEqual(ref[0], res[0])
|
||||
self.assertEqual(ref[1], res[1])
|
||||
|
||||
self.assertExpectedInline(
|
||||
normalize_gm(backend.graphs[0].print_readable(print_output=False)),
|
||||
"""\
|
||||
class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[4, 4]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
wrap_body_0 = self.wrap_body_0
|
||||
tag_activation_checkpoint = torch.ops.higher_order.tag_activation_checkpoint(wrap_body_0, l_x_, use_reentrant = False); wrap_body_0 = l_x_ = None
|
||||
out1: "f32[4, 4]" = tag_activation_checkpoint[0]
|
||||
out2: "f32[4, 4]" = tag_activation_checkpoint[1]
|
||||
getitem_4: "f32[4, 4]" = tag_activation_checkpoint[4]; tag_activation_checkpoint = None
|
||||
|
||||
add: "f32[4, 4]" = out1 + out2; out1 = out2 = None
|
||||
return (add, getitem_4)
|
||||
|
||||
class wrap_body_0(torch.nn.Module):
|
||||
def forward(self, l_x_: "f32[4, 4]"):
|
||||
matmul: "f32[4, 4]" = torch.matmul(l_x_, l_x_)
|
||||
o: "f32[4, 4]" = matmul @ l_x_
|
||||
|
||||
out: "f32[4, 4]" = l_x_.sin()
|
||||
|
||||
sin_1: "f32[4, 4]" = torch.sin(o)
|
||||
child: "f32[4, 4]" = torch.cos(sin_1)
|
||||
child_1: "f32[4, 4]" = torch.sin(l_x_); l_x_ = None
|
||||
return (child, child_1, matmul, o, out, sin_1)
|
||||
""",
|
||||
)
|
||||
|
||||
self.assertExpectedInline(
|
||||
normalize_gm(backend.fw_graphs[0].print_readable(print_output=False)),
|
||||
"""\
|
||||
class GraphModule(torch.nn.Module):
|
||||
def forward(self, primals_1: "f32[4, 4]"):
|
||||
mm: "f32[4, 4]" = torch.ops.aten.mm.default(primals_1, primals_1)
|
||||
mm_1: "f32[4, 4]" = torch.ops.aten.mm.default(mm, primals_1); mm = None
|
||||
|
||||
sin: "f32[4, 4]" = torch.ops.aten.sin.default(primals_1)
|
||||
|
||||
sin_1: "f32[4, 4]" = torch.ops.aten.sin.default(mm_1); mm_1 = None
|
||||
cos: "f32[4, 4]" = torch.ops.aten.cos.default(sin_1); sin_1 = None
|
||||
sin_2: "f32[4, 4]" = torch.ops.aten.sin.default(primals_1)
|
||||
|
||||
add: "f32[4, 4]" = torch.ops.aten.add.Tensor(cos, sin_2); cos = sin_2 = None
|
||||
return (add, sin, primals_1)
|
||||
""",
|
||||
)
|
||||
|
||||
|
||||
devices = ["cuda", "hpu"]
|
||||
instantiate_device_type_tests(
|
||||
|
||||
@ -2109,6 +2109,89 @@ Detected recompile when torch.compile stance is 'fail_on_recompile'. filename: '
|
||||
with self.assertRaises(Unsupported):
|
||||
outer_f2(inp)
|
||||
|
||||
def test_disable_recursive_flags(self):
|
||||
class SimpleLinear(torch.nn.Module):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.layer0 = torch.nn.Linear(4, 4)
|
||||
|
||||
def forward(self, inp):
|
||||
return self.layer0(torch.sigmoid(inp))
|
||||
|
||||
class SimpleModel(torch.nn.Module):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.layer0 = SimpleLinear()
|
||||
self.layer1 = torch.nn.Linear(4, 4)
|
||||
|
||||
def forward(self, inp):
|
||||
z = self.layer0(torch.sin(inp))
|
||||
return self.layer1(z)
|
||||
|
||||
for recursive_flag in [True, False]:
|
||||
model = SimpleModel()
|
||||
other_model = SimpleModel()
|
||||
|
||||
model.forward = torch._dynamo.disable(
|
||||
model.forward,
|
||||
recursive=recursive_flag,
|
||||
)
|
||||
self.assertEqual(
|
||||
torch._dynamo.is_dynamo_disable_recursive(model.forward),
|
||||
recursive_flag,
|
||||
)
|
||||
|
||||
other_model = torch._dynamo.disable(other_model, recursive=recursive_flag)
|
||||
self.assertEqual(
|
||||
torch._dynamo.is_dynamo_disable_recursive(
|
||||
other_model.forward
|
||||
if isinstance(other_model, torch.nn.Module)
|
||||
else other_model
|
||||
),
|
||||
recursive_flag,
|
||||
)
|
||||
|
||||
# check the model is compilable
|
||||
torch.compile(model)
|
||||
torch.compile(other_model)
|
||||
|
||||
def test_dynamo_disable_annotations(self):
|
||||
class SimpleModel(torch.nn.Module):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.register_buffer("buffer", torch.rand(2, 2))
|
||||
|
||||
@torch._dynamo.disable()
|
||||
def f1(self, x) -> torch.Tensor:
|
||||
return x + self.buffer + 1
|
||||
|
||||
@torch._dynamo.disable()
|
||||
def f2(self, x) -> torch.Tensor:
|
||||
return x + self.buffer + 2
|
||||
|
||||
def forward(self, x) -> torch.Tensor:
|
||||
return self.f1(x) + self.f2(x)
|
||||
|
||||
model = SimpleModel()
|
||||
inp = torch.rand(2, 2)
|
||||
with torch.fx.traceback.preserve_node_meta():
|
||||
exported_model = torch.export.export(model, (inp,))
|
||||
graph = exported_model.graph_module.graph
|
||||
found_f1 = False
|
||||
found_f2 = False
|
||||
for node in graph.nodes:
|
||||
if "custom" in node.meta:
|
||||
if "_torchdynamo_disable_method" in node.meta["custom"]:
|
||||
if node.meta["custom"]["_torchdynamo_disable_method"] == "f1":
|
||||
found_f1 = True
|
||||
elif node.meta["custom"]["_torchdynamo_disable_method"] == "f2":
|
||||
found_f2 = True
|
||||
self.assertTrue(found_f1)
|
||||
self.assertTrue(found_f2)
|
||||
model.forward = torch._dynamo.disable(model.forward, recursive=False)
|
||||
with self.assertRaises(RuntimeError):
|
||||
exported_model = torch.export.export(model, (inp,))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
@ -422,34 +422,41 @@ from user code:
|
||||
import optree
|
||||
|
||||
@torch.compile(backend="eager")
|
||||
def fn(x):
|
||||
d = {"a": 1}
|
||||
optree.tree_flatten_with_path(d)
|
||||
return torch.sin(x)
|
||||
|
||||
def post_munge(s):
|
||||
s = re.sub(
|
||||
r"optree\.\S*\.flatten_with_path",
|
||||
"optree.<path>.flatten_with_path",
|
||||
s,
|
||||
)
|
||||
return re.sub(
|
||||
r"qualname: \S*flatten_with_path",
|
||||
"qualname: <path>.flatten_with_path",
|
||||
s,
|
||||
def fn1(x):
|
||||
tree = {"a": x, "b": (x - 1, 2 * x)}
|
||||
sin, cos = optree.tree_transpose_map(
|
||||
lambda t: (torch.sin(t), torch.cos(t)),
|
||||
tree,
|
||||
)
|
||||
return sin, cos
|
||||
|
||||
fn(torch.randn(4))
|
||||
self.assertEqual(len(counters["graph_break"]), 1)
|
||||
fn1(torch.randn(4))
|
||||
self.assertEqual(len(counters["graph_break"]), 0)
|
||||
|
||||
@torch.compile(backend="eager")
|
||||
def fn2(x):
|
||||
spec = optree.treespec_deque([])
|
||||
return spec, x
|
||||
|
||||
fn2(torch.randn(4))
|
||||
self.assertGreaterEqual(len(counters["graph_break"]), 1)
|
||||
first_graph_break = next(iter(counters["graph_break"].keys()))
|
||||
|
||||
def post_munge(string):
|
||||
return re.sub(
|
||||
r"(optree\.|qualname: )\S*(\.make_from_collection)",
|
||||
r"\1<path>\2",
|
||||
string,
|
||||
)
|
||||
|
||||
self.assertExpectedInline(
|
||||
post_munge(first_graph_break),
|
||||
"""\
|
||||
Attempted to call function marked as skipped
|
||||
Explanation: Dynamo cannot trace optree C/C++ function optree.<path>.flatten_with_path.
|
||||
Explanation: Dynamo cannot trace optree C/C++ function optree.<path>.make_from_collection.
|
||||
Hint: Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py
|
||||
|
||||
Developer debug context: module: optree._C, qualname: <path>.flatten_with_path, skip reason: <missing reason>
|
||||
Developer debug context: module: optree._C, qualname: <path>.make_from_collection, skip reason: <missing reason>
|
||||
|
||||
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0007.html""",
|
||||
)
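The graph-break message above points users at `torch.utils._pytree` as the traceable alternative to the skipped optree C/C++ functions. A small sketch of that route, assuming a dict-of-tensors tree like the one in `fn1`; it should compile without a graph break since the pytree code is pure Python:

```python
import torch
import torch.utils._pytree as pytree


@torch.compile(backend="eager", fullgraph=True)
def fn(x):
    tree = {"a": x, "b": (x - 1, 2 * x)}
    # tree_map applies the function to every leaf and rebuilds the structure.
    return pytree.tree_map(torch.sin, tree)


out = fn(torch.randn(4))
assert isinstance(out, dict) and isinstance(out["b"], tuple)
```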
|
||||
@ -1043,7 +1050,7 @@ Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especiall
|
||||
msg = re.sub(r"line (\d+)", "line N", msg)
|
||||
msg = re.sub(
|
||||
r"""(?s)Traceback \(most recent call last\):.*
|
||||
File "exc.py", line N, in unimplemented_v2
|
||||
File "exc.py", line N, in unimplemented
|
||||
raise Unsupported\(msg\)""",
|
||||
"<Internal traceback>\n",
|
||||
msg,
|
||||
|
||||
@ -39,7 +39,10 @@ from torch.testing._internal.common_utils import (
|
||||
)
|
||||
from torch.testing._internal.hop_db import hop_db
|
||||
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
def count_ops(gm, args, freq, op):
|
||||
@ -6889,7 +6892,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
|
||||
fn, backend, x, y, skip_check=True
|
||||
) # dropout decomp is known to diverge with eager
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._functorch.config.patch(functionalize_rng_ops=True)
|
||||
def test_fallback(self):
|
||||
def gn(x, y):
|
||||
|
||||
@ -861,7 +861,7 @@ TRACE FX call mul from test_logging.py:N in fn (LoggingTests.test_trace_call_pre
|
||||
def test_logs_out(self):
|
||||
import tempfile
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
||||
with tempfile.NamedTemporaryFile(delete=True) as tmp:
|
||||
file_path = _as_posix_path(tmp.name)
|
||||
"""
|
||||
NamedTemporaryFile will include a file open operation.
|
||||
@ -888,10 +888,6 @@ fn(torch.randn(5))
|
||||
file_path, encoding="utf-8"
|
||||
) as fd: # encoding file to UTF-8 for Windows.
|
||||
lines = fd.read()
|
||||
fd.close()
|
||||
os.remove(
|
||||
file_path
|
||||
) # Delete temp file manually, due to setup NamedTemporaryFile as delete=False.
|
||||
orig_maxDiff = unittest.TestCase.maxDiff
|
||||
unittest.TestCase.maxDiff = None
|
||||
try:
|
||||
@ -988,6 +984,7 @@ exclusions = {
|
||||
"hierarchical_compile",
|
||||
"compute_dependencies",
|
||||
"annotation",
|
||||
"node_runtime_estimation",
|
||||
}
|
||||
for name in torch._logging._internal.log_registry.artifact_names:
|
||||
if name not in exclusions:
|
||||
|
||||
@ -742,11 +742,14 @@ class TestExport(TestCase):
|
||||
self.assertExpectedInline(
|
||||
str(custom_metadata),
|
||||
"""\
|
||||
('call_function', 'cat', {'moo': 0})
|
||||
('call_function', 'item', {'moo': 0})
|
||||
('call_function', 'ge_1', {'moo': 0})
|
||||
('call_function', '_assert_scalar_default', {'moo': 0})
|
||||
('call_function', 'mul', {'moo': 0})""",
|
||||
('placeholder', 'x', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace'})
|
||||
('placeholder', 'y', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace'})
|
||||
('call_function', 'cat', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace', 'moo': 0})
|
||||
('call_function', 'item', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace', 'moo': 0})
|
||||
('call_function', 'ge_1', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace', 'moo': 0})
|
||||
('call_function', '_assert_scalar_default', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace', 'moo': 0})
|
||||
('call_function', 'mul', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace', 'moo': 0})
|
||||
('output', 'output', {'_torchdynamo_disable': True, '_torchdynamo_disable_recursive': True, '_torchdynamo_disable_method': 'dispatch_trace'})""",
|
||||
)
|
||||
|
||||
@requires_gpu
|
||||
@ -1221,8 +1224,14 @@ graph():
|
||||
%p_block_linear2_bias : [num_users=1] = placeholder[target=p_block_linear2_bias]
|
||||
%x : [num_users=1] = placeholder[target=x]
|
||||
%wrap_body0 : [num_users=1] = get_attr[target=wrap_body0]
|
||||
%tag_activation_checkpoint : [num_users=1] = call_function[target=torch.ops.higher_order.tag_activation_checkpoint](args = (%wrap_body0, %x, %p_block_linear1_weight, %p_block_linear1_bias, %p_block_linear2_weight, %p_block_linear2_bias), kwargs = {})
|
||||
%tag_activation_checkpoint : [num_users=7] = call_function[target=torch.ops.higher_order.tag_activation_checkpoint](args = (%wrap_body0, %x, %p_block_linear1_weight, %p_block_linear1_bias, %p_block_linear2_weight, %p_block_linear2_bias), kwargs = {})
|
||||
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 0), kwargs = {})
|
||||
%getitem_1 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 1), kwargs = {})
|
||||
%getitem_2 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 2), kwargs = {})
|
||||
%getitem_3 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 3), kwargs = {})
|
||||
%getitem_4 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 4), kwargs = {})
|
||||
%getitem_5 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 5), kwargs = {})
|
||||
%getitem_6 : [num_users=0] = call_function[target=operator.getitem](args = (%tag_activation_checkpoint, 6), kwargs = {})
|
||||
return (getitem,)""",
|
||||
)
|
||||
|
||||
@ -1231,14 +1240,14 @@ graph():
|
||||
"""\
|
||||
graph():
|
||||
%arg0_1 : [num_users=1] = placeholder[target=arg0_1]
|
||||
%arg1_1 : [num_users=1] = placeholder[target=arg1_1]
|
||||
%arg2_1 : [num_users=1] = placeholder[target=arg2_1]
|
||||
%arg3_1 : [num_users=1] = placeholder[target=arg3_1]
|
||||
%arg4_1 : [num_users=1] = placeholder[target=arg4_1]
|
||||
%linear : [num_users=1] = call_function[target=torch.ops.aten.linear.default](args = (%arg0_1, %arg1_1, %arg2_1), kwargs = {})
|
||||
%relu : [num_users=1] = call_function[target=torch.ops.aten.relu.default](args = (%linear,), kwargs = {})
|
||||
%arg1_1 : [num_users=2] = placeholder[target=arg1_1]
|
||||
%arg2_1 : [num_users=2] = placeholder[target=arg2_1]
|
||||
%arg3_1 : [num_users=2] = placeholder[target=arg3_1]
|
||||
%arg4_1 : [num_users=2] = placeholder[target=arg4_1]
|
||||
%linear : [num_users=2] = call_function[target=torch.ops.aten.linear.default](args = (%arg0_1, %arg1_1, %arg2_1), kwargs = {})
|
||||
%relu : [num_users=2] = call_function[target=torch.ops.aten.relu.default](args = (%linear,), kwargs = {})
|
||||
%linear_1 : [num_users=1] = call_function[target=torch.ops.aten.linear.default](args = (%relu, %arg3_1, %arg4_1), kwargs = {})
|
||||
return (linear_1,)""",
|
||||
return (linear_1, arg1_1, arg2_1, linear, relu, arg3_1, arg4_1)""",
|
||||
)
|
||||
|
||||
stack = contextlib.ExitStack()
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
|
||||
import copy
|
||||
import pathlib
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
@ -97,55 +96,55 @@ def run_with_nativert(ep):
|
||||
MODEL_NAME = "forward"
|
||||
|
||||
# TODO Does named tempfile have collision?
|
||||
with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(suffix=".pt2") as f:
|
||||
torch.export.pt2_archive._package.package_pt2(
|
||||
f, exported_programs={MODEL_NAME: ep_infer}
|
||||
)
|
||||
filename = f.name
|
||||
|
||||
try:
|
||||
ep_args, ep_kwargs = ep_infer.example_inputs
|
||||
ep_args_copied, ep_kwargs_copied = (
|
||||
copy.deepcopy(ep_args),
|
||||
copy.deepcopy(ep_kwargs),
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
try:
|
||||
flat_expected = pytree.tree_leaves(
|
||||
ep_infer.module()(*ep_args_copied, **ep_kwargs_copied)
|
||||
ep_args, ep_kwargs = ep_infer.example_inputs
|
||||
ep_args_copied, ep_kwargs_copied = (
|
||||
copy.deepcopy(ep_args),
|
||||
copy.deepcopy(ep_kwargs),
|
||||
)
|
||||
except Exception as e:
|
||||
raise unittest.case.SkipTest(str(e)) from e
|
||||
torch.manual_seed(0)
|
||||
try:
|
||||
flat_expected = pytree.tree_leaves(
|
||||
ep_infer.module()(*ep_args_copied, **ep_kwargs_copied)
|
||||
)
|
||||
except Exception as e:
|
||||
raise unittest.case.SkipTest(str(e)) from e
|
||||
|
||||
model_runner = PyModelRunner(filename, MODEL_NAME)
|
||||
torch.manual_seed(0)
|
||||
if _is_supported_types((ep_args, ep_kwargs)):
|
||||
results = model_runner.run(*ep_args, **ep_kwargs)
|
||||
else:
|
||||
results = model_runner.run_with_flat_inputs_and_outputs(
|
||||
*pytree.tree_leaves((ep_args, ep_kwargs))
|
||||
)
|
||||
flat_results = pytree.tree_leaves(results)
|
||||
assert len(flat_results) == len(flat_expected)
|
||||
for result, expected in zip(flat_results, flat_expected):
|
||||
assert type(result) is type(expected)
|
||||
if isinstance(result, torch.Tensor) and isinstance(expected, torch.Tensor):
|
||||
assert result.shape == expected.shape
|
||||
assert result.dtype == expected.dtype
|
||||
assert result.device == expected.device
|
||||
torch.testing.assert_close(result, expected, equal_nan=True)
|
||||
model_runner = PyModelRunner(filename, MODEL_NAME)
|
||||
torch.manual_seed(0)
|
||||
if _is_supported_types((ep_args, ep_kwargs)):
|
||||
results = model_runner.run(*ep_args, **ep_kwargs)
|
||||
else:
|
||||
assert result == expected
|
||||
except RuntimeError as e:
|
||||
# User need to register pytree type on the cpp side, which
|
||||
# cannot be tested in python unittest.
|
||||
if "Unknown pytree node type" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
finally:
|
||||
pathlib.Path(filename).unlink(missing_ok=True)
|
||||
return ep
|
||||
results = model_runner.run_with_flat_inputs_and_outputs(
|
||||
*pytree.tree_leaves((ep_args, ep_kwargs))
|
||||
)
|
||||
flat_results = pytree.tree_leaves(results)
|
||||
assert len(flat_results) == len(flat_expected)
|
||||
for result, expected in zip(flat_results, flat_expected):
|
||||
assert type(result) is type(expected)
|
||||
if isinstance(result, torch.Tensor) and isinstance(
|
||||
expected, torch.Tensor
|
||||
):
|
||||
assert result.shape == expected.shape
|
||||
assert result.dtype == expected.dtype
|
||||
assert result.device == expected.device
|
||||
torch.testing.assert_close(result, expected, equal_nan=True)
|
||||
else:
|
||||
assert result == expected
|
||||
except RuntimeError as e:
|
||||
# User need to register pytree type on the cpp side, which
|
||||
# cannot be tested in python unittest.
|
||||
if "Unknown pytree node type" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
return ep
|
||||
|
||||
|
||||
def mocked_nativert_export_strict(*args, **kwargs):
|
||||
@ -287,7 +286,7 @@ class TestNativeRT(TestCase):
|
||||
)
|
||||
|
||||
# package everything needed for the NativeRT to execute the AOTI delegate
|
||||
with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f:
|
||||
with tempfile.NamedTemporaryFile(suffix=".pt2") as f:
|
||||
package_nativert_with_aoti_delegate(
|
||||
f,
|
||||
MODEL_NAME,
|
||||
@ -298,50 +297,48 @@ class TestNativeRT(TestCase):
|
||||
)
|
||||
filename = f.name
|
||||
|
||||
try:
|
||||
ep_args, ep_kwargs = aoti_delegate_ep.example_inputs
|
||||
ep_args_copied, ep_kwargs_copied = (
|
||||
copy.deepcopy(ep_args),
|
||||
copy.deepcopy(ep_kwargs),
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
try:
|
||||
flat_expected = pytree.tree_leaves(
|
||||
aoti_delegate_ep.module()(*ep_args_copied, **ep_kwargs_copied)
|
||||
ep_args, ep_kwargs = aoti_delegate_ep.example_inputs
|
||||
ep_args_copied, ep_kwargs_copied = (
|
||||
copy.deepcopy(ep_args),
|
||||
copy.deepcopy(ep_kwargs),
|
||||
)
|
||||
except Exception as e:
|
||||
raise unittest.case.SkipTest(str(e)) from e
|
||||
torch.manual_seed(0)
|
||||
try:
|
||||
flat_expected = pytree.tree_leaves(
|
||||
aoti_delegate_ep.module()(*ep_args_copied, **ep_kwargs_copied)
|
||||
)
|
||||
except Exception as e:
|
||||
raise unittest.case.SkipTest(str(e)) from e
|
||||
|
||||
model_runner = PyModelRunner(filename, f"{MODEL_NAME}-{BACKEND_ID}")
|
||||
torch.manual_seed(0)
|
||||
if _is_supported_types((ep_args, ep_kwargs)):
|
||||
results = model_runner.run(*ep_args, **ep_kwargs)
|
||||
else:
|
||||
results = model_runner.run_with_flat_inputs_and_outputs(
|
||||
*pytree.tree_leaves((ep_args, ep_kwargs))
|
||||
)
|
||||
flat_results = pytree.tree_leaves(results)
|
||||
assert len(flat_results) == len(flat_expected)
|
||||
for result, expected in zip(flat_results, flat_expected):
|
||||
assert type(result) is type(expected)
|
||||
if isinstance(result, torch.Tensor) and isinstance(
|
||||
expected, torch.Tensor
|
||||
):
|
||||
assert result.shape == expected.shape
|
||||
assert result.dtype == expected.dtype
|
||||
assert result.device == expected.device
|
||||
torch.testing.assert_close(result, expected, equal_nan=True)
|
||||
model_runner = PyModelRunner(filename, f"{MODEL_NAME}-{BACKEND_ID}")
|
||||
torch.manual_seed(0)
|
||||
if _is_supported_types((ep_args, ep_kwargs)):
|
||||
results = model_runner.run(*ep_args, **ep_kwargs)
|
||||
else:
|
||||
assert result == expected
|
||||
except RuntimeError as e:
|
||||
# User need to register pytree type on the cpp side, which
|
||||
# cannot be tested in python unittest.
|
||||
if "Unknown pytree node type" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
finally:
|
||||
pathlib.Path(filename).unlink(missing_ok=True)
|
||||
results = model_runner.run_with_flat_inputs_and_outputs(
|
||||
*pytree.tree_leaves((ep_args, ep_kwargs))
|
||||
)
|
||||
flat_results = pytree.tree_leaves(results)
|
||||
assert len(flat_results) == len(flat_expected)
|
||||
for result, expected in zip(flat_results, flat_expected):
|
||||
assert type(result) is type(expected)
|
||||
if isinstance(result, torch.Tensor) and isinstance(
|
||||
expected, torch.Tensor
|
||||
):
|
||||
assert result.shape == expected.shape
|
||||
assert result.dtype == expected.dtype
|
||||
assert result.device == expected.device
|
||||
torch.testing.assert_close(result, expected, equal_nan=True)
|
||||
else:
|
||||
assert result == expected
|
||||
except RuntimeError as e:
|
||||
# User need to register pytree type on the cpp side, which
|
||||
# cannot be tested in python unittest.
|
||||
if "Unknown pytree node type" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
if is_fbcode():
|
||||
|
||||
@ -4,6 +4,7 @@ from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
from torch._dynamo.utils import counters
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
|
||||
@ -39,6 +40,56 @@ class TestHopPrint(TestCase):
|
||||
|
||||
self.assertEqual(printed_output, "moo 1 2")
|
||||
|
||||
fx_f = make_fx(f)(x)
|
||||
new_inp = torch.randn(3, 3)
|
||||
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
fx_f(new_inp)
|
||||
ori_printed_output = mock_stdout.getvalue().strip()
|
||||
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
f(new_inp)
|
||||
fx_printed_output = mock_stdout.getvalue().strip()
|
||||
|
||||
self.assertEqual(ori_printed_output, fx_printed_output)
|
||||
|
||||
def test_print_with_proxy_graph(self):
|
||||
class M(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
torch._higher_order_ops.print("moo {x} {y}", x=1, y=2)
|
||||
torch._higher_order_ops.print("moo {x}", x=x)
|
||||
res = x + x
|
||||
torch._higher_order_ops.print("moo {x} {y}", x=1, y=2)
|
||||
torch._higher_order_ops.print("yeehop {x}", x=x.shape[0])
|
||||
return (res,)
|
||||
|
||||
inputs = (torch.randn(3),)
|
||||
|
||||
# Without functionalization, print should just appear in the graph directly
|
||||
gm = make_fx(M(), tracing_mode="symbolic")(*inputs)
|
||||
|
||||
self.assertExpectedInline(
|
||||
str(gm.code).strip(),
|
||||
"""\
|
||||
def forward(self, arg0_1):
|
||||
print_1 = torch.ops.higher_order.print('moo {x} {y}', x = 1, y = 2); print_1 = None
|
||||
print_2 = torch.ops.higher_order.print('moo {x}', x = arg0_1); print_2 = None
|
||||
add = torch.ops.aten.add.Tensor(arg0_1, arg0_1)
|
||||
print_3 = torch.ops.higher_order.print('moo {x} {y}', x = 1, y = 2); print_3 = None
|
||||
sym_size_int = torch.ops.aten.sym_size.int(arg0_1, 0); arg0_1 = None
|
||||
print_4 = torch.ops.higher_order.print('yeehop {x}', x = sym_size_int); sym_size_int = print_4 = None
|
||||
return (add,)""",
|
||||
)
|
||||
|
||||
new_inp = torch.randn(4)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
gm(
|
||||
new_inp,
|
||||
)
|
||||
printed_output = mock_stdout.getvalue().strip()
|
||||
|
||||
self.assertEqual(printed_output, f"moo 1 2\nmoo {new_inp}\nmoo 1 2\nyeehop 4")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -1554,7 +1554,8 @@ class AOTInductorTestsTemplate:
|
||||
|
||||
# scaled_dot_product_flash_attention
|
||||
@unittest.skipIf(
|
||||
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
def test_sdpa(self):
|
||||
class Model(torch.nn.Module):
|
||||
@ -1571,7 +1572,10 @@ class AOTInductorTestsTemplate:
|
||||
)
|
||||
self.check_model(Model(), example_inputs)
|
||||
|
||||
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
|
||||
@unittest.skipIf(
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
@unittest.skipIf(
|
||||
# for archs where this isn't lowered to flash attention, the math
|
||||
# backend will be used and it doesn't work for bfloat16
|
||||
@ -5926,8 +5930,8 @@ class AOTInductorTestsTemplate:
|
||||
@requires_gpu
|
||||
def test_d2h_copy(self):
|
||||
# device to copy host should always have the same stride
|
||||
if "cuda" not in self.device:
|
||||
raise unittest.SkipTest("This test is only for CUDA")
|
||||
if self.device not in ["cuda", "xpu"]:
|
||||
raise unittest.SkipTest("This test is only for CUDA or XPU")
|
||||
|
||||
class ToCpuModel(nn.Module):
|
||||
def forward(self, x):
|
||||
|
||||
@ -28,7 +28,7 @@ from torch.export.pt2_archive._package import (
|
||||
load_weights_to_pt2_contents,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import _get_torch_cuda_version
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu, TEST_CUDA
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
|
||||
|
||||
|
||||
@ -267,9 +267,9 @@ class TestAOTInductorPackage(TestCase):
|
||||
|
||||
@unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
|
||||
@unittest.skipIf(
|
||||
_get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
|
||||
TEST_CUDA and _get_torch_cuda_version() < (12, 6),
|
||||
"Test is only supported on CUDA 12.6+",
|
||||
)
|
||||
@skipIfXpu # build system may be different
|
||||
def test_compile_after_package(self):
|
||||
self.check_package_cpp_only()
|
||||
|
||||
|
||||
@ -11,19 +11,19 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
TestCase,
)
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU_AND_TRITON
from torch.testing._internal.triton_utils import requires_gpu_and_triton


aten = torch.ops.aten

try:
try:
from .test_torchinductor import check_model, check_model_cuda
from .test_torchinductor import check_model, check_model_gpu
except ImportError:
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
check_model,
check_model_cuda,
check_model_gpu,
)
except (unittest.SkipTest, ImportError) as e:
sys.stderr.write(f"{type(e)}: {e}\n")
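The hunks that follow apply the same device-agnostic substitution throughout this file (device="cuda" becomes device=GPU_TYPE, check_model_cuda becomes check_model_gpu, requires_cuda_and_triton becomes requires_gpu_and_triton). A minimal sketch of the resulting test shape; the toy test below is illustrative only, not part of the diff:

import torch
from torch.testing._internal.common_utils import TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE  # e.g. "cuda" or "xpu"
from torch.testing._internal.triton_utils import requires_gpu_and_triton


class ToyComboKernelTest(TestCase):
    @requires_gpu_and_triton
    def test_add(self):
        def fn(a, b):
            return a + b

        inps = [torch.rand(8, 8, device=GPU_TYPE) for _ in range(2)]
        self.assertEqual(torch.compile(fn)(*inps), fn(*inps))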
@ -34,7 +34,7 @@ except (unittest.SkipTest, ImportError) as e:
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -56,7 +56,7 @@ class ComboKernelTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_functions(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -65,9 +65,9 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -76,7 +76,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_functions(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -87,10 +87,10 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -99,7 +99,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count <= 2)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_args(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -110,10 +110,10 @@ class ComboKernelTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -122,7 +122,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_split(self):
|
||||
def fn(a, b):
|
||||
a1 = torch.linalg.vector_norm(a)
|
||||
@ -130,15 +130,15 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1
|
||||
|
||||
inps = [
|
||||
torch.rand(2048, 512, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(2048, 512, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
]
|
||||
out_eager = fn(*inps)
|
||||
out_compiled = torch.compile(fn)(*inps)
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_partitioning(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -146,15 +146,15 @@ class ComboKernelTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
@ -163,7 +163,7 @@ class ComboKernelTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelBenchmarkTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -185,7 +185,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_benchmark(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -194,9 +194,9 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -205,7 +205,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_benchmark(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -216,10 +216,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -228,7 +228,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_benchmark(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -239,10 +239,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -251,7 +251,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count in [6, 9])
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_round_robin_dispatch(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -263,10 +263,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -275,7 +275,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_benchmark(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -283,28 +283,28 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_persistent_reduction_no_x_dim(self):
|
||||
def fn(x, y):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -317,7 +317,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelDynamicShapesTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -347,7 +347,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_activations(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -356,9 +356,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -367,7 +367,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_2d_blocking(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -375,21 +375,21 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_reduce(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -400,10 +400,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -412,7 +412,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_mutated(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -424,10 +424,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -436,7 +436,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._inductor.config.patch("combo_kernels_autotune", 0)
|
||||
def test_dynamic_shapes_activations_no_autotune(self):
|
||||
def test_activations(a, b, c):
|
||||
@ -446,9 +446,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -457,7 +457,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim(self):
|
||||
@ -465,8 +465,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -476,7 +476,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim_2(self):
|
||||
@ -484,8 +484,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(2), y.sum(2)
|
||||
|
||||
inps = (
|
||||
torch.rand(8, 16, 256, device="cuda"),
|
||||
torch.rand(8, 32, 256, device="cuda"),
|
||||
torch.rand(8, 16, 256, device=GPU_TYPE),
|
||||
torch.rand(8, 32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], (0, 1), min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], (0, 1), min=1, max=256)
|
||||
@ -495,7 +495,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_2d_blocking_round_robin(self):
|
||||
@ -506,12 +506,12 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return c0, c1, c2
|
||||
|
||||
inps = (
|
||||
torch.rand(20, 30, device="cuda"),
|
||||
torch.rand(30, 30, device="cuda"),
|
||||
torch.rand(40, 32, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda").t(),
|
||||
torch.rand(30, 30, device="cuda").t(),
|
||||
torch.rand(32, 40, device="cuda").t(),
|
||||
torch.rand(20, 30, device=GPU_TYPE),
|
||||
torch.rand(30, 30, device=GPU_TYPE),
|
||||
torch.rand(40, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 30, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 40, device=GPU_TYPE).t(),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -522,19 +522,19 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
|
||||
inps = (
|
||||
torch.rand(24, 30, device="cuda"),
|
||||
torch.rand(32, 30, device="cuda"),
|
||||
torch.rand(48, 32, device="cuda"),
|
||||
torch.rand(30, 24, device="cuda").t(),
|
||||
torch.rand(30, 32, device="cuda").t(),
|
||||
torch.rand(32, 48, device="cuda").t(),
|
||||
torch.rand(24, 30, device=GPU_TYPE),
|
||||
torch.rand(32, 30, device=GPU_TYPE),
|
||||
torch.rand(48, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 24, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 32, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 48, device=GPU_TYPE).t(),
|
||||
)
|
||||
out_compiled = compiled(*inps)
|
||||
out_eager = fn(*inps)
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(5 <= torch._inductor.metrics.generated_kernel_count <= 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
@torch._inductor.config.patch("triton.autotune_at_compile_time", True)
|
||||
@ -543,9 +543,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.mean(1), z.max(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -555,15 +555,15 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_helper_fn_defined(self):
|
||||
def fn(x, y, z):
|
||||
return x.sum(1), y.mean(1), z.cumsum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -577,5 +577,5 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
if HAS_CPU or HAS_CUDA_AND_TRITON:
|
||||
if HAS_CPU or HAS_GPU_AND_TRITON:
|
||||
run_tests(needs="filelock")
|
||||
|
||||
@ -73,6 +73,23 @@ class TestCompileWorker(TestCase):
finally:
pool.shutdown()

@skipIfWindows(msg="pass_fds not supported on Windows.")
def test_quiesce_repeatedly(self):
pool = SubprocPool(2)
try:
a = pool.submit(operator.add, 100, 1)
pool.quiesce()
pool.wakeup()
b = pool.submit(operator.sub, 100, 1)
pool.quiesce()
pool.quiesce()
pool.wakeup()
b = pool.submit(operator.sub, 100, 1)
self.assertEqual(a.result(), 101)
self.assertEqual(b.result(), 99)
finally:
pool.shutdown()

@skipIfWindows(msg="pass_fds not supported on Windows.")
def test_logging(self):
os.environ["MAST_HPC_JOB_NAME"] = "test_job"

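A minimal usage sketch of the pool lifecycle the new test exercises, assuming SubprocPool lives at torch._inductor.compile_worker.subproc_pool (that import path is an assumption; the test file itself already imports the class):

import operator

from torch._inductor.compile_worker.subproc_pool import SubprocPool  # assumed import path

pool = SubprocPool(2)  # two worker subprocesses
try:
    fut = pool.submit(operator.add, 100, 1)
    pool.quiesce()   # idle the workers without tearing the pool down
    pool.wakeup()    # required before submitting again
    assert fut.result() == 101
finally:
    pool.shutdown()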
@ -45,6 +45,7 @@ from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
scoped_load_inline,
|
||||
skipIfWindows,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.hop_db import hop_db
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
@ -52,9 +53,13 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_CPU,
|
||||
HAS_CUDA_AND_TRITON,
|
||||
HAS_GPU,
|
||||
HAS_XPU_AND_TRITON,
|
||||
)
|
||||
from torch.testing._internal.logging_utils import logs_to_string
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
from torch.utils._python_dispatch import TorchDispatchMode
|
||||
|
||||
|
||||
@ -3049,13 +3054,14 @@ main()
|
||||
|
||||
self.assertEqual(counters["inductor"]["cudagraph_skips"], 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@skipIfXpu(msg="cudagraphs not supported on xpu for now!")
|
||||
@requires_gpu_and_triton
|
||||
def test_cudagraphs_sdpa(self):
|
||||
query = torch.rand(
|
||||
32, 8, 128, 64, dtype=torch.float16, device="cuda", requires_grad=True
|
||||
32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE, requires_grad=True
|
||||
)
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
|
||||
|
||||
with (
|
||||
@ -3747,7 +3753,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
self.assertTrue(isinstance(view_nodes[0].args[1][0], torch.fx.Node))
|
||||
self.assertTrue(isinstance(view_nodes[1].args[1][0], torch.fx.Node))
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_flex_attention(self):
|
||||
def _squared(score, b, h, m, n):
|
||||
"""Joint graph needed for correctness"""
|
||||
@ -3765,7 +3771,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
a * b,
|
||||
b,
|
||||
dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
device=GPU_TYPE,
|
||||
requires_grad=True,
|
||||
)
|
||||
fwd_bwd(v)
|
||||
@ -5222,6 +5228,7 @@ xfail_by_backend = {
|
||||
"test_reentrant_with_callbacks_both_depths", # queue_callback
|
||||
"test_reentrant_with_callbacks_depth_0", # queue_callback
|
||||
"test_reentrant_with_callbacks_depth_1", # queue_callback
|
||||
"test_checkpoint_graph_execution_group", # Attempted to call function marked as skipped
|
||||
"test_current_graph_task_execution_order", # nodes are already freed by the time dynamo traces the lifted hook
|
||||
"test_autograd_inplace_views_cross_dtype", # view_fn not supported by compiled autograd
|
||||
"test_post_accumulate_grad_hook_ordering", # accuracy error
|
||||
@ -5332,12 +5339,13 @@ if IS_S390X:
|
||||
test_autograd = load_test_module("test_autograd")
|
||||
test_custom_ops = load_test_module("test_custom_ops")
|
||||
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
|
||||
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
|
||||
test_autograd.TestNestedCheckpoint
|
||||
)
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
|
||||
test_higher_order_ops.HigherOrderOpTests
|
||||
)
|
||||
@ -5366,6 +5374,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
super(TestCase, self).tearDown()
|
||||
reset()
|
||||
|
||||
@skipIfXpu(msg="NotImplementedError: The operator 'testlib::mutating_custom_op'")
|
||||
@ops(
|
||||
list(filter(lambda op: op.name not in xfail_hops, hop_db)),
|
||||
allowed_dtypes=(torch.float,),
|
||||
@ -5418,7 +5427,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), allow_xpu=True)
|
||||
instantiate_parametrized_tests(TestCompiledAutograd)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -65,7 +65,11 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_GPU,
|
||||
has_triton,
|
||||
)
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton, requires_gpu
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
def get_inputs(optim):
|
||||
@ -946,7 +950,7 @@ class CompiledOptimizerTests(TestCase):
|
||||
kwargs = aot_graph_input_parser(forward)
|
||||
torch.compile(forward)(**kwargs)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_foreach_map_adam(self):
|
||||
params = [
|
||||
torch.rand(
|
||||
|
||||
@ -3278,6 +3278,15 @@ class CPUReproTests(TestCase):
|
||||
metrics.reset()
|
||||
self.common(fn, (x,))
|
||||
|
||||
def test_softmax_with_zero_dim(self):
|
||||
def fn(x):
|
||||
x = torch.softmax(x, 0)
|
||||
return x
|
||||
|
||||
x = torch.rand([], dtype=torch.bfloat16)
|
||||
metrics.reset()
|
||||
self.common(fn, (x,))
|
||||
|
||||
@config.patch({"fx_graph_cache": False, "fx_graph_remote_cache": False})
|
||||
def test_local_buffer_in_outer_loop_fusion(self):
|
||||
def fn(x):
|
||||
|
||||
@ -208,7 +208,7 @@ class TestCustomLowering(InductorTestCase):
|
||||
|
||||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2328")
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_tanh_approx(self):
|
||||
def fn(inp):
|
||||
@ -223,7 +223,7 @@ class TestCustomLowering(InductorTestCase):
|
||||
|
||||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2328")
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_multi_inp_asm(self):
|
||||
def fn(a, b):
|
||||
|
||||
154
test/inductor/test_cutedsl_grouped_mm.py
Normal file
@ -0,0 +1,154 @@
# Owner(s): ["module: inductor"]


import unittest

import torch
from torch import Tensor
from torch._inductor import config
from torch._inductor.codegen.cuda.cuda_env import is_datacenter_blackwell_arch
from torch._inductor.test_case import run_tests, TestCase as InductorTestCase
from torch._inductor.utils import ensure_cute_available
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)


@unittest.skipIf(
    not (ensure_cute_available() and is_datacenter_blackwell_arch()),
    "CuTeDSL library or Blackwell device not available",
)
@instantiate_parametrized_tests
class TestCuTeDSLGroupedGemm(InductorTestCase):
    def _get_inputs(
        self,
        group_size: int,
        M_hint: int,
        K: int,
        N: int,
        device: str,
        dtype: torch.dtype,
        alignment: int = 16,
    ) -> tuple[Tensor, Tensor, Tensor]:
        # --- Random, tile-aligned M sizes ---
        M_sizes = (
            torch.randint(1, (M_hint // alignment) + 1, (group_size,), dtype=torch.int)
            * alignment
        )

        M_total = torch.sum(M_sizes).item()

        # --- Construct input tensors ---
        A = torch.randn(int(M_total), K, dtype=dtype, device=device) * 0.1
        B = torch.randn((group_size, K, N), dtype=dtype, device=device) * 0.01

        # --- Build offsets (no leading zero, strictly increasing) ---
        offsets = torch.cumsum(M_sizes, dim=0).to(dtype=torch.int32, device=device)

        return (A, B, offsets)

    @parametrize("group_size", (2, 8))
    @parametrize("M_hint", (256, 1024))
    @parametrize("K", (64, 128))
    @parametrize("N", (128, 256))
    def test_grouped_gemm_basic(self, group_size: int, M_hint: int, K: int, N: int):
        device = "cuda"
        dtype = torch.bfloat16

        A, B, offsets = self._get_inputs(group_size, M_hint, K, N, device, dtype)

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # Eager execution
        c_eager = grouped_gemm_fn(A, B, offsets)

        # Test with Cute backend
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)

    @parametrize("layout_A", ("contiguous", "offset", "padded", "view"))
    @parametrize("layout_B", ("contiguous", "broadcasted"))
    def test_grouped_gemm_assorted_layouts(
        self,
        layout_A: str,
        layout_B: str,
    ):
        device = "cuda"
        dtype = torch.bfloat16

        G, K, N = 8, 64, 128
        M_sizes = [128] * G
        sum_M = sum(M_sizes)
        offsets = torch.tensor(
            [sum(M_sizes[: i + 1]) for i in range(G)], dtype=torch.int32, device=device
        )

        A_base = torch.randn(sum_M, K, device=device, dtype=dtype)
        A = A_base

        if layout_A == "offset":
            # allocate bigger buffer than needed, use nonzero storage offset
            storage = torch.randn(sum_M * K + 512, device=device, dtype=dtype)
            offset = 128  # skip first 128 elements
            A = torch.as_strided(storage[offset:], (sum_M, K), (K, 1))
        elif layout_A == "padded":
            # simulate row pitch > K (row_stride = K + pad)
            row_pitch = K + 8
            storage = torch.randn(sum_M * row_pitch, device=device, dtype=dtype)
            A = torch.as_strided(storage, (sum_M, K), (row_pitch, 1))
        elif layout_A == "view":
            A_storage = torch.randn(sum_M * K, device=device, dtype=dtype)
            A = A_storage.view(sum_M, K)
            assert A._base is not None
            assert A.shape == (sum_M, K)

        B = torch.randn((G, K, N), dtype=dtype, device=device) * 0.01

        if layout_B == "broadcasted":
            # Broadcast B across groups (zero stride along G)
            B = B[0].expand(G, K, N)
            assert B.stride(0) == 0

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # --- eager ---
        c_eager = grouped_gemm_fn(A, B, offsets)

        # --- compiled (CUTE backend) ---
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)


if __name__ == "__main__":
    run_tests()
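A quick eager-mode illustration of the offsets convention the new tests rely on (cumulative row counts per group, strictly increasing, no leading zero). The shapes, the CUDA device, and the assumption that these sizes satisfy any alignment requirements of torch._grouped_mm are illustrative only:

import torch

G, K, N = 2, 16, 16
A = torch.randn(48, K, dtype=torch.bfloat16, device="cuda")      # rows 0-31 -> group 0, rows 32-47 -> group 1
B = torch.randn(G, K, N, dtype=torch.bfloat16, device="cuda")    # one weight matrix per group
offs = torch.tensor([32, 48], dtype=torch.int32, device="cuda")  # cumulative M per group, no leading zero
out = torch._grouped_mm(A, B, offs=offs)                         # per-group matmuls packed back into a (48, N) result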
@ -10,10 +10,11 @@ from torch._inductor.utils import fresh_cache
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
GPU_TYPE,
|
||||
HAS_CUDA_AND_TRITON,
|
||||
HAS_GPU_AND_TRITON,
|
||||
IS_BIG_GPU,
|
||||
)
|
||||
|
||||
@ -38,6 +39,7 @@ class DeterministicTest(TestCase):
|
||||
finally:
|
||||
torch.use_deterministic_algorithms(old_val, warn_only=True)
|
||||
|
||||
@skipIfXpu(msg="pad_mm is not enabled for XPU.")
|
||||
@parametrize("deterministic", [False, True])
|
||||
def test_mm_padding(self, deterministic):
|
||||
with inductor_config.patch(deterministic=deterministic):
|
||||
@ -106,5 +108,5 @@ class DeterministicTest(TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if HAS_CUDA_AND_TRITON:
|
||||
if HAS_GPU_AND_TRITON:
|
||||
run_tests()
|
||||
|
||||
@ -10,7 +10,10 @@ from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
skipIfRocm,
|
||||
)
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import requires_gpu_and_triton
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@ -55,14 +58,14 @@ class TestTorchDeviceAssertTrigger(TestCase):
|
||||
f_c = torch.compile(func_inline, backend=backend)
|
||||
f_c()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@skipIfRocm
|
||||
@torch._inductor.config.patch(force_disable_caches=True)
|
||||
def test_assert_fusion(self):
|
||||
torch._logging.set_logs(inductor_metrics=True)
|
||||
|
||||
def func():
|
||||
a = torch.tensor([1.0, 2.0], device="cuda")
|
||||
a = torch.tensor([1.0, 2.0], device=device_type)
|
||||
result = torch.all(a > 0)
|
||||
assert result, "should throw"
|
||||
|
||||
@ -74,13 +77,13 @@ class TestTorchDeviceAssertTrigger(TestCase):
|
||||
self.assertEqual(metrics.generated_kernel_count, 1)
|
||||
torch._logging.set_logs()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@skipIfRocm
|
||||
@torch._inductor.config.patch(force_disable_caches=True)
|
||||
def test_run_assert_triton(self):
|
||||
@torch.compile(backend="inductor")
|
||||
def fn():
|
||||
a = torch.tensor([1.0, 2.0], device="cuda")
|
||||
a = torch.tensor([1.0, 2.0], device=device_type)
|
||||
result = torch.all(a > 0)
|
||||
assert result, "should throw"
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from torch import nn
|
||||
from torch._dynamo import compiled_autograd
|
||||
from torch._dynamo.test_case import run_tests, TestCase
|
||||
from torch._dynamo.testing import CompileCounter
|
||||
from torch.testing._internal.common_utils import IS_MACOS, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_MACOS
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, requires_gpu
|
||||
|
||||
|
||||
@ -483,7 +483,6 @@ class DistributedPatternTests(TestCase):
|
||||
# Recompile on grad==None/grad!=None
|
||||
self.assertEqual(bw_cnt.frame_count, 2)
|
||||
|
||||
@skipIfXpu
|
||||
@requires_gpu()
|
||||
@torch._functorch.config.patch(recompute_views=True)
|
||||
def test_fake_distributed_inductor(self):
|
||||
|
||||
@ -5,6 +5,10 @@ import torch
|
||||
from torch._inductor import config
|
||||
from torch._inductor.test_case import run_tests, TestCase
|
||||
from torch.testing._internal.common_cuda import TEST_CUDA
|
||||
from torch.testing._internal.common_utils import TEST_XPU
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
class MatMulModule(torch.nn.Module):
|
||||
@ -68,13 +72,13 @@ class TestInductorExternalCallable(TestCase):
|
||||
msg=f"torch.compile(..., external_matmul = {matmul_dup}) failed",
|
||||
)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
|
||||
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability() < (7, 0),
|
||||
"Triton does not support device capability < 7.0",
|
||||
)
|
||||
def test_matmul_cuda(self):
|
||||
device = torch.device("cuda")
|
||||
device = torch.device(device_type)
|
||||
x = (torch.eye(128, 128) * 2).to(device=device)
|
||||
opt_fn = torch.compile(
|
||||
MatMulModule().to(device),
|
||||
|
||||
@ -148,6 +148,24 @@ class FxirTestCase(InductorTestCase):
args = [torch.randn(8, device=self.device) for _ in range(2)]
self._compile_and_check(torch.add, args)

def test_device_type(self):
"""
Test that we allocate on a device type instead of a specific index.
"""
# Pass in a tensor on an indexed device.
device_runtime = getattr(torch, self.device)
indexed_device = torch.device(self.device, device_runtime.current_device())
args = [torch.randn(8, device=indexed_device) for _ in range(2)]
(gm,) = self._compile_and_check(torch.add, args)
(empty_strided,) = gm.graph.find_nodes(
op="call_function", target=torch.empty_strided
)

# Check that the device of the output allocation is not indexed.
output_device = torch.device(empty_strided.kwargs["device"])
self.assertIs(output_device.index, None)
self.assertEqual(output_device.type, indexed_device.type)

def test_multiple_kernels(self):
def foo(x, y):
return x.sum() + y.sum()

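For reference, the device-type-versus-index distinction the new test checks can be seen directly on torch.device; this standalone snippet is not part of the diff:

import torch

d_typed = torch.device("cuda")       # type only, no index
d_indexed = torch.device("cuda", 0)  # pinned to a specific index
assert d_typed.index is None and d_typed.type == "cuda"
assert d_indexed.index == 0
# Allocating with the bare type (device="cuda") lets the runtime use whatever
# device is current at call time instead of baking an index into the graph.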
@ -11,6 +11,8 @@ from torch.testing._internal.common_utils import slowTest
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, RUN_GPU
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
try:
|
||||
try:
|
||||
from . import (
|
||||
@ -306,11 +308,11 @@ if RUN_GPU:
|
||||
|
||||
from torch._inductor.utils import is_big_gpu
|
||||
|
||||
if GPU_TYPE == "cuda" and is_big_gpu():
|
||||
if GPU_TYPE in ("cuda", "xpu") and is_big_gpu():
|
||||
skip_list = ["test_addmm", "test_linear_relu"]
|
||||
# need to skip instead of omit, otherwise fbcode ci can be flaky
|
||||
for test_name in skip_list:
|
||||
test_failures_gpu_wrapper[f"{test_name}_cuda"] = (
|
||||
test_failures_gpu_wrapper[f"{test_name}_{device_type}"] = (
|
||||
test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)
|
||||
)
|
||||
test_failures_gpu_wrapper[f"{test_name}_gpu_dynamic_shapes"] = (
|
||||
|
||||
@ -16,7 +16,12 @@ from torch.testing._internal.common_device_type import (
|
||||
instantiate_device_type_tests,
|
||||
skipCUDAIf,
|
||||
)
|
||||
from torch.testing._internal.common_utils import parametrize, run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
run_tests,
|
||||
skipIfXpu,
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import IS_BIG_GPU
|
||||
from torch.utils._ordered_set import OrderedSet
|
||||
|
||||
@ -91,6 +96,10 @@ class TestScheduler(TestCase):
|
||||
metrics.reset()
|
||||
torch._logging.set_logs()
|
||||
|
||||
@skipIfXpu(
|
||||
msg="InvalidModule: Invalid SPIR-V module, "
|
||||
"https://github.com/intel/torch-xpu-ops/issues/2329"
|
||||
)
|
||||
@dtypes(torch.float, torch.float16)
|
||||
@skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
|
||||
@parametrize(
|
||||
|
||||
@ -1416,11 +1416,6 @@ class TestMaxAutotune(TestCase):
|
||||
torch.compile(lambda a, b: a.matmul(b))(a, b)
|
||||
self.assertIn("NoValidChoicesError", str(context.exception))
|
||||
|
||||
@unittest.skipIf(
|
||||
not torch.cuda.is_available()
|
||||
or torch.cuda.get_device_properties().total_memory < 2e10,
|
||||
"Only if the GPU has at least 20GB memory to be safe",
|
||||
)
|
||||
@config.patch(force_shape_pad=True, max_autotune=True)
|
||||
def test_linear_and_cel(self):
|
||||
"""
|
||||
|
||||
@ -3,7 +3,8 @@
import functools
import weakref
from collections import Counter
from typing import Callable, Optional
from collections.abc import Callable
from typing import Optional

import torch
from torch._inductor.fx_passes.memory_estimator import (
@ -28,7 +29,7 @@ def device_filter(device):


class FakeTensorMemoryProfilerMode(TorchDispatchMode):
def __init__(self, device_filter: Optional[Callable[torch.device, bool]] = None):
def __init__(self, device_filter: Optional[Callable[[torch.device], bool]] = None):
# counter of storage ids to live references
self.storage_count: dict[int, int] = Counter()
# live fake tensors

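The annotation fix above follows the standard typing rule: Callable parameter types must be wrapped in a list, so Callable[torch.device, bool] is not a valid annotation, while Callable[[torch.device], bool] means 'takes one torch.device, returns bool'. A standalone illustration (not part of the diff):

from collections.abc import Callable
from typing import Optional

import torch

# One positional torch.device argument, returning bool.
DeviceFilter = Optional[Callable[[torch.device], bool]]


def only_cuda(device: torch.device) -> bool:
    return device.type == "cuda"


f: DeviceFilter = only_cuda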
@ -319,11 +319,6 @@ class TestOperatorReorderForPeakMemory(TestCase):
|
||||
# succ nodes should be forwarded to pre mutation buffer
|
||||
self.assertTrue(buffer_info[post][2] <= buffer_info[pre][2])
|
||||
|
||||
@unittest.skipIf(
|
||||
not torch.cuda.is_available()
|
||||
or torch.cuda.get_device_properties().total_memory < int(1e10),
|
||||
"Need 10GB memory to be safe to run the test",
|
||||
)
|
||||
def test_fusing_reductions_increase_peak_memory(self):
|
||||
@torch.compile
|
||||
def f(a, b, c):
|
||||
@ -332,9 +327,9 @@ class TestOperatorReorderForPeakMemory(TestCase):
|
||||
a = torch.randn(1024 * 32, 16, device=GPU_TYPE)
|
||||
b = torch.randn(1024 * 32, 16, device=GPU_TYPE)
|
||||
c = torch.randn(16, 1024 * 32, device=GPU_TYPE)
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.get_device_module(GPU_TYPE).reset_peak_memory_stats()
|
||||
f(a, b, c)
|
||||
peak_mem = torch.cuda.max_memory_allocated()
|
||||
peak_mem = torch.get_device_module(GPU_TYPE).max_memory_allocated()
|
||||
|
||||
expected_bound = a.size(0) * c.size(1) * a.dtype.itemsize * 2
|
||||
self.assertLess(peak_mem, expected_bound)
|
||||
|
||||
@@ -3,7 +3,7 @@
import sys
import unittest

from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, skipIfXpu
from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, requires_gpu


@@ -82,7 +82,6 @@ class TestMemoryPlanning(TestCase):
).run(code)
self.assertTrue(same(f(*args), result))

@skipIfXpu(msg="aoti doesn't work on XPU")
def test_aoti(self):
f, args = self._generate(device=GPU_TYPE)
dim0_x = Dim("dim0_x", min=1, max=2048)

@@ -7,12 +7,7 @@ import torch._inductor.config as inductor_config
from torch._dynamo.test_minifier_common import MinifierTestBase
from torch._inductor import config
from torch.export import load as export_load
from torch.testing._internal.common_utils import (
IS_JETSON,
IS_MACOS,
skipIfXpu,
TEST_WITH_ASAN,
)
from torch.testing._internal.common_utils import IS_JETSON, IS_MACOS, TEST_WITH_ASAN
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.testing._internal.triton_utils import requires_gpu

@@ -278,7 +273,6 @@ def forward(self, linear):
self._aoti_check_relu_repro(res)

@requires_gpu
@skipIfXpu(msg="AOTI for XPU not enabled yet")
@inductor_config.patch(
"triton.inject_relu_bug_TESTING_ONLY",
"compile_error",
@@ -288,7 +282,6 @@ def forward(self, linear):
self._aoti_check_relu_repro(res)

@requires_gpu
@skipIfXpu(msg="AOTI for XPU not enabled yet")
@inductor_config.patch(
"triton.inject_relu_bug_TESTING_ONLY",
"compile_error",
@@ -304,7 +297,6 @@ def forward(self, linear):
self._aoti_check_relu_repro(res)

@requires_gpu
@skipIfXpu(msg="AOTI for XPU not enabled yet")
@inductor_config.patch("triton.inject_relu_bug_TESTING_ONLY", "accuracy")
def test_aoti_gpu_accuracy_error(self):
res = self._test_aoti(GPU_TYPE, "AccuracyError")

@@ -7,19 +7,23 @@ import torch
from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import run_and_get_code
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU_AND_TRITON,
HAS_MULTIGPU,
)


requires_multigpu = functools.partial(
unittest.skipIf, not TEST_MULTIGPU, "requires multiple cuda devices"
unittest.skipIf, not HAS_MULTIGPU, f"requires multiple {GPU_TYPE} devices"
)


aten = torch.ops.aten


class TestMoveConstructorsToCuda(TestCase):
class TestMoveConstructorsToGpu(TestCase):
def _check_fn(self, func, expect_cpu, *args):
out_eager = func(*args)

@@ -36,7 +40,7 @@ class TestMoveConstructorsToCuda(TestCase):
def foo(x):
return x[torch.arange(x.shape[0])]

inp = torch.rand(32, 77, 512, device="cuda")
inp = torch.rand(32, 77, 512, device=GPU_TYPE)

self._check_fn(foo, False, inp)

@@ -45,14 +49,14 @@ class TestMoveConstructorsToCuda(TestCase):
tmp1 = torch.arange(x.shape[0])
return tmp1, x[tmp1]

inp = torch.rand(32, 77, 512, device="cuda")
inp = torch.rand(32, 77, 512, device=GPU_TYPE)

self._check_fn(foo, True, inp)

def test_non_convertable_op_failure(self):
def foo(x):
y = torch.arange(x.shape[0])
return x + y, torch.ones([4], device="cuda")
return x + y, torch.ones([4], device=GPU_TYPE)

inp = torch.rand([100])

@@ -76,7 +80,7 @@ class TestMoveConstructorsToCuda(TestCase):
c2 = torch.arange(-1, 3)
return x[c1 + c2], c2 - 4 * 2

inp = torch.rand([4]).cuda()
inp = torch.rand([4]).to(GPU_TYPE)
_, code = run_and_get_code(foo, inp)
FileCheck().check_not("triton.jit").run(code[0])

@@ -95,12 +99,12 @@ class TestMoveConstructorsToCuda(TestCase):
def foo(x):
return (
x[torch.arange(x.shape[0])],
torch.ones([4], device="cuda:0"),
torch.ones([4], device="cuda:1"),
torch.ones([4], device=f"{GPU_TYPE}:0"),
torch.ones([4], device=f"{GPU_TYPE}:1"),
)

# nyi, multi-gpu
inp = torch.rand([100], device="cuda")
inp = torch.rand([100], device=GPU_TYPE)
self._check_fn(foo, True, inp)

def test_no_gpu(self):
@@ -112,5 +116,5 @@ class TestMoveConstructorsToCuda(TestCase):


if __name__ == "__main__":
if IS_LINUX and HAS_CUDA_AND_TRITON:
if IS_LINUX and HAS_GPU_AND_TRITON:
run_tests()
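The `requires_multigpu` change above is just `functools.partial` applied to `unittest.skipIf`, re-keyed from the CUDA-only `TEST_MULTIGPU` flag to the device-agnostic `HAS_MULTIGPU`. A sketch of how such a partial behaves as a decorator factory (the flag value and test body here are placeholders):

import functools
import unittest

HAS_MULTIGPU = False  # placeholder flag for the sketch

requires_multigpu = functools.partial(
    unittest.skipIf, not HAS_MULTIGPU, "requires multiple gpu devices"
)


class Example(unittest.TestCase):
    # Note the call: the partial still needs () to produce the actual skipIf decorator.
    @requires_multigpu()
    def test_needs_two_devices(self):
        self.assertTrue(True)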
@@ -17,7 +17,6 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
skipIfRocm,
skipIfXpu,
)
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
@@ -70,7 +69,6 @@ def make_cpp_wrapper_test(orig_test, **extra_args):
"""

@config.patch("cpp_wrapper", True)
@skipIfXpu(msg="cpp wrapper doesn't currently work on the XPU stack")
def fn(self):
# The same kernel may have been compiled by previous tests with
# cpp_wrapper disabled. Clear the cache so we go ahead to re-compile
@@ -111,7 +109,6 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_gemm(self):
def fn(x, y):
@@ -140,7 +137,6 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_relu_fused_gemm(self):
def fn(x, y):

@@ -52,9 +52,12 @@ def make_pallas(cls):
return test_class


@unittest.skipUnless(HAS_PALLAS, "requires jax and pallas")
class PallasTests(TestCase):
"""Basic tests for Pallas backend functionality."""
class PallasTestsMixin:
"""Basic tests for Pallas backend functionality (parameterized by DEVICE). Mixin only, not collected."""

def _compile(self, fn):
key = "cuda_backend" if self.DEVICE == "cuda" else "cpu_backend"
return torch.compile(fn, backend="inductor", options={key: "pallas"})

def test_simple_add(self):
"""Test basic element-wise addition."""
@@ -62,12 +65,10 @@ class PallasTests(TestCase):
def fn(a, b):
return a + b

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

a = torch.randn(1024, device="cuda")
b = torch.randn(1024, device="cuda")
a = torch.randn(1024, device=self.DEVICE)
b = torch.randn(1024, device=self.DEVICE)
result = compiled(a, b)
expected = fn(a, b)
self.assertEqual(result, expected)
@@ -78,12 +79,10 @@ class PallasTests(TestCase):
def fn(a, b):
return a * b

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

a = torch.randn(1024, device="cuda")
b = torch.randn(1024, device="cuda")
a = torch.randn(1024, device=self.DEVICE)
b = torch.randn(1024, device=self.DEVICE)
result = compiled(a, b)
expected = fn(a, b)
self.assertEqual(result, expected)
@@ -94,11 +93,9 @@ class PallasTests(TestCase):
def fn(x):
return torch.sin(x)

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda")
x = torch.randn(1024, device=self.DEVICE)
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -109,12 +106,10 @@ class PallasTests(TestCase):
def fn(x, y):
return x.sin() + y

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda")
y = torch.randn(1024, device="cuda")
x = torch.randn(1024, device=self.DEVICE)
y = torch.randn(1024, device=self.DEVICE)
result = compiled(x, y)
expected = fn(x, y)
self.assertEqual(result, expected)
@@ -125,11 +120,9 @@ class PallasTests(TestCase):
def fn(x):
return torch.log(torch.exp(x))

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda")
x = torch.randn(1024, device=self.DEVICE)
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -140,11 +133,9 @@ class PallasTests(TestCase):
def fn(x):
return torch.sqrt(x)

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda").abs() # Ensure positive for sqrt
x = torch.randn(1024, device=self.DEVICE).abs() # Ensure positive for sqrt
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -155,11 +146,9 @@ class PallasTests(TestCase):
def fn(x):
return torch.tanh(x)

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda")
x = torch.randn(1024, device=self.DEVICE)
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -170,11 +159,9 @@ class PallasTests(TestCase):
def fn(x):
return torch.abs(-x)

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(1024, device="cuda")
x = torch.randn(1024, device=self.DEVICE)
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -185,12 +172,10 @@ class PallasTests(TestCase):
def fn(a, b):
return torch.maximum(a, b) + torch.minimum(a, b)

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

a = torch.randn(1024, device="cuda")
b = torch.randn(1024, device="cuda")
a = torch.randn(1024, device=self.DEVICE)
b = torch.randn(1024, device=self.DEVICE)
result = compiled(a, b)
expected = fn(a, b)
self.assertEqual(result, expected)
@@ -228,15 +213,17 @@ class PallasTests(TestCase):

@torch.compile(
backend="inductor",
options={"cuda_backend": "pallas"},
options={
("cuda_backend" if self.DEVICE == "cuda" else "cpu_backend"): "pallas"
},
)
def pallas_fn(a, b):
return a.sin() + b.cos()

_, (code,) = run_and_get_code(
pallas_fn,
torch.randn(64, device="cuda"),
torch.randn(64, device="cuda"),
torch.randn(64, device=self.DEVICE),
torch.randn(64, device=self.DEVICE),
)
# Verify Pallas-specific code generation
self.assertIn("import jax", code)
@@ -249,12 +236,10 @@ class PallasTests(TestCase):
def fn(x, y):
return x + y

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

x = torch.randn(32, 32, device="cuda")
y = torch.randn(32, 32, device="cuda")
x = torch.randn(32, 32, device=self.DEVICE)
y = torch.randn(32, 32, device=self.DEVICE)
result = compiled(x, y)
expected = fn(x, y)
self.assertEqual(result, expected)
@@ -265,12 +250,10 @@ class PallasTests(TestCase):
def fn(x):
return x * 2.0

compiled = torch.compile(
fn, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(fn)

for shape in [(64,), (128,), (256,), (1024,)]:
x = torch.randn(shape, device="cuda")
x = torch.randn(shape, device=self.DEVICE)
result = compiled(x)
expected = fn(x)
self.assertEqual(result, expected)
@@ -282,12 +265,10 @@ class PallasTests(TestCase):
def contiguous_add(a, b):
return a + b

compiled = torch.compile(
contiguous_add, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(contiguous_add)

a = torch.randn(1024, device="cuda")
b = torch.randn(1024, device="cuda")
a = torch.randn(1024, device=self.DEVICE)
b = torch.randn(1024, device=self.DEVICE)
result = compiled(a, b)
expected = contiguous_add(a, b)
self.assertEqual(result, expected)
@@ -296,11 +277,9 @@ class PallasTests(TestCase):
def contiguous_mul(x):
return x * 2.0

compiled = torch.compile(
contiguous_mul, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(contiguous_mul)

x = torch.randn(128, 8, device="cuda")
x = torch.randn(128, 8, device=self.DEVICE)
result = compiled(x)
expected = contiguous_mul(x)
self.assertEqual(result, expected)
@@ -310,12 +289,10 @@ class PallasTests(TestCase):
def operate_on_tensor(x):
return x.sin()

compiled = torch.compile(
operate_on_tensor, backend="inductor", options={"cuda_backend": "pallas"}
)
compiled = self._compile(operate_on_tensor)

# Create a transposed (non-contiguous) view
x = torch.randn(64, 32, device="cuda")
x = torch.randn(64, 32, device=self.DEVICE)
x_t = x.t() # Non-contiguous view
self.assertFalse(x_t.is_contiguous())

@@ -332,13 +309,24 @@ class PallasTests(TestCase):
self.assertEqual(result, expected)


@unittest.skipUnless(HAS_PALLAS, "requires jax and pallas")
class PallasTestsCUDA(PallasTestsMixin, TestCase):
DEVICE = "cuda"


@unittest.skipUnless(HAS_PALLAS, "requires jax and pallas")
class PallasTestsCPU(PallasTestsMixin, TestCase):
DEVICE = "cpu"


# Create test variants using the main test suite
# Note: Only enable GPU tests since Pallas primarily targets GPU
if test_torchinductor.HAS_GPU and HAS_PALLAS:
# Uncomment these to run full test suite with Pallas backend
# make_pallas(test_torchinductor.SweepInputsGPUTest)
# make_pallas(test_torchinductor.GPUTests)
pass
if hasattr(sys.modules.get(__name__), "test_torchinductor") and HAS_PALLAS:
if getattr(test_torchinductor, "HAS_GPU", False):
# Uncomment these to run full test suite with Pallas backend
# make_pallas(test_torchinductor.SweepInputsGPUTest)
# make_pallas(test_torchinductor.GPUTests)
pass

if __name__ == "__main__":
if HAS_PALLAS:

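The Pallas tests are reshaped from a single CUDA-only TestCase into a `PallasTestsMixin` that reads `self.DEVICE` and picks `cuda_backend` or `cpu_backend` accordingly, with thin `PallasTestsCUDA` / `PallasTestsCPU` subclasses doing the actual collection. A stripped-down sketch of the pattern, assuming JAX and the Pallas backend are available (class names and the test body here are illustrative):

import unittest

import torch


class _PallasBackendMixin:
    # Not collected directly: only concrete subclasses define DEVICE.
    def _compile(self, fn):
        key = "cuda_backend" if self.DEVICE == "cuda" else "cpu_backend"
        return torch.compile(fn, backend="inductor", options={key: "pallas"})

    def test_add(self):
        def fn(a, b):
            return a + b

        a = torch.randn(8, device=self.DEVICE)
        b = torch.randn(8, device=self.DEVICE)
        torch.testing.assert_close(self._compile(fn)(a, b), fn(a, b))


class PallasBackendTestsCPU(_PallasBackendMixin, unittest.TestCase):
    DEVICE = "cpu"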
@@ -41,7 +41,6 @@ from torch.testing._internal.common_utils import (
IS_LINUX,
parametrize,
skipIfRocm,
skipIfXpu,
)
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
from torch.utils import _pytree as pytree
@@ -1217,6 +1216,43 @@ class TestPatternMatcher(TestCase):
_, (code) = run_and_get_code(fn2, args[0], args[1], args[2])
FileCheck().check_not("extern_kernels.addmm(").run(code[0])

def test_addmm_alpha_beta_with_pointwise(self):
# Test that addmm with alpha/beta != 1 is unfused correctly with pointwise ops
# See https://github.com/pytorch/pytorch/issues/167313
x = torch.rand(2, device=GPU_TYPE)
a = torch.rand(2, 3, device=GPU_TYPE)
b = torch.rand(3, 2, device=GPU_TYPE)

def f(x, a, b):
return torch.nn.functional.relu(torch.addmm(x, a, b, alpha=0.8, beta=0.2))

fc = torch.compile(f)

expected = f(x, a, b)
actual = fc(x, a, b)

# The compiled version should produce the same result as eager
torch.testing.assert_close(actual, expected)

# Verify that addmm is unfused (should not use extern_kernels.addmm)
# The pattern should be replaced with beta * x + alpha * (a @ b)
_, (code) = run_and_get_code(fc, x, a, b)
FileCheck().check_not("extern_kernels.addmm(").run(code[0])

# Test with alpha=1, beta=1 (default) - should also unfuse
def f_default(x, a, b):
return torch.nn.functional.relu(torch.addmm(x, a, b))

fc_default = torch.compile(f_default)
expected_default = f_default(x, a, b)
actual_default = fc_default(x, a, b)

torch.testing.assert_close(actual_default, expected_default)

# Should unfuse and not use extern_kernels.addmm
_, (code) = run_and_get_code(fc_default, x, a, b)
FileCheck().check_not("extern_kernels.addmm(").run(code[0])

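The new test leans on the identity the unfusion pass exploits: `torch.addmm(x, a, b, alpha=α, beta=β)` equals `beta * x + alpha * (a @ b)`, which lets the matmul output fuse with the following `relu` instead of going through `extern_kernels.addmm`. A quick numerical check of that identity (shapes and scalars chosen arbitrarily for the sketch):

import torch

x = torch.rand(2)
a = torch.rand(2, 3)
b = torch.rand(3, 2)

fused = torch.addmm(x, a, b, alpha=0.8, beta=0.2)
unfused = 0.2 * x + 0.8 * (a @ b)  # the decomposed form the pattern rewrites to
torch.testing.assert_close(fused, unfused)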
def test_serialized_patterns_up_to_date(self):
import torch.utils._pytree as pytree
from torch._inductor.fx_passes import joint_graph
@@ -1261,7 +1297,6 @@ class TestPatternMatcher(TestCase):
# of search_fn).
self.assertTrue(pattern.pattern_eq(search_fn_pattern))

@skipIfXpu
@xfailIfSM89
@inductor_config.patch(
{

@@ -12,8 +12,12 @@ import torch._inductor.utils
from torch import _dynamo as torchdynamo
from torch._inductor import config
from torch.profiler import ProfilerActivity
from torch.testing._internal.common_utils import TemporaryFileName
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON, IS_BIG_GPU
from torch.testing._internal.common_utils import skipIfXpu, TemporaryFileName
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU_AND_TRITON,
IS_BIG_GPU,
)
from torch.torch_version import TorchVersion
from torch.utils._triton import has_triton

@@ -22,6 +26,10 @@ HAS_TRITON = has_triton()


class DynamoProfilerTests(torch._inductor.test_case.TestCase):
@skipIfXpu(
msg="AssertionError: False is not true, "
"https://github.com/intel/torch-xpu-ops/issues/2335"
)
@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
def test_inductor_profiling_triton_launch(self):
# Verify that we get some sort of CPU-side indication of triton kernel launches
@@ -31,7 +39,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
def fn(x, y):
return (x + y).sin().cos()

x, y = (torch.rand((4, 4), device="cuda") for _ in range(2))
x, y = (torch.rand((4, 4), device=GPU_TYPE) for _ in range(2))

with torch.profiler.profile() as prof:
fn(x, y)
@@ -95,7 +103,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
def fn(x, y):
return (x + y).sin().cos()

args = [torch.rand((4, 4), device="cuda") for _ in range(2)]
args = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(2)]

events = self._test_profiling_kernel_names(fn, args, "sin")
event_found = False
@@ -120,7 +128,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
def fn(x, y):
return x @ y

args = [torch.rand((4, 4), device="cuda") for _ in range(2)]
args = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(2)]

def check_fn():
# test_profiling_kernel_names will check this before asserting mm is in the trace.
@@ -153,8 +161,8 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
def fn(x, y):
return torch._foreach_add(x, y)

x = [torch.rand((4, 4), device="cuda") for _ in range(3)]
y = [torch.rand((4, 4), device="cuda") for _ in range(3)]
x = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
y = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]

args = (x, y)

@@ -206,8 +214,8 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
def fn(x, y):
return torch._foreach_add(x, y)

x = [torch.rand((4, 4), device="cuda") for _ in range(3)]
y = [torch.rand((4, 4), device="cuda") for _ in range(3)]
x = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
y = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]

args = (x, y)
fn_opt = torch.compile(fn)
@@ -216,11 +224,14 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
self.assertTrue(hooks_called["enter"])
self.assertTrue(hooks_called["exit"])

@skipIfXpu(
msg="TypeError: list indices must be integers or slices, not str, https://github.com/intel/torch-xpu-ops/issues/2335"
)
@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
def test_pt2_triton_attributes(self):
from torch._inductor.codecache import code_hash

device = "cuda"
device = GPU_TYPE
debug = False # set to True to get output file

@torchdynamo.optimize("inductor")
@@ -295,7 +306,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):

@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
def test_cupti_lazy_reinit(self):
x, y = (torch.randn(4, 4, device="cuda") for _ in range(2))
x, y = (torch.randn(4, 4, device=GPU_TYPE) for _ in range(2))

def fn(x, y):
return (x + y).sin()
@@ -314,5 +325,5 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

if HAS_CUDA_AND_TRITON:
if HAS_GPU_AND_TRITON:
run_tests()

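The profiler tests above only swap the device string for `GPU_TYPE`; the underlying check stays the same: run the compiled function under `torch.profiler.profile()` and look for an expected substring among the recorded event names. A hedged sketch of that check (the helper name and warm-up policy are assumptions, not the suite's `_test_profiling_kernel_names`):

import torch


def kernel_name_seen(fn, args, needle):
    fn(*args)  # warm-up so compilation work stays out of the trace
    with torch.profiler.profile() as prof:
        fn(*args)
    # FunctionEvent.name carries the kernel/op label recorded by the profiler.
    return any(needle in evt.name for evt in prof.events())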
@@ -28,7 +28,11 @@ from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import run_and_get_code, run_and_get_cpp_code
from torch._inductor.virtualized import V
from torch.testing._internal.common_utils import IS_MACOS
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu_and_triton,
)


try:
@@ -70,8 +74,8 @@ class Model2(torch.nn.Module):
class Model3(torch.nn.Module):
def __init__(self, n, k):
super().__init__()
self.weight = torch.randn(n, k, device="cuda")
self.bias = torch.randn(n, device="cuda")
self.weight = torch.randn(n, k, device=GPU_TYPE)
self.bias = torch.randn(n, device=GPU_TYPE)

def forward(self, a):
return torch.nn.functional.linear(a, self.weight, self.bias)
@@ -151,7 +155,7 @@ class TestProvenanceTracingArtifact(TestCase):
m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0])
self.assertTrue(m)
filepath = Path(m.group(1))
if device == "cuda":
if device == "cuda" or device == "xpu":
expected_mapping = [
(
"cppCodeToPost",
@@ -201,13 +205,20 @@ class TestProvenanceTracingArtifact(TestCase):
},
),
]
if backend == "aot_inductor":
if backend == "aot_inductor" and device == "cuda":
expected_mapping[0][1]["aoti_torch_cuda_mm_out:2"] = [
"mm_default"
]
expected_mapping[1][1]["mm_default"] = [
"aoti_torch_cuda_mm_out:2"
]
elif backend == "aot_inductor" and device == "xpu":
expected_mapping[0][1]["aoti_torch_xpu_mm_out:2"] = [
"mm_default"
]
expected_mapping[1][1]["mm_default"] = [
"aoti_torch_xpu_mm_out:2"
]
else:
expected_mapping[0][1]["extern_kernels.mm:2"] = [
"mm_default"
@@ -254,21 +265,21 @@ class TestProvenanceTracingArtifact(TestCase):
if filepath:
shutil.rmtree(filepath)

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_triton_kernel_to_post_grad_tracing_cuda(self):
self._test_triton_kernel_to_post_grad_tracing(device="cuda")
self._test_triton_kernel_to_post_grad_tracing(device=GPU_TYPE)

def test_triton_kernel_to_post_grad_tracing_cpu(self):
self._test_triton_kernel_to_post_grad_tracing(device="cpu")

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_triton_kernel_to_post_grad_tracing_extern_kernel(self):
M = 8
N = 6
K = 16
model = Model3(N, K)
batch = 2
a = torch.randn(batch, M, K, device="cuda")
a = torch.randn(batch, M, K, device=GPU_TYPE)
example_inputs = (a,)
filepath = None

@@ -302,9 +313,10 @@ class TestProvenanceTracingArtifact(TestCase):
else:
# backend = aot_inductor
expected_data = {
"aoti_torch_cuda_addmm_out:2": ["addmm"],
f"aoti_torch_{GPU_TYPE}_addmm_out:2": ["addmm"],
"triton_poi_fused_0:1": ["_tensor_constant1"],
}

self._check_provenance_tracing_kernel_to_post_grad(
filepath, expected_data
)
@@ -312,12 +324,12 @@ class TestProvenanceTracingArtifact(TestCase):
if filepath:
shutil.rmtree(filepath)

@requires_cuda_and_triton
@requires_gpu_and_triton
def _test_pt_tracing_combo_kernel(self, backend):
"""This test checks that generated provenance tracing artifact from triton combo kernel to post grad nodes"""
a = torch.randn(10, 10, device="cuda")
b = torch.randn(20, 20, device="cuda")
c = torch.randn(10, 10, device="cuda")
a = torch.randn(10, 10, device=GPU_TYPE)
b = torch.randn(20, 20, device=GPU_TYPE)
c = torch.randn(10, 10, device=GPU_TYPE)
example_inputs = (a, b, c)

model = Model2()
@@ -348,7 +360,7 @@ class TestProvenanceTracingArtifact(TestCase):
expected_data = {"triton_poi_fused_0:1": ["relu", "sigmoid", "tanh"]}
self._check_provenance_tracing_kernel_to_post_grad(filepath, expected_data)

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_triton_kernel_to_post_grad_tracing_combo_kernel(self):
self._test_pt_tracing_combo_kernel(backend="inductor")
self._test_pt_tracing_combo_kernel(backend="aot_inductor")
@@ -465,7 +477,7 @@ class TestProvenanceTracingNodeMeta(TestCase):
"""
return next(iter([node for node in gm.graph.nodes if node.target == target]))

@requires_cuda_and_triton # test only works for cuda pattern matcher
@requires_gpu_and_triton # test only works for cuda pattern matcher
def test_pattern_matcher_transfer_meta(self):
"""
Test that stack trace is transfered when node is decomposed in post_grad_passes
@@ -484,9 +496,9 @@ class TestProvenanceTracingNodeMeta(TestCase):
x = self.sigmoid(x)
return x * 3

x = torch.randn(8, 10).to("cuda")
x = torch.randn(8, 10).to(GPU_TYPE)
example_inputs = (x,)
model = Model().to("cuda")
model = Model().to(GPU_TYPE)

# mimic the before_post_grad graph
ep = torch.export.export(model, example_inputs).run_decompositions()
@@ -546,9 +558,9 @@ class TestProvenanceTracingStackTraces(TestCase):
return s.split("\n")[i].strip()

@torch._inductor.config.patch({"trace.provenance_tracking_level": 2})
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_tlparse_kernel_stack_traces(self):
device = "cuda"
device = GPU_TYPE
model = Model4().to(device)
x = torch.randn(8, 10).to(device)
a = torch.randn(10, 20).to(device)
@@ -642,16 +654,16 @@ class TestProvenanceTracingStackTraces(TestCase):
for item in data[field]:
self.assertIsInstance(item, str)

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch("trace.provenance_tracking_level", 1)
def test_kernel_information_generation(self):
"""Test basic kernel information generation in AOTI packages."""

model = Model4().to("cuda")
x = torch.randn(8, 10, device="cuda")
a = torch.randn(10, 20, device="cuda")
b = torch.randn(20, 30, device="cuda")
c = torch.randn(10, 30, device="cuda")
model = Model4().to(GPU_TYPE)
x = torch.randn(8, 10, device=GPU_TYPE)
a = torch.randn(10, 20, device=GPU_TYPE)
b = torch.randn(20, 30, device=GPU_TYPE)
c = torch.randn(10, 30, device=GPU_TYPE)
inputs = (x, a, b, c)

with tempfile.TemporaryDirectory() as temp_dir:
@@ -712,14 +724,14 @@ class TestProvenanceTracingStackTraces(TestCase):
],
"pre_grad_nodes": ["gelu", "addmm"],
},
"aoti_torch_cuda_mm_out:1": {
f"aoti_torch_{GPU_TYPE}_mm_out:1": {
"stack_traces": [
"x = self.fc1(x)",
],
"post_grad_nodes": ["mm_default_1"],
"pre_grad_nodes": ["linear"],
},
"aoti_torch_cuda_mm_out:4": {
f"aoti_torch_{GPU_TYPE}_mm_out:4": {
"stack_traces": [
"y = torch.addmm(c, d, b)",
],

@@ -29,7 +29,7 @@ from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import is_big_gpu, run_and_get_kernels
from torch._inductor.virtualized import V
from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND
from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm, skipIfXpu
from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU,
@@ -180,7 +180,6 @@ class TestSelectAlgorithm(TestCase):
self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)

@patches
@skipIfXpu(msg="Double datatype matmul is not supported in oneDNN")
def test_mm_skip(self):
@torch.compile
def foo(a, b):
@@ -249,7 +248,6 @@ class TestSelectAlgorithm(TestCase):

# TODO: fix accuracy failure of the triton template on XPU.
# and enable this test case.
@skipIfXpu
@patches
def test_mm_plus_mm2(self):
@torch.compile

@@ -5,7 +5,7 @@ import torch._inductor
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import requires_gpu_and_triton


try:
@@ -248,7 +248,7 @@ class TestSplitCatAten(TestCase):
self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
)

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@@ -291,7 +291,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@@ -317,7 +317,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@@ -342,7 +342,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={

Some files were not shown because too many files have changed in this diff.