mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-15 14:54:56 +08:00
Compare commits
2 Commits
malfet-pat
...
ciflow/tru
| Author | SHA1 | Date | |
|---|---|---|---|
| 35b5994ed8 | |||
| 4e2045e211 |
@ -36,7 +36,11 @@ case ${DOCKER_TAG_PREFIX} in
|
|||||||
;;
|
;;
|
||||||
rocm*)
|
rocm*)
|
||||||
BASE_TARGET=rocm
|
BASE_TARGET=rocm
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||||
|
fi
|
||||||
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@ -168,18 +168,6 @@ case "$tag" in
|
|||||||
VISION=yes
|
VISION=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3.11-clang12)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.11
|
|
||||||
CLANG_VERSION=12
|
|
||||||
VISION=no
|
|
||||||
TRITON=no
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-py3.12-clang12)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
CLANG_VERSION=12
|
|
||||||
VISION=no
|
|
||||||
TRITON=no
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
|
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
|
||||||
if [[ $tag =~ "jammy" ]]; then
|
if [[ $tag =~ "jammy" ]]; then
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
@ -207,9 +195,9 @@ case "$tag" in
|
|||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
|
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=13
|
GCC_VERSION=11
|
||||||
VISION=yes
|
VISION=yes
|
||||||
XPU_VERSION=2025.2
|
XPU_VERSION=2025.2
|
||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
@ -260,12 +248,6 @@ case "$tag" in
|
|||||||
HALIDE=yes
|
HALIDE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-cuda12.8-py3.12-pallas)
|
|
||||||
CUDA_VERSION=12.8.1
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
GCC_VERSION=11
|
|
||||||
PALLAS=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-py3.12-triton-cpu)
|
pytorch-linux-jammy-py3.12-triton-cpu)
|
||||||
CUDA_VERSION=12.6
|
CUDA_VERSION=12.6
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
ANACONDA_PYTHON_VERSION=3.12
|
||||||
@ -387,7 +369,6 @@ docker build \
|
|||||||
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
||||||
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
||||||
--build-arg "HALIDE=${HALIDE}" \
|
--build-arg "HALIDE=${HALIDE}" \
|
||||||
--build-arg "PALLAS=${PALLAS}" \
|
|
||||||
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
||||||
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
||||||
--build-arg "ACL=${ACL:-}" \
|
--build-arg "ACL=${ACL:-}" \
|
||||||
|
|||||||
@ -1 +0,0 @@
|
|||||||
0.8.0
|
|
||||||
@ -1,40 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
|
||||||
|
|
||||||
# Get the pinned JAX version (same for all CUDA versions)
|
|
||||||
JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
|
|
||||||
|
|
||||||
function install_jax_12() {
|
|
||||||
echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
|
|
||||||
pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
python -c "import jax" # check for errors
|
|
||||||
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_jax_13() {
|
|
||||||
echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
|
|
||||||
pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
python -c "import jax" # check for errors
|
|
||||||
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
|
|
||||||
}
|
|
||||||
|
|
||||||
# idiomatic parameter and option handling in sh
|
|
||||||
while test $# -gt 0
|
|
||||||
do
|
|
||||||
case "$1" in
|
|
||||||
12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
|
|
||||||
;;
|
|
||||||
13.0|13.0.*) install_jax_13;
|
|
||||||
;;
|
|
||||||
*) echo "bad argument $1"; exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
@ -9,7 +9,7 @@ set -xe
|
|||||||
|
|
||||||
function install_ubuntu() {
|
function install_ubuntu() {
|
||||||
. /etc/os-release
|
. /etc/os-release
|
||||||
if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
|
if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
|
||||||
echo "Ubuntu version ${VERSION_CODENAME} not supported"
|
echo "Ubuntu version ${VERSION_CODENAME} not supported"
|
||||||
exit
|
exit
|
||||||
fi
|
fi
|
||||||
@ -35,24 +35,25 @@ function install_ubuntu() {
|
|||||||
# The xpu-smi packages
|
# The xpu-smi packages
|
||||||
apt-get install -y flex bison xpu-smi
|
apt-get install -y flex bison xpu-smi
|
||||||
|
|
||||||
# Compute and Media Runtimes
|
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
||||||
if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
|
# Compute and Media Runtimes
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
intel-opencl-icd intel-level-zero-gpu level-zero \
|
||||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
|
||||||
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||||
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
|
||||||
else # jammy
|
# Development Packages
|
||||||
|
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
|
||||||
|
else # rolling driver
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
intel-opencl-icd libze-intel-gpu1 libze1 \
|
||||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
||||||
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||||
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
||||||
|
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
|
||||||
fi
|
fi
|
||||||
# Development Packages
|
|
||||||
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
|
|
||||||
|
|
||||||
# Install Intel Support Packages
|
# Install Intel Support Packages
|
||||||
apt-get install -y ${XPU_PACKAGES}
|
apt-get install -y ${XPU_PACKAGES}
|
||||||
@ -65,7 +66,7 @@ function install_ubuntu() {
|
|||||||
function install_rhel() {
|
function install_rhel() {
|
||||||
. /etc/os-release
|
. /etc/os-release
|
||||||
if [[ "${ID}" == "rhel" ]]; then
|
if [[ "${ID}" == "rhel" ]]; then
|
||||||
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
||||||
echo "RHEL version ${VERSION_ID} not supported"
|
echo "RHEL version ${VERSION_ID} not supported"
|
||||||
exit
|
exit
|
||||||
fi
|
fi
|
||||||
@ -146,7 +147,7 @@ function install_sles() {
|
|||||||
XPU_DRIVER_VERSION=""
|
XPU_DRIVER_VERSION=""
|
||||||
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
||||||
# Use GPU driver LTS releases
|
# Use GPU driver LTS releases
|
||||||
XPU_DRIVER_VERSION="/lts/2523"
|
XPU_DRIVER_VERSION="/lts/2350"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
|
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
|
||||||
|
|||||||
@ -49,7 +49,11 @@ case ${DOCKER_TAG_PREFIX} in
|
|||||||
fi
|
fi
|
||||||
BASE_TARGET=rocm
|
BASE_TARGET=rocm
|
||||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||||
|
fi
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@ -87,7 +87,11 @@ case ${image} in
|
|||||||
MANY_LINUX_VERSION="2_28"
|
MANY_LINUX_VERSION="2_28"
|
||||||
DEVTOOLSET_VERSION="11"
|
DEVTOOLSET_VERSION="11"
|
||||||
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||||
|
fi
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
||||||
;;
|
;;
|
||||||
manylinux2_28-builder:xpu)
|
manylinux2_28-builder:xpu)
|
||||||
|
|||||||
@ -143,15 +143,6 @@ COPY ci_commit_pins/halide.txt halide.txt
|
|||||||
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
||||||
RUN rm install_halide.sh common_utils.sh halide.txt
|
RUN rm install_halide.sh common_utils.sh halide.txt
|
||||||
|
|
||||||
ARG PALLAS
|
|
||||||
ARG CUDA_VERSION
|
|
||||||
# Install JAX with CUDA support (for Pallas)
|
|
||||||
COPY ./common/install_jax.sh install_jax.sh
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
|
|
||||||
RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
|
|
||||||
RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
|
|
||||||
|
|
||||||
ARG ONNX
|
ARG ONNX
|
||||||
# Install ONNX dependencies
|
# Install ONNX dependencies
|
||||||
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
||||||
|
|||||||
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod
|
|||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from collections.abc import Callable # Python 3.11+
|
from typing import Any, Callable, Required, TypedDict # Python 3.11+
|
||||||
from typing import Any, Required, TypedDict
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from collections.abc import Callable
|
from typing import Any, Callable, TypedDict
|
||||||
from typing import Any, TypedDict
|
|
||||||
|
|
||||||
from typing_extensions import Required # Fallback for Python <3.11
|
from typing_extensions import Required # Fallback for Python <3.11
|
||||||
|
|
||||||
|
|||||||
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
|||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/umf/latest/env/vars.sh
|
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
|
||||||
# Enable XCCL build
|
# Enable XCCL build
|
||||||
export USE_XCCL=1
|
export USE_XCCL=1
|
||||||
export USE_MPI=0
|
export USE_MPI=0
|
||||||
|
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
|
||||||
|
export USE_KINETO=0
|
||||||
export TORCH_XPU_ARCH_LIST=pvc
|
export TORCH_XPU_ARCH_LIST=pvc
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
|||||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
|
||||||
# Check XPU status before testing
|
# Check XPU status before testing
|
||||||
timeout 30 xpu-smi discovery || true
|
timeout 30 xpu-smi discovery || true
|
||||||
fi
|
fi
|
||||||
@ -826,11 +824,6 @@ test_inductor_halide() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_pallas() {
|
|
||||||
python test/run_test.py --include inductor/test_pallas.py --verbose
|
|
||||||
assert_git_not_dirty
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_triton_cpu() {
|
test_inductor_triton_cpu() {
|
||||||
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
|
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
@ -1731,8 +1724,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
|||||||
test_inductor_distributed
|
test_inductor_distributed
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
||||||
test_inductor_halide
|
test_inductor_halide
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
|
|
||||||
test_inductor_pallas
|
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
|
||||||
test_inductor_triton_cpu
|
test_inductor_triton_cpu
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
||||||
|
|||||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
|||||||
ccb801b88af136454798b945175c4c87e636ac33
|
cfbc5c2f1c798991715a6b06bb3ce46478c4487c
|
||||||
|
|||||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
|||||||
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
|
c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
|
||||||
|
|||||||
22
.github/labeler.yml
vendored
22
.github/labeler.yml
vendored
@ -138,8 +138,7 @@
|
|||||||
- test/test_matmul_cuda.py
|
- test/test_matmul_cuda.py
|
||||||
- test/test_scaled_matmul_cuda.py
|
- test/test_scaled_matmul_cuda.py
|
||||||
- test/inductor/test_fp8.py
|
- test/inductor/test_fp8.py
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
- aten/src/ATen/native/cuda/Blas.cpp
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/**/*cublas*
|
- torch/**/*cublas*
|
||||||
- torch/_inductor/kernel/mm.py
|
- torch/_inductor/kernel/mm.py
|
||||||
- test/inductor/test_max_autotune.py
|
- test/inductor/test_max_autotune.py
|
||||||
@ -149,8 +148,7 @@
|
|||||||
- test/test_matmul_cuda.py
|
- test/test_matmul_cuda.py
|
||||||
- test/test_scaled_matmul_cuda.py
|
- test/test_scaled_matmul_cuda.py
|
||||||
- test/inductor/test_fp8.py
|
- test/inductor/test_fp8.py
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
- aten/src/ATen/native/cuda/Blas.cpp
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/**/*cublas*
|
- torch/**/*cublas*
|
||||||
- torch/_inductor/kernel/mm.py
|
- torch/_inductor/kernel/mm.py
|
||||||
- test/inductor/test_max_autotune.py
|
- test/inductor/test_max_autotune.py
|
||||||
@ -160,21 +158,7 @@
|
|||||||
- test/test_matmul_cuda.py
|
- test/test_matmul_cuda.py
|
||||||
- test/test_scaled_matmul_cuda.py
|
- test/test_scaled_matmul_cuda.py
|
||||||
- test/inductor/test_fp8.py
|
- test/inductor/test_fp8.py
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
- aten/src/ATen/native/cuda/Blas.cpp
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/_inductor/kernel/mm.py
|
- torch/_inductor/kernel/mm.py
|
||||||
- test/inductor/test_max_autotune.py
|
- test/inductor/test_max_autotune.py
|
||||||
- third_party/fbgemm
|
- third_party/fbgemm
|
||||||
|
|
||||||
"ciflow/mps":
|
|
||||||
- aten/src/ATen/mps/**
|
|
||||||
- aten/src/ATen/native/mps/**
|
|
||||||
- torch/_inductor/codegen/mps.py
|
|
||||||
- test/test_mps.py
|
|
||||||
- test/inductor/test_mps_basic.py
|
|
||||||
|
|
||||||
"ciflow/h100-symm-mem":
|
|
||||||
- torch/csrc/distributed/c10d/symm_mem/**
|
|
||||||
- torch/distributed/_symmetric_memory/**
|
|
||||||
- test/distributed/**/*mem*
|
|
||||||
- test/distributed/**/*mem*/**
|
|
||||||
|
|||||||
1
.github/nitpicks.yml
vendored
1
.github/nitpicks.yml
vendored
@ -10,4 +10,3 @@
|
|||||||
pathFilter:
|
pathFilter:
|
||||||
- 'torch/csrc/inductor/aoti_torch/c/*'
|
- 'torch/csrc/inductor/aoti_torch/c/*'
|
||||||
- 'torch/csrc/inductor/aoti_torch/generated/*'
|
- 'torch/csrc/inductor/aoti_torch/generated/*'
|
||||||
- 'torch/csrc/stable/c/*'
|
|
||||||
|
|||||||
6
.github/pytorch-probot.yml
vendored
6
.github/pytorch-probot.yml
vendored
@ -2,8 +2,8 @@ tracking_issue: 24422
|
|||||||
ciflow_tracking_issue: 64124
|
ciflow_tracking_issue: 64124
|
||||||
ciflow_push_tags:
|
ciflow_push_tags:
|
||||||
- ciflow/b200
|
- ciflow/b200
|
||||||
- ciflow/b200-distributed
|
|
||||||
- ciflow/b200-symm-mem
|
- ciflow/b200-symm-mem
|
||||||
|
- ciflow/b200-distributed
|
||||||
- ciflow/binaries
|
- ciflow/binaries
|
||||||
- ciflow/binaries_libtorch
|
- ciflow/binaries_libtorch
|
||||||
- ciflow/binaries_wheel
|
- ciflow/binaries_wheel
|
||||||
@ -22,8 +22,6 @@ ciflow_push_tags:
|
|||||||
- ciflow/inductor-perf-test-nightly-xpu
|
- ciflow/inductor-perf-test-nightly-xpu
|
||||||
- ciflow/inductor-periodic
|
- ciflow/inductor-periodic
|
||||||
- ciflow/inductor-rocm
|
- ciflow/inductor-rocm
|
||||||
- ciflow/inductor-rocm-mi200
|
|
||||||
- ciflow/inductor-rocm-mi300
|
|
||||||
- ciflow/linux-aarch64
|
- ciflow/linux-aarch64
|
||||||
- ciflow/mps
|
- ciflow/mps
|
||||||
- ciflow/nightly
|
- ciflow/nightly
|
||||||
@ -35,13 +33,11 @@ ciflow_push_tags:
|
|||||||
- ciflow/quantization-periodic
|
- ciflow/quantization-periodic
|
||||||
- ciflow/riscv64
|
- ciflow/riscv64
|
||||||
- ciflow/rocm
|
- ciflow/rocm
|
||||||
- ciflow/rocm-mi200
|
|
||||||
- ciflow/rocm-mi300
|
- ciflow/rocm-mi300
|
||||||
- ciflow/rocm-mi355
|
- ciflow/rocm-mi355
|
||||||
- ciflow/rocm-navi31
|
- ciflow/rocm-navi31
|
||||||
- ciflow/s390
|
- ciflow/s390
|
||||||
- ciflow/slow
|
- ciflow/slow
|
||||||
- ciflow/slow-rocm-mi200
|
|
||||||
- ciflow/torchbench
|
- ciflow/torchbench
|
||||||
- ciflow/triton_binaries
|
- ciflow/triton_binaries
|
||||||
- ciflow/trunk
|
- ciflow/trunk
|
||||||
|
|||||||
3
.github/scripts/delete_old_branches.py
vendored
3
.github/scripts/delete_old_branches.py
vendored
@ -1,11 +1,10 @@
|
|||||||
# Delete old branches
|
# Delete old branches
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from collections.abc import Callable
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any, Callable
|
||||||
|
|
||||||
from github_utils import gh_fetch_json_dict, gh_graphql
|
from github_utils import gh_fetch_json_dict, gh_graphql
|
||||||
from gitutils import GitRepo
|
from gitutils import GitRepo
|
||||||
|
|||||||
3
.github/scripts/filter_test_configs.py
vendored
3
.github/scripts/filter_test_configs.py
vendored
@ -8,11 +8,10 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Callable
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from logging import info
|
from logging import info
|
||||||
from typing import Any, Optional
|
from typing import Any, Callable, Optional
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|||||||
3
.github/scripts/get_workflow_job_id.py
vendored
3
.github/scripts/get_workflow_job_id.py
vendored
@ -11,8 +11,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import urllib
|
import urllib
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from collections.abc import Callable
|
from typing import Any, Callable, Optional
|
||||||
from typing import Any, Optional
|
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
3
.github/scripts/github_utils.py
vendored
3
.github/scripts/github_utils.py
vendored
@ -3,9 +3,8 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Callable
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, cast, Optional, Union
|
from typing import Any, Callable, cast, Optional, Union
|
||||||
from urllib.error import HTTPError
|
from urllib.error import HTTPError
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|||||||
4
.github/scripts/gitutils.py
vendored
4
.github/scripts/gitutils.py
vendored
@ -4,10 +4,10 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Iterator
|
from collections.abc import Iterator
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from typing import Any, cast, Optional, TypeVar, Union
|
from typing import Any, Callable, cast, Optional, TypeVar, Union
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
|
|||||||
3
.github/scripts/lintrunner.sh
vendored
3
.github/scripts/lintrunner.sh
vendored
@ -34,9 +34,6 @@ python3 torch/utils/data/datapipes/gen_pyi.py
|
|||||||
# Also check generated pyi files
|
# Also check generated pyi files
|
||||||
find torch -name '*.pyi' -exec git add --force -- "{}" +
|
find torch -name '*.pyi' -exec git add --force -- "{}" +
|
||||||
|
|
||||||
# Print current environment
|
|
||||||
python3 -m pip freeze
|
|
||||||
|
|
||||||
RC=0
|
RC=0
|
||||||
# Run lintrunner on all files
|
# Run lintrunner on all files
|
||||||
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
|
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
|
||||||
|
|||||||
4
.github/scripts/trymerge.py
vendored
4
.github/scripts/trymerge.py
vendored
@ -17,12 +17,12 @@ import re
|
|||||||
import time
|
import time
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
from typing import Any, cast, NamedTuple, Optional
|
from typing import Any, Callable, cast, NamedTuple, Optional
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|||||||
1
.github/workflows/b200-distributed.yml
vendored
1
.github/workflows/b200-distributed.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '10.0'
|
cuda-arch-list: '10.0'
|
||||||
|
|||||||
1
.github/workflows/b200-symm-mem.yml
vendored
1
.github/workflows/b200-symm-mem.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '10.0'
|
cuda-arch-list: '10.0'
|
||||||
|
|||||||
7
.github/workflows/docker-builds.yml
vendored
7
.github/workflows/docker-builds.yml
vendored
@ -56,8 +56,6 @@ jobs:
|
|||||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
|
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
|
||||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
||||||
pytorch-linux-jammy-py3.10-clang12,
|
pytorch-linux-jammy-py3.10-clang12,
|
||||||
pytorch-linux-jammy-py3.11-clang12,
|
|
||||||
pytorch-linux-jammy-py3.12-clang12,
|
|
||||||
pytorch-linux-jammy-py3.13-clang12,
|
pytorch-linux-jammy-py3.13-clang12,
|
||||||
pytorch-linux-jammy-py3.14-clang12,
|
pytorch-linux-jammy-py3.14-clang12,
|
||||||
pytorch-linux-jammy-rocm-n-py3,
|
pytorch-linux-jammy-rocm-n-py3,
|
||||||
@ -67,10 +65,9 @@ jobs:
|
|||||||
pytorch-linux-jammy-py3.10-gcc11,
|
pytorch-linux-jammy-py3.10-gcc11,
|
||||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
|
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
|
||||||
pytorch-linux-jammy-py3.12-halide,
|
pytorch-linux-jammy-py3.12-halide,
|
||||||
pytorch-linux-jammy-cuda12.8-py3.12-pallas,
|
|
||||||
pytorch-linux-jammy-xpu-n-1-py3,
|
pytorch-linux-jammy-xpu-n-1-py3,
|
||||||
pytorch-linux-noble-xpu-n-py3,
|
pytorch-linux-jammy-xpu-n-py3,
|
||||||
pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
|
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
|
||||||
pytorch-linux-jammy-py3-clang18-asan,
|
pytorch-linux-jammy-py3-clang18-asan,
|
||||||
pytorch-linux-jammy-py3-clang12-onnx,
|
pytorch-linux-jammy-py3-clang12-onnx,
|
||||||
pytorch-linux-jammy-linter,
|
pytorch-linux-jammy-linter,
|
||||||
|
|||||||
1
.github/workflows/h100-distributed.yml
vendored
1
.github/workflows/h100-distributed.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: "linux.c7i.12xlarge"
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '9.0'
|
cuda-arch-list: '9.0'
|
||||||
|
|||||||
@ -83,8 +83,8 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
|
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
|
||||||
runner: linux.c7i.12xlarge
|
runner: linux.c7i.12xlarge
|
||||||
test-matrix: |
|
test-matrix: |
|
||||||
{ include: [
|
{ include: [
|
||||||
@ -117,7 +117,7 @@ jobs:
|
|||||||
uses: ./.github/workflows/_xpu-test.yml
|
uses: ./.github/workflows/_xpu-test.yml
|
||||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||||
with:
|
with:
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
|
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
|
||||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||||
@ -137,7 +137,7 @@ jobs:
|
|||||||
uses: ./.github/workflows/_xpu-test.yml
|
uses: ./.github/workflows/_xpu-test.yml
|
||||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||||
with:
|
with:
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||||
|
|||||||
1
.github/workflows/inductor-rocm-mi300.yml
vendored
1
.github/workflows/inductor-rocm-mi300.yml
vendored
@ -7,7 +7,6 @@ on:
|
|||||||
- release/*
|
- release/*
|
||||||
tags:
|
tags:
|
||||||
- ciflow/inductor-rocm/*
|
- ciflow/inductor-rocm/*
|
||||||
- ciflow/inductor-rocm-mi300/*
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
|
|||||||
@ -7,7 +7,7 @@ on:
|
|||||||
branches:
|
branches:
|
||||||
- release/*
|
- release/*
|
||||||
tags:
|
tags:
|
||||||
- ciflow/inductor-rocm-mi200/*
|
- ciflow/inductor-rocm/*
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
26
.github/workflows/inductor-unittest.yml
vendored
26
.github/workflows/inductor-unittest.yml
vendored
@ -81,32 +81,6 @@ jobs:
|
|||||||
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
inductor-pallas-build:
|
|
||||||
name: inductor-pallas-build
|
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
|
||||||
needs: get-label-type
|
|
||||||
with:
|
|
||||||
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
|
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
|
|
||||||
cuda-arch-list: '8.9'
|
|
||||||
runner: linux.8xlarge.memory
|
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
|
||||||
test-matrix: |
|
|
||||||
{ include: [
|
|
||||||
{ config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
|
|
||||||
]}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
inductor-pallas-test:
|
|
||||||
name: inductor-pallas-test
|
|
||||||
uses: ./.github/workflows/_linux-test.yml
|
|
||||||
needs: inductor-pallas-build
|
|
||||||
with:
|
|
||||||
build-environment: linux-jammy-py3.12-gcc11
|
|
||||||
docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
|
|
||||||
test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
inductor-triton-cpu-build:
|
inductor-triton-cpu-build:
|
||||||
name: inductor-triton-cpu-build
|
name: inductor-triton-cpu-build
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
|
|||||||
8
.github/workflows/nightly.yml
vendored
8
.github/workflows/nightly.yml
vendored
@ -5,11 +5,9 @@ on:
|
|||||||
- cron: 0 0 * * *
|
- cron: 0 0 * * *
|
||||||
push:
|
push:
|
||||||
tags:
|
tags:
|
||||||
# NOTE: Doc build pipelines should only get triggered on:
|
# NOTE: Doc build pipelines should only get triggered on release candidate builds
|
||||||
# Major or minor release candidates builds
|
# Release candidate tags look like: v1.11.0-rc1
|
||||||
- v[0-9]+.[0-9]+.0+-rc[0-9]+
|
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
|
||||||
# Final RC for major, minor and patch releases
|
|
||||||
- v[0-9]+.[0-9]+.[0-9]+
|
|
||||||
- ciflow/nightly/*
|
- ciflow/nightly/*
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
|
|||||||
1
.github/workflows/periodic-rocm-mi200.yml
vendored
1
.github/workflows/periodic-rocm-mi200.yml
vendored
@ -11,6 +11,7 @@ on:
|
|||||||
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
||||||
push:
|
push:
|
||||||
tags:
|
tags:
|
||||||
|
- ciflow/periodic/*
|
||||||
- ciflow/periodic-rocm-mi200/*
|
- ciflow/periodic-rocm-mi200/*
|
||||||
branches:
|
branches:
|
||||||
- release/*
|
- release/*
|
||||||
|
|||||||
1
.github/workflows/periodic-rocm-mi300.yml
vendored
1
.github/workflows/periodic-rocm-mi300.yml
vendored
@ -11,7 +11,6 @@ on:
|
|||||||
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
||||||
push:
|
push:
|
||||||
tags:
|
tags:
|
||||||
- ciflow/periodic/*
|
|
||||||
- ciflow/periodic-rocm-mi300/*
|
- ciflow/periodic-rocm-mi300/*
|
||||||
branches:
|
branches:
|
||||||
- release/*
|
- release/*
|
||||||
|
|||||||
8
.github/workflows/pull.yml
vendored
8
.github/workflows/pull.yml
vendored
@ -342,16 +342,16 @@ jobs:
|
|||||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
linux-noble-xpu-n-py3_10-build:
|
linux-jammy-xpu-n-py3_10-build:
|
||||||
name: linux-noble-xpu-n-py3.10
|
name: linux-jammy-xpu-n-py3.10
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
# This should sync with the build in xpu.yml but xpu uses a larger runner
|
# This should sync with the build in xpu.yml but xpu uses a larger runner
|
||||||
# sync-tag: linux-xpu-n-build
|
# sync-tag: linux-xpu-n-build
|
||||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
|
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||||
test-matrix: |
|
test-matrix: |
|
||||||
{ include: [
|
{ include: [
|
||||||
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
|
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
|
||||||
|
|||||||
1
.github/workflows/rocm-mi300.yml
vendored
1
.github/workflows/rocm-mi300.yml
vendored
@ -6,7 +6,6 @@ on:
|
|||||||
- main
|
- main
|
||||||
- release/*
|
- release/*
|
||||||
tags:
|
tags:
|
||||||
- ciflow/rocm/*
|
|
||||||
- ciflow/rocm-mi300/*
|
- ciflow/rocm-mi300/*
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
|
|||||||
@ -5,7 +5,7 @@ on:
|
|||||||
branches:
|
branches:
|
||||||
- release/*
|
- release/*
|
||||||
tags:
|
tags:
|
||||||
- ciflow/rocm-mi200/*
|
- ciflow/rocm/*
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: 29 8 * * * # about 1:29am PDT
|
- cron: 29 8 * * * # about 1:29am PDT
|
||||||
81
.github/workflows/slow-rocm-mi200.yml
vendored
81
.github/workflows/slow-rocm-mi200.yml
vendored
@ -1,81 +0,0 @@
|
|||||||
# This workflow is dedicated to host slow jobs that are run only periodically because
|
|
||||||
# they are too slow to run in every commit. The list of slow tests can be found in
|
|
||||||
# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
|
|
||||||
name: slow-rocm-mi200
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- release/*
|
|
||||||
tags:
|
|
||||||
- ciflow/slow/*
|
|
||||||
- ciflow/slow-rocm-mi200/*
|
|
||||||
schedule:
|
|
||||||
- cron: 0 */3 * * *
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
llm-td:
|
|
||||||
if: github.repository_owner == 'pytorch'
|
|
||||||
name: before-test
|
|
||||||
uses: ./.github/workflows/llm_td_retrieval.yml
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
target-determination:
|
|
||||||
name: before-test
|
|
||||||
uses: ./.github/workflows/target_determination.yml
|
|
||||||
needs: llm-td
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
get-label-type:
|
|
||||||
name: get-label-type
|
|
||||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
|
||||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
|
||||||
with:
|
|
||||||
triggering_actor: ${{ github.triggering_actor }}
|
|
||||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
|
||||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
|
||||||
curr_ref_type: ${{ github.ref_type }}
|
|
||||||
|
|
||||||
linux-jammy-rocm-py3_10-build:
|
|
||||||
name: linux-jammy-rocm-py3.10
|
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
|
||||||
needs: get-label-type
|
|
||||||
with:
|
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
|
||||||
build-environment: linux-jammy-rocm-py3.10
|
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
|
||||||
sync-tag: rocm-build
|
|
||||||
test-matrix: |
|
|
||||||
{ include: [
|
|
||||||
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
|
|
||||||
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
|
|
||||||
]}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
linux-jammy-rocm-py3_10-test:
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
name: linux-jammy-rocm-py3.10
|
|
||||||
uses: ./.github/workflows/_rocm-test.yml
|
|
||||||
needs:
|
|
||||||
- linux-jammy-rocm-py3_10-build
|
|
||||||
- target-determination
|
|
||||||
with:
|
|
||||||
build-environment: linux-jammy-rocm-py3.10
|
|
||||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
|
||||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
|
||||||
secrets: inherit
|
|
||||||
30
.github/workflows/slow.yml
vendored
30
.github/workflows/slow.yml
vendored
@ -105,6 +105,36 @@ jobs:
|
|||||||
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
|
linux-jammy-rocm-py3_10-build:
|
||||||
|
name: linux-jammy-rocm-py3.10
|
||||||
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
|
needs: get-label-type
|
||||||
|
with:
|
||||||
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
build-environment: linux-jammy-rocm-py3.10
|
||||||
|
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||||
|
test-matrix: |
|
||||||
|
{ include: [
|
||||||
|
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
|
||||||
|
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
|
||||||
|
]}
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
|
linux-jammy-rocm-py3_10-test:
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
|
name: linux-jammy-rocm-py3.10
|
||||||
|
uses: ./.github/workflows/_rocm-test.yml
|
||||||
|
needs:
|
||||||
|
- linux-jammy-rocm-py3_10-build
|
||||||
|
- target-determination
|
||||||
|
with:
|
||||||
|
build-environment: linux-jammy-rocm-py3.10
|
||||||
|
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||||
|
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
linux-jammy-py3_10-clang18-asan-build:
|
linux-jammy-py3_10-clang18-asan-build:
|
||||||
name: linux-jammy-py3.10-clang18-asan
|
name: linux-jammy-py3.10-clang18-asan
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
|
|||||||
3
.github/workflows/test-b200.yml
vendored
3
.github/workflows/test-b200.yml
vendored
@ -52,6 +52,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '10.0'
|
cuda-arch-list: '10.0'
|
||||||
@ -72,4 +73,4 @@ jobs:
|
|||||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
|
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
|
||||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
|
||||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
1
.github/workflows/test-h100.yml
vendored
1
.github/workflows/test-h100.yml
vendored
@ -41,6 +41,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '9.0'
|
cuda-arch-list: '9.0'
|
||||||
|
|||||||
5
.github/workflows/upload-test-stats.yml
vendored
5
.github/workflows/upload-test-stats.yml
vendored
@ -11,16 +11,15 @@ on:
|
|||||||
- inductor
|
- inductor
|
||||||
- unstable
|
- unstable
|
||||||
- slow
|
- slow
|
||||||
- slow-rocm-mi200
|
|
||||||
- unstable-periodic
|
- unstable-periodic
|
||||||
- inductor-periodic
|
- inductor-periodic
|
||||||
- rocm-mi200
|
- rocm
|
||||||
- rocm-mi300
|
- rocm-mi300
|
||||||
- rocm-mi355
|
- rocm-mi355
|
||||||
- inductor-micro-benchmark
|
- inductor-micro-benchmark
|
||||||
- inductor-micro-benchmark-x86
|
- inductor-micro-benchmark-x86
|
||||||
- inductor-cu124
|
- inductor-cu124
|
||||||
- inductor-rocm-mi200
|
- inductor-rocm
|
||||||
- inductor-rocm-mi300
|
- inductor-rocm-mi300
|
||||||
- mac-mps
|
- mac-mps
|
||||||
- linux-aarch64
|
- linux-aarch64
|
||||||
|
|||||||
20
.github/workflows/xpu.yml
vendored
20
.github/workflows/xpu.yml
vendored
@ -47,15 +47,15 @@ jobs:
|
|||||||
]}
|
]}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
linux-noble-xpu-n-py3_10-build:
|
linux-jammy-xpu-n-py3_10-build:
|
||||||
name: linux-noble-xpu-n-py3.10
|
name: linux-jammy-xpu-n-py3.10
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
sync-tag: linux-xpu-n-build
|
sync-tag: linux-xpu-n-build
|
||||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
|
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||||
runner: linux.c7i.12xlarge
|
runner: linux.c7i.12xlarge
|
||||||
test-matrix: |
|
test-matrix: |
|
||||||
{ include: [
|
{ include: [
|
||||||
@ -74,17 +74,17 @@ jobs:
|
|||||||
]}
|
]}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
linux-noble-xpu-n-py3_10-test:
|
linux-jammy-xpu-n-py3_10-test:
|
||||||
name: linux-noble-xpu-n-py3.10
|
name: linux-jammy-xpu-n-py3.10
|
||||||
uses: ./.github/workflows/_xpu-test.yml
|
uses: ./.github/workflows/_xpu-test.yml
|
||||||
needs: linux-noble-xpu-n-py3_10-build
|
needs: linux-jammy-xpu-n-py3_10-build
|
||||||
permissions:
|
permissions:
|
||||||
id-token: write
|
id-token: write
|
||||||
contents: read
|
contents: read
|
||||||
with:
|
with:
|
||||||
build-environment: linux-noble-xpu-n-py3.10
|
build-environment: linux-jammy-xpu-n-py3.10
|
||||||
docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
|
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
|
||||||
test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
windows-xpu-n-1-build:
|
windows-xpu-n-1-build:
|
||||||
|
|||||||
@ -186,8 +186,6 @@ include_patterns = [
|
|||||||
'aten/src/ATen/native/nested/cuda/*.h',
|
'aten/src/ATen/native/nested/cuda/*.h',
|
||||||
'aten/src/ATen/native/nested/*.cpp',
|
'aten/src/ATen/native/nested/*.cpp',
|
||||||
'aten/src/ATen/native/nested/*.h',
|
'aten/src/ATen/native/nested/*.h',
|
||||||
'aten/src/ATen/xpu/**/*.h',
|
|
||||||
'aten/src/ATen/xpu/**/*.cpp',
|
|
||||||
'c10/**/*.cpp',
|
'c10/**/*.cpp',
|
||||||
'c10/**/*.h',
|
'c10/**/*.h',
|
||||||
'torch/*.h',
|
'torch/*.h',
|
||||||
@ -1404,7 +1402,7 @@ init_command = [
|
|||||||
'--dry-run={{DRYRUN}}',
|
'--dry-run={{DRYRUN}}',
|
||||||
'usort==1.0.8.post1',
|
'usort==1.0.8.post1',
|
||||||
'isort==6.0.1',
|
'isort==6.0.1',
|
||||||
'ruff==0.14.4', # sync with RUFF
|
'ruff==0.13.1', # sync with RUFF
|
||||||
]
|
]
|
||||||
is_formatter = true
|
is_formatter = true
|
||||||
|
|
||||||
@ -1539,7 +1537,7 @@ init_command = [
|
|||||||
'python3',
|
'python3',
|
||||||
'tools/linter/adapters/pip_init.py',
|
'tools/linter/adapters/pip_init.py',
|
||||||
'--dry-run={{DRYRUN}}',
|
'--dry-run={{DRYRUN}}',
|
||||||
'ruff==0.14.4', # sync with PYFMT
|
'ruff==0.13.1', # sync with PYFMT
|
||||||
]
|
]
|
||||||
is_formatter = true
|
is_formatter = true
|
||||||
|
|
||||||
|
|||||||
@ -736,44 +736,6 @@ if(NOT DEFINED USE_BLAS)
|
|||||||
set(USE_BLAS ON)
|
set(USE_BLAS ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Prioritized Text Linker Optimization
|
|
||||||
if(USE_PRIORITIZED_TEXT_FOR_LD)
|
|
||||||
|
|
||||||
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
|
|
||||||
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
|
|
||||||
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${Python_EXECUTABLE}
|
|
||||||
${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
|
|
||||||
--filein "${LINKER_SCRIPT_FILE_IN}"
|
|
||||||
--fout "${LINKER_SCRIPT_FILE_OUT}"
|
|
||||||
RESULT_VARIABLE _gen_result
|
|
||||||
OUTPUT_VARIABLE _gen_output
|
|
||||||
ERROR_VARIABLE _gen_error
|
|
||||||
)
|
|
||||||
|
|
||||||
if(NOT _gen_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR
|
|
||||||
"Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
|
|
||||||
append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
|
|
||||||
append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
|
|
||||||
append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
|
|
||||||
|
|
||||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
|
|
||||||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
|
|
||||||
|
|
||||||
else()
|
|
||||||
if(LINUX AND CPU_AARCH64)
|
|
||||||
message(WARNING [[
|
|
||||||
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
|
|
||||||
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
|
|
||||||
]])
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Build libtorch mobile library, which contains ATen/TH ops and native support
|
# Build libtorch mobile library, which contains ATen/TH ops and native support
|
||||||
# for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
|
# for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
|
||||||
if(INTERN_BUILD_MOBILE)
|
if(INTERN_BUILD_MOBILE)
|
||||||
@ -1440,6 +1402,9 @@ if(BUILD_JNI)
|
|||||||
add_subdirectory(android/pytorch_android)
|
add_subdirectory(android/pytorch_android)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
include(cmake/Summary.cmake)
|
||||||
|
caffe2_print_configuration_summary()
|
||||||
|
|
||||||
# Parse custom debug info
|
# Parse custom debug info
|
||||||
if(DEFINED USE_CUSTOM_DEBINFO)
|
if(DEFINED USE_CUSTOM_DEBINFO)
|
||||||
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
|
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
|
||||||
@ -1479,5 +1444,56 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
|
|||||||
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
include(cmake/Summary.cmake)
|
if(USE_PRIORITIZED_TEXT_FOR_LD)
|
||||||
caffe2_print_configuration_summary()
|
add_compile_options(
|
||||||
|
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
|
||||||
|
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
|
||||||
|
)
|
||||||
|
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
|
||||||
|
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
|
||||||
|
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
|
||||||
|
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
|
||||||
|
COMMENT "Generating prioritized text linker files"
|
||||||
|
VERBATIM
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||||
|
|
||||||
|
if(BUILD_PYTHON)
|
||||||
|
set(LINKER_OPT_TARGETS torch_python)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(NOT BUILD_LIBTORCHLESS)
|
||||||
|
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
|
||||||
|
if(USE_CUDA)
|
||||||
|
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
|
||||||
|
endif()
|
||||||
|
if(USE_XPU)
|
||||||
|
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
|
||||||
|
endif()
|
||||||
|
if(USE_ROCM)
|
||||||
|
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
|
||||||
|
if(TARGET ${tgt})
|
||||||
|
add_dependencies("${tgt}" generate_linker_script)
|
||||||
|
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
|
||||||
|
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||||
|
else()
|
||||||
|
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
|
||||||
|
endif()
|
||||||
|
endforeach()
|
||||||
|
|
||||||
|
else()
|
||||||
|
if(LINUX AND CPU_AARCH64)
|
||||||
|
message(WARNING [[
|
||||||
|
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
|
||||||
|
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
|
||||||
|
]])
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|||||||
@ -210,12 +210,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
|
|||||||
/test/inductor/test_flex_attention.py @drisspg
|
/test/inductor/test_flex_attention.py @drisspg
|
||||||
/test/inductor/test_flex_decoding.py @drisspg
|
/test/inductor/test_flex_decoding.py @drisspg
|
||||||
|
|
||||||
# Low Precision & Grouped GEMMs
|
# Low Precision GEMMs
|
||||||
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
|
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
|
||||||
/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
|
|
||||||
/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
|
|
||||||
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
|
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
|
||||||
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
|
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
|
||||||
/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
|
|
||||||
/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
|
|
||||||
/test/test_scaled_matmul_cuda.py @drisspg @slayton58
|
/test/test_scaled_matmul_cuda.py @drisspg @slayton58
|
||||||
|
|||||||
@ -18,8 +18,6 @@ Please report security issues using https://github.com/pytorch/pytorch/security/
|
|||||||
|
|
||||||
All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
|
All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
|
||||||
|
|
||||||
**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model.
|
|
||||||
|
|
||||||
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
|
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
|
||||||
|
|
||||||
https://www.facebook.com/whitehat
|
https://www.facebook.com/whitehat
|
||||||
|
|||||||
@ -94,11 +94,6 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
|
|||||||
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
|
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
|
|
||||||
c10::DeviceIndex device_index) {
|
|
||||||
const auto device_type = getAccelerator(true).value();
|
|
||||||
return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
|
|
||||||
}
|
|
||||||
} // namespace at::accelerator
|
} // namespace at::accelerator
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
|
|||||||
@ -226,8 +226,8 @@ template <
|
|||||||
typename B = HostBlock<S>>
|
typename B = HostBlock<S>>
|
||||||
struct CachingHostAllocatorImpl {
|
struct CachingHostAllocatorImpl {
|
||||||
virtual ~CachingHostAllocatorImpl() {
|
virtual ~CachingHostAllocatorImpl() {
|
||||||
if (active_) {
|
active_ = false;
|
||||||
active_ = false;
|
if (pinned_use_background_threads()) {
|
||||||
getBackgroundThreadPool()->waitWorkComplete();
|
getBackgroundThreadPool()->waitWorkComplete();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -260,7 +260,6 @@ struct CachingHostAllocatorImpl {
|
|||||||
if (pinned_use_background_threads()) {
|
if (pinned_use_background_threads()) {
|
||||||
// Launch the background thread and process events in a loop.
|
// Launch the background thread and process events in a loop.
|
||||||
static bool background_thread_flag [[maybe_unused]] = [this] {
|
static bool background_thread_flag [[maybe_unused]] = [this] {
|
||||||
active_ = true;
|
|
||||||
getBackgroundThreadPool()->run([&]() {
|
getBackgroundThreadPool()->run([&]() {
|
||||||
while (active_) {
|
while (active_) {
|
||||||
process_events();
|
process_events();
|
||||||
@ -684,9 +683,9 @@ struct CachingHostAllocatorImpl {
|
|||||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||||
|
|
||||||
// Indicates whether the event-processing thread pool is active.
|
// Indicates whether the object is active.
|
||||||
// Set to false in the destructor to signal background threads to stop.
|
// Set to false in the destructor to signal background threads to stop.
|
||||||
std::atomic<bool> active_{false};
|
std::atomic<bool> active_{true};
|
||||||
protected:
|
protected:
|
||||||
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
|
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -55,6 +55,14 @@ struct numeric_limits<int8_t> {
|
|||||||
static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
|
static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct numeric_limits<uint16_t> {
|
||||||
|
static inline __host__ __device__ uint16_t lowest() { return 0; }
|
||||||
|
static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
|
||||||
|
static inline __host__ __device__ uint16_t lower_bound() { return 0; }
|
||||||
|
static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
|
||||||
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct numeric_limits<int16_t> {
|
struct numeric_limits<int16_t> {
|
||||||
static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
|
static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
|
||||||
@ -63,6 +71,14 @@ struct numeric_limits<int16_t> {
|
|||||||
static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
|
static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct numeric_limits<uint32_t> {
|
||||||
|
static inline __host__ __device__ uint32_t lowest() { return 0; }
|
||||||
|
static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
|
||||||
|
static inline __host__ __device__ uint32_t lower_bound() { return 0; }
|
||||||
|
static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
|
||||||
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct numeric_limits<int32_t> {
|
struct numeric_limits<int32_t> {
|
||||||
static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
|
static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
|
||||||
@ -71,6 +87,21 @@ struct numeric_limits<int32_t> {
|
|||||||
static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
|
static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct numeric_limits<uint64_t> {
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||||
|
static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
|
||||||
|
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||||
|
static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
|
||||||
|
#else
|
||||||
|
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||||
|
static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
|
||||||
|
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||||
|
static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct numeric_limits<int64_t> {
|
struct numeric_limits<int64_t> {
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
|
|||||||
@ -157,8 +157,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
|
|||||||
DispatchKey::Negative,
|
DispatchKey::Negative,
|
||||||
DispatchKey::Conjugate,
|
DispatchKey::Conjugate,
|
||||||
DispatchKey::XLA,
|
DispatchKey::XLA,
|
||||||
DispatchKey::XPU,
|
|
||||||
DispatchKey::HPU,
|
|
||||||
DispatchKey::CUDA,
|
DispatchKey::CUDA,
|
||||||
DispatchKey::CPU,
|
DispatchKey::CPU,
|
||||||
DispatchKey::PrivateUse1,
|
DispatchKey::PrivateUse1,
|
||||||
|
|||||||
@ -440,7 +440,7 @@ bool MPSHeapAllocatorImpl::release_cached_buffers() {
|
|||||||
// we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
|
// we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
|
||||||
m_mutex.unlock();
|
m_mutex.unlock();
|
||||||
auto stream = getDefaultMPSStream();
|
auto stream = getDefaultMPSStream();
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
dispatch_sync(stream->queue(), ^() {
|
||||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||||
});
|
});
|
||||||
m_mutex.lock();
|
m_mutex.lock();
|
||||||
|
|||||||
@ -110,9 +110,6 @@ class TORCH_API MPSStream {
|
|||||||
return _stream;
|
return _stream;
|
||||||
}
|
}
|
||||||
|
|
||||||
MTLBuffer_t getErrorBuffer();
|
|
||||||
void checkLastError();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Stream _stream;
|
Stream _stream;
|
||||||
MTLCommandQueue_t _commandQueue = nil;
|
MTLCommandQueue_t _commandQueue = nil;
|
||||||
@ -124,8 +121,6 @@ class TORCH_API MPSStream {
|
|||||||
dispatch_queue_t _serialQueue = nullptr;
|
dispatch_queue_t _serialQueue = nullptr;
|
||||||
// CommitAndContinue is enabled by default
|
// CommitAndContinue is enabled by default
|
||||||
bool _enableCommitAndContinue = true;
|
bool _enableCommitAndContinue = true;
|
||||||
// Buffer that contains last raised error
|
|
||||||
MTLBuffer_t _errorBuffer = nil;
|
|
||||||
|
|
||||||
// use synchronize() to access any of these commit functions outside MPSStream
|
// use synchronize() to access any of these commit functions outside MPSStream
|
||||||
void commit();
|
void commit();
|
||||||
@ -160,7 +155,4 @@ class TORCH_API MPSStreamImpl {
|
|||||||
MPSStreamImpl();
|
MPSStreamImpl();
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __OBJC__
|
|
||||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
|
|
||||||
#endif
|
|
||||||
} // namespace at::mps
|
} // namespace at::mps
|
||||||
|
|||||||
@ -3,13 +3,13 @@
|
|||||||
#include <ATen/mps/MPSAllocatorInterface.h>
|
#include <ATen/mps/MPSAllocatorInterface.h>
|
||||||
#include <ATen/mps/MPSProfiler.h>
|
#include <ATen/mps/MPSProfiler.h>
|
||||||
#include <ATen/mps/MPSStream.h>
|
#include <ATen/mps/MPSStream.h>
|
||||||
#include <c10/metal/error.h>
|
|
||||||
|
|
||||||
@interface MPSGraphExecutionDescriptor ()
|
@interface MPSGraphExecutionDescriptor ()
|
||||||
@property(readwrite, atomic) BOOL enableCommitAndContinue;
|
@property(readwrite, atomic) BOOL enableCommitAndContinue;
|
||||||
@end
|
@end
|
||||||
|
|
||||||
namespace at::mps {
|
namespace at::mps {
|
||||||
|
|
||||||
//-----------------------------------------------------------------
|
//-----------------------------------------------------------------
|
||||||
// MPSStream
|
// MPSStream
|
||||||
//-----------------------------------------------------------------
|
//-----------------------------------------------------------------
|
||||||
@ -30,10 +30,6 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
|
|||||||
// Choose level which optimizes for GPU
|
// Choose level which optimizes for GPU
|
||||||
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
|
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
|
||||||
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
|
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
|
||||||
|
|
||||||
_errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages)
|
|
||||||
options:MTLResourceStorageModeShared];
|
|
||||||
std::memset([_errorBuffer contents], 0, 1024);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
MPSStream::~MPSStream() {
|
MPSStream::~MPSStream() {
|
||||||
@ -42,8 +38,6 @@ MPSStream::~MPSStream() {
|
|||||||
[_executionDescriptor release];
|
[_executionDescriptor release];
|
||||||
[_compilationDescriptor release];
|
[_compilationDescriptor release];
|
||||||
_executionDescriptor = nil;
|
_executionDescriptor = nil;
|
||||||
[_errorBuffer release];
|
|
||||||
_errorBuffer = nil;
|
|
||||||
_compilationDescriptor = nil;
|
_compilationDescriptor = nil;
|
||||||
|
|
||||||
assert(_commandBuffer == nil);
|
assert(_commandBuffer == nil);
|
||||||
@ -110,7 +104,6 @@ void MPSStream::commitAndWait() {
|
|||||||
[_prevCommandBuffer waitUntilCompleted];
|
[_prevCommandBuffer waitUntilCompleted];
|
||||||
[_prevCommandBuffer release];
|
[_prevCommandBuffer release];
|
||||||
_prevCommandBuffer = nil;
|
_prevCommandBuffer = nil;
|
||||||
checkLastError();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_commandBuffer) {
|
if (_commandBuffer) {
|
||||||
@ -118,7 +111,6 @@ void MPSStream::commitAndWait() {
|
|||||||
[_commandBuffer waitUntilCompleted];
|
[_commandBuffer waitUntilCompleted];
|
||||||
[_commandBuffer release];
|
[_commandBuffer release];
|
||||||
_commandBuffer = nil;
|
_commandBuffer = nil;
|
||||||
checkLastError();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +153,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
|
|||||||
if (length == 0) {
|
if (length == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
dispatch_sync(_serialQueue, ^() {
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
endKernelCoalescing();
|
endKernelCoalescing();
|
||||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||||
@ -191,7 +183,7 @@ void MPSStream::copy(id<MTLBuffer> srcBuffer,
|
|||||||
size_t dstOffset,
|
size_t dstOffset,
|
||||||
uint64_t profileId,
|
uint64_t profileId,
|
||||||
SyncType syncType) {
|
SyncType syncType) {
|
||||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
dispatch_sync(_serialQueue, ^() {
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
endKernelCoalescing();
|
endKernelCoalescing();
|
||||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||||
@ -244,7 +236,7 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
|
|||||||
auto& profiler = getMPSProfiler();
|
auto& profiler = getMPSProfiler();
|
||||||
const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled();
|
const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled();
|
||||||
|
|
||||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
dispatch_sync(_serialQueue, ^() {
|
||||||
endKernelCoalescing();
|
endKernelCoalescing();
|
||||||
if (isGraphProfilingEnabled) {
|
if (isGraphProfilingEnabled) {
|
||||||
// this function call is only relevant for interval-based Signposts
|
// this function call is only relevant for interval-based Signposts
|
||||||
@ -274,24 +266,6 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
id<MTLBuffer> MPSStream::getErrorBuffer() {
|
|
||||||
return _errorBuffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
void MPSStream::checkLastError() {
|
|
||||||
auto msgs = reinterpret_cast<c10::metal::ErrorMessages*>([_errorBuffer contents]);
|
|
||||||
const auto& msg = msgs->msg[0];
|
|
||||||
if (!msgs) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
unsigned int count = 0;
|
|
||||||
std::swap(count, msgs->count);
|
|
||||||
if (!count) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message);
|
|
||||||
}
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------
|
//-----------------------------------------------------------------
|
||||||
// MPSStreamImpl
|
// MPSStreamImpl
|
||||||
//-----------------------------------------------------------------
|
//-----------------------------------------------------------------
|
||||||
@ -315,19 +289,4 @@ MPSStream* getDefaultMPSStream() {
|
|||||||
return MPSStreamImpl::getInstance();
|
return MPSStreamImpl::getInstance();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper methods
|
|
||||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
|
|
||||||
__block std::optional<std::exception_ptr> block_exception;
|
|
||||||
dispatch_sync(queue, ^() {
|
|
||||||
try {
|
|
||||||
block();
|
|
||||||
} catch (...) {
|
|
||||||
block_exception = std::current_exception();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (block_exception) {
|
|
||||||
std::rethrow_exception(*block_exception);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace at::mps
|
} // namespace at::mps
|
||||||
|
|||||||
@ -23,7 +23,6 @@
|
|||||||
#include <ATen/ops/_aminmax_native.h>
|
#include <ATen/ops/_aminmax_native.h>
|
||||||
#include <ATen/ops/_assert_async_native.h>
|
#include <ATen/ops/_assert_async_native.h>
|
||||||
#include <ATen/ops/_assert_scalar_native.h>
|
#include <ATen/ops/_assert_scalar_native.h>
|
||||||
#include <ATen/ops/_async_error_native.h>
|
|
||||||
#include <ATen/ops/_functional_assert_async_native.h>
|
#include <ATen/ops/_functional_assert_async_native.h>
|
||||||
#include <ATen/ops/_functional_assert_scalar_native.h>
|
#include <ATen/ops/_functional_assert_scalar_native.h>
|
||||||
#include <ATen/ops/_make_per_tensor_quantized_tensor.h>
|
#include <ATen/ops/_make_per_tensor_quantized_tensor.h>
|
||||||
@ -480,14 +479,6 @@ Tensor isfinite(const Tensor& self) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void _async_error(std::string_view msg) {
|
|
||||||
TORCH_CHECK(0, msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
void _async_error_meta(std::string_view msg) {
|
|
||||||
// Do NOT error, it's an async error!
|
|
||||||
}
|
|
||||||
|
|
||||||
void _assert_async_cpu(const Tensor& self) {
|
void _assert_async_cpu(const Tensor& self) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
native::is_nonzero(self),
|
native::is_nonzero(self),
|
||||||
|
|||||||
@ -5,6 +5,7 @@
|
|||||||
#include <ATen/native/ReduceOpsUtils.h>
|
#include <ATen/native/ReduceOpsUtils.h>
|
||||||
|
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/TensorIterator.h>
|
#include <ATen/TensorIterator.h>
|
||||||
#include <ATen/OpMathType.h>
|
#include <ATen/OpMathType.h>
|
||||||
@ -78,12 +79,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
|
|||||||
reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
|
reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
|
||||||
[=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
|
[=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
|
||||||
} else {
|
} else {
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
|
AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
|
||||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||||
reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
|
reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
|
||||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
|
[=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); });
|
[=](Vec a, Vec b) -> Vec { return minimum(a, b); });
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,12 +104,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
|
|||||||
reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
|
reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
|
||||||
[=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
|
[=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
|
||||||
} else {
|
} else {
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
|
AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
|
||||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||||
reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
|
reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
|
||||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
|
[=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); });
|
[=](Vec a, Vec b) -> Vec { return maximum(a, b); });
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -199,7 +200,7 @@ void aminmax_allreduce_kernel(
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
|
AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||||
using scalar_t_pair = std::pair<scalar_t, scalar_t>;
|
using scalar_t_pair = std::pair<scalar_t, scalar_t>;
|
||||||
reduce_all_impl_vec_two_outputs<scalar_t>(
|
reduce_all_impl_vec_two_outputs<scalar_t>(
|
||||||
@ -214,7 +215,7 @@ void aminmax_allreduce_kernel(
|
|||||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); },
|
[=](Vec a, Vec b) -> Vec { return minimum(a, b); },
|
||||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); }
|
[=](Vec a, Vec b) -> Vec { return maximum(a, b); }
|
||||||
);
|
);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <ATen/core/Tensor.h>
|
#include <ATen/core/Tensor.h>
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/OpMathType.h>
|
#include <ATen/OpMathType.h>
|
||||||
#include <ATen/cpu/vec/vec.h>
|
#include <ATen/cpu/vec/vec.h>
|
||||||
#include <ATen/cpu/vec/functional.h>
|
#include <ATen/cpu/vec/functional.h>
|
||||||
@ -347,34 +348,35 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
void min_values_kernel_impl(TensorIterator& iter) {
|
void min_values_kernel_impl(TensorIterator& iter) {
|
||||||
if (iter.dtype() == kLong) {
|
// This case is special because of Vectorized<int64_t> does not
|
||||||
// This case is special because of Vectorized<int64_t> does not
|
// handle upper_bound<int64_t>().
|
||||||
// handle upper_bound<int64_t>().
|
// See: https://github.com/pytorch/pytorch/issues/43254
|
||||||
// See: https://github.com/pytorch/pytorch/issues/43254
|
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
|
||||||
using scalar_t = int64_t;
|
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||||
binary_kernel_reduce(
|
binary_kernel_reduce(
|
||||||
iter,
|
iter,
|
||||||
MinValuesOps<scalar_t>{},
|
MinValuesOps<scalar_t>{},
|
||||||
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
||||||
|
}), kLong, kUInt64);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
|
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||||
binary_kernel_reduce_vec(
|
binary_kernel_reduce_vec(
|
||||||
iter,
|
iter,
|
||||||
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
|
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
|
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
|
||||||
static_cast<double>(upper_bound<scalar_t>()));
|
static_cast<double>(upper_bound<scalar_t>()));
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void max_values_kernel_impl(TensorIterator& iter) {
|
void max_values_kernel_impl(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
|
AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
|
||||||
binary_kernel_reduce_vec(
|
binary_kernel_reduce_vec(
|
||||||
iter,
|
iter,
|
||||||
[](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
|
[](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
|
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
|
||||||
lower_bound<scalar_t>());
|
lower_bound<scalar_t>());
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void argmax_kernel_impl(TensorIterator &iter) {
|
void argmax_kernel_impl(TensorIterator &iter) {
|
||||||
|
|||||||
@ -11,6 +11,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/NumericUtils.h>
|
#include <ATen/NumericUtils.h>
|
||||||
#include <ATen/TensorIterator.h>
|
#include <ATen/TensorIterator.h>
|
||||||
@ -106,7 +107,7 @@ void min_kernel_impl(
|
|||||||
bool keepdim) {
|
bool keepdim) {
|
||||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||||
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
|
AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
|
||||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||||
scalar_t* result_data, int64_t* indice_data,
|
scalar_t* result_data, int64_t* indice_data,
|
||||||
const scalar_t* self_data, auto self_dim_stride) {
|
const scalar_t* self_data, auto self_dim_stride) {
|
||||||
@ -128,7 +129,7 @@ void min_kernel_impl(
|
|||||||
*indice_data = index;
|
*indice_data = index;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void max_kernel_impl(
|
void max_kernel_impl(
|
||||||
@ -139,7 +140,7 @@ void max_kernel_impl(
|
|||||||
bool keepdim) {
|
bool keepdim) {
|
||||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||||
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
|
AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
|
||||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||||
scalar_t* result_data, int64_t* indice_data,
|
scalar_t* result_data, int64_t* indice_data,
|
||||||
const scalar_t* self_data, auto self_dim_stride) {
|
const scalar_t* self_data, auto self_dim_stride) {
|
||||||
@ -161,7 +162,7 @@ void max_kernel_impl(
|
|||||||
*indice_data = index;
|
*indice_data = index;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aminmax_kernel(
|
void aminmax_kernel(
|
||||||
@ -186,7 +187,7 @@ void aminmax_kernel(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
|
AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||||
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
|
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
|
||||||
scalar_t* min_result_data, scalar_t* max_result_data,
|
scalar_t* min_result_data, scalar_t* max_result_data,
|
||||||
const scalar_t* self_data, auto self_dim_stride) {
|
const scalar_t* self_data, auto self_dim_stride) {
|
||||||
@ -209,7 +210,7 @@ void aminmax_kernel(
|
|||||||
*max_result_data = max_number;
|
*max_result_data = max_number;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
|
||||||
}
|
}
|
||||||
|
|
||||||
void where_kernel_impl(TensorIterator &iter) {
|
void where_kernel_impl(TensorIterator &iter) {
|
||||||
|
|||||||
@ -884,69 +884,6 @@ struct type_specialized_kernel_launcher {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <int arg_index>
|
|
||||||
struct type_specialized_broadcast_kernel_launcher {
|
|
||||||
template <
|
|
||||||
typename func_t,
|
|
||||||
typename array_t,
|
|
||||||
typename dtypes_t,
|
|
||||||
typename calc_t>
|
|
||||||
static void apply(
|
|
||||||
int64_t numel,
|
|
||||||
func_t f,
|
|
||||||
array_t data,
|
|
||||||
dtypes_t dtypes,
|
|
||||||
calc_t offset_calc) {
|
|
||||||
using traits = function_traits<func_t>;
|
|
||||||
using ret_t = typename traits::result_type;
|
|
||||||
using arg0_t = typename traits::template arg<0>::type;
|
|
||||||
using arg1_t = typename traits::template arg<1>::type;
|
|
||||||
if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
|
|
||||||
dtypes[1] == rt_binary_specializations[arg_index][1] &&
|
|
||||||
dtypes[2] == rt_binary_specializations[arg_index][2]) {
|
|
||||||
using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
|
|
||||||
using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
|
|
||||||
using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
|
|
||||||
constexpr int grp_sz = 128;
|
|
||||||
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
|
|
||||||
if (unrl) {
|
|
||||||
auto offsets0 = offset_calc.get(idx);
|
|
||||||
auto offsets1 = offset_calc.get(idx + grp_sz);
|
|
||||||
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
|
|
||||||
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
|
|
||||||
void* out0 = data[0] + offsets0[0];
|
|
||||||
void* out1 = data[0] + offsets1[0];
|
|
||||||
void* out2 = data[0] + offsets2[0];
|
|
||||||
void* out3 = data[0] + offsets3[0];
|
|
||||||
auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
|
|
||||||
auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
|
|
||||||
ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
|
|
||||||
auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
|
|
||||||
auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
|
|
||||||
ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
|
|
||||||
auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
|
|
||||||
auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
|
|
||||||
ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
|
|
||||||
auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
|
|
||||||
auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
|
|
||||||
ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
|
|
||||||
*(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
|
|
||||||
*(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
|
|
||||||
*(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
|
|
||||||
*(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
|
|
||||||
} else {
|
|
||||||
auto offsets = offset_calc.get(idx);
|
|
||||||
void* out = data[0] + offsets[0];
|
|
||||||
auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
|
|
||||||
auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
|
|
||||||
ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
|
|
||||||
*(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1065,32 +1002,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
|
|||||||
}
|
}
|
||||||
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
|
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
if (check_binary_rt_types_for_specialization(iter)) {
|
|
||||||
// constexpr to reduce the amount of kernels generated for
|
|
||||||
// broadcast elementwise with mexed dtypes and limit which functors are actually
|
|
||||||
// applied to the load and store at compile time.
|
|
||||||
using func_tuple = typename traits::ArgsTuple;
|
|
||||||
if constexpr (
|
|
||||||
std::is_same_v<float, arg0_t> && traits::arity == 2 &&
|
|
||||||
check_binary_functor_types_for_specialization<
|
|
||||||
func_tuple,
|
|
||||||
float,
|
|
||||||
float,
|
|
||||||
traits::arity,
|
|
||||||
/*arg_num=*/0>::check()) {
|
|
||||||
memory::detail::static_unroll<
|
|
||||||
type_specialized_broadcast_kernel_launcher,
|
|
||||||
rt_binary_specializations.size()>::with_args(
|
|
||||||
numel,
|
|
||||||
f,
|
|
||||||
data,
|
|
||||||
dtypes,
|
|
||||||
offset_calc
|
|
||||||
);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int grp_sz = 128;
|
constexpr int grp_sz = 128;
|
||||||
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
|
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
|
||||||
if (unrl) {
|
if (unrl) {
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
#define TORCH_ASSERT_NO_OPERATORS
|
#define TORCH_ASSERT_NO_OPERATORS
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/NumericUtils.h>
|
#include <ATen/NumericUtils.h>
|
||||||
#include <ATen/native/DispatchStub.h>
|
#include <ATen/native/DispatchStub.h>
|
||||||
#include <ATen/native/ReduceAllOps.h>
|
#include <ATen/native/ReduceAllOps.h>
|
||||||
@ -28,22 +29,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
|
void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(
|
AT_DISPATCH_V2(
|
||||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
|
iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
|
||||||
_min_max_values_kernel_cuda_impl<scalar_t>(iter);
|
_min_max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aminmax_launch_kernel(TensorIterator& iter) {
|
void aminmax_launch_kernel(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(
|
AT_DISPATCH_V2(
|
||||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
|
iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
|
||||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||||
iter,
|
iter,
|
||||||
MinMaxOps<scalar_t, scalar_t, int32_t>{},
|
MinMaxOps<scalar_t, scalar_t, int32_t>{},
|
||||||
thrust::pair<scalar_t, scalar_t>(
|
thrust::pair<scalar_t, scalar_t>(
|
||||||
at::numeric_limits<scalar_t>::upper_bound(),
|
at::numeric_limits<scalar_t>::upper_bound(),
|
||||||
at::numeric_limits<scalar_t>::lower_bound()));
|
at::numeric_limits<scalar_t>::lower_bound()));
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace at::native
|
} // namespace at::native
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
#define TORCH_ASSERT_NO_OPERATORS
|
#define TORCH_ASSERT_NO_OPERATORS
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/NumericUtils.h>
|
#include <ATen/NumericUtils.h>
|
||||||
#include <ATen/native/DispatchStub.h>
|
#include <ATen/native/DispatchStub.h>
|
||||||
#include <ATen/native/ReduceAllOps.h>
|
#include <ATen/native/ReduceAllOps.h>
|
||||||
@ -33,27 +34,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void max_values_kernel_cuda(TensorIterator& iter) {
|
void max_values_kernel_cuda(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(
|
AT_DISPATCH_V2(
|
||||||
kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
|
iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
|
||||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void max_launch_kernel(TensorIterator& iter) {
|
void max_launch_kernel(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(
|
AT_DISPATCH_V2(
|
||||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
|
iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
|
||||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||||
iter,
|
iter,
|
||||||
MaxOps<scalar_t>{},
|
MaxOps<scalar_t>{},
|
||||||
thrust::pair<scalar_t, int64_t>(
|
thrust::pair<scalar_t, int64_t>(
|
||||||
at::numeric_limits<scalar_t>::lower_bound(), 0));
|
at::numeric_limits<scalar_t>::lower_bound(), 0));
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void max_all_launch_kernel(TensorIterator &iter) {
|
void max_all_launch_kernel(TensorIterator &iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
|
AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
|
||||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
|
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
|
||||||
|
|||||||
@ -12,6 +12,7 @@
|
|||||||
#include <ATen/NumericUtils.h>
|
#include <ATen/NumericUtils.h>
|
||||||
|
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
|
#include <ATen/Dispatch_v2.h>
|
||||||
#include <ATen/NumericUtils.h>
|
#include <ATen/NumericUtils.h>
|
||||||
#include <ATen/cuda/NumericLimits.cuh>
|
#include <ATen/cuda/NumericLimits.cuh>
|
||||||
|
|
||||||
@ -33,24 +34,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void min_values_kernel_cuda(TensorIterator& iter) {
|
void min_values_kernel_cuda(TensorIterator& iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
|
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
|
||||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void min_launch_kernel(TensorIterator &iter) {
|
void min_launch_kernel(TensorIterator &iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
|
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
|
||||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||||
iter,
|
iter,
|
||||||
MinOps<scalar_t>{},
|
MinOps<scalar_t>{},
|
||||||
thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
|
thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
void min_all_launch_kernel(TensorIterator &iter) {
|
void min_all_launch_kernel(TensorIterator &iter) {
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
|
AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
|
||||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||||
});
|
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
|
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
|
||||||
|
|||||||
@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
|
|||||||
// supported in conv.
|
// supported in conv.
|
||||||
mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
|
mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
|
||||||
if (groups > 1 && weight_zero_points.numel() > 1)
|
if (groups > 1 && weight_zero_points.numel() > 1)
|
||||||
mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
|
mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel)
|
||||||
dnnl::primitive_attr pattr;
|
dnnl::primitive_attr pattr;
|
||||||
|
|
||||||
bool src_need_zp = (act_zero_point != 0);
|
bool src_need_zp = (act_zero_point != 0);
|
||||||
|
|||||||
@ -40,6 +40,8 @@ using namespace at::mps;
|
|||||||
|
|
||||||
namespace at::native::mps {
|
namespace at::native::mps {
|
||||||
|
|
||||||
|
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
|
||||||
|
|
||||||
struct MPSScalar {
|
struct MPSScalar {
|
||||||
id<MTLBuffer> getMTLBuffer() const {
|
id<MTLBuffer> getMTLBuffer() const {
|
||||||
return __builtin_bit_cast(id<MTLBuffer>, buffer.get());
|
return __builtin_bit_cast(id<MTLBuffer>, buffer.get());
|
||||||
|
|||||||
@ -53,6 +53,21 @@
|
|||||||
@end
|
@end
|
||||||
|
|
||||||
namespace at::native::mps {
|
namespace at::native::mps {
|
||||||
|
|
||||||
|
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
|
||||||
|
__block std::optional<std::exception_ptr> block_exception;
|
||||||
|
dispatch_sync(queue, ^() {
|
||||||
|
try {
|
||||||
|
block();
|
||||||
|
} catch (...) {
|
||||||
|
block_exception = std::current_exception();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (block_exception) {
|
||||||
|
std::rethrow_exception(*block_exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes distance from lowest to highest element offset in given tensor.
|
* Computes distance from lowest to highest element offset in given tensor.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
#include <c10/metal/atomic.h>
|
#include <c10/metal/atomic.h>
|
||||||
#include <c10/metal/error.h>
|
|
||||||
#include <c10/metal/indexing.h>
|
#include <c10/metal/indexing.h>
|
||||||
#include <metal_stdlib>
|
#include <metal_stdlib>
|
||||||
|
|
||||||
@ -32,24 +31,10 @@ OffsetT index_apply_indices(
|
|||||||
constant IndexAB* indices,
|
constant IndexAB* indices,
|
||||||
constant int64_t* sizes,
|
constant int64_t* sizes,
|
||||||
constant int64_t* strides,
|
constant int64_t* strides,
|
||||||
uint num_indices,
|
uint num_indices) {
|
||||||
thread bool& error,
|
|
||||||
device ErrorMessages* error_buf) {
|
|
||||||
OffsetT rc = offs.x;
|
OffsetT rc = offs.x;
|
||||||
for (uint i = 0; i < num_indices; i++) {
|
for (uint i = 0; i < num_indices; i++) {
|
||||||
auto idx = indices[i].indexArray[offs.y];
|
auto idx = indices[i].indexArray[offs.y];
|
||||||
if (idx < -sizes[i] || idx >= sizes[i]) {
|
|
||||||
TORCH_REPORT_ERROR(
|
|
||||||
error_buf,
|
|
||||||
"index ",
|
|
||||||
idx,
|
|
||||||
" is out of bounds for dimension ",
|
|
||||||
i,
|
|
||||||
" with size ",
|
|
||||||
sizes[i]);
|
|
||||||
error = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (idx < 0) {
|
if (idx < 0) {
|
||||||
idx += sizes[i];
|
idx += sizes[i];
|
||||||
}
|
}
|
||||||
@ -70,7 +55,6 @@ kernel void index_select(
|
|||||||
constant int64_t* index_sizes,
|
constant int64_t* index_sizes,
|
||||||
constant int64_t* index_strides,
|
constant int64_t* index_strides,
|
||||||
constant uint4& ndim_nindices_numel,
|
constant uint4& ndim_nindices_numel,
|
||||||
device ErrorMessages* error_buffer,
|
|
||||||
uint thread_index [[thread_position_in_grid]]) {
|
uint thread_index [[thread_position_in_grid]]) {
|
||||||
const auto ndim = ndim_nindices_numel.x;
|
const auto ndim = ndim_nindices_numel.x;
|
||||||
const auto num_indices = ndim_nindices_numel.y;
|
const auto num_indices = ndim_nindices_numel.y;
|
||||||
@ -81,19 +65,8 @@ kernel void index_select(
|
|||||||
indices_strides,
|
indices_strides,
|
||||||
ndim,
|
ndim,
|
||||||
thread_index);
|
thread_index);
|
||||||
bool error = false;
|
|
||||||
auto input_offs = index_apply_indices<OffsetT>(
|
auto input_offs = index_apply_indices<OffsetT>(
|
||||||
offs.yz,
|
offs.yz, indices, index_sizes, index_strides, num_indices);
|
||||||
indices,
|
|
||||||
index_sizes,
|
|
||||||
index_strides,
|
|
||||||
num_indices,
|
|
||||||
error,
|
|
||||||
error_buffer);
|
|
||||||
if (error) {
|
|
||||||
output[offs.x / sizeof(T)] = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)];
|
output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,9 +82,7 @@ inline void index_put_impl(
|
|||||||
constant int64_t* index_sizes,
|
constant int64_t* index_sizes,
|
||||||
constant int64_t* index_strides,
|
constant int64_t* index_strides,
|
||||||
constant uint4& ndim_nindices_numel,
|
constant uint4& ndim_nindices_numel,
|
||||||
device ErrorMessages* error_buffer,
|
|
||||||
uint thread_index) {
|
uint thread_index) {
|
||||||
bool error = false;
|
|
||||||
const auto ndim = ndim_nindices_numel.x;
|
const auto ndim = ndim_nindices_numel.x;
|
||||||
const auto num_indices = ndim_nindices_numel.y;
|
const auto num_indices = ndim_nindices_numel.y;
|
||||||
const auto offs = index_get_offsets(
|
const auto offs = index_get_offsets(
|
||||||
@ -122,16 +93,7 @@ inline void index_put_impl(
|
|||||||
ndim,
|
ndim,
|
||||||
thread_index);
|
thread_index);
|
||||||
auto output_offs = index_apply_indices<OffsetT>(
|
auto output_offs = index_apply_indices<OffsetT>(
|
||||||
offs.xz,
|
offs.xz, indices, index_sizes, index_strides, num_indices);
|
||||||
indices,
|
|
||||||
index_sizes,
|
|
||||||
index_strides,
|
|
||||||
num_indices,
|
|
||||||
error,
|
|
||||||
error_buffer);
|
|
||||||
if (error) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)];
|
output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -147,7 +109,6 @@ kernel void index_put(
|
|||||||
constant int64_t* index_sizes,
|
constant int64_t* index_sizes,
|
||||||
constant int64_t* index_strides,
|
constant int64_t* index_strides,
|
||||||
constant uint4& ndim_nindices_numel,
|
constant uint4& ndim_nindices_numel,
|
||||||
device ErrorMessages* error_buffer,
|
|
||||||
uint thread_index [[thread_position_in_grid]]) {
|
uint thread_index [[thread_position_in_grid]]) {
|
||||||
index_put_impl(
|
index_put_impl(
|
||||||
output,
|
output,
|
||||||
@ -160,7 +121,6 @@ kernel void index_put(
|
|||||||
index_sizes,
|
index_sizes,
|
||||||
index_strides,
|
index_strides,
|
||||||
ndim_nindices_numel,
|
ndim_nindices_numel,
|
||||||
error_buffer,
|
|
||||||
thread_index);
|
thread_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -176,7 +136,6 @@ kernel void index_put_serial(
|
|||||||
constant int64_t* index_sizes,
|
constant int64_t* index_sizes,
|
||||||
constant int64_t* index_strides,
|
constant int64_t* index_strides,
|
||||||
constant uint4& ndim_nindices_numel,
|
constant uint4& ndim_nindices_numel,
|
||||||
device ErrorMessages* error_buffer,
|
|
||||||
uint thread_index [[thread_position_in_grid]]) {
|
uint thread_index [[thread_position_in_grid]]) {
|
||||||
(void)thread_index; // Suppress unused vairable varning
|
(void)thread_index; // Suppress unused vairable varning
|
||||||
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
|
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
|
||||||
@ -191,7 +150,6 @@ kernel void index_put_serial(
|
|||||||
index_sizes,
|
index_sizes,
|
||||||
index_strides,
|
index_strides,
|
||||||
ndim_nindices_numel,
|
ndim_nindices_numel,
|
||||||
error_buffer,
|
|
||||||
idx);
|
idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -208,7 +166,6 @@ kernel void index_put_accumulate(
|
|||||||
constant int64_t* index_sizes,
|
constant int64_t* index_sizes,
|
||||||
constant int64_t* index_strides,
|
constant int64_t* index_strides,
|
||||||
constant uint4& ndim_nindices_numel,
|
constant uint4& ndim_nindices_numel,
|
||||||
device ErrorMessages* error_buffer,
|
|
||||||
uint thread_index [[thread_position_in_grid]]) {
|
uint thread_index [[thread_position_in_grid]]) {
|
||||||
const auto ndim = ndim_nindices_numel.x;
|
const auto ndim = ndim_nindices_numel.x;
|
||||||
const auto num_indices = ndim_nindices_numel.y;
|
const auto num_indices = ndim_nindices_numel.y;
|
||||||
@ -219,18 +176,8 @@ kernel void index_put_accumulate(
|
|||||||
indices_strides,
|
indices_strides,
|
||||||
ndim,
|
ndim,
|
||||||
thread_index);
|
thread_index);
|
||||||
bool error = false;
|
|
||||||
auto output_offs = index_apply_indices<OffsetT>(
|
auto output_offs = index_apply_indices<OffsetT>(
|
||||||
offs.xz,
|
offs.xz, indices, index_sizes, index_strides, num_indices);
|
||||||
indices,
|
|
||||||
index_sizes,
|
|
||||||
index_strides,
|
|
||||||
num_indices,
|
|
||||||
error,
|
|
||||||
error_buffer);
|
|
||||||
if (error) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
AtomicType<T>::atomic_add(
|
AtomicType<T>::atomic_add(
|
||||||
reinterpret_cast<device AtomicType_t<T>*>(output),
|
reinterpret_cast<device AtomicType_t<T>*>(output),
|
||||||
output_offs / sizeof(T),
|
output_offs / sizeof(T),
|
||||||
@ -250,7 +197,6 @@ kernel void index_put_accumulate(
|
|||||||
constant int64_t* index_sizes, \
|
constant int64_t* index_sizes, \
|
||||||
constant int64_t* index_strides, \
|
constant int64_t* index_strides, \
|
||||||
constant uint4& ndim_nindices_numel, \
|
constant uint4& ndim_nindices_numel, \
|
||||||
device ErrorMessages* error_buffer, \
|
|
||||||
uint thread_index [[thread_position_in_grid]])
|
uint thread_index [[thread_position_in_grid]])
|
||||||
|
|
||||||
#define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \
|
#define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \
|
||||||
|
|||||||
@ -141,9 +141,6 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
|
|||||||
};
|
};
|
||||||
|
|
||||||
MPSStream* stream = at::mps::getCurrentMPSStream();
|
MPSStream* stream = at::mps::getCurrentMPSStream();
|
||||||
if (result.numel() == 0) {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);
|
Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);
|
||||||
|
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
|
|||||||
@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
|
|||||||
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
|
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
|
||||||
MPSStream* stream = getCurrentMPSStream();
|
MPSStream* stream = getCurrentMPSStream();
|
||||||
|
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
||||||
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
|
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
|
||||||
@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
|
|||||||
auto num_threads = num_indices * feature_size;
|
auto num_threads = num_indices * feature_size;
|
||||||
MPSStream* stream = getCurrentMPSStream();
|
MPSStream* stream = getCurrentMPSStream();
|
||||||
|
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
||||||
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
|
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
|
||||||
|
|||||||
@ -179,8 +179,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter,
|
|||||||
iter.strides(2),
|
iter.strides(2),
|
||||||
index_size,
|
index_size,
|
||||||
index_stride,
|
index_stride,
|
||||||
ndim_nindiees,
|
ndim_nindiees);
|
||||||
mpsStream->getErrorBuffer());
|
|
||||||
mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel());
|
mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -300,7 +299,7 @@ static Tensor& nonzero_out_native_mps(const Tensor& self, Tensor& out_) {
|
|||||||
MPSStream* stream = getCurrentMPSStream();
|
MPSStream* stream = getCurrentMPSStream();
|
||||||
using CachedGraph = MPSUnaryCachedGraph;
|
using CachedGraph = MPSUnaryCachedGraph;
|
||||||
|
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
dispatch_sync(stream->queue(), ^() {
|
||||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||||
});
|
});
|
||||||
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
||||||
@ -385,7 +384,7 @@ Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) {
|
|||||||
MPSStream* stream = getCurrentMPSStream();
|
MPSStream* stream = getCurrentMPSStream();
|
||||||
using CachedGraph = MPSUnaryCachedGraph;
|
using CachedGraph = MPSUnaryCachedGraph;
|
||||||
|
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
dispatch_sync(stream->queue(), ^() {
|
||||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||||
});
|
});
|
||||||
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
||||||
|
|||||||
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
|
|||||||
MPSStream* stream = getCurrentMPSStream();
|
MPSStream* stream = getCurrentMPSStream();
|
||||||
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
|
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||||
// which kernel variant to use based on the normalized axis N size
|
// which kernel variant to use based on the normalized axis N size
|
||||||
const int N_READS = 4;
|
const int N_READS = 4;
|
||||||
auto metalType = mps::scalarToMetalTypeString(input);
|
auto metalType = mps::scalarToMetalTypeString(input);
|
||||||
|
|||||||
@ -192,11 +192,6 @@
|
|||||||
CompositeExplicitAutograd: _assert_tensor_metadata
|
CompositeExplicitAutograd: _assert_tensor_metadata
|
||||||
Meta: _assert_tensor_metadata_meta_symint
|
Meta: _assert_tensor_metadata_meta_symint
|
||||||
|
|
||||||
- func: _async_error(str msg) -> ()
|
|
||||||
dispatch:
|
|
||||||
CompositeExplicitAutograd: _async_error
|
|
||||||
Meta: _async_error_meta
|
|
||||||
|
|
||||||
- func: _print(str s) -> ()
|
- func: _print(str s) -> ()
|
||||||
dispatch:
|
dispatch:
|
||||||
CompositeExplicitAutograd: _print
|
CompositeExplicitAutograd: _print
|
||||||
@ -2808,7 +2803,7 @@
|
|||||||
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
||||||
device_check: NoCheck # TensorIterator
|
device_check: NoCheck # TensorIterator
|
||||||
dispatch:
|
dispatch:
|
||||||
CPU, CUDA, MPS, MTIA: floor_divide_out
|
CPU, CUDA, MPS: floor_divide_out
|
||||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
|
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
|
||||||
|
|
||||||
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
|
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
|
||||||
@ -4297,7 +4292,6 @@
|
|||||||
dispatch:
|
dispatch:
|
||||||
SparseCPU: sparse_sparse_matmul_cpu
|
SparseCPU: sparse_sparse_matmul_cpu
|
||||||
SparseCUDA: sparse_sparse_matmul_cuda
|
SparseCUDA: sparse_sparse_matmul_cuda
|
||||||
SparseMPS: sparse_sparse_matmul_mps
|
|
||||||
autogen: _sparse_sparse_matmul.out
|
autogen: _sparse_sparse_matmul.out
|
||||||
|
|
||||||
- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
|
- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||||
@ -4389,7 +4383,7 @@
|
|||||||
variants: function, method
|
variants: function, method
|
||||||
dispatch:
|
dispatch:
|
||||||
CompositeExplicitAutograd: mv
|
CompositeExplicitAutograd: mv
|
||||||
SparseCPU, SparseCUDA, SparseMPS: mv_sparse
|
SparseCPU, SparseCUDA: mv_sparse
|
||||||
|
|
||||||
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
|
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
|
||||||
dispatch:
|
dispatch:
|
||||||
@ -9838,7 +9832,7 @@
|
|||||||
structured_delegate: erfinv.out
|
structured_delegate: erfinv.out
|
||||||
variants: method, function
|
variants: method, function
|
||||||
dispatch:
|
dispatch:
|
||||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
|
SparseCPU, SparseCUDA: erfinv_sparse
|
||||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
|
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
|
||||||
tags: pointwise
|
tags: pointwise
|
||||||
|
|
||||||
@ -9847,7 +9841,7 @@
|
|||||||
structured_delegate: erfinv.out
|
structured_delegate: erfinv.out
|
||||||
variants: method
|
variants: method
|
||||||
dispatch:
|
dispatch:
|
||||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
|
SparseCPU, SparseCUDA: erfinv_sparse_
|
||||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
|
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
|
||||||
tags: pointwise
|
tags: pointwise
|
||||||
|
|
||||||
@ -9857,7 +9851,7 @@
|
|||||||
structured_inherits: TensorIteratorBase
|
structured_inherits: TensorIteratorBase
|
||||||
dispatch:
|
dispatch:
|
||||||
CPU, CUDA, MPS: erfinv_out
|
CPU, CUDA, MPS: erfinv_out
|
||||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
|
SparseCPU, SparseCUDA: erfinv_sparse_out
|
||||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
|
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
|
||||||
tags: pointwise
|
tags: pointwise
|
||||||
|
|
||||||
|
|||||||
@ -10,10 +10,6 @@
|
|||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#else
|
#else
|
||||||
#include <ATen/ops/_coalesce_native.h>
|
#include <ATen/ops/_coalesce_native.h>
|
||||||
#include <ATen/ops/repeat_interleave_native.h>
|
|
||||||
#include <ATen/ops/cumsum.h>
|
|
||||||
#include <ATen/ops/_sparse_sparse_matmul_native.h>
|
|
||||||
#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
|
|
||||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||||
#include <ATen/ops/cat.h>
|
#include <ATen/ops/cat.h>
|
||||||
#include <ATen/ops/add_native.h>
|
#include <ATen/ops/add_native.h>
|
||||||
@ -892,114 +888,5 @@ static void sparse_mask_intersection_out_mps_kernel(
|
|||||||
/*coalesce_mask=*/false);
|
/*coalesce_mask=*/false);
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
|
|
||||||
TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
|
|
||||||
"sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
|
|
||||||
TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
|
|
||||||
"sparse_sparse_matmul_mps: both inputs must be on MPS device");
|
|
||||||
TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
|
|
||||||
"sparse_sparse_matmul_mps: both inputs must be 2D matrices");
|
|
||||||
TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
|
|
||||||
"sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
|
|
||||||
TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
|
|
||||||
"mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
|
|
||||||
TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
|
|
||||||
"sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
|
|
||||||
" does not match mat2 dtype ", mat2_.scalar_type());
|
|
||||||
|
|
||||||
const auto device = mat1_.device();
|
|
||||||
|
|
||||||
auto A = mat1_.coalesce();
|
|
||||||
auto B = mat2_.coalesce();
|
|
||||||
|
|
||||||
const auto I = A.size(0);
|
|
||||||
const auto K = A.size(1);
|
|
||||||
const auto N = B.size(1);
|
|
||||||
|
|
||||||
const auto nnzA = A._nnz();
|
|
||||||
const auto nnzB = B._nnz();
|
|
||||||
|
|
||||||
// Early empty result, return an empty, coalesced tensor
|
|
||||||
if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
|
|
||||||
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
|
|
||||||
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
|
|
||||||
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
|
|
||||||
out._coalesced_(true);
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto computeDtype = at::result_type(mat1_, mat2_);
|
|
||||||
|
|
||||||
auto A_idx = A._indices().contiguous();
|
|
||||||
auto A_val = A._values().to(computeDtype).contiguous();
|
|
||||||
auto A_i = A_idx.select(0, 0).contiguous();
|
|
||||||
auto A_k = A_idx.select(0, 1).contiguous();
|
|
||||||
|
|
||||||
auto B_idx = B._indices().contiguous();
|
|
||||||
auto B_val = B._values().to(computeDtype).contiguous();
|
|
||||||
auto B_k = B_idx.select(0, 0).contiguous();
|
|
||||||
auto B_j = B_idx.select(0, 1).contiguous();
|
|
||||||
|
|
||||||
// csr-style row pointers for B by k (the shared dimension)
|
|
||||||
Tensor row_ptr_B;
|
|
||||||
{
|
|
||||||
auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
|
|
||||||
row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
|
|
||||||
build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
|
|
||||||
auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
|
|
||||||
auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
|
|
||||||
|
|
||||||
auto counts = deg_B.index_select(0, A_k);
|
|
||||||
|
|
||||||
const int64_t P = counts.sum().item<int64_t>();
|
|
||||||
if (P == 0) {
|
|
||||||
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
|
|
||||||
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
|
|
||||||
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
|
|
||||||
out._coalesced_(true);
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto group_ids = repeat_interleave_mps(counts);
|
|
||||||
|
|
||||||
// exclusive cumsum of counts
|
|
||||||
auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
|
|
||||||
auto offsets_gather = offsets.index_select(0, group_ids);
|
|
||||||
auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
|
|
||||||
|
|
||||||
// Map each output element to its source B row and position
|
|
||||||
auto k_per_out = A_k.index_select(0, group_ids);
|
|
||||||
auto start_in_B = row_ptr_B.index_select(0, k_per_out);
|
|
||||||
auto seg_index = start_in_B.add(within);
|
|
||||||
|
|
||||||
// Assemble candidate coo pairs and values
|
|
||||||
auto i_out = A_i.index_select(0, group_ids).contiguous();
|
|
||||||
auto j_out = B_j.index_select(0, seg_index).contiguous();
|
|
||||||
auto vA_out = A_val.index_select(0, group_ids).contiguous();
|
|
||||||
auto vB_out = B_val.index_select(0, seg_index).contiguous();
|
|
||||||
auto v_out = vA_out.mul(vB_out);
|
|
||||||
|
|
||||||
// build (2, P) indices
|
|
||||||
auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
|
|
||||||
out_indices.select(0, 0).copy_(i_out);
|
|
||||||
out_indices.select(0, 1).copy_(j_out);
|
|
||||||
|
|
||||||
auto result = _sparse_coo_tensor_unsafe(
|
|
||||||
out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
|
|
||||||
|
|
||||||
result = result.coalesce();
|
|
||||||
|
|
||||||
if (result.scalar_type() != mat1_.scalar_type()) {
|
|
||||||
auto cast_vals = result._values().to(mat1_.scalar_type());
|
|
||||||
auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
|
|
||||||
out._coalesced_(true);
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
|
REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
|
||||||
} // namespace at::native
|
} // namespace at::native
|
||||||
@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
|
|||||||
first_fields.append(kwargs["tag"])
|
first_fields.append(kwargs["tag"])
|
||||||
headers = first_headers + ["speedup", "abs_latency"]
|
headers = first_headers + ["speedup", "abs_latency"]
|
||||||
row = first_fields + [float(speedup), median[1] * 1000]
|
row = first_fields + [float(speedup), median[1] * 1000]
|
||||||
msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
|
msg = f"{speedup:.3f}x"
|
||||||
if args.baseline:
|
if args.baseline:
|
||||||
headers.extend(
|
headers.extend(
|
||||||
[
|
[
|
||||||
@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
|
|||||||
# Hypothetically you can use this from other places, but it's currently
|
# Hypothetically you can use this from other places, but it's currently
|
||||||
# inaccessible, and when this assert fails you need to update the
|
# inaccessible, and when this assert fails you need to update the
|
||||||
# event_name here to account for the other cases you are using this
|
# event_name here to account for the other cases you are using this
|
||||||
assert any([args.quantization, args.optimus])
|
assert args.quantization is not None
|
||||||
output_signpost(
|
output_signpost(
|
||||||
dict(zip(headers, row)),
|
dict(zip(headers, row)),
|
||||||
args,
|
args,
|
||||||
@ -2587,9 +2587,6 @@ class BenchmarkRunner:
|
|||||||
**experiment_kwargs,
|
**experiment_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# reset dynamo
|
|
||||||
torch._dynamo.reset()
|
|
||||||
|
|
||||||
if self.args.export_aot_inductor:
|
if self.args.export_aot_inductor:
|
||||||
optimized_model_iter_fn = optimize_ctx
|
optimized_model_iter_fn = optimize_ctx
|
||||||
else:
|
else:
|
||||||
@ -2953,7 +2950,7 @@ class BenchmarkRunner:
|
|||||||
status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
|
status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
|
||||||
print(status)
|
print(status)
|
||||||
elif self.args.performance:
|
elif self.args.performance:
|
||||||
if self.args.backend in ["torchao", "optimus"]:
|
if self.args.backend == "torchao":
|
||||||
status = self.run_performance_test_non_alternate(
|
status = self.run_performance_test_non_alternate(
|
||||||
name, model, example_inputs, optimize_ctx, experiment, tag
|
name, model, example_inputs, optimize_ctx, experiment, tag
|
||||||
)
|
)
|
||||||
@ -3529,12 +3526,6 @@ def parse_args(args=None):
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Measure speedup with TorchInductor",
|
help="Measure speedup with TorchInductor",
|
||||||
)
|
)
|
||||||
group.add_argument(
|
|
||||||
"--optimus",
|
|
||||||
choices=["vertical_opt", "horizontal_opt", "all"],
|
|
||||||
default=None,
|
|
||||||
help="Measure speedup of Optimus with TorchInductor baseline",
|
|
||||||
)
|
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--quantization",
|
"--quantization",
|
||||||
choices=[
|
choices=[
|
||||||
@ -3792,9 +3783,6 @@ def run(runner, args, original_dir=None):
|
|||||||
if args.inductor:
|
if args.inductor:
|
||||||
assert args.backend is None
|
assert args.backend is None
|
||||||
args.backend = "inductor"
|
args.backend = "inductor"
|
||||||
if args.optimus:
|
|
||||||
assert args.backend is None
|
|
||||||
args.backend = "optimus"
|
|
||||||
if args.quantization:
|
if args.quantization:
|
||||||
assert args.backend is None
|
assert args.backend is None
|
||||||
args.backend = "torchao"
|
args.backend = "torchao"
|
||||||
@ -4079,22 +4067,10 @@ def run(runner, args, original_dir=None):
|
|||||||
|
|
||||||
runner.model_iter_fn = model_iter_fn_and_mark_step
|
runner.model_iter_fn = model_iter_fn_and_mark_step
|
||||||
optimize_ctx = torchao_optimize_ctx(args.quantization)
|
optimize_ctx = torchao_optimize_ctx(args.quantization)
|
||||||
elif args.backend == "optimus":
|
|
||||||
from .optimus import get_baseline_ctx, get_optimus_optimize_ctx
|
|
||||||
|
|
||||||
baseline_ctx = get_baseline_ctx(
|
|
||||||
nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode
|
|
||||||
)
|
|
||||||
runner.model_iter_fn = baseline_ctx(runner.model_iter_fn)
|
|
||||||
optimize_ctx = get_optimus_optimize_ctx(
|
|
||||||
args.optimus, args.nopython, args.inductor_compile_mode
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
|
optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
|
||||||
experiment = (
|
experiment = (
|
||||||
speedup_experiment
|
speedup_experiment if args.backend != "torchao" else latency_experiment
|
||||||
if args.backend not in ["torchao", "optimus"]
|
|
||||||
else latency_experiment
|
|
||||||
)
|
)
|
||||||
if args.accuracy:
|
if args.accuracy:
|
||||||
output_filename = f"accuracy_{args.backend}.csv"
|
output_filename = f"accuracy_{args.backend}.csv"
|
||||||
@ -4115,12 +4091,7 @@ def run(runner, args, original_dir=None):
|
|||||||
if args.only in runner.disable_cudagraph_models:
|
if args.only in runner.disable_cudagraph_models:
|
||||||
args.disable_cudagraphs = True
|
args.disable_cudagraphs = True
|
||||||
|
|
||||||
if (
|
if args.inductor or args.backend == "inductor" or args.export_aot_inductor:
|
||||||
args.inductor
|
|
||||||
or args.backend == "inductor"
|
|
||||||
or args.export_aot_inductor
|
|
||||||
or args.backend == "optimus"
|
|
||||||
):
|
|
||||||
inductor_config.triton.cudagraphs = not args.disable_cudagraphs
|
inductor_config.triton.cudagraphs = not args.disable_cudagraphs
|
||||||
inductor_config.triton.persistent_reductions = (
|
inductor_config.triton.persistent_reductions = (
|
||||||
not args.disable_persistent_reductions
|
not args.disable_persistent_reductions
|
||||||
|
|||||||
@ -1,62 +0,0 @@
|
|||||||
import functools
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
|
|
||||||
def get_baseline_ctx(nopython, inductor_compile_mode):
|
|
||||||
return functools.partial(
|
|
||||||
torch.compile,
|
|
||||||
backend="inductor",
|
|
||||||
fullgraph=nopython,
|
|
||||||
mode=inductor_compile_mode,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
|
|
||||||
if config == "vertical_opt":
|
|
||||||
optimus_inductor_config = {
|
|
||||||
"pre_grad_fusion_options": {
|
|
||||||
"normalization_pass": {},
|
|
||||||
"merge_splits_pass": {},
|
|
||||||
"split_cat_pass": {},
|
|
||||||
"unbind_stack_pass": {},
|
|
||||||
"unbind_cat_to_view_pass": {},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
elif config == "horizontal_opt":
|
|
||||||
optimus_inductor_config = {
|
|
||||||
"pre_grad_fusion_options": {
|
|
||||||
"normalization_pass": {},
|
|
||||||
"batch_linear": {},
|
|
||||||
"batch_layernorm": {},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
elif config == "all":
|
|
||||||
optimus_inductor_config = {
|
|
||||||
"pre_grad_fusion_options": {
|
|
||||||
"normalization_pass": {},
|
|
||||||
"batch_linear": {},
|
|
||||||
"batch_layernorm": {},
|
|
||||||
"merge_splits_pass": {},
|
|
||||||
"split_cat_pass": {},
|
|
||||||
"unbind_stack_pass": {},
|
|
||||||
"unbind_cat_to_view_pass": {},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Unknown optimus config: {config}")
|
|
||||||
|
|
||||||
def _inner(fn):
|
|
||||||
if "pre_grad_fusion_options" in optimus_inductor_config:
|
|
||||||
torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
|
|
||||||
"pre_grad_fusion_options"
|
|
||||||
]
|
|
||||||
if "post_grad_fusion_options" in optimus_inductor_config:
|
|
||||||
torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
|
|
||||||
"post_grad_fusion_options"
|
|
||||||
]
|
|
||||||
return torch.compile(
|
|
||||||
fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
|
|
||||||
)
|
|
||||||
|
|
||||||
return _inner
|
|
||||||
@ -484,106 +484,24 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000
|
|||||||
PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000
|
PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000
|
||||||
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000
|
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000
|
||||||
PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000
|
PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000
|
|
||||||
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000
|
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000
|
||||||
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000
|
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000
|
||||||
PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000
|
PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000
|
||||||
|
|||||||
|
@ -4,84 +4,74 @@ import torch
|
|||||||
|
|
||||||
|
|
||||||
tensor_conversion_short_configs = op_bench.cross_product_configs(
|
tensor_conversion_short_configs = op_bench.cross_product_configs(
|
||||||
M=[32],
|
M=(
|
||||||
N=[128],
|
8,
|
||||||
|
16,
|
||||||
|
32,
|
||||||
|
),
|
||||||
|
N=(
|
||||||
|
16,
|
||||||
|
64,
|
||||||
|
128,
|
||||||
|
),
|
||||||
device=["cpu", "cuda"],
|
device=["cpu", "cuda"],
|
||||||
dtype_one=[
|
|
||||||
torch.bool,
|
|
||||||
torch.uint8,
|
|
||||||
torch.int8,
|
|
||||||
torch.int16,
|
|
||||||
torch.int32,
|
|
||||||
torch.int64,
|
|
||||||
torch.half,
|
|
||||||
torch.bfloat16,
|
|
||||||
torch.float,
|
|
||||||
torch.double,
|
|
||||||
],
|
|
||||||
dtype_two=[
|
|
||||||
torch.bool,
|
|
||||||
torch.uint8,
|
|
||||||
torch.int8,
|
|
||||||
torch.int16,
|
|
||||||
torch.int32,
|
|
||||||
torch.int64,
|
|
||||||
torch.half,
|
|
||||||
torch.bfloat16,
|
|
||||||
torch.float,
|
|
||||||
torch.double,
|
|
||||||
],
|
|
||||||
tags=["short"],
|
tags=["short"],
|
||||||
)
|
)
|
||||||
|
|
||||||
tensor_conversion_long_configs = op_bench.cross_product_configs(
|
tensor_conversion_long_configs = op_bench.cross_product_configs(
|
||||||
M=[1024],
|
M=(
|
||||||
N=[1024],
|
64,
|
||||||
|
128,
|
||||||
|
256,
|
||||||
|
512,
|
||||||
|
),
|
||||||
|
N=(
|
||||||
|
256,
|
||||||
|
512,
|
||||||
|
1024,
|
||||||
|
2048,
|
||||||
|
),
|
||||||
device=["cpu", "cuda"],
|
device=["cpu", "cuda"],
|
||||||
dtype_one=[
|
|
||||||
torch.bool,
|
|
||||||
torch.uint8,
|
|
||||||
torch.int8,
|
|
||||||
torch.int16,
|
|
||||||
torch.int32,
|
|
||||||
torch.int64,
|
|
||||||
torch.half,
|
|
||||||
torch.bfloat16,
|
|
||||||
torch.float,
|
|
||||||
torch.double,
|
|
||||||
],
|
|
||||||
dtype_two=[
|
|
||||||
torch.bool,
|
|
||||||
torch.uint8,
|
|
||||||
torch.int8,
|
|
||||||
torch.int16,
|
|
||||||
torch.int32,
|
|
||||||
torch.int64,
|
|
||||||
torch.half,
|
|
||||||
torch.bfloat16,
|
|
||||||
torch.float,
|
|
||||||
torch.double,
|
|
||||||
],
|
|
||||||
tags=["long"],
|
tags=["long"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
||||||
def init(self, M, N, dtype_one, dtype_two, device):
|
def init(self, M, N, device):
|
||||||
self.inputs = {
|
self.inputs = {
|
||||||
"input": torch.rand(
|
"input": torch.rand(
|
||||||
M, N, device=device, requires_grad=False, dtype=torch.float
|
M, N, device=device, requires_grad=False, dtype=torch.float
|
||||||
).to(dtype=dtype_one)
|
)
|
||||||
}
|
}
|
||||||
self.dtype_one = dtype_one
|
|
||||||
self.dtype_two = dtype_two
|
|
||||||
|
|
||||||
def forward(self, input):
|
def forward(self, input):
|
||||||
return input.to(dtype=self.dtype_two)
|
return input.to(torch.half)
|
||||||
|
|
||||||
|
|
||||||
op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark)
|
class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
||||||
op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark)
|
def init(self, M, N, device):
|
||||||
|
self.inputs = {
|
||||||
|
"input": torch.rand(
|
||||||
|
M, N, device=device, requires_grad=False, dtype=torch.half
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
def forward(self, input):
|
||||||
|
return input.to(torch.float)
|
||||||
|
|
||||||
|
|
||||||
|
op_bench.generate_pt_test(
|
||||||
|
tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark
|
||||||
|
)
|
||||||
|
op_bench.generate_pt_test(
|
||||||
|
tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark
|
||||||
|
)
|
||||||
|
op_bench.generate_pt_test(
|
||||||
|
tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark
|
||||||
|
)
|
||||||
|
op_bench.generate_pt_test(
|
||||||
|
tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark
|
||||||
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
op_bench.benchmark_runner.main()
|
op_bench.benchmark_runner.main()
|
||||||
|
|||||||
@ -349,106 +349,24 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841
|
|||||||
PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765
|
PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765
|
||||||
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414
|
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414
|
||||||
PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287
|
PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772
|
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749
|
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988
|
|
||||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792
|
|
||||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657
|
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657
|
||||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625
|
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625
|
||||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165
|
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165
|
||||||
|
|||||||
|
@ -52,18 +52,19 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
|
|||||||
start.record()
|
start.record()
|
||||||
coo.matmul(mat)
|
coo.matmul(mat)
|
||||||
stop.record()
|
stop.record()
|
||||||
|
|
||||||
times.append(start.elapsed_time(stop))
|
times.append(start.elapsed_time(stop))
|
||||||
|
|
||||||
coo_mean_time = sum(times) / len(times)
|
coo_mean_time = sum(times) / len(times)
|
||||||
|
|
||||||
times = []
|
times = []
|
||||||
for _ in range(test_count):
|
for _ in range(test_count):
|
||||||
start.record()
|
start.record()
|
||||||
csr.matmul(mat)
|
csr.matmul(mat)
|
||||||
stop.record()
|
stop.record()
|
||||||
times.append(start.elapsed_time(stop))
|
times.append(start.elapsed_time(stop))
|
||||||
|
|
||||||
csr_mean_time = sum(times) / len(times)
|
csr_mean_time = sum(times) / len(times)
|
||||||
|
|
||||||
return coo_mean_time, csr_mean_time
|
return coo_mean_time, csr_mean_time
|
||||||
|
|
||||||
|
|||||||
@ -12,22 +12,20 @@ constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
|
|||||||
|
|
||||||
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
|
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
|
||||||
static AcceleratorAllocatorConfig instance;
|
static AcceleratorAllocatorConfig instance;
|
||||||
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env) \
|
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated) \
|
||||||
auto env##_name = c10::utils::get_env(#env); \
|
auto env##_name = c10::utils::get_env(#env); \
|
||||||
if (env##_name.has_value()) { \
|
if (env##_name.has_value()) { \
|
||||||
instance.parseArgs(env##_name.value()); \
|
if (deprecated) { \
|
||||||
return true; \
|
TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
|
||||||
|
} \
|
||||||
|
instance.parseArgs(env##_name.value()); \
|
||||||
|
return true; \
|
||||||
}
|
}
|
||||||
static bool env_flag [[maybe_unused]] = []() {
|
static bool env_flag [[maybe_unused]] = []() {
|
||||||
// Parse allocator configuration from environment variables.
|
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
|
||||||
// The first two entries are kept for backward compatibility with legacy
|
// Keep this for backwards compatibility
|
||||||
// CUDA and HIP environment variable names. The new unified variable
|
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
|
||||||
// (PYTORCH_ALLOC_CONF) should be used going forward.
|
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
|
||||||
// Note: keep the parsing order and logic stable to avoid potential
|
|
||||||
// performance regressions in internal tests.
|
|
||||||
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF)
|
|
||||||
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF)
|
|
||||||
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF)
|
|
||||||
return false;
|
return false;
|
||||||
}();
|
}();
|
||||||
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
|
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
|
||||||
|
|||||||
@ -1,8 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <c10/core/SafePyObject.h>
|
|
||||||
#include <c10/macros/Export.h>
|
#include <c10/macros/Export.h>
|
||||||
#include <optional>
|
|
||||||
|
|
||||||
namespace c10 {
|
namespace c10 {
|
||||||
|
|
||||||
@ -17,8 +15,7 @@ struct C10_API AutogradState {
|
|||||||
bool inference_mode,
|
bool inference_mode,
|
||||||
bool fw_grad_mode,
|
bool fw_grad_mode,
|
||||||
bool multithreading_enabled)
|
bool multithreading_enabled)
|
||||||
: graph_exec_group_(std::nullopt),
|
: grad_mode_(grad_mode),
|
||||||
grad_mode_(grad_mode),
|
|
||||||
inference_mode_(inference_mode),
|
inference_mode_(inference_mode),
|
||||||
fw_grad_mode_(fw_grad_mode),
|
fw_grad_mode_(fw_grad_mode),
|
||||||
multithreading_enabled_(multithreading_enabled),
|
multithreading_enabled_(multithreading_enabled),
|
||||||
@ -44,10 +41,6 @@ struct C10_API AutogradState {
|
|||||||
view_replay_enabled_ = view_replay_enabled;
|
view_replay_enabled_ = view_replay_enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_graph_exec_group(std::optional<SafePyObject> group) {
|
|
||||||
graph_exec_group_ = std::move(group);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool get_grad_mode() const {
|
bool get_grad_mode() const {
|
||||||
return grad_mode_;
|
return grad_mode_;
|
||||||
}
|
}
|
||||||
@ -68,12 +61,7 @@ struct C10_API AutogradState {
|
|||||||
return view_replay_enabled_;
|
return view_replay_enabled_;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::optional<SafePyObject>& get_graph_exec_group() const {
|
|
||||||
return graph_exec_group_;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::optional<SafePyObject> graph_exec_group_;
|
|
||||||
bool grad_mode_ : 1;
|
bool grad_mode_ : 1;
|
||||||
bool inference_mode_ : 1;
|
bool inference_mode_ : 1;
|
||||||
bool fw_grad_mode_ : 1;
|
bool fw_grad_mode_ : 1;
|
||||||
|
|||||||
@ -96,10 +96,6 @@ struct C10_API DeviceAllocator : public c10::Allocator {
|
|||||||
|
|
||||||
// Resets peak memory usage statistics for the specified device
|
// Resets peak memory usage statistics for the specified device
|
||||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||||
|
|
||||||
// Return the free memory size and total memory size in bytes for the
|
|
||||||
// specified device.
|
|
||||||
virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// This function is used to get the DeviceAllocator for a specific device type
|
// This function is used to get the DeviceAllocator for a specific device type
|
||||||
|
|||||||
@ -120,18 +120,16 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
|||||||
static CUDAAllocatorConfig& instance() {
|
static CUDAAllocatorConfig& instance() {
|
||||||
static CUDAAllocatorConfig* s_instance = ([]() {
|
static CUDAAllocatorConfig* s_instance = ([]() {
|
||||||
auto inst = new CUDAAllocatorConfig();
|
auto inst = new CUDAAllocatorConfig();
|
||||||
auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
||||||
|
if (!env.has_value()) {
|
||||||
|
env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||||
|
}
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
// convenience for ROCm users, allow alternative HIP token
|
// convenience for ROCm users, allow alternative HIP token
|
||||||
if (!env.has_value()) {
|
if (!env.has_value()) {
|
||||||
env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
// Note: keep the parsing order and logic stable to avoid potential
|
|
||||||
// performance regressions in internal tests.
|
|
||||||
if (!env.has_value()) {
|
|
||||||
env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
|
||||||
}
|
|
||||||
if (env.has_value()) {
|
if (env.has_value()) {
|
||||||
inst->parseArgs(env.value());
|
inst->parseArgs(env.value());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1566,7 +1566,7 @@ class DeviceCachingAllocator {
|
|||||||
reserved_bytes - allocated_bytes - allocated_in_private_pools),
|
reserved_bytes - allocated_bytes - allocated_in_private_pools),
|
||||||
" is reserved by PyTorch but unallocated.",
|
" is reserved by PyTorch but unallocated.",
|
||||||
" If reserved but unallocated memory is large try setting",
|
" If reserved but unallocated memory is large try setting",
|
||||||
" PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid"
|
" PYTORCH_ALLOC_CONF=expandable_segments:True to avoid"
|
||||||
" fragmentation. See documentation for Memory Management "
|
" fragmentation. See documentation for Memory Management "
|
||||||
" (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)");
|
" (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)");
|
||||||
}
|
}
|
||||||
@ -4449,16 +4449,16 @@ struct BackendStaticInitializer {
|
|||||||
// instance based on its value. If no valid configuration is found, it falls
|
// instance based on its value. If no valid configuration is found, it falls
|
||||||
// back to the default Native allocator.
|
// back to the default Native allocator.
|
||||||
CUDAAllocator* parseEnvForBackend() {
|
CUDAAllocator* parseEnvForBackend() {
|
||||||
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
auto val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
||||||
|
if (!val.has_value()) {
|
||||||
|
val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||||
|
}
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||||
if (!val.has_value()) {
|
if (!val.has_value()) {
|
||||||
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (!val.has_value()) {
|
|
||||||
val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
|
||||||
}
|
|
||||||
if (val.has_value()) {
|
if (val.has_value()) {
|
||||||
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
|
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
|
||||||
for (size_t i = 0; i < tokenizer.size(); i++) {
|
for (size_t i = 0; i < tokenizer.size(); i++) {
|
||||||
|
|||||||
@ -345,13 +345,6 @@ class CUDAAllocator : public DeviceAllocator {
|
|||||||
c10::DeviceIndex device,
|
c10::DeviceIndex device,
|
||||||
std::shared_ptr<AllocatorState> pps) = 0;
|
std::shared_ptr<AllocatorState> pps) = 0;
|
||||||
virtual std::string name() = 0;
|
virtual std::string name() = 0;
|
||||||
std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
|
|
||||||
c10::DeviceGuard device_guard({at::kCUDA, device});
|
|
||||||
size_t free = 0;
|
|
||||||
size_t total = 0;
|
|
||||||
C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
|
|
||||||
return {free, total};
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Allocator object, statically initialized
|
// Allocator object, statically initialized
|
||||||
|
|||||||
@ -1,111 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
#include <c10/metal/common.h>
|
|
||||||
|
|
||||||
namespace c10 {
|
|
||||||
namespace metal {
|
|
||||||
C10_METAL_CONSTEXPR unsigned error_message_count = 30;
|
|
||||||
struct ErrorMessage {
|
|
||||||
char file[128];
|
|
||||||
char func[128];
|
|
||||||
char message[250];
|
|
||||||
unsigned int line;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ErrorMessages {
|
|
||||||
#ifdef __METAL__
|
|
||||||
::metal::atomic<unsigned int> count;
|
|
||||||
#else
|
|
||||||
unsigned int count;
|
|
||||||
#endif
|
|
||||||
ErrorMessage msg[error_message_count];
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef __METAL__
|
|
||||||
namespace detail {
|
|
||||||
static uint strncpy(device char* dst, constant const char* src, unsigned len) {
|
|
||||||
uint i = 0;
|
|
||||||
while (src[i] != 0 && i < len - 1) {
|
|
||||||
dst[i] = src[i];
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
dst[i] = 0;
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline uint print_arg(
|
|
||||||
device char* ptr,
|
|
||||||
unsigned len,
|
|
||||||
constant const char* arg) {
|
|
||||||
return strncpy(ptr, arg, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns number length as string in base10
|
|
||||||
static inline uint base10_length(long num) {
|
|
||||||
uint rc = 1;
|
|
||||||
if (num < 0) {
|
|
||||||
num = -num;
|
|
||||||
rc += 1;
|
|
||||||
}
|
|
||||||
while (num > 9) {
|
|
||||||
num /= 10;
|
|
||||||
rc++;
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Converts signed integer to string
|
|
||||||
inline uint print_arg(device char* ptr, unsigned len, long arg) {
|
|
||||||
const auto arg_len = base10_length(arg);
|
|
||||||
if (arg_len >= len)
|
|
||||||
return 0;
|
|
||||||
if (arg < 0) {
|
|
||||||
ptr[0] = '-';
|
|
||||||
arg = -arg;
|
|
||||||
}
|
|
||||||
uint idx = 1;
|
|
||||||
do {
|
|
||||||
ptr[arg_len - idx] = '0' + (arg % 10);
|
|
||||||
arg /= 10;
|
|
||||||
idx++;
|
|
||||||
} while (arg > 0);
|
|
||||||
ptr[arg_len] = 0;
|
|
||||||
return arg_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline void print_args(device char* ptr, unsigned len, T arg) {
|
|
||||||
print_arg(ptr, len, arg);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T, typename... Args>
|
|
||||||
inline void print_args(device char* ptr, unsigned len, T arg, Args... args) {
|
|
||||||
const auto rc = print_arg(ptr, len, arg);
|
|
||||||
print_args(ptr + rc, len - rc, args...);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace detail
|
|
||||||
|
|
||||||
template <typename... Args>
|
|
||||||
static void report_error(
|
|
||||||
device ErrorMessages* msgs,
|
|
||||||
constant const char* file,
|
|
||||||
int line,
|
|
||||||
constant const char* func,
|
|
||||||
Args... args) {
|
|
||||||
const auto idx =
|
|
||||||
atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed);
|
|
||||||
if (idx >= error_message_count) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
device auto* msg = &msgs->msg[idx];
|
|
||||||
detail::strncpy(msg->file, file, 128);
|
|
||||||
detail::strncpy(msg->func, func, 128);
|
|
||||||
detail::print_args(msg->message, 250, args...);
|
|
||||||
msg->line = line;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define TORCH_REPORT_ERROR(buf, ...) \
|
|
||||||
::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__)
|
|
||||||
#endif
|
|
||||||
} // namespace metal
|
|
||||||
} // namespace c10
|
|
||||||
@ -66,15 +66,6 @@ def define_targets(rules):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
rules.cc_test(
|
|
||||||
name = "util/nofatal_test",
|
|
||||||
srcs = ["util/nofatal_test.cpp"],
|
|
||||||
deps = [
|
|
||||||
"//c10/util:base",
|
|
||||||
"@com_google_googletest//:gtest_main",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
rules.cc_test(
|
rules.cc_test(
|
||||||
name = "util/ssize_test",
|
name = "util/ssize_test",
|
||||||
srcs = ["util/ssize_test.cpp"],
|
srcs = ["util/ssize_test.cpp"],
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
|
#include <c10/test/util/Macros.h>
|
||||||
|
#include <c10/util/Metaprogramming.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <torch/headeronly/util/Metaprogramming.h>
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
|
||||||
using namespace torch::headeronly::guts;
|
using namespace c10::guts;
|
||||||
|
|
||||||
// NOLINTBEGIN(modernize*, cppcoreguidelines-special-member-functions)
|
// NOLINTBEGIN(modernize*, cppcoreguidelines-special-member-functions)
|
||||||
namespace {
|
namespace {
|
||||||
@ -64,15 +65,6 @@ static_assert(
|
|||||||
typename make_function_traits_t<void, typelist::typelist<int, float>>::
|
typename make_function_traits_t<void, typelist::typelist<int, float>>::
|
||||||
func_type>::value,
|
func_type>::value,
|
||||||
"");
|
"");
|
||||||
|
|
||||||
struct Functor final {
|
|
||||||
std::string operator()(int64_t a, float b) const;
|
|
||||||
};
|
|
||||||
static_assert(
|
|
||||||
std::is_same<
|
|
||||||
std::string(int64_t, float),
|
|
||||||
typename infer_function_traits_t<Functor>::func_type>::value,
|
|
||||||
"");
|
|
||||||
} // namespace test_function_traits
|
} // namespace test_function_traits
|
||||||
|
|
||||||
struct MovableOnly {
|
struct MovableOnly {
|
||||||
@ -1,8 +1,8 @@
|
|||||||
|
#include <c10/util/TypeList.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <torch/headeronly/util/TypeList.h>
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
using namespace torch::headeronly::guts::typelist;
|
using namespace c10::guts::typelist;
|
||||||
// NOLINTBEGIN(modernize-unary-static-assert)
|
// NOLINTBEGIN(modernize-unary-static-assert)
|
||||||
namespace test_size {
|
namespace test_size {
|
||||||
class MyClass {};
|
class MyClass {};
|
||||||
@ -1,7 +1,7 @@
|
|||||||
|
#include <c10/util/TypeTraits.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <torch/headeronly/util/TypeTraits.h>
|
|
||||||
|
|
||||||
using namespace torch::headeronly::guts;
|
using namespace c10::guts;
|
||||||
|
|
||||||
// NOLINTBEGIN(modernize-unary-static-assert)
|
// NOLINTBEGIN(modernize-unary-static-assert)
|
||||||
namespace {
|
namespace {
|
||||||
@ -1,53 +0,0 @@
|
|||||||
#include <gtest/gtest.h>
|
|
||||||
|
|
||||||
#include <c10/util/Exception.h>
|
|
||||||
#include <c10/util/Logging.h>
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
// Calls `fn` and expects it to throw a c10::Error whose
// what_without_backtrace() text contains `expected_msg`.
//
// Despite the "Eq" in the name, the comparison is a substring search
// (std::string::find), not an exact match. If `fn` returns without throwing a
// c10::Error, a gtest failure is recorded. Other exception types are not
// caught here.
template <typename T>
|
|
||||||
inline void expectThrowsEq(T&& fn, const char* expected_msg) {
|
|
||||||
try {
|
|
||||||
std::forward<T>(fn)();
|
|
||||||
} catch (const c10::Error& e) {
|
|
||||||
EXPECT_TRUE(
|
|
||||||
std::string(e.what_without_backtrace()).find(expected_msg) !=
|
|
||||||
std::string::npos);
|
|
||||||
// The expected exception was seen; skip the failure report below.
return;
|
|
||||||
}
|
|
||||||
ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
|
|
||||||
<< "\" but didn't throw";
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
// Exercises the TORCH_CHECK_* comparison macros and TORCH_CHECK_NOTNULL: a
// passing check must be a no-op, and each failing check must throw a
// c10::Error carrying the expected message (verified via expectThrowsEq).
TEST(NofatalTest, TorchCheckComparisons) {
|
|
||||||
// Quick check that a passing comparison is a no-op.
|
|
||||||
TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
|
|
||||||
"Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
|
|
||||||
expectThrowsEq(
|
|
||||||
[]() {
|
|
||||||
void* p = nullptr;
|
|
||||||
TORCH_CHECK_NOTNULL(p);
|
|
||||||
},
|
|
||||||
"Check failed: 'p' must be non NULL.");
|
|
||||||
|
|
||||||
#if GTEST_HAS_DEATH_TEST
|
|
||||||
#ifndef NDEBUG
|
|
||||||
// In a debug build, a failing DCHECK should result in death.
|
|
||||||
EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
|
|
||||||
#else
|
|
||||||
TORCH_DCHECK_EQ(1, 2); // no-op
|
|
||||||
#endif
|
|
||||||
#endif // GTEST_HAS_DEATH_TEST
|
|
||||||
}
|
|
||||||
@ -702,98 +702,6 @@ namespace c10::detail {
|
|||||||
#define TORCH_CHECK_ARG(cond, argN, ...) \
|
#define TORCH_CHECK_ARG(cond, argN, ...) \
|
||||||
TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)
|
TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)
|
||||||
|
|
||||||
// FATAL_IF(condition): evaluates to (void)0 when `condition` is true;
// otherwise yields the stream of a temporary ::c10::MessageLogger constructed
// with GLOG_FATAL severity, so the caller can append a message with `<<`.
// Defined only if no FATAL_IF macro already exists.
// NOTE(review): `condition` is expanded without surrounding parentheses; the
// caller visible here (TORCH_DCHECK_OP) passes an already-parenthesized
// expression.
#ifndef FATAL_IF
|
|
||||||
#ifdef C10_USE_GLOG
|
|
||||||
// glog build: the severity constant comes from ::google.
#define FATAL_IF(condition) \
|
|
||||||
condition ? (void)0 \
|
|
||||||
: ::c10::LoggerVoidify() & \
|
|
||||||
::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
|
|
||||||
.stream()
|
|
||||||
#else
|
|
||||||
// Non-glog build: the severity constant comes from ::c10.
#define FATAL_IF(condition) \
|
|
||||||
condition ? (void)0 \
|
|
||||||
: ::c10::LoggerVoidify() & \
|
|
||||||
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// NON_FATAL_IF(condition): same shape as FATAL_IF, but the MessageLogger is
// constructed with a trailing `false` argument. In the MessageLogger
// constructor that argument is `exit_on_fatal`; when it is false,
// DealWithFatal throws c10::Error instead of terminating the process.
// Defined only if no NON_FATAL_IF macro already exists.
// NOTE(review): `condition` is expanded without surrounding parentheses; the
// caller visible here (TORCH_CHECK_OP) passes an already-parenthesized
// expression.
#ifndef NON_FATAL_IF
|
|
||||||
#ifdef C10_USE_GLOG
|
|
||||||
// glog build: the severity constant comes from ::google.
#define NON_FATAL_IF(condition) \
|
|
||||||
condition ? (void)0 \
|
|
||||||
: ::c10::LoggerVoidify() & \
|
|
||||||
::c10::MessageLogger( \
|
|
||||||
__FILE__, __LINE__, ::google::GLOG_FATAL, false) \
|
|
||||||
.stream()
|
|
||||||
#else
|
|
||||||
// Non-glog build: the severity constant comes from ::c10.
#define NON_FATAL_IF(condition) \
|
|
||||||
condition ? (void)0 \
|
|
||||||
: ::c10::LoggerVoidify() & \
|
|
||||||
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
|
|
||||||
.stream()
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Binary comparison check macros
|
|
||||||
// TORCH_CHECK_OP(val1, val2, op): checks `(val1) op (val2)` through
// NON_FATAL_IF. On failure it streams "Check failed: <val1> <op> <val2>
// (<value1> vs. <value2>). " and the caller may append more text with `<<`.
// Note that val1 and val2 are evaluated a second time when the failure
// message is built, so arguments with side effects run twice on failure.
#define TORCH_CHECK_OP(val1, val2, op) \
|
|
||||||
NON_FATAL_IF(((val1)op(val2))) \
|
|
||||||
<< "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
|
|
||||||
<< (val2) << "). "
|
|
||||||
|
|
||||||
// TORCH_DCHECK_OP(val1, val2, op): same message format as TORCH_CHECK_OP, but
// routed through FATAL_IF instead of NON_FATAL_IF. As there, val1 and val2 are
// evaluated again when the failure message is built.
#define TORCH_DCHECK_OP(val1, val2, op) \
|
|
||||||
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
|
|
||||||
<< (val1) << " vs. " << (val2) << "). "
|
|
||||||
|
|
||||||
#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
|
|
||||||
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
|
|
||||||
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
|
|
||||||
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
|
|
||||||
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
|
|
||||||
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
|
|
||||||
|
|
||||||
// Debug versions of TORCH_CHECK_OP macros
|
|
||||||
#ifndef NDEBUG
|
|
||||||
#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
|
|
||||||
#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
|
|
||||||
#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
|
|
||||||
#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
|
|
||||||
#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
|
|
||||||
#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
|
|
||||||
#else // !NDEBUG
|
|
||||||
// Optimized versions - generate no code
|
|
||||||
#define TORCH_DCHECK_EQ(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, ==)
|
|
||||||
#define TORCH_DCHECK_NE(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, !=)
|
|
||||||
#define TORCH_DCHECK_LE(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, <=)
|
|
||||||
#define TORCH_DCHECK_LT(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, <)
|
|
||||||
#define TORCH_DCHECK_GE(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, >=)
|
|
||||||
#define TORCH_DCHECK_GT(val1, val2) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_DCHECK_OP(val1, val2, >)
|
|
||||||
#endif // NDEBUG
|
|
||||||
|
|
||||||
// Null pointer check macro
|
|
||||||
#define TORCH_CHECK_NOTNULL(val) \
|
|
||||||
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
|
|
||||||
|
|
||||||
#ifndef NDEBUG
|
|
||||||
#define TORCH_DCHECK_NOTNULL(val) \
|
|
||||||
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
|
|
||||||
#else // !NDEBUG
|
|
||||||
#define TORCH_DCHECK_NOTNULL(val) \
|
|
||||||
while (false) \
|
|
||||||
TORCH_CHECK_NOTNULL(val)
|
|
||||||
#endif // NDEBUG
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
// Deprecated macros
|
// Deprecated macros
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|||||||
@ -291,32 +291,6 @@ namespace c10 {
|
|||||||
using fLB::FLAGS_logtostderr;
|
using fLB::FLAGS_logtostderr;
|
||||||
using fLI::FLAGS_minloglevel;
|
using fLI::FLAGS_minloglevel;
|
||||||
using fLI::FLAGS_v;
|
using fLI::FLAGS_v;
|
||||||
|
|
||||||
// Constructs a logger that buffers its message in stream_.
// `severity` and `exit_on_fatal` are stored for use by the destructor and
// DealWithFatal; `file` and `line` are accepted but not used by this
// constructor.
MessageLogger::MessageLogger(
|
|
||||||
const char* file,
|
|
||||||
int line,
|
|
||||||
int severity,
|
|
||||||
bool exit_on_fatal)
|
|
||||||
: stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
|
|
||||||
|
|
||||||
// On destruction, a logger with GLOG_FATAL severity hands off to
// DealWithFatal; any other severity does nothing here. The destructor is
// declared noexcept(false) because DealWithFatal may throw c10::Error.
MessageLogger::~MessageLogger() noexcept(false) {
|
|
||||||
if (severity_ == ::google::GLOG_FATAL) {
|
|
||||||
DealWithFatal();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the internal buffer that callers append their message to with `<<`.
std::stringstream& MessageLogger::stream() {
|
|
||||||
return stream_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handles a fatal message: when exit_on_fatal_ is set, the buffered text is
// forwarded to LOG(FATAL); otherwise it is thrown as a c10::Error so the
// caller can catch it.
void MessageLogger::DealWithFatal() {
|
|
||||||
if (exit_on_fatal_) {
|
|
||||||
LOG(FATAL) << stream_.str();
|
|
||||||
} else {
|
|
||||||
throw c10::Error(stream_.str(), nullptr, nullptr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace c10
|
} // namespace c10
|
||||||
|
|
||||||
C10_DEFINE_int(
|
C10_DEFINE_int(
|
||||||
@ -438,16 +412,17 @@ void ShowLogInfoToStderr() {
|
|||||||
FLAGS_caffe2_log_level = GLOG_INFO;
|
FLAGS_caffe2_log_level = GLOG_INFO;
|
||||||
}
|
}
|
||||||
|
|
||||||
MessageLogger::MessageLogger(
|
MessageLogger::MessageLogger(const char* file, int line, int severity)
|
||||||
const char* file,
|
: severity_(severity) {
|
||||||
int line,
|
|
||||||
int severity,
|
|
||||||
bool exit_on_fatal)
|
|
||||||
: severity_(severity), exit_on_fatal_(exit_on_fatal) {
|
|
||||||
if (severity_ < FLAGS_caffe2_log_level) {
|
if (severity_ < FLAGS_caffe2_log_level) {
|
||||||
// Nothing needs to be logged.
|
// Nothing needs to be logged.
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
#ifdef ANDROID
|
||||||
|
tag_ = "native";
|
||||||
|
#else // !ANDROID
|
||||||
|
tag_ = "";
|
||||||
|
#endif // ANDROID
|
||||||
|
|
||||||
time_t rawtime = 0;
|
time_t rawtime = 0;
|
||||||
time(&rawtime);
|
time(&rawtime);
|
||||||
@ -483,7 +458,7 @@ MessageLogger::MessageLogger(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Output the contents of the stream to the proper channel on destruction.
|
// Output the contents of the stream to the proper channel on destruction.
|
||||||
MessageLogger::~MessageLogger() noexcept(false) {
|
MessageLogger::~MessageLogger() {
|
||||||
if (severity_ < FLAGS_caffe2_log_level) {
|
if (severity_ < FLAGS_caffe2_log_level) {
|
||||||
// Nothing needs to be logged.
|
// Nothing needs to be logged.
|
||||||
return;
|
return;
|
||||||
@ -523,18 +498,6 @@ MessageLogger::~MessageLogger() noexcept(false) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the internal buffer that callers append their message to with `<<`.
std::stringstream& MessageLogger::stream() {
|
|
||||||
return stream_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handles a fatal message in the non-glog build: when exit_on_fatal_ is set
// the process is terminated with abort(); otherwise the buffered text is
// thrown as a c10::Error so the caller can catch it.
void MessageLogger::DealWithFatal() {
|
|
||||||
if (exit_on_fatal_) {
|
|
||||||
abort();
|
|
||||||
} else {
|
|
||||||
throw c10::Error(stream_.str(), nullptr, nullptr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace c10
|
} // namespace c10
|
||||||
|
|
||||||
#endif // !C10_USE_GLOG
|
#endif // !C10_USE_GLOG
|
||||||
|
|||||||
1
c10/util/Metaprogramming.cpp
Normal file
1
c10/util/Metaprogramming.cpp
Normal file
@ -0,0 +1 @@
|
|||||||
|
#include <c10/util/Metaprogramming.h>
|
||||||
@ -1 +1,224 @@
|
|||||||
#include <torch/headeronly/util/Metaprogramming.h>
|
#pragma once
|
||||||
|
|
||||||
|
#include <c10/util/TypeList.h>
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
|
namespace c10::guts {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Access information about result type or arguments from a function type.
|
||||||
|
* Example:
|
||||||
|
* using A = function_traits<int (float, double)>::return_type // A == int
|
||||||
|
* using A = function_traits<int (float, double)>::parameter_types::tuple_type
|
||||||
|
* // A == tuple<float, double>
|
||||||
|
*/
|
||||||
|
template <class Func>
|
||||||
|
struct function_traits {
|
||||||
|
static_assert(
|
||||||
|
!std::is_same_v<Func, Func>,
|
||||||
|
"In function_traits<Func>, Func must be a plain function type.");
|
||||||
|
};
|
||||||
|
template <class Result, class... Args>
|
||||||
|
struct function_traits<Result(Args...)> {
|
||||||
|
using func_type = Result(Args...);
|
||||||
|
using return_type = Result;
|
||||||
|
using parameter_types = typelist::typelist<Args...>;
|
||||||
|
static constexpr auto number_of_parameters = sizeof...(Args);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* infer_function_traits: creates a `function_traits` type for a simple
|
||||||
|
* function (pointer) or functor (lambda/struct). Currently does not support
|
||||||
|
* class methods.
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename Functor>
|
||||||
|
struct infer_function_traits {
|
||||||
|
using type = function_traits<
|
||||||
|
c10::guts::detail::strip_class_t<decltype(&Functor::operator())>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Result, typename... Args>
|
||||||
|
struct infer_function_traits<Result (*)(Args...)> {
|
||||||
|
using type = function_traits<Result(Args...)>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Result, typename... Args>
|
||||||
|
struct infer_function_traits<Result(Args...)> {
|
||||||
|
using type = function_traits<Result(Args...)>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using infer_function_traits_t = typename infer_function_traits<T>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* make_function_traits: creates a `function_traits` type given a Return type
|
||||||
|
* and a typelist of Argument types
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* bool f(int, int);
|
||||||
|
*
|
||||||
|
* infer_function_traits_t<decltype(f)> == make_function_traits_t<bool,
|
||||||
|
* typelist::typelist<int, int>>
|
||||||
|
*/
|
||||||
|
template <typename Result, typename ArgList>
|
||||||
|
struct make_function_traits {
|
||||||
|
static_assert(
|
||||||
|
false_t<ArgList>::value,
|
||||||
|
"In guts::make_function_traits<Result, TypeList>, the ArgList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Result, typename... Args>
|
||||||
|
struct make_function_traits<Result, typelist::typelist<Args...>> {
|
||||||
|
using type = function_traits<Result(Args...)>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Result, typename ArgList>
|
||||||
|
using make_function_traits_t =
|
||||||
|
typename make_function_traits<Result, ArgList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* make_offset_index_sequence<Start, N>
|
||||||
|
* Like make_index_sequence<N>, but starting from Start instead of 0.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12>
|
||||||
|
*/
|
||||||
|
template <size_t Start, size_t N, size_t... Is>
|
||||||
|
struct make_offset_index_sequence_impl
|
||||||
|
: make_offset_index_sequence_impl<Start, N - 1, Start + N - 1, Is...> {
|
||||||
|
static_assert(
|
||||||
|
static_cast<int>(Start) >= 0,
|
||||||
|
"make_offset_index_sequence: Start < 0");
|
||||||
|
static_assert(static_cast<int>(N) >= 0, "make_offset_index_sequence: N < 0");
|
||||||
|
};
|
||||||
|
|
||||||
|
template <size_t Start, size_t... Is>
|
||||||
|
struct make_offset_index_sequence_impl<Start, 0, Is...> {
|
||||||
|
typedef std::index_sequence<Is...> type;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <size_t Start, size_t N>
|
||||||
|
using make_offset_index_sequence =
|
||||||
|
typename make_offset_index_sequence_impl<Start, N>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use tuple_elements to extract a position-indexed subset of elements
|
||||||
|
* from the argument tuple into a result tuple.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
|
||||||
|
* std::tuple<int, double> result = tuple_elements(t, std::index_sequence<0,
|
||||||
|
* 2>());
|
||||||
|
*/
|
||||||
|
template <class Tuple, size_t... Is>
|
||||||
|
constexpr auto tuple_elements(Tuple t, std::index_sequence<Is...> /*unused*/) {
|
||||||
|
return std::tuple<std::tuple_element_t<Is, Tuple>...>(std::get<Is>(t)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use tuple_take to extract the first or last n elements from the argument
|
||||||
|
* tuple into a result tuple.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
|
||||||
|
* std::tuple<int, const char*> first_two = tuple_take<decltype(t), 2>(t);
|
||||||
|
* std::tuple<const char*, double> last_two = tuple_take<decltype(t), -2>(t);
|
||||||
|
*/
|
||||||
|
template <class Tuple, int N, class Enable = void>
|
||||||
|
struct TupleTake {};
|
||||||
|
|
||||||
|
template <class Tuple, int N>
|
||||||
|
struct TupleTake<Tuple, N, std::enable_if_t<N >= 0, void>> {
|
||||||
|
static auto call(Tuple t) {
|
||||||
|
constexpr size_t size = std::tuple_size<Tuple>();
|
||||||
|
static_assert(N <= size, "tuple_take: N > size");
|
||||||
|
return tuple_elements(t, std::make_index_sequence<N>{});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Tuple, int N>
|
||||||
|
struct TupleTake < Tuple,
|
||||||
|
N, std::enable_if_t<N<0, void>> {
|
||||||
|
static auto call(Tuple t) {
|
||||||
|
constexpr size_t size = std::tuple_size<Tuple>();
|
||||||
|
static_assert(-N <= size, "tuple_take: -N > size");
|
||||||
|
return tuple_elements(t, make_offset_index_sequence<size + N, -N>{});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Tuple, int N>
|
||||||
|
auto tuple_take(Tuple t) {
|
||||||
|
return TupleTake<Tuple, N>::call(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use tuple_slice to extract a contiguous subtuple from the argument.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* std::tuple<int, const char*, double, bool> t = std::make_tuple(0,
|
||||||
|
* "HEY", 2.0, false); std::tuple<int, const char*> middle_two =
|
||||||
|
* tuple_slice<decltype(t), 1, 2>(t);
|
||||||
|
*/
|
||||||
|
template <class Tuple, size_t Start, size_t N>
|
||||||
|
constexpr auto tuple_slice(Tuple t) {
|
||||||
|
constexpr size_t size = std::tuple_size<Tuple>();
|
||||||
|
static_assert(Start + N <= size, "tuple_slice: Start + N > size");
|
||||||
|
return tuple_elements(t, make_offset_index_sequence<Start, N>{});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use tuple_map to run a mapping function over a tuple to get a new tuple.
|
||||||
|
*
|
||||||
|
* Example 1:
|
||||||
|
* auto result = tuple_map(std::tuple<int32_t, int32_t, int32_t>(3, 4, 5), []
|
||||||
|
* (int32_t a) -> int16_t {return a+1;});
|
||||||
|
* // result == std::tuple<int16_t, int16_t, int16_t>(4, 5, 6)
|
||||||
|
*
|
||||||
|
* Example 2:
|
||||||
|
* struct Mapper {
|
||||||
|
* std::string operator()(int32_t a) const {
|
||||||
|
* return std::to_string(a);
|
||||||
|
* }
|
||||||
|
* int64_t operator()(const std::string& a) const {
|
||||||
|
* return atoi(a.c_str());
|
||||||
|
* }
|
||||||
|
* };
|
||||||
|
* auto result = tuple_map(std::tuple<int32_t, std::string>(3, "4"),
|
||||||
|
* Mapper());
|
||||||
|
* // result == std::tuple<std::string, int64_t>("3", 4)
|
||||||
|
*
|
||||||
|
* Example 3:
|
||||||
|
* struct A final {
|
||||||
|
* int32_t func() {
|
||||||
|
* return 5;
|
||||||
|
* }
|
||||||
|
* };
|
||||||
|
* struct B final {
|
||||||
|
* std::string func() {
|
||||||
|
* return "5";
|
||||||
|
* }
|
||||||
|
* };
|
||||||
|
* auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return
|
||||||
|
* a.func(); });
|
||||||
|
* // result == std::tuple<int32_t, std::string>(5, "5");
|
||||||
|
*/
|
||||||
|
namespace detail {
|
||||||
|
template <class Mapper, class... Args, size_t... Indices>
|
||||||
|
auto tuple_map(
|
||||||
|
// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
|
||||||
|
std::tuple<Args...>&& tuple,
|
||||||
|
const Mapper& mapper,
|
||||||
|
std::index_sequence<Indices...> /*unused*/) {
|
||||||
|
return std::tuple<decltype(mapper(std::forward<Args>(std::get<Indices>(
|
||||||
|
tuple))))...>(mapper(std::forward<Args>(std::get<Indices>(tuple)))...);
|
||||||
|
}
|
||||||
|
} // namespace detail
|
||||||
|
|
||||||
|
template <class Mapper, class... Args>
|
||||||
|
auto tuple_map(std::tuple<Args...>&& tuple, const Mapper& mapper) {
|
||||||
|
return detail::tuple_map(
|
||||||
|
std::move(tuple), mapper, std::index_sequence_for<Args...>());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace c10::guts
|
||||||
|
|||||||
@ -1 +1,515 @@
|
|||||||
#include <torch/headeronly/util/TypeList.h>
|
#pragma once
|
||||||
|
|
||||||
|
#include <c10/util/TypeTraits.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <tuple>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
namespace c10::guts {
|
||||||
|
|
||||||
|
template <class... T>
|
||||||
|
struct false_t : std::false_type {};
|
||||||
|
template <template <class> class... T>
|
||||||
|
struct false_higher_t : std::false_type {};
|
||||||
|
|
||||||
|
namespace typelist {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Type holding a list of types for compile time type computations
|
||||||
|
*/
|
||||||
|
template <class... Items>
|
||||||
|
struct typelist final {
|
||||||
|
public:
|
||||||
|
typelist() = delete; // not for instantiation
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of types in a typelist
|
||||||
|
* Example:
|
||||||
|
* 3 == size<typelist<int, int, double>>::value
|
||||||
|
*/
|
||||||
|
template <class TypeList>
|
||||||
|
struct size final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::size<T>, T must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class... Types>
|
||||||
|
struct size<typelist<Types...>> final {
|
||||||
|
static constexpr size_t value = sizeof...(Types);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transforms a list of types into a tuple holding these types.
|
||||||
|
* Example:
|
||||||
|
* std::tuple<int, string> == to_tuple_t<typelist<int, string>>
|
||||||
|
*/
|
||||||
|
template <class TypeList>
|
||||||
|
struct to_tuple final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::to_tuple<T>, T must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class... Types>
|
||||||
|
struct to_tuple<typelist<Types...>> final {
|
||||||
|
using type = std::tuple<Types...>;
|
||||||
|
};
|
||||||
|
template <class TypeList>
|
||||||
|
using to_tuple_t = typename to_tuple<TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a typelist containing the types of a given tuple.
|
||||||
|
* Example:
|
||||||
|
* typelist<int, string> == from_tuple_t<std::tuple<int, string>>
|
||||||
|
*/
|
||||||
|
template <class Tuple>
|
||||||
|
struct from_tuple final {
|
||||||
|
static_assert(
|
||||||
|
false_t<Tuple>::value,
|
||||||
|
"In typelist::from_tuple<T>, T must be std::tuple<...>.");
|
||||||
|
};
|
||||||
|
template <class... Types>
|
||||||
|
struct from_tuple<std::tuple<Types...>> final {
|
||||||
|
using type = typelist<Types...>;
|
||||||
|
};
|
||||||
|
template <class Tuple>
|
||||||
|
using from_tuple_t = typename from_tuple<Tuple>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Concatenates multiple type lists.
|
||||||
|
* Example:
|
||||||
|
* typelist<int, string, int> == concat_t<typelist<int, string>,
|
||||||
|
* typelist<int>>
|
||||||
|
*/
|
||||||
|
template <class... TypeLists>
|
||||||
|
struct concat final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeLists...>::value,
|
||||||
|
"In typelist::concat<T1, ...>, the T arguments each must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class... Head1Types, class... Head2Types, class... TailLists>
|
||||||
|
struct concat<typelist<Head1Types...>, typelist<Head2Types...>, TailLists...>
|
||||||
|
final {
|
||||||
|
using type =
|
||||||
|
typename concat<typelist<Head1Types..., Head2Types...>, TailLists...>::
|
||||||
|
type;
|
||||||
|
};
|
||||||
|
template <class... HeadTypes>
|
||||||
|
struct concat<typelist<HeadTypes...>> final {
|
||||||
|
using type = typelist<HeadTypes...>;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct concat<> final {
|
||||||
|
using type = typelist<>;
|
||||||
|
};
|
||||||
|
template <class... TypeLists>
|
||||||
|
using concat_t = typename concat<TypeLists...>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filters the types in a type list by a type trait.
|
||||||
|
* Examples:
|
||||||
|
* typelist<int&, const string&&> == filter_t<std::is_reference,
|
||||||
|
* typelist<void, string, int&, bool, const string&&, int>>
|
||||||
|
*/
|
||||||
|
template <template <class> class Condition, class TypeList>
|
||||||
|
struct filter final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::filter<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <template <class> class Condition, class Head, class... Tail>
|
||||||
|
struct filter<Condition, typelist<Head, Tail...>> final {
|
||||||
|
static_assert(
|
||||||
|
is_type_condition<Condition>::value,
|
||||||
|
"In typelist::filter<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||||
|
using type = std::conditional_t<
|
||||||
|
Condition<Head>::value,
|
||||||
|
concat_t<
|
||||||
|
typelist<Head>,
|
||||||
|
typename filter<Condition, typelist<Tail...>>::type>,
|
||||||
|
typename filter<Condition, typelist<Tail...>>::type>;
|
||||||
|
};
|
||||||
|
template <template <class> class Condition>
|
||||||
|
struct filter<Condition, typelist<>> final {
|
||||||
|
static_assert(
|
||||||
|
is_type_condition<Condition>::value,
|
||||||
|
"In typelist::filter<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||||
|
using type = typelist<>;
|
||||||
|
};
|
||||||
|
template <template <class> class Condition, class TypeList>
|
||||||
|
using filter_t = typename filter<Condition, TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts how many types in the list fulfill a type trait
|
||||||
|
* Examples:
|
||||||
|
* 2 == count_if<std::is_reference, typelist<void, string, int&, bool, const
|
||||||
|
* string&&, int>>
|
||||||
|
*/
|
||||||
|
template <template <class> class Condition, class TypeList>
|
||||||
|
struct count_if final {
|
||||||
|
static_assert(
|
||||||
|
is_type_condition<Condition>::value,
|
||||||
|
"In typelist::count_if<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||||
|
static_assert(
|
||||||
|
is_instantiation_of<typelist, TypeList>::value,
|
||||||
|
"In typelist::count_if<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||||
|
// TODO Direct implementation might be faster
|
||||||
|
static constexpr size_t value = size<filter_t<Condition, TypeList>>::value;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if a typelist contains a certain type.
|
||||||
|
* Examples:
|
||||||
|
* contains<typelist<int, string>, string> == true_type
|
||||||
|
* contains<typelist<int, string>, double> == false_type
|
||||||
|
*/
|
||||||
|
namespace detail {
|
||||||
|
template <class TypeList, class Type, class Enable = void>
|
||||||
|
struct contains {};
|
||||||
|
template <class Type>
|
||||||
|
struct contains<typelist<>, Type, void> : std::false_type {};
|
||||||
|
template <class Type, class Head, class... Tail>
|
||||||
|
struct contains<
|
||||||
|
typelist<Head, Tail...>,
|
||||||
|
Type,
|
||||||
|
std::enable_if_t<std::is_same_v<Head, Type>>> : std::true_type {};
|
||||||
|
template <class Type, class Head, class... Tail>
|
||||||
|
struct contains<
|
||||||
|
typelist<Head, Tail...>,
|
||||||
|
Type,
|
||||||
|
std::enable_if_t<!std::is_same_v<Head, Type>>>
|
||||||
|
: contains<typelist<Tail...>, Type> {};
|
||||||
|
} // namespace detail
|
||||||
|
template <class TypeList, class Type>
|
||||||
|
using contains = typename detail::contains<TypeList, Type>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true iff the type trait is true for all types in the type list
|
||||||
|
* Examples:
|
||||||
|
* true == all<std::is_reference, typelist<int&, const float&&, const
|
||||||
|
* MyClass&>>::value false == all<std::is_reference, typelist<int&, const
|
||||||
|
* float&&, MyClass>>::value
|
||||||
|
*/
|
||||||
|
template <template <class> class Condition, class TypeList>
|
||||||
|
struct all {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::all<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <template <class> class Condition, class... Types>
|
||||||
|
struct all<Condition, typelist<Types...>>
|
||||||
|
: std::conjunction<Condition<Types>...> {
|
||||||
|
static_assert(
|
||||||
|
is_type_condition<Condition>::value,
|
||||||
|
"In typelist::all<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true iff the type trait is true for any type in the type list
|
||||||
|
* Examples:
|
||||||
|
* true == true_for_any_type<std::is_reference, typelist<int, const
|
||||||
|
* float&&, const MyClass>>::value false ==
|
||||||
|
* true_for_any_type<std::is_reference, typelist<int, const float,
|
||||||
|
* MyClass>>::value
|
||||||
|
*/
|
||||||
|
template <template <class> class Condition, class TypeList>
|
||||||
|
struct true_for_any_type final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::true_for_any_type<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <template <class> class Condition, class... Types>
|
||||||
|
struct true_for_any_type<Condition, typelist<Types...>> final
|
||||||
|
: std::disjunction<Condition<Types>...> {
|
||||||
|
static_assert(
|
||||||
|
is_type_condition<Condition>::value,
|
||||||
|
"In typelist::true_for_any_type<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maps types of a type list using a type trait
|
||||||
|
* Example:
|
||||||
|
* typelist<int&, double&, string&> == map_t<std::add_lvalue_reference_t,
|
||||||
|
* typelist<int, double, string>>
|
||||||
|
*/
|
||||||
|
template <template <class> class Mapper, class TypeList>
|
||||||
|
struct map final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::map<Mapper, TypeList>, the TypeList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <template <class> class Mapper, class... Types>
|
||||||
|
struct map<Mapper, typelist<Types...>> final {
|
||||||
|
using type = typelist<Mapper<Types>...>;
|
||||||
|
};
|
||||||
|
template <template <class> class Mapper, class TypeList>
|
||||||
|
using map_t = typename map<Mapper, TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the first element of a type list.
|
||||||
|
* Example:
|
||||||
|
* int == head_t<typelist<int, string>>
|
||||||
|
*/
|
||||||
|
template <class TypeList>
|
||||||
|
struct head final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::head<T>, the T argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class Head, class... Tail>
|
||||||
|
struct head<typelist<Head, Tail...>> final {
|
||||||
|
using type = Head;
|
||||||
|
};
|
||||||
|
template <class TypeList>
|
||||||
|
using head_t = typename head<TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the first element of a type list, or the specified default if the
|
||||||
|
* type list is empty. Example: int == head_t<bool, typelist<int, string>>
|
||||||
|
* bool == head_t<bool, typelist<>>
|
||||||
|
*/
|
||||||
|
template <class Default, class TypeList>
|
||||||
|
struct head_with_default final {
|
||||||
|
using type = Default;
|
||||||
|
};
|
||||||
|
template <class Default, class Head, class... Tail>
|
||||||
|
struct head_with_default<Default, typelist<Head, Tail...>> final {
|
||||||
|
using type = Head;
|
||||||
|
};
|
||||||
|
template <class Default, class TypeList>
|
||||||
|
using head_with_default_t = typename head_with_default<Default, TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the N-th element of a type list.
|
||||||
|
* Example:
|
||||||
|
* int == element_t<1, typelist<float, int, char>>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/// Base template.
|
||||||
|
template <size_t Index, class TypeList>
|
||||||
|
struct element final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::element<T>, the T argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Successful case, we have reached the zero index and can "return" the head
|
||||||
|
/// type.
|
||||||
|
template <class Head, class... Tail>
|
||||||
|
struct element<0, typelist<Head, Tail...>> {
|
||||||
|
using type = Head;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Error case, we have an index but ran out of types! It will only be selected
|
||||||
|
/// if `Ts...` is actually empty!
|
||||||
|
template <size_t Index, class... Ts>
|
||||||
|
struct element<Index, typelist<Ts...>> {
|
||||||
|
static_assert(
|
||||||
|
Index < sizeof...(Ts),
|
||||||
|
"Index is out of bounds in typelist::element");
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Shave off types until we hit the <0, Head, Tail...> or <Index> case.
|
||||||
|
template <size_t Index, class Head, class... Tail>
|
||||||
|
struct element<Index, typelist<Head, Tail...>>
|
||||||
|
: element<Index - 1, typelist<Tail...>> {};
|
||||||
|
|
||||||
|
/// Convenience alias.
|
||||||
|
template <size_t Index, class TypeList>
|
||||||
|
using element_t = typename element<Index, TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the last element of a type list.
|
||||||
|
* Example:
|
||||||
|
* int == last_t<typelist<int, string>>
|
||||||
|
*/
|
||||||
|
template <class TypeList>
|
||||||
|
struct last final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::last<T>, the T argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class Head, class... Tail>
|
||||||
|
struct last<typelist<Head, Tail...>> final {
|
||||||
|
using type = typename last<typelist<Tail...>>::type;
|
||||||
|
};
|
||||||
|
template <class Head>
|
||||||
|
struct last<typelist<Head>> final {
|
||||||
|
using type = Head;
|
||||||
|
};
|
||||||
|
template <class TypeList>
|
||||||
|
using last_t = typename last<TypeList>::type;
|
||||||
|
static_assert(std::is_same_v<int, last_t<typelist<double, float, int>>>);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Take/drop a number of arguments from a typelist.
|
||||||
|
* Example:
|
||||||
|
* typelist<int, string> == take_t<typelist<int, string, bool>, 2>
|
||||||
|
* typelist<bool> == drop_t<typelist<int, string, bool>, 2>
|
||||||
|
*/
|
||||||
|
namespace detail {
|
||||||
|
template <class TypeList, size_t offset, class IndexSequence>
|
||||||
|
struct take_elements final {};
|
||||||
|
|
||||||
|
template <class TypeList, size_t offset, size_t... Indices>
|
||||||
|
struct take_elements<TypeList, offset, std::index_sequence<Indices...>> final {
|
||||||
|
using type = typelist<typename element<offset + Indices, TypeList>::type...>;
|
||||||
|
};
|
||||||
|
} // namespace detail
|
||||||
|
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
struct take final {
|
||||||
|
static_assert(
|
||||||
|
is_instantiation_of<typelist, TypeList>::value,
|
||||||
|
"In typelist::take<T, num>, the T argument must be typelist<...>.");
|
||||||
|
static_assert(
|
||||||
|
num <= size<TypeList>::value,
|
||||||
|
"Tried to typelist::take more elements than there are in the list");
|
||||||
|
using type = typename detail::
|
||||||
|
take_elements<TypeList, 0, std::make_index_sequence<num>>::type;
|
||||||
|
};
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
using take_t = typename take<TypeList, num>::type;
|
||||||
|
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
struct drop final {
|
||||||
|
static_assert(
|
||||||
|
is_instantiation_of<typelist, TypeList>::value,
|
||||||
|
"In typelist::drop<T, num>, the T argument must be typelist<...>.");
|
||||||
|
static_assert(
|
||||||
|
num <= size<TypeList>::value,
|
||||||
|
"Tried to typelist::drop more elements than there are in the list");
|
||||||
|
using type = typename detail::take_elements<
|
||||||
|
TypeList,
|
||||||
|
num,
|
||||||
|
std::make_index_sequence<size<TypeList>::value - num>>::type;
|
||||||
|
};
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
using drop_t = typename drop<TypeList, num>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Like drop, but returns an empty list rather than an assertion error if `num`
|
||||||
|
* is larger than the size of the TypeList.
|
||||||
|
* Example:
|
||||||
|
* typelist<> == drop_if_nonempty_t<typelist<string, bool>, 2>
|
||||||
|
* typelist<> == drop_if_nonempty_t<typelist<int, string, bool>, 3>
|
||||||
|
*/
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
struct drop_if_nonempty final {
|
||||||
|
static_assert(
|
||||||
|
is_instantiation_of<typelist, TypeList>::value,
|
||||||
|
"In typelist::drop<T, num>, the T argument must be typelist<...>.");
|
||||||
|
using type = typename detail::take_elements<
|
||||||
|
TypeList,
|
||||||
|
std::min(num, size<TypeList>::value),
|
||||||
|
std::make_index_sequence<
|
||||||
|
size<TypeList>::value - std::min(num, size<TypeList>::value)>>::type;
|
||||||
|
};
|
||||||
|
template <class TypeList, size_t num>
|
||||||
|
using drop_if_nonempty_t = typename drop_if_nonempty<TypeList, num>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses a typelist.
|
||||||
|
* Example:
|
||||||
|
* typelist<int, string> == reverse_t<typelist<string, int>>
|
||||||
|
*/
|
||||||
|
template <class TypeList>
|
||||||
|
struct reverse final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::reverse<T>, the T argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class Head, class... Tail>
|
||||||
|
struct reverse<typelist<Head, Tail...>> final {
|
||||||
|
using type =
|
||||||
|
concat_t<typename reverse<typelist<Tail...>>::type, typelist<Head>>;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct reverse<typelist<>> final {
|
||||||
|
using type = typelist<>;
|
||||||
|
};
|
||||||
|
template <class TypeList>
|
||||||
|
using reverse_t = typename reverse<TypeList>::type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the index of the first type in a typelist fulfilling a type trait
|
||||||
|
* condition. Example:
|
||||||
|
*
|
||||||
|
* 2 == find_if<typelist<char, int, char&, int&>, std::is_reference>::value
|
||||||
|
*/
|
||||||
|
template <class TypeList, template <class> class Condition, class Enable = void>
|
||||||
|
struct find_if final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::find_if<TypeList, Condition>, the TypeList argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <template <class> class Condition>
|
||||||
|
struct find_if<typelist<>, Condition, void> final {
|
||||||
|
static_assert(
|
||||||
|
false_higher_t<Condition>::value,
|
||||||
|
"In typelist::find_if<Type/List, Condition>, didn't find any type fulfilling the Condition.");
|
||||||
|
};
|
||||||
|
template <class Head, class... Tail, template <class> class Condition>
|
||||||
|
struct find_if<
|
||||||
|
typelist<Head, Tail...>,
|
||||||
|
Condition,
|
||||||
|
std::enable_if_t<Condition<Head>::value>>
|
||||||
|
final {
|
||||||
|
static constexpr size_t value = 0;
|
||||||
|
};
|
||||||
|
template <class Head, class... Tail, template <class> class Condition>
|
||||||
|
struct find_if<
|
||||||
|
typelist<Head, Tail...>,
|
||||||
|
Condition,
|
||||||
|
std::enable_if_t<!Condition<Head>::value>>
|
||||||
|
final {
|
||||||
|
static constexpr size_t value =
|
||||||
|
1 + find_if<typelist<Tail...>, Condition>::value;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maps a list of types into a list of values.
|
||||||
|
* Examples:
|
||||||
|
* // Example 1
|
||||||
|
* auto sizes =
|
||||||
|
* map_types_to_values<typelist<int64_t, bool, uint32_t>>(
|
||||||
|
* [] (auto t) { return sizeof(decltype(t)::type); }
|
||||||
|
* );
|
||||||
|
* // sizes == std::tuple<size_t, size_t, size_t>{8, 1, 4}
|
||||||
|
*
|
||||||
|
* // Example 2
|
||||||
|
* auto shared_ptrs =
|
||||||
|
* map_types_to_values<typelist<int, double>>(
|
||||||
|
* [] (auto t) { return make_shared<typename decltype(t)::type>(); }
|
||||||
|
* );
|
||||||
|
* // shared_ptrs == std::tuple<shared_ptr<int>, shared_ptr<double>>()
|
||||||
|
*/
|
||||||
|
namespace detail {
|
||||||
|
template <class T>
|
||||||
|
struct type_ final {
|
||||||
|
using type = T;
|
||||||
|
};
|
||||||
|
template <class TypeList>
|
||||||
|
struct map_types_to_values final {
|
||||||
|
static_assert(
|
||||||
|
false_t<TypeList>::value,
|
||||||
|
"In typelist::map_types_to_values<T>, the T argument must be typelist<...>.");
|
||||||
|
};
|
||||||
|
template <class... Types>
|
||||||
|
struct map_types_to_values<typelist<Types...>> final {
|
||||||
|
template <class Func>
|
||||||
|
static auto call(Func&& func) {
|
||||||
|
return std::tuple{std::forward<Func>(func)(type_<Types>())...};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace detail
|
||||||
|
|
||||||
|
template <class TypeList, class Func>
|
||||||
|
auto map_types_to_values(Func&& func) {
|
||||||
|
return detail::map_types_to_values<TypeList>::call(std::forward<Func>(func));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace typelist
|
||||||
|
} // namespace c10::guts
|
||||||
|
|||||||
@ -1 +1,151 @@
|
|||||||
#include <torch/headeronly/util/TypeTraits.h>
|
#pragma once
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
|
namespace c10::guts {
|
||||||
|
|
||||||
|
/**
 * is_equality_comparable<T> derives from true_type iff the expression
 * `a == b` is well-formed for two lvalues of type T.
 */
// Fallback: no usable operator== was found.
template <class T, class SfinaeSlot = void>
struct is_equality_comparable : std::false_type {};

// Selected when comparing two T lvalues with == compiles.
template <class Value>
struct is_equality_comparable<
    Value,
    std::void_t<decltype(std::declval<Value&>() == std::declval<Value&>())>>
    : std::true_type {};

// The resulting std::true_type / std::false_type.
template <class T>
using is_equality_comparable_t = typename is_equality_comparable<T>::type;
|
||||||
|
|
||||||
|
/**
 * is_hashable<T> derives from true_type iff `std::hash<T>()(t)` is
 * well-formed for an lvalue `t` of type T.
 */
// Fallback: std::hash<T> cannot be constructed and invoked.
template <class T, class SfinaeSlot = void>
struct is_hashable : std::false_type {};

// Selected when a default-constructed std::hash<Key> accepts a Key lvalue.
template <class Key>
struct is_hashable<
    Key,
    std::void_t<decltype(std::hash<Key>()(std::declval<Key&>()))>>
    : std::true_type {};

// The resulting std::true_type / std::false_type.
template <class T>
using is_hashable_t = typename is_hashable<T>::type;
|
||||||
|
|
||||||
|
/**
 * is_function_type<T> derives from true_type iff T is a plain function type
 * of the form "Result(Args...)".
 */
// Fallback: T is not of the form Result(Args...).
template <class T>
struct is_function_type : std::false_type {};

// Matches plain function types.
template <class Ret, class... Params>
struct is_function_type<Ret(Params...)> : std::true_type {};

// The resulting std::true_type / std::false_type.
template <class T>
using is_function_type_t = typename is_function_type<T>::type;
|
||||||
|
|
||||||
|
/**
 * is_instantiation_of<T, I> derives from true_type iff I is an instantiation
 * of the class template T (e.g. vector<int> is an instantiation of vector).
 * Examples:
 *   is_instantiation_of_t<vector, vector<int>>        // true
 *   is_instantiation_of_t<pair, pair<int, string>>    // true
 *   is_instantiation_of_t<vector, pair<int, string>>  // false
 */
// Fallback: T is not Template<...>.
template <template <class...> class Template, class T>
struct is_instantiation_of : std::false_type {};

// Matches when the type is Template applied to some argument pack.
template <template <class...> class Tmpl, class... TmplArgs>
struct is_instantiation_of<Tmpl, Tmpl<TmplArgs...>> : std::true_type {};

// The resulting std::true_type / std::false_type.
template <template <class...> class Template, class T>
using is_instantiation_of_t = typename is_instantiation_of<Template, T>::type;
|
||||||
|
|
||||||
|
namespace detail {

/**
 * strip_class: helper that turns a pointer-to-member-function type (such as
 * the type of `&Functor::operator()`) into the plain function type
 * `Result(Args...)`, discarding the class and a trailing `const`.
 */

// Primary template: no `type` member for anything that is not matched below.
template <typename T>
struct strip_class {};

// Non-const member function pointer.
template <typename Owner, typename Ret, typename... Params>
struct strip_class<Ret (Owner::*)(Params...)> {
  using type = Ret(Params...);
};

// Const-qualified member function pointer.
template <typename Owner, typename Ret, typename... Params>
struct strip_class<Ret (Owner::*)(Params...) const> {
  using type = Ret(Params...);
};

// Shorthand for `typename strip_class<T>::type`.
template <typename T>
using strip_class_t = typename strip_class<T>::type;

} // namespace detail
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates to true_type, iff the given class is a Functor
|
||||||
|
* (i.e. has a call operator with some set of arguments)
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <class Functor, class Enable = void>
|
||||||
|
struct is_functor : std::false_type {};
|
||||||
|
template <class Functor>
|
||||||
|
struct is_functor<
|
||||||
|
Functor,
|
||||||
|
std::enable_if_t<is_function_type<
|
||||||
|
detail::strip_class_t<decltype(&Functor::operator())>>::value>>
|
||||||
|
: std::true_type {};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* lambda_is_stateless<T> is true iff the lambda type T is stateless
|
||||||
|
* (i.e. does not have a closure).
|
||||||
|
* Example:
|
||||||
|
* auto stateless_lambda = [] (int a) {return a;};
|
||||||
|
* lambda_is_stateless<decltype(stateless_lambda)> // true
|
||||||
|
* auto stateful_lambda = [&] (int a) {return a;};
|
||||||
|
* lambda_is_stateless<decltype(stateful_lambda)> // false
|
||||||
|
*/
|
||||||
|
namespace detail {
|
||||||
|
template <class LambdaType, class FuncType>
|
||||||
|
struct is_stateless_lambda__ final {
|
||||||
|
static_assert(
|
||||||
|
!std::is_same_v<LambdaType, LambdaType>,
|
||||||
|
"Base case shouldn't be hit");
|
||||||
|
};
|
||||||
|
// implementation idea: According to the C++ standard, stateless lambdas are
|
||||||
|
// convertible to function pointers
|
||||||
|
template <class LambdaType, class C, class Result, class... Args>
|
||||||
|
struct is_stateless_lambda__<LambdaType, Result (C::*)(Args...) const>
|
||||||
|
: std::is_convertible<LambdaType, Result (*)(Args...)> {};
|
||||||
|
template <class LambdaType, class C, class Result, class... Args>
|
||||||
|
struct is_stateless_lambda__<LambdaType, Result (C::*)(Args...)>
|
||||||
|
: std::is_convertible<LambdaType, Result (*)(Args...)> {};
|
||||||
|
|
||||||
|
// case where LambdaType is not even a functor
|
||||||
|
template <class LambdaType, class Enable = void>
|
||||||
|
struct is_stateless_lambda_ final : std::false_type {};
|
||||||
|
// case where LambdaType is a functor
|
||||||
|
template <class LambdaType>
|
||||||
|
struct is_stateless_lambda_<
|
||||||
|
LambdaType,
|
||||||
|
std::enable_if_t<is_functor<LambdaType>::value>>
|
||||||
|
: is_stateless_lambda__<LambdaType, decltype(&LambdaType::operator())> {};
|
||||||
|
} // namespace detail
|
||||||
|
template <class T>
|
||||||
|
using is_stateless_lambda = detail::is_stateless_lambda_<std::decay_t<T>>;
|
||||||
|
|
||||||
|
/**
 * is_type_condition<C> derives from true_type iff C<...> is a type trait
 * representing a condition, i.e. it has a static `::value` member of type
 * bool (cv-qualifiers ignored). The check is made on the instantiation
 * C<int>.
 * Example:
 *   is_type_condition<std::is_reference>  // true
 */
// Fallback: C<int>::value is missing or not a bool.
template <template <class> class C, class SfinaeSlot = void>
struct is_type_condition : std::false_type {};

// Selected when C<int>::value exists and has type bool after removing cv.
template <template <class> class Trait>
struct is_type_condition<
    Trait,
    std::enable_if_t<
        std::is_same_v<bool, std::remove_cv_t<decltype(Trait<int>::value)>>>>
    : std::true_type {};
|
||||||
|
|
||||||
|
/**
 * is_fundamental<T> derives from true_type iff T is a fundamental type
 * (that is, an arithmetic type, void, or nullptr_t).
 * Example:
 *   is_fundamental<int>  // true
 * We define it here to resolve a MSVC bug. See
 * https://github.com/pytorch/pytorch/issues/30932 for details.
 */
template <class Type>
struct is_fundamental : std::is_fundamental<Type> {};
|
||||||
|
} // namespace c10::guts
|
||||||
|
|||||||
@ -1,74 +0,0 @@
|
|||||||
#ifndef C10_UTIL_LOGGING_COMMON_H_
|
|
||||||
#define C10_UTIL_LOGGING_COMMON_H_
|
|
||||||
|
|
||||||
#include <c10/macros/Export.h>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
namespace c10 {
|
|
||||||
|
|
||||||
// MessageLogger that throws exceptions instead of aborting (glog version)
|
|
||||||
// or logs and may abort (non-glog version).
|
|
||||||
class C10_API MessageLogger {
|
|
||||||
public:
|
|
||||||
MessageLogger(
|
|
||||||
const char* file,
|
|
||||||
int line,
|
|
||||||
int severity,
|
|
||||||
bool exit_on_fatal = true);
|
|
||||||
~MessageLogger() noexcept(false);
|
|
||||||
|
|
||||||
// Return the stream associated with the logger object.
|
|
||||||
std::stringstream& stream();
|
|
||||||
|
|
||||||
private:
|
|
||||||
// When there is a fatal log, and fatal == true, we abort
|
|
||||||
// otherwise, we throw.
|
|
||||||
void DealWithFatal();
|
|
||||||
|
|
||||||
#if defined(ANDROID) && !defined(C10_USE_GLOG)
|
|
||||||
const char* tag_{"native"};
|
|
||||||
#endif
|
|
||||||
std::stringstream stream_;
|
|
||||||
int severity_;
|
|
||||||
bool exit_on_fatal_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// This class is used to explicitly ignore values in the conditional
|
|
||||||
// logging macros. This avoids compiler warnings like "value computed
|
|
||||||
// is not used" and "statement has no effect".
|
|
||||||
class C10_API LoggerVoidify {
|
|
||||||
public:
|
|
||||||
LoggerVoidify() = default;
|
|
||||||
// This has to be an operator with a precedence lower than << but
|
|
||||||
// higher than ?:
|
|
||||||
void operator&(const std::ostream& s [[maybe_unused]]) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Forward declarations of the CheckNotNull helpers; only the signatures and
// the default for `fatal` are introduced here.

// Shared declaration used by both overloads below; takes and returns `t` by
// reference.
template <typename T>
T& CheckNotNullCommon(const char* file, int line, const char* names, T& t, bool fatal = true);

// Overload taking a raw pointer.
template <typename T>
T* CheckNotNull(const char* file, int line, const char* names, T* t, bool fatal = true);

// Overload taking any other object by reference.
template <typename T>
T& CheckNotNull(const char* file, int line, const char* names, T& t, bool fatal = true);
|
|
||||||
|
|
||||||
} // namespace c10
|
|
||||||
|
|
||||||
#endif // C10_UTIL_LOGGING_COMMON_H_
|
|
||||||
@ -47,53 +47,57 @@ INSTANTIATE_FOR_CONTAINER(set)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <c10/util/logging_common.h>
|
|
||||||
#include <glog/logging.h>
|
#include <glog/logging.h>
|
||||||
|
|
||||||
namespace c10 {
|
// Additional macros on top of glog
|
||||||
|
#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
|
||||||
|
#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
|
||||||
|
#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
|
||||||
|
#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
|
||||||
|
#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
|
||||||
|
#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)
|
||||||
|
|
||||||
[[noreturn]] void ThrowEnforceNotMet(
|
#ifndef NDEBUG
|
||||||
const char* file,
|
#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
|
||||||
const int line,
|
#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
|
||||||
const char* condition,
|
#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
|
||||||
const std::string& msg,
|
#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
|
||||||
const void* caller);
|
#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
|
||||||
|
#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
|
||||||
|
#else // !NDEBUG
|
||||||
|
// These versions generate no code in optimized mode.
|
||||||
|
#define TORCH_DCHECK_EQ(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_EQ(val1, val2)
|
||||||
|
#define TORCH_DCHECK_NE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_NE(val1, val2)
|
||||||
|
#define TORCH_DCHECK_LE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_LE(val1, val2)
|
||||||
|
#define TORCH_DCHECK_LT(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_LT(val1, val2)
|
||||||
|
#define TORCH_DCHECK_GE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_GE(val1, val2)
|
||||||
|
#define TORCH_DCHECK_GT(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
DCHECK_GT(val1, val2)
|
||||||
|
#endif // NDEBUG
|
||||||
|
|
||||||
template <typename T>
|
// Check that a pointer is not null.
|
||||||
T& CheckNotNullCommon(
|
#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)
|
||||||
const char* file,
|
|
||||||
int line,
|
|
||||||
const char* names,
|
|
||||||
T& t,
|
|
||||||
bool fatal) {
|
|
||||||
if (t == nullptr) {
|
|
||||||
MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
|
|
||||||
<< "Check failed: '" << names << "' must be non NULL. ";
|
|
||||||
}
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
#ifndef NDEBUG
|
||||||
T* CheckNotNull(
|
// Debug only version of TORCH_CHECK_NOTNULL
|
||||||
const char* file,
|
#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
|
||||||
int line,
|
#else // !NDEBUG
|
||||||
const char* names,
|
// Optimized version - generates no code.
|
||||||
T* t,
|
#define TORCH_DCHECK_NOTNULL(val) \
|
||||||
bool fatal) {
|
while (false) \
|
||||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
DCHECK_NOTNULL(val)
|
||||||
}
|
#endif // NDEBUG
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
T& CheckNotNull(
|
|
||||||
const char* file,
|
|
||||||
int line,
|
|
||||||
const char* names,
|
|
||||||
T& t,
|
|
||||||
bool fatal) {
|
|
||||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace c10
|
|
||||||
|
|
||||||
// Log with source location information override (to be used in generic
|
// Log with source location information override (to be used in generic
|
||||||
// warning/error handlers implemented as functions, not macros)
|
// warning/error handlers implemented as functions, not macros)
|
||||||
|
|||||||
@ -13,7 +13,6 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <c10/util/Flags.h>
|
#include <c10/util/Flags.h>
|
||||||
#include <c10/util/logging_common.h>
|
|
||||||
|
|
||||||
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";
|
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";
|
||||||
|
|
||||||
@ -25,40 +24,61 @@ const int GLOG_ERROR = 2;
|
|||||||
const int GLOG_WARNING = 1;
|
const int GLOG_WARNING = 1;
|
||||||
const int GLOG_INFO = 0;
|
const int GLOG_INFO = 0;
|
||||||
|
|
||||||
|
class C10_API MessageLogger {
|
||||||
|
public:
|
||||||
|
MessageLogger(const char* file, int line, int severity);
|
||||||
|
~MessageLogger();
|
||||||
|
// Return the stream associated with the logger object.
|
||||||
|
std::stringstream& stream() {
|
||||||
|
return stream_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// When there is a fatal log, we simply abort.
|
||||||
|
void DealWithFatal() {
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* tag_;
|
||||||
|
std::stringstream stream_;
|
||||||
|
int severity_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// This class is used to explicitly ignore values in the conditional
|
||||||
|
// logging macros. This avoids compiler warnings like "value computed
|
||||||
|
// is not used" and "statement has no effect".
|
||||||
|
class C10_API LoggerVoidify {
|
||||||
|
public:
|
||||||
|
LoggerVoidify() = default;
|
||||||
|
// This has to be an operator with a precedence lower than << but
|
||||||
|
// higher than ?:
|
||||||
|
void operator&(const std::ostream& s [[maybe_unused]]) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Log a message and terminate.
|
||||||
|
template <class T>
|
||||||
|
void LogMessageFatal(const char* file, int line, const T& message) {
|
||||||
|
MessageLogger(file, line, GLOG_FATAL).stream() << message;
|
||||||
|
}
|
||||||
|
|
||||||
// Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
|
// Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
|
||||||
// pointers and smart pointers.
|
// pointers and smart pointers.
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T& CheckNotNullCommon(
|
T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
|
||||||
const char* file,
|
|
||||||
int line,
|
|
||||||
const char* names,
|
|
||||||
T& t,
|
|
||||||
bool fatal) {
|
|
||||||
if (t == nullptr) {
|
if (t == nullptr) {
|
||||||
MessageLogger(file, line, GLOG_FATAL, fatal).stream()
|
LogMessageFatal(file, line, std::string(names));
|
||||||
<< "Check failed: '" << names << "' must be non NULL. ";
|
|
||||||
}
|
}
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T* CheckNotNull(
|
T* CheckNotNull(const char* file, int line, const char* names, T* t) {
|
||||||
const char* file,
|
return CheckNotNullCommon(file, line, names, t);
|
||||||
int line,
|
|
||||||
const char* names,
|
|
||||||
T* t,
|
|
||||||
bool fatal) {
|
|
||||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T& CheckNotNull(
|
T& CheckNotNull(const char* file, int line, const char* names, T& t) {
|
||||||
const char* file,
|
return CheckNotNullCommon(file, line, names, t);
|
||||||
int line,
|
|
||||||
const char* names,
|
|
||||||
T& t,
|
|
||||||
bool fatal) {
|
|
||||||
return CheckNotNullCommon(file, line, names, t, fatal);
|
|
||||||
}
|
}
|
||||||
} // namespace c10
|
} // namespace c10
|
||||||
|
|
||||||
@ -116,6 +136,65 @@ static_assert(
|
|||||||
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
|
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
|
||||||
#endif // NDEBUG
|
#endif // NDEBUG
|
||||||
|
|
||||||
|
#define TORCH_CHECK_OP(val1, val2, op) \
|
||||||
|
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
|
||||||
|
<< (val1) << " vs. " << (val2) << ") "
|
||||||
|
|
||||||
|
// TORCH_CHECK_OP macro definitions
|
||||||
|
#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
|
||||||
|
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
|
||||||
|
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
|
||||||
|
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
|
||||||
|
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
|
||||||
|
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
// Debug only versions of TORCH_CHECK_OP macros.
|
||||||
|
#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
|
||||||
|
#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
|
||||||
|
#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
|
||||||
|
#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
|
||||||
|
#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
|
||||||
|
#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
|
||||||
|
#else // !NDEBUG
|
||||||
|
// These versions generate no code in optimized mode.
|
||||||
|
#define TORCH_DCHECK_EQ(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, ==)
|
||||||
|
#define TORCH_DCHECK_NE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, !=)
|
||||||
|
#define TORCH_DCHECK_LE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, <=)
|
||||||
|
#define TORCH_DCHECK_LT(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, <)
|
||||||
|
#define TORCH_DCHECK_GE(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, >=)
|
||||||
|
#define TORCH_DCHECK_GT(val1, val2) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_OP(val1, val2, >)
|
||||||
|
#endif // NDEBUG
|
||||||
|
|
||||||
|
// Check that a pointer is not null.
|
||||||
|
#define TORCH_CHECK_NOTNULL(val) \
|
||||||
|
::c10::CheckNotNull( \
|
||||||
|
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
// Debug only version of TORCH_CHECK_NOTNULL
|
||||||
|
#define TORCH_DCHECK_NOTNULL(val) \
|
||||||
|
::c10::CheckNotNull( \
|
||||||
|
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
|
||||||
|
#else // !NDEBUG
|
||||||
|
// Optimized version - generates no code.
|
||||||
|
#define TORCH_DCHECK_NOTNULL(val) \
|
||||||
|
while (false) \
|
||||||
|
TORCH_CHECK_NOTNULL(val)
|
||||||
|
#endif // NDEBUG
|
||||||
|
|
||||||
// ---------------------- Support for std objects --------------------------
|
// ---------------------- Support for std objects --------------------------
|
||||||
// These are adapted from glog to support a limited set of logging capability
|
// These are adapted from glog to support a limited set of logging capability
|
||||||
// for STL objects.
|
// for STL objects.
|
||||||
|
|||||||
@ -926,14 +926,15 @@ class DeviceCachingAllocator {
|
|||||||
(release_cached_blocks() && alloc_block(params, true));
|
(release_cached_blocks() && alloc_block(params, true));
|
||||||
}
|
}
|
||||||
if (!block_found) {
|
if (!block_found) {
|
||||||
const auto& raw_device = c10::xpu::get_raw_device(device);
|
c10::xpu::DeviceProp device_prop;
|
||||||
const auto device_total =
|
c10::xpu::get_device_properties(&device_prop, device);
|
||||||
raw_device.get_info<sycl::info::device::global_mem_size>();
|
auto device_total = device_prop.global_mem_size;
|
||||||
// Estimate the available device memory when the SYCL runtime does not
|
// Estimate the available device memory when the SYCL runtime does not
|
||||||
// support the corresponding aspect (ext_intel_free_memory).
|
// support the corresponding aspect (ext_intel_free_memory).
|
||||||
size_t device_free = device_total -
|
size_t device_free = device_prop.global_mem_size -
|
||||||
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
||||||
.current;
|
.current;
|
||||||
|
auto& raw_device = c10::xpu::get_raw_device(device);
|
||||||
// TODO: Remove the aspect check once the SYCL runtime bug is fixed on
|
// TODO: Remove the aspect check once the SYCL runtime bug is fixed on
|
||||||
// affected devices.
|
// affected devices.
|
||||||
if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
|
if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
|
||||||
@ -1051,37 +1052,21 @@ class DeviceCachingAllocator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<size_t, size_t> getMemoryInfo() {
|
|
||||||
const auto& device = c10::xpu::get_raw_device(device_index);
|
|
||||||
const size_t total = device.get_info<sycl::info::device::global_mem_size>();
|
|
||||||
TORCH_CHECK(
|
|
||||||
device.has(sycl::aspect::ext_intel_free_memory),
|
|
||||||
"The device (",
|
|
||||||
device.get_info<sycl::info::device::name>(),
|
|
||||||
") doesn't support querying the available free memory. ",
|
|
||||||
"You can file an issue at https://github.com/pytorch/pytorch/issues ",
|
|
||||||
"to help us prioritize its implementation.");
|
|
||||||
const size_t free =
|
|
||||||
device.get_info<sycl::ext::intel::info::device::free_memory>();
|
|
||||||
return {free, total};
|
|
||||||
}
|
|
||||||
|
|
||||||
double getMemoryFraction() {
|
double getMemoryFraction() {
|
||||||
if (!set_fraction) {
|
if (!set_fraction) {
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto device_total =
|
c10::xpu::DeviceProp device_prop;
|
||||||
xpu::get_raw_device(device_index)
|
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||||
.get_info<sycl::info::device::global_mem_size>();
|
|
||||||
return static_cast<double>(allowed_memory_maximum) /
|
return static_cast<double>(allowed_memory_maximum) /
|
||||||
static_cast<double>(device_total);
|
static_cast<double>(device_prop.global_mem_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void setMemoryFraction(double fraction) {
|
void setMemoryFraction(double fraction) {
|
||||||
const auto device_total =
|
c10::xpu::DeviceProp device_prop;
|
||||||
xpu::get_raw_device(device_index)
|
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||||
.get_info<sycl::info::device::global_mem_size>();
|
auto device_total = device_prop.global_mem_size;
|
||||||
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
||||||
set_fraction = true;
|
set_fraction = true;
|
||||||
}
|
}
|
||||||
@ -1255,11 +1240,6 @@ class XPUAllocator : public DeviceAllocator {
|
|||||||
c10::xpu::get_raw_device(dev_to_access));
|
c10::xpu::get_raw_device(dev_to_access));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<size_t, size_t> getMemoryInfo(DeviceIndex device) override {
|
|
||||||
assertValidDevice(device);
|
|
||||||
return device_allocators[device]->getMemoryInfo();
|
|
||||||
}
|
|
||||||
|
|
||||||
double getMemoryFraction(DeviceIndex device) {
|
double getMemoryFraction(DeviceIndex device) {
|
||||||
assertValidDevice(device);
|
assertValidDevice(device);
|
||||||
return device_allocators[device]->getMemoryFraction();
|
return device_allocators[device]->getMemoryFraction();
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user