Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-03 15:35:04 +08:00)

Compare commits: test-vec-m ... VLA_exp (162 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 3411990fa0 | |||
| fc32f3d5eb | |||
| ef8f493676 | |||
| 92eaa3d3b8 | |||
| e0340e599e | |||
| c3e4e4079e | |||
| 62f61292e3 | |||
| 41cbceee59 | |||
| 46706e7c34 | |||
| 6662a76f59 | |||
| 05aade1b6d | |||
| f946b25865 | |||
| d2e02585b8 | |||
| 3dd7ebf418 | |||
| 8273ee0646 | |||
| c57382a493 | |||
| e7cc42df58 | |||
| 72c69e731f | |||
| 78b9dea754 | |||
| 838924436e | |||
| 2ffb510942 | |||
| 20b5f694f8 | |||
| 447e300d55 | |||
| 5b2ad9279c | |||
| 78d7f0cdec | |||
| d5c719ec3c | |||
| c44efc3755 | |||
| 6b9473469f | |||
| 7a4167a164 | |||
| 8e67a6ae89 | |||
| c68ad1bd6a | |||
| 3e5e094615 | |||
| c65efc8ea1 | |||
| a9049413e2 | |||
| d7a5ec9355 | |||
| 2c46922ce4 | |||
| 668d414ae7 | |||
| 4541509237 | |||
| 6c7f88c2c9 | |||
| c400c8e2e0 | |||
| 25c3a7e317 | |||
| de7376537f | |||
| fd2c64e286 | |||
| 2b1ae29960 | |||
| 1293405c8d | |||
| 3a65ff84b6 | |||
| acf13a9b75 | |||
| 3a55676200 | |||
| af39144a93 | |||
| 25343b343e | |||
| 07fad04181 | |||
| 7ac70ac4cd | |||
| e221a1c853 | |||
| 4defea1e2c | |||
| 53d68b95de | |||
| f74842d57f | |||
| 644fee2610 | |||
| 7821fbc560 | |||
| 73ee323380 | |||
| 176c6446f8 | |||
| debc0591b8 | |||
| 0df78f0c11 | |||
| d0e8a0ec4c | |||
| 22492848b6 | |||
| 5c14315b05 | |||
| 1b99c1859c | |||
| 435edbcb5d | |||
| 6c6e11c206 | |||
| a775c8e73e | |||
| 24d07b3a67 | |||
| 90fd06be71 | |||
| 002f18807e | |||
| 259e79e3ff | |||
| ee343ce60c | |||
| ea5369113a | |||
| b268f22ab2 | |||
| 52a52d1b78 | |||
| eaadd1282c | |||
| 1465757959 | |||
| 17b9c618dd | |||
| d3ce45012e | |||
| 1fc010a9d8 | |||
| dfacf11f66 | |||
| c8cf811995 | |||
| 914b1a3873 | |||
| 7eb5fdb358 | |||
| f1fb57d854 | |||
| 6d0f4566e2 | |||
| e785c087c5 | |||
| d214901133 | |||
| 96ac64d00c | |||
| 46d34d6766 | |||
| 880249adbc | |||
| 846ada4973 | |||
| badd0618e4 | |||
| a753a72b14 | |||
| b57d1ef110 | |||
| dd7c996d5c | |||
| 70d2e9ba45 | |||
| 62f98dbb44 | |||
| e288c258f7 | |||
| df58db8831 | |||
| 15bb81ea4f | |||
| 8d37073bac | |||
| dc286aef61 | |||
| b4619f0272 | |||
| 477c2273e1 | |||
| 2176d481c1 | |||
| b97274e8ac | |||
| f9be65cea4 | |||
| 4e3e3dc0a7 | |||
| fcf59df2b6 | |||
| 1bcb2f41e0 | |||
| 8460131087 | |||
| c0c24b61ff | |||
| 4fac43b21f | |||
| b794e77b7b | |||
| d987a6f7f0 | |||
| 5d93127c87 | |||
| a3a51282db | |||
| e557b3d5e5 | |||
| f3a9e99036 | |||
| f7d6e9f500 | |||
| e43e09e6c1 | |||
| 2004f8aa10 | |||
| 31b3b38e3a | |||
| 2f0db0444e | |||
| 6162e650b0 | |||
| 5d89634ca8 | |||
| 52e180c379 | |||
| c55e72bea1 | |||
| 750348b579 | |||
| 52b9af163c | |||
| f4bfac11c7 | |||
| 8d00833fdb | |||
| de529ef002 | |||
| 61aa2ae20f | |||
| 9d32aa9789 | |||
| 5cf77a0ea2 | |||
| efcf87654e | |||
| 2523e58781 | |||
| 222fa451a2 | |||
| 6de24135e5 | |||
| 27ae72036d | |||
| e924df23a6 | |||
| 67e68e0785 | |||
| 775788f93b | |||
| 19ce1beb05 | |||
| a91ddea61f | |||
| ffccb90ff4 | |||
| f916f34739 | |||
| c32994ce4b | |||
| 433e43cbec | |||
| e469414b59 | |||
| 657e5e9aa6 | |||
| f02b783aae | |||
| 8ad96a563c | |||
| 59e261bbd8 | |||
| 08ea8fccaf | |||
| 41754539be | |||
| 716d52779f | |||
| 3bf41f26c8 |
@@ -104,7 +104,6 @@ If your new Docker image needs a library installed from a specific pinned commit
```bash
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
```
@@ -93,7 +93,6 @@ tag=$(echo $image | awk -F':' '{print $2}')
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@@ -104,7 +103,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@@ -115,7 +113,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@@ -127,7 +124,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@@ -139,7 +135,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@@ -151,7 +146,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@@ -162,7 +156,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
@@ -173,7 +166,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@@ -185,7 +177,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@@ -197,7 +188,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@@ -209,7 +199,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@@ -299,7 +288,6 @@ case "$tag" in
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@@ -378,7 +366,6 @@ case "$tag" in
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
@@ -430,9 +417,6 @@ docker build \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
@@ -1,26 +0,0 @@
#!/bin/bash

if [[ -n "${CUDNN_VERSION}" ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cudnn
ldconfig
fi
@@ -103,5 +103,5 @@ fi
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion --no-deps
pip_install helion==0.0.10 --no-deps
fi
@@ -50,8 +50,8 @@ IPython==8.12.0
#Pinned versions: 8.12.0

myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
#Description: This is used to generate PyTorch functorch and torch.compile docs
#Pinned versions: 0.17.2

# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
@@ -59,4 +59,3 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb
@@ -50,6 +50,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi

# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm

if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@@ -189,6 +192,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi

if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then

@@ -1039,10 +1039,20 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR

OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"

# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi

# quantization is not fully supported on s390x yet
.github/actionlint.yaml (vendored, 10 lines changed)

@@ -53,16 +53,12 @@ self-hosted-runner:
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# MI300 runners
- linux.rocm.gpu.mi300.2
- linux.rocm.gpu.mi300.4
# gfx942 runners
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-latest-xlarge
.github/ci_commit_pins/audio.txt (vendored, 2 lines changed)

@@ -1 +1 @@
f6dfe1231dcdd221a68416e49ab85c2575cbb824
bf305f538005f2e900f8850ed57146024a8bc559
.github/ci_commit_pins/vllm.txt (vendored, 2 lines changed)

@@ -1 +1 @@
8f605ee30912541126c0fe46d0c8c413101b600a
ca9e2be3ed6320b51f52f536595cd24e254f8bb2
@@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.13.1
filelock==3.18.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0
.github/scripts/trymerge.py (vendored, 4 lines changed)

@@ -1891,7 +1891,9 @@ def validate_revert(
else pr.get_comment_by_id(comment_id)
)
if comment.editor_login is not None:
raise PostCommentError("Don't want to revert based on edited command")
raise PostCommentError(
"Halting the revert as the revert comment has been edited."
)
author_association = comment.author_association
author_login = comment.author_login
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]
.github/workflows/_rocm-test.yml (vendored, 4 lines changed)

@@ -269,8 +269,8 @@ jobs:
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"

- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
  if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for kubernetes runners for now)
  if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
  run: |
    docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
.github/workflows/build-triton-wheel.yml (vendored, 8 lines changed)

@@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
@@ -126,6 +126,12 @@ jobs:
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
3.14)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
;;
3.14t)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1
@@ -88,23 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit
.github/workflows/inductor-rocm-mi300.yml (vendored, 4 lines changed)

@@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit
.github/workflows/mac-mps.yml (vendored, 1 line changed)

@@ -28,7 +28,6 @@ jobs:
# than our AWS macos-m1-14 runners
test-matrix: |
{ include: [
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}
.github/workflows/periodic-rocm-mi300.yml (vendored, 6 lines changed)

@@ -59,9 +59,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
.github/workflows/rocm-mi300.yml (vendored, 12 lines changed)

@@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit
.github/workflows/rocm-mi355.yml (vendored, 2 lines changed)

@@ -3,7 +3,7 @@ name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 9 * * * # about 2:30am PDT
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
.github/workflows/trunk.yml (vendored, 1 line changed)

@@ -94,7 +94,6 @@ jobs:
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}
@@ -164,7 +164,7 @@ init_command = [
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.13.1',
'filelock==3.18.0',
'junitparser==2.1.1',
'rich==14.1.0',
'pyyaml==6.0.2',
@@ -679,6 +679,7 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/nativert/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],
@@ -564,7 +564,7 @@ if(MSVC)
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
endif()
foreach(
flag_var
@@ -872,6 +872,14 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)

cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
OFF)

# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
@@ -905,6 +913,10 @@ if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()

if(USE_FBGEMM_GENAI)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()

if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
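When USE_FBGEMM_GENAI is enabled, the change above only adds a -DUSE_FBGEMM_GENAI compile definition. Purely as an illustration of how such a flag is typically consumed in C++ (the function name below is an assumption, not part of this patch), a minimal sketch:

```cpp
// Hypothetical sketch: code gated by a compile definition such as
// -DUSE_FBGEMM_GENAI. Build with or without the flag to flip the branch.
#include <cstdio>

void report_genai_support() {
#ifdef USE_FBGEMM_GENAI
  std::printf("built with FBGEMM GenAI quantized GEMM kernels\n");
#else
  std::printf("built without FBGEMM GenAI\n");
#endif
}

int main() {
  report_genai_support();
  return 0;
}
```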
CODEOWNERS (12 lines changed)

@@ -51,12 +51,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501

# ONNX Export
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby

# CI
/.ci @pytorch/pytorch-dev-infra
@@ -247,6 +247,50 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()

IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()

# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)

if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)

# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)

hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
)
endif()
endif()

# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
@@ -10,6 +10,10 @@
#include <ideep.hpp>
#endif

#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif

#include <caffe2/core/common.h>

#include <ATen/native/DispatchStub.h>
@@ -103,7 +107,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE:
return "SVE";
case native::CPUCapability::SVE256:
return "SVE256";
#else
@@ -118,6 +124,12 @@ std::string get_cpu_capability() {
return "";
}

int get_sve_len() {
// It is possible that we override the cpu_capability with
// environment variable
return cpuinfo_get_max_arm_sve_length();
}

static std::string used_cpu_capability() {
// It is possible that we override the cpu_capability with
// environment variable
@@ -15,4 +15,6 @@ TORCH_API std::string get_cxx_flags();

TORCH_API std::string get_cpu_capability();

TORCH_API int get_sve_len();

} // namespace at
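The new get_sve_len() API exposes the runtime SVE register width. As a rough illustration of the same idea, here is a minimal, hedged C++ sketch that reads the width directly through the ACLE intrinsics instead of cpuinfo; the guard, helper name, and printout are assumptions for illustration only, not part of the patch.

```cpp
// Hypothetical standalone sketch: query the SVE vector length at runtime.
// Assumes an AArch64 toolchain with SVE enabled (e.g. -march=armv8-a+sve).
#include <cstdio>

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static unsigned sve_length_bits() {
  // svcntb() returns the number of 8-bit elements in an SVE vector,
  // i.e. the vector length in bytes; multiply by 8 for bits.
  return static_cast<unsigned>(svcntb()) * 8u;
}
#else
static unsigned sve_length_bits() {
  return 0;  // SVE not available at compile time in this sketch.
}
#endif

int main() {
  std::printf("SVE vector length: %u bits\n", sve_length_bits());
  return 0;
}
```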
@@ -34,9 +34,9 @@ inline scalar_t vec_reduce_all(
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
for (const auto i : c10::irange(1, size)) {
std::array<scalar_t, Vec::size()> acc_arr_next = {0};
scalar_t acc_arr_next[Vec::size()] = {0};
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
Vec acc_vec_next = Vec::loadu(acc_arr_next);
acc_vec = vec_fun(acc_vec, acc_vec_next);
}
acc_vec.store(acc_arr);
@@ -102,8 +102,7 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@@ -143,8 +142,7 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@@ -4,7 +4,7 @@

#include <ATen/cpu/vec/vec_base.h>

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

// Define the data type of VLS(vector-length specific).
typedef svbool_t vls_pred_t
@@ -77,4 +77,4 @@ typedef svfloat64_t vls_float64_t
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)

#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
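The same `defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)` test now recurs across several of these headers. Purely as an illustration of one way to centralize that condition (the macro name is hypothetical, not something the patch introduces):

```cpp
// Hypothetical convenience macro folding both SVE capability flags into one
// test. AT_ANY_SVE is an illustrative name, not part of the PyTorch headers.
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
#define AT_ANY_SVE 1
#else
#define AT_ANY_SVE 0
#endif

#include <cstdio>

int main() {
#if AT_ANY_SVE
  std::printf("compiled with an SVE capability\n");
#else
  std::printf("compiled without SVE\n");
#endif
  return 0;
}
```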
@@ -19,7 +19,7 @@ namespace vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)

template <>
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@@ -230,8 +230,6 @@ __attribute__((optimize("no-tree-vectorize")))
#endif
inline std::tuple<Vectorized<float>, Vectorized<float>>
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
auto bf16_vec1 = svzip1_bf16(zero, a);
auto bf16_vec2 = svzip2_bf16(zero, a);
@@ -243,19 +241,18 @@ convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
inline Vectorized<c10::BFloat16> convert_float_bfloat16(
const Vectorized<float>& a,
const Vectorized<float>& b) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a);
svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b);
return Vectorized<c10::BFloat16>(svuzp1_bf16(x1, x2));
}

inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
__at_align__ float * values = new float[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
delete[] values;
}

inline void load_fp32_from_bf16(
@@ -308,8 +305,8 @@ Vectorized<c10::BFloat16> inline operator/(
}

inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
auto vals_f = svdup_n_f32(0);
values = convert_float_bfloat16(vals_f, vals_f);
}

inline Vectorized<BFloat16>::Vectorized(int val) {
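The zip-with-zero trick in convert_bfloat16_float works because bfloat16 is simply the upper 16 bits of an IEEE float32. A minimal scalar sketch of the same relationship, independent of SVE and offered only as an illustration (names are mine, not from the patch):

```cpp
// Scalar illustration of the bf16 <-> f32 relationship used above:
// widening places the 16 bf16 bits in the high half of a 32-bit word,
// narrowing (here by truncation) keeps only the high half.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_bits_to_float(uint16_t b) {
  uint32_t u = static_cast<uint32_t>(b) << 16;  // low 16 bits are zero
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

static uint16_t float_to_bf16_bits(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<uint16_t>(u >> 16);  // truncating conversion, no rounding
}

int main() {
  float x = 1.5f;  // exactly representable in bf16, so the round trip is exact
  uint16_t b = float_to_bf16_bits(x);
  std::printf("%f -> 0x%04x -> %f\n", x, b, bf16_bits_to_float(b));
  return 0;
}
```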
@ -8,7 +8,7 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
@ -27,7 +27,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \
|
||||
@ -231,6 +231,5 @@ std::pair<
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

template <>
struct is_vec_specialized_for<double> : std::bool_constant<true> {};
@@ -55,10 +55,11 @@ class Vectorized<double> {
operator svfloat64_t() const {
return values;
}
template <uint64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask
) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int64_t flag_arr[size()];
@@ -2,8 +2,10 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>

#include <algorithm>
#include <cmath>

#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
@@ -22,7 +24,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)

template <>
struct is_vec_specialized_for<float> : std::bool_constant<true> {};
@@ -30,52 +32,77 @@ struct is_vec_specialized_for<float> : std::bool_constant<true> {};
template <>
class Vectorized<float> {
private:
vls_float32_t values;
__at_align__ float values[2048 / sizeof(float)];
public:

using value_type = float;
using size_type = int;
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
static inline size_type size() {
return svcntw();
}
Vectorized() {
values = svdup_n_f32(0);
inline Vectorized() {svst1_f32(ptrue, values, svdup_n_f32(0));}
inline Vectorized(const float val) {
svst1_f32(ptrue, values, svdup_n_f32(val));
}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
inline Vectorized(const svfloat32_t val) {
svst1_f32(ptrue, values, val);
}
template <
typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
Vectorized(Args... vals) {
__at_align__ float buffer[size()] = {vals...};
values = svld1_f32(ptrue, buffer);
template<typename T,
typename = std::enable_if_t<std::is_pointer_v<T>>>
inline Vectorized(float * val) {
svst1_f32(ptrue, values, svld1_f32(ptrue, val));
}
operator svfloat32_t() const {
return values;
template<typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
inline Vectorized(Args... vals) {
values = { vals... };
}
template <uint64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int32_t flag_arr[size()];
inline operator svfloat32_t() const {
return svld1_f32(ptrue, values);
}
static inline Vectorized<float> from_ptr(const float * vs) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(ptrue, static_cast<const float *>(vs)));
return v;
}
static inline Vectorized<float> from_ptr(const float * vs, int count) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(svwhilelt_b32_s32(0, count), static_cast<const float *>(vs)));
return v;
}
inline void set_lane(int i, float value) {
values[i] = value;
}
inline Vectorized<float> map(float (*fn)(float)) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i]));
}
return result;
}
inline Vectorized<float> map2(float (*fn)(float, float), const Vectorized<float> &b) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i], b.values[i]));
}
return result;
}

static inline Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b, const uint64_t mask) {
// Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise.
__at_align__ int32_t * flag_arr = new int32_t[size()];
for (int i = 0; i < size(); i++) {
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
}
// Load the flag array into an SVE int32 vector.
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
// true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else
// from a.
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
return Vectorized<float>(result);
svint32_t int_mask = svld1_s32(ptrue, flag_arr);
delete[] flag_arr;
// Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(ptrue, int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else from a.
return svsel_f32(blend_mask, b, a);
}
static Vectorized<float> blendv(
static inline Vectorized<float> blendv(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask_) {
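Both blend() variants above expand an integer bit mask into a per-lane flag array and then select between a and b lane by lane. A scalar sketch of that selection logic, offered only as an illustration (the lane count and names are mine, not from the patch):

```cpp
// Scalar model of Vectorized<float>::blend: bit i of 'mask' chooses
// b[i] (bit set) or a[i] (bit clear) for lane i.
#include <cstdint>
#include <cstdio>

static void blend_scalar(const float* a, const float* b, float* out,
                         int lanes, uint64_t mask) {
  for (int i = 0; i < lanes; ++i) {
    out[i] = (mask & (1ULL << i)) ? b[i] : a[i];
  }
}

int main() {
  const int lanes = 8;  // stand-in for Vectorized<float>::size()
  float a[lanes], b[lanes], out[lanes];
  for (int i = 0; i < lanes; ++i) {
    a[i] = static_cast<float>(i);
    b[i] = static_cast<float>(10 + i);
  }
  blend_scalar(a, b, out, lanes, 0x0AU);  // take lanes 1 and 3 from b
  for (int i = 0; i < lanes; ++i) {
    std::printf("%g ", out[i]);
  }
  std::printf("\n");
  return 0;
}
```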
@@ -84,16 +111,18 @@ class Vectorized<float> {
return svsel_f32(mask, b, a);
}
template <typename step_t>
static Vectorized<float> arange(
static inline Vectorized<float> arange(
float base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ float buffer[size()];
__at_align__ float * buffer = new float[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svld1_f32(ptrue, buffer);
auto tmp = Vectorized<float>::from_ptr(buffer);
delete[] buffer;
return tmp;
}
static Vectorized<float> set(
static inline Vectorized<float> set(
const Vectorized<float>& a,
const Vectorized<float>& b,
int64_t count = size()) {
@@ -169,271 +198,219 @@ class Vectorized<float> {
poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
return poly;
}
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
svbool_t pg = svwhilelt_b32(0ull, count);
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
static inline Vectorized<float> loadu(const void* ptr) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
} else {
svbool_t pg = svwhilelt_b32(0ull, count);
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
}
static inline Vectorized<float> loadu(const void* ptr, int64_t count) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr), count);
}
const float& operator[](int idx) const = delete;
float& operator[](int idx) = delete;
int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit
// and others are translated to 0-bit
inline void store(void* ptr) const {
svst1_f32(ptrue, static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline void store(void* ptr, int count) const {
svst1_f32(svwhilelt_b32_s32(0, count), static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline const float& operator[](int idx) const {
return values[idx];
};
inline float& operator[](int idx) {
return values[idx];
};
inline int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
int64_t mask = 0;
__at_align__ int32_t mask_array[size()];
__at_align__ int32_t * mask_array = new int32_t[size()];

svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
svst1_s32(
ptrue,
mask_array,
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
for (int64_t i = 0; i < size(); ++i) {
if (mask_array[i])
mask |= (1ull << i);
svbool_t svbool_mask = svcmpeq_f32(ptrue, *this, ZERO_F32);
svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
ALL_S32_TRUE_MASK,
ALL_S32_FALSE_MASK));
for (int64_t j = 0; j < size(); ++j) {
if (mask_array[j]) mask |= (1ull << j);
}
delete[] mask_array;
return mask;
}
Vectorized<float> isnan() const {
inline Vectorized<float> isnan() const {
// NaN check
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
auto mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
bool has_inf_nan() const {
return svptest_any(
ptrue,
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
inline bool has_inf_nan() const {
return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, *this, *this), ZERO_F32));
}
Vectorized<float> map(float (*f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);

inline Vectorized<float> abs() const {
return svabs_f32_x(ptrue, *this);
}
Vectorized<float> abs() const {
return svabs_f32_x(ptrue, values);
}
Vectorized<float> angle() const {
inline Vectorized<float> angle() const {
const auto nan_vec = svdup_n_f32(NAN);
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
const auto nan_mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
const auto pi = svdup_n_f32(c10::pi<float>);

const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
const auto neg_mask = svcmplt_f32(ptrue, *this, ZERO_F32);
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
angle = svsel_f32(nan_mask, nan_vec, angle);
return angle;
return svsel_f32(nan_mask, nan_vec, angle);
}
Vectorized<float> real() const {
return values;
inline Vectorized<float> real() const {
return *this;
}
Vectorized<float> imag() const {
inline Vectorized<float> imag() const {
return Vectorized<float>(0.f);
}
Vectorized<float> conj() const {
return values;
inline Vectorized<float> conj() const {
return *this;
}
Vectorized<float> acos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
inline Vectorized<float> acos() const {
return USE_SLEEF(Sleef_acosfx_u10sve(*this), map(std::acos));
}
Vectorized<float> acosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
inline Vectorized<float> acosh() const {
return USE_SLEEF(Sleef_acoshfx_u10sve(*this), map(std::acosh));
}
Vectorized<float> asin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
inline Vectorized<float> asin() const {
return USE_SLEEF(Sleef_asinfx_u10sve(*this), map(std::asin));
}
Vectorized<float> asinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
inline Vectorized<float> asinh() const {
return USE_SLEEF(Sleef_asinhfx_u10sve(*this), map(std::asinh));
}
Vectorized<float> atan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
inline Vectorized<float> atan() const {
return USE_SLEEF(Sleef_atanfx_u10sve(*this), map(std::atan));
}
Vectorized<float> atanh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
inline Vectorized<float> atanh() const {
return USE_SLEEF(Sleef_atanhfx_u10sve(*this), map(std::atanh));
}
Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> copysign(const Vectorized<float>& sign) const {

USE_SLEEF(
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
})} Vectorized<float> erf() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
inline Vectorized<float> atan2(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_atan2fx_u10sve(*this, b), map2(std::atan2, b));
}
Vectorized<float> erfc() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
inline Vectorized<float> copysign(const Vectorized<float> &sign) const {
return USE_SLEEF(Sleef_copysignfx_sve(*this, sign), map2(std::copysign, sign));
}
Vectorized<float> erfinv() const {
inline Vectorized<float> erf() const {
return USE_SLEEF(Sleef_erffx_u10sve(*this), map(std::erf));
}
inline Vectorized<float> erfc() const {
return USE_SLEEF(Sleef_erfcfx_u15sve(*this), map(std::erfc));
}
inline Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
inline Vectorized<float> exp() const {
return USE_SLEEF(Sleef_expfx_u10sve(*this), map(std::exp));
}
Vectorized<float> exp2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
inline Vectorized<float> exp2() const {
return USE_SLEEF(Sleef_exp2fx_u10sve(*this), map(std::exp2));
}
Vectorized<float> expm1() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
inline Vectorized<float> expm1() const {
return USE_SLEEF(Sleef_expm1fx_u10sve(*this), map(std::expm1));
}
Vectorized<float> exp_u20() const {
return exp();
// Implementation copied from Arm Optimized Routines:
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
inline Vectorized<float> exp_u20() {

// Load values into an SVE vector
svfloat32_t val_vec = svld1(svptrue_b32(), values); // 'values' is float*

// Check for special case: |x| >= 87.3...
svbool_t is_special_case = svacgt(svptrue_b32(), val_vec, 0x1.5d5e2ap+6f);
if (svptest_any(svptrue_b32(), is_special_case)) {
return exp(); // fallback to scalar exp() for special cases
}
Vectorized<float> fexp_u20() const {
return exp();

// Constants
const svfloat32_t ln2_hi = svdup_f32(0x1.62e4p-1f);
const svfloat32_t ln2_lo = svdup_f32(0x1.7f7d1cp-20f);
const svfloat32_t c1 = svdup_f32(0.5f);
const svfloat32_t inv_ln2 = svdup_f32(0x1.715476p+0f);
const svfloat32_t shift_vec = svdup_f32(0x1.803f8p17f); // scalar to vector

// n = round(x / ln2)
svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, val_vec, shift_vec);
svfloat32_t n = svsub_x(svptrue_b32(), z, shift_vec);

// r = x - n * ln2
svfloat32_t r = svsub_x(svptrue_b32(), val_vec, svmul_x(svptrue_b32(), n, ln2_hi));
r = svsub_x(svptrue_b32(), r, svmul_x(svptrue_b32(), n, ln2_lo));

// scale = 2^(n)
svfloat32_t scale = svexpa(svreinterpret_u32(z));

// poly(r) = exp(r) - 1 ≈ r + 0.5 * r^2
svfloat32_t r2 = svmul_x(svptrue_b32(), r, r);
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);

// return scale * (1 + poly)
return svmla_x(svptrue_b32(), scale, scale, poly);
}
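The exp_u20 routine above follows the usual expf range reduction: write x = n*ln2 + r with |r| small, obtain 2^n from the exponent field, and approximate e^r with a short polynomial. A scalar sketch of that decomposition, using the same low-order polynomial as the vector code; this is an illustration under my own constants and names, not the routine itself:

```cpp
// Scalar model of the range reduction used in exp_u20:
// exp(x) = 2^n * exp(r), with n = round(x / ln2) and r = x - n * ln2,
// and exp(r) ~= 1 + r + 0.5 * r^2 for small r (a deliberately crude polynomial).
#include <cmath>
#include <cstdio>

static float exp_range_reduced(float x) {
  const float ln2 = 0.6931471805599453f;
  float n = std::nearbyint(x / ln2);         // integer-valued float
  float r = x - n * ln2;                     // |r| <= ln2 / 2
  float poly = r + 0.5f * r * r;             // approximates exp(r) - 1
  return std::ldexp(1.0f + poly, static_cast<int>(n));  // scale by 2^n
}

int main() {
  for (float x : {-2.0f, 0.0f, 1.0f, 3.5f}) {
    std::printf("x=%g  approx=%g  std::exp=%g\n", x, exp_range_reduced(x), std::exp(x));
  }
  return 0;
}
```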
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_q[size()];
|
||||
store(tmp);
|
||||
q.store(tmp_q);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> hypot(const Vectorized<float>& b) const {
|
||||
USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> i0() const {
|
||||
|
||||
inline Vectorized<float> fexp_u20() {
|
||||
return exp_u20();
|
||||
}
|
||||
inline Vectorized<float> fmod(const Vectorized<float>& q) const {
|
||||
return USE_SLEEF(Sleef_fmodfx_sve(*this, q), return map2(std::fmod, q));
|
||||
}
|
||||
inline Vectorized<float> hypot(const Vectorized<float> &b) const {
|
||||
return USE_SLEEF(Sleef_hypotfx_u05sve(*this, b), map2(std::hypot, b));
|
||||
}
|
||||
inline Vectorized<float> i0() const {
|
||||
return map(calc_i0);
|
||||
}
|
||||
Vectorized<float> i0e() const {
|
||||
return map(calc_i0e);
|
||||
inline Vectorized<float> i0e() const {
|
||||
return map(calc_i0e<float>);
|
||||
}
|
||||
Vectorized<float> digamma() const {
|
||||
inline Vectorized<float> digamma() const {
|
||||
return map(calc_digamma);
|
||||
}
|
||||
Vectorized<float> igamma(const Vectorized<float>& x) const {
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
inline Vectorized<float> igamma(const Vectorized<float> &x) const {
|
||||
return map2(calc_igamma<float>, x);
|
||||
}
|
||||
Vectorized<float> igammac(const Vectorized<float>& x) const {
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
inline Vectorized<float> igammac(const Vectorized<float> &x) const {
|
||||
return map2(calc_igammac<float>, x);
|
||||
}
  Vectorized<float> nextafter(const Vectorized<float>& b) const {USE_SLEEF(
      { return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
      {
        __at_align__ float tmp[size()];
        __at_align__ float tmp_b[size()];
        store(tmp);
        b.store(tmp_b);
        for (int64_t i = 0; i < size(); ++i) {
          tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
        }
        return loadu(tmp);
      })} Vectorized<float> log() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
  inline Vectorized<float> nextafter(const Vectorized<float> &b) const {
    return USE_SLEEF(Sleef_nextafterfx_sve(*this, b), map2(std::nextafter, b));
  }
  Vectorized<float> log2() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
  inline Vectorized<float> log() const {
    return USE_SLEEF(Sleef_logfx_u10sve(*this), map(std::log));
  }
  Vectorized<float> log10() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
  inline Vectorized<float> log2() const {
    return USE_SLEEF(Sleef_log2fx_u10sve(*this), map(std::log2));
  }
  Vectorized<float> log1p() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
  inline Vectorized<float> log10() const {
    return USE_SLEEF(Sleef_log10fx_u10sve(*this), map(std::log10));
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
  inline Vectorized<float> log1p() const {
    return USE_SLEEF(Sleef_log1pfx_u10sve(*this), map(std::log1p));
  }
  Vectorized<float> sinh() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
  inline Vectorized<float> frac() const;
  inline Vectorized<float> sin() const {
    return USE_SLEEF(Sleef_sinfx_u10sve(*this), map(std::sin));
  }
  Vectorized<float> cos() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
  inline Vectorized<float> sinh() const {
    return USE_SLEEF(Sleef_sinhfx_u10sve(*this), map(std::sinh));
  }
  Vectorized<float> cosh() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
  inline Vectorized<float> cos() const {
    return USE_SLEEF(Sleef_cosfx_u10sve(*this), map(std::cos));
  }
  Vectorized<float> ceil() const {
    return svrintp_f32_x(ptrue, values);
  inline Vectorized<float> cosh() const {
    return USE_SLEEF(Sleef_coshfx_u10sve(*this), map(std::cosh));
  }
  Vectorized<float> floor() const {
    return svrintm_f32_x(ptrue, values);
  inline Vectorized<float> ceil() const {
    return svrintp_f32_x(ptrue, *this);
  }
  Vectorized<float> neg() const {
    return svneg_f32_x(ptrue, values);
  inline Vectorized<float> floor() const {
    return svrintm_f32_x(ptrue, *this);
  }
  Vectorized<float> round() const {
    return svrinti_f32_x(ptrue, values);
  inline Vectorized<float> neg() const {
    return svneg_f32_x(ptrue, *this);
  }
  Vectorized<float> tan() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
  inline Vectorized<float> round() const {
    return svrinti_f32_x(ptrue, *this);
  }
  inline Vectorized<float> tan() const {
    return USE_SLEEF(Sleef_tanfx_u10sve(*this), map(std::tan));
  }
  // Implementation is picked from
  // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
  Vectorized<float> tanh() const {
  inline Vectorized<float> tanh() const {
    // Constants used for the tanh calculation.
    const svfloat32_t CONST_1 =
        svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
@@ -450,7 +427,7 @@ class Vectorized<float> {
    // instability. svmax_f32_z ensures values are greater than -10, and
    // svmin_f32_z ensures they are less than 10.
    svfloat32_t x = svmin_f32_z(
        ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
        ptrue, svmax_f32_z(ptrue, *this, CONST_MIN_TANH), CONST_MAX_TANH);

    // Step 2: Calculate exp(2 * x), where x is the clamped value.
    // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of
@@ -472,104 +449,85 @@ class Vectorized<float> {
    // Return the calculated tanh values.
    return tanh;
  }
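Note: a scalar reference for the vectorized tanh above, following the same comments in the diff (clamp the argument, then use tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)). This sketch is illustrative and not part of the diff:

#include <algorithm>
#include <cmath>
inline float tanh_reference(float x) {
  // mirrors the CONST_MIN_TANH/CONST_MAX_TANH clamp mentioned above
  x = std::min(10.0f, std::max(-10.0f, x));
  const float e2x = std::exp(2.0f * x);
  return (e2x - 1.0f) / (e2x + 1.0f);
}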
  Vectorized<float> trunc() const {
    return svrintz_f32_x(ptrue, values);
  inline Vectorized<float> trunc() const {
    return svrintz_f32_x(ptrue, *this);
  }
  Vectorized<float> lgamma() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
  inline Vectorized<float> lgamma() const {
    return USE_SLEEF(Sleef_lgammafx_u10sve(*this), map(std::lgamma));
  }
  Vectorized<float> sqrt() const {
    return svsqrt_f32_x(ptrue, values);
  inline Vectorized<float> sqrt() const {
    return svsqrt_f32_x(ptrue, *this);
  }
  Vectorized<float> reciprocal() const {
    return svdivr_f32_x(ptrue, values, ONE_F32);
  inline Vectorized<float> reciprocal() const {
    return svdivr_f32_x(ptrue, *this, svdup_n_f32(1.f));
  }
  Vectorized<float> rsqrt() const {
    return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
  inline Vectorized<float> rsqrt() const {
    return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, *this), ONE_F32);
  }
  Vectorized<float> pow(const Vectorized<float>& b) const {USE_SLEEF(
      { return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
      {
        __at_align__ float tmp[size()];
        __at_align__ float tmp_b[size()];
        store(tmp);
        b.store(tmp_b);
        for (int64_t i = 0; i < size(); i++) {
          tmp[i] = std::pow(tmp[i], tmp_b[i]);
        }
        return loadu(tmp);
      })} // Comparison using the _CMP_**_OQ predicate.
  // `O`: get false if an operand is NaN
  // `Q`: do not raise if an operand is NaN
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    svbool_t mask = svcmpeq_f32(ptrue, values, other);
  inline Vectorized<float> pow(const Vectorized<float> &b) const {
    return USE_SLEEF(Sleef_powfx_u10sve(*this, b), map(std::pow, b));
  }
  // Comparison using the _CMP_**_OQ predicate.
  // `O`: get false if an operand is NaN
  // `Q`: do not raise if an operand is NaN
  inline Vectorized<float> operator==(const Vectorized<float>& other) const {
    svbool_t mask = svcmpeq_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }
  inline Vectorized<float> operator!=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpne_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }
  inline Vectorized<float> operator<(const Vectorized<float>& other) const {
    svbool_t mask = svcmplt_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator!=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpne_f32(ptrue, values, other);
  inline Vectorized<float> operator<=(const Vectorized<float>& other) const {
    svbool_t mask = svcmple_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator<(const Vectorized<float>& other) const {
    svbool_t mask = svcmplt_f32(ptrue, values, other);
  inline Vectorized<float> operator>(const Vectorized<float>& other) const {
    svbool_t mask = svcmpgt_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator<=(const Vectorized<float>& other) const {
    svbool_t mask = svcmple_f32(ptrue, values, other);
  inline Vectorized<float> operator>=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpge_f32(ptrue, *this, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator>(const Vectorized<float>& other) const {
    svbool_t mask = svcmpgt_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator>=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpge_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> eq(const Vectorized<float>& other) const;
  Vectorized<float> ne(const Vectorized<float>& other) const;
  Vectorized<float> gt(const Vectorized<float>& other) const;
  Vectorized<float> ge(const Vectorized<float>& other) const;
  Vectorized<float> lt(const Vectorized<float>& other) const;
  Vectorized<float> le(const Vectorized<float>& other) const;
  inline Vectorized<float> eq(const Vectorized<float>& other) const;
  inline Vectorized<float> ne(const Vectorized<float>& other) const;
  inline Vectorized<float> gt(const Vectorized<float>& other) const;
  inline Vectorized<float> ge(const Vectorized<float>& other) const;
  inline Vectorized<float> lt(const Vectorized<float>& other) const;
  inline Vectorized<float> le(const Vectorized<float>& other) const;
};
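Note: the class exposes two comparison flavours. operator== and friends return an all-bits-set lane pattern (selected via ALL_F32_TRUE_MASK/ALL_F32_FALSE_MASK), while eq()/ne()/... are defined further down in this diff as the comparison AND-ed with Vectorized<float>(1.0f), yielding a numeric 0.0/1.0 per lane. A scalar model of the difference (illustrative only, not part of the diff):

#include <cstdint>
#include <cstring>
inline float cmp_mask(float a, float b) {    // models operator==: all-ones bits per true lane
  uint32_t bits = (a == b) ? 0xFFFFFFFFu : 0u;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}
inline float cmp_numeric(float a, float b) { // models eq(): mask & 1.0f
  return (a == b) ? 1.0f : 0.0f;
}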

template <>
Vectorized<float> inline operator+(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
inline Vectorized<float> operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svadd_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator-(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
inline Vectorized<float> operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svsub_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator*(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
inline Vectorized<float> operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svmul_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator/(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
inline Vectorized<float> operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svdiv_f32_x(ptrue, a, b);
}

// frac. Implement this here so we can use subtraction
Vectorized<float> inline Vectorized<float>::frac() const {
inline Vectorized<float> Vectorized<float>::frac() const {
  return *this - this->trunc();
}
|
||||
|
||||
@ -585,115 +543,91 @@ Vectorized<float> inline maximum(
|
||||
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
|
||||
// either input is a NaN.
|
||||
template <>
|
||||
Vectorized<float> inline minimum(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svmin_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& min,
|
||||
const Vectorized<float>& max) {
|
||||
inline Vectorized<float> clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
|
||||
return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp_max(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& max) {
|
||||
inline Vectorized<float> clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
|
||||
return svmin_f32_x(ptrue, max, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp_min(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& min) {
|
||||
inline Vectorized<float> clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
|
||||
return svmax_f32_x(ptrue, min, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator&(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(
|
||||
svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
inline Vectorized<float> operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator|(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(
|
||||
svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
inline Vectorized<float> operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator^(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(
|
||||
sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
inline Vectorized<float> operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::eq(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
|
||||
return (*this == other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::ne(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
|
||||
return (*this != other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::gt(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
|
||||
return (*this > other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::ge(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
|
||||
return (*this >= other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::lt(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
|
||||
return (*this < other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
Vectorized<float> inline Vectorized<float>::le(
|
||||
const Vectorized<float>& other) const {
|
||||
inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
|
||||
return (*this <= other) & Vectorized<float>(1.0f);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const float* src, float* dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<float>::size();
|
||||
const int64_t fraction = n % svcntw();
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
|
||||
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
|
||||
svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
|
||||
for (int64_t i = n - fraction; i < n; i += svcntw()) {
|
||||
svbool_t pg = svwhilelt_b32(i, n);
|
||||
svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const float* src, at::Half* dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<float>::size();
|
||||
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
|
||||
inline void convert(const float *src, at::Half *dst, int64_t n) {
|
||||
const int64_t fraction = n % svcntw();
|
||||
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
|
||||
svfloat16_t src_vec = svuzp1_f16(
|
||||
svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16);
|
||||
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
|
||||
svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
|
||||
ZERO_F16);
|
||||
svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
|
||||
for (int64_t i = n - fraction; i < n; i += svcntw()) {
|
||||
pg_16 = svwhilelt_b16(i, n);
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
svfloat16_t src_vec = svuzp1_f16(
|
||||
@ -703,19 +637,18 @@ inline void convert(const float* src, at::Half* dst, int64_t n) {
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const at::Half* src, float* dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<float>::size();
|
||||
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
|
||||
inline void convert(const at::Half *src, float *dst, int64_t n) {
|
||||
const int64_t fraction = n % svcntw();
|
||||
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
|
||||
svfloat16_t src_vec = svzip1_f16(
|
||||
svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
|
||||
ZERO_F16);
|
||||
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
|
||||
svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
|
||||
ZERO_F16);
|
||||
svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
|
||||
for (int64_t i = n - fraction; i < n; i += svcntw()) {
|
||||
pg_16 = svwhilelt_b16(i, n);
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
svfloat16_t src_vec = svzip1_f16(
|
||||
@ -726,20 +659,19 @@ inline void convert(const at::Half* src, float* dst, int64_t n) {
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, float* dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<float>::size();
|
||||
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
|
||||
inline void convert(const bool *src, float *dst, int64_t n) {
|
||||
const int64_t fraction = n % svcntw();
|
||||
svbool_t pg_8 = svwhilelt_b8(0ull, svcntw());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
|
||||
svuint8_t src_vec_u8 =
|
||||
svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
|
||||
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
|
||||
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
|
||||
svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
|
||||
for (int64_t i = n - fraction; i < n; i += svcntw()) {
|
||||
pg_8 = svwhilelt_b8(i, n);
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
svuint8_t src_vec_u8 =
|
||||
@ -751,10 +683,7 @@ inline void convert(const bool* src, float* dst, int64_t n) {
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fmadd(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
inline Vectorized<float> fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
|
||||
return svmad_f32_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
@ -785,4 +714,4 @@ Vectorized<float> inline fnmsub(
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
} // namespace at::vec
|
||||
@ -15,7 +15,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#define VEC_INT_SVE_TEMPLATE(vl, bit) \
|
||||
template <> \
|
||||
@ -49,10 +49,11 @@ inline namespace CPU_CAPABILITY {
|
||||
operator svint##bit##_t() const { \
|
||||
return values; \
|
||||
} \
|
||||
template <uint64_t mask> \
|
||||
static Vectorized<int##bit##_t> blend( \
|
||||
const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
const Vectorized<int##bit##_t>& b, \
|
||||
uint64_t mask \
|
||||
) { \
|
||||
__at_align__ int##bit##_t flag_arr[size()]; \
|
||||
for (int i = 0; i < size(); ++i) { \
|
||||
flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \
|
||||
@ -493,7 +494,7 @@ Vectorized<int8_t> inline operator>>(
|
||||
return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
@ -46,7 +46,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with SVE. This may not be an issue, because
|
||||
@ -100,12 +100,12 @@ struct VectorizedQuantizedConverter {
|
||||
Vectorized<float> zero_point,
|
||||
Vectorized<float> scale_zp_premul) const {
|
||||
float_vec_return_type rv;
|
||||
float tmp_scale[Vectorized<float>::size()];
|
||||
float tmp_zero_point[Vectorized<float>::size()];
|
||||
float * tmp_scale = new float[Vectorized<float>::size()];
|
||||
float * tmp_zero_point = new float[Vectorized<float>::size()];
|
||||
scale.store(tmp_scale);
|
||||
zero_point.store(tmp_zero_point);
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[Vectorized<float>::size()];
|
||||
float * tmp_vals = new float[Vectorized<float>::size()];
|
||||
for (int j = 0; j < Vectorized<float>::size(); ++j) {
|
||||
tmp_vals[j] = at::native::dequantize_val<T>(
|
||||
tmp_scale[j],
|
||||
@ -113,6 +113,10 @@ struct VectorizedQuantizedConverter {
|
||||
T(vals[Vectorized<float>::size() * i + j]));
|
||||
}
|
||||
rv[i] = Vectorized<float>::loadu(tmp_vals);
|
||||
|
||||
delete[] tmp_scale;
|
||||
delete[] tmp_zero_point;
|
||||
delete[] tmp_vals;
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
@ -121,12 +125,12 @@ struct VectorizedQuantizedConverter {
|
||||
Vectorized<float> scale,
|
||||
Vectorized<float> zero_point) const {
|
||||
float_vec_return_type rv;
|
||||
float tmp_scale[Vectorized<float>::size()];
|
||||
float tmp_zero_point[Vectorized<float>::size()];
|
||||
float * tmp_scale = new float[Vectorized<float>::size()];
|
||||
float * tmp_zero_point = new float[Vectorized<float>::size()];
|
||||
scale.store(tmp_scale);
|
||||
zero_point.store(tmp_zero_point);
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[Vectorized<float>::size()];
|
||||
float * tmp_vals = new float[Vectorized<float>::size()];
|
||||
for (int j = 0; j < Vectorized<float>::size(); ++j) {
|
||||
tmp_vals[j] = at::native::dequantize_val<T>(
|
||||
tmp_scale[j],
|
||||
@ -134,6 +138,9 @@ struct VectorizedQuantizedConverter {
|
||||
T(vals[Vectorized<float>::size() * i + j]));
|
||||
}
|
||||
rv[i] = Vectorized<float>::loadu(tmp_vals);
|
||||
delete[] tmp_scale;
|
||||
delete[] tmp_zero_point;
|
||||
delete[] tmp_vals;
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
@ -205,7 +212,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(
|
||||
@ -216,10 +223,11 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
at::native::quantize_vec<c10::qint32, /*precision=*/32>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
float_vals,
|
||||
(c10::qint32*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
delete[] float_vals;
|
||||
return Vectorized<c10::qint32>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
@ -359,7 +367,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(
|
||||
@ -370,10 +378,11 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
at::native::quantize_vec<c10::qint8>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
float_vals,
|
||||
(c10::qint8*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
delete[] float_vals;
|
||||
return Vectorized<c10::qint8>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
@ -511,7 +520,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(
|
||||
@ -522,10 +531,11 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
at::native::quantize_vec<c10::quint8>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
float_vals,
|
||||
(c10::quint8*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
delete[] float_vals;
|
||||
return Vectorized<c10::quint8>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
@ -600,7 +610,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
@ -4,7 +4,9 @@
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#if !defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#else
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
@ -241,7 +241,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
Vectorized() = default;
|
||||
|
||||
Vectorized(c10::BFloat16 val)
|
||||
: Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
|
||||
: Vectorized16(at_vdupq_n_bf16(val.x)) {}
|
||||
Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
|
||||
Vectorized(
|
||||
value_type val0,
|
||||
@ -253,14 +253,14 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
value_type val6,
|
||||
value_type val7)
|
||||
: Vectorized16(at_bfloat16x8_t{
|
||||
c10::bit_cast<at_bfloat16_t>(val0.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val1.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val2.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val3.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val4.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val5.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val6.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
|
||||
val0.x,
|
||||
val1.x,
|
||||
val2.x,
|
||||
val3.x,
|
||||
val4.x,
|
||||
val5.x,
|
||||
val6.x,
|
||||
val7.x}) {}
|
||||
|
||||
static Vectorized<c10::BFloat16> blendv(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
|
||||
@ -41,32 +41,16 @@ inline namespace CPU_CAPABILITY {
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
template <int index, bool mask_val>
|
||||
template <int index>
|
||||
struct BlendRegs {
|
||||
static float32x4_t impl(
|
||||
const float32x4_t& a,
|
||||
const float32x4_t& b,
|
||||
float32x4_t& res);
|
||||
};
|
||||
|
||||
template <int index>
|
||||
struct BlendRegs<index, true> {
|
||||
static float32x4_t impl(
|
||||
const float32x4_t& a,
|
||||
const float32x4_t& b,
|
||||
float32x4_t& res) {
|
||||
return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
|
||||
}
|
||||
};
|
||||
|
||||
template <int index>
|
||||
struct BlendRegs<index, false> {
|
||||
static float32x4_t impl(
|
||||
const float32x4_t& a,
|
||||
const float32x4_t& b,
|
||||
float32x4_t& res) {
|
||||
return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
|
||||
}
|
||||
float32x4_t& res,
|
||||
bool mask_val
|
||||
) {
|
||||
return vsetq_lane_f32(vgetq_lane_f32(mask_val ? b : a, index), res, index);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
@@ -94,19 +78,15 @@ class Vectorized<float> {
  operator float32x4_t() const {
    return values;
  }
  template <int64_t mask>
  static Vectorized<float> blend(
      const Vectorized<float>& a,
      const Vectorized<float>& b) {
      const Vectorized<float>& b,
      int64_t mask) {
    Vectorized<float> vec;
    vec.values = BlendRegs < 0,
    (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values);
    vec.values = BlendRegs < 1,
    (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values);
    vec.values = BlendRegs < 2,
    (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values);
    vec.values = BlendRegs < 3,
    (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values);
    vec.values = BlendRegs<0>::impl(a.values, b.values, vec.values, (mask & 0x01) != 0);
    vec.values = BlendRegs<1>::impl(a.values, b.values, vec.values, (mask & 0x02) != 0);
    vec.values = BlendRegs<2>::impl(a.values, b.values, vec.values, (mask & 0x04) != 0);
    vec.values = BlendRegs<3>::impl(a.values, b.values, vec.values, (mask & 0x08) != 0);
    return vec;
  }
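Note: the mask moves from a template parameter to a runtime argument here; bit i of the mask selects lane i from b, and a cleared bit keeps lane i from a. A hedged usage sketch of the new signature (assumed call site, not taken from the diff; relies on the surrounding header's Vectorized<float>):

inline Vectorized<float> blend_low_two_from_b(const Vectorized<float>& a,
                                              const Vectorized<float>& b) {
  // lanes 0 and 1 come from b, lanes 2 and 3 stay from a
  return Vectorized<float>::blend(a, b, 0b0011);
}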
  static Vectorized<float> blendv(
@@ -307,11 +287,50 @@ class Vectorized<float> {
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
  // Implementation copied from Arm Optimized Routine https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
  Vectorized<float> exp_u20() const {
    return exp();
    // bail out to sleef if it's a special case:
    // i.e. there's an input s.t. |input| > 87.3....
    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
    uint32x4_t cmp = vcagtq_f32 (values, special_bound);
    if (vpaddd_u64 (vreinterpretq_u64_u32 (cmp)) != 0) {
      return exp();
    }

    const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
    const float ln2_hi = 0x1.62e4p-1f;
    const float ln2_lo = 0x1.7f7d1cp-20f;
    const float c0 = 0x1.0e4020p-7f;
    const float c2 = 0x1.555e66p-3f;
    const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};

    const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
    const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
    const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
    const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);

    /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
       x = ln2*n + r, with r in [-ln2/2, ln2/2]. */

    float32x4_t n = vrndaq_f32 (vmulq_f32 (values, inv_ln2));
    float32x4_t r = vfmsq_laneq_f32 (values, n, ln2_c02, 0);
    r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
    uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
    float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, exponent_bias));

    float32x4_t r2 = vmulq_f32 (r, r);
    float32x4_t p = vfmaq_laneq_f32 (c1, r, ln2_c02, 2);
    float32x4_t q = vfmaq_laneq_f32 (c3, r, ln2_c02, 3);
    q = vfmaq_f32 (q, p, r2);
    p = vmulq_f32 (c4, r);
    float32x4_t poly = vfmaq_f32 (p, q, r2);

    return vfmaq_f32 (scale, poly, scale);

  }

  Vectorized<float> fexp_u20() const {
    return exp();
    return exp_u20();
  }
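Note: a scalar reference for the range reduction used in exp_u20 above, following the comment in the diff (x = n*ln2 + r with |r| <= ln2/2, 2^n built by writing n into the exponent field, exp(r) approximated by a short polynomial). Illustrative sketch only, not part of the diff, and the polynomial here is deliberately cruder than the vector code's:

#include <cmath>
#include <cstdint>
#include <cstring>
inline float exp_reference(float x) {
  const float inv_ln2 = 1.4426950408889634f;
  const float ln2 = 0.6931471805599453f;
  const float n = std::nearbyint(x * inv_ln2);
  const float r = x - n * ln2;                 // reduced argument in [-ln2/2, ln2/2]
  // scale = 2^n, built by adding n to the exponent bits of 1.0f (wraps correctly for negative n)
  uint32_t scale_bits =
      (static_cast<uint32_t>(static_cast<int32_t>(n)) << 23) + 0x3f800000u;
  float scale;
  std::memcpy(&scale, &scale_bits, sizeof(scale));
  const float poly = r + 0.5f * r * r;         // low-order approximation of exp(r) - 1
  return scale + scale * poly;                 // scale * (1 + poly) ~= exp(x)
}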
|
||||
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
|
||||
fmod,
|
||||
@ -645,4 +664,4 @@ inline Vectorized<float> Vectorized<float>::erf() const {
|
||||
#endif /* defined(aarch64) */
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
} // namespace at::vec
|
||||
@ -813,11 +813,12 @@ static inline Vectorized<T> binary_op_as_fp32(
|
||||
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out) { \
|
||||
__at_align__ float values[Vectorized<float>::size()]; \
|
||||
__at_align__ float * values = new float[Vectorized<float>::size()]; \
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) { \
|
||||
values[k] = data[k]; \
|
||||
} \
|
||||
out = Vectorized<float>::loadu(values); \
|
||||
delete[] values; \
|
||||
} \
|
||||
\
|
||||
inline void load_fp32_from_##name( \
|
||||
|
||||
@ -269,12 +269,13 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__))
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
#endif
|
||||
|
||||
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
|
||||
#endif
|
||||
#endif // defined(CPU_CAPABILITY_AVX2)
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
@ -294,7 +294,7 @@ struct VecConvert<
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
|
||||
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
|
||||
|
||||
template <>
|
||||
struct VecConvert<float, 1, BFloat16, 1> {
|
||||
|
||||
@ -270,7 +270,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
!defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
#endif
|
||||
|
||||
|
||||
@ -915,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
#elif !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
@ -1374,11 +1374,11 @@ Vectorized<c10::quint8> inline maximum(
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
|
||||
std::pair<Vectorized<float>, Vectorized<float>>
|
||||
inline convert_int8_to_float(at::vec::Vectorized<int8_t> src) {
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
|
||||
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
|
||||
|
||||
@ -68,7 +68,7 @@ Windows llvm will not have this definition.
|
||||
#define VECTOR_WIDTH 64
|
||||
#define int_vector __m512i
|
||||
#elif defined(__aarch64__) && \
|
||||
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
|
||||
!defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) // CPU_CAPABILITY_AVX512
|
||||
// SVE code expects 256-vectors; leave that set for SVE?
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
@ -79,6 +79,18 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else // CPU_CAPABILITY_AVX512
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#define int_vector __m256i
|
||||
#else // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
@ -88,6 +100,18 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#define int_vector __m256i
|
||||
#else // CPU_CAPABILITY_SVE
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#define int_vector __m256i
|
||||
#endif // CPU_CAPABILITY_SVE256
|
||||
#endif // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
|
||||
#endif // CPU_CAPABILITY_AVX512
|
||||
|
||||
namespace at::vec {
|
||||
@ -210,8 +234,7 @@ struct Vectorized {
|
||||
auto as_bytes() const -> const char* {
|
||||
return reinterpret_cast<const char*>(values);
|
||||
}
|
||||
template <int64_t mask_>
|
||||
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, const int64_t mask_) {
|
||||
int64_t mask = mask_;
|
||||
Vectorized vector;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
@ -1312,7 +1335,7 @@ std::
|
||||
T const* base_addr,
|
||||
const Vectorized<int_same_size_t<T>>& vindex,
|
||||
Vectorized<T>& mask) {
|
||||
static constexpr int size = Vectorized<T>::size();
|
||||
static const int size = Vectorized<T>::size();
|
||||
T src_arr[size];
|
||||
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
|
||||
int_same_size_t<T> index_arr[size];
|
||||
@ -1405,7 +1428,7 @@ inline Vectorized<T> convert_to_fp_of_same_size(
|
||||
// clang-format on
|
||||
template <typename T>
|
||||
inline std::enable_if_t<
|
||||
Vectorized<T>::size() % 2 == 0,
|
||||
true,
|
||||
std::pair<Vectorized<T>, Vectorized<T>>>
|
||||
deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
static constexpr int size = Vectorized<T>::size();
|
||||
@ -1444,7 +1467,7 @@ VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2)
|
||||
// clang-format on
|
||||
template <typename T>
|
||||
inline std::enable_if_t<
|
||||
Vectorized<T>::size() % 2 == 0,
|
||||
true,
|
||||
std::pair<Vectorized<T>, Vectorized<T>>>
|
||||
interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
static constexpr int size = Vectorized<T>::size();
|
||||
@ -1486,7 +1509,7 @@ inline void convert(const src_T* src, dst_T* dst, int64_t n) {
|
||||
|
||||
template <typename T>
|
||||
inline Vectorized<T> flip(const Vectorized<T>& data) {
|
||||
static constexpr int size = Vectorized<T>::size();
|
||||
static const int size = Vectorized<T>::size();
|
||||
T output[size];
|
||||
T buffer[size];
|
||||
data.store(static_cast<void*>(buffer));
|
||||
|
||||
@ -15,7 +15,7 @@ template <
|
||||
struct VecConvert {
|
||||
static inline VectorizedN<dst_t, dst_n> apply(
|
||||
const VectorizedN<src_t, src_n>& src) {
|
||||
constexpr int count = std::min(
|
||||
const int count = std::min(
|
||||
VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
|
||||
__at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
|
||||
src.store(src_buf);
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/vec_n.h>
|
||||
|
||||
#include <cassert>
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
@ -38,9 +40,9 @@ struct VecMaskLoad {
|
||||
static inline VectorizedN<data_t, data_n> apply(
|
||||
const data_t* ptr,
|
||||
const VecMask<mask_t, mask_n>& vec_mask) {
|
||||
constexpr typename VecMask<mask_t, mask_n>::size_type size =
|
||||
const typename VecMask<mask_t, mask_n>::size_type size =
|
||||
VecMask<mask_t, mask_n>::size();
|
||||
static_assert(VectorizedN<data_t, data_n>::size() >= size);
|
||||
assert((VectorizedN<data_t, data_n>::size() >= size));
|
||||
__at_align__ data_t data[size];
|
||||
__at_align__ mask_t mask[size];
|
||||
auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
|
||||
@ -134,7 +136,7 @@ class VecMask {
|
||||
template <typename U, int L>
|
||||
static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
|
||||
__at_align__ U b_buf[size()];
|
||||
if constexpr (size() >= VectorizedN<U, L>::size()) {
|
||||
if (size() >= VectorizedN<U, L>::size()) {
|
||||
b_vec.store(b_buf);
|
||||
for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
|
||||
b_buf[i] = static_cast<U>(0);
|
||||
@ -235,16 +237,18 @@ class VecMask {
|
||||
template <
|
||||
typename U,
|
||||
int L,
|
||||
std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
|
||||
std::enable_if_t<L >= 2, int> = 0>
|
||||
VectorizedN<U, L> loadu(const U* ptr) const {
|
||||
assert((VectorizedN<U, L>::size() >= size()));
|
||||
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
|
||||
}
|
||||
|
||||
template <
|
||||
typename U,
|
||||
int L,
|
||||
std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
|
||||
std::enable_if_t<L == 1, int> = 0>
|
||||
Vectorized<U> loadu(const U* ptr) const {
|
||||
assert((Vectorized<U>::size() >= size()));
|
||||
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
|
||||
}
|
||||
};
|
||||
|
||||
@ -28,7 +28,7 @@ class VectorizedN {
|
||||
using size_type = int;
|
||||
|
||||
static constexpr size_type size_T = sizeof(T);
|
||||
static constexpr size_type size() {
|
||||
static size_type size() {
|
||||
return Vectorized<T>::size() * N;
|
||||
}
|
||||
|
||||
|
||||
@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
|
||||
}
|
||||
|
||||
bool pinned_use_background_threads() override {
|
||||
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
pinned_use_background_threads();
|
||||
}
|
||||
|
||||
|
||||
@ -1157,6 +1157,7 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
|
||||
@ -1164,6 +1165,7 @@ REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
|
||||
@ -1171,6 +1173,7 @@ REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
|
||||
@ -1178,6 +1181,7 @@ REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
|
||||
@ -1185,6 +1189,7 @@ REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
|
||||
@ -1192,6 +1197,7 @@ REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
|
||||
@ -1199,6 +1205,7 @@ REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
|
||||
@ -1206,6 +1213,7 @@ REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
|
||||
@ -1213,6 +1221,7 @@ REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
|
||||
@ -1220,6 +1229,7 @@ REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
|
||||
@ -1227,6 +1237,7 @@ REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
|
||||
@ -1234,6 +1245,7 @@ REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
|
||||
@ -1241,6 +1253,7 @@ REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
|
||||
@ -1248,6 +1261,7 @@ REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
|
||||
@ -1255,5 +1269,6 @@ REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
} // namespace at::native
|
||||
|
||||
@ -38,17 +38,27 @@ static CPUCapability compute_cpu_capability() {
|
||||
return CPUCapability::ZVECTOR;
|
||||
}
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); // Returns maximum SVE VL supported by your HW.
|
||||
#ifdef HAVE_SVE_CPU_DEFINITION
|
||||
if (envar == "sve256") {
|
||||
if (sve_vl == 256) {
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
if (sve_vl == 256) {
|
||||
return CPUCapability::SVE256;
|
||||
} else if (sve_vl > 0) {
|
||||
return CPUCapability::SVE;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
|
||||
#endif
|
||||
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
} else if (envar == "sve") {
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16() && sve_vl > 0) {
|
||||
return CPUCapability::SVE;
|
||||
}
|
||||
#endif
|
||||
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
@ -100,19 +110,15 @@ static CPUCapability compute_cpu_capability() {
|
||||
#if defined(__linux__) && defined(HAVE_SVE_CPU_DEFINITION)
|
||||
if (cpuinfo_initialize() && cpuinfo_has_arm_sve()) {
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
if (sve_vl <= 0) {
|
||||
// SVE is not supported on this system.
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
if (sve_vl == 256) { // Check for SVE256
|
||||
return CPUCapability::SVE256;
|
||||
} else if (sve_vl > 0) {
|
||||
return CPUCapability::SVE;
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (sve_vl == 256) { // Check for SVE256
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16())
|
||||
return CPUCapability::SVE256;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
@@ -144,7 +150,8 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -182,7 +189,8 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, SVE
, SVE256
#endif
);
@@ -239,7 +247,8 @@ void* DispatchStubImpl::get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -263,7 +272,9 @@ void* DispatchStubImpl::get_call_ptr(
,
ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
,
SVE
,
SVE256
#endif
@@ -298,7 +309,8 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
){
@@ -333,7 +345,7 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
return ZVECTOR != nullptr ? DispatchResult(ZVECTOR) : ErrorType::MissingDeviceKernel;
}
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
if (C10_UNLIKELY(!SVE256)) {
// dispatch to DEFAULT, since the SVE kernel is missing
@@ -342,6 +354,14 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
return DispatchResult(SVE256);
}
}
if (capability >= static_cast<int>(CPUCapability::SVE)) {
if (C10_UNLIKELY(!SVE)) {
// dispatch to DEFAULT, since the SVE kernel is missing
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
} else {
return DispatchResult(SVE);
}
}
#endif
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
}
@@ -360,7 +380,8 @@ void* DispatchStubImpl::choose_cpu_impl(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -398,7 +419,7 @@ void* DispatchStubImpl::choose_cpu_impl(
return ZVECTOR;
}
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
if (C10_UNLIKELY(!SVE256)) {
// dispatch to DEFAULT, since the SVE kernel is missing
@@ -408,6 +429,15 @@ void* DispatchStubImpl::choose_cpu_impl(
return SVE256;
}
}
if (capability >= static_cast<int>(CPUCapability::SVE)) {
if (C10_UNLIKELY(!SVE)) {
// dispatch to DEFAULT, since the SVE kernel is missing
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;
} else {
return SVE;
}
}
#endif
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;

@@ -64,8 +64,9 @@ enum class CPUCapability {
VSX = 1,
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
ZVECTOR = 1,
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
SVE256 = 1,
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
SVE=1,
SVE256 = 2,
#else
AVX2 = 1,
AVX512 = 2,
@@ -115,7 +116,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -136,7 +138,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -157,7 +160,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -181,7 +185,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -238,7 +243,8 @@ private:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, reinterpret_cast<void*>(ZVECTOR)
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, reinterpret_cast<void*>(SVE)
, reinterpret_cast<void*>(SVE256)
#endif
)
@@ -299,7 +305,8 @@ public:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, reinterpret_cast<void*>(ZVECTOR)
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, reinterpret_cast<void*>(SVE)
, reinterpret_cast<void*>(SVE256)
#endif
);
@@ -322,7 +329,8 @@ public:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
static TORCH_API FnPtr ZVECTOR;
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
static TORCH_API FnPtr SVE;
static TORCH_API FnPtr SVE256;
#endif
private:
@@ -426,9 +434,11 @@ struct RegisterPRIVATEUSE1Dispatch {
#define REGISTER_ZVECTOR_DISPATCH(name, fn)
#endif

#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
#define REGISTER_SVE_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE, fn)
#define REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE256, fn)
#else
#define REGISTER_SVE_DISPATCH(name, fn)
#define REGISTER_SVE256_DISPATCH(name, fn)
#endif

@@ -440,6 +450,7 @@ struct RegisterPRIVATEUSE1Dispatch {
REGISTER_AVX2_DISPATCH(name, fn) \
REGISTER_VSX_DISPATCH(name, fn) \
REGISTER_ZVECTOR_DISPATCH(name, fn) \
REGISTER_SVE_DISPATCH(name, fn) \
REGISTER_SVE256_DISPATCH(name, fn)

#define REGISTER_NO_CPU_DISPATCH(name) \
@@ -488,6 +499,7 @@ struct RegisterPRIVATEUSE1Dispatch {
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
} // namespace at::native
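The new `REGISTER_SVE_DISPATCH` macro follows the existing per-ISA registration pattern, so every stub that previously registered only an SVE256 kernel now also registers a generic SVE one (as the SegmentReduce, FFT and sparse registrations later in this diff show), and the runtime chooser falls back SVE256 -> SVE -> DEFAULT. A hedged, self-contained model of that idea in plain C++ (this is a simplified illustration, not ATen's actual DispatchStub implementation):

```cpp
#include <cstdio>

// Simplified model: one function-pointer slot per CPU capability, filled at
// registration time, and a chooser that falls back SVE256 -> SVE -> DEFAULT.
using kernel_fn = void (*)();

struct MyOpStub {
  static kernel_fn DEFAULT;
  static kernel_fn SVE;
  static kernel_fn SVE256;
};
kernel_fn MyOpStub::DEFAULT = nullptr;
kernel_fn MyOpStub::SVE = nullptr;
kernel_fn MyOpStub::SVE256 = nullptr;

void my_op_default() { std::puts("default kernel"); }
void my_op_sve()     { std::puts("generic SVE kernel"); }

kernel_fn choose(int capability) {          // 2 = SVE256, 1 = SVE, 0 = DEFAULT
  if (capability >= 2 && MyOpStub::SVE256) return MyOpStub::SVE256;
  if (capability >= 1 && MyOpStub::SVE)    return MyOpStub::SVE;
  return MyOpStub::DEFAULT;                 // always registered
}

int main() {
  MyOpStub::DEFAULT = &my_op_default;
  MyOpStub::SVE = &my_op_sve;               // SVE256 left unregistered on purpose
  choose(/*capability=*/2)();               // falls back to the generic SVE kernel
}
```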

@@ -466,6 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)

// offsets dispatches
@@ -477,6 +478,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)

// Currently some computation is being duplicated across forward and backward.
@@ -548,6 +550,9 @@ REGISTER_VSX_DISPATCH(
REGISTER_ZVECTOR_DISPATCH(
_segment_reduce_lengths_backward_stub,
&_segment_reduce_cpu_lengths_backward_kernel)
REGISTER_SVE_DISPATCH(
_segment_reduce_lengths_backward_stub,
&_segment_reduce_cpu_lengths_backward_kernel)
REGISTER_SVE256_DISPATCH(
_segment_reduce_lengths_backward_stub,
&_segment_reduce_cpu_lengths_backward_kernel)
@@ -568,6 +573,9 @@ REGISTER_VSX_DISPATCH(
REGISTER_ZVECTOR_DISPATCH(
_segment_reduce_offsets_backward_stub,
&_segment_reduce_cpu_offsets_backward_kernel)
REGISTER_SVE_DISPATCH(
_segment_reduce_offsets_backward_stub,
&_segment_reduce_cpu_offsets_backward_kernel)
REGISTER_SVE256_DISPATCH(
_segment_reduce_offsets_backward_stub,
&_segment_reduce_cpu_offsets_backward_kernel)

@@ -274,7 +274,7 @@ inline Vectorized<scalar_t> div_floor_floating_vec(
return floordiv;
}

#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)

// Since sve lacks sufficient bf16 intrinsics, do the calculations in f32 to
// avoid rounding errors. This should not cause performance issues as

@@ -11,6 +11,7 @@
#include <ATen/native/transformers/attention.h>
#include <ATen/native/transformers/sdp_utils_cpp.h>
#include <c10/util/irange.h>
#include <variant>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -44,13 +45,23 @@ inline void _scale_attn_mask_fusion_kernel(
#endif
const auto vec_size1 = at::vec::Vectorized<T1>::size();
const auto vec_size2 = at::vec::Vectorized<T2>::size();
constexpr int64_t T1_n =
const int64_t T1_n =
(vec_size2 == vec_size1 * 2 && is_reduced_floating_point_v<T2>) ? 2 : 1;
constexpr int64_t T2_n = 1;
auto vec_scale = at::vec::VectorizedN<T1, T1_n>(val);
std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> vec_scale;
if (T1_n == 2)
vec_scale = at::vec::VectorizedN<T1, 2>(val);
else if (T1_n == 1)
vec_scale = at::vec::VectorizedN<T1, 1>(val);

int64_t i = 0;
for (; i < size - (size % vec_size2); i += vec_size2) {
auto a_n = at::vec::VectorizedN<T1, T1_n>::loadu(a + i);
std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> a_n;
if (T1_n == 2)
a_n = at::vec::VectorizedN<T1, 2>::loadu(a + i);
else if (T1_n == 1)
a_n = at::vec::VectorizedN<T1, 1>::loadu(a + i);

at::vec::VectorizedN<T2, T2_n> b_n;
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
if (is_b_stride_zero) {
@@ -61,9 +72,16 @@ inline void _scale_attn_mask_fusion_kernel(
} else {
b_n = at::vec::VectorizedN<T2, T2_n>::loadu(b + i);
}
auto b_n_convert = at::vec::convert<T1, T1_n, T2, T2_n, true>(b_n);
auto res = a_n * vec_scale + b_n_convert;
res.store(out + i);
std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> b_n_convert;
if (T1_n == 2) {
auto b_n_convert = at::vec::convert<T1, 2, T2, T2_n, true>(b_n);
auto res = std::get<at::vec::VectorizedN<T1, 2>>(a_n) * std::get<at::vec::VectorizedN<T1, 2>>(vec_scale) + b_n_convert;
res.store(out + i);
} else if(T1_n == 1) {
auto b_n_convert = at::vec::convert<T1, 1, T2, T2_n, true>(b_n);
auto res = std::get<at::vec::VectorizedN<T1, 1>>(a_n) * std::get<at::vec::VectorizedN<T1, 1>>(vec_scale) + b_n_convert;
res.store(out + i);
}
}
for (; i < size; i++) {
auto tmp0 = a[i];

@@ -694,7 +694,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear,
gx = gx * gx_mult;
gy = gy * gy_mult;

constexpr int64_t step = Vec::size();
const int64_t step = Vec::size();
auto interleaved_gGrid = interleave2(gx, gy);
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
@@ -1010,7 +1010,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bicubic,
gx = gx * gx_mult;
gy = gy * gy_mult;

constexpr int64_t step = Vec::size();
const int64_t step = Vec::size();
auto interleaved_gGrid = interleave2(gx, gy);
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
@@ -1041,7 +1041,7 @@ static inline void grid_sample_2d_grid_slice_iterator(

using Vec = Vectorized<scalar_t>;
using iVec = Vectorized<int_same_size_t<scalar_t>>;
constexpr int64_t step = Vec::size();
const int64_t step = Vec::size();

// Loop over each output pixel in grid.
// We consider the following three cases (after slicing out the batch

@@ -19,7 +19,7 @@ Vectorized<scalar_t> is_lerp_weight_small(Vectorized<scalar_t> weight) {
// is_lerp_weight_small doesn't work for complex because z.abs() returns a
// complex vector which can't be compared. Either implement it with z.abs_2_(),
// or fallback to the scalar function.
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER) || defined(CPU_CAPABILITY_SVE))
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER) || defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
template <typename value_t>
Vectorized<c10::complex<value_t>> is_lerp_weight_small(Vectorized<c10::complex<value_t>> weight) {
using vec_reg_t = decltype(weight.abs_2_());

@@ -210,13 +210,22 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve

Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0));
int64_t i = 0;
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
int size = Vec::size();
#if !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256)
// Loop unrolling prevents compiler from optimizing the SVE classes
for (; i <= n - 2 * size; i += 2 * size) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + size);
auto out1 = c10::guts::apply(vop, std::move(args1));
auto out2 = c10::guts::apply(vop, std::move(args2));
out1.store(data[0] + i * sizeof(scalar_t));
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
out2.store(data[0] + (i + size) * sizeof(scalar_t));
}
#endif
for (; i <= n - size; i += size) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto out1 = c10::guts::apply(vop, std::move(args1));
out1.store(data[0] + i * sizeof(scalar_t));
}
if (i < n) {
int64_t strides[ntensors];
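A recurring change in this branch (visible here and in the kernels below) is demoting `constexpr` vector sizes to plain runtime values: with a vector-length-agnostic SVE build, `Vectorized<T>::size()` depends on the hardware's vector length and is no longer a compile-time constant, so anything derived from it (loop steps, strides, small scratch buffers) has to become runtime-sized. A hedged, self-contained illustration of the pattern in plain C++ (no ATen types; `vec_width()` is a placeholder for the hardware query):

```cpp
#include <cstddef>
#include <vector>

// Stand-in for Vectorized<float>::size(): on fixed-width builds this could be
// constexpr, but on a length-agnostic SVE build it is only known at run time.
std::size_t vec_width() { return 8; }  // placeholder; real SVE code would query the hardware

float sum(const float* x, std::size_t n) {
  const std::size_t step = vec_width();      // was: constexpr std::size_t step = ...
  std::vector<float> lane_acc(step, 0.0f);   // scratch sized at run time instead of std::array
  std::size_t i = 0;
  for (; i + step <= n; i += step) {
    for (std::size_t l = 0; l < step; ++l) { // placeholder for a vector accumulate
      lane_acc[l] += x[i + l];
    }
  }
  float total = 0.0f;
  for (float v : lane_acc) total += v;       // horizontal reduction of the lanes
  for (; i < n; ++i) total += x[i];          // scalar tail
  return total;
}
```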

@@ -80,7 +80,7 @@ inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n,
template <typename func_t, typename vec_func_t>
inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
VEC_LOOP_HEADER(func_t, data)
constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
const int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
int64_t count = n / (4 * Vec::size());
if (count > 0) {
vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
@@ -96,7 +96,7 @@ inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_
VEC_LOOP_HEADER(func_t, data)

// reduce down each column of 4 * Vec::size() elements.
constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
const int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
int64_t outer_stride[2] = { vector_stride, vector_stride };
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);

@@ -154,8 +154,8 @@ inline void map_acc(
using Vec = vec::Vectorized<scalar_t>;
using aVec = vec::Vectorized<accumut>;
int64_t d = 0;
constexpr int64_t kVecSize = Vec::size();
constexpr int64_t kaVecSize = aVec::size();
const int64_t kVecSize = Vec::size();
const int64_t kaVecSize = aVec::size();
for (d = 0; d < size - (size % kVecSize); d += kVecSize) {
Vec data2_vec = Vec::loadu(input_data2 + d);
auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);

@@ -22,8 +22,8 @@ inline namespace CPU_CAPABILITY {

constexpr auto kF32RegisterPairsPerIteration = 4;
constexpr auto kF32RegistersPerIteration = kF32RegisterPairsPerIteration * 2;
constexpr auto kF32ElementsPerRegister = vec::Vectorized<float>::size();
constexpr auto kF32ElementsPerIteration = kF32RegistersPerIteration * kF32ElementsPerRegister;
const auto kF32ElementsPerRegister = vec::Vectorized<float>::size();
const auto kF32ElementsPerIteration = kF32RegistersPerIteration * kF32ElementsPerRegister;

namespace {
template <typename T>
@@ -150,16 +150,16 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
// BFDOT. Deferring that for now to get the NEON/ASIMD BFDOT path
// working.
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) && defined(__clang__) && __clang_major__ > 15
// https://godbolt.org/z/z8P4Yncra
#define COMPILER_SUPPORTS_BF16_TARGET 1
#elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 10
#elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 10
// https://gcc.gnu.org/gcc-10/changes.html
// https://godbolt.org/z/cdGG7vn8o
#define COMPILER_SUPPORTS_BF16_TARGET 1
#else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
#else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
#define COMPILER_SUPPORTS_BF16_TARGET 0
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
#else // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
#define COMPILER_SUPPORTS_BF16_TARGET 0
#endif // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
@@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
const vec::Vectorized<c10::Half>& b,
const vec::Vectorized<float>& acc_low,
const vec::Vectorized<float>& acc_high) {
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
#else
const auto [a_float_low, a_float_high] = convert_half_float(a);

@@ -28,8 +28,8 @@ inline void _update(at::opmath_type<scalar_t>* out_ptr, int64_t e, int64_t c, co
using opmath_t = at::opmath_type<scalar_t>;
using Vec = vec::Vectorized<scalar_t>;
using aVec = VecType<scalar_t>;
constexpr int64_t kVecSize = Vec::size();
constexpr int64_t kVLEN = kVecSize * 4;
const int64_t kVecSize = Vec::size();
const int64_t kVLEN = kVecSize * 4;

int64_t k = 0;
aVec val_vec = aVec((opmath_t)val);

@@ -21,11 +21,11 @@ Vectorized<acc_t> load_reduce_vec(const scalar_t* data, F reduce, acc_t ident) {
using vacc_t = Vectorized<acc_t>;
static_assert(vacc_t::size() <= vec_t::size());
const auto val = vec_t::loadu(data);
alignas(64) std::array<scalar_t, vec_t::size()> values;
val.store(values.data());
alignas(64) scalar_t values[vec_t::size()];
val.store(values);

constexpr int vstride = vec_t::size() / vacc_t::size();
alignas(64) std::array<acc_t, vacc_t::size()> acc;
alignas(64) acc_t acc[vacc_t::size()];
acc.fill(ident);
for (const auto k : c10::irange(vstride)) {
for (const auto i : c10::irange(vacc_t::size())) {
@@ -33,7 +33,7 @@ Vectorized<acc_t> load_reduce_vec(const scalar_t* data, F reduce, acc_t ident) {
}
}

return vacc_t::loadu(acc.data());
return vacc_t::loadu(acc);
}

template <typename scalar_t>
@@ -138,7 +138,7 @@ struct OuterSumCastLoadPolicy <vec_t, vacc_t,
using scalar_t = vechold_type<vec_t>;
using acc_t = vechold_type<vacc_t>;

static constexpr int64_t memsize() {
static int64_t memsize() {
return sizeof(scalar_t) * vacc_t::size();
}

@@ -161,7 +161,7 @@ template <typename vec_t, typename vacc_t>
struct OuterSumCastLoadPolicy <vec_t, vacc_t, std::enable_if_t<is_reduced_floating_point_v<vechold_type<vec_t>>>> {
using scalar_t = vechold_type<vec_t>;

static constexpr int64_t memsize() {
static int64_t memsize() {
return sizeof(scalar_t) * vacc_t::size();
}

@@ -198,7 +198,7 @@ template <typename scalar_t>
struct NanSumLoadPolicy<Vectorized<scalar_t>> {
using vec_t = Vectorized<scalar_t>;

static constexpr int64_t memsize() {
static int64_t memsize() {
return LoadPolicy<vec_t>::memsize();
}

@@ -267,7 +267,7 @@ struct InnerNanSumCastLoadPolicy <vec_t, vacc_t, std::enable_if_t<is_reduced_flo

template <typename vec_t, typename vacc_t>
struct OuterNanSumCastLoadPolicy {
static constexpr int64_t memsize() {
static int64_t memsize() {
return OuterSumCastLoadPolicy<vec_t, vacc_t>::memsize();
}

@@ -300,13 +300,23 @@ static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
}
}

template <typename StorePolicy, typename scalar_t>
static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
const scalar_t *values, size_t numel) {
auto *base_ptr = data + stride * index;
for (const auto k : c10::irange(numel)) {
auto val = values[k];
StorePolicy::store(base_ptr, stride, k, val);
}
}

template <typename StorePolicy, typename scalar_t>
static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
const Vectorized<scalar_t> &values) {
using vec_t = Vectorized<scalar_t>;
alignas(64) std::array<scalar_t, vec_t::size()> array_values{};
values.store(array_values.data());
store<StorePolicy>(data, stride, index, array_values);
alignas(64) scalar_t array_values[vec_t::size()] = {};
values.store(array_values);
store<StorePolicy, scalar_t>(data, stride, index, array_values, vec_t::size());
}

/** Simultaneously sum over n rows at once
@@ -436,9 +446,9 @@ void vectorized_inner_sum(
char * C10_RESTRICT data[2], int64_t outer_stride, int64_t out_stride,
int64_t size0, int64_t size1) {
using vacc_t = Vectorized<acc_t>;
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
constexpr int64_t vec_numel = vec_stride / scalar_stride;
const int64_t vec_stride = VecLoadPolicy::memsize();
const int64_t scalar_stride = ScalarLoadPolicy::memsize();
const int64_t vec_numel = vec_stride / scalar_stride;
const int64_t vec_size = size0 / vec_numel;

// Input is contiguous over the first (reduced) dimension
@@ -451,9 +461,9 @@ void vectorized_inner_sum(
final_acc += ScalarLoadPolicy::load(row_in, scalar_stride, k);
}

alignas(64) std::array<acc_t, vacc_t::size()> partials{};
vec_acc.store(partials.data());
for (const auto k : c10::irange(partials.size())) {
alignas(64) acc_t partials[vacc_t::size()] = {};
vec_acc.store(partials);
for (const auto k : c10::irange(vacc_t::size())) {
final_acc += partials[k];
}
store<StorePolicy>(data[0], out_stride, j, final_acc);
@@ -479,7 +489,7 @@ void vectorized_outer_sum(
int64_t size0, int64_t size1) {
using vacc_t = Vectorized<acc_t>;
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
const int64_t vec_stride = VecLoadPolicy::memsize();
constexpr int64_t nrows = 4;

// Input is contiguous over the second (non-reduced) dimension

@@ -93,7 +93,7 @@ ColumnwiseMoments(
int64_t C,
int64_t D) {
using Vec = vec::Vectorized<T>;
constexpr int64_t K = Vec::size();
const int64_t K = Vec::size();
const int64_t inner_size = D / K * K;
Vec acc0_vec{0}, acc1_vec{0};
for (const auto m : c10::irange(HxW)) {
@@ -668,20 +668,20 @@ void GroupNormInputBackward(
const opmath_t s = opmath_t(1) / static_cast<opmath_t>(D * HxW);
const bool gamma_null = (gamma == nullptr);
at::parallel_for(0, N * G, 1, [=](int64_t start, int64_t end) {
constexpr int64_t K = vec::Vectorized<PT>::size();
const int64_t K = vec::Vectorized<PT>::size();
const int64_t d = D / K * K;
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
std::array<opmath_t, at::vec::Vectorized<opmath_t>::size()> ds_arr;
opmath_t ds_arr[at::vec::Vectorized<opmath_t>::size()];
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
std::array<opmath_t, at::vec::Vectorized<opmath_t>::size()> db_arr;
opmath_t db_arr[at::vec::Vectorized<opmath_t>::size()];
for (const auto i : c10::irange(start, end)) {
const int64_t g = i % G;
const opmath_t* ds_ptr = ds + i * D;
const opmath_t* db_ptr = db + i * D;
const PT* gamma_ptr = gamma_null ? nullptr : (gamma + g * D);
CalcDsDb(ds_ptr, db_ptr, gamma_ptr, d, K, ds_arr.data(), db_arr.data());
opmath_t ds_val = std::accumulate(ds_arr.cbegin(), ds_arr.cend(), opmath_t(0));
opmath_t db_val = std::accumulate(db_arr.cbegin(), db_arr.cend(), opmath_t(0));
CalcDsDb(ds_ptr, db_ptr, gamma_ptr, d, K, ds_arr, db_arr);
opmath_t ds_val = std::accumulate(&ds_arr[0], &ds_arr[at::vec::Vectorized<opmath_t>::size()], opmath_t(0));
opmath_t db_val = std::accumulate(&db_arr[0], &db_arr[at::vec::Vectorized<opmath_t>::size()], opmath_t(0));
for (const auto j : c10::irange(d, D)) {
const opmath_t gamma_v = gamma_null ? opmath_t(1) : opmath_t(gamma[g * D + j]);
ds_val += ds_ptr[j] * gamma_v;
@@ -718,7 +718,7 @@ GammaBackward(
PT* dgamma) {
const int64_t G = group;
const int64_t D = C / G;
constexpr int64_t K = at::vec::Vectorized<PT>::size();
const int64_t K = at::vec::Vectorized<PT>::size();
using Vec = at::vec::Vectorized<PT>;
const int64_t inner_size = D / K * K;
for (const auto g : c10::irange(G)) {
@@ -818,7 +818,7 @@ template <typename PT, typename opmath_t>
std::enable_if_t<std::is_same_v<PT, opmath_t>, void>
BetaBackward(int64_t N, int64_t C, const opmath_t* db, PT* dbeta) {
using Vec = at::vec::Vectorized<PT>;
constexpr int64_t K = Vec::size();
const int64_t K = Vec::size();
Vec acc_vec{0}, zero{0};
const int64_t inner_size = C / K * K;
int64_t i = 0;
@@ -943,7 +943,7 @@ DsDbRowwiseMomentsChannelsLast(
opmath_t* db_ptr,
int64_t C) {
using Vec = vec::Vectorized<T>;
constexpr int64_t K = vec::Vectorized<T>::size();
const int64_t K = vec::Vectorized<T>::size();
const int64_t inner_size = C / K * K;
int64_t d = 0;
for (; d < inner_size; d += K) {
@@ -1247,7 +1247,7 @@ inline typename std::
int64_t D) {
using Vec = vec::Vectorized<T>;
const bool gamma_null = (gamma_ptr == nullptr);
constexpr int64_t K = Vec::size();
const int64_t K = Vec::size();
const int64_t inner_size = D / K * K;
int64_t d = 0;
opmath_t ds_gamma{0}, db_gamma{0};

@@ -625,7 +625,7 @@ void weight_to_int4pack_kernel(
int K = weight.size(1);

// 64 for avx512 and 32 for avx2/non-vectorized
constexpr int BLOCK_N = vec::Vectorized<float>::size() * 4;
const int BLOCK_N = vec::Vectorized<float>::size() * 4;
const int NB = (N + BLOCK_N - 1) / BLOCK_N;

// parallel on NB blocks
@@ -713,7 +713,7 @@ void int4pack_mm_kernel_(

constexpr int BLOCK_M = 4;
// 64 for avx512 and 32 for avx2/non-vectorized
constexpr int BLOCK_N = vec::Vectorized<float>::size() * 4;
const int BLOCK_N = vec::Vectorized<float>::size() * 4;
// 32, 64, 128, 256
const int BLOCK_K = qGroupSize;

@@ -109,8 +109,8 @@ template <typename T, int64_t kMaxDepth>
std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
using math_t = opmath_t<T>;

constexpr int64_t kVecSize = vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = vec::Vectorized<math_t>::size();
const int64_t kVecSize = vec::Vectorized<T>::size();
const int64_t kAccVecSize = vec::Vectorized<math_t>::size();
const int64_t n = N / kVecSize;
const int64_t m = divup(n, kChunkSize);
const int64_t depth = utils::CeilLog2(m);
@@ -155,10 +155,10 @@ std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, in
m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]);
}

std::array<math_t, kAccVecSize> m1_arr{};
std::array<math_t, kAccVecSize> m2_arr{};
m1_stk[0].store(m1_arr.data());
m2_stk[0].store(m2_arr.data());
math_t m1_arr[kAccVecSize] = {};
math_t m2_arr[kAccVecSize] = {};
m1_stk[0].store(m1_arr);
m2_stk[0].store(m2_arr);

int64_t m0 = 0;
math_t m1 = 0;
@@ -182,7 +182,7 @@ std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, in
template <typename T>
std::pair<opmath_t<T>, opmath_t<T>> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
using Vec = vec::Vectorized<T>;
constexpr int64_t kVecSize = Vec::size();
const int64_t kVecSize = Vec::size();
const int64_t n = N / kVecSize;
const int64_t m = divup(n, kChunkSize);
const int64_t depth = utils::CeilLog2(m);

@@ -21,6 +21,10 @@
#include <ATen/native/cuda/GroupMM.h>
#include <ATen/ceil_div.h>

#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
@@ -1216,7 +1220,7 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
// - `use_fast_accum`: if true, enables fast float8 accumulation
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
// - `out`: a reference to the output tensor

Tensor&
@@ -1525,6 +1529,7 @@ namespace {
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");

#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
@@ -1536,9 +1541,10 @@ namespace {
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));

return out;
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
}

bool check_valid_strides_and_return_transposed(const Tensor& mat) {
@@ -1619,12 +1625,9 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
#ifndef USE_ROCM
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0");
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");

TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
@@ -1664,6 +1667,10 @@ bool use_fast_accum) {

Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);

#ifndef USE_ROCM
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());

at::cuda::detail::f8f8bf16_grouped_mm(
mat_a,
mat_b,
@@ -1674,12 +1681,23 @@ bool use_fast_accum) {
use_fast_accum,
out);
return out;



#else
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#ifdef USE_FBGEMM_GENAI
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type());

fbgemm_gpu::f8f8bf16_rowwise_grouped_mm(
mat_a,
// FBGEMM expects B matrix shape to be (.., N, K)
mat_b.transpose(-2, -1),
scale_a,
scale_b,
offs,
out);
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif
#endif

}

@@ -38,17 +38,19 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_INVALID_DEVICE:
return "CUFFT_INVALID_DEVICE";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
case CUFFT_NO_WORKSPACE:
return "CUFFT_NO_WORKSPACE";
case CUFFT_NOT_IMPLEMENTED:
return "CUFFT_NOT_IMPLEMENTED";
#if !defined(USE_ROCM)
#if CUDA_VERSION <= 12090
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
#endif
#if !defined(USE_ROCM) && CUDA_VERSION <= 12090
case CUFFT_LICENSE_ERROR:
return "CUFFT_LICENSE_ERROR";
#endif

@@ -9,6 +9,7 @@
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@@ -44,6 +45,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

#include <ATen/native/cuda/cutlass_common.cuh>

C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()


@@ -10,6 +10,7 @@
// Two warninngs in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@@ -44,6 +45,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
#include <cutlass/gemm/kernel/gemm_universal.hpp>
#include <cutlass/util/packed_stride.hpp>

C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()


@@ -45,7 +45,7 @@ namespace at::cuda::jit {
// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above.
// If not compiling for ROCm, return the original get_traits_string().
std::string get_traits_string_but_hiprtc_safe() {
#if defined(USE_ROCM) && ROCM_VERSION < 70000
#if defined(USE_ROCM) && HIP_VERSION_MAJOR < 7
return R"ESCAPE(
namespace std {


@@ -342,8 +342,8 @@ Tensor rms_norm_symint(

if (weight_opt.has_value() && weight_opt.value().defined() && weight_opt.value().dtype() != input.dtype()) {
TORCH_WARN_ONCE(
"Mismatch dtype between input and module: input dtype = ", input.dtype(),
", module dtype = ", weight_opt.value().dtype(), ", Can not dispatch to fused implementation"
"Mismatch dtype between input and weight: input dtype = ", input.dtype(),
", weight dtype = ", weight_opt.value().dtype(), ", Cannot dispatch to fused implementation."
);
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

@@ -165,6 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)

// _out variants can be shared between PocketFFT and MKL

@@ -22,6 +22,22 @@ struct PoolingParams {
bool return_indices;
};

template <unsigned N = 5, typename idx_type_t = int32_t>
struct AvgPoolingParams {
int32_t dims;
int32_t pooling_dims;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N - 2> kernel_size;
::c10::metal::array<idx_type_t, N - 2> stride;
::c10::metal::array<idx_type_t, N - 2> padding;
bool count_include_pad;
bool has_divisor_override;
int32_t divisor_override;
};

template <unsigned N = 5, typename idx_type_t = int32_t>
struct PoolingBackwardParams {
int32_t dims;

@@ -292,12 +292,154 @@ kernel void max_pool_backward(
pooling_dims);
}

#define REGISTER_MAX_POOL_OP(DTYPE) \
template <typename T>
struct AvgPoolIterBounds {
T start;
T end;
T count;
};

template <int32_t dim>
AvgPoolIterBounds<int32_t> get_avg_pool_input_iter_bounds(
constant int32_t* input_sizes,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
bool count_include_pad) {
auto start = stride[dim] * pooling_dim_indices[dim] - padding[dim];
auto end = start + kernel_size[dim];
auto end_corrected = min(start + kernel_size[dim], input_sizes[dim]);
auto start_corrected = (start < 0) ? 0 : start;
auto count = count_include_pad
? (min(end, input_sizes[dim] + padding[dim]) - start)
: (end_corrected - start_corrected);
return {start_corrected, end_corrected, count};
}

// Iterates through all the input elements that this kernel needs to
// apply max to. Specialized for 3 pooling dimensions.
template <typename T>
void avg_pool_3d_input_iter(
constant T* input,
device T* output,
constant int32_t* input_sizes,
constant int32_t* input_strides,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
bool count_include_pad,
bool has_divisor_override,
int32_t divisor_override) {
auto bounds0 = get_avg_pool_input_iter_bounds<0>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds1 = get_avg_pool_input_iter_bounds<1>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds2 = get_avg_pool_input_iter_bounds<2>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);

T value_sum = 0;
auto divisor = has_divisor_override
? divisor_override
: (bounds0.count) * (bounds1.count) * (bounds2.count);
auto size12 = input_sizes[1] * input_sizes[2];

for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) {
auto offset0 = input_strides[0] * i0;

for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) {
auto offset1 = input_strides[1] * i1;

for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) {
auto offset2 = input_strides[2] * i2;
auto input_value = input[offset0 + offset1 + offset2];
value_sum += input_value;
}
}
}
*output = value_sum / static_cast<T>(divisor);
}
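The helper above derives, per pooling dimension, the clamped input window plus the element count that feeds the divisor; `count_include_pad` keeps padded positions in the count (clamped to `input_size + padding`), while an explicit `divisor_override` replaces the product of counts entirely. A hedged, host-side C++ rework of the same per-dimension arithmetic (plain ints, no Metal address spaces):

```cpp
#include <algorithm>
#include <cassert>

struct Bounds { int start, end, count; };

// Mirrors the per-dimension window math from the kernel above (sketch only).
Bounds avg_pool_window(int out_idx, int input_size, int kernel, int stride,
                       int padding, bool count_include_pad) {
  int start = stride * out_idx - padding;            // may be negative in the padded border
  int end = start + kernel;                          // exclusive, may run past the input
  int end_c = std::min(end, input_size);             // clamped iteration bounds
  int start_c = std::max(start, 0);
  int count = count_include_pad
      ? std::min(end, input_size + padding) - start  // padded positions still counted
      : end_c - start_c;                             // only real elements counted
  return {start_c, end_c, count};
}

int main() {
  // kernel=3, stride=1, padding=1: the first window iterates inputs [0, 2).
  Bounds b = avg_pool_window(/*out_idx=*/0, /*input_size=*/5, 3, 1, 1, /*count_include_pad=*/true);
  assert(b.start == 0 && b.end == 2 && b.count == 3);  // divisor counts the padded slot
  Bounds c = avg_pool_window(0, 5, 3, 1, 1, /*count_include_pad=*/false);
  assert(c.count == 2);                                // divisor counts only real elements
}
```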

// Kernel computes one element of the output per kernel call.
template <typename T>
kernel void avg_pool(
constant T* input [[buffer(0)]],
device T* output [[buffer(1)]],
constant AvgPoolingParams<5>& params [[buffer(2)]],
uint tid [[thread_position_in_grid]]) {
auto pooling_dims = params.pooling_dims;
auto dims = params.dims;
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto kernel_size = params.kernel_size.data();
auto stride = params.stride.data();
auto padding = params.padding.data();
auto leading_dims = dims - pooling_dims;

// This buffer keeps track of the pooling dimension indices of this thread's
// element of the output. We need to fill it with the proper values below.
int32_t pooling_dim_indices[3];

PoolOffsets offsets = find_pool_offsets(
output_sizes,
output_strides,
/*indices_strides=*/nullptr,
input_strides,
pooling_dim_indices,
dims,
leading_dims,
/*return_indices=*/false,
tid);

output += offsets.output;
input += offsets.input_leading;
input_sizes += leading_dims;
input_strides += leading_dims;

avg_pool_3d_input_iter<T>(
input,
output,
input_sizes,
input_strides,
pooling_dim_indices,
kernel_size,
stride,
padding,
params.count_include_pad,
params.has_divisor_override,
params.divisor_override);
}

#define REGISTER_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant PoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("avg_pool_" #DTYPE)]] kernel void avg_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant AvgPoolingParams<5> & params [[buffer(2)]], \
uint tid [[thread_position_in_grid]]);

#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
@@ -309,19 +451,19 @@ kernel void max_pool_backward(
constant PoolingBackwardParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);

REGISTER_MAX_POOL_OP(float);
REGISTER_MAX_POOL_OP(half);
REGISTER_MAX_POOL_OP(int);
REGISTER_MAX_POOL_OP(long);
REGISTER_MAX_POOL_OP(short);
REGISTER_MAX_POOL_OP(char);
REGISTER_MAX_POOL_OP(uchar);
REGISTER_MAX_POOL_OP(bool);
REGISTER_POOL_OP(float);
REGISTER_POOL_OP(half);
REGISTER_POOL_OP(int);
REGISTER_POOL_OP(long);
REGISTER_POOL_OP(short);
REGISTER_POOL_OP(char);
REGISTER_POOL_OP(uchar);
REGISTER_POOL_OP(bool);

REGISTER_MAX_POOL_BACKWARD_OP(float);
REGISTER_MAX_POOL_BACKWARD_OP(half);

#if __METAL_VERSION__ >= 310
REGISTER_MAX_POOL_OP(bfloat);
REGISTER_POOL_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
#endif

@@ -418,8 +418,9 @@ Tensor& exponential_mps_(Tensor& self, double lambda, std::optional<Generator> g
MPSGraphTensor* logTensor = [mpsGraph logarithmWithTensor:subtractTensor name:nil];
return [mpsGraph divisionWithPrimaryTensor:logTensor secondaryTensor:minusLambdaTensor name:nil];
};
auto eps = std::numeric_limits<float>::epsilon();
return mps::random_mps_impl<double>(self,
0.0,
eps,
1.0,
std::nullopt,
std::nullopt,

@@ -14,6 +14,7 @@
#include <ATen/ops/avg_pool2d_backward.h>
#include <ATen/ops/avg_pool2d_backward_native.h>
#include <ATen/ops/avg_pool2d_native.h>
#include <ATen/ops/avg_pool3d_native.h>
#include <ATen/ops/max_pool2d_backward_native.h>
#include <ATen/ops/max_pool2d_native.h>
#include <ATen/ops/max_pool2d_with_indices_backward_native.h>
@@ -265,13 +266,13 @@ using PoolSizes = std::tuple<int32_t,
std::vector<int32_t>,
std::vector<int32_t>,
std::vector<int32_t>,
std::vector<int32_t>>;
std::optional<std::vector<int32_t>>>;

static PoolSizes process_pool_sizes(const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
std::optional<IntArrayRef> dilation_opt,
bool ceil_mode,
const int32_t pooling_dims,
const std::string& op_name) {
@@ -305,18 +306,22 @@ static PoolSizes process_pool_sizes(const Tensor& input,
pooling_dims,
" ints");

TORCH_CHECK(dilation.size() == 1 || dilation.size() == pooling_dims,
op_name,
": dilation must be either a single int, or a tuple of ",
pooling_dims,
" ints");
if (dilation_opt.has_value()) {
auto dilation = dilation_opt.value();
TORCH_CHECK(dilation.size() == 1 || dilation.size() == pooling_dims,
op_name,
": dilation must be either a single int, or a tuple of ",
pooling_dims,
" ints");
}

int32_t leading_dims = input.dim() - pooling_dims;

const auto kernel_size_expanded = copy_and_maybe_expand(kernel_size, pooling_dims);
const auto stride_expanded = copy_and_maybe_expand(stride.empty() ? kernel_size : stride, pooling_dims);
const auto padding_expanded = copy_and_maybe_expand(padding, pooling_dims);
const auto dilation_expanded = copy_and_maybe_expand(dilation, pooling_dims);
const auto dilation_expanded = dilation_opt.has_value() ? copy_and_maybe_expand(dilation_opt.value(), pooling_dims)
: std::vector<int32_t>(pooling_dims, 1);

for (const auto dim : c10::irange(pooling_dims)) {
TORCH_CHECK(padding_expanded[dim] >= 0, op_name, ": pad must be non-negative");
@@ -362,7 +367,12 @@ static PoolSizes process_pool_sizes(const Tensor& input,
output_size[leading_dims + dim] = output_pooling_size[dim];
}

return PoolSizes(dims, output_size, kernel_size_expanded, stride_expanded, padding_expanded, dilation_expanded);
return PoolSizes(dims,
output_size,
kernel_size_expanded,
stride_expanded,
padding_expanded,
dilation_opt.has_value() ? std::make_optional(dilation_expanded) : std::nullopt);
}
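With `dilation` now passed as `std::optional<IntArrayRef>`, callers that have no dilation (the average-pool paths) pass `std::nullopt` and the helper falls back to a dilation of 1 in every pooling dimension. A hedged sketch of that defaulting pattern outside of ATen types:

```cpp
#include <optional>
#include <vector>

// Sketch: expand an optional per-dimension parameter, defaulting to 1s when absent.
std::vector<int> expand_or_default(const std::optional<std::vector<int>>& opt, int dims) {
  if (!opt.has_value()) {
    return std::vector<int>(dims, 1);     // e.g. avg pool: no dilation concept
  }
  const auto& v = *opt;
  if (v.size() == 1) {
    return std::vector<int>(dims, v[0]);  // single int broadcast to every dim
  }
  return v;                               // already one entry per pooling dim
}
```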
|
||||
|
||||
static void max_pool_with_indices_out_mps_template(const Tensor& output,
|
||||
@ -375,8 +385,10 @@ static void max_pool_with_indices_out_mps_template(const Tensor& output,
|
||||
bool ceil_mode,
|
||||
const int32_t pooling_dims,
|
||||
const std::string& op_name) {
|
||||
auto [dims, output_size, kernel_size, stride, padding, dilation] =
|
||||
auto [dims, output_size, kernel_size, stride, padding, dilation_opt] =
|
||||
process_pool_sizes(input, _kernel_size, _stride, _padding, _dilation, ceil_mode, pooling_dims, op_name);
|
||||
TORCH_INTERNAL_ASSERT(dilation_opt.has_value());
|
||||
auto dilation = dilation_opt.value();
|
||||
const Tensor& indices = *(at::borrow_from_optional_tensor(indices_opt));
|
||||
const bool return_indices = indices.defined();
|
||||
|
||||
@ -442,7 +454,7 @@ static void max_pool_with_indices_backward_out_mps_template(Tensor& grad_input,
|
||||
bool ceil_mode,
|
||||
const int32_t pooling_dims,
|
||||
const std::string& op_name) {
|
||||
auto [dims, output_size, kernel_size, stride, padding, dilation] =
|
||||
auto [dims, output_size, kernel_size, stride, padding, dilation_opt] =
|
||||
process_pool_sizes(input, _kernel_size, _stride, _padding, _dilation, ceil_mode, pooling_dims, op_name);
|
||||
|
||||
const auto memory_format = input.suggest_memory_format();
|
||||
@ -601,6 +613,62 @@ static void avg_pool2d_template(const Tensor& input,
|
||||
op_name);
|
||||
}
|
||||
|
||||
static void avg_pool_out_mps_template(const Tensor& output,
|
||||
const Tensor& input,
|
||||
IntArrayRef _kernel_size,
|
||||
IntArrayRef _stride,
|
||||
IntArrayRef _padding,
|
||||
bool ceil_mode,
|
||||
bool count_include_pad,
|
||||
std::optional<int64_t> divisor_override,
|
||||
const int32_t pooling_dims,
|
||||
const std::string& op_name) {
|
||||
auto [dims, output_size, kernel_size, stride, padding, _] =
|
||||
process_pool_sizes(input, _kernel_size, _stride, _padding, std::nullopt, ceil_mode, pooling_dims, op_name);
|
||||
|
||||
const auto memory_format = input.suggest_memory_format();
|
||||
output.resize_(output_size, memory_format);
|
||||
|
||||
id<MTLDevice> device = MPSDevice::getInstance()->device();
|
||||
MPSStream* mpsStream = getCurrentMPSStream();
|
||||
const auto numThreads = output.numel();
|
||||
|
||||
AvgPoolingParams<5> params;
|
||||
|
||||
params.dims = dims;
|
||||
params.pooling_dims = pooling_dims;
|
||||
params.count_include_pad = count_include_pad;
|
||||
params.has_divisor_override = divisor_override.has_value();
|
||||
if (divisor_override.has_value()) {
|
||||
params.divisor_override = safe_downcast<int32_t, int64_t>(divisor_override.value());
|
||||
}
|
||||
|
||||
for (const auto dim : c10::irange(dims)) {
|
||||
params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim));
|
||||
params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim));
|
||||
params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim));
|
||||
params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim));
|
||||
}
|
||||
|
||||
memcpy(params.kernel_size.data(), kernel_size.data(), pooling_dims * sizeof(int32_t));
|
||||
memcpy(params.stride.data(), stride.data(), pooling_dims * sizeof(int32_t));
|
||||
memcpy(params.padding.data(), padding.data(), pooling_dims * sizeof(int32_t));
|
||||
|
||||
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
|
||||
auto PSO = lib.getPipelineStateForFunc("avg_pool_" + scalarToMetalTypeString(input));
|
||||
|
||||
getMPSProfiler().beginProfileKernel(PSO, op_name, {input});
|
||||
[computeEncoder setComputePipelineState:PSO];
|
||||
mtl_setArgs(computeEncoder, input, output, params);
|
||||
|
||||
mtl_dispatch1DJob(computeEncoder, PSO, numThreads);
|
||||
getMPSProfiler().endProfileKernel(PSO);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mps
|
||||
|
||||
Tensor mps_max_pool2d(const Tensor& input,
|
||||
@ -876,4 +944,25 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps)
|
||||
"avg_pool2d_backward");
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(avg_pool3d_out_mps)
|
||||
(const Tensor& input,
|
||||
IntArrayRef kernel_size,
|
||||
IntArrayRef stride,
|
||||
IntArrayRef padding,
|
||||
bool ceil_mode,
|
||||
bool count_include_pad,
|
||||
std::optional<int64_t> divisor_override,
|
||||
const Tensor& output) {
|
||||
mps::avg_pool_out_mps_template(output,
|
||||
input,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding,
|
||||
ceil_mode,
|
||||
count_include_pad,
|
||||
divisor_override,
|
||||
/*pooling_dims=*/3,
|
||||
"avg_pool3d");
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -12334,6 +12334,7 @@
|
||||
dispatch:
|
||||
CPU: avg_pool3d_out_cpu
|
||||
CUDA: avg_pool3d_out_cuda
|
||||
MPS: avg_pool3d_out_mps
|
||||
MkldnnCPU: mkldnn_avg_pool3d_out
|
||||
|
||||
- func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
|
||||
|
||||
@@ -142,7 +142,7 @@ Tensor qcat_nhwc_kernel(
      continue;
    }

    constexpr auto VLEN = Vec::size();
    const auto VLEN = Vec::size();
    int64_t c = 0;

    // Vectorized loop
@@ -170,16 +170,16 @@ Tensor qcat_nhwc_kernel(
    }

    // Vectorized loop for channel between 8 and 32 (avx2)
    constexpr auto kVLEN = Vectorized<float>::size();
    const auto kVLEN = Vectorized<float>::size();
    int64_t elem_size = curr_C - c;
    if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
      auto curr_scale_vec = Vectorized<float>(curr_scale);
      auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
      auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
      int64_t vec_num = elem_size / kVLEN;
      std::array<typename scalar_t::underlying, VLEN> buf_in{};
      memcpy(buf_in.data(), iptr + c, vec_num * kVLEN);
      auto inp_vec = Vec::loadu(buf_in.data());
      typename scalar_t::underlying buf_in[VLEN] = {};
      memcpy(buf_in, iptr + c, vec_num * kVLEN);
      auto inp_vec = Vec::loadu(buf_in);
      auto float_values = inp_vec.dequantize(
          curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul);
      Vec::float_vec_return_type retvals;
@@ -1487,7 +1487,7 @@ void _qmaxpool_2d_nhwc_kernel(
          int64_t c = 0;

          // Interleaved vector loop 4x
          constexpr auto vec_width = Vectorized<scalar_t>::size();
          const auto vec_width = Vectorized<scalar_t>::size();
          for (; c + 4 * vec_width <= iC; c += 4 * vec_width) {
            Vectorized<scalar_t> acc{
                scalar_t(std::numeric_limits<scalar_t_underlying>::lowest())};
@@ -1623,7 +1623,7 @@ void qmaxpool_3d_nthwc_kernel(
            w_start += dW;

          int64_t c = 0;
          constexpr auto vec_width = Vectorized<scalar_t>::size();
          const auto vec_width = Vectorized<scalar_t>::size();
          // Vector loop
          for (; c + vec_width <= iC; c += vec_width) {
            Vectorized<scalar_t> acc{
@@ -2449,7 +2449,7 @@ void q_batch_norm_kernel(
      reinterpret_cast<scalar_t::underlying*>(input.data_ptr());
  scalar_t::underlying* Y = reinterpret_cast<scalar_t::underlying*>(output.data_ptr());

  constexpr int kVLen = Vectorized<float>::size();
  const int kVLen = Vectorized<float>::size();
  const int64_t outer_size = N * HxW;
  using Vec = Vectorized<scalar_t>;
  // Hoisted variables
@@ -2975,7 +2975,7 @@ void quantized_normalize_kernel(
    float y_scale = Y->q_scale();
    float y_inv_scale = 1.0f / y_scale;

    constexpr int kFloatVLen = fVec::size();
    const int kFloatVLen = fVec::size();
    int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
    int64_t kNumIntVecInLayer = N / kIntVLen;
    int64_t kNonVecRemInLayer = N % kIntVLen;
@@ -3263,7 +3263,7 @@ void quantized_groupnorm_nhwc_kernel(
    float y_scale = Y->q_scale();
    float y_inv_scale = 1.0f / y_scale;

    constexpr int kFloatVLen = fVec::size();
    const int kFloatVLen = fVec::size();
    int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
    int64_t channels_per_group = C / G;
    int64_t HxW = N / channels_per_group;
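The constexpr-to-const changes above all follow one pattern: on the vector-length-agnostic (VLA) SVE build this branch targets, Vectorized<T>::size() is no longer a compile-time constant, so compile-time-sized buffers and template arguments give way to runtime values. A minimal illustrative sketch of the pattern, with invented names not taken from these kernels; the stack buffer relies on the variable-length-array compiler extension the branch name alludes to:

#include <ATen/cpu/vec/vec.h>
#include <cstring>

// Copy one vector's worth of data through a staging buffer whose size is
// only known at run time.
void copy_one_vector(const float* src, float* dst) {
  using Vec = at::vec::Vectorized<float>;
  const auto vlen = Vec::size();   // runtime value under SVE VLA builds
  float buf[vlen];                 // VLA stack buffer (GCC/Clang extension)
  std::memcpy(buf, src, vlen * sizeof(float));
  Vec::loadu(buf).store(dst);
}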
@@ -955,7 +955,10 @@ static at::Tensor fp8_qlinear_onednn_ref(
  std::vector<int64_t> w_scales_new_shape(weight.dim(), 1);
  w_scales_new_shape[0] = -1;
  auto dqw = weight.to(at::kFloat) * weight_scales.reshape(w_scales_new_shape);
  auto y_f32 = at::linear(dqx, dqw, bias);
  auto y_f32 = at::linear(dqx, dqw);
  if (bias.has_value()) {
    y_f32 += bias.value().to(at::kFloat);
  }
  if (binary_post_op == "none") {
    if (unary_post_op == "relu") {
      at::relu_(y_f32);
@@ -27,6 +27,7 @@ REGISTER_AVX512_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_AVX2_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_VSX_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_SVE_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_SVE256_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)

} // namespace at::native

@@ -161,6 +161,7 @@ REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_
REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_SVE_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)

REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel)
@@ -168,6 +169,7 @@ REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_interse
REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_SVE_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)

REGISTER_ARCH_DISPATCH(sparse_mask_projection_out_stub, DEFAULT, &sparse_mask_projection_out_cpu_kernel)
@@ -175,5 +177,6 @@ REGISTER_AVX512_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projectio
REGISTER_AVX2_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_VSX_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_SVE_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
}

@@ -448,6 +448,7 @@ REGISTER_AVX2_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)
@@ -1,8 +1,7 @@
#include <gtest/gtest.h>

#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <c10/util/irange.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <thread>

@@ -10,7 +9,7 @@
// numbers of threads set and also whether the scheduler
// will throw an exception when multiple threads call
// their first parallel construct.
static void test(int given_num_threads) {
void test(int given_num_threads) {
  auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat));
  ASSERT_TRUE(given_num_threads >= 0);
  ASSERT_EQ(at::get_num_threads(), given_num_threads);
@@ -20,7 +19,7 @@ static void test(int given_num_threads) {
  }
}

TEST(ThreadInitTest, ThreadInit) {
int main() {
  at::init_num_threads();

  at::set_num_threads(4);
@@ -33,11 +32,13 @@ TEST(ThreadInitTest, ThreadInit) {

#if !AT_PARALLEL_NATIVE
  at::set_num_threads(5);
  ASSERT_EQ(at::get_num_threads(), 5);
  ASSERT_TRUE(at::get_num_threads() == 5);
#endif

  // test inter-op settings
  at::set_num_interop_threads(5);
  ASSERT_EQ(at::get_num_interop_threads(), 5);
  ASSERT_ANY_THROW(at::set_num_interop_threads(6));

  return 0;
}
@@ -134,7 +134,7 @@ namespace {
  TYPED_TEST(Memory, UnAlignedLoadStore) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr size_t b_size = vec::size() * sizeof(VT);
    const size_t b_size = vec::size() * sizeof(VT);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN unsigned char ref_storage[128 * b_size];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -164,7 +164,7 @@ namespace {
    for (size_t offset = 0; offset < b_size; offset += 1) {
      unsigned char* p1 = ref_storage + offset;
      unsigned char* p2 = storage + offset;
      for (; p1 + b_size <= std::end(ref_storage); p1 += b_size, p2 += b_size) {
      for (; p1 + b_size <= &ref_storage[128 * b_size]; p1 += b_size, p2 += b_size) {
        vec v = vec::loadu(p1);
        v.store(p2);
      }
@@ -381,7 +381,7 @@ namespace {
  TYPED_TEST(Hyperbolic, Tanh) {
    using vec = TypeParam;
    // NOTE: Because SVE uses ACL logic, the precision changes, hence the adjusted tolerance.
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
    using UVT = UvalueType<vec>;
    UVT tolerance = getDefaultTolerance<UVT>();
    test_unary<vec>(
@@ -586,7 +586,7 @@ namespace {
      }
    }
  }
#if defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256)) && defined(__ARM_FEATURE_BF16)
  TEST(NanBfloat16, IsNan) {
    for (unsigned int ii = 0; ii < 0xFFFF; ++ii) {
      c10::BFloat16 val(ii, c10::BFloat16::from_bits());
@@ -598,6 +598,19 @@ namespace {
      }
    }
  }
#endif
#if (defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
  TEST(NanBfloat16, IsNan) {
    for (unsigned int ii = 0; ii < 0xFFFF; ++ii) {
      c10::BFloat16 val(ii, c10::BFloat16::from_bits());
      bool expected = std::isnan(val);
      CACHE_ALIGN c10::BFloat16 actual_vals[at::vec::SVE::Vectorized<c10::BFloat16>::size()];
      at::vec::SVE::Vectorized<c10::BFloat16>(val).isnan().store(actual_vals);
      for (int jj = 0; jj < at::vec::SVE::Vectorized<c10::BFloat16>::size(); ++jj) {
        EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "bf16 isnan failure for bit pattern " << std::hex << ii << std::dec;
      }
    }
  }
#endif
  TYPED_TEST(LGamma, LGamma) {
    using vec = TypeParam;
@@ -653,7 +666,7 @@ namespace {
  TYPED_TEST(Interleave, Interleave) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto N = vec::size() * 2LL;
    const auto N = vec::size() * 2LL;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT vals[N];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -663,7 +676,7 @@ namespace {
    for (VT& v : vals) {
      v = generator.get();
    }
    copy_interleave(vals, interleaved);
    copy_interleave<VT>(vals, interleaved, N);
    auto a = vec::loadu(vals);
    auto b = vec::loadu(vals + vec::size());
    auto cc = interleave2(a, b);
@@ -673,7 +686,7 @@ namespace {
  TYPED_TEST(Interleave, DeInterleave) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto N = vec::size() * 2LL;
    const auto N = vec::size() * 2LL;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT vals[N];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -683,7 +696,7 @@ namespace {
    for (VT& v : vals) {
      v = generator.get();
    }
    copy_interleave(vals, interleaved);
    copy_interleave<VT>(vals, interleaved, N);
    // test interleaved with vals this time
    auto a = vec::loadu(interleaved);
    auto b = vec::loadu(interleaved + vec::size());
@@ -1017,78 +1030,70 @@ namespace {
        RESOLVE_OVERLOAD(filter_fmadd));
  }
#endif
  template<typename vec, typename VT, int64_t mask>
  typename std::enable_if_t<(mask < 0 || mask> 255), void>
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blend(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()])
  {
  void test_blend(VT * expected_val, VT * a, VT * b, int64_t mask) {
    if (mask >= 0 && mask <= 255) {
      // generate expected_val
      int64_t m = mask;
      for (int64_t i = 0; i < vec::size(); i++) {
        expected_val[i] = (m & 0x01) ? b[i] : a[i];
        m = m >> 1;
      }
      // test with blend
      auto vec_a = vec::loadu(a);
      auto vec_b = vec::loadu(b);
      auto expected = vec::loadu(expected_val);
      auto actual = vec::blend(vec_a, vec_b, mask);
      auto mask_str = std::string("\nblend mask: ") + std::to_string(mask);
      if (AssertVectorized<vec>(std::string(NAME_INFO(test_blend)) + mask_str, expected, actual).check()) return;
      test_blend<vec, VT>(expected_val, a, b, mask - 1);
    }
  }
  template<typename vec, typename VT, int64_t mask>
  typename std::enable_if_t<(mask >= 0 && mask <= 255), void>
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blend(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()]) {
    // generate expected_val
    int64_t m = mask;
    for (int64_t i = 0; i < vec::size(); i++) {
      expected_val[i] = (m & 0x01) ? b[i] : a[i];
      m = m >> 1;
    }
    // test with blend
    auto vec_a = vec::loadu(a);
    auto vec_b = vec::loadu(b);
    auto expected = vec::loadu(expected_val);
    auto actual = vec::template blend<mask>(vec_a, vec_b);
    auto mask_str = std::string("\nblend mask: ") + std::to_string(mask);
    if (AssertVectorized<vec>(std::string(NAME_INFO(test_blend)) + mask_str, expected, actual).check()) return;
    test_blend<vec, VT, mask - 1>(expected_val, a, b);
  bool test_blendv(VT * expected_val, VT * a, VT * b, VT * mask, int64_t idx, size_t N) {
    if ((size_t) idx == N) {
      using bit_rep = BitType<VT>;
      // generate expected_val
      for (int64_t i = 0; i < vec::size(); i++) {
        bit_rep hex_mask = 0;
        hex_mask=c10::bit_cast<bit_rep>(mask[i]);
        expected_val[i] = (hex_mask & 0x01) ? b[i] : a[i];
      }
      // test with blendv
      auto vec_a = vec::loadu(a);
      auto vec_b = vec::loadu(b);
      auto vec_m = vec::loadu(mask);
      auto expected = vec::loadu(expected_val);
      auto actual = vec::blendv(vec_a, vec_b, vec_m);
      auto mask_str = std::string("\nblendv mask: ");
      for (int64_t i = 0; i < vec::size(); i++) {
        mask_str += std::to_string(mask[i]) + " ";
      }
      if (AssertVectorized<vec>(std::string(NAME_INFO(test_blendv)) + mask_str, expected, actual).check()) {
        return false;
      }
      return true;
    } else {
      // shuffle mask and do blendv test
      VT m = mask[idx];
      if (!test_blendv<vec, VT>(expected_val, a, b, mask, idx+1, N)) return false;
      if (m != (VT)0) {
        mask[idx] = (VT)0;
      }
      else {
        uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
        std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
      }
      if (!test_blendv<vec, VT>(expected_val, a, b, mask, idx+1, N)) return false;
      mask[idx] = m;
      return true;
    }
  }
  template<typename vec, typename VT, int64_t idx, int64_t N>
  std::enable_if_t<(!is_complex<VT>::value && idx == N), bool>
  template<typename T>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blendv(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], VT mask[vec::size()]) {
    using bit_rep = BitType<VT>;
    // generate expected_val
    for (int64_t i = 0; i < vec::size(); i++) {
      bit_rep hex_mask = 0;
      hex_mask=c10::bit_cast<bit_rep>(mask[i]);
      expected_val[i] = (hex_mask & 0x01) ? b[i] : a[i];
    }
    // test with blendv
    auto vec_a = vec::loadu(a);
    auto vec_b = vec::loadu(b);
    auto vec_m = vec::loadu(mask);
    auto expected = vec::loadu(expected_val);
    auto actual = vec::blendv(vec_a, vec_b, vec_m);
    auto mask_str = std::string("\nblendv mask: ");
    for (int64_t i = 0; i < vec::size(); i++) {
      mask_str += std::to_string(mask[i]) + " ";
    }
    if (AssertVectorized<vec>(std::string(NAME_INFO(test_blendv)) + mask_str, expected, actual).check()) {
      return false;
    }
    return true;
  }
  template<typename vec, typename VT, int64_t idx, int64_t N>
  std::enable_if_t<(!is_complex<VT>::value && idx != N), bool>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blendv(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], VT mask[vec::size()]) {
    // shuffle mask and do blendv test
    VT m = mask[idx];
    if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
    if (m != (VT)0) {
      mask[idx] = (VT)0;
    }
    else {
      uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
      std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
    }
    if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
    mask[idx] = m;
    return true;
  }
  template<typename T, int N>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  void blend_init(T(&a)[N], T(&b)[N]) {
  void blend_init(T * a, T * b, int N) {
    a[0] = (T)1.0;
    b[0] = a[0] + (T)N;
    for (const auto i : c10::irange(1, N)) {
@@ -1107,8 +1112,8 @@ namespace {
    CACHE_ALIGN VT mask[vec::size()] = {0};
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    test_blendv<vec, VT, 0, vec::size()>(expected_val, a, b, mask);
    blend_init(a, b, vec::size());
    test_blendv<vec, VT>(expected_val, a, b, mask, 0, vec::size());
  }
  TYPED_TEST(BitwiseFloatsAdditional2, Blend) {
    using vec = TypeParam;
@@ -1119,9 +1124,9 @@ namespace {
    CACHE_ALIGN VT b[vec::size()];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    constexpr int64_t power_sets = 1LL << (vec::size());
    test_blend<vec, VT, power_sets - 1>(expected_val, a, b);
    blend_init(a, b, vec::size());
    const int64_t power_sets = 1LL << (vec::size());
    test_blend<vec, VT>(expected_val, a, b, power_sets - 1);
  }
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -1152,7 +1157,7 @@ namespace {
    CACHE_ALIGN VT b[vec::size()];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    blend_init(a, b, vec::size());
    test_set<vec, VT>(expected_val, a, b, vec::size());
  }
  template<typename T>
@@ -1218,7 +1223,7 @@ namespace {
    // NOLINTNEXTLINE(bugprone-signed-char-misuse)
    constexpr int min_val = std::numeric_limits<underlying>::min();
    constexpr int max_val = std::numeric_limits<underlying>::max();
    constexpr int el_count = vfloat::size();
    const int el_count = vfloat::size();
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN float unit_float_vec[el_count];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -1566,7 +1571,7 @@ namespace {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto R = 2LL; // residual
    constexpr auto N = vec::size() + R;
    const auto N = vec::size() + R;
    CACHE_ALIGN VT x1[N];
    CACHE_ALIGN VT x2[N];
    CACHE_ALIGN VT x3[N];
@@ -2130,7 +2135,7 @@ namespace {
    ASSERT_TRUE(vec_pinf.has_inf_nan()) << "Test failed for positive Infinity\n";
    ASSERT_TRUE(vec_ninf.has_inf_nan()) << "Test failed for negative Infinity\n";
  }
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  template <typename vec, typename dst_t>
  void test_convert_to(const char* dst_t_name) {
    using src_t = ValueType<vec>;
@@ -2213,13 +2218,13 @@ namespace {
  TYPED_TEST(VecMaskTests, MaskedLoad) {
    using vec = TypeParam;
    using src_t = ValueType<TypeParam>;
    constexpr auto size = vec::size();
    const auto size = vec::size();

#define TEST_MASK_LOAD(dst_t, mask_t, mask_n) \
  do { \
    constexpr int dst_size = at::vec::Vectorized<dst_t>::size(); \
    constexpr int dst_n = mask_n * size / dst_size; \
    if constexpr(dst_n * dst_size >= mask_n * size) { \
    int dst_size = at::vec::Vectorized<dst_t>::size(); \
    int dst_n = mask_n * size / dst_size; \
    if (dst_n * dst_size >= mask_n * size) { \
      CACHE_ALIGN dst_t x[mask_n * size]; \
      CACHE_ALIGN dst_t y[mask_n * size]; \
      CACHE_ALIGN dst_t ref[mask_n * size]; \
@@ -2230,9 +2235,47 @@ namespace {
        x[i] = generator.get(); \
      } \
      auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
      constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
      auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x); \
      x_vec.store(y); \
      int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
      switch (rnd_n) { \
        case 1: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 1>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 2: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 2>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 3: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 3>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 4: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 4>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 8: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 8>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 16: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 16>(x); \
          x_vec.store(y); \
          break; \
        } \
        default: \
          throw std::out_of_range("Unexpected rnd_n call to vec_mask"); \
      } \
      for (const auto i : c10::irange(mask_n * size)) { \
        if (vec_mask.is_masked(i)) { \
          ref[i] = x[i]; \
@@ -2269,7 +2312,7 @@ namespace {
#undef TEST_MASK_LOAD
#undef TEST_MASK_LOAD_N
  }
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, MaskedCheck) {
    using VT = ValueType<TypeParam>;
    using vec = TypeParam;
@@ -2294,7 +2337,7 @@ namespace {
#undef TEST_MASK_CHECK_N
  }
#endif
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, ToFrom) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
@@ -2321,7 +2364,7 @@ namespace {
    }
  }
#endif
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, Cast) {
    using vec = TypeParam;
    using src_t = ValueType<TypeParam>;
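The blend/blendv test changes above replace template recursion over a constexpr mask with ordinary runtime recursion, for the same reason as the earlier hunks: the mask range is derived from vec::size(), which is no longer a constant expression. A stripped-down, illustrative sketch of the two styles (names invented for this sketch):

#include <cstdint>

// Compile-time style: the bound must be a constant expression, and each
// value can feed a template argument such as vec::template blend<Mask>(a, b).
template <int64_t Mask>
void visit_masks_compile_time() {
  if constexpr (Mask > 0) {
    visit_masks_compile_time<Mask - 1>();
  }
}

// Runtime style: the bound is an ordinary value, usable with runtime
// overloads such as vec::blend(a, b, mask).
inline void visit_masks_runtime(int64_t mask) {
  for (; mask >= 0; --mask) {
    // exercise one mask value here
  }
}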
@@ -56,7 +56,7 @@ CACHE_ALIGN #define
    defined(CPU_CAPABILITY_AVX512) && (defined(__GNUC__) || defined(__GNUG__))
#undef CHECK_DEQUANT_WITH_LOW_PRECISION
#define CHECK_WITH_FMA 1
#elif defined(CPU_CAPABILITY_SVE)
#elif defined(CPU_CAPABILITY_SVE256)
#define CHECK_DEQUANT_WITH_LOW_PRECISION 1
#define CHECK_WITH_FMA 1
#elif !defined(CPU_CAPABILITY_VSX) && !defined(CPU_CAPABILITY_AVX2)
@@ -136,7 +136,7 @@ template<typename T>
struct VecTypeHelper {
  using holdType = typename T::value_type;
  using memStorageType = typename T::value_type;
  static constexpr int holdCount = T::size();
  static inline int holdCount = T::size();
  static constexpr int unitStorageCount = 1;
};

@@ -399,9 +399,9 @@ T clamp_min(const T& a, const T& min) {
  return a < min ? min : a;
}

template <class VT, size_t N>
void copy_interleave(VT(&vals)[N], VT(&interleaved)[N]) {
  static_assert(N % 2 == 0, "should be even");
template <class VT>
void copy_interleave(VT * vals, VT * interleaved, size_t N) {
  assert(N % 2 == 0);
  auto ptr1 = vals;
  auto ptr2 = vals + N / 2;
  for (size_t i = 0; i < N; i += 2) {
@@ -871,10 +871,10 @@ public:
    using UVT = UvalueType<T>;
    using BVT = BitType<UVT>;
    UVT absErr = correctEpsilon(toleranceEps);
    constexpr int sizeX = VecTypeHelper<T>::holdCount * VecTypeHelper<T>::unitStorageCount;
    const int sizeX = VecTypeHelper<T>::holdCount * VecTypeHelper<T>::unitStorageCount;
    constexpr int unitStorageCount = VecTypeHelper<T>::unitStorageCount;
    CACHE_ALIGN UVT expArr[sizeX];
    CACHE_ALIGN UVT actArr[sizeX];
    UVT expArr[sizeX];
    UVT actArr[sizeX];
    exp.store(expArr);
    act.store(actArr);
    if (bitwise)
@@ -942,7 +942,7 @@ void test_unary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals[el_count];
  CACHE_ALIGN VT expected[el_count];
  bool bitwise = testCase.isBitwise();
@@ -1000,7 +1000,7 @@ void test_binary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals0[el_count];
  CACHE_ALIGN VT vals1[el_count];
  CACHE_ALIGN VT expected[el_count];
@@ -1163,7 +1163,7 @@ void test_ternary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals0[el_count];
  CACHE_ALIGN VT vals1[el_count];
  CACHE_ALIGN VT vals2[el_count];
@@ -1203,12 +1203,15 @@ void test_ternary(
      auto input1 = vec_type::loadu(vals1);
      auto input2 = vec_type::loadu(vals2);
      auto actual = actualFunction(input0, input1, input2);
      CACHE_ALIGN VT actual_[vec_type::size()];
      actual.store(actual_);
      auto vec_expected = vec_type::loadu(expected);

      AssertVectorized<vec_type> vecAssert(
          testNameInfo, seed, vec_expected, actual, input0, input1, input2);
      if (vecAssert.check(
              bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
        return;
        return;
    } // trial
    changeSeedBy += 1;
  }
@@ -1573,19 +1576,19 @@ double getDefaultTolerance() {

template<typename T, int N = 1>
at::vec::VecMask<T, N> create_vec_mask(uint64_t bitmask) {
  constexpr auto size = at::vec::Vectorized<T>::size();
  std::array<int, N * size> mask;
  const auto size = at::vec::Vectorized<T>::size();
  int mask[N * size];
  for (int n = 0; n < N; n++) {
    for (int i = 0; i < size; i++) {
      mask[n * size + i] = (bitmask >> i) & 1;
    }
  }
  return at::vec::VecMask<T, N>::from(mask.data());
  return at::vec::VecMask<T, N>::from(mask);
}

template<typename T, int N = 1>
at::vec::VecMask<T, N> generate_vec_mask(int seed) {
  constexpr auto size = at::vec::Vectorized<T>::size();
  const auto size = at::vec::Vectorized<T>::size();
  ValueGen<uint64_t> generator(0, (1ULL << size) - 1, seed);
  auto bitmask = generator.get();
  return create_vec_mask<T, N>(bitmask);
@@ -13,6 +13,7 @@ flaky_models = {
    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
    "detectron2_fcos_r_50_fpn",
}

@@ -346,7 +346,7 @@ vgg16,pass,0

vision_maskrcnn,fail_accuracy,30
vision_maskrcnn,fail_accuracy,29

@@ -1,32 +1,32 @@
add_loop_eager,compile_time_instruction_count,3070000000,0.10
add_loop_eager,compile_time_instruction_count,3070000000,0.1

add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.10
add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1

add_loop_inductor,compile_time_instruction_count,30280000000,0.10
add_loop_inductor,compile_time_instruction_count,30280000000,0.1

add_loop_inductor_dynamic_gpu,compile_time_instruction_count,39910000000,0.10
add_loop_inductor_dynamic_gpu,compile_time_instruction_count,39910000000,0.1

add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.10
add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1

basic_modules_ListOfLinears_eager,compile_time_instruction_count,969100000,0.10
basic_modules_ListOfLinears_eager,compile_time_instruction_count,969100000,0.1

basic_modules_ListOfLinears_inductor,compile_time_instruction_count,18030000000,0.10
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,15240000000,0.1

basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.10
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.1

@@ -34,56 +34,56 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,11090000

update_hint_regression,compile_time_instruction_count,1719000000,0.10
update_hint_regression,compile_time_instruction_count,1719000000,0.1

sum_floordiv_regression,compile_time_instruction_count,966100000,0.10
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1

symint_sum,compile_time_instruction_count,3237000000,0.10
symint_sum,compile_time_instruction_count,3237000000,0.1

symint_sum_loop,compile_time_instruction_count,4299000000,0.10
symint_sum_loop,compile_time_instruction_count,4299000000,0.1

aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.10
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1

aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.10
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1

aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.10
aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1

aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.10
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1

aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.10
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1

aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.10
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1

mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.10
mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.1

mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.10
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.1

basic_NestedModule_eager,compile_time_instruction_count,8348000000,0.10
basic_NestedModule_eager,compile_time_instruction_count,8348000000,0.1

basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.10
basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1
@@ -944,6 +944,7 @@ def define_buck_targets(
        [
            ("torch/csrc/api/include", "torch/**/*.h"),
            ("", "torch/csrc/**/*.h"),
            ("", "torch/nativert/**/*.h"),
            ("", "torch/headeronly/**/*.h"),
            ("", "torch/script.h"),
            ("", "torch/library.h"),

@@ -593,6 +593,7 @@ libtorch_core_jit_sources = sorted(jit_sources_full)

libtorch_nativert_sources = [
    "torch/nativert/ModelRunner.cpp",
    "torch/nativert/graph/Graph.cpp",
    "torch/nativert/graph/GraphPasses.cpp",
    "torch/nativert/graph/GraphSignature.cpp",
@@ -864,6 +865,7 @@ libtorch_python_core_sources = [
    "torch/csrc/QScheme.cpp",
    "torch/csrc/Module.cpp",
    "torch/csrc/PyInterpreter.cpp",
    "torch/csrc/PyInterpreterHooks.cpp",
    "torch/csrc/python_dimname.cpp",
    "torch/csrc/Size.cpp",
    "torch/csrc/Storage.cpp",
@@ -986,6 +988,7 @@ libtorch_python_core_sources = [
    "torch/csrc/utils/verbose.cpp",
    "torch/csrc/cpu/Module.cpp",
    "torch/csrc/instruction_counter/Module.cpp",
    "torch/nativert/python/Bindings.cpp",
] + lazy_tensor_core_python_sources

libtorch_python_distributed_core_sources = [
c10/core/AllocatorConfig.cpp (new file, 241 lines)
@@ -0,0 +1,241 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>

namespace c10::CachingAllocator {

namespace {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
constexpr size_t kMB = 1024 * 1024ul;
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace

AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
  static AcceleratorAllocatorConfig instance;
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated)                       \
  auto env##_name = c10::utils::get_env(#env);                                \
  if (env##_name.has_value()) {                                               \
    if (deprecated) {                                                         \
      TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
    }                                                                         \
    instance.parseArgs(env##_name.value());                                   \
    return true;                                                              \
  }
  static bool env_flag [[maybe_unused]] = []() {
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
    // Keep this for backwards compatibility
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
    return false;
  }();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
  return instance;
}

AcceleratorAllocatorConfig::AcceleratorAllocatorConfig() {
  roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
}

size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
  size_t log_size = (63 - llvm::countLeadingZeros(size));

  // Our intervals start at 1MB and end at 64GB
  const size_t interval_start =
      63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
  const size_t interval_end =
      63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
  TORCH_CHECK_VALUE(
      interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
      "kRoundUpPowerOfTwoIntervals mismatch");

  size_t index =
      (log_size > interval_start) ? (log_size - interval_start) : 0ul;
  index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
  return instance().roundup_power2_divisions_[index];
}

size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
  constexpr size_t max_allowed_split_size_mb =
      std::numeric_limits<size_t>::max() / kMB;

  size_t val_env = tokenizer.toSizeT(++i);
  TORCH_CHECK_VALUE(
      val_env >= min_allowed_split_size_mb,
      "CachingAllocator option max_split_size_mb too small, must be >= ",
      min_allowed_split_size_mb);
  val_env = std::min(val_env, max_allowed_split_size_mb);
  max_split_size_ = val_env * kMB;

  return i;
}

size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
  constexpr size_t max_allowed_split_size_mb =
      std::numeric_limits<size_t>::max() / kMB;

  size_t val_env = tokenizer.toSizeT(++i);
  TORCH_CHECK_VALUE(
      val_env >= min_allowed_split_size_mb,
      "CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
      min_allowed_split_size_mb);
  val_env = std::min(val_env, max_allowed_split_size_mb);
  max_non_split_rounding_size_ = val_env * kMB;

  return i;
}

size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  double val_env = tokenizer.toDouble(++i);
  TORCH_CHECK_VALUE(
      val_env > 0 && val_env < 1.0,
      "garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
  garbage_collection_threshold_ = val_env;

  return i;
}

size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  bool first_value = true;

  if (tokenizer[++i] == "[") {
    size_t last_index = 0;
    // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
    while (++i < tokenizer.size() && tokenizer[i] != "]") {
      size_t value_index = i;
      tokenizer.checkToken(++i, ":");
      size_t value = tokenizer.toSizeT(++i);
      TORCH_CHECK_VALUE(
          value == 0 || llvm::isPowerOf2_64(value),
          "For roundups, the divisions has to be power of 2 or 0 to disable roundup ");

      if (tokenizer[value_index] == ">") {
        std::fill(
            std::next(
                roundup_power2_divisions_.begin(),
                static_cast<std::vector<size_t>::difference_type>(
                    last_index + 1)),
            roundup_power2_divisions_.end(),
            value);
      } else {
        size_t boundary = tokenizer.toSizeT(value_index);
        TORCH_CHECK_VALUE(
            llvm::isPowerOf2_64(boundary),
            "For roundups, the intervals have to be power of 2 ");

        size_t index = 63 - llvm::countLeadingZeros(boundary);
        index =
            std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);

        if (first_value) {
          std::fill(
              roundup_power2_divisions_.begin(),
              std::next(
                  roundup_power2_divisions_.begin(),
                  static_cast<std::vector<size_t>::difference_type>(index)),
              value);
          first_value = false;
        }
        roundup_power2_divisions_[index] = value;
        last_index = index;
      }

      if (tokenizer[i + 1] != "]") {
        tokenizer.checkToken(++i, ",");
      }
    }
    TORCH_INTERNAL_ASSERT(
        i < tokenizer.size(),
        "Expected closing bracket ']' in ConfigTokenizer but reached end of config");
  } else { // Keep this for backwards compatibility
    size_t value = tokenizer.toSizeT(i);
    TORCH_CHECK_VALUE(
        llvm::isPowerOf2_64(value),
        "For roundups, the divisions has to be power of 2 ");
    std::fill(
        roundup_power2_divisions_.begin(),
        roundup_power2_divisions_.end(),
        value);
  }
  return i;
}

size_t AcceleratorAllocatorConfig::parseExpandableSegments(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  use_expandable_segments_ = tokenizer.toBool(++i);

  return i;
}

size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  pinned_use_background_threads_ = tokenizer.toBool(++i);

  return i;
}

void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
  // The following option will be reset to its default value if not explicitly
  // set each time.
  max_split_size_ = std::numeric_limits<size_t>::max();
  roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
  garbage_collection_threshold_ = 0;

  {
    std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
    last_allocator_settings_ = env;
  }

  ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    const auto& key = tokenizer[i];
    if (key == "max_split_size_mb") {
      i = parseMaxSplitSize(tokenizer, i);
    } else if (key == "max_non_split_rounding_mb") {
      i = parseMaxNonSplitRoundingSize(tokenizer, i);
    } else if (key == "garbage_collection_threshold") {
      i = parseGarbageCollectionThreshold(tokenizer, i);
    } else if (key == "roundup_power2_divisions") {
      i = parseRoundUpPower2Divisions(tokenizer, i);
    } else if (key == "expandable_segments") {
      i = parseExpandableSegments(tokenizer, i);
    } else if (key == "pinned_use_background_threads") {
      i = parsePinnedUseBackgroundThreads(tokenizer, i);
    } else {
      // If a device-specific configuration parser hook is registered, it will
      // check if the key is unrecognized.
      if (device_config_parser_hook_) {
        TORCH_CHECK(
            keys_.find(key) != keys_.end(),
            "Unrecognized key '",
            key,
            "' in Accelerator allocator config.");
      }
      i = tokenizer.skipKey(i);
    }

    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ",");
    }
  }
}

} // namespace c10::CachingAllocator
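As a reading aid for roundup_power2_divisions(size) above, a worked example of the bucket mapping under the constants defined in this file (intervals start at 1 MiB, end at 64 GiB, 16 buckets); the request sizes are arbitrary:

// size = 512 KiB  -> log2 = 19 -> below interval_start (20) -> bucket 0
// size = 2 MiB    -> log2 = 21 -> bucket 21 - 20 = 1
// size = 64 MiB   -> log2 = 26 -> bucket 6
// size = 128 GiB  -> log2 = 37 -> 17, clamped to the last bucket (15)
// The returned value is the division factor configured for that bucket.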
372
c10/core/AllocatorConfig.h
Normal file
372
c10/core/AllocatorConfig.h
Normal file
@ -0,0 +1,372 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/llvmMathExtras.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
namespace c10::CachingAllocator {
|
||||
|
||||
// "large" allocations may be packed in 20 MiB blocks
|
||||
const size_t kLargeBuffer = 20971520;
|
||||
|
||||
// A utility class for tokenizing allocator configuration strings into discrete
|
||||
// parts. For example, the config string:
|
||||
// "key1:val1,key2:[val2,val3]"
|
||||
// is tokenized into:
|
||||
// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
|
||||
//
|
||||
// Tokens include keys, values, and special characters (':', ',', '[', ']').
|
||||
// Whitespace is ignored.
|
||||
class ConfigTokenizer {
|
||||
public:
|
||||
explicit ConfigTokenizer(const std::string& env) {
|
||||
std::string buffer;
|
||||
for (char ch : env) {
|
||||
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
|
||||
if (!buffer.empty()) {
|
||||
config_.emplace_back(std::move(buffer));
|
||||
buffer.clear();
|
||||
}
|
||||
config_.emplace_back(1, ch);
|
||||
} else if (!std::isspace(static_cast<unsigned char>(ch))) {
|
||||
buffer += ch;
|
||||
}
|
||||
}
|
||||
if (!buffer.empty()) {
|
||||
config_.emplace_back(std::move(buffer));
|
||||
}
|
||||
}
|
||||
|
||||
const std::string& operator[](size_t i) const {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
i < config_.size(), "Index out of bounds in ConfigTokenizer");
|
||||
return config_[i];
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return config_.size();
|
||||
}
|
||||
|
||||
bool checkToken(size_t i, const std::string& token) const {
|
||||
checkIndex(i);
|
||||
return config_[i] == token;
|
||||
}
|
||||
|
||||
size_t toSizeT(size_t i) const {
|
||||
checkIndex(i);
|
||||
return std::stoull(config_[i]);
|
||||
}
|
||||
|
||||
double toDouble(size_t i) const {
|
||||
checkIndex(i);
|
||||
return std::stod(config_[i]);
|
||||
}
|
||||
|
||||
bool toBool(size_t i) const {
|
||||
checkIndex(i);
|
||||
const auto& token = config_[i];
|
||||
if (token == "True") {
|
||||
return true;
|
||||
} else if (token == "False") {
|
||||
return false;
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
false,
|
||||
"Expected 'True' or 'False' at index ",
|
||||
i,
|
||||
" in ConfigTokenizer but got '",
|
||||
token,
|
||||
"'");
|
||||
}
|
||||
}
|
||||
|
||||
// Skips the current token group and returns the index of the value token.
|
||||
// Assumes the current index `i` points to a key name in a key-value pair.
|
||||
size_t skipKey(size_t i) const {
|
||||
// Expect a colon after the key
|
||||
checkToken(++i, ":");
|
||||
|
||||
++i; // Move to the value
|
||||
checkIndex(i);
|
||||
if (config_[i] != "[") {
|
||||
// Value is a single token (not a list) -> return its index
|
||||
return i;
|
||||
}
|
||||
|
||||
// Skip tokens inside the list until matching ']'
|
||||
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
|
||||
while (++i < config_.size() && config_[i] != "]") {
|
||||
}
|
||||
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
i < config_.size(),
|
||||
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
||||
|
||||
return i; // Return the index of the closing ']'
|
||||
}
|
||||
|
||||
private:
|
||||
void checkIndex(size_t i) const {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
i < config_.size(), "Index out of bounds in ConfigTokenizer");
|
||||
}
|
||||
|
||||
std::vector<std::string> config_;
|
||||
};
|
||||
|
||||
/**
|
||||
* Note [AcceleratorAllocatorConfig design]
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
* This class configures memory allocation for both device and host memory. A
|
||||
* single `AcceleratorAllocatorConfig` instance is shared across all accelerator
|
||||
* backends, such as CUDA and XPU, under the assumption that relevant
|
||||
* environment variables apply uniformly to all accelerators. Device-specific
|
||||
* configuration extensions are supported via hooks (see
|
||||
* `registerDeviceConfigParserHook`).
|
||||
*
|
||||
* Recommended design:
|
||||
* - Place common configurations in `AcceleratorAllocatorConfig`.
|
||||
* - Extend backend-specific configurations in corresponding device-specific
|
||||
* classes, such as `CUDAAllocatorConfig`, etc.
|
||||
*
|
||||
* Scope:
|
||||
* - Configuration options must be environment-variable driven.
|
||||
*
|
||||
* Naming Convention:
|
||||
* - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
|
||||
* - Members prefixed with `pinned_` are specific to the host/pinned allocator.
|
||||
* - Environment variable names should be generic across backends.
|
||||
* - Comma-separated key-value pairs in the format: `key:value`. Use square
|
||||
* brackets `[]` for list values Example: `key1:123, key2:[val1,val2]`
|
||||
*
|
||||
* Environment Variables:
|
||||
* - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
|
||||
* - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
|
||||
* with lower priority.
|
||||
*/
|
||||
|
||||
class C10_API AcceleratorAllocatorConfig {
|
||||
public:
|
||||
static AcceleratorAllocatorConfig& instance();
|
||||
|
||||
C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
|
||||
AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
|
||||
AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
|
||||
~AcceleratorAllocatorConfig() = default;
|
||||
|
||||
/* Device allocator settings */
|
||||
|
||||
// Returns the maximum block size (in MB) that is allowed to be split. The
|
||||
// default is unlimited (all blocks can be split).
|
||||
static size_t max_split_size() {
|
||||
return instance().max_split_size_;
|
||||
}
|
||||
|
||||
// Returns the maximum block size (in MB) that is allowed to be rounded up
|
||||
// without requiring splitting when searching for a free block. The default is
|
||||
// 20 MiB.
|
||||
static size_t max_non_split_rounding_size() {
|
||||
return instance().max_non_split_rounding_size_;
|
||||
}
|
||||
|
||||
// Return the number of divisions used when rounding up allocation sizes (in
|
||||
// MB) to the nearest power-of-2 boundary.
|
||||
static size_t roundup_power2_divisions(size_t size);
|
||||
|
||||
// Returns the vector of division factors used for rounding up allocation
|
||||
// sizes. These divisions apply to size intervals between 1MB and 64GB.
|
||||
static const std::vector<size_t>& roundup_power2_divisions() {
|
||||
return instance().roundup_power2_divisions_;
|
||||
}
|
||||
|
||||
// Returns the threshold that triggers garbage collection when the ratio of
|
||||
// used memory to maximum allowed memory exceeds this value. The default is 0,
|
||||
// meaning no garbage collection is triggered. The value should be in the
|
||||
// range (0.0, 1.0).
|
||||
static double garbage_collection_threshold() {
|
||||
return instance().garbage_collection_threshold_;
|
||||
}
|
||||
|
||||
// Returns whether the expandable segment feature is enabled. This allows the
|
||||
// allocator to start with one segment that grows as needed, rather than
|
||||
// creating a new segment for each allocation. Default is false (expandable
|
||||
// segments disabled).
|
||||
static bool use_expandable_segments() {
|
||||
return instance().use_expandable_segments_;
|
||||
}
|
||||
|
||||
/* Host allocator settings */
|
||||
|
||||
// Returns whether the pinned host allocator uses background threads for
|
||||
// processing events. This is useful for improving performance in scenarios
|
||||
// where many small allocations are made. Default is false (background threads
|
||||
// disabled).
|
||||
static bool pinned_use_background_threads() {
|
||||
return instance().pinned_use_background_threads_;
|
||||
}
|
||||
|
||||
/* Settings for both device and host allocator */
|
||||
|
||||
// Returns the current allocator settings as a string. This string is useful
|
||||
// to expand device-specific allocator configurations
|
||||
static std::string last_allocator_settings() {
|
||||
std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
|
||||
return instance().last_allocator_settings_;
|
||||
}
|
||||
|
||||
// Returns the set of valid keys for the allocator configuration.
|
||||
// This set is used to validate the presence and correctness of keys in
|
||||
// device-specific configuration parsers.
|
||||
static const std::unordered_set<std::string>& getKeys() {
|
||||
return keys_;
|
||||
}
|
||||
|
||||
// Registers a device-specific configuration parser hook and its key. This
|
||||
// allows backends to parse additional device-specific configuration options
|
||||
// from the environment variable. The hook should be a function that takes a
|
||||
// string (the environment variable value) and parses it to set
|
||||
// device-specific configuration options. The hook will be called when the
|
||||
// environment variable is parsed. If a hook is already registered, it will be
|
||||
// replaced with the new one.
|
||||
static void registerDeviceConfigParserHook(
|
||||
std::function<void(const std::string&)>&& hook,
|
||||
const std::unordered_set<std::string>& keys) {
|
||||
device_config_parser_hook_ = std::move(hook);
|
||||
for (auto& key : keys) {
|
||||
TORCH_CHECK(
|
||||
keys_.insert(key).second,
|
||||
"Duplicated key '",
|
||||
key,
|
||||
"' found in device-specific configuration parser hook registration");
|
||||
}
|
||||
}
|
||||
|
||||
// Calls the registered device-specific configuration parser hook with the
|
||||
// provided environment string. This allows backends to parse additional
|
||||
// device-specific configuration options from the environment variable.
|
||||
// If no hook is registered, this function does nothing.
|
||||
static void callDeviceConfigParserHook(const std::string& env) {
|
||||
if (device_config_parser_hook_) {
|
||||
device_config_parser_hook_(env);
|
||||
}
|
||||
}
|
||||
|
||||
// Parses the environment variable `env` to update the allocator settings.
|
||||
// If the environment variable is not set, it does nothing.
|
||||
// The configuration string should be a comma-separated list of key-value
|
||||
// pairs, where each key is a configuration option and the value is the
|
||||
// corresponding setting. For example:
|
||||
// "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
|
||||
void parseArgs(const std::string& env);
|
||||
|
||||
private:
|
||||
AcceleratorAllocatorConfig();
|
||||
|
||||
/* Internal functions for device allocator */
|
||||
|
||||
// Parse `max_split_size_mb` from environment variable.
|
||||
size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
|
||||
// Parse `max_non_split_rounding_mb` from environment variable.
|
||||
size_t parseMaxNonSplitRoundingSize(
|
||||
const ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
// Parse `garbage_collection_threshold` from environment variable.
|
||||
size_t parseGarbageCollectionThreshold(
|
||||
const ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
// Parse `roundup_power2_divisions` from environment variable.
|
||||
size_t parseRoundUpPower2Divisions(
|
||||
const ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
// Parse `expandable_segments` from environment variable.
|
||||
size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
|
||||
|
||||
/* Internal functions for host allocator */
|
||||
|
||||
// Parse `pinned_use_background_threads` from environment variable.
|
||||
size_t parsePinnedUseBackgroundThreads(
|
||||
const ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
|
||||
/* The following members are specifically used for the device allocator. */
|
||||
|
||||
// The maximum block size that is allowed to be split.
|
||||
std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
|
||||
// The maximum allowable extra size of a memory block without requiring
|
||||
// splitting when searching for a free block.
|
||||
std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
|
||||
// Used to store how memory allocations of different sizes should be rounded
|
||||
// up to the nearest power of 2 divisions.
|
||||
std::vector<size_t> roundup_power2_divisions_;
|
||||
// The threshold that triggers garbage collection when the ratio of used
|
||||
// memory to maximum allowed memory exceeds this value.
|
||||
std::atomic<double> garbage_collection_threshold_{0};
|
||||
// A flag to enable expandable segments feature.
|
||||
std::atomic<bool> use_expandable_segments_{false};
|
||||
|
||||
/* The following members are specifically used for the host allocator. */
|
||||
|
||||
// A flag to enable background thread for processing events.
|
||||
std::atomic<bool> pinned_use_background_threads_{false};
|
||||
|
||||
/* The following members are used for both device and host allocator. */
|
||||
|
||||
// Record the last allocator config environment setting.
|
||||
std::mutex last_allocator_settings_mutex_;
|
||||
std::string last_allocator_settings_;
|
||||
|
||||
// Optional hook for parsing additional device-specific allocator settings.
|
||||
// This allows backends (e.g., CUDA, XPU) to register a custom parser for
|
||||
// their own environment configuration extensions.
|
||||
inline static std::function<void(const std::string&)>
|
||||
device_config_parser_hook_{nullptr};
|
||||
|
||||
// A set of valid configuration keys, including both common and
|
||||
// device-specific options. This set is used to validate the presence and
|
||||
// correctness of keys during parsing.
|
||||
inline static std::unordered_set<std::string> keys_{
|
||||
"max_split_size_mb",
|
||||
"max_non_split_rounding_mb",
|
||||
"garbage_collection_threshold",
|
||||
"roundup_power2_divisions",
|
||||
"expandable_segments",
|
||||
"pinned_use_background_threads"};
|
||||
};
|
||||
|
||||
C10_API inline void setAllocatorSettings(const std::string& env) {
|
||||
AcceleratorAllocatorConfig::instance().parseArgs(env);
|
||||
AcceleratorAllocatorConfig::callDeviceConfigParserHook(env);
|
||||
}
|
||||
|
||||
C10_API inline std::string getAllocatorSettings() {
|
||||
return AcceleratorAllocatorConfig::instance().last_allocator_settings();
|
||||
}
|
||||
|
||||
struct DeviceConfigParserHookRegistry {
  explicit DeviceConfigParserHookRegistry(
      std::function<void(const std::string&)>&& hook,
      const std::unordered_set<std::string>& keys) {
    // Use static method to avoid static initialization order fiasco issues
    AcceleratorAllocatorConfig::registerDeviceConfigParserHook(
        std::move(hook), keys);
  }
};

// Assume each config parser has `parseArgs` and `getKeys` methods
#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(parser_cls)      \
  namespace {                                                 \
  static at::CachingAllocator::DeviceConfigParserHookRegistry \
      g_device_config_parse_hook_registry_instance(           \
          [](const std::string& env) {                        \
            parser_cls::instance().parseArgs(env);            \
          },                                                  \
          parser_cls::getKeys());                             \
  }

} // namespace c10::CachingAllocator
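A hypothetical backend would plug into this macro roughly as sketched below; `MyBackendAllocatorConfig` and its key are illustrative names, not classes that exist in the tree. The only contract the macro relies on is the presence of `instance()`, `parseArgs()`, and `getKeys()`.

```cpp
#include <string>
#include <unordered_set>

// Illustrative backend-side parser (hypothetical class name).
class MyBackendAllocatorConfig {
 public:
  static MyBackendAllocatorConfig& instance() {
    static MyBackendAllocatorConfig config;
    return config;
  }

  // Pick the backend-specific keys out of the shared config string.
  void parseArgs(const std::string& env) {
    // ... tokenize env and consume the keys this backend owns ...
  }

  // Advertise the extra keys so they are accepted during key validation.
  static std::unordered_set<std::string> getKeys() {
    return {"mybackend_use_fast_path"};
  }
};

// Registers the parser at static-initialization time through
// DeviceConfigParserHookRegistry, as the macro above does for CUDA.
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(MyBackendAllocatorConfig)
```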
@@ -240,24 +240,4 @@ struct C10_API PyInterpreter {
  void disarm() noexcept;
};

// PyInterpreterStatus describes what the state of its interpreter tag
// is, relative to the thread currently holding the GIL.
enum class PyInterpreterStatus {
  // We just allocated the Tensor, it hasn't escaped to other threads,
  // we know that it definitely hasn't been tagged to be associated
  // with an interpreter.
  DEFINITELY_UNINITIALIZED,
  // We queried the interpreter field and it looked uninitialized. But
  // another thread may have raced with us to tag it with some other
  // interpreter id. So we will have to do a CEX to make sure we can
  // actually nab it.
  MAYBE_UNINITIALIZED,
  // We queried the interpreter field and it was tagged to belong to us.
  // This means we have sole write access (as we hold the GIL for this
  // interpreter)
  TAGGED_BY_US,
  // Someone else tagged this. We can't use this TensorImpl from Python.
  TAGGED_BY_OTHER,
};

} // namespace c10::impl
c10/core/impl/PyInterpreterHooks.cpp (new file, 32 lines)
@@ -0,0 +1,32 @@
#include <c10/core/impl/PyInterpreterHooks.h>

namespace c10::impl {

// Define the registry
C10_DEFINE_REGISTRY(
    PyInterpreterHooksRegistry,
    PyInterpreterHooksInterface,
    PyInterpreterHooksArgs)

const PyInterpreterHooksInterface& getPyInterpreterHooks() {
  auto create_impl = [] {
#if !defined C10_MOBILE
    auto hooks = PyInterpreterHooksRegistry()->Create(
        "PyInterpreterHooks", PyInterpreterHooksArgs{});
    if (hooks) {
      return hooks;
    }
#endif
    // Return stub implementation that will throw errors when methods are called
    return std::make_unique<PyInterpreterHooksInterface>();
  };
  static auto hooks = create_impl();
  return *hooks;
}

// Main function to get global PyInterpreter
PyInterpreter* getGlobalPyInterpreter() {
  return getPyInterpreterHooks().getPyInterpreter();
}

} // namespace c10::impl
c10/core/impl/PyInterpreterHooks.h (new file, 39 lines)
@@ -0,0 +1,39 @@
#pragma once

#include <c10/core/impl/PyInterpreter.h>
#include <c10/macros/Export.h>
#include <c10/util/Registry.h>
#include <memory>

namespace c10::impl {

// Minimal interface for PyInterpreter hooks
struct C10_API PyInterpreterHooksInterface {
  virtual ~PyInterpreterHooksInterface() = default;

  // Get the PyInterpreter instance
  // Stub implementation throws error when Python is not available
  virtual PyInterpreter* getPyInterpreter() const {
    TORCH_CHECK(
        false,
        "PyTorch was compiled without Python support. "
        "Cannot access Python interpreter from C++.");
  }
};

struct C10_API PyInterpreterHooksArgs{};

C10_DECLARE_REGISTRY(
    PyInterpreterHooksRegistry,
    PyInterpreterHooksInterface,
    PyInterpreterHooksArgs);

#define REGISTER_PYTHON_HOOKS(clsname) \
  C10_REGISTER_CLASS(PyInterpreterHooksRegistry, clsname, clsname)

// Get the global PyInterpreter hooks instance
C10_API const PyInterpreterHooksInterface& getPyInterpreterHooks();

C10_API PyInterpreter* getGlobalPyInterpreter();

} // namespace c10::impl
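For orientation, a minimal sketch of how a concrete implementation could be registered. Only the registration mechanics come from the header above; the class body, the global pointer, and its placement are illustrative assumptions. Note the class is named to match the `"PyInterpreterHooks"` key that `getPyInterpreterHooks()` passes to the registry.

```cpp
#include <c10/core/impl/PyInterpreterHooks.h>

namespace {

using namespace c10::impl;

// Illustrative only: the real interpreter pointer would be installed by the
// Python bindings during module initialization.
PyInterpreter* g_installed_interpreter = nullptr;

// Named "PyInterpreterHooks" so it is registered under the key looked up by
// getPyInterpreterHooks().
struct PyInterpreterHooks final : PyInterpreterHooksInterface {
  explicit PyInterpreterHooks(PyInterpreterHooksArgs) {}

  PyInterpreter* getPyInterpreter() const override {
    return g_installed_interpreter;
  }
};

// After this registration, getGlobalPyInterpreter() resolves to the
// implementation above instead of the throwing stub.
REGISTER_PYTHON_HOOKS(PyInterpreterHooks)

} // namespace
```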
@@ -34,29 +34,12 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
      reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
}

void PyObjectSlot::unchecked_clear_pyobj(PyInterpreter* interpreter) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(interpreter == pyobj_interpreter_.load());
  pyobj_ = nullptr;
}

PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
  if (interpreter) {
    return *interpreter;
  }
  TORCH_CHECK(
      false,
      "cannot access PyObject for Tensor on interpreter ",
      (*pyobj_interpreter_.load())->name());
}

bool PyObjectSlot::check_interpreter(PyInterpreter* interpreter) {
  return interpreter == pyobj_interpreter();
}

bool PyObjectSlot::has_pyobj_nonhermetic() {
  return check_pyobj(pyobj_interpreter(), /*ignore_hermetic_tls=*/true)
      .has_value();
  TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
}

bool PyObjectSlot::owns_pyobj() {

@@ -2,6 +2,7 @@

#include <c10/core/impl/HermeticPyObjectTLS.h>
#include <c10/core/impl/PyInterpreter.h>
#include <c10/core/impl/PyInterpreterHooks.h>
#include <c10/util/python_stub.h>
#include <optional>

@@ -24,52 +25,9 @@ struct C10_API PyObjectSlot {
  //
  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
  // PyObject if necessary!
  void init_pyobj(
      PyInterpreter* self_interpreter,
      PyObject* pyobj,
      PyInterpreterStatus status) {
    impl::PyInterpreter* expected = nullptr;
    switch (status) {
      case impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED:
        // caller guarantees there is no multithreaded access; if there is
        // no data race OK to do a relaxed store
        pyobj_interpreter_.store(self_interpreter, std::memory_order_relaxed);
        break;
      case impl::PyInterpreterStatus::TAGGED_BY_US:
        // no tagging is necessary, the tag is already correct
        break;
      case impl::PyInterpreterStatus::MAYBE_UNINITIALIZED:
        // attempt to claim this TensorImpl with the specified interpreter
        // tag
        if (pyobj_interpreter_.compare_exchange_strong(
                expected, self_interpreter, std::memory_order_acq_rel)) {
          break;
        }
        // test if, actually, it was already tagged by us! this situation can't
        // be caused by a race, but it could be caused by a situation
        // where someone conservatively tagged the tensor as MAYBE_UNINITIALIZED
        // (because they didn't pre-check the tag) when actually it was
        // owned by the interpreter
        if (expected == self_interpreter) {
          break;
        }
        // fallthrough, we lost the race. We are guaranteed not to lose the
        // race with ourself, as calls to init_pyobj with the same interpreter
        // ID must be sequentialized by the GIL
        [[fallthrough]];
      case impl::PyInterpreterStatus::TAGGED_BY_OTHER:
        TORCH_CHECK(
            false,
            "cannot allocate PyObject for Tensor on interpreter ",
            self_interpreter,
            " that has already been used by another torch deploy interpreter ",
            pyobj_interpreter_.load());
    }

  // we are the ONLY thread that can have gotten to this point. It is not
  // possible to conflict with another zero interpreter as access is protected
  // by GIL
  // NB: owns_pyobj tag is initially false
  void init_pyobj(PyObject* pyobj) {
    pyobj_interpreter_.store(
        getGlobalPyInterpreter(), std::memory_order_relaxed);
    pyobj_ = pyobj;
  }

@@ -94,49 +52,25 @@ struct C10_API PyObjectSlot {
  //
  // NB: this lives in header so that we can avoid actually creating the
  // std::optional
  std::optional<PyObject*> check_pyobj(
      PyInterpreter* self_interpreter,
      bool ignore_hermetic_tls = false) const {
    // Note [Memory ordering on Python interpreter tag]

  // @todo alban: I'm not too sure what's going on here, we can probably delete
  // it but it's worthwhile making sure
  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
    impl::PyInterpreter* interpreter =
        pyobj_interpreter_.load(std::memory_order_acquire);
    if (interpreter == nullptr) {
      // NB: This never returns DEFINITELY_UNINITIALIZED because there is
      // always the possibility that another thread races to initialize
      // after we query here. The only time when we can conclude a tensor
      // is definitely uninitialized is when we have just allocated it and
      // it cannot have escaped to other threads yet
      return std::nullopt;
    } else if (interpreter == self_interpreter) {
      // NB: pyobj_ could still be null!
      if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
        return std::nullopt;
      } else {
        return _unchecked_untagged_pyobj();
      }
    }

    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
      return std::nullopt;
    } else {
      TORCH_CHECK(
          false,
          "cannot access PyObject for Tensor on interpreter ",
          (*self_interpreter)->name(),
          " that has already been used by another torch deploy interpreter ",
          (*pyobj_interpreter_.load())->name());
      return _unchecked_untagged_pyobj();
    }
  }

  // Clear the PyObject field for an interpreter, in situations where we
  // statically know the tensor is tagged with our interpreter.
  void unchecked_clear_pyobj(PyInterpreter* interpreter);

  PyInterpreter& load_pyobj_interpreter() const;

  // Check if the PyObjectSlot's interpreter is the same as the specified
  // interpreter
  bool check_interpreter(PyInterpreter* interpreter);

  // Check if the PyObjectSlot is holding a PyObject, owned or non-owned
  bool has_pyobj_nonhermetic();

  bool owns_pyobj();

  void set_owns_pyobj(bool b);

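As a caller-side sketch (not code from this diff): after the change, callers no longer pass a `PyInterpreter*`; the slot resolves the global interpreter itself, and `check_pyobj()` simply reports whether a visible PyObject is attached. The helper and its name below are illustrative, assuming the header path `c10/core/impl/PyObjectSlot.h`.

```cpp
#include <c10/core/impl/PyObjectSlot.h>
#include <optional>

// Illustrative helper; `slot` would typically come from a TensorImpl.
bool hasVisiblePyObject(const c10::impl::PyObjectSlot& slot) {
  std::optional<PyObject*> obj =
      slot.check_pyobj(/*ignore_hermetic_tls=*/false);
  return obj.has_value() && *obj != nullptr;
}
```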
@@ -1,389 +1,119 @@
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/llvmMathExtras.h>

#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif

#include <cuda_runtime_api.h>

namespace c10::cuda::CUDACachingAllocator {

constexpr size_t kRoundUpPowerOfTwoIntervals = 16;

CUDAAllocatorConfig::CUDAAllocatorConfig()
    : m_max_split_size(std::numeric_limits<size_t>::max()),
      m_max_non_split_rounding_size(kLargeBuffer),
      m_garbage_collection_threshold(0),
      m_pinned_num_register_threads(1),
      m_expandable_segments(false),
#if CUDA_VERSION >= 12030
      m_expandable_segments_handle_type(
          Expandable_Segments_Handle_Type::UNSPECIFIED),
#else
      m_expandable_segments_handle_type(
          Expandable_Segments_Handle_Type::POSIX_FD),
#endif
      m_release_lock_on_cudamalloc(false),
      m_pinned_use_cuda_host_register(false),
      m_pinned_use_background_threads(false) {
  m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}

size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) {
  size_t log_size = (63 - llvm::countLeadingZeros(size));

  // Our intervals start at 1MB and end at 64GB
  const size_t interval_start =
      63 - llvm::countLeadingZeros(static_cast<size_t>(1048576));
  const size_t interval_end =
      63 - llvm::countLeadingZeros(static_cast<size_t>(68719476736));
  TORCH_CHECK(
      (interval_end - interval_start == kRoundUpPowerOfTwoIntervals),
      "kRoundUpPowerOfTwoIntervals mismatch");

  int index = static_cast<int>(log_size) - static_cast<int>(interval_start);

  index = std::max(0, index);
  index = std::min(index, static_cast<int>(kRoundUpPowerOfTwoIntervals) - 1);
  return instance().m_roundup_power2_divisions[index];
}
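To make the bucketing concrete, here is a standalone re-derivation of the same index computation (a sketch using C++20 `std::countl_zero` in place of `llvm::countLeadingZeros`; the constants 20 and 15 mirror the 1 MiB starting interval and the 16 intervals above).

```cpp
#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>

// floor(log2(size)) relative to the 1 MiB bucket, clamped to 16 intervals.
size_t roundupIntervalIndex(size_t size) {
  const int log_size = 63 - std::countl_zero(static_cast<uint64_t>(size));
  const int interval_start = 20; // 1 MiB == 2^20
  int index = log_size - interval_start;
  index = std::max(0, index);
  index = std::min(index, 15); // kRoundUpPowerOfTwoIntervals - 1
  return static_cast<size_t>(index);
}

int main() {
  std::cout << roundupIntervalIndex(512 * 1024) << '\n';      // 0: below 1 MiB clamps down
  std::cout << roundupIntervalIndex(3 * 1024 * 1024) << '\n'; // 1: 2 MiB <= size < 4 MiB
  std::cout << roundupIntervalIndex(1ULL << 40) << '\n';      // 15: clamped at the top
}
```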
void CUDAAllocatorConfig::lexArgs(
    const std::string& env,
    std::vector<std::string>& config) {
  std::vector<char> buf;

  for (char ch : env) {
    if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
      if (!buf.empty()) {
        config.emplace_back(buf.begin(), buf.end());
        buf.clear();
      }
      config.emplace_back(1, ch);
    } else if (ch != ' ') {
      buf.emplace_back(ch);
    }
  }
  if (!buf.empty()) {
    config.emplace_back(buf.begin(), buf.end());
  }
}
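For illustration (not part of the diff): `lexArgs` splits on `,`, `:`, `[`, `]`, keeps those delimiters as their own tokens, and drops spaces, so a typical setting lexes as follows. The example string is arbitrary.

```cpp
#include <string>
#include <vector>

// Expected token stream for one example input, per the rules above
// (delimiters become their own tokens, spaces are dropped).
const std::string example_input =
    "max_split_size_mb:256, roundup_power2_divisions:[256:8,>:2]";
const std::vector<std::string> example_tokens = {
    "max_split_size_mb", ":", "256", ",",
    "roundup_power2_divisions", ":", "[", "256", ":", "8", ",",
    ">", ":", "2", "]"};
```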
void CUDAAllocatorConfig::consumeToken(
    const std::vector<std::string>& config,
    size_t i,
    const char c) {
  TORCH_CHECK(
      i < config.size() && config[i] == std::string(1, c),
      "Error parsing CachingAllocator settings, expected ",
      c,
      "");
}

size_t CUDAAllocatorConfig::parseMaxSplitSize(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  constexpr int mb = 1024 * 1024;
  if (++i < config.size()) {
    size_t val1 = stoi(config[i]);
    TORCH_CHECK(
        val1 > kLargeBuffer / mb,
        "CachingAllocator option max_split_size_mb too small, must be > ",
        kLargeBuffer / mb,
        "");
    val1 = std::max(val1, kLargeBuffer / mb);
    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
    m_max_split_size = val1 * 1024 * 1024;
  } else {
    TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
  }
  return i;
}

size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  constexpr int mb = 1024 * 1024;
  if (++i < config.size()) {
    size_t val1 = stoi(config[i]);
    TORCH_CHECK(
        val1 > kLargeBuffer / mb,
        "CachingAllocator option max_non_split_rounding_mb too small, must be > ",
        kLargeBuffer / mb,
        "");
    val1 = std::max(val1, kLargeBuffer / mb);
    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
    m_max_non_split_rounding_size = val1 * 1024 * 1024;
  } else {
    TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
  }
  return i;
}

size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
    double val1 = stod(config[i]);
    TORCH_CHECK(
        val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", "");
    TORCH_CHECK(
        val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", "");
    m_garbage_collection_threshold = val1;
  } else {
    TORCH_CHECK(
        false, "Error, expecting garbage_collection_threshold value", "");
  }
  return i;
}

size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  bool first_value = true;

  if (++i < config.size()) {
    if (std::string_view(config[i]) == "[") {
      size_t last_index = 0;
      // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
      while (++i < config.size() && std::string_view(config[i]) != "]") {
        const std::string& val1 = config[i];
        size_t val2 = 0;

        consumeToken(config, ++i, ':');
        if (++i < config.size()) {
          val2 = stoi(config[i]);
        } else {
          TORCH_CHECK(
              false, "Error parsing roundup_power2_divisions value", "");
        }
        TORCH_CHECK(
            val2 == 0 || llvm::isPowerOf2_64(val2),
            "For roundups, the divisions has to be power of 2 or 0 to disable roundup ",
            "");

        if (std::string_view(val1) == ">") {
          std::fill(
              std::next(
                  m_roundup_power2_divisions.begin(),
                  static_cast<std::vector<unsigned long>::difference_type>(
                      last_index)),
              m_roundup_power2_divisions.end(),
              val2);
        } else {
          size_t val1_long = stoul(val1);
          TORCH_CHECK(
              llvm::isPowerOf2_64(val1_long),
              "For roundups, the intervals have to be power of 2 ",
              "");

          size_t index = 63 - llvm::countLeadingZeros(val1_long);
          index = std::max((size_t)0, index);
          index = std::min(index, m_roundup_power2_divisions.size() - 1);

          if (first_value) {
            std::fill(
                m_roundup_power2_divisions.begin(),
                std::next(
                    m_roundup_power2_divisions.begin(),
                    static_cast<std::vector<unsigned long>::difference_type>(
                        index)),
                val2);
            first_value = false;
          }
          if (index < m_roundup_power2_divisions.size()) {
            m_roundup_power2_divisions[index] = val2;
          }
          last_index = index;
        }

        if (std::string_view(config[i + 1]) != "]") {
          consumeToken(config, ++i, ',');
        }
      }
    } else { // Keep this for backwards compatibility
      size_t val1 = stoi(config[i]);
      TORCH_CHECK(
          llvm::isPowerOf2_64(val1),
          "For roundups, the divisions has to be power of 2 ",
          "");
      std::fill(
          m_roundup_power2_divisions.begin(),
          m_roundup_power2_divisions.end(),
          val1);
    }
  } else {
    TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
  }
  return i;
}

size_t CUDAAllocatorConfig::parseAllocatorConfig(
    const std::vector<std::string>& config,
    size_t i,
    bool& used_cudaMallocAsync) {
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
  // For ease of maintenance and understanding, the CUDA and ROCm
  // implementations of this function are separated. This avoids having many
  // #ifdef's throughout.
#ifdef USE_ROCM
  // Ease burden on ROCm users by allowing either cuda or hip tokens.
  // cuda token is broken up to prevent hipify matching it.
#define PYTORCH_TOKEN1 \
  "cud"                \
  "aMallocAsync"
#define PYTORCH_TOKEN2 "hipMallocAsync"
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
  tokenizer.checkToken(++i, ":");
  i++; // Move to the value after the colon
  TORCH_CHECK_VALUE(
      ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
       (tokenizer[i] == PYTORCH_TOKEN2)),
      "Unknown allocator backend, "
      "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
  if (m_is_allocator_loaded) {
    bool aync_allocator_at_runtime = (tokenizer[i] != "native");
    TORCH_CHECK(
        ((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
         (config[i] == PYTORCH_TOKEN2)),
        "Unknown allocator backend, "
        "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
    used_cudaMallocAsync =
        (config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
    TORCH_INTERNAL_ASSERT(
        config[i] == get()->name() ||
            (config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
        "Allocator backend parsed at runtime != "
        "allocator backend parsed at load time, ",
        config[i],
        aync_allocator_at_runtime == m_use_async_allocator,
        "Allocator async backend parsed at runtime != allocator async backend parsed at load time, ",
        aync_allocator_at_runtime,
        " != ",
        get()->name());
  } else {
    TORCH_CHECK(false, "Error parsing backend value", "");
        m_use_async_allocator);
  }
  m_use_async_allocator =
      (tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
  // CUDA allocator is always loaded at the start of the program
  m_is_allocator_loaded = true;

#if defined(CUDA_VERSION)
  if (m_use_async_allocator) {
#if CUDA_VERSION >= 11040
    int version = 0;
    C10_CUDA_CHECK(cudaDriverGetVersion(&version));
    TORCH_CHECK(
        version >= 11040,
        "backend:cudaMallocAsync requires CUDA runtime "
        "11.4 or newer, but cudaDriverGetVersion returned ",
        version);
#else
    TORCH_CHECK(
        false,
        "backend:cudaMallocAsync requires PyTorch to be built with "
        "CUDA 11.4 or newer, but CUDA_VERSION is ",
        CUDA_VERSION);
#endif
  }
#endif

  return i;
#undef PYTORCH_TOKEN1
#undef PYTORCH_TOKEN2
#else // USE_ROCM
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
    TORCH_CHECK(
        ((config[i] == "native") || (config[i] == "cudaMallocAsync")),
        "Unknown allocator backend, "
        "options are native and cudaMallocAsync");
    used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
    if (used_cudaMallocAsync) {
#if CUDA_VERSION >= 11040
      int version = 0;
      C10_CUDA_CHECK(cudaDriverGetVersion(&version));
      TORCH_CHECK(
          version >= 11040,
          "backend:cudaMallocAsync requires CUDA runtime "
          "11.4 or newer, but cudaDriverGetVersion returned ",
          version);
#else
      TORCH_CHECK(
          false,
          "backend:cudaMallocAsync requires PyTorch to be built with "
          "CUDA 11.4 or newer, but CUDA_VERSION is ",
          CUDA_VERSION);
#endif
    }
    TORCH_INTERNAL_ASSERT(
        config[i] == get()->name(),
        "Allocator backend parsed at runtime != "
        "allocator backend parsed at load time");
  } else {
    TORCH_CHECK(false, "Error parsing backend value", "");
  }
  return i;
#endif // USE_ROCM
}

void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
  // If empty, set the default values
  m_max_split_size = std::numeric_limits<size_t>::max();
  m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
  m_garbage_collection_threshold = 0;
  bool used_cudaMallocAsync = false;
  bool used_native_specific_option = false;

  if (!env.has_value()) {
    return;
  }
  {
    std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
    m_last_allocator_settings = env.value();
  }

  std::vector<std::string> config;
  lexArgs(env.value(), config);

  for (size_t i = 0; i < config.size(); i++) {
    std::string_view config_item_view(config[i]);
    if (config_item_view == "max_split_size_mb") {
      i = parseMaxSplitSize(config, i);
      used_native_specific_option = true;
    } else if (config_item_view == "max_non_split_rounding_mb") {
      i = parseMaxNonSplitRoundingSize(config, i);
      used_native_specific_option = true;
    } else if (config_item_view == "garbage_collection_threshold") {
      i = parseGarbageCollectionThreshold(config, i);
      used_native_specific_option = true;
    } else if (config_item_view == "roundup_power2_divisions") {
      i = parseRoundUpPower2Divisions(config, i);
      used_native_specific_option = true;
    } else if (config_item_view == "backend") {
      i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
    } else if (config_item_view == "expandable_segments") {
      used_native_specific_option = true;
      consumeToken(config, ++i, ':');
      ++i;
      TORCH_CHECK(
          i < config.size() &&
              (std::string_view(config[i]) == "True" ||
               std::string_view(config[i]) == "False"),
          "Expected a single True/False argument for expandable_segments");
      config_item_view = config[i];
      m_expandable_segments = (config_item_view == "True");
  c10::CachingAllocator::ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    const auto& key = tokenizer[i];
    if (key == "backend") {
      i = parseAllocatorConfig(tokenizer, i);
    } else if (
        // ROCm build's hipify step will change "cuda" to "hip", but for ease of
        // use, accept both. We must break up the string to prevent hipify here.
        config_item_view == "release_lock_on_hipmalloc" ||
        config_item_view ==
        key == "release_lock_on_hipmalloc" ||
        key ==
            "release_lock_on_c"
            "udamalloc") {
      used_native_specific_option = true;
      consumeToken(config, ++i, ':');
      ++i;
      TORCH_CHECK(
          i < config.size() &&
              (std::string_view(config[i]) == "True" ||
               std::string_view(config[i]) == "False"),
          "Expected a single True/False argument for release_lock_on_cudamalloc");
      config_item_view = config[i];
      m_release_lock_on_cudamalloc = (config_item_view == "True");
      tokenizer.checkToken(++i, ":");
      m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
    } else if (
        // ROCm build's hipify step will change "cuda" to "hip", but for ease of
        // use, accept both. We must break up the string to prevent hipify here.
        config_item_view == "pinned_use_hip_host_register" ||
        config_item_view ==
        key == "pinned_use_hip_host_register" ||
        key ==
            "pinned_use_c"
            "uda_host_register") {
      i = parsePinnedUseCudaHostRegister(config, i);
      i = parsePinnedUseCudaHostRegister(tokenizer, i);
      used_native_specific_option = true;
    } else if (config_item_view == "pinned_num_register_threads") {
      i = parsePinnedNumRegisterThreads(config, i);
      used_native_specific_option = true;
    } else if (config_item_view == "pinned_use_background_threads") {
      i = parsePinnedUseBackgroundThreads(config, i);
    } else if (key == "pinned_num_register_threads") {
      i = parsePinnedNumRegisterThreads(tokenizer, i);
      used_native_specific_option = true;
    } else {
      const auto& keys =
          c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
      TORCH_CHECK(
          false, "Unrecognized CachingAllocator option: ", config_item_view);
          keys.find(key) != keys.end(),
          "Unrecognized key '",
          key,
          "' in Accelerator allocator config.");
      i = tokenizer.skipKey(i);
    }

    if (i + 1 < config.size()) {
      consumeToken(config, ++i, ',');
    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ",");
    }
  }

  if (used_cudaMallocAsync && used_native_specific_option) {
  if (m_use_async_allocator && used_native_specific_option) {
    TORCH_WARN(
        "backend:cudaMallocAsync ignores max_split_size_mb,"
        "roundup_power2_divisions, and garbage_collect_threshold.");
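For context (not part of the diff): a configuration string of the kind this loop consumes, typically supplied through the `PYTORCH_CUDA_ALLOC_CONF` environment variable. The keys below all appear in the dispatch chain above; the values and the wrapper function are arbitrary examples.

```cpp
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <string>

void exampleParse() {
  // Each key is handled by a branch of the parse loop above.
  const std::string env =
      "backend:native,"
      "release_lock_on_cudamalloc:True,"
      "pinned_use_cuda_host_register:True,"
      "pinned_num_register_threads:8";
  c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(
      env);
}
```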
@@ -391,64 +121,33 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
}

size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
    const std::vector<std::string>& config,
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
    TORCH_CHECK(
        (config[i] == "True" || config[i] == "False"),
        "Expected a single True/False argument for pinned_use_cuda_host_register");
    m_pinned_use_cuda_host_register = (config[i] == "True");
  } else {
    TORCH_CHECK(
        false, "Error, expecting pinned_use_cuda_host_register value", "");
  }
  tokenizer.checkToken(++i, ":");
  m_pinned_use_cuda_host_register = tokenizer.toBool(++i);

  return i;
}

size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
    const std::vector<std::string>& config,
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
    size_t val2 = stoi(config[i]);
    TORCH_CHECK(
        llvm::isPowerOf2_64(val2),
        "Number of register threads has to be power of 2 ",
        "");
    auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
    TORCH_CHECK(
        val2 <= maxThreads,
        "Number of register threads should be less than or equal to " +
            std::to_string(maxThreads),
        "");
    m_pinned_num_register_threads = val2;
  } else {
    TORCH_CHECK(
        false, "Error, expecting pinned_num_register_threads value", "");
  }
  tokenizer.checkToken(++i, ":");
  size_t val2 = tokenizer.toSizeT(++i);
  TORCH_CHECK_VALUE(
      llvm::isPowerOf2_64(val2),
      "Number of register threads has to be power of 2 ",
      "");
  auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
  TORCH_CHECK_VALUE(
      val2 <= maxThreads,
      "Number of register threads should be less than or equal to " +
          std::to_string(maxThreads),
      "");
  m_pinned_num_register_threads = val2;
  return i;
}

size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  if (++i < config.size()) {
    TORCH_CHECK(
        (config[i] == "True" || config[i] == "False"),
        "Expected a single True/False argument for pinned_use_background_threads");
    m_pinned_use_background_threads = (config[i] == "True");
  } else {
    TORCH_CHECK(
        false, "Error, expecting pinned_use_background_threads value", "");
  }
  return i;
}

// General caching allocator utilities
void setAllocatorSettings(const std::string& env) {
  CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
}
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)

} // namespace c10::cuda::CUDACachingAllocator
Some files were not shown because too many files have changed in this diff.