Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-25 08:11:06 +08:00

Comparing move-theme...v1.6.0 (84 commits)
Commits included (SHA1 only):

b31f58de6f, 29fe90e2a2, 35ad2d8586, 994b37b36e, 8aa878fc93, 7c7c9c3aa6,
a2922f589d, 8acfecaecb, 860e18a61b, 8f804baaa9, a395e0903e, 2ca55430d2,
b8e77a42bd, 4081fdd3df, cefb9e0cd6, d9e9e0087a, 43d746305c, 9409e03903,
c9a1853d2f, 7fa9b2923b, 40bf15a8ac, c164fc4d7f, e0b7480f34, 89d7f194d8,
59bb44a8e8, 8f4d01d9f1, 77ffb25925, af9600b1f5, 83262b1ba1, f862a6ba4d,
f3c1ea7455, 2ed3ad2891, a857af50a4, d0045e5520, 0406b69b79, 6220cc4380,
eaf3f2fd34, c35b4c770b, 11baccf1b5, f0f0cbdd4a, 11b70b0041, 01e9562313,
3f13c9a2c8, 63a94c021a, 2b175ba909, 8c3f662224, 0ffdd5aa1d, d53427c541,
b44b1d868e, 9184c9832e, e89c4f0dec, ea273c68f9, 4dd37bfbf7, 2533b9da83,
c5c8a85a82, b4b8f5b9d4, 41816dc97f, 31d9776c04, ddea6c552f, 091537a764,
bf4d905ea1, 415e499330, eaf7dad5d6, 75a074abdc, dede34eab7, 0c90b6da5c,
4316199832, f993e5ac88, c5bd737f0c, fe45c2c986, a9996bb482, bdfcbfa18c,
ea1b0dba18, 6d85b2c989, 44f79651a7, 8682ac147b, 4cc605e80a, b0cce716f7,
0dc93ac119, bb848df10b, 2dc0b84aca, 168cddf5f1, bc8760b3db, 4269b9a8fc
```diff
@@ -36,6 +36,12 @@ CONFIG_TREE_DATA = [
                 ("libtorch", [XImportant(True)])
             ]),
         ]),
+        ("11.0", [
+            X("3.8"),
+            ("3.8", [
+                ("libtorch", [X(True)])
+            ]),
+        ]),
     ]),
     ("bionic", [
```
```diff
@@ -49,7 +49,8 @@ class Conf:

         cuda_parms = []
         if self.cuda_version:
-            cuda_parms.extend(["cuda" + self.cuda_version, "cudnn7"])
+            cudnn = "cudnn8" if self.cuda_version.startswith("11.") else "cudnn7"
+            cuda_parms.extend(["cuda" + self.cuda_version, cudnn])
         result = leading + ["linux", self.distro] + cuda_parms + self.parms
         if not for_docker and self.parms_list_ignored_for_docker_image is not None:
             result = result + self.parms_list_ignored_for_docker_image
@@ -222,8 +223,7 @@ def instantiate_configs():
        python_version = fc.find_prop("pyver")
        parms_list[0] = fc.find_prop("abbreviated_pyver")

-    if cuda_version in ["9.2", "10", "10.1", "10.2"]:
-        # TODO The gcc version is orthogonal to CUDA version?
+    if cuda_version:
         cuda_gcc_version = fc.find_prop("cuda_gcc_override") or "gcc7"
         parms_list.append(cuda_gcc_version)

```
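The cuDNN selection above is tiny, but it drives every generated build name downstream. Below is a minimal, self-contained sketch of that naming logic; `build_parms` is a hypothetical helper for illustration, not the real `Conf` method, which carries more state:

```python
# Hypothetical sketch of the naming logic the hunk introduces; the real code
# lives on the CircleCI config generator's Conf class.
def build_parms(distro, cuda_version, parms):
    cuda_parms = []
    if cuda_version:
        # CUDA 11.x docker images ship cuDNN 8; older CUDA stays on cuDNN 7.
        cudnn = "cudnn8" if cuda_version.startswith("11.") else "cudnn7"
        cuda_parms.extend(["cuda" + cuda_version, cudnn])
    return ["linux", distro] + cuda_parms + parms

# Joined with "-", these reproduce the job names added to the workflow below:
assert build_parms("xenial", "11.0", ["py3", "gcc7"]) == \
    ["linux", "xenial", "cuda11.0", "cudnn8", "py3", "gcc7"]
assert build_parms("xenial", "10.2", ["py3", "gcc7"]) == \
    ["linux", "xenial", "cuda10.2", "cudnn7", "py3", "gcc7"]
```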
```diff
@@ -958,6 +958,11 @@ jobs:
           no_output_timeout: "1h"
           command: |
             source "/pytorch/.circleci/scripts/binary_linux_build.sh"
+      - run:
+          name: Output binary sizes
+          no_output_timeout: "1m"
+          command: |
+            ls -lah /final_pkgs
       - run:
           name: save binary size
           no_output_timeout: "5m"
@@ -972,6 +977,9 @@ jobs:
             root: /
             paths: final_pkgs

+      - store_artifacts:
+          path: /final_pkgs
+
   # This should really just be another step of the binary_linux_build job above.
   # This isn't possible right now b/c the build job uses the docker executor
   # (otherwise they'd be really really slow) but this one uses the macine
@@ -7388,6 +7396,54 @@ workflows:
          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
          use_cuda_docker_runtime: "1"
          resource_class: gpu.medium
+      - pytorch_linux_build:
+          name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
+      - pytorch_linux_test:
+          name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
+          requires:
+            - pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
+      - pytorch_linux_build:
+          name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
+      - pytorch_linux_test:
+          name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
+          requires:
+            - pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
       - pytorch_linux_build:
           name: pytorch_linux_bionic_py3_6_clang9_build
           build_environment: "pytorch-linux-bionic-py3.6-clang9-build"
```
```diff
@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
 cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
 # build a FAT bianry
 cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
 for lib in ${target_libs[*]}
 do
     if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then
```
```diff
@@ -20,6 +20,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly}
 CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
 BACKUP_BUCKET="s3://pytorch-backup"

+retry pip install -q awscli
 # Upload the package to the final location
 pushd /home/circleci/project/final_pkgs
 if [[ "$PACKAGE_TYPE" == conda ]]; then
@@ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
   BACKUP_DIR="conda/${subdir}"
 elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  retry pip install -q awscli
   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   for pkg in $(ls); do
     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
   done
   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
 else
-  retry pip install -q awscli
   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -45,5 +44,5 @@ fi

 if [[ -n "${CIRCLE_TAG:-}" ]]; then
   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
-  retry aws s3 cp . "$s3_dir"
+  retry aws s3 cp --recursive . "$s3_dir"
 fi
```
```diff
@@ -21,6 +21,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly}
 CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
 BACKUP_BUCKET="s3://pytorch-backup"

+retry pip install -q awscli
 pushd "$workdir/final_pkgs"
 if [[ "$PACKAGE_TYPE" == conda ]]; then
   retry conda install -yq anaconda-client
@@ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
   BACKUP_DIR="conda/${subdir}"
 elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  retry pip install -q awscli
   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   for pkg in $(ls); do
     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
   done
   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
 else
-  retry pip install -q awscli
   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -45,5 +44,5 @@ fi

 if [[ -n "${CIRCLE_TAG:-}" ]]; then
   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
-  retry aws s3 cp . "$s3_dir"
+  retry aws s3 cp --recursive . "$s3_dir"
 fi
```
```diff
@@ -19,6 +19,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/}
 CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
 BACKUP_BUCKET="s3://pytorch-backup"

+retry pip install -q awscli
 pushd /root/workspace/final_pkgs
 # Upload the package to the final location
 if [[ "$PACKAGE_TYPE" == conda ]]; then
@@ -29,14 +30,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
   BACKUP_DIR="conda/${subdir}"
 elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  retry conda install -c conda-forge -yq awscli
   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   for pkg in $(ls); do
     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
   done
   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
 else
-  retry conda install -c conda-forge -yq awscli
   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -44,5 +43,5 @@ fi

 if [[ -n "${CIRCLE_TAG:-}" ]]; then
   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
-  retry aws s3 cp . "$s3_dir"
+  retry aws s3 cp --recursive . "$s3_dir"
 fi
```
```diff
@@ -41,6 +41,11 @@
           no_output_timeout: "1h"
           command: |
             source "/pytorch/.circleci/scripts/binary_linux_build.sh"
+      - run:
+          name: Output binary sizes
+          no_output_timeout: "1m"
+          command: |
+            ls -lah /final_pkgs
       - run:
           name: save binary size
           no_output_timeout: "5m"
@@ -55,6 +60,9 @@
             root: /
             paths: final_pkgs

+      - store_artifacts:
+          path: /final_pkgs
+
   # This should really just be another step of the binary_linux_build job above.
   # This isn't possible right now b/c the build job uses the docker executor
   # (otherwise they'd be really really slow) but this one uses the macine
```
```diff
@@ -181,7 +181,7 @@ fi

 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.6 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi

```
```diff
@@ -185,9 +185,9 @@ function get_exit_code() {
 function file_diff_from_base() {
   # The fetch may fail on Docker hosts, but it's not always necessary.
   set +e
-  git fetch origin master --quiet
+  git fetch origin release/1.6 --quiet
   set -e
-  git diff --name-only "$(git merge-base origin/master HEAD)" > "$1"
+  git diff --name-only "$(git merge-base origin/release/1.6 HEAD)" > "$1"
 }

 function get_bazel() {
```
```diff
@@ -289,7 +289,7 @@ test_backward_compatibility() {
   pushd test/backward_compatibility
   python dump_all_function_schemas.py --filename new_schemas.txt
   pip_uninstall torch
-  pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+  pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html
   python check_backward_compatibility.py --new-schemas new_schemas.txt
   popd
   set +x
@@ -341,8 +341,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then
 elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   test_bazel
 elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4* ]]; then
   # test cpp extension for xenial + cuda 9.2 + gcc 5.4 to make sure
   # cpp extension can be built correctly under this old env
   test_cpp_extensions
 else
   test_torchvision
```
```diff
@@ -1350,7 +1350,6 @@ filegroup(
         "caffe2/utils/smart_tensor_printer.cc",
         "caffe2/utils/string_utils.cc",
         "caffe2/utils/threadpool/ThreadPool.cc",
-        "caffe2/utils/threadpool/ThreadPoolMobile.cc",
         "caffe2/utils/threadpool/pthreadpool.cc",
         "caffe2/utils/threadpool/pthreadpool_impl.cc",
     ],
```
```diff
@@ -481,7 +481,7 @@ if(USE_PYTORCH_QNNPACK)
 endif()

 if(USE_XNNPACK)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK -DUSE_INTERNAL_THREADPOOL_IMPL")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK")
 endif()

 if(USE_VULKAN)
```
```diff
@@ -99,6 +99,7 @@ if(ANDROID_ABI)
   import_static_lib(libnnpack)
   import_static_lib(libXNNPACK)
   import_static_lib(libpytorch_qnnpack)
+  import_static_lib(libpthreadpool)
   import_static_lib(libeigen_blas)
   import_static_lib(libcpuinfo)
   import_static_lib(libclog)
@@ -115,6 +116,7 @@ if(ANDROID_ABI)
     libnnpack
     libXNNPACK
     libpytorch_qnnpack
+    libpthreadpool
     libeigen_blas
     libcpuinfo
     libclog
@@ -129,6 +131,7 @@ else()
     nnpack
     XNNPACK
     pytorch_qnnpack
+    pthreadpool
     cpuinfo
     clog
   )
```
```diff
@@ -8,8 +8,10 @@

 #include "pytorch_jni_common.h"
 #if defined(__ANDROID__)
-#include <caffe2/utils/threadpool/ThreadPool.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#ifndef USE_PTHREADPOOL
+#define USE_PTHREADPOOL
+#endif /* USE_PTHREADPOOL */
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #endif

 namespace pytorch_jni {
@@ -605,7 +607,7 @@ class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> {
   }

   static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) {
-    caffe2::mobile_threadpool()->setNumThreads(numThreads);
+    caffe2::pthreadpool()->set_thread_count(numThreads);
   }
 };
 #endif
```
```diff
@@ -56,6 +56,38 @@
     - THBoolTensor* mask
     - THTensor* source
 ]]
+[[
+  name: _th_masked_select
+  cname: maskedSelect
+  cpu_bool: True
+  cpu_bfloat16: True
+  variants:
+    - function
+  backends:
+    - CPU
+  return: argument 0
+  arguments:
+    - arg: THTensor* result
+      output: True
+    - THTensor* self
+    - THByteTensor* mask
+]]
+[[
+  name: _th_masked_select_bool
+  cname: maskedSelectBool
+  cpu_bool: True
+  cpu_bfloat16: True
+  variants:
+    - function
+  backends:
+    - CPU
+  return: argument 0
+  arguments:
+    - arg: THTensor* result
+      output: True
+    - THTensor* self
+    - THBoolTensor* mask
+]]
 [[
   name: _th_nonzero
   cname: nonzero
```
```diff
@@ -6,8 +6,7 @@
 #ifndef C10_MOBILE
 #include <c10/core/thread_pool.h>
 #else
-#include <caffe2/utils/threadpool/ThreadPool.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #endif // C10_MOBILE

 #include <atomic>
@@ -88,15 +87,15 @@ void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
   // Run the first task on the current thread directly.
   fn(0, 0);
 #else
-  caffe2::ThreadPool* pool = caffe2::mobile_threadpool();
-  if (pool) {
-    // caffe2::ThreadPool can utilize the current thread.
-    pool->run(fn, range);
-  } else {
-    for (size_t i = 0; i < range; ++i) {
-      fn(0, i);
-    }
-  }
+  caffe2::PThreadPool* const pool = caffe2::pthreadpool();
+  TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
+
+  pool->run(
+    // PThreadPool::run() is blocking.  A std::function [const] reference to
+    // this lambda cannot go out of scope before PThreadPool::run() returns.
+    [&fn](const size_t task_id) {
+      fn(0 /* unused */, task_id);
+    }, range);
 #endif // C10_MOBILE
 }

@@ -184,7 +183,7 @@ void init_num_threads() {
 #endif

 #ifdef C10_MOBILE
-  caffe2::mobile_threadpool();
+  caffe2::pthreadpool();
 #endif
 }

@@ -208,7 +207,9 @@ void set_num_threads(int nthreads) {
   }
 }
 #else
-  TORCH_CHECK(false, "set_num_threads is not supported for mobile.");
+  caffe2::PThreadPool* const pool = caffe2::pthreadpool();
+  TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
+  pool->set_thread_count(nthreads);
 #endif // C10_MOBILE
 }

@@ -226,9 +227,9 @@ int get_num_threads() {
   return _get_intraop_pool().size() + 1;
 }
 #else
-  caffe2::ThreadPool* pool = caffe2::mobile_threadpool();
-  // caffe2::ThreadPool::getNumThreads() counts the current thread.
-  return !pool || in_parallel_region() ? 1 /* current thread */ : pool->getNumThreads();
+  caffe2::PThreadPool* const pool = caffe2::pthreadpool();
+  TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!")
+  return in_parallel_region() ? 1 /* current thread */ : pool->get_thread_count();
 #endif // C10_MOBILE
 }

@@ -257,8 +258,8 @@ void intraop_launch(std::function<void()> func) {
   func();
 }
 #else
-  // TODO: caffe2::ThreadPool doesn't support submitting tasks separately and
-  // running in parallel. Should fix it when this API becomes popular.
+  // TODO: caffe2::PThreadPool only provides a data-parallel API.
+  // Task parallelism is not currently supported.
   func();
 #endif // C10_MOBILE
 }
@@ -280,8 +281,8 @@ std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
   }
   return future;
 #else
-  // TODO: caffe2::ThreadPool doesn't support submitting tasks separately and
-  // running in parallel. Should fix it when this API becomes popular.
+  // TODO: caffe2::PThreadPool only provides a data-parallel API.
+  // Task parallelism is not currently supported.
   auto future = std::make_shared<c10::ivalue::Future>(NoneType::get());
   func();
   future->markCompleted();
```
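The net effect of these threadpool changes is user-visible on mobile builds: `set_num_threads` now forwards to `caffe2::PThreadPool` instead of raising, and `get_num_threads` reads the pool's thread count. A quick sanity check through the Python API (a sketch of the intended behavior; on desktop builds the same calls configure the intra-op pool as before):

```python
import torch

# On a C10_MOBILE build this now configures caffe2::PThreadPool rather than
# raising "set_num_threads is not supported for mobile."
torch.set_num_threads(2)
print(torch.get_num_threads())  # 2
```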
```diff
@@ -135,6 +135,7 @@ UPTOb( bool , equal , (const Tensor &A, const Tensor &B) )
 UPTOb( Tensor, cat , (TensorList A, int64_t B) )
 UPTOb( Tensor, cat , (TensorList A, Dimname B) )
 UPTOb( Tensor, _cat , (TensorList A, int64_t B) )
+UPTOd( Tensor, index_put, (const Tensor &A, TensorList B, const Tensor & C, bool D) )
 UPTOb( Tensor, stack , (TensorList A, int64_t B) )

 #undef UPTOa
```
```diff
@@ -482,15 +482,16 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
   KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote)
   KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote)
   KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote)
-  KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote)
   KERNEL_UNBOXED_ONLY(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &), promote)
-  KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote)
-  KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote)
-  KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote)
   KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
   KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote)
   KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote)
+  KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote)
+  KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote)
+  KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote)
+  KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote)
   KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote)
+  KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote)

   m.impl_UNBOXED("binary_cross_entropy", &at::autocast::binary_cross_entropy_banned);
 }
```
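Adding `index_put` to the `promote` list means that, under autocast, an index assignment whose operands have mixed floating dtypes is promoted to the widest dtype instead of tripping a dtype-mismatch check. A rough, hedged illustration (assumes a CUDA device, since these registrations sit behind the Autocast dispatch key):

```python
import torch

if torch.cuda.is_available():
    x = torch.zeros(4, device="cuda")                      # float32
    vals = torch.ones(2, device="cuda", dtype=torch.half)  # float16
    idx = torch.tensor([0, 1], device="cuda")
    with torch.cuda.amp.autocast():
        x[idx] = vals  # index_put now promotes the half values to float32
    print(x)
```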
```diff
@@ -188,6 +188,7 @@ namespace c10 {
   _(prim, unchecked_unwrap_optional) \
   _(aten, __contains__) \
   _(prim, BailoutTemplate) \
+  _(prim, grad) \
   _(aten, zero_) \
   _(aten, fill_) \
   FORALL_ATEN_BASE_SYMBOLS(_) \
```
```diff
@@ -1481,7 +1481,7 @@ inline TypePtr TensorType::fromBoolType() {

 inline c10::optional<c10::ScalarType> tryScalarTypeFromJitType(const c10::TypePtr & type) {
   if (type == FloatType::get()) {
-    return at::ScalarType::Double;
+    return at::typeMetaToScalarType(c10::get_default_dtype());
   } else if (type == IntType::get()) {
     return at::ScalarType::Long;
   } else if (type == BoolType::get()) {
```
```diff
@@ -181,6 +181,10 @@ Allocator* CUDAHooks::getPinnedMemoryAllocator() const {
   return at::cuda::getPinnedMemoryAllocator();
 }

+Allocator* CUDAHooks::getCUDADeviceAllocator() const {
+  return at::cuda::getCUDADeviceAllocator();
+}
+
 bool CUDAHooks::compiledWithCuDNN() const {
   return AT_CUDNN_ENABLED();
 }
```
```diff
@@ -22,6 +22,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
   int64_t current_device() const override;
   bool hasPrimaryContext(int64_t device_index) const override;
   c10::optional<int64_t> getDevceIndexWithPrimaryContext() const override;
+  Allocator* getCUDADeviceAllocator() const override;
   Allocator* getPinnedMemoryAllocator() const override;
   bool compiledWithCuDNN() const override;
   bool compiledWithMIOpen() const override;
```
```diff
@@ -16,10 +16,15 @@ void destroyCuDNNHandle(cudnnHandle_t handle) {
 // happens in fbcode setting. @colesbury and I decided to not destroy
 // the handle as a workaround.
 //   - @soumith
-#ifdef NO_CUDNN_DESTROY_HANDLE
-#else
-    cudnnDestroy(handle);
-#endif
+//
+// Further note: this is now disabled globally, because we are seeing
+// the same issue as mentioned above in CUDA 11 CI.
+//   - @zasdfgbnm
+//
+// #ifdef NO_CUDNN_DESTROY_HANDLE
+// #else
+//   cudnnDestroy(handle);
+// #endif
 }

 using CudnnPoolType = at::cuda::DeviceThreadHandlePool<cudnnHandle_t, createCuDNNHandle, destroyCuDNNHandle>;
```
```diff
@@ -121,6 +121,10 @@ struct CAFFE2_API CUDAHooksInterface {
     TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
   }

+  virtual Allocator* getCUDADeviceAllocator() const {
+    TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP);
+  }
+
   virtual bool compiledWithCuDNN() const {
     return false;
   }
```
```diff
@@ -262,9 +262,7 @@ auto ConvParams::use_xnnpack(
     const at::Tensor& input,
     const at::Tensor& weight,
     const at::Tensor& bias) const -> bool {
-// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
-// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
-#if defined(C10_MOBILE) && !defined(__APPLE__)
+#if defined(C10_MOBILE)
   if (!transposed) {
     return (input.size(1) == groups) &&
            xnnpack::use_convolution2d(
```
```diff
@@ -22,6 +22,32 @@ Tensor & masked_scatter__cpu(Tensor& self, const Tensor & mask, const Tensor & s
   }
 }

+Tensor masked_select_cpu(const Tensor & self, const Tensor & mask) {
+  namedinference::compute_broadcast_outnames(self, mask);
+
+  Tensor b_self, b_mask;
+  std::tie(b_self, b_mask) = expand_outplace(self, mask, "masked_select");
+  if (b_mask.dtype() == at::ScalarType::Byte) {
+    TORCH_WARN("masked_select received a mask with dtype torch.uint8, this behavior is now deprecated," \
+            "please use a mask with dtype torch.bool instead.");
+    return legacy::cpu::_th_masked_select(b_self, b_mask);
+  } else {
+    return legacy::cpu::_th_masked_select_bool(b_self, b_mask);
+  }
+}
+
+Tensor & masked_select_out_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
+  namedinference::compute_broadcast_outnames(self, mask);
+
+  Tensor b_self, b_mask;
+  std::tie(b_self, b_mask) = expand_outplace(self, mask, "masked_select_out");
+  if (b_mask.dtype() == at::ScalarType::Bool) {
+    return legacy::cpu::_th_masked_select_bool_out(result, b_self, b_mask);
+  } else {
+    return legacy::cpu::_th_masked_select_out(result, b_self, b_mask);
+  }
+}
+
 Tensor argsort(const Tensor & self, int64_t dim, bool descending) {
   return std::get<1>(at::sort(self, dim, descending));
 }
```
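The restored legacy path keeps byte masks working while steering users toward bool masks; the warning text in the hunk above is exactly what surfaces in Python:

```python
import torch

x = torch.tensor([1., 2., 3.])
mask = torch.tensor([True, False, True])
print(torch.masked_select(x, mask))  # tensor([1., 3.])

# A uint8 mask still dispatches (via _th_masked_select) but warns:
# "masked_select received a mask with dtype torch.uint8, this behavior
#  is now deprecated, please use a mask with dtype torch.bool instead."
print(torch.masked_select(x, mask.to(torch.uint8)))
```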
```diff
@@ -17,9 +17,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
   if (input.is_mkldnn()) {
     return at::mkldnn_linear(input, weight, bias);
   }
-// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
-// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
-#if defined(C10_MOBILE) && !defined(__APPLE__)
+#if defined(C10_MOBILE)
   if (xnnpack::use_linear(input, weight, bias)) {
     return xnnpack::linear(input, weight, bias);
   }
```
```diff
@@ -58,8 +58,9 @@ bool _nnpack_available() {

 #include <nnpack.h>

-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <ATen/native/ConvUtils.h>
+#include <ATen/Parallel.h>

 namespace at {
 namespace native {
@@ -87,15 +88,9 @@ static bool init_nnpack() {
 }

 static pthreadpool_t nnpack_threadpool() {
-  // Try initializing a threadpool for NNPACK's use. If we fail to
-  // successfully initialize an implementation, return nullptr which will
-  // instruct NNPACK to run single threaded.
-
 #ifdef C10_MOBILE
-  // If building for mobile, use Caffe 2's mobile-friendly threadpool.
-  return caffe2::mobile_pthreadpool();
+  return caffe2::pthreadpool_();
 #else
-  // Otherwise, try using pthreadpool if we manage to initialize it successfully.
   static pthreadpool_t nnpack_threadpool_ = nullptr;
   static bool called_nnpack_threadpool_ = false;

```
```diff
@@ -135,9 +135,7 @@ Tensor max_pool2d(
       self, kernel_size, stride, padding, dilation, ceil_mode);
   }

-// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
-// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
-#if defined(C10_MOBILE) && !defined(__APPLE__)
+#if defined(C10_MOBILE)
   if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride,
                               dilation, ceil_mode)) {
     return xnnpack::max_pool2d(
```
```diff
@@ -34,12 +34,17 @@ static void scatter_gather_dtype_check(
 // Test:
 // 1. index.size(d) == self.size(d) for all d != dim
 // 2. index.size(d) <= src.size(d) for all d != dim
+// 3. index.dim() == self.dim() == src.dim()
 static void gather_shape_check(const Tensor& self, int64_t dim,
   const Tensor& index, const Tensor& src
 ) {
   auto self_dims = ensure_nonempty_dim(self.dim());

   TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as out tensor"
+  );
+
+  auto src_dims = ensure_nonempty_dim(src.dim());
+  TORCH_CHECK(src_dims == ensure_nonempty_dim(index.dim()),
     "Index tensor must have the same number of dimensions as input tensor"
   );

@@ -66,10 +71,16 @@ static void gather_shape_check(const Tensor& self, int64_t dim,
 // Tests:
 // 1. index.size(d) <= self.size(d) for all d != dim
 // 2. index.size(d) <= src.size(d) for all d if src is a Tensor
+// 3. index.dim() == self.dim() == src.dim()
 static void scatter_shape_check(
   const Tensor& self, int64_t dim, const Tensor& index,
   const c10::optional<Tensor>& src_opt = c10::nullopt
 ) {
+  TORCH_CHECK(
+    ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as self tensor"
+  );
+
   bool is_wrong_shape = false;
   int64_t self_dims = ensure_nonempty_dim(self.dim());

@@ -97,6 +108,12 @@ static void scatter_shape_check(

   if (src_opt.has_value()) {
     auto src = src_opt.value();
+
+    TORCH_CHECK(
+      ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()),
+      "Index tensor must have the same number of dimensions as src tensor"
+    );
+
   TORCH_CHECK(!is_wrong_shape,
     "Expected index ", index.sizes(),
     " to be smaller than self ", self.sizes(),
```
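The new dimensionality checks turn a class of silent shape mismatches into immediate errors. A sketch of the observable behavior: a 1-D index against a 2-D input now fails up front with the message added above:

```python
import torch

x = torch.zeros(2, 3)
idx = torch.zeros(2, dtype=torch.long)  # 1-D index, 2-D input
try:
    torch.gather(x, 0, idx)
except RuntimeError as e:
    # "Index tensor must have the same number of dimensions as input tensor"
    print(e)
```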
```diff
@@ -71,8 +71,6 @@ DEFINE_DISPATCH(index_put_stub);
 DEFINE_DISPATCH(index_put_accum_stub);
 DEFINE_DISPATCH(masked_fill_stub);
 REGISTER_NO_CPU_DISPATCH(index_put_accum_stub, index_put_accum_fn);
-DEFINE_DISPATCH(masked_select_serial_stub);
-DEFINE_DISPATCH(masked_select_stub);

 DEFINE_DISPATCH(gather_stub);
 DEFINE_DISPATCH(scatter_stub);
@@ -629,82 +627,6 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Tensor & sour
   return result;
 }

-static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
-  NoNamesGuard guard;
-
-  TORCH_CHECK(mask.scalar_type() == ScalarType::Byte || mask.scalar_type() == ScalarType::Bool,
-              "masked_select: expected BoolTensor or ByteTensor for mask");
-  TORCH_CHECK(self.scalar_type() == result.scalar_type(),
-              "masked_select(): self and result must have the same scalar type");
-
-  if (mask.dtype() == at::ScalarType::Byte) {
-    TORCH_WARN("masked_select received a mask with dtype torch.uint8, this behavior is now deprecated," \
-            "please use a mask with dtype torch.bool instead.");
-  }
-
-  Tensor _mask, _self;
-  std::tie(_mask, _self) = expand_outplace(mask, self);
-
-  auto shape = _self.sizes();
-  int64_t numel = _mask.sum().item().toLong();
-  result.resize_({numel});
-  if (numel == 0) {
-    return result;
-  }
-
-  // Create strided view of result before feeding into TensorIterator
-  auto strides = DimVector(shape.size(), 0);
-  auto result_strided = result.as_strided(shape, strides);
-
-  // serial kernel
-  bool use_serial_kernel = self.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
-  if (use_serial_kernel) {
-    auto iter = TensorIteratorConfig()
-      .check_all_same_dtype(false)
-      .resize_outputs(false)
-      .add_output(result_strided)
-      .add_input(_self)
-      .add_input(_mask)
-      .build();
-
-    masked_select_serial_stub(iter.device_type(), iter);
-    return result;
-  }
-
-  // Use a prefix sum to record the output locations of the masked elements,
-  // so as to parallel with TensorIterator.
-  auto mask_long = at::empty(shape, self.options().dtype(at::kLong)).copy_(_mask);
-  auto mask_prefix_sum = at::empty(shape, self.options().dtype(at::kLong));
-  auto mask_long_data = mask_long.data_ptr<int64_t>();
-  auto mask_prefix_sum_data = mask_prefix_sum.data_ptr<int64_t>();
-  // TODO: Here can only use std::partial_sum for C++14,
-  // use std::exclusive_scan when PyTorch upgrades to C++17, which have better peformance.
-  // std::exclusive_scan(mask_long_data, mask_long_data + mask_long.numel(), mask_prefix_sum_data, 0);
-  std::partial_sum(mask_long_data, mask_long_data + mask_long.numel(), mask_prefix_sum_data);
-
-  auto iter = TensorIteratorConfig()
-    .check_all_same_dtype(false)
-    .resize_outputs(false)
-    .add_output(result_strided)
-    .add_input(_self)
-    .add_input(_mask)
-    .add_input(mask_prefix_sum)
-    .build();
-
-  masked_select_stub(iter.device_type(), iter);
-  return result;
-}
-
-Tensor & masked_select_out_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
-  namedinference::compute_broadcast_outnames(self, mask);
-  return masked_select_out_impl_cpu(result, self, mask);
-}
-
-Tensor masked_select_cpu(const Tensor & self, const Tensor & mask) {
-  Tensor result = at::empty({0}, self.options());
-  return masked_select_out_cpu(result, self, mask);
-}
-
 Tensor _gather_sparse_backward(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& grad){
 // special case scalar input and/or index
     if (self.ndimension() == 0) return at::_sparse_coo_tensor_unsafe(at::empty({0,grad.numel()}, index.options()), grad, self.sizes());
```
```diff
@@ -15,7 +15,6 @@ using index_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRe
 using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate);
 using index_put_accum_fn = void(*)(Tensor &, TensorList , const Tensor &, bool unsafe);
 using masked_fill_fn = void(*)(TensorIterator &, Scalar scalar);
-using masked_select_fn = void(*)(TensorIterator &);

 using gather_fn = void (*)(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
 using scatter_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
@@ -26,8 +25,6 @@ DECLARE_DISPATCH(index_fn, index_stub);
 DECLARE_DISPATCH(index_put_fn, index_put_stub);
 DECLARE_DISPATCH(index_put_accum_fn, index_put_accum_stub);
 DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub);
-DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub);
-DECLARE_DISPATCH(masked_select_fn, masked_select_stub);

 DECLARE_DISPATCH(gather_fn, gather_stub);
 DECLARE_DISPATCH(scatter_fn, scatter_stub);
```
```diff
@@ -355,13 +355,12 @@ TensorOptions infer_full_options(

   if (!options.has_dtype()) {
     if (fill_value.isIntegral(true)) {
-      TORCH_WARN_ONCE(
-        "Deprecation warning: In a future PyTorch release torch.full ",
-        "will no longer return tensors of floating dtype by default. ",
-        "Instead, a bool fill_value will return a tensor of torch.bool dtype, ",
-        "and an integral fill_value will return a tensor of torch.long dtype. ",
-        "Set the optional `dtype` or `out` arguments to suppress this warning."
-      );
+      TORCH_CHECK(false,
+        "Providing a bool or integral fill value without setting the optional ",
+        "`dtype` or `out` arguments is currently unsupported. In PyTorch 1.7, ",
+        "when `dtype` and `out` are not set a bool fill value will ",
+        "return a tensor of torch.bool dtype, and an integral fill value ",
+        "will return a tensor of torch.long dtype.");
     } else if (fill_value.isComplex()) {
       auto scalar_type = (get_default_dtype() == ScalarType::Double) ?
         ScalarType::ComplexDouble :
```
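Replacing the deprecation warning with `TORCH_CHECK(false, ...)` makes dtype inference for bool and integral fill values a hard error, ahead of the semantics change announced for 1.7. In Python terms (a sketch of the resulting behavior):

```python
import torch

torch.full((2,), 7, dtype=torch.long)  # fine: dtype is explicit
torch.full((2,), 7.0)                  # fine: float fill values are unaffected
try:
    torch.full((2,), 7)  # integral fill without dtype: now raises
except RuntimeError as e:
    print(e)
```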
```diff
@@ -706,8 +706,9 @@ TensorIterator TensorIterator::unary_op(Tensor& out, const Tensor& a,
     .set_check_mem_overlap(check_mem_overlap)
     .add_output(out)
     .add_input(a)
-    .cast_common_dtype_to_outputs(true)
-    .enforce_safe_casting_to_output(true)
+    .cast_common_dtype_to_outputs(false)
+    .enforce_safe_casting_to_output(false)
+    .check_all_same_dtype(true)
     .build();
 }

```
```diff
@@ -762,7 +762,12 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

   Tensor xtensor = self.expand(padded_size);

-  Tensor result = at::empty(target_size, self.options());
+  Tensor result;
+  if (self.is_quantized()) {
+    result = at::empty_quantized(target_size, self);
+  } else {
+    result = at::empty(target_size, self.options());
+  }

   // return an empty tensor if one of the repeat dimensions is zero
   if (zero_tensor) {
```
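`repeat` previously allocated its result with `at::empty(..., self.options())`, which drops the quantizer; routing quantized inputs through the new `empty_quantized` keeps the output quantized with the input's parameters. Sketched from the Python side:

```python
import torch

q = torch.quantize_per_tensor(
    torch.tensor([1.0, 2.0]), scale=0.1, zero_point=0, dtype=torch.quint8)
r = q.repeat(3)
print(r.dtype)      # torch.quint8
print(r.q_scale())  # 0.1, inherited from the input's quantization params
```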
```diff
@@ -67,7 +67,7 @@ static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, co

   // Copies the complex result to the actual result and returns it
   result.resize_(complex_result.sizes());
-  result.copy_(complex_result);
+  result.copy_(at::real(complex_result));
   return result;
 }

```
@ -163,90 +163,11 @@ void masked_fill_kernel(TensorIterator& iter, Scalar value) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t, typename mask_t, typename func_t>
|
|
||||||
void cpu_masked_select_serial_kernel(TensorIterator& iter, const func_t& f) {
|
|
||||||
auto is_mask_bool = std::is_same<mask_t, bool>::value;
|
|
||||||
int64_t offset = 0;
|
|
||||||
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
|
|
||||||
char* dst = data[0];
|
|
||||||
char* src = data[1];
|
|
||||||
char* mask = data[2];
|
|
||||||
for (int64_t i = 0; i < n; i++) {
|
|
||||||
mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
|
|
||||||
if (!is_mask_bool) {
|
|
||||||
TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
|
|
||||||
}
|
|
||||||
if (mask_value) {
|
|
||||||
int64_t offset_bytes = offset * sizeof(scalar_t);
|
|
||||||
f(dst, src + strides[1] * i, offset_bytes);
|
|
||||||
offset++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
iter.serial_for_each(loop, {0, iter.numel()});
|
|
||||||
}
|
|
||||||
|
|
||||||
void masked_select_serial_kernel(TensorIterator& iter) {
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16,
|
|
||||||
iter.dtype(), "masked_select", [&] {
|
|
||||||
auto mask_dtype = iter.input_dtype(1);
|
|
||||||
if (mask_dtype == at::ScalarType::Bool) {
|
|
||||||
cpu_masked_select_serial_kernel<scalar_t, bool>(iter, [](char* dst, char* src, int64_t offset) {
|
|
||||||
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
cpu_masked_select_serial_kernel<scalar_t, unsigned char>(iter, [](char* dst, char* src, int64_t offset) {
|
|
||||||
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename scalar_t, typename mask_t, typename func_t>
|
|
||||||
void cpu_masked_select_kernel(TensorIterator& iter, const func_t& f) {
|
|
||||||
auto is_mask_bool = std::is_same<mask_t, bool>::value;
|
|
||||||
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
|
|
||||||
char* dst = data[0];
|
|
||||||
char* src = data[1];
|
|
||||||
char* mask = data[2];
|
|
||||||
char* mask_prefix_sum = data[3];
|
|
||||||
for (int64_t i = 0; i < n; i++) {
|
|
||||||
mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
|
|
||||||
if (!is_mask_bool) {
|
|
||||||
TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
|
|
||||||
}
|
|
||||||
if (mask_value) {
|
|
||||||
int64_t offset = *(int64_t*)(mask_prefix_sum + strides[3] * i);
|
|
||||||
int64_t offset_bytes = (offset - 1) * sizeof(scalar_t);
|
|
||||||
f(dst, src + strides[1] * i, offset_bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
iter.for_each(loop);
|
|
||||||
}
|
|
||||||
|
|
||||||
void masked_select_kernel(TensorIterator& iter) {
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16,
|
|
||||||
iter.dtype(), "masked_select", [&] {
|
|
||||||
auto mask_dtype = iter.input_dtype(1);
|
|
||||||
if (mask_dtype == at::ScalarType::Bool) {
|
|
||||||
cpu_masked_select_kernel<scalar_t, bool>(iter, [](char* dst, char* src, int64_t offset) {
|
|
||||||
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
cpu_masked_select_kernel<scalar_t, unsigned char>(iter, [](char* dst, char* src, int64_t offset) {
|
|
||||||
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
|
|
||||||
REGISTER_DISPATCH(index_stub, &index_kernel);
|
REGISTER_DISPATCH(index_stub, &index_kernel);
|
||||||
REGISTER_DISPATCH(index_put_stub, &index_put_kernel);
|
REGISTER_DISPATCH(index_put_stub, &index_put_kernel);
|
||||||
REGISTER_DISPATCH(masked_fill_stub, &masked_fill_kernel);
|
REGISTER_DISPATCH(masked_fill_stub, &masked_fill_kernel);
|
||||||
REGISTER_DISPATCH(masked_select_serial_stub, &masked_select_serial_kernel);
|
|
||||||
REGISTER_DISPATCH(masked_select_stub, &masked_select_kernel);
|
|
||||||
|
|
||||||
}} // namespace at::native
|
}} // namespace at::native
|
||||||
|
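The parallel kernel deleted above fills each output slot through an inclusive prefix sum of the mask (hence the `offset - 1`), while the serial kernel keeps a running counter. A minimal Python sketch of the prefix-sum indexing, with illustrative names that are not from the kernel:

    def masked_select_sketch(src, mask):
        # mask_prefix_sum[i] counts mask hits in mask[0..i] inclusive,
        # so the matching output slot is mask_prefix_sum[i] - 1.
        mask_prefix_sum, total = [], 0
        for m in mask:
            total += m
            mask_prefix_sum.append(total)
        out = [None] * total
        for i, m in enumerate(mask):
            if m:
                out[mask_prefix_sum[i] - 1] = src[i]
        return out

    assert masked_select_sketch([10, 20, 30], [1, 0, 1]) == [10, 30]

Because each output slot is computed independently of the others, the loop body can run over arbitrary sub-ranges in parallel, which is why the parallel variant needs the precomputed prefix sum at all.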
@@ -1127,6 +1127,12 @@
   variants: method
   device_guard: False
 
+- func: empty_quantized(int[] size, Tensor qtensor) -> Tensor
+  variants: function
+  dispatch:
+    QuantizedCPU: empty_quantized
+    QuantizedCUDA: empty_quantized
+
 - func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
   device_guard: False
 
@@ -5108,6 +5114,8 @@
   dispatch:
     CPU: unfold
     CUDA: unfold
+    QuantizedCPU: unfold
+    QuantizedCUDA: unfold
 
 - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor
   variants: function
@@ -76,5 +76,28 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub(
   TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8");
 }
 
+// Create an empty quantized Tensor with size, based on the options
+// and quantization parameters of the input quantized Tensor
+Tensor empty_quantized(IntArrayRef size, const Tensor& qtensor) {
+  Tensor output;
+  if (qtensor.qscheme() == kPerTensorAffine) {
+    output = at::_empty_affine_quantized(size, qtensor.options(),
+                                         qtensor.q_scale(),
+                                         qtensor.q_zero_point());
+  } else if (qtensor.qscheme() == kPerChannelAffine) {
+    output = at::_empty_per_channel_affine_quantized(
+        size,
+        qtensor.q_per_channel_scales(),
+        qtensor.q_per_channel_zero_points(),
+        qtensor.q_per_channel_axis(),
+        qtensor.options());
+  } else {
+    TORCH_CHECK(false,
+                "QScheme not supported by empty_quantized:",
+                toString(qtensor.qscheme()));
+  }
+  return output;
+}
+
 } // namespace native
 } // namespace at
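The new at::empty_quantized factory is what the repeat() change at the top of this diff relies on: the fresh output carries the input's quantization parameters instead of being a plain empty tensor. A hedged Python-level illustration of the observable effect:

    import torch

    x = torch.quantize_per_tensor(
        torch.tensor([1.0, 2.0]), scale=0.1, zero_point=0, dtype=torch.quint8)
    y = x.repeat(3)  # output allocated with the same qparams as x
    assert y.q_scale() == x.q_scale() and y.q_zero_point() == x.q_zero_point()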
@@ -5,7 +5,7 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/native/quantized/cpu/quantized_ops.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <c10/util/math_compat.h>
 
 #include <algorithm>
@@ -375,7 +375,7 @@ Tensor qnnpack_avg_pool2d(
   CAFFE_ENFORCE(
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK Average Pooling operator");
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
   TORCH_INTERNAL_ASSERT(
@@ -5,7 +5,6 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/native/quantized/cpu/quantized_ops.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
 #include <c10/util/math_compat.h>
 
 #include <algorithm>
@@ -7,7 +7,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -194,7 +194,7 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK Add operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
 
@@ -8,7 +8,7 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <c10/core/TensorOptions.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -82,7 +82,7 @@ Tensor quantized_channel_shuffle_impl(
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK ChannelShuffle operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
   TORCH_INTERNAL_ASSERT(
@@ -7,7 +7,7 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/quantized/Quantizer.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -64,7 +64,7 @@ Tensor qnnpack_clamp(Tensor input, Scalar min, Scalar max) {
   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                         "failed to setup QNNPACK Clamp operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(clamp_op, threadpool);
@@ -10,7 +10,7 @@
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/native/quantized/cpu/quant_utils.h>
 #include <ATen/native/quantized/cpu/conv_packed_params.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 template <int kSpatialDim = 2>
 bool ConvDimChecks(
@@ -603,7 +603,7 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       output_min,
       output_max,
       reinterpret_cast<uint8_t*>(output.template data_ptr<c10::quint8>()),
-      caffe2::mobile_pthreadpool());
+      caffe2::pthreadpool_());
 
   TORCH_INTERNAL_ASSERT(
       run_status == pytorch_qnnp_status_success,
@@ -5,7 +5,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -57,7 +57,7 @@ Tensor qnnpack_hardsigmoid(Tensor input) {
   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                         "failed to setup QNNPACK Hardsigmoid operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(hardsigmoid_op, threadpool);
@@ -5,7 +5,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -51,7 +51,7 @@ Tensor qnnpack_hardswish(const Tensor& qx, Tensor& qy) {
   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                         "failed to setup QNNPACK Hardswish operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(hardswish_op, threadpool);
@@ -4,7 +4,7 @@
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/packed_params.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <torch/custom_class.h>
 #include <torch/library.h>
 
@@ -341,7 +341,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
       packB->getPackedWeights(),
       (uint8_t*)output.data_ptr<c10::quint8>(),
       rows_w /* output_stride */,
-      caffe2::mobile_pthreadpool() /* threadpool */);
+      // TODO (Ashkan): Disabling temporarily.
+      // Throws a floating point exception with OSS pthreadpool.
+      caffe2::pthreadpool_() /* threadpool */);
 
   TORCH_INTERNAL_ASSERT(
       runStatus == pytorch_qnnp_status_success,
@@ -5,7 +5,7 @@
 #include <ATen/native/quantized/cpu/packed_params.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/native/quantized/cpu/quant_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <torch/library.h>
 
 #include <torch/custom_class.h>
@@ -241,8 +241,17 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
 
   // Calculate statistics for quantization of input Tensor
   // TODO: optimized kernel
-  float x_min = input_contig.min().item<float>();
-  float x_max = input_contig.max().item<float>();
+  float x_min;
+  float x_max;
+  if (input.numel() > 0) {
+    x_min = input_contig.min().item<float>();
+    x_max = input_contig.max().item<float>();
+  } else {
+    // On empty input, no output data will be generated,
+    // so use arbitrary qparams.
+    x_min = 0;
+    x_max = 0;
+  }
 
   auto q_params = quant_utils::ChooseQuantizationParams(
       /*min=*/x_min,
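The guard above matters because min() and max() throw on an empty tensor, and with zero elements any quantization parameters are acceptable. Roughly, ChooseQuantizationParams derives the scale from the covered range along these lines (a simplification for illustration; the real routine also nudges the zero point and handles several edge cases):

    def choose_qparams_sketch(x_min, x_max, qmin=0, qmax=255):
        # The range must include zero so that zero is exactly representable.
        x_min, x_max = min(x_min, 0.0), max(x_max, 0.0)
        scale = (x_max - x_min) / (qmax - qmin) or 1.0  # degenerate range
        zero_point = round(qmin - x_min / scale)
        return scale, zero_point

    print(choose_qparams_sketch(0, 0))  # empty-input case: (1.0, 0)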
@@ -327,7 +336,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
       bias_ptr,
       output.data_ptr<float>(),
       rows_w /* output_stride */,
-      caffe2::mobile_pthreadpool() /* threadpool */);
+      caffe2::pthreadpool_() /* threadpool */);
 
   TORCH_INTERNAL_ASSERT(
       runStatus == pytorch_qnnp_status_success,
@@ -100,6 +100,12 @@ enum pytorch_qnnp_status qnnpackLinearDynamic(
       .ukernel = pytorch_qnnp_params.q8conv.gemm_dq,
   };
 
+  if (output_size == 0) {
+    // pthreadpool can tolerate a range of 0, but not a tile of 0.
+    // We use output_size as a tile size, so bail here if it's 0.
+    return pytorch_qnnp_status_success;
+  }
+
   pthreadpool_compute_4d_tiled(
       threadpool,
       (pthreadpool_function_4d_tiled_t)compute_q8gemm_dq,
@@ -98,6 +98,12 @@ enum pytorch_qnnp_status qnnpackLinear(
       .ukernel = pytorch_qnnp_params.q8conv.gemm,
   };
 
+  if (output_size == 0) {
+    // pthreadpool can tolerate a range of 0, but not a tile of 0.
+    // We use output_size as a tile size, so bail here if it's 0.
+    return pytorch_qnnp_status_success;
+  }
+
   pthreadpool_compute_4d_tiled(
       threadpool,
       (pthreadpool_function_4d_tiled_t) compute_q8gemm,
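The early return is needed because pthreadpool derives a tile count by ceiling division over the tile size, and these entry points pass output_size as the tile size. A sketch of the failing arithmetic (an assumption about the threadpool internals, not pthreadpool's actual code):

    def divide_round_up(range_, tile):
        return (range_ + tile - 1) // tile  # ZeroDivisionError when tile == 0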
@@ -9,7 +9,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 #include <vector>
@@ -346,7 +346,7 @@ void check_maxpool2d_params(
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK MaxPool operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
   TORCH_INTERNAL_ASSERT(
@@ -3,7 +3,7 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 namespace at {
 namespace native {
@@ -66,7 +66,7 @@ Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim) {
   CAFFE_ENFORCE(
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK Global Average Pooling operator");
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
   TORCH_INTERNAL_ASSERT(
@@ -6,7 +6,7 @@
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
 #include <ATen/native/quantized/cpu/quantized_ops.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <torch/library.h>
 
 #include <algorithm>
@@ -69,7 +69,7 @@ Tensor qnnpack_relu(Tensor input) {
       setupStatus == pytorch_qnnp_status_success,
       "failed to setup QNNPACK Relu operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
@@ -7,7 +7,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -66,7 +66,7 @@ Tensor qnnpack_sigmoid(Tensor input) {
   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                         "failed to setup QNNPACK sigmoid operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(sigmoid_op, threadpool);
@@ -7,7 +7,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
 #include <ATen/native/quantized/cpu/qnnpack_utils.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 #include <algorithm>
 
@@ -64,7 +64,7 @@ Tensor qnnpack_tanh(Tensor input) {
   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                         "failed to setup QNNPACK TanH operator");
 
-  pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
+  pthreadpool_t threadpool = caffe2::pthreadpool_();
 
   const pytorch_qnnp_status runStatus =
       pytorch_qnnp_run_operator(tanh_op, threadpool);
@@ -5,7 +5,7 @@
 #ifdef USE_XNNPACK
 
 #include <xnnpack.h>
-#include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h>
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 
 namespace at {
 namespace native {
@@ -208,15 +208,15 @@ Tensor run(
       padded_input_nhwc.size(Layout::Activation4D::width), // input_width
       padded_input_nhwc.data_ptr<float>(), // input
       output.data_ptr<float>(), // output
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_convolution2d_nhwc_f32 failed!");
 
   const xnn_status run_status = xnn_run_operator(
       context.op.get(), // operator
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_INTERNAL_ASSERT(
       xnn_status_success == run_status,
@@ -137,15 +137,15 @@ Tensor run(
       Layout::ActivationND::batch(padded_input.sizes()), // Batch,
       padded_input.data_ptr<float>(), // input
       output.data_ptr<float>(), // output
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_fully_connected_nc_f32 failed!");
 
   const xnn_status run_status = xnn_run_operator(
       context.op.get(), // operator
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_INTERNAL_ASSERT(
       xnn_status_success == run_status,
@@ -219,15 +219,15 @@ Tensor max_pool2d(
       input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
       input_padded_contig_nhwc.data_ptr<float>(), // input
       output_padded_contig_nhwc.data_ptr<float>(), // output
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_max_pooling2d_nhwc_f32 failed!");
 
   const xnn_status run_status = xnn_run_operator(
       max_pool_op, // operator
-      caffe2::xnnpack_threadpool()); // threadpool
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_INTERNAL_ASSERT(
       xnn_status_success == run_status,
@@ -4,10 +4,10 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
 #include <ATen/core/Tensor.h>
+#include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/TensorFactories.h>
 #include <ATen/native/quantized/affine_quantizer.h>
 #include <ATen/quantized/QTensorImpl.h>
-#include <c10/core/Allocator.h>
 #include <c10/core/CPUAllocator.h>
 #include <cmath>
 #include <typeinfo>
@@ -66,7 +66,9 @@ inline Tensor new_qtensor(
     const TensorOptions& options,
     QuantizerPtr quantizer) {
   auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
-  at::Allocator* allocator = GetAllocator(options.device().type());
+  at::Allocator* allocator = options.device().type() == DeviceType::CUDA
+    ? at::detail::getCUDAHooks().getCUDADeviceAllocator()
+    : at::getCPUAllocator();
 
 #ifdef USE_PYTORCH_QNNPACK
 if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
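Picking the allocator by device type is what lets the QuantizedCUDA entries added to native_functions.yaml earlier in this diff actually materialize storage on the GPU. A hedged sketch using the internal factory that new_qtensor backs (private API; assumes a CUDA-enabled build of this release):

    import torch

    if torch.cuda.is_available():
        q = torch._empty_affine_quantized(
            (2, 2), scale=0.5, zero_point=0, dtype=torch.quint8, device="cuda")
        assert q.is_cuda and q.is_quantized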
@@ -99,6 +99,50 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
 
 #if !defined(TH_REAL_IS_HALF) /* non half part */
 
+void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask)
+{
+  at::NoNamesGuard guard;
+  ptrdiff_t numel = THTensor_wrap(mask).sum().item<int64_t>();
+  scalar_t *tensor_data;
+
+#ifdef DEBUG
+  THAssert(numel <= LONG_MAX);
+#endif
+  THTensor_(resize1d)(tensor,numel);
+  tensor_data = tensor->data<scalar_t>();
+  TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask,
+                   if (*mask_data > 1)
+                   {
+                     THFree(mask_counter);
+                     THFree(src_counter);
+                     THError("Mask tensor can take 0 and 1 values only");
+                   }
+                   else if (*mask_data == 1)
+                   {
+                     *tensor_data = *src_data;
+                     tensor_data++;
+                   });
+}
+
+void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask)
+{
+  at::NoNamesGuard guard;
+  ptrdiff_t numel = THTensor_wrap(mask).sum().item<int64_t>();
+  scalar_t *tensor_data;
+
+#ifdef DEBUG
+  THAssert(numel <= LONG_MAX);
+#endif
+  THTensor_(resize1d)(tensor,numel);
+  tensor_data = tensor->data<scalar_t>();
+  TH_TENSOR_APPLY2(scalar_t, src, bool, mask,
+                   if (*mask_data)
+                   {
+                     *tensor_data = *src_data;
+                     tensor_data++;
+                   });
+}
+
 void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src )
 {
   THTensor *srct = THTensor_(newContiguous)(src);
@@ -9,6 +9,8 @@ TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);
 
 #if !defined(TH_REAL_IS_HALF)
 
+TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask);
+TH_API void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask);
 TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src);
 TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* src);
 
@@ -155,10 +155,16 @@ public:
 
   static std::tuple<int, int> priority_range() {
 #ifndef __HIP_PLATFORM_HCC__
+    // Note: this returns the range of priority **supported by PyTorch**, not
+    // the range of priority **supported by CUDA**. The former is a subset of
+    // the latter. Curently PyTorch only supports 0 and -1, which are "low" and
+    // "high" priority.
     int least_priority, greatest_priority;
     C10_CUDA_CHECK(
         cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
-    return std::make_tuple(least_priority, greatest_priority);
+    TORCH_INTERNAL_ASSERT(least_priority >= 0, "Unexpected CUDA stream priority range");
+    TORCH_INTERNAL_ASSERT(greatest_priority <= -1, "Unexpected CUDA stream priority range");
+    return std::make_tuple(0, -1);
 #else
     AT_ERROR("cuDeviceGetStreamPriorityRange with HIP is not supported");
 #endif
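With the clamped tuple, -1 is the highest priority a PyTorch CUDA stream will request even if the driver reports a wider range. At the Python level that surfaces roughly as follows (a hedged example; requires CUDA):

    import torch

    if torch.cuda.is_available():
        high = torch.cuda.Stream(priority=-1)  # "high": most negative supported value
        low = torch.cuda.Stream(priority=0)    # "low": the default
        assert high.priority <= low.priority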
@@ -87,7 +87,6 @@ endif()
 # Note: the folders that are being commented out have not been properly
 # addressed yet.
 
-# For pthreadpool_new_if_impl. TODO: Remove when threadpools are unitied.
 if(NOT MSVC AND USE_XNNPACK)
   if(NOT TARGET fxdiv)
     set(FXDIV_BUILD_TESTS OFF CACHE BOOL "")
@@ -96,10 +95,6 @@ if(NOT MSVC AND USE_XNNPACK)
       "${FXDIV_SOURCE_DIR}"
       "${CMAKE_BINARY_DIR}/FXdiv")
   endif()
-  if(NOT (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE))
-    set_source_files_properties(
-      utils/threadpool/pthreadpool_new_if_impl.c PROPERTIES COMPILE_FLAGS -fno-openmp)
-  endif()
 endif()
 
 add_subdirectory(core)
@@ -818,6 +818,67 @@ c10::optional<int> OperatorBase::argumentIndexWithName(
 #endif
 }
 
+bool OperatorBase::RunAsync(int stream_id) {
+  try {
+    auto result = Run(stream_id);
+    if (result) {
+      if (HasAsyncPart()) {
+        RecordEvent();
+      } else {
+        SetEventFinished();
+      }
+    } else {
+      SetEventFinished(getErrorMsg().c_str());
+    }
+    return result;
+  } catch (EnforceNotMet& err) {
+    SetEventFinishedWithException(err.what());
+    throw;
+  } catch (const std::exception& err) {
+    SetEventFinishedWithException(err.what());
+    throw;
+  } catch (...) {
+    SetEventFinishedWithException(getErrorMsg().c_str());
+    throw;
+  }
+}
+
+void OperatorBase::AddRelatedBlobInfo(EnforceNotMet* err) {
+  CAFFE_ENFORCE(
+      isLegacyOperator(),
+      "AddRelatedBlobInfo(err) not supported for operators exported to c10.");
+
+  if (!has_debug_def()) {
+    return;
+  }
+
+  bool found_input = false;
+  bool found_output = false;
+  if (err->caller() != nullptr) {
+    std::ostringstream oss;
+    for (size_t i = 0; i < inputs_.size(); i++) {
+      if (inputs_[i]->GetRaw() == err->caller()) {
+        found_input = true;
+        oss << "while accessing input: " << debug_def().input(i);
+        break;
+      }
+    }
+    for (size_t i = 0; i < outputs_.size(); i++) {
+      if (outputs_[i]->GetRaw() == err->caller()) {
+        found_output = true;
+        if (found_input) {
+          oss << " OR ";
+        }
+        oss << "while accessing output: " << debug_def().output(i);
+        break;
+      }
+    }
+    if (found_input || found_output) {
+      err->add_context(oss.str());
+    }
+  }
+}
+
 OperatorBase::~OperatorBase() noexcept = default;
 
 #ifndef C10_MOBILE
@@ -480,70 +480,13 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
 
   virtual void CancelAsyncCallback() {}
 
-  // RunAsync, if implemenented by the specific operators, will schedule the
+  // RunAsync, if implemented by the specific operators, will schedule the
   // computation on the corresponding context and record the event in its
   // event_ member object. If the specific operator does not support RunAsync,
   // it will simply be synchronous as a fallback.
-  virtual bool RunAsync(int stream_id = 0) {
-    try {
-      auto result = Run(stream_id);
-      if (result) {
-        if (HasAsyncPart()) {
-          RecordEvent();
-        } else {
-          SetEventFinished();
-        }
-      } else {
-        SetEventFinished(getErrorMsg().c_str());
-      }
-      return result;
-    } catch (EnforceNotMet& err) {
-      SetEventFinishedWithException(err.what());
-      throw;
-    } catch (const std::exception& err) {
-      SetEventFinishedWithException(err.what());
-      throw;
-    } catch (...) {
-      SetEventFinishedWithException(getErrorMsg().c_str());
-      throw;
-    }
-  }
+  virtual bool RunAsync(int stream_id = 0);
 
-  virtual void AddRelatedBlobInfo(EnforceNotMet* err) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "AddRelatedBlobInfo(err) not supported for operators exported to c10.");
-
-    if (!has_debug_def()) {
-      return;
-    }
-
-    bool found_input = false;
-    bool found_output = false;
-    if (err->caller() != nullptr) {
-      std::ostringstream oss;
-      for (size_t i = 0; i < inputs_.size(); i++) {
-        if (inputs_[i]->GetRaw() == err->caller()) {
-          found_input = true;
-          oss << "while accessing input: " << debug_def().input(i);
-          break;
-        }
-      }
-      for (size_t i = 0; i < outputs_.size(); i++) {
-        if (outputs_[i]->GetRaw() == err->caller()) {
-          found_output = true;
-          if (found_input) {
-            oss << " OR ";
-          }
-          oss << "while accessing output: " << debug_def().output(i);
-          break;
-        }
-      }
-      if (found_input || found_output) {
-        err->add_context(oss.str());
-      }
-    }
-  }
+  virtual void AddRelatedBlobInfo(EnforceNotMet* err);
 
   virtual std::string debug_info_string() const {
     return "";
@@ -3,6 +3,25 @@
 
 namespace caffe2 {
 
+OpSchema::OpSchema(const string& type, const string& file, const int line)
+    : type_(type), file_(file), line_(line), tensor_inference_function_(
+          [](const OperatorDef& def, const vector<TensorShape>&) {
+            vector<TensorShape> out;
+            for (int i = 0; i < def.output_size(); i++) {
+              TensorShape ts;
+              ts.set_unknown_shape(true);
+              out.push_back(ts);
+            }
+            return out;
+          }), device_inference_function_(
+          [](const OperatorDef& def) {
+            auto op_device =
+                def.has_device_option() ? def.device_option() : DeviceOption();
+            vector<DeviceOption> in_dev(def.input_size(), op_device);
+            vector<DeviceOption> out_dev(def.output_size(), op_device);
+            return std::make_pair(in_dev, out_dev);
+          }) {}
+
 bool OpSchema::Verify(const OperatorDef& def) const {
   // Check the number of inputs.
   if (def.input_size() < min_input_ || def.input_size() > max_input_) {
@@ -39,9 +39,8 @@ constexpr int kCannotComputeNumOutputs = -1;
 */
 class CAFFE2_API OpSchema {
  public:
-  OpSchema() : type_("unknown"), file_("unknown"), line_(0) {}
-  OpSchema(const string& type, const string& file, const int line)
-      : type_(type), file_(file), line_(line) {}
+  OpSchema() : OpSchema("unknown", "unknown", 0) {}
+  OpSchema(const string& type, const string& file, const int line);
 
   /**
    * @brief Returns the file that the op schema is registered from.
@@ -443,25 +442,9 @@ class CAFFE2_API OpSchema {
   std::function<bool(int, int)> inplace_enforced_ = [](int, int) {
     return false;
   };
-  TensorInferenceFunctionType tensor_inference_function_ =
-      [](const OperatorDef& def, const vector<TensorShape>&) {
-        vector<TensorShape> out;
-        for (int i = 0; i < def.output_size(); i++) {
-          TensorShape ts;
-          ts.set_unknown_shape(true);
-          out.push_back(ts);
-        }
-        return out;
-      };
+  TensorInferenceFunctionType tensor_inference_function_;
   std::unique_ptr<CostInferenceFunctionType> cost_inference_function_ = nullptr;
-  DeviceInferenceFunctionType device_inference_function_ =
-      [](const OperatorDef& def) {
-        auto op_device =
-            def.has_device_option() ? def.device_option() : DeviceOption();
-        vector<DeviceOption> in_dev(def.input_size(), op_device);
-        vector<DeviceOption> out_dev(def.output_size(), op_device);
-        return std::make_pair(in_dev, out_dev);
-      };
+  DeviceInferenceFunctionType device_inference_function_;
 
   std::function<std::vector<TensorFiller>(
       const std::vector<std::vector<int64_t>>&)>
@@ -88,7 +88,7 @@ class Int8AddOp final : public Operator<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK add operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -80,7 +80,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK Global Average Pooling operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackGlobalOperator_,
                           nullptr /* thread pool */);
@@ -122,7 +122,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK Average Pooling operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -72,7 +72,7 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK channel shuffle operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -141,7 +141,7 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> {
       lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
     }
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
 #else
@@ -140,7 +140,7 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
      lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
     }
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
 #else
@@ -104,7 +104,7 @@ class Int8FCOp final : public Operator<CPUContext> {
       lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
     }
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
 #else
@@ -80,7 +80,7 @@ class Int8LeakyReluOp final : public Operator<CPUContext> {
         setupStatus == qnnp_status_success,
        "failed to setup QNNPACK Leaky ReLU operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -74,7 +74,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK Max Pooling operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -65,7 +65,7 @@ class Int8ReluOp final : public Operator<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK Clamp operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -73,7 +73,7 @@ class Int8SigmoidOp final : public Operator<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK Sigmoid operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@@ -73,7 +73,7 @@ class Int8SoftmaxOp final : public Operator<CPUContext> {
         setupStatus == qnnp_status_success,
         "failed to setup QNNPACK SoftArgMax operator");
 
-#ifdef FBCODE_CAFFE2
+#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
     const qnnp_status runStatus =
         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
 #else
@ -42,13 +42,48 @@ if platform.system() == 'Windows':
|
|||||||
else:
|
else:
|
||||||
cuda_path = ''
|
cuda_path = ''
|
||||||
|
|
||||||
if not is_conda and sys.version_info >= (3, 8):
|
import ctypes
|
||||||
dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
|
kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
|
||||||
|
dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
|
||||||
|
with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
|
||||||
|
prev_error_mode = kernel32.SetErrorMode(0x0001)
|
||||||
|
|
||||||
for dll_path in dll_paths:
|
kernel32.LoadLibraryW.restype = ctypes.c_void_p
|
||||||
|
if with_load_library_flags:
|
||||||
|
kernel32.AddDllDirectory.restype = ctypes.c_void_p
|
||||||
|
kernel32.LoadLibraryExW.restype = ctypes.c_void_p
|
||||||
|
|
||||||
|
for dll_path in dll_paths:
|
||||||
|
if sys.version_info >= (3, 8):
|
||||||
os.add_dll_directory(dll_path)
|
os.add_dll_directory(dll_path)
|
||||||
else:
|
elif with_load_library_flags:
|
||||||
dll_paths = [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]
|
res = kernel32.AddDllDirectory(dll_path)
|
||||||
dll_paths = list(filter(os.path.exists, dll_paths)) + [os.environ['PATH']]
|
if res is None:
|
||||||
|
err = ctypes.WinError(ctypes.get_last_error())
|
||||||
|
err.strerror += ' Error adding "{}" to the DLL directories.'.format(dll_path)
|
||||||
|
raise err
|
||||||
|
|
||||||
os.environ['PATH'] = ';'.join(dll_paths)
|
dlls = glob.glob(os.path.join(th_dll_path, '*.dll'))
|
||||||
|
path_patched = False
|
||||||
|
for dll in dlls:
|
||||||
|
is_loaded = False
|
||||||
|
if with_load_library_flags:
|
||||||
|
res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
|
||||||
|
last_error = ctypes.get_last_error()
|
||||||
|
if res is None and last_error != 126:
|
||||||
|
err = ctypes.WinError(last_error)
|
||||||
|
err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll)
|
||||||
|
raise err
|
||||||
|
elif res is not None:
|
||||||
|
is_loaded = True
|
||||||
|
if not is_loaded:
|
||||||
|
if not path_patched:
|
||||||
|
os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']])
|
||||||
|
path_patched = True
|
||||||
|
res = kernel32.LoadLibraryW(dll)
|
||||||
|
if res is None:
|
||||||
|
err = ctypes.WinError(ctypes.get_last_error())
|
||||||
|
err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll)
|
||||||
|
raise err
|
||||||
|
|
||||||
|
kernel32.SetErrorMode(prev_error_mode)
|
||||||
|
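For reference, the magic numbers in the new loader are standard Win32 values (an annotation, not part of the patch): 0x00001100 combines the two dwFlags below, and 126 is ERROR_MOD_NOT_FOUND, the one failure the loop tolerates before retrying through PATH:

    LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
    LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
    assert LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR | LOAD_LIBRARY_SEARCH_DEFAULT_DIRS == 0x00001100

    ERROR_MOD_NOT_FOUND = 126  # dependency resolution failed; fall back to PATH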
@@ -4,6 +4,7 @@
 #include <istream>
 #include <ostream>
 #include <fstream>
+#include <algorithm>
 
 #include <c10/core/Allocator.h>
 #include <c10/core/Backend.h>
@@ -303,10 +304,10 @@ void PyTorchStreamWriter::setup(const string& file_name) {
 
   mz_zip_writer_init_v2(ar_.get(), 0, MZ_ZIP_FLAG_WRITE_ZIP64);
   valid("initializing archive ", file_name.c_str());
-
-  std::string version = c10::to_string(kProducedFileFormatVersion);
-  version.push_back('\n');
-  writeRecord("version", version.c_str(), version.size());
+}
+
+void PyTorchStreamWriter::setMinVersion(const uint64_t version) {
+  version_ = std::max(version, version_);
 }
 
 void PyTorchStreamWriter::writeRecord(
@@ -339,6 +340,11 @@ void PyTorchStreamWriter::writeRecord(
 }
 
 void PyTorchStreamWriter::writeEndOfFile() {
+  // Writes version info
+  std::string version = c10::to_string(version_);
+  version.push_back('\n');
+  writeRecord("version", version.c_str(), version.size());
+
   AT_ASSERT(!finalized_);
   finalized_ = true;
   mz_zip_writer_finalize_archive(ar_.get());
@@ -94,14 +94,45 @@ constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L;
 constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L;
 
 // Versions (i.e. why was the version number bumped?)
+
+// Note [Dynamic Versions and torch.jit.save vs. torch.save]
+//
+// Our versioning scheme has a "produced file format version" which
+// describes how an archive is to be read. The version written in an archive
+// is at least this current produced file format version, but may be greater
+// if it includes certain symbols. We refer to these conditional versions
+// as "dynamic," since they are identified at runtime.
+//
+// Dynamic versioning is useful when an operator's semantics are updated.
+// When using torch.jit.save we want those semantics to be preserved. If
+// we bumped the produced file format version on every change, however,
+// then older versions of PyTorch couldn't read even simple archives, like
+// a single tensor, from newer versions of PyTorch. Instead, we
+// assign dynamic versions to these changes that override the
+// produced file format version as needed. That is, when the semantics
+// of torch.div changed it was assigned dynamic version 4, and when
+// torch.jit.saving modules that use torch.div those archives also have
+// (at least) version 4. This prevents earlier versions of PyTorch
+// from accidentally performing the wrong kind of division. Modules
+// that don't use torch.div or other operators with dynamic versions
+// can write the produced file format version, and these programs will
+// run as expected on earlier versions of PyTorch.
+//
+// While torch.jit.save attempts to preserve operator semantics,
+// torch.save does not. torch.save is analogous to pickling Python, so
+// a function that uses torch.div will have different behavior if torch.saved
+// and torch.loaded across PyTorch versions. From a technical perspective,
+// torch.save ignores dynamic versioning.
+
 // 1. Initial version
 // 2. Removed op_version_set version numbers
 // 3. Added type tags to pickle serialization of container types
-// 4. Stopped integer division using torch.div
+// 4. (Dynamic) Stopped integer division using torch.div
 //    (a versioned symbol preserves the historic behavior of versions 1--3)
-// 5. (Read-only) Stops torch.full inferring a floating point dtype
-//    when given integer fill values.
-constexpr uint64_t kProducedFileFormatVersion = 0x4L;
+// 5. (Dynamic) Stops torch.full inferring a floating point dtype
+//    when given bool or integer fill values.
+//    (a versioned symbol preserves the historic behavior of versions 1--4)
+constexpr uint64_t kProducedFileFormatVersion = 0x3L;
 
 // Writer-specific constants
 constexpr uint64_t kFieldAlignment = 64;
@ -144,6 +175,8 @@ class CAFFE2_API PyTorchStreamWriter final {
|
|||||||
explicit PyTorchStreamWriter(
|
explicit PyTorchStreamWriter(
|
||||||
const std::function<size_t(const void*, size_t)>& writer_func);
|
const std::function<size_t(const void*, size_t)>& writer_func);
|
||||||
|
|
||||||
|
void setMinVersion(const uint64_t version);
|
||||||
|
|
||||||
void writeRecord(
|
void writeRecord(
|
||||||
const std::string& name,
|
const std::string& name,
|
||||||
const void* data,
|
const void* data,
|
||||||
@ -171,6 +204,7 @@ class CAFFE2_API PyTorchStreamWriter final {
|
|||||||
std::string padding_;
|
std::string padding_;
|
||||||
std::ofstream file_stream_;
|
std::ofstream file_stream_;
|
||||||
std::function<size_t(const void*, size_t)> writer_func_;
|
std::function<size_t(const void*, size_t)> writer_func_;
|
||||||
|
uint64_t version_ = kProducedFileFormatVersion;
|
||||||
bool finalized_ = false;
|
bool finalized_ = false;
|
||||||
bool err_seen_ = false;
|
bool err_seen_ = false;
|
||||||
friend size_t ostream_write_func(
|
friend size_t ostream_write_func(
|
||||||
|
|||||||
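Editor's note: taken together, these serialization changes replace the fixed version record written at archive-open time with a floor that individual exporters can raise. A minimal sketch of the resulting flow, assuming a file-name constructor overload (only setMinVersion, writeRecord, and writeEndOfFile appear in the diff above; the archive path is hypothetical):

#include <caffe2/serialize/inline_container.h>

void write_versioned_archive() {
  // Assumed: a file-name constructor overload; "module.pt" is hypothetical.
  caffe2::serialize::PyTorchStreamWriter writer("module.pt");
  // version_ starts at kProducedFileFormatVersion (0x3). An exporter that
  // serializes an operator carrying a dynamic version raises the floor:
  writer.setMinVersion(0x4L);  // version_ = std::max(0x4L, version_)
  // ... writeRecord(...) calls for the module's records ...
  writer.writeEndOfFile();     // now also writes the "version" record ("4\n")
}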
@@ -195,7 +195,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
   const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                      .height = static_cast<size_t>(stride_h())};
   initNNPACK();
 
+#if !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
+  pthreadpool_t pool = nullptr;
+#else
   pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+#endif
 
   runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
     if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
@ -1,15 +1,8 @@
|
|||||||
# TODO: Add ThreadPoolXNNPACK.cc when XNNPACK integration is updated
|
|
||||||
# to pass the actual threadpool ptr instead of nullptr.
|
|
||||||
if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)
|
if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)
|
||||||
add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL)
|
|
||||||
list(APPEND Caffe2_CPU_SRCS
|
list(APPEND Caffe2_CPU_SRCS
|
||||||
utils/string_utils.cc
|
utils/string_utils.cc
|
||||||
utils/threadpool/pthreadpool.cc
|
utils/threadpool/pthreadpool-cpp.cc
|
||||||
utils/threadpool/pthreadpool_impl.cc
|
|
||||||
utils/threadpool/pthreadpool_new_if_impl.c
|
|
||||||
utils/threadpool/ThreadPool.cc
|
utils/threadpool/ThreadPool.cc
|
||||||
utils/threadpool/ThreadPoolMobile.cc
|
|
||||||
utils/threadpool/ThreadPoolXNNPACK.cc
|
|
||||||
)
|
)
|
||||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||||
return()
|
return()
|
||||||
@ -28,23 +21,19 @@ list(APPEND Caffe2_CPU_SRCS
|
|||||||
utils/proto_convert.cc
|
utils/proto_convert.cc
|
||||||
utils/proto_utils.cc
|
utils/proto_utils.cc
|
||||||
utils/proto_wrap.cc
|
utils/proto_wrap.cc
|
||||||
|
utils/threadpool/ThreadPool.cc
|
||||||
utils/signal_handler.cc
|
utils/signal_handler.cc
|
||||||
utils/smart_tensor_printer.cc
|
utils/smart_tensor_printer.cc
|
||||||
utils/string_utils.cc
|
utils/string_utils.cc)
|
||||||
utils/threadpool/ThreadPool.cc)
|
|
||||||
|
|
||||||
# ---[ threadpool/pthreadpool* is a local modification of the NNPACK
|
if(USE_PTHREADPOOL)
|
||||||
# pthreadpool with a very similar interface. Neither NNPACK, nor this
|
list(APPEND Caffe2_CPU_SRCS
|
||||||
# thread pool supports Windows.
|
utils/threadpool/pthreadpool-cpp.cc)
|
||||||
if(NOT MSVC AND USE_XNNPACK)
|
if(USE_INTERNAL_PTHREADPOOL_IMPL)
|
||||||
add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL)
|
list(APPEND Caffe2_CPU_SRCS
|
||||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS}
|
utils/threadpool/pthreadpool.cc
|
||||||
utils/threadpool/pthreadpool.cc
|
utils/threadpool/pthreadpool_impl.cc)
|
||||||
utils/threadpool/pthreadpool_impl.cc
|
endif()
|
||||||
utils/threadpool/pthreadpool_new_if_impl.c
|
|
||||||
utils/threadpool/ThreadPoolMobile.cc
|
|
||||||
utils/threadpool/ThreadPoolXNNPACK.cc
|
|
||||||
)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
|
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
|
||||||
|
@@ -1,21 +0,0 @@
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
-#include <caffe2/utils/threadpool/ThreadPool.h>
-#include <caffe2/utils/threadpool/pthreadpool.h>
-
-namespace caffe2 {
-
-caffe2::ThreadPool* mobile_threadpool() {
-#ifdef C10_MOBILE
-  static std::unique_ptr<caffe2::ThreadPool> thread_pool =
-      caffe2::ThreadPool::defaultThreadPool();
-  return thread_pool.get();
-#else
-  return nullptr;
-#endif
-}
-
-pthreadpool_t mobile_pthreadpool() {
-  return reinterpret_cast<pthreadpool_t>(mobile_threadpool());
-}
-
-} // namespace caffe2
@@ -1,24 +0,0 @@
-#pragma once
-#include <caffe2/utils/threadpool/pthreadpool.h>
-
-// TODO Implement a parallel_for version for Mobile here, add to Aten/Parallel.h
-
-namespace caffe2 {
-
-class ThreadPool;
-
-// Return a singleton instance of caffe2::ThreadPool for ATen/TH multithreading.
-ThreadPool* mobile_threadpool();
-
-// NOTE: This interface is temporary and should not be used.
-// Please use Aten/Parallel.h for parallel primitives in pytorch.
-// This implementation will be used by pytorch mobile, specifically
-// NNPACK/QNNPACK. For mobile we need to use caffe2::ThreadPool instead of the
-// 3rd party pthreadpool. Future work (TODO) Implement a mobile version of
-// "at::parallel_for" using caffe2::ThreadPool so all ATen/TH multithreading
-// usage is mobile friendly; Refactor QNNPACK or pthreadpool to explicitly using
-// "at::parallel_for" primitive to replace pthreadpool_compute_1d for Pytorch;
-pthreadpool_t mobile_pthreadpool();
-
-size_t getDefaultNumThreads();
-} // namespace caffe2
@@ -1,22 +0,0 @@
-#include <caffe2/utils/threadpool/pthreadpool.h>
-#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
-#include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h>
-#include <memory>
-
-namespace caffe2 {
-
-// Will be unified.
-pthreadpool_t xnnpack_threadpool() {
-  // Depending on internal implemenation vs. OSS we will link against pthreadpool_create_xnnpack
-  // or pthreadpool_create. This is only temporary. It will be unified soon.
-#ifdef USE_INTERNAL_THREADPOOL_IMPL
-  static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy_xnnpack)>
-      threadpool(pthreadpool_create_xnnpack(getDefaultNumThreads()), pthreadpool_destroy_xnnpack);
-#else
-  static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)>
-      threadpool(pthreadpool_create(getDefaultNumThreads()), pthreadpool_destroy);
-#endif
-  return threadpool.get();
-}
-
-} // namespace caffe2
@@ -1,7 +0,0 @@
-#pragma once
-// Creating a separate .h/.cc file for creating threadpool for XNNPACK
-// to avoid touching existing internal builds.
-// When we unify threadpools this should all go away.
-namespace caffe2 {
-pthreadpool_t xnnpack_threadpool();
-} // namespace caffe2
caffe2/utils/threadpool/pthreadpool-cpp.cc (new file, 71 lines)
@@ -0,0 +1,71 @@
+#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
+#include <c10/util/Exception.h>
+
+namespace caffe2 {
+
+PThreadPool::PThreadPool(const size_t thread_count)
+    : threadpool_(pthreadpool_create(thread_count), pthreadpool_destroy) {}
+
+size_t PThreadPool::get_thread_count() const {
+  std::lock_guard<std::mutex> lock{mutex_};
+
+  TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!");
+  return pthreadpool_get_threads_count(threadpool_.get());
+}
+
+void PThreadPool::set_thread_count(const size_t thread_count) {
+  std::lock_guard<std::mutex> lock{mutex_};
+
+  // As it stands, pthreadpool is an entirely data parallel framework with no
+  // support for task parallelism.  Hence, all functions are blocking, and no
+  // user-provided tasks can be in flight when the control is returned to the
+  // user of the API, which means re-initializing the library, without the
+  // need to wait on any pending tasks, is all one needs to do to re-adjust
+  // the thread count.
+  threadpool_.reset(pthreadpool_create(thread_count));
+}
+
+void PThreadPool::run(
+    const std::function<void(size_t)>& fn,
+    const size_t range) {
+  std::lock_guard<std::mutex> lock{mutex_};
+
+  TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!");
+
+  struct Context final {
+    const std::function<void(size_t)>& fn;
+  } context{
+      fn,
+  };
+
+  pthreadpool_parallelize_1d(
+      threadpool_.get(),
+      // Note: pthreadpool_parallelize_1d() is a blocking function.  The
+      // function pointer to this lambda passed on to
+      // pthreadpool_parallelize_1d() cannot go out of scope until
+      // pthreadpool_parallelize_1d() returns.
+      [](void* const context, const size_t item) {
+        reinterpret_cast<Context*>(context)->fn(item);
+      },
+      &context,
+      range,
+      0u);
+}
+
+// Forward declaration
+size_t getDefaultNumThreads();
+
+PThreadPool* pthreadpool() {
+  static std::unique_ptr<PThreadPool> threadpool =
+      std::make_unique<PThreadPool>(getDefaultNumThreads());
+  return threadpool.get();
+}
+
+pthreadpool_t pthreadpool_() {
+  PThreadPool* const threadpool = pthreadpool();
+  TORCH_INTERNAL_ASSERT(
+      threadpool, "Failed to acquire an instance of PThreadPool!");
+  return threadpool->threadpool_.get();
+}
+
+} // namespace caffe2
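Editor's note: a short usage sketch of the PThreadPool API introduced above; the vector-scaling workload is hypothetical, while pthreadpool() and run() come straight from the new file:

#include <caffe2/utils/threadpool/pthreadpool-cpp.h>

#include <vector>

void scale_in_parallel(std::vector<float>& values, const float factor) {
  caffe2::PThreadPool* const pool = caffe2::pthreadpool();
  // run() is blocking: every index in [0, values.size()) has been processed
  // by the time it returns, so capturing locals by reference is safe.
  pool->run(
      [&values, factor](const size_t i) { values[i] *= factor; },
      values.size());
}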
caffe2/utils/threadpool/pthreadpool-cpp.h (new file, 54 lines)
@@ -0,0 +1,54 @@
+#pragma once
+
+#ifdef USE_PTHREADPOOL
+
+#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
+#include <caffe2/utils/threadpool/pthreadpool.h>
+#else
+#include <pthreadpool.h>
+#endif
+
+#include <functional>
+#include <memory>
+#include <mutex>
+
+namespace caffe2 {
+
+class PThreadPool final {
+ public:
+  explicit PThreadPool(size_t thread_count);
+  ~PThreadPool() = default;
+
+  PThreadPool(const PThreadPool&) = delete;
+  PThreadPool& operator=(const PThreadPool&) = delete;
+
+  PThreadPool(PThreadPool&&) = delete;
+  PThreadPool& operator=(PThreadPool&&) = delete;
+
+  size_t get_thread_count() const;
+  void set_thread_count(size_t thread_count);
+
+  // Run, in parallel, function fn(task_id) over task_id in range [0, range).
+  // This function is blocking.  All input is processed by the time it returns.
+  void run(const std::function<void(size_t)>& fn, size_t range);
+
+ private:
+  friend pthreadpool_t pthreadpool_();
+
+ private:
+  mutable std::mutex mutex_;
+  std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool_;
+};
+
+// Return a singleton instance of PThreadPool for ATen/TH multithreading.
+PThreadPool* pthreadpool();
+
+// Exposes the underlying implementation of PThreadPool.
+// Only for use in external libraries so as to unify threading across
+// internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK)
+// use cases.
+pthreadpool_t pthreadpool_();
+
+} // namespace caffe2
+
+#endif /* USE_PTHREADPOOL */
@@ -32,7 +32,7 @@ static inline size_t min(size_t a, size_t b) {
 }
 
 struct compute_1d_tiled_context {
-  pthreadpool_function_1d_tiled_t function;
+  legacy_pthreadpool_function_1d_tiled_t function;
   void* argument;
   size_t range;
   size_t tile;
@@ -46,9 +46,9 @@ static void compute_1d_tiled(void* context_, size_t linear_index) {
   context->function(context->argument, index, tile);
 }
 
-void pthreadpool_compute_1d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_1d_tiled_t function,
+void legacy_pthreadpool_compute_1d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_tiled_t function,
     void* argument,
     size_t range,
     size_t tile)
@@ -65,12 +65,12 @@ void pthreadpool_compute_1d_tiled(
         /*.argument = */ argument,
         /*.range = */ range,
         /*.tile = */ tile};
-    pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
+    legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
   }
 }
 
 struct compute_2d_context {
-  pthreadpool_function_2d_t function;
+  legacy_pthreadpool_function_2d_t function;
   void* argument;
   caffe2::FixedDivisor<int32_t> range_j;
 };
@@ -85,9 +85,9 @@ static void compute_2d(void* context_, size_t linear_index) {
   context->function(context->argument, q, r);
 }
 
-void pthreadpool_compute_2d(
-    struct pthreadpool* threadpool,
-    pthreadpool_function_2d_t function,
+void legacy_pthreadpool_compute_2d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_t function,
     void* argument,
     size_t range_i,
     size_t range_j)
@@ -106,12 +106,12 @@ void pthreadpool_compute_2d(
         /*.function = */ function,
         /*.argument = */ argument,
         /*.range_j = */ caffe2::FixedDivisor<int32_t>(range_j)};
-    pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
+    legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
   }
 }
 
 struct compute_2d_tiled_context {
-  pthreadpool_function_2d_tiled_t function;
+  legacy_pthreadpool_function_2d_tiled_t function;
   void* argument;
   caffe2::FixedDivisor<int32_t> tile_range_j;
   size_t range_i;
@@ -135,9 +135,9 @@ static void compute_2d_tiled(void* context_, size_t linear_index) {
   context->function(context->argument, index_i, index_j, tile_i, tile_j);
 }
 
-void pthreadpool_compute_2d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_2d_tiled_t function,
+void legacy_pthreadpool_compute_2d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_tiled_t function,
     void* argument,
     size_t range_i,
     size_t range_j,
@@ -166,12 +166,12 @@ void pthreadpool_compute_2d_tiled(
         /*.range_j = */ range_j,
         /*.tile_i = */ tile_i,
         /*.tile_j = */ tile_j};
-    pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
+    legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
   }
 }
 
 struct compute_3d_tiled_context {
-  pthreadpool_function_3d_tiled_t function;
+  legacy_pthreadpool_function_3d_tiled_t function;
   void* argument;
   caffe2::FixedDivisor<int32_t> tile_range_j;
   caffe2::FixedDivisor<int32_t> tile_range_k;
@@ -205,9 +205,9 @@ static void compute_3d_tiled(
       context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
 }
 
-void pthreadpool_compute_3d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_3d_tiled_t function,
+void legacy_pthreadpool_compute_3d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_3d_tiled_t function,
     void* argument,
     size_t range_i,
     size_t range_j,
@@ -251,16 +251,16 @@ void pthreadpool_compute_3d_tiled(
         /*.tile_i = */ tile_i,
        /*.tile_j = */ tile_j,
         /*.tile_k = */ tile_k};
-    pthreadpool_compute_1d(
+    legacy_pthreadpool_compute_1d(
         threadpool,
-        (pthreadpool_function_1d_t)compute_3d_tiled,
+        (legacy_pthreadpool_function_1d_t)compute_3d_tiled,
         &context,
         tile_range_i * tile_range_j * tile_range_k);
   }
 }
 
 struct compute_4d_tiled_context {
-  pthreadpool_function_4d_tiled_t function;
+  legacy_pthreadpool_function_4d_tiled_t function;
   void* argument;
   caffe2::FixedDivisor<int32_t> tile_range_kl;
   caffe2::FixedDivisor<int32_t> tile_range_j;
@@ -310,9 +310,9 @@ static void compute_4d_tiled(
       tile_l);
 }
 
-void pthreadpool_compute_4d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_4d_tiled_t function,
+void legacy_pthreadpool_compute_4d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_4d_tiled_t function,
     void* argument,
     size_t range_i,
     size_t range_j,
@@ -367,9 +367,9 @@ void pthreadpool_compute_4d_tiled(
         /*.tile_j = */ tile_j,
         /*.tile_k = */ tile_k,
         /*.tile_l = */ tile_l};
-    pthreadpool_compute_1d(
+    legacy_pthreadpool_compute_1d(
         threadpool,
-        (pthreadpool_function_1d_t)compute_4d_tiled,
+        (legacy_pthreadpool_function_1d_t)compute_4d_tiled,
         &context,
         tile_range_i * tile_range_j * tile_range_k * tile_range_l);
   }
 }
@@ -5,49 +5,16 @@
 #include "ThreadPoolCommon.h"
 
 #include <stddef.h> // for size_t
-
-typedef struct pthreadpool* pthreadpool_t;
-
-typedef void (*pthreadpool_function_1d_t)(void*, size_t);
-typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
-typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
-typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
-typedef void (*pthreadpool_function_3d_tiled_t)(
-    void*,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t);
-typedef void (*pthreadpool_function_4d_tiled_t)(
-    void*,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t);
-
 #include <stdint.h> // for uint32_t
 
-typedef void (*pthreadpool_task_1d_t)(void*, size_t);
-typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
-typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
-typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
-typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
-typedef void (*pthreadpool_task_3d_tile_2d_t)(
+typedef struct pthreadpool* legacy_pthreadpool_t;
+
+typedef void (*legacy_pthreadpool_function_1d_t)(void*, size_t);
+typedef void (*legacy_pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_2d_t)(void*, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_3d_tiled_t)(
     void*,
     size_t,
     size_t,
@@ -55,16 +22,7 @@ typedef void (*pthreadpool_task_4d_tile_2d_t)(
     size_t,
     size_t,
     size_t);
-typedef void (*pthreadpool_task_5d_tile_2d_t)(
-    void*,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t,
-    size_t);
-typedef void (*pthreadpool_task_6d_tile_2d_t)(
+typedef void (*legacy_pthreadpool_function_4d_tiled_t)(
     void*,
     size_t,
     size_t,
@@ -90,8 +48,8 @@ extern "C" {
  * On error the function returns NULL and sets errno accordingly.
  */
 
-//Returns internal threadpool impl.
-pthreadpool_t pthreadpool_create(size_t threads_count);
+// Returns internal threadpool impl.
+legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count);
 
 /**
  * Queries the number of threads in a thread pool.
@@ -100,7 +58,7 @@ pthreadpool_t pthreadpool_create(size_t threads_count);
  *
  * @returns The number of threads in the thread pool.
  */
-size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
+size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool);
 
 /**
  * Processes items in parallel using threads from a thread pool.
@@ -117,38 +75,45 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
 * @param[in] items The number of items to process. The @a function
 *                  will be called once for each item.
 */
-void pthreadpool_compute_1d(
-    pthreadpool_t threadpool,
-    pthreadpool_function_1d_t function,
+void legacy_pthreadpool_compute_1d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_t function,
     void* argument,
     size_t range);
 
-void pthreadpool_compute_1d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_1d_tiled_t function,
+void legacy_pthreadpool_parallelize_1d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_t function,
+    void* argument,
+    size_t range,
+    uint32_t flags);
+
+void legacy_pthreadpool_compute_1d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_tiled_t function,
     void* argument,
     size_t range,
     size_t tile);
 
-void pthreadpool_compute_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_function_2d_t function,
+void legacy_pthreadpool_compute_2d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_t function,
    void* argument,
     size_t range_i,
     size_t range_j);
 
-void pthreadpool_compute_2d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_2d_tiled_t function,
+void legacy_pthreadpool_compute_2d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_tiled_t function,
     void* argument,
     size_t range_i,
     size_t range_j,
     size_t tile_i,
     size_t tile_j);
 
-void pthreadpool_compute_3d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_3d_tiled_t function,
+void legacy_pthreadpool_compute_3d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_3d_tiled_t function,
    void* argument,
     size_t range_i,
     size_t range_j,
@@ -157,9 +122,9 @@ void pthreadpool_compute_3d_tiled(
     size_t tile_j,
     size_t tile_k);
 
-void pthreadpool_compute_4d_tiled(
-    pthreadpool_t threadpool,
-    pthreadpool_function_4d_tiled_t function,
+void legacy_pthreadpool_compute_4d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_4d_tiled_t function,
     void* argument,
     size_t range_i,
     size_t range_j,
@@ -178,129 +143,29 @@ void pthreadpool_compute_4d_tiled(
  *
  * @param[in,out] threadpool The thread pool to destroy.
  */
-void pthreadpool_destroy(pthreadpool_t threadpool);
+void legacy_pthreadpool_destroy(legacy_pthreadpool_t threadpool);
 
-// New interface copy/pasted from pthreadpool.
-// We will merge the internal and third-party/pthreadpool eventually.
-// For now copy-paste to get past build issues.
-
-#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
-
-// Returns the copied threadpool impl of third-party/pthreadpool
-pthreadpool_t pthreadpool_create_xnnpack(size_t threads_count);
-
-// Copied third-party impl.
-size_t pthreadpool_get_threads_count_xnnpack(pthreadpool_t threadpool);
-
-// Copied third-party impl.
-void pthreadpool_destroy_xnnpack(pthreadpool_t threadpool);
-
-/**
- * Processes items in parallel using threads from a thread pool.
- *
- * When the call returns, all items have been processed and the thread pool is
- * ready for a new task.
- *
- * @note If multiple threads call this function with the same thread pool, the
- *    calls are serialized.
- *
- * @param[in] threadpool The thread pool to use for parallelisation.
- * @param[in] function The function to call for each item.
- * @param[in] argument The first argument passed to the @a function.
- * @param[in] items The number of items to process. The @a function
- *                  will be called once for each item.
- */
-void pthreadpool_parallelize_1d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_1d_t function,
-    void* argument,
-    size_t range,
-    uint32_t flags);
-
-void pthreadpool_parallelize_1d_tile_1d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_1d_tile_1d_t function,
-    void* argument,
-    size_t range,
-    size_t tile,
-    uint32_t flags);
-
-void pthreadpool_parallelize_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    uint32_t flags);
-
-void pthreadpool_parallelize_2d_tile_1d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_2d_tile_1d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t tile_j,
-    uint32_t flags);
-
-void pthreadpool_parallelize_2d_tile_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_2d_tile_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t tile_i,
-    size_t tile_j,
-    uint32_t flags);
-
-void pthreadpool_parallelize_3d_tile_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_3d_tile_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t range_k,
-    size_t tile_j,
-    size_t tile_k,
-    uint32_t flags);
-
-void pthreadpool_parallelize_4d_tile_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_4d_tile_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t range_k,
-    size_t range_l,
-    size_t tile_k,
-    size_t tile_l,
-    uint32_t flags);
-
-void pthreadpool_parallelize_5d_tile_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_5d_tile_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t range_k,
-    size_t range_l,
-    size_t range_m,
-    size_t tile_l,
-    size_t tile_m,
-    uint32_t flags);
-
-void pthreadpool_parallelize_6d_tile_2d(
-    pthreadpool_t threadpool,
-    pthreadpool_task_6d_tile_2d_t function,
-    void* argument,
-    size_t range_i,
-    size_t range_j,
-    size_t range_k,
-    size_t range_l,
-    size_t range_m,
-    size_t range_n,
-    size_t tile_m,
-    size_t tile_n,
-    uint32_t flags);
+#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
+
+#define pthreadpool_t legacy_pthreadpool_t
+#define pthreadpool_function_1d_t legacy_pthreadpool_function_1d_t
+#define pthreadpool_function_1d_tiled_t legacy_pthreadpool_function_1d_tiled_t
+#define pthreadpool_function_2d_t legacy_pthreadpool_function_2d_t
+#define pthreadpool_function_2d_tiled_t legacy_pthreadpool_function_2d_tiled_t
+#define pthreadpool_function_3d_tiled_t legacy_pthreadpool_function_3d_tiled_t
+#define pthreadpool_function_4d_tiled_t legacy_pthreadpool_function_4d_tiled_t
+#define pthreadpool_create legacy_pthreadpool_create
+#define pthreadpool_destroy legacy_pthreadpool_destroy
+#define pthreadpool_get_threads_count legacy_pthreadpool_get_threads_count
+#define pthreadpool_compute_1d legacy_pthreadpool_compute_1d
+#define pthreadpool_parallelize_1d legacy_pthreadpool_parallelize_1d
+#define pthreadpool_compute_1d_tiled legacy_pthreadpool_compute_1d_tiled
+#define pthreadpool_compute_2d legacy_pthreadpool_compute_2d
+#define pthreadpool_compute_2d_tiled legacy_pthreadpool_compute_2d_tiled
+#define pthreadpool_compute_3d_tiled legacy_pthreadpool_compute_3d_tiled
+#define pthreadpool_compute_4d_tiled legacy_pthreadpool_compute_4d_tiled
+
+#endif /* USE_INTERNAL_PTHREADPOOL_IMPL */
 
 #ifdef __cplusplus
 } /* extern "C" */
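Editor's note: to make the renaming concrete, a hedged sketch of a caller driving the legacy interface directly (the Work struct and increment job are hypothetical; the legacy_* names are declared in the header above). When USE_INTERNAL_PTHREADPOOL_IMPL is defined, the #define block maps the old pthreadpool_* spellings onto these same symbols, which is how the NNPACK-family libraries keep compiling unchanged:

#include <caffe2/utils/threadpool/pthreadpool.h>

struct Work {
  float* data;
};

// Matches legacy_pthreadpool_function_1d_t: void (*)(void*, size_t).
static void bump_one(void* argument, size_t i) {
  static_cast<Work*>(argument)->data[i] += 1.0f;
}

void bump_all(float* data, size_t n) {
  legacy_pthreadpool_t pool = legacy_pthreadpool_create(4);
  Work work{data};
  legacy_pthreadpool_compute_1d(pool, bump_one, &work, n);
  legacy_pthreadpool_destroy(pool);
}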
@@ -6,9 +6,9 @@
 // External API
 //
 
-void pthreadpool_compute_1d(
-    pthreadpool_t threadpool,
-    pthreadpool_function_1d_t function,
+void legacy_pthreadpool_compute_1d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_t function,
     void* argument,
     size_t range) {
   if (threadpool == nullptr) {
@@ -27,30 +27,31 @@ void pthreadpool_compute_1d(
       range);
 }
 
-size_t pthreadpool_get_threads_count(pthreadpool_t threadpool) {
-  // The current fix only useful when XNNPACK calls pthreadpool_get_threads_count with nullptr.
+void legacy_pthreadpool_parallelize_1d(
+    const legacy_pthreadpool_t threadpool,
+    const legacy_pthreadpool_function_1d_t function,
+    void* const argument,
+    const size_t range,
+    uint32_t) {
+  legacy_pthreadpool_compute_1d(threadpool, function, argument, range);
+}
+
+size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool) {
+  // The current fix only useful when XNNPACK calls legacy_pthreadpool_get_threads_count with nullptr.
   if (threadpool == nullptr) {
     return 1;
   }
   return reinterpret_cast<caffe2::ThreadPool*>(threadpool)->getNumThreads();
-  // TODO: Future fix: If we keep maintaining two different threadpools.
-  // Old C2 and new one for XNNPACK, then the we have two different pthreadpool pointer
-  // types. One is caffe2::Thredpool*, the other is pthreadpool* (pthreadpool_new_if_impl.c)
-  // XNNPACK calls pthreadpool_get_threads_count during op setup using pthreadpool*, and
-  // uses _parallelize_ interface for for actual work.
-  // While NNPACK uses caffe2::Threadpool*.
-  // Thus if pthreadpool_get_threads_count is getting called from XNNPACK we cannot
-  // reinterpret_cast it to ThreadPool. It will seg fault or worse will have unedfined behavior.
 }
 
-pthreadpool_t pthreadpool_create(size_t threads_count) {
+legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count) {
   std::mutex thread_pool_creation_mutex_;
   std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
 
-  return reinterpret_cast<pthreadpool_t>(new caffe2::ThreadPool(threads_count));
+  return reinterpret_cast<legacy_pthreadpool_t>(new caffe2::ThreadPool(threads_count));
 }
 
-void pthreadpool_destroy(pthreadpool_t pthreadpool) {
+void legacy_pthreadpool_destroy(legacy_pthreadpool_t pthreadpool) {
   if (pthreadpool) {
     caffe2::ThreadPool* threadpool =
         reinterpret_cast<caffe2::ThreadPool*>(pthreadpool);
(File diff suppressed because it is too large.)
@@ -1,62 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#if defined(__SSE__) || defined(__x86_64__)
-#include <xmmintrin.h>
-#endif
-
-struct fpu_state {
-#if defined(__SSE__) || defined(__x86_64__)
-  uint32_t mxcsr;
-#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
-  uint32_t fpscr;
-#elif defined(__aarch64__)
-  uint64_t fpcr;
-#else
-  char unused;
-#endif
-};
-
-static inline struct fpu_state get_fpu_state() {
-  struct fpu_state state = { 0 };
-#if defined(__SSE__) || defined(__x86_64__)
-  state.mxcsr = (uint32_t) _mm_getcsr();
-#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
-  __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
-#elif defined(__aarch64__)
-  __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr));
-#endif
-  return state;
-}
-
-static inline void set_fpu_state(const struct fpu_state state) {
-#if defined(__SSE__) || defined(__x86_64__)
-  _mm_setcsr((unsigned int) state.mxcsr);
-#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
-  __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr));
-#elif defined(__aarch64__)
-  __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr));
-#endif
-}
-
-static inline void disable_fpu_denormals() {
-#if defined(__SSE__) || defined(__x86_64__)
-  _mm_setcsr(_mm_getcsr() | 0x8040);
-#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
-  uint32_t fpscr;
-  __asm__ __volatile__(
-      "VMRS %[fpscr], fpscr\n"
-      "ORR %[fpscr], #0x1000000\n"
-      "VMSR fpscr, %[fpscr]\n"
-      : [fpscr] "=r" (fpscr));
-#elif defined(__aarch64__)
-  uint64_t fpcr;
-  __asm__ __volatile__(
-      "MRS %[fpcr], fpcr\n"
-      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
-      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
-      "MSR fpcr, %[fpcr]\n"
-      : [fpcr] "=r" (fpcr));
-#endif
-}
@@ -239,10 +239,10 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK)
 endif()
 
 if(DISABLE_NNPACK_AND_FAMILY)
-  set(USE_NNPACK OFF)
-  set(USE_QNNPACK OFF)
-  set(USE_PYTORCH_QNNPACK OFF)
-  set(USE_XNNPACK OFF)
+  caffe2_update_option(USE_NNPACK OFF)
+  caffe2_update_option(USE_QNNPACK OFF)
+  caffe2_update_option(USE_PYTORCH_QNNPACK OFF)
+  caffe2_update_option(USE_XNNPACK OFF)
 else()
   set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
 
@@ -261,11 +261,9 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK)
   if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
     set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
   endif()
 
-  set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "")
-  set(CPUINFO_LOG_LEVEL "error" CACHE STRING "")
-  set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
   endif()
+else()
+  set(DISABLE_NNPACK_AND_FAMILY ON)
 endif()
 
 set(CONFU_DEPENDENCIES_SOURCE_DIR ${PROJECT_BINARY_DIR}/confu-srcs
@@ -281,45 +279,48 @@ if(INTERN_BUILD_MOBILE AND INTERN_USE_EIGEN_BLAS)
 endif()
 
 # ---[ pthreadpool
-# QNNPACK and NNPACK both depend on pthreadpool, but when building with libtorch
-# they should use the pthreadpool implementation under caffe2/utils/threadpool
-# instead of the default implementation. To avoid confusion, add pthreadpool
-# subdirectory explicitly with EXCLUDE_FROM_ALL property prior to QNNPACK/NNPACK
-# does so, which will prevent it from installing the default pthreadpool library.
-if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE AND (USE_QNNPACK OR USE_NNPACK OR USE_XNNPACK))
-  if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
-    set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
-    set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
-  endif()
-
-  if(NOT TARGET pthreadpool)
-    set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
-    set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
-    add_subdirectory(
-      "${PTHREADPOOL_SOURCE_DIR}"
-      "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool"
-      EXCLUDE_FROM_ALL)
-  endif()
-endif()
-
-# XNNPACK has not option of like QNNPACK_CUSTOM_THREADPOOL
-# that allows us to hijack pthreadpool interface.
-# Thus not doing this ends up building pthreadpool as well as
-# the internal implemenation of pthreadpool which results in symbol conflicts.
-if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
-  if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
-    set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
-    set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
-  endif()
-
-  if(NOT TARGET pthreadpool)
-    set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
-    set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
-    add_subdirectory(
-      "${PTHREADPOOL_SOURCE_DIR}"
-      "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool"
-      EXCLUDE_FROM_ALL)
-  endif()
+# Only add a dependency on pthreadpool if we are on a mobile build
+# or are building any of the libraries in the {Q/X}NNPACK family.
+if(INTERN_BUILD_MOBILE OR NOT DISABLE_NNPACK_AND_FAMILY)
+  set(USE_PTHREADPOOL ON CACHE BOOL "" FORCE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_PTHREADPOOL")
+
+  # Always use third_party/pthreadpool.
+  set(USE_INTERNAL_PTHREADPOOL_IMPL OFF CACHE BOOL "" FORCE)
+
+  if(NOT TARGET pthreadpool)
+    if(USE_SYSTEM_PTHREADPOOL)
+      add_library(pthreadpool SHARED IMPORTED)
+      find_library(PTHREADPOOL_LIBRARY pthreadpool)
+      set_property(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}")
+      if(NOT PTHREADPOOL_LIBRARY)
+        message(FATAL_ERROR "Cannot find pthreadpool")
+      endif()
+      message("-- Found pthreadpool: ${PTHREADPOOL_LIBRARY}")
+    elseif(NOT USE_INTERNAL_PTHREADPOOL_IMPL)
+      if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
+        set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
+        set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
+      endif()
+
+      set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
+      set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
+      set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
+      set(PTHREADPOOL_ALLOW_DEPRECATED_API ON CACHE BOOL "")
+      add_subdirectory(
+        "${PTHREADPOOL_SOURCE_DIR}"
+        "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool")
+      set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
+    endif()
+
+    if(USE_INTERNAL_PTHREADPOOL_IMPL)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTERNAL_PTHREADPOOL_IMPL")
+    else()
+      list(APPEND Caffe2_DEPENDENCY_LIBS pthreadpool)
+    endif()
+  endif()
+else()
+  set(USE_PTHREADPOOL OFF CACHE BOOL "" FORCE)
 endif()
 
 # ---[ Caffe2 uses cpuinfo library in the thread pool
@@ -369,9 +370,12 @@ if(USE_QNNPACK)
   endif()
 
   if(NOT TARGET qnnpack)
+    if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
+      set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
+    endif()
+
     set(QNNPACK_BUILD_TESTS OFF CACHE BOOL "")
     set(QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
-    set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
     set(QNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
     add_subdirectory(
       "${QNNPACK_SOURCE_DIR}"
@@ -379,8 +383,29 @@ if(USE_QNNPACK)
     # We build static versions of QNNPACK and pthreadpool but link
     # them into a shared library for Caffe2, so they need PIC.
     set_property(TARGET qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
-    set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
     set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+    if(QNNPACK_CUSTOM_THREADPOOL)
+      target_compile_definitions(
+        qnnpack PRIVATE
+        pthreadpool_t=legacy_pthreadpool_t
+        pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
+        pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
+        pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
+        pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
+        pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
+        pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
+        pthreadpool_create=legacy_pthreadpool_create
+        pthreadpool_destroy=legacy_pthreadpool_destroy
+        pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
+        pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
+        pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
+        pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
+        pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
+        pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
+        pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
+        pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
+    endif()
   endif()
 
   list(APPEND Caffe2_DEPENDENCY_LIBS qnnpack)
@@ -400,9 +425,12 @@ if(USE_PYTORCH_QNNPACK)
   endif()
 
   if(NOT TARGET pytorch_qnnpack)
+    if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
+      set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
+    endif()
+
     set(PYTORCH_QNNPACK_BUILD_TESTS OFF CACHE BOOL "")
     set(PYTORCH_QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
-    set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
     set(PYTORCH_QNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
     add_subdirectory(
       "${PYTORCH_QNNPACK_SOURCE_DIR}"
@@ -410,10 +438,29 @@ if(USE_PYTORCH_QNNPACK)
     # We build static versions of QNNPACK and pthreadpool but link
     # them into a shared library for Caffe2, so they need PIC.
     set_property(TARGET pytorch_qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
-    if(NOT USE_SYSTEM_PTHREADPOOL)
-      set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
-    endif()
     set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+    if(PYTORCH_QNNPACK_CUSTOM_THREADPOOL)
+      target_compile_definitions(
+        pytorch_qnnpack PRIVATE
+        pthreadpool_t=legacy_pthreadpool_t
+        pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
+        pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
+        pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
+        pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
+        pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
+        pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
+        pthreadpool_create=legacy_pthreadpool_create
+        pthreadpool_destroy=legacy_pthreadpool_destroy
+        pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
+        pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
+        pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
+        pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
+        pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
+        pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
+        pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
+        pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
+    endif()
  endif()
 
   list(APPEND Caffe2_DEPENDENCY_LIBS pytorch_qnnpack)
@@ -447,7 +494,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   endif()
 
   if(NOT TARGET XNNPACK)
-    set(XNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
     set(XNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
     set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
     set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "")
@@ -457,15 +503,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
       "${CONFU_DEPENDENCIES_BINARY_DIR}/XNNPACK")
 
     set_property(TARGET XNNPACK PROPERTY POSITION_INDEPENDENT_CODE ON)
-    # Context: pthreadpool_get_threads_count implementation that is built in pytorch, uses
-    # implementation defined in caffe2/utils/threadpool/pthreadpool_impl.cc. This implementation
-    # assumes the the pthreadpool* passed is of type caffe2::ThradPool and thus does reinterpret cast.
-    # This is not valid when we create pthreadpool via caffe2::xnnpack_threadpool, which is of type
-    # compatible with new pthreadpool interface and is used in PT's XNNPACK integration.
-    # Thus all the calls for pthreadpool_get_threads_count originating from XNNPACK must be routed
-    # appropriately to pthreadpool_get_threads_count_xnnpack, which does not do the aforementioned
-    # casting to caffe2::ThradPool. Once the threadpools are unified, we will not need this.
-    target_compile_definitions(XNNPACK PRIVATE -Dpthreadpool_get_threads_count=pthreadpool_get_threads_count_xnnpack)
   endif()
 
   include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})
cmake/External/nnpack.cmake (vendored, 27 changed lines)
@@ -59,9 +59,12 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM
   set(GOOGLETEST_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/googletest" CACHE STRING "Google Test source directory")

   if(NOT TARGET nnpack)
+    if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
+      set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
+    endif()
+
     set(NNPACK_BUILD_TESTS OFF CACHE BOOL "")
     set(NNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
-    set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
     set(NNPACK_LIBRARY_TYPE "static" CACHE STRING "")
     set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
     set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "")
@@ -73,6 +76,28 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM
   set_property(TARGET nnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
   set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
   set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)

+  if(NNPACK_CUSTOM_THREADPOOL)
+    target_compile_definitions(
+      nnpack PRIVATE
+      pthreadpool_t=legacy_pthreadpool_t
+      pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
+      pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
+      pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
+      pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
+      pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
+      pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
+      pthreadpool_create=legacy_pthreadpool_create
+      pthreadpool_destroy=legacy_pthreadpool_destroy
+      pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
+      pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
+      pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
+      pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
+      pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
+      pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
+      pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
+      pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
+  endif()
 endif()

 set(NNPACK_FOUND TRUE)
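Note that the same seventeen-entry rename list is now attached to both pytorch_qnnpack and nnpack, and NNPACK_CUSTOM_THREADPOOL is no longer set unconditionally but only when caffe2's internal pthreadpool implementation is actually in use. A build that maintains such a diff locally might factor the shared list into a helper; this is a refactoring sketch, not something this commit does, and the function name is hypothetical:

# Hypothetical helper: attach the legacy pthreadpool renames to any target.
function(remap_legacy_pthreadpool target)
  target_compile_definitions(${target} PRIVATE
    pthreadpool_t=legacy_pthreadpool_t
    pthreadpool_create=legacy_pthreadpool_create
    pthreadpool_destroy=legacy_pthreadpool_destroy
    pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
    # ... remaining pthreadpool_* entries exactly as in the hunks above ...
  )
endfunction()

remap_legacy_pthreadpool(nnpack)
remap_legacy_pthreadpool(pytorch_qnnpack)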
@@ -69,6 +69,11 @@ if(NOT @BUILD_SHARED_LIBS@)
     list(APPEND TORCH_LIBRARIES ${XNNPACK_LIBRARY})
   endif()

+  if(NOT @USE_INTERNAL_PTHREADPOOL_IMPL@)
+    find_library(PTHREADPOOL_LIBRARY pthreadpool PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    list(APPEND TORCH_LIBRARIES ${PTHREADPOOL_LIBRARY})
+  endif()
+
   if(@INTERN_USE_EIGEN_BLAS@)
     find_library(EIGEN_BLAS_LIBRARY eigen_blas PATHS "${TORCH_INSTALL_PREFIX}/lib")
     list(APPEND TORCH_LIBRARIES ${EIGEN_BLAS_LIBRARY})
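For downstream consumers of a static libtorch, this hunk means TORCH_LIBRARIES now resolves the standalone pthreadpool archive automatically whenever the installed build was configured without the internal pthreadpool implementation. A minimal consumer project using the standard Torch CMake package (project and file names here are placeholders):

cmake_minimum_required(VERSION 3.5)
project(torch_consumer CXX)
# TorchConfig.cmake populates TORCH_LIBRARIES, now including
# PTHREADPOOL_LIBRARY when applicable, so no extra link line is needed.
find_package(Torch REQUIRED)
add_executable(torch_consumer main.cpp)
target_link_libraries(torch_consumer ${TORCH_LIBRARIES})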
Some files were not shown because too many files have changed in this diff.