Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 21:49:24 +08:00)

Compare commits: v2.2.1-rc3 ... v1.0.1 (101 commits)
SHA1
---
bb15580e88
743fdbdb19
cdb9fd44dc
83221655a8
48fcfdaccb
18eef1d8d9
770462a5ff
0f87ff6e38
eb531da9a8
37c8a33b54
0e9bdcab80
1347a184ca
1cb565fa34
dab52a4a16
0a440da88b
cf11411d42
fd8212cebd
ce37ec38f3
047231e1e1
f748654e0e
8fdcdc3c3f
40fa56a5d1
2f9642010e
3c10845036
d85372f330
5fc2c8b115
fc0c2252d2
304b932879
e274158c72
8d1fc20c8b
af03dbb93b
b24edae29e
c99c8d8aa3
eac4c5d901
231f1a4991
b65b55a652
c926cb4408
a6f4538f31
7d3e2fa190
98bc784694
3c83026249
202909d601
d4eea46dcd
cf0965736c
274e755237
c19b16cc99
228f73e7a9
1e61241227
9a9eae14d0
fb92c3c7b5
a9cf3f69ab
6460628b25
74433436e8
57c685520f
ca1f9349dd
6290587244
9c863c1952
84cf1660d2
e8361887b1
9a7737146c
e27b4ba594
0384a0282b
f80dba92ae
1b7113eaae
7fec47f40d
d711595a07
eef3be686e
ba4d1e8ca6
ab1cd6241b
1ff075b7df
b879d006f1
167f8e8314
dfdf2376bb
95fd0afed5
4e5b994ca7
5dbcbbf715
9067e9411d
4c964dac7f
7b40d9c7ff
e7767c1af3
982a8722cc
3c1cbb8da8
5f51de77c9
a4e2d27ddb
4909529584
7b98af16ee
fe098a3605
3486cebd87
a5a34fb5b1
b2c4c55734
b104068d24
e0834ded98
30aed0237d
033ae1598f
8ca4fc3fd2
20296297ca
72d27e3802
563d67087c
7dc06810c2
07e4a5e069
db5d3131d1
@ -1,14 +1,14 @@
# IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
# in this file to the new version number, and **ALSO** update the version number below:
# PyTorchDockerVersion:262
# Caffe2DockerVersion:230
# PyTorchDockerVersion:282
# Caffe2DockerVersion:238

docker_config_defaults: &docker_config_defaults
user: jenkins
aws_auth:
# This IAM user only allows read-write access to ECR
aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}

# NOTE: We only perform the merge in build step and not in test step, because
# all source files will be shared from build to test
@ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client
sudo apt-get -qq update
sudo apt-get -qq install openssh-client git

install_doc_push_script: &install_doc_push_script
name: Install the doc push script
no_output_timeout: "2m"
command: |
cat >/home/circleci/project/doc_push_script.sh <<EOL
# =================== The following code **should** be executed inside Docker container ===================

# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"

# Since we're cat-ing this file, we need to escape all $'s
echo "doc_push_script.sh: Invoked with \$*"

git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
pushd pytorch.github.io

set -ex

# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="\$1"
if [ -z "\$install_path" ]; then
echo "error: doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi

# Argument 2: What version of the docs we are building.
version="\$2"
if [ -z "\$version" ]; then
echo "error: doc_push_script.sh: version (arg2) not specified"
exit 1
fi

is_master_doc=false
if [ "\$version" == "master" ]; then
is_master_doc=true
fi

# Argument 3: (optional) If present, we will NOT do any pushing. Used for testing.
dry_run=false
if [ "\$3" != "" ]; then
dry_run=true
fi

echo "install_path: \$install_path version: \$version dry_run: \$dry_run"

export LC_ALL=C
export PATH=/opt/conda/bin:$PATH

rm -rf pytorch || true

# Get all the documentation sources, put them in one place
pushd "\$pt_checkout"
git clone https://github.com/pytorch/vision
pushd vision
conda install -q pillow
time python setup.py install
popd
pushd docs
rm -rf source/torchvision
cp -r ../vision/docs/source source/torchvision

# Build the docs
pip -q install -r requirements.txt || true
if [ "\$is_master_doc" = true ]; then
make html
else
make html-stable
fi

# Move them into the docs repo
popd
popd
git rm -rf "\$install_path" || true
mv "\$pt_checkout/docs/build/html" "\$install_path"

# Add the version handler by search and replace.
# XXX: Consider moving this to the docs Makefile or site build
if [ "\$is_master_doc" = true ]; then
find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
else
find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g"
fi

git add "\$install_path" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "auto-generating sphinx docs" || true
git status

if [ "\$dry_run" = false ]; then
echo "Pushing to pytorch.github.io:site"
git push origin site
else
echo "Skipping push due to dry_run"
fi

popd
# =================== The above code **should** be executed inside Docker container ===================
EOL
chmod +x /home/circleci/project/doc_push_script.sh

setup_ci_environment: &setup_ci_environment
name: Set Up CI Environment
no_output_timeout: "1h"
@ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment
|
||||
echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env
|
||||
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
fi
|
||||
|
||||
# This IAM user only allows read-write access to ECR
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}
|
||||
eval $(aws ecr get-login --region us-east-1 --no-include-email)
|
||||
|
||||
pytorch_linux_build_defaults: &pytorch_linux_build_defaults
|
||||
@ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
|
||||
<<: *setup_ci_environment
|
||||
- run:
|
||||
name: Test
|
||||
no_output_timeout: "90m"
|
||||
no_output_timeout: "1h"
|
||||
command: |
|
||||
set -e
|
||||
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
|
||||
@ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install cmake
|
||||
brew install expect
|
||||
|
||||
@ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
export SCCACHE_BIN=${PWD}/sccache_bin
|
||||
mkdir -p ${SCCACHE_BIN}
|
||||
@ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
sccache --show-stats
|
||||
fi
|
||||
|
||||
##############################################################################
|
||||
##############################################################################
|
||||
# Job specifications
|
||||
##############################################################################
|
||||
##############################################################################
|
||||
|
||||
version: 2
|
||||
jobs:
|
||||
pytorch_linux_trusty_py2_7_9_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_9_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_5_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_5_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc4_8_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc4_8_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc5_4_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc5_4_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_pynightly_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_pynightly_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_py3_clang5_asan_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_xenial_py3_clang5_asan_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_build:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
MULTI_GPU: "1"
|
||||
resource_class: gpu.large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
@ -517,7 +631,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py2_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
|
||||
PYTHON_VERSION: "2.7"
|
||||
CUDA_VERSION: "9"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -525,7 +639,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
|
||||
PYTHON_VERSION: "2.7"
|
||||
CUDA_VERSION: "9"
|
||||
resource_class: gpu.medium
|
||||
@ -534,7 +648,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py3_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -542,7 +656,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py3_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9"
|
||||
resource_class: gpu.medium
|
||||
@ -551,7 +665,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9.2"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -559,7 +673,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9.2"
|
||||
resource_class: gpu.medium
|
||||
@ -568,7 +682,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "10"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -576,7 +690,7 @@ jobs:
|
||||
pytorch_short_perf_test_gpu:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-short-perf-test-gpu
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
@ -597,8 +711,8 @@ jobs:
|
||||
|
||||
docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env
|
||||
# This IAM user allows write access to S3 bucket for perf test numbers
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env
|
||||
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
@ -607,7 +721,7 @@ jobs:
|
||||
pytorch_doc_push:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-doc-push
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
resource_class: large
|
||||
machine:
|
||||
image: default
|
||||
@ -615,72 +729,39 @@ jobs:
|
||||
- run:
|
||||
<<: *setup_ci_environment
|
||||
- run:
|
||||
name: Doc Push
|
||||
<<: *install_doc_push_script
|
||||
- run:
|
||||
name: Doc Build and Push
|
||||
no_output_timeout: "1h"
|
||||
command: |
|
||||
set -e
|
||||
if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
|
||||
echo "Skipping doc push..."
|
||||
exit 0
|
||||
fi
|
||||
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
|
||||
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
|
||||
docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
|
||||
export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
|
||||
|
||||
cat >/home/circleci/project/doc_push_script.sh <<EOL
|
||||
# =================== The following code will be executed inside Docker container ===================
|
||||
git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
|
||||
pushd pytorch.github.io
|
||||
|
||||
set -ex
|
||||
|
||||
export LC_ALL=C
|
||||
export PATH=/opt/conda/bin:$PATH
|
||||
|
||||
rm -rf pytorch || true
|
||||
|
||||
# Get all the documentation sources, put them in one place
|
||||
# TODO: These clones can race
|
||||
git clone https://github.com/pytorch/pytorch
|
||||
pushd pytorch
|
||||
git clone https://github.com/pytorch/vision
|
||||
pushd vision
|
||||
conda install -q pillow
|
||||
time python setup.py install
|
||||
popd
|
||||
pushd docs
|
||||
rm -rf source/torchvision
|
||||
cp -r ../vision/docs/source source/torchvision
|
||||
|
||||
# Build the docs
|
||||
pip -q install -r requirements.txt || true
|
||||
make html
|
||||
|
||||
# Move them into the docs repo
|
||||
popd
|
||||
popd
|
||||
git rm -rf docs/master || true
|
||||
mv pytorch/docs/build/html docs/master
|
||||
find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g'
|
||||
git add docs/master || true
|
||||
git status
|
||||
git config user.email "soumith+bot@pytorch.org"
|
||||
git config user.name "pytorchbot"
|
||||
# If there aren't changes, don't make a commit; push is no-op
|
||||
git commit -m "auto-generating sphinx docs" || true
|
||||
git status
|
||||
git push origin site
|
||||
|
||||
popd
|
||||
# =================== The above code will be executed inside Docker container ===================
|
||||
EOL
|
||||
chmod +x /home/circleci/project/doc_push_script.sh
|
||||
docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh
|
||||
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
# master branch docs push
|
||||
if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
|
||||
# stable release docs push. We keep an eternal PR open for merging
|
||||
# v1.0.1 -> master; every time v1.0.1 is updated the following is run.
|
||||
elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
|
||||
# For open PRs: Do a dry_run of the docs build, don't push build
|
||||
else
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
fi
|
||||
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
|
||||
|
||||
# Save the docs build so we can debug any problems
|
||||
export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug
|
||||
docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE}
|
||||
docker push ${DEBUG_COMMIT_DOCKER_IMAGE}
|
||||
|
||||
pytorch_macos_10_13_py3_build:
|
||||
macos:
|
||||
xcode: "9.0"
|
||||
@ -696,8 +777,11 @@ jobs:
|
||||
set -e
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
# Install sccache
|
||||
@ -706,8 +790,8 @@ jobs:
|
||||
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
git submodule sync && git submodule update -q --init
|
||||
chmod a+x .jenkins/pytorch/macos-build.sh
|
||||
@ -740,8 +824,11 @@ jobs:
|
||||
command: |
|
||||
set -e
|
||||
export IN_CIRCLECI=1
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project
|
||||
@ -765,8 +852,11 @@ jobs:
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
# Install CUDA 9.2
|
||||
@ -790,30 +880,13 @@ jobs:
|
||||
sudo chmod +x /usr/local/bin/sccache
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
git submodule sync && git submodule update -q --init
|
||||
chmod a+x .jenkins/pytorch/macos-build.sh
|
||||
unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
|
||||
<<: *caffe2_linux_build_defaults
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
|
||||
resource_class: gpu.medium
|
||||
<<: *caffe2_linux_test_defaults
|
||||
|
||||
caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
|
||||
@ -896,11 +969,20 @@ jobs:
|
||||
caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
|
||||
BUILD_ONLY: "1"
|
||||
<<: *caffe2_linux_build_defaults
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
|
||||
resource_class: gpu.medium
|
||||
<<: *caffe2_linux_test_defaults
|
||||
|
||||
caffe2_py2_gcc4_9_ubuntu14_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
|
||||
@ -1008,25 +1090,25 @@ workflows:
|
||||
- pytorch_linux_xenial_py3_clang5_asan_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_py3_clang5_asan_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_short_perf_test_gpu:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_doc_push:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda9_cudnn7_py2_build
|
||||
- pytorch_linux_xenial_cuda9_cudnn7_py2_test:
|
||||
requires:
|
||||
@ -1047,10 +1129,6 @@ workflows:
|
||||
- pytorch_macos_10_13_py3_build
|
||||
- pytorch_macos_10_13_cuda9_2_cudnn7_py3_build
|
||||
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
|
||||
requires:
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
|
||||
- caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
|
||||
requires:
|
||||
@ -1072,6 +1150,9 @@ workflows:
|
||||
requires:
|
||||
- caffe2_onnx_py2_gcc5_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
|
||||
requires:
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_clang3_8_ubuntu16_04_build
|
||||
- caffe2_py2_clang3_9_ubuntu16_04_build
|
||||
- caffe2_py2_clang7_ubuntu16_04_build
|
||||
|
@ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
|
||||
|
||||
if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
|
||||
CMAKE_ARGS+=("-DBLAS=MKL")
|
||||
CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
|
||||
fi
|
||||
if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
|
||||
CMAKE_ARGS+=("-DUSE_CUDA=ON")
|
||||
|
@ -14,18 +14,8 @@ clang --version
|
||||
# symbolize=1: Gives us much better errors when things go wrong
|
||||
export ASAN_OPTIONS=detect_leaks=0:symbolize=1
|
||||
|
||||
# FIXME: Remove the hardcoded "-pthread" option.
|
||||
# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
|
||||
# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
|
||||
# have the full pthread implementation. Other advanced pthread functions don't
|
||||
# exist in libasan.so[2]. If we need some pthread advanced functions, we still
|
||||
# need to link the pthread library.
|
||||
# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
|
||||
# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
|
||||
#
|
||||
# TODO: Make the ASAN flags a more unified env var
|
||||
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
|
||||
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
|
||||
CXX_FLAGS="-pthread" \
|
||||
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
|
||||
NO_CUDA=1 USE_MKLDNN=0 \
|
||||
python setup.py install
|
||||
|
@ -129,7 +129,7 @@ fi
|
||||
git add -f build/bin
|
||||
|
||||
# Test documentation build
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
|
||||
pushd docs
|
||||
# TODO: Don't run this here
|
||||
pip install -q -r requirements.txt || true
|
||||
@ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
fi
|
||||
|
||||
# Test standalone c10 build
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
|
||||
mkdir -p c10/build
|
||||
pushd c10/build
|
||||
cmake ..
|
||||
|
@ -122,7 +122,7 @@ fi
|
||||
# Use conda cmake in some CI build. Conda cmake will be newer than our supported
|
||||
# min version 3.5, so we only do it in two builds that we know should use conda.
|
||||
if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \
|
||||
[[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then
|
||||
if ! which conda; then
|
||||
echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
|
||||
|
@ -5,9 +5,9 @@
|
||||
# in this file will report a failure (so you don't forget to
|
||||
# reenable the tests on merge ;)
|
||||
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-build
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-test
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-build
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-test
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py2-build
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py2-test
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py3-build
|
||||
|
@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
|
||||
sccache --show-stats
|
||||
sccache --zero-stats
|
||||
rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
|
||||
for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
|
||||
IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
|
||||
del /S /Q %%i
|
||||
)
|
||||
)
|
||||
copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
|
||||
)
|
||||
|
||||
|
@ -34,10 +34,4 @@ matrix:
|
||||
script: cd docs/cpp/source && ./check-doxygen.sh
|
||||
- env: CLANG_TIDY
|
||||
python: "3.6"
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- llvm-toolchain-trusty
|
||||
packages: clang-tidy
|
||||
script: tools/run-clang-tidy-in-ci.sh
|
||||
|
@ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
|
||||
--generate-code arch=compute_50,code=sm_50
|
||||
--generate-code arch=compute_60,code=sm_60
|
||||
--generate-code arch=compute_70,code=sm_70)
|
||||
elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
|
||||
SET(CUFFT_FAKELINK_OPTIONS
|
||||
--generate-code arch=compute_35,code=sm_35
|
||||
--generate-code arch=compute_50,code=sm_50
|
||||
--generate-code arch=compute_60,code=sm_60
|
||||
--generate-code arch=compute_70,code=sm_70)
|
||||
else()
|
||||
MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
|
||||
endif()
|
||||
|
@ -2122,55 +2122,6 @@
|
||||
- arg: THTensor* self
|
||||
- arg: THTensor* tensor
|
||||
]]
|
||||
[[
|
||||
name: _th_tril
|
||||
cname: tril
|
||||
variants:
|
||||
- function
|
||||
return: argument 0
|
||||
arguments:
|
||||
- arg: THTensor* result
|
||||
output: True
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_tril_
|
||||
cname: tril
|
||||
variants: function
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_triu
|
||||
cname: triu
|
||||
variants:
|
||||
- function
|
||||
return: argument 0
|
||||
arguments:
|
||||
- arg: THTensor* result
|
||||
output: True
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_triu_
|
||||
cname: triu
|
||||
variants:
|
||||
- function
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_cross
|
||||
cname: cross
|
||||
|
@ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) {
|
||||
reduce_dims.push_back(i);
|
||||
}
|
||||
for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
|
||||
if (shape[i - leading_dims] == 1 && sizes[i] > 1) {
|
||||
if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
|
||||
reduce_dims.push_back(i);
|
||||
}
|
||||
}
|
||||
|
@ -81,6 +81,39 @@ inline void parallel_for(
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
parallel_reduce
|
||||
|
||||
begin: index at which to start applying reduction
|
||||
|
||||
end: index at which to stop applying reduction
|
||||
|
||||
grain_size: number of elements per chunk. impacts number of elements in
|
||||
intermediate results tensor and degree of parallelization.
|
||||
|
||||
ident: identity for binary combination function sf. sf(ident, x) needs to return
|
||||
x.
|
||||
|
||||
f: function for reduction over a chunk. f needs to be of signature scalar_t
|
||||
f(int64_t partial_begin, int64_t partial_end, scalar_t identity)
|
||||
|
||||
sf: function to combine two partial results. sf needs to be of signature
|
||||
scalar_t sf(scalar_t x, scalar_t y)
|
||||
|
||||
For example, you might have a tensor of 10000 entries and want to sum together
|
||||
all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
|
||||
an intermediate result tensor with 4 elements. Then it will execute the function
|
||||
"f" you provide and pass the beginning and end index of these chunks, so
|
||||
0-2499, 2500-4999, etc., and the combination identity. It will then write out
|
||||
the result from each of these chunks into the intermediate result tensor. After
|
||||
that it'll reduce the partial results from each chunk into a single number using
|
||||
the combination function sf and the identity ident. For a total summation this
|
||||
would be "+" and 0 respectively. This is similar to tbb's approach [1], where
|
||||
you need to provide a function to accumulate a subrange, a function to combine
|
||||
two partial results and an identity.
|
||||
|
||||
[1] https://software.intel.com/en-us/node/506154
|
||||
*/
|
||||
template <class scalar_t, class F, class SF>
|
||||
inline scalar_t parallel_reduce(
|
||||
const int64_t begin,
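The comment block above spells out the contract of the new `parallel_reduce` helper: split `[begin, end)` into `grain_size` chunks, run the per-chunk reducer `f` on each, then fold the partial results with the combiner `sf` starting from the identity `ident`. The sketch below is a self-contained stand-in that mirrors that contract so the parameters are easy to see; it is not ATen's implementation (the real helper runs the chunks in parallel), and the names `parallel_reduce_sketch` and `data` are made up for illustration.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Sequential sketch of the documented contract; the real helper runs the
// per-chunk calls to `f` in parallel and then combines them with `sf`.
template <class scalar_t, class F, class SF>
scalar_t parallel_reduce_sketch(int64_t begin, int64_t end, int64_t grain_size,
                                scalar_t ident, const F& f, const SF& sf) {
  // "Intermediate results tensor": one partial result per grain_size chunk.
  std::vector<scalar_t> partials;
  for (int64_t b = begin; b < end; b += grain_size) {
    const int64_t e = std::min(b + grain_size, end);
    partials.push_back(f(b, e, ident));
  }
  // Fold the partial results with the combiner sf, starting from ident.
  scalar_t acc = ident;
  for (const scalar_t& p : partials) acc = sf(acc, p);
  return acc;
}

int main() {
  std::vector<float> data(10000, 1.0f);
  // Total summation as in the note above: f sums a chunk, sf is "+", ident is 0.
  // grain_size 2500 over 10000 elements -> 4 partial results, covering
  // 0-2499, 2500-4999, 5000-7499, 7500-9999.
  const float total = parallel_reduce_sketch<float>(
      0, static_cast<int64_t>(data.size()), 2500, 0.0f,
      [&](int64_t b, int64_t e, float id) {
        float s = id;
        for (int64_t i = b; i < e; ++i) s += data[i];
        return s;
      },
      [](float x, float y) { return x + y; });
  std::cout << total << "\n";  // prints 10000
}
```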
|
||||
|
@ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) {
|
||||
|
||||
void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) {
|
||||
AT_CHECK(
|
||||
t.type().backend() == backend,
|
||||
!t.defined() || t.type().backend() == backend,
|
||||
"Expected tensor to have ", toString(backend),
|
||||
" Backend, but got tensor with ", toString(t.type().backend()), " Backend ",
|
||||
"(while checking arguments for ", c, ")");
|
||||
|
@ -52,14 +52,11 @@ namespace c10 {
|
||||
_(prim, TupleSlice) \
|
||||
_(prim, ListConstruct) \
|
||||
_(prim, ListUnpack) \
|
||||
_(prim, BoolToTensor) \
|
||||
_(prim, NumToTensor) \
|
||||
_(prim, TensorToNum) \
|
||||
_(prim, ImplicitTensorToNum) \
|
||||
_(prim, TensorToBool) \
|
||||
_(prim, IntToFloat) \
|
||||
_(prim, FloatToInt) \
|
||||
_(prim, StringToFloat) \
|
||||
_(prim, Bool) \
|
||||
_(prim, Int) \
|
||||
_(prim, Float) \
|
||||
_(prim, device) \
|
||||
_(prim, dtype) \
|
||||
_(prim, shape) \
|
||||
@ -139,7 +136,8 @@ namespace c10 {
|
||||
_(attr, name) \
|
||||
_(attr, a) \
|
||||
_(attr, b) \
|
||||
_(attr, beg)
|
||||
_(attr, beg) \
|
||||
_(attr, idx)
|
||||
#else
|
||||
#define FORALL_NS_SYMBOLS(_) \
|
||||
_(namespaces, prim) \
|
||||
|
@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
|
||||
ss << "Future[" << getElementType()->python_str() << "]";
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
|
||||
return create(contained_types.at(0));
|
||||
}
|
||||
private:
|
||||
FutureType(TypePtr elem) : SingleElementType(elem) {}
|
||||
};
|
||||
@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
|
||||
}
|
||||
|
||||
inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
|
||||
AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
|
||||
if (typ->isSubtypeOf(IntType::get())) {
|
||||
return CompleteTensorType::create(at::kLong, at::kCPU, {});
|
||||
} else if (typ->isSubtypeOf(FloatType::get())) {
|
||||
@ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
|
||||
template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
|
||||
template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }
|
||||
|
||||
CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
|
||||
CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);
|
||||
|
||||
using TypeEnv = std::unordered_map<std::string, TypePtr>;
|
||||
struct MatchTypeReturn {
|
||||
|
@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
|
||||
return value;
|
||||
}
|
||||
|
||||
TypePtr inferTypeFrom(const IValue& value) {
|
||||
// why incomplete? You cannot completely recover a type from
|
||||
// an IValue: List[List[int]] and List[List[Tensor]] will both
|
||||
// appear as ivalue.isGenericList(), so the element type cannot be recovered.
|
||||
// The only appropriate place to use this is where you know that
|
||||
// you are only dealing with a subset of objects where you can recover
|
||||
// the type, like in the tracer.
|
||||
TypePtr incompleteInferTypeFrom(const IValue& value) {
|
||||
if (value.isTensor()) {
|
||||
return CompleteTensorType::create(value.toTensor());
|
||||
} else if (value.isDouble()) {
|
||||
@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
|
||||
} else if (value.isDoubleList()) {
|
||||
return ListType::ofFloats();
|
||||
} else if (value.isTuple()) {
|
||||
return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
|
||||
return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
|
||||
} else if (value.isDevice()) {
|
||||
return DeviceObjType::get();
|
||||
}
|
||||
AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
|
||||
AT_ERROR("Type cannot be accurately recovered from this IValue.");
|
||||
}
|
||||
|
||||
c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {
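The comment on the renamed `incompleteInferTypeFrom` above says the inference is "incomplete" because a generic list value erases its element type. A rough, self-contained analogue using `std::any` (not the real IValue API) shows the same effect: once nested lists are boxed into a type-erased container, two lists with different static types look identical at runtime, so the full type cannot be recovered from the value alone.

```cpp
#include <any>
#include <iostream>
#include <vector>

int main() {
  // Analogue of IValue's generic list: every element is boxed into a
  // type-erased holder.
  using GenericList = std::vector<std::any>;

  // Conceptually List[List[int]] and List[List[float]]:
  GenericList int_lists{GenericList{std::any(1), std::any(2)}};
  GenericList float_lists{GenericList{std::any(1.0f)}};

  std::any a = int_lists;
  std::any b = float_lists;

  // Both boxed values report exactly the same runtime type, so the element
  // type of the inner lists is lost -- which is why the inference above is
  // only an *incomplete* recovery of the static type.
  std::cout << std::boolalpha << (a.type() == b.type()) << "\n";  // true
}
```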
|
||||
|
@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
|
||||
vec256::Vec256<scalar_t> acc_vec,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
scalar_t acc_arr[Vec::size];
|
||||
scalar_t acc_arr[Vec::size()];
|
||||
acc_vec.store(acc_arr);
|
||||
for (int64_t i = 1; i < size; i++) {
|
||||
scalar_t acc_arr_next[Vec::size];
|
||||
scalar_t acc_arr_next[Vec::size()];
|
||||
acc_arr_next[0] = acc_arr[i];
|
||||
Vec acc_vec_next = Vec::loadu(acc_arr_next);
|
||||
acc_vec = vec_fun(acc_vec, acc_vec_next);
|
||||
@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
|
||||
template <typename scalar_t, typename Op>
|
||||
inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size)
|
||||
if (size < Vec::size())
|
||||
return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = Vec::loadu(data);
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
acc_vec = vec_fun(acc_vec, data_vec);
|
||||
}
|
||||
@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
|
||||
Vec data_vec = Vec::loadu(data + d, size - d);
|
||||
acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(vec_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(vec_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename MapOp, typename ReduceOp>
|
||||
@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
|
||||
scalar_t* data,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size)
|
||||
if (size < Vec::size())
|
||||
return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = map_fun(Vec::loadu(data));
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
data_vec = map_fun(data_vec);
|
||||
acc_vec = red_fun(acc_vec, data_vec);
|
||||
@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
|
||||
data_vec = map_fun(data_vec);
|
||||
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename MapOp, typename ReduceOp>
|
||||
@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
|
||||
const scalar_t* data2,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size) {
|
||||
if (size < Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data, size);
|
||||
Vec data2_vec = Vec::loadu(data2, size);
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
return vec_reduce_all(red_fun, data_vec, size);
|
||||
}
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
Vec data2_vec = Vec::loadu(data2 + d);
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename Op>
|
||||
@ -103,7 +103,7 @@ inline void map(
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
int64_t d = 0;
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec output_vec = vec_fun(Vec::loadu(input_data + d));
|
||||
output_vec.store(output_data + d);
|
||||
}
|
||||
@ -122,7 +122,7 @@ inline void map2(
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
int64_t d = 0;
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(input_data + d);
|
||||
Vec data_vec2 = Vec::loadu(input_data2 + d);
|
||||
Vec output_vec = vec_fun(data_vec, data_vec2);
|
||||
|
@ -15,14 +15,24 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
|
||||
// Note [Acceptable use of anonymous namespace in header]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Yes you saw right, this is an anonymous namespace in a header. This header,
|
||||
// and all of its subheaders, REQUIRE their code to be entirely inlined into
|
||||
// the compilation unit that uses them. It's important that these functions have
|
||||
// internal linkage so that kernels for different architectures don't get
|
||||
// combined during linking. It's sufficient to label functions "static", but
|
||||
// class methods must be an unnamed namespace to have internal linkage (since
|
||||
// static means something different in the context of classes).
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
|
||||
T buf[Vec256<T>::size];
|
||||
T buf[Vec256<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
template<size_t n> struct int_of_size;
|
||||
@ -45,15 +46,49 @@ struct Vec256 {
|
||||
private:
|
||||
T values[32 / sizeof(T)] = {0};
|
||||
public:
|
||||
static constexpr int size = 32 / sizeof(T);
|
||||
// Note [constexpr static function to avoid odr-usage compiler bug]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Why, you might ask, is size defined to be a static constexpr function,
|
||||
// rather than a more ordinary 'static constexpr int size;' variable?
|
||||
// The problem lies within ODR rules for static constexpr members versus
|
||||
// static constexpr functions. First, recall that this class (along with all
|
||||
// of its derivations) lives in an anonymous namespace: they are intended to be
|
||||
// *completely* inlined at their use-sites, because we need to compile it
|
||||
// multiple times for different instruction sets.
|
||||
//
|
||||
// Because of this constraint, we CANNOT provide a single definition for
|
||||
// any static members in this class; since we want to compile the class
|
||||
// multiple times, there wouldn't actually be any good place to put the
|
||||
// definition. Now here is the problem: if we ODR-use a static constexpr
|
||||
// member, we are *obligated* to provide a definition. Without the
|
||||
// definition, you get a compile error like:
|
||||
//
|
||||
// relocation R_X86_64_PC32 against undefined symbol
|
||||
// `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
|
||||
// a shared object; recompile with -fPIC
|
||||
//
|
||||
// If this were C++17, we could replace a static constexpr variable with
|
||||
// an inline variable which doesn't require one definition. But we are not
|
||||
// C++17. So the next best thing is to replace the member with a static
|
||||
// constexpr (and therefore inline) function, which does not require ODR
|
||||
// either.
|
||||
//
|
||||
// Also, technically according to the C++ standard, we don't have to define
|
||||
// a constexpr variable if we never odr-use it. But it seems that some
|
||||
// versions of GCC/Clang have buggy determinations on whether or not an
// identifier is odr-used or not, and in any case it's hard to tell if
// a variable is odr-used or not. So best to just cut the problem at the root.
|
||||
static constexpr int size() {
|
||||
return 32 / sizeof(T);
|
||||
}
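A small sketch (assuming C++11/14; names are illustrative, not part of this diff) of the odr-use problem described above and why the function form avoids it:

#include <algorithm>

struct WithVariable {
  static constexpr int size = 4;               // in-class declaration only (pre-C++17)
};
// std::max takes its arguments by const reference, so the call below odr-uses ::size
// and links only if "constexpr int WithVariable::size;" is defined in some .cpp file:
//   int a = std::max(WithVariable::size, 3);

struct WithFunction {
  static constexpr int size() { return 4; }    // implicitly inline, no separate definition
};
int b = std::max(WithFunction::size(), 3);     // fine: only the returned value is used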
|
||||
Vec256() {}
|
||||
Vec256(T val) {
|
||||
for (int i = 0; i != size; i++) {
|
||||
for (int i = 0; i != size(); i++) {
|
||||
values[i] = val;
|
||||
}
|
||||
}
|
||||
template<typename... Args,
|
||||
typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
|
||||
typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
|
||||
Vec256(Args... vals) {
|
||||
values = { vals... };
|
||||
}
|
||||
@ -61,7 +96,7 @@ public:
|
||||
static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
int64_t mask = mask_;
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (mask & 0x01) {
|
||||
vec[i] = b[i];
|
||||
} else {
|
||||
@ -74,9 +109,9 @@ public:
|
||||
static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
|
||||
const Vec256<T>& mask) {
|
||||
Vec256 vec;
|
||||
int_same_size_t<T> buffer[size];
|
||||
int_same_size_t<T> buffer[size()];
|
||||
mask.store(buffer);
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (buffer[i] & 0x01)
|
||||
{
|
||||
vec[i] = b[i];
|
||||
@ -88,14 +123,14 @@ public:
|
||||
}
|
||||
static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
vec.values[i] = base + i * step;
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
|
||||
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (i < count) {
|
||||
vec[i] = b[i];
|
||||
} else {
|
||||
@ -114,7 +149,7 @@ public:
|
||||
std::memcpy(vec.values, ptr, count * sizeof(T));
|
||||
return vec;
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
std::memcpy(ptr, values, count * sizeof(T));
|
||||
}
|
||||
const T& operator[](int idx) const {
|
||||
@ -125,14 +160,14 @@ public:
|
||||
}
|
||||
Vec256<T> map(T (*f)(T)) const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i != size; i++) {
|
||||
for (int64_t i = 0; i != size(); i++) {
|
||||
ret[i] = f(values[i]);
|
||||
}
|
||||
return ret;
|
||||
}
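A hedged usage sketch of map (the include path and calling this internal header from ordinary code are assumptions for illustration; a capture-free lambda converts to the required function pointer):

#include <ATen/cpu/vec256/vec256.h>

void vec256_map_example() {
  using at::vec256::Vec256;
  Vec256<float> v(2.0f);                                           // broadcast 2.0f to all lanes
  Vec256<float> doubled = v.map([](float x) { return x * 2.0f; });
  float out[Vec256<float>::size()];                                // size() is constexpr, so this is a valid bound
  doubled.store(out);                                              // every out[i] == 4.0f
}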
|
||||
Vec256<T> abs() const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = values[i] < 0 ? -values[i] : values[i];
|
||||
}
|
||||
return ret;
|
||||
@ -214,7 +249,7 @@ public:
|
||||
}
|
||||
Vec256<T> pow(const Vec256<T> &exp) const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::pow(values[i], exp[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -222,7 +257,7 @@ public:
|
||||
#define DEFINE_COMP(binary_pred) \
|
||||
Vec256<T> operator binary_pred(const Vec256<T> &other) const { \
|
||||
Vec256<T> vec; \
|
||||
for (int64_t i = 0; i != size; i++) { \
|
||||
for (int64_t i = 0; i != size(); i++) { \
|
||||
if (values[i] binary_pred other.values[i]) { \
|
||||
std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T)); \
|
||||
} else { \
|
||||
@ -242,7 +277,7 @@ public:
|
||||
|
||||
template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] + b[i];
|
||||
}
|
||||
return c;
|
||||
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] - b[i];
|
||||
}
|
||||
return c;
|
||||
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
||||
return c;
|
||||
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] / b[i];
|
||||
}
|
||||
return c;
|
||||
@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
|
||||
// either input is a NaN.
|
||||
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = (a[i] > b[i]) ? a[i] : b[i];
|
||||
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
|
||||
// If either input is NaN, propagate a NaN.
|
||||
@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
|
||||
// either input is a NaN.
|
||||
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = (a[i] < b[i]) ? a[i] : b[i];
|
||||
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
|
||||
// If either input is NaN, propagate a NaN.
|
||||
@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
|
||||
template <class T> \
|
||||
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \
|
||||
using iT = int_same_size_t<T>; \
|
||||
iT buffer[Vec256<T>::size]; \
|
||||
for (int64_t i = 0; i != Vec256<T>::size; i++) { \
|
||||
iT buffer[Vec256<T>::size()]; \
|
||||
for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
|
||||
auto a_val = a[i]; \
|
||||
auto b_val = b[i]; \
|
||||
iT *i_a_ptr = reinterpret_cast<iT*>(&a_val); \
|
||||
@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
|
||||
template <int64_t scale = 1, typename T = void>
|
||||
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
|
||||
inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
int_same_size_t<T> index_arr[size];
|
||||
vindex.store(static_cast<void*>(index_arr));
|
||||
T buffer[size];
|
||||
@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
|
||||
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
|
||||
inline mask_gather(const Vec256<T>& src, T const* base_addr,
|
||||
const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
T src_arr[size];
|
||||
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
|
||||
int_same_size_t<T> index_arr[size];
|
||||
@ -392,7 +427,7 @@ namespace {
|
||||
template<typename dst_t, typename src_t>
|
||||
struct CastImpl {
|
||||
static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
|
||||
src_t src_arr[Vec256<src_t>::size];
|
||||
src_t src_arr[Vec256<src_t>::size()];
|
||||
src.store(static_cast<void*>(src_arr));
|
||||
return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
|
||||
}
|
||||
@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {
|
||||
|
||||
template <typename T>
|
||||
inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
T src_arr[size];
|
||||
src.store(static_cast<void*>(src_arr));
|
||||
int_same_size_t<T> buffer[size];
|
||||
@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
|
||||
// returns: Vec256<float> = {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// Vec256<float> = {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
template <typename T>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
static constexpr int half_size = size / 2;
|
||||
T a_arr[size];
|
||||
T b_arr[size];
|
||||
@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
// returns: Vec256<float> = {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// Vec256<float> = {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
template <typename T>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
interleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
static constexpr int half_size = size / 2;
|
||||
T a_arr[size];
|
||||
T b_arr[size];
|
||||
@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
|
||||
template <typename src_T, typename dst_T>
|
||||
void convert(const src_T *src, dst_T *dst, int64_t n) {
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
*dst = static_cast<dst_T>(
|
||||
static_cast<at::native::inter_copy_type_t<dst_T>>(*src));
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
#if defined(__AVX__) && !defined(_MSC_VER)
|
||||
@ -16,7 +17,9 @@ template <> class Vec256<double> {
|
||||
private:
|
||||
__m256d values;
|
||||
public:
|
||||
static constexpr int size = 4;
|
||||
static constexpr int size() {
|
||||
return 4;
|
||||
}
|
||||
Vec256() {}
|
||||
Vec256(__m256d v) : values(v) {}
|
||||
Vec256(double val) {
|
||||
@ -40,7 +43,7 @@ public:
|
||||
return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
|
||||
}
|
||||
static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
|
||||
int64_t count = size) {
|
||||
int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -53,22 +56,22 @@ public:
|
||||
}
|
||||
return b;
|
||||
}
|
||||
static Vec256<double> loadu(const void* ptr, int64_t count = size) {
|
||||
if (count == size)
|
||||
static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
|
||||
|
||||
__at_align32__ double tmp_values[size];
|
||||
__at_align32__ double tmp_values[size()];
|
||||
std::memcpy(
|
||||
tmp_values,
|
||||
reinterpret_cast<const double*>(ptr),
|
||||
count * sizeof(double));
|
||||
return _mm256_load_pd(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
double tmp_values[size];
|
||||
double tmp_values[size()];
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(double));
|
||||
}
|
||||
@ -252,7 +255,7 @@ template <>
|
||||
void convert(const double* src, double* dst, int64_t n) {
|
||||
int64_t i;
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
|
||||
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
|
||||
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
#if defined(__AVX__) && !defined(_MSC_VER)
|
||||
@ -16,7 +17,9 @@ template <> class Vec256<float> {
|
||||
private:
|
||||
__m256 values;
|
||||
public:
|
||||
static constexpr int size = 8;
|
||||
static constexpr int size() {
|
||||
return 8;
|
||||
}
|
||||
Vec256() {}
|
||||
Vec256(__m256 v) : values(v) {}
|
||||
Vec256(float val) {
|
||||
@ -43,7 +46,7 @@ public:
|
||||
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
|
||||
}
|
||||
static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
|
||||
int64_t count = size) {
|
||||
int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -64,19 +67,19 @@ public:
|
||||
}
|
||||
return b;
|
||||
}
|
||||
static Vec256<float> loadu(const void* ptr, int64_t count = size) {
|
||||
if (count == size)
|
||||
static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
|
||||
__at_align32__ float tmp_values[size];
|
||||
__at_align32__ float tmp_values[size()];
|
||||
std::memcpy(
|
||||
tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
|
||||
return _mm256_loadu_ps(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int64_t count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int64_t count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
float tmp_values[size];
|
||||
float tmp_values[size()];
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(float));
|
||||
}
|
||||
@ -260,7 +263,7 @@ template <>
|
||||
void convert(const float* src, float* dst, int64_t n) {
|
||||
int64_t i;
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
|
||||
for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
|
||||
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
|
@ -12,6 +12,11 @@ namespace {
|
||||
struct Vec256i {
|
||||
protected:
|
||||
__m256i values;
|
||||
|
||||
static inline __m256i invert(const __m256i& v) {
|
||||
const auto ones = _mm256_set1_epi64x(-1);
|
||||
return _mm256_xor_si256(ones, v);
|
||||
}
|
||||
public:
|
||||
Vec256i() {}
|
||||
Vec256i(__m256i v) : values(v) {}
|
||||
@ -22,7 +27,9 @@ public:
|
||||
|
||||
template <>
|
||||
struct Vec256<int64_t> : public Vec256i {
|
||||
static constexpr int size = 4;
|
||||
static constexpr int size() {
|
||||
return 4;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
|
||||
@ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
}
|
||||
template <int64_t mask>
|
||||
static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
a.store(tmp_values);
|
||||
if (mask & 0x01)
|
||||
tmp_values[0] = _mm256_extract_epi64(b.values, 0);
|
||||
@ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
|
||||
}
|
||||
static Vec256<int64_t>
|
||||
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
|
||||
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
|
||||
}
|
||||
@ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi64(values, other.values);
|
||||
}
|
||||
Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi64(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi64(values, other.values));
|
||||
}
|
||||
Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
|
||||
return _mm256_cmpgt_epi64(other.values, values);
|
||||
}
|
||||
Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi64(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi64(values, other.values));
|
||||
}
|
||||
Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
|
||||
return _mm256_cmpgt_epi64(values, other.values);
|
||||
}
|
||||
Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi64(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi64(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Vec256<int32_t> : public Vec256i {
|
||||
static constexpr int size = 8;
|
||||
static constexpr int size() {
|
||||
return 8;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
|
||||
@ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
|
||||
}
|
||||
static Vec256<int32_t>
|
||||
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
|
||||
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
|
||||
__at_align32__ int32_t tmp_values[size];
|
||||
__at_align32__ int32_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int32_t tmp_values[size];
|
||||
__at_align32__ int32_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
|
||||
}
|
||||
@ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi32(values, other.values);
|
||||
}
|
||||
Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi32(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi32(values, other.values));
|
||||
}
|
||||
Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
|
||||
return _mm256_cmpgt_epi32(other.values, values);
|
||||
}
|
||||
Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi32(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi32(values, other.values));
|
||||
}
|
||||
Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
|
||||
return _mm256_cmpgt_epi32(values, other.values);
|
||||
}
|
||||
Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi32(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi32(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
@ -212,13 +209,17 @@ template <>
|
||||
void convert(const int32_t *src, float *dst, int64_t n) {
|
||||
int64_t i;
|
||||
// int32_t and float have same size
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
|
||||
auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
|
||||
auto output_vec = _mm256_cvtepi32_ps(input_vec);
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
|
||||
}
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = static_cast<float>(src[i]);
|
||||
}
|
||||
@ -228,13 +229,17 @@ template <>
|
||||
void convert(const int32_t *src, double *dst, int64_t n) {
|
||||
int64_t i;
|
||||
// int32_t has half the size of double
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
|
||||
auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
|
||||
auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
|
||||
}
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = static_cast<double>(src[i]);
|
||||
}
|
||||
@ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {
|
||||
|
||||
template <>
|
||||
struct Vec256<int16_t> : public Vec256i {
|
||||
static constexpr int size = 16;
|
||||
static constexpr int size() {
|
||||
return 16;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
|
||||
@ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
}
|
||||
template <int64_t mask>
|
||||
static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
a.store(tmp_values);
|
||||
if (mask & 0x01)
|
||||
tmp_values[0] = _mm256_extract_epi16(b.values, 0);
|
||||
@ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
|
||||
}
|
||||
static Vec256<int16_t>
|
||||
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
|
||||
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
|
||||
}
|
||||
@ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi16(values, other.values);
|
||||
}
|
||||
Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi16(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi16(values, other.values));
|
||||
}
|
||||
Vec256<int16_t> operator<(const Vec256<int16_t>& other) const {
|
||||
return _mm256_cmpgt_epi16(other.values, values);
|
||||
}
|
||||
Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi16(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi16(values, other.values));
|
||||
}
|
||||
Vec256<int16_t> operator>(const Vec256<int16_t>& other) const {
|
||||
return _mm256_cmpgt_epi16(values, other.values);
|
||||
}
|
||||
Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi16(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi16(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
@ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
|
||||
|
||||
template <typename T>
|
||||
Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
T values_a[Vec256<T>::size];
|
||||
T values_b[Vec256<T>::size];
|
||||
T values_a[Vec256<T>::size()];
|
||||
T values_b[Vec256<T>::size()];
|
||||
a.store(values_a);
|
||||
b.store(values_b);
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
values_a[i] /= values_b[i];
|
||||
}
|
||||
return Vec256<T>::loadu(values_a);
|
||||
|
@ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
|
||||
THCState_getCurrentDeviceProperties(globalContext().getTHCState());
|
||||
// NOTE: extra parentheses around numbers disable clang warnings about
|
||||
// dead code
|
||||
return (
|
||||
(CUDNN_VERSION >= (6021)) ||
|
||||
(CUDNN_VERSION >= (6000) && prop->major >= 5));
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
@ -9,45 +9,6 @@
|
||||
#include "ATen/cuda/ATenCUDAGeneral.h"
|
||||
#include <cuda.h>
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
|
||||
#include <curand_kernel.h>
|
||||
|
||||
/*
|
||||
Note [cuDNN dropout descriptor initialization]
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In most cases, setting descriptors in cuDNN is cheap (e.g.,
|
||||
cudnnSetTensorNdDescriptor). However, this is not the case for
|
||||
cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an
|
||||
expensive precomputation to initialize the random number generator states. In
|
||||
cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor,
|
||||
which means that law-abiding clients were expected to generate a dropout
|
||||
descriptor once and cache it. However, our ATen interface is (1) stateless (so
|
||||
we can't cache the descriptors) and (2) does not accept arbitrary user types in
|
||||
its interface (so we can't pass the descriptor in). This puts us in a pickle.
|
||||
|
||||
In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which
|
||||
forgoes the expensive initialization process, and can initialize the
|
||||
descriptor with a pre-initialized state CUDA tensor. This is great, because
|
||||
it means we can simply pass in the state tensor and then initialize the
|
||||
descriptor internally. Unfortunately, this function is not available in
|
||||
cuDNN 6.
|
||||
|
||||
To work around this, we break the cuDNN abstraction barrier, and hard-code
the struct layout of the underlying dropout descriptor. With this struct,
|
||||
we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great!
|
||||
*/
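A hedged sketch (not part of this diff) of the two initialization paths the note contrasts; the handle and a GPU state buffer sized via cudnnDropoutGetStatesSize are assumed to exist:

#include <cudnn.h>

void init_dropout_descriptor(cudnnHandle_t handle, void* states, size_t state_bytes) {
  cudnnDropoutDescriptor_t desc;
  cudnnCreateDropoutDescriptor(&desc);
  // cuDNN 6 path: expensive, (re)seeds the RNG states from scratch on every call.
  cudnnSetDropoutDescriptor(desc, handle, /*dropout=*/0.5f, states, state_bytes, /*seed=*/0);
  // cuDNN 7 path: cheap, attaches RNG states that were initialized earlier.
  cudnnRestoreDropoutDescriptor(desc, handle, /*dropout=*/0.5f, states, state_bytes, /*seed=*/0);
  cudnnDestroyDropoutDescriptor(desc);
}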
|
||||
|
||||
// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization]
|
||||
struct cudnnDropoutStruct {
|
||||
float dropout;
|
||||
int nstates;
|
||||
void * states;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
// TODO: Add constructors for all of the descriptors
|
||||
@ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor
|
||||
if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
|
||||
CUDNN_CROSS_CORRELATION, mathType));
|
||||
#if CUDNN_VERSION >= 7000
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
|
||||
if(dataType == CUDNN_DATA_HALF)
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
|
||||
// See Note [cuDNN dropout descriptor initialization]
|
||||
inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
|
||||
cudnnDropoutDescriptor_t dropoutDesc,
|
||||
cudnnHandle_t handle,
|
||||
float dropout,
|
||||
void *states,
|
||||
size_t stateSizeInBytes,
|
||||
unsigned long long seed) {
|
||||
// Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends.
|
||||
// This is not entirely accurate but is good enough to catch some API
|
||||
// uses which would not be compatible in cuDNN 7. Feel free to fix
|
||||
// this if you notice something is wrong.
|
||||
if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE;
|
||||
if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE;
|
||||
size_t expectedStateSizeInBytes;
|
||||
// State size will differ depending on size of GPU
|
||||
auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes);
|
||||
if (ret != CUDNN_STATUS_SUCCESS) return ret;
|
||||
if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
|
||||
dropoutDesc->dropout = dropout;
|
||||
dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
|
||||
dropoutDesc->states = states;
|
||||
return CUDNN_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif // CUDNN_VERSION
|
||||
|
||||
struct AT_CUDA_API DropoutDescriptor
|
||||
: public Descriptor<cudnnDropoutStruct,
|
||||
&cudnnCreateDropoutDescriptor,
|
||||
@ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor
|
||||
mode,
|
||||
algo,
|
||||
datatype));
|
||||
#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000
|
||||
#if CUDA_VERSION >= 9000
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
if (prop->major >= 7) {
|
||||
if (datatype == CUDNN_DATA_HALF) {
|
||||
@ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#if CUDNN_VERSION >= 7000
|
||||
|
||||
struct AT_CUDA_API CTCLossDescriptor
|
||||
: public Descriptor<cudnnCTCLossStruct,
|
||||
&cudnnCreateCTCLossDescriptor,
|
||||
@ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
union Constant
|
||||
{
|
||||
float f;
|
||||
|
@ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
|
||||
prelu_cpu_kernel_multi_weights<scalar_t>(
|
||||
@ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
|
||||
prelu_cpu_backward_kernel_multi_weights<scalar_t>(
|
||||
|
@ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
|
||||
|
||||
// Supports arbitrary batch dimensions for self and A
|
||||
std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
|
||||
if (self.dim() <= 2 && A.dim() <= 2) {
|
||||
AT_CHECK(self.dim() >= 2,
|
||||
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
|
||||
AT_CHECK(A.dim() >= 2,
|
||||
"A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
|
||||
if (self.dim() == 2 && A.dim() == 2) {
|
||||
// TODO: #7102: It's not necessary to have gesv (single) bindings for both
|
||||
// TH and ATen. We should remove the TH gesv bindings, especially
|
||||
// since the lapackGesv function is already in ATen.
|
||||
linearSolveCheckInputs(self, A); // Checks square shape of A, and compatibility of self and A
|
||||
return at::_th_gesv_single(self, A);
|
||||
}
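A hedged usage sketch of the batched behaviour the comment above describes (shapes are illustrative, not part of this diff):

#include <ATen/ATen.h>
#include <tuple>

void gesv_batched_example() {
  auto A = at::rand({3, 5, 5});        // batch of three 5x5 systems
  auto b = at::rand({3, 5, 2});        // matching batch of right-hand sides
  at::Tensor x, lu;
  std::tie(x, lu) = at::gesv(b, A);    // x: (3, 5, 2), solving A[i] * x[i] = b[i]
}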
|
||||
|
||||
@ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) {
|
||||
}
|
||||
squareCheckInputs(self);
|
||||
|
||||
// TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors,
|
||||
// this can be simplified. Currently, we are zero-ing out values in the
|
||||
// batch of matrices by using a mask and the `where` function.
|
||||
// The simplification with batched `triu` and `tril` would be this:
|
||||
// if (upper) {
|
||||
// return raw_cholesky_output.triu();
|
||||
// } else {
|
||||
// return raw_cholesky_output.tril();
|
||||
// }
|
||||
auto raw_cholesky_output = at::_cholesky_helper(self, upper);
|
||||
int64_t n = self.size(-1);
|
||||
auto indices = at::ones({n, n}, self.options().dtype(at::kByte));
|
||||
indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self);
|
||||
return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output);
|
||||
if (upper) {
|
||||
return raw_cholesky_output.triu_();
|
||||
} else {
|
||||
return raw_cholesky_output.tril_();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
|
||||
@ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool inplace, bool upper>
|
||||
static void apply_triu_tril_single(
|
||||
scalar_t* result, scalar_t* self,
|
||||
int64_t k, int64_t n, int64_t m,
|
||||
int64_t res_row_stride, int64_t res_col_stride,
|
||||
int64_t self_row_stride, int64_t self_col_stride) {
|
||||
|
||||
constexpr int64_t zero = 0;
|
||||
int64_t i;
|
||||
|
||||
if (upper) {
|
||||
#pragma omp parallel for private(i)
|
||||
for (i = 0; i < n; i++) {
|
||||
for (int64_t j = 0; j < std::min(m, i + k); j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = 0;
|
||||
}
|
||||
if (!inplace) { // copy the rest of the self if not inplace
|
||||
for (int64_t j = std::max(zero, i + k); j < m; j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for private(i)
|
||||
for (i = 0; i < n; i++) {
|
||||
for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = 0;
|
||||
}
|
||||
if (!inplace) { // copy the rest of the self if not inplace
|
||||
for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool inplace, bool upper>
|
||||
void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) {
|
||||
auto n = self.size(-2);
|
||||
auto m = self.size(-1);
|
||||
auto self_data = self.data<scalar_t>();
|
||||
auto self_stride = self.dim() > 2 ? self.stride(-3) : 1;
|
||||
auto batchsize = batchCount(self);
|
||||
auto self_row_stride = self.stride(-2);
|
||||
auto self_column_stride = self.stride(-1);
|
||||
|
||||
auto result_data = result.data<scalar_t>();
|
||||
int64_t result_stride, result_row_stride, result_column_stride;
|
||||
if (result_data != self_data) {
|
||||
result_stride = result.dim() > 2 ? result.stride(-3) : 1;
|
||||
result_row_stride = result.stride(-2);
|
||||
result_column_stride = result.stride(-1);
|
||||
} else {
|
||||
result_stride = self_stride;
|
||||
result_row_stride = self_row_stride;
|
||||
result_column_stride = self_column_stride;
|
||||
}
|
||||
|
||||
int64_t b;
|
||||
#pragma omp parallel for private(b)
|
||||
for (b = 0; b < batchsize; b++) {
|
||||
scalar_t* self_batch = &self_data[b * self_stride];
|
||||
scalar_t* result_batch = &result_data[b * result_stride];
|
||||
apply_triu_tril_single<scalar_t, inplace, upper>(
|
||||
result_batch, self_batch, k, n, m,
|
||||
result_row_stride, result_column_stride, self_row_stride, self_column_stride);
|
||||
}
|
||||
}
|
||||
|
||||
Tensor tril(const Tensor& self, int64_t k) {
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
at::tril_out(result, self, k);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& tril_cpu_(Tensor &self, int64_t k) {
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
|
||||
apply_triu_tril<scalar_t, true, false>(self, self, k);
|
||||
});
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
|
||||
apply_triu_tril<scalar_t, false, false>(result, self_c, k);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor triu(const Tensor& self, int64_t k) {
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
at::triu_out(result, self, k);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& triu_cpu_(Tensor &self, int64_t k) {
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
|
||||
apply_triu_tril<scalar_t, true, true>(self, self, k);
|
||||
});
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
|
||||
apply_triu_tril<scalar_t, false, true>(result, self_c, k);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
}} // namespace at::native
|
||||
|
@ -378,8 +378,8 @@ at::Tensor _convolution(
|
||||
AT_CHECK(!bias.defined() || (input.type() == bias.type()),
|
||||
"Input type (", input.type().toString(), ") and bias type (", bias.type().toString(),
|
||||
") should be the same");
|
||||
|
||||
output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
|
||||
output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias,
|
||||
params.padding, params.stride, params.dilation, params.groups);
|
||||
#endif
|
||||
} else {
|
||||
if (params.groups == 1) {
|
||||
|
@ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) {
|
||||
}
|
||||
|
||||
Tensor & eq_(Tensor& self, const Tensor & other) {
|
||||
return at::_th_ge_(self, other);
|
||||
return at::_th_eq_(self, other);
|
||||
}
|
||||
|
||||
Tensor & ne_(Tensor& self, Scalar other) {
|
||||
@ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) {
|
||||
return at::_th_atan2_(self, other);
|
||||
}
|
||||
|
||||
Tensor & tril_(Tensor& self, int64_t diagonal) {
|
||||
return at::_th_tril_(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & triu_(Tensor& self, int64_t diagonal) {
|
||||
return at::_th_triu_(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & digamma_(Tensor& self) {
|
||||
return at::_th_digamma_(self);
|
||||
}
|
||||
@ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) {
|
||||
return at::_th_cross(self, other, dim);
|
||||
}
|
||||
|
||||
Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_triu_out(result, self, diagonal);
|
||||
}
|
||||
|
||||
Tensor triu(const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_triu(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_tril_out(result, self, diagonal);
|
||||
}
|
||||
|
||||
Tensor tril(const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_tril(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor trace(const Tensor & self) {
|
||||
return at::_th_trace(self);
|
||||
}
|
||||
|
@ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) {
|
||||
return batched_matrices.size(-1) * batched_matrices.size(-2);
|
||||
}
|
||||
|
||||
/* Checks a necessary property for the triu and tril implementations, hence the name.
|
||||
 * Here batch contiguity is checked for tensors with more than 3 dimensions.
 * Contiguous tensors and tensors with at most 3 dimensions pass this check.
|
||||
*/
|
||||
static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) {
|
||||
// Complete contiguity is the most desired property, which is why
|
||||
// we return true if the tensor is contiguous
|
||||
if (tensor.is_contiguous()) return true;
|
||||
|
||||
int64_t dims = tensor.dim();
|
||||
|
||||
// Tensors with dimension less than 4 are handled by default
|
||||
if (dims <= 3) return true;
|
||||
|
||||
int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
|
||||
for (int64_t i = dims - 3; i >= 0; i--) {
|
||||
if (expected_stride != tensor.stride(i)) return false;
|
||||
expected_stride *= tensor.size(i);
|
||||
}
|
||||
return true;
|
||||
}
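A short worked example of the property being checked (assuming the ATen tensor API; shapes are illustrative, not part of this diff):

#include <ATen/ATen.h>

void batch_contiguity_example() {
  auto a = at::rand({2, 3, 4, 5});   // contiguous, strides (60, 20, 5, 1): passes trivially
  auto b = a.transpose(0, 1);        // shape (3, 2, 4, 5), strides (20, 60, 5, 1)
  // The walk starts from expected_stride = 4 * 5 = 20, but b.stride(1) == 60,
  // so b fails the check and the tril/triu callers fall back to b.contiguous().
}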
|
||||
|
||||
// Returns the epsilon value for floating types except half
|
||||
static inline double _get_epsilon(const ScalarType& sc_type) {
|
||||
switch (sc_type) {
|
||||
|
@ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
|
||||
std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias,
|
||||
const Tensor& running_mean, const Tensor& running_var,
|
||||
bool train, double momentum, double eps) {
|
||||
checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
|
||||
|
||||
return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
|
||||
return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps);
|
||||
});
|
||||
|
@ -21,7 +21,6 @@ namespace native {
|
||||
|
||||
DEFINE_DISPATCH(sum_stub);
|
||||
DEFINE_DISPATCH(prod_stub);
|
||||
DEFINE_DISPATCH(norm_kernel);
|
||||
|
||||
static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
|
||||
ScalarType scalarType = self.type().scalarType();
|
||||
@ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
|
||||
return result;
|
||||
if (self.is_contiguous() && result.is_contiguous()) {
|
||||
_dimreduce_setup(result, self, dim);
|
||||
norm_kernel(kCPU, result, self, p, dim);
|
||||
if (!keepdim) {
|
||||
result.squeeze_(dim);
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return at::_th_norm_out(result, self, p, dim, keepdim);
|
||||
}
|
||||
return at::_th_norm_out(result, self, p, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
|
||||
@ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) {
|
||||
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
|
||||
"norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
|
||||
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
|
||||
if (self.is_cuda()) {
|
||||
return at::_th_norm(self, p);
|
||||
} else {
|
||||
if (self.is_contiguous()) {
|
||||
Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
|
||||
norm_kernel(kCPU, result, self, p, c10::nullopt);
|
||||
return result;
|
||||
} else {
|
||||
return at::_th_norm(self, p);
|
||||
}
|
||||
}
|
||||
return at::_th_norm(self, p);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,11 +34,11 @@ Tensor _bincount_cpu_template(
|
||||
int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L;
|
||||
nbins = std::max(nbins, minlength); // at least minlength # of bins
|
||||
|
||||
const input_t* self_p = self.contiguous().data<input_t>();
|
||||
const input_t* self_p = self.data<input_t>();
|
||||
if (has_weights) {
|
||||
output = native::zeros({nbins}, weights.options());
|
||||
weights_t* output_p = output.data<weights_t>();
|
||||
const weights_t* weights_p = weights.contiguous().data<weights_t>();
|
||||
const weights_t* weights_p = weights.data<weights_t>();
|
||||
for (int64_t i = 0; i < self.size(0); i++) {
|
||||
output_p[self_p[i]] += weights_p[i];
|
||||
}
|
||||
@ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
|
||||
return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
|
||||
const auto scalar = weights.type().scalarType();
|
||||
if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
|
||||
return _bincount_cpu_template<scalar_t, float>(self, weights, minlength);
|
||||
return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
|
||||
return _bincount_cpu_template<scalar_t, double>(
|
||||
self, weights.toType(CPU(kDouble)), minlength);
|
||||
self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const {
|
||||
}
|
||||
|
||||
void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const {
|
||||
if (range.size() == 0) {
|
||||
return;
|
||||
}
|
||||
auto strides = get_strides();
|
||||
while (strides.size() < 2 * ntensors()) {
|
||||
strides.push_back(0);
|
||||
@ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range)
|
||||
int64_t ndim = values.size();
|
||||
for (int dim = 0; dim < ndim; dim++) {
|
||||
int64_t size = shape[dim];
|
||||
values[dim] = linear_offset % size;
|
||||
linear_offset /= size;
|
||||
if (size > 0) {
|
||||
values[dim] = linear_offset % size;
|
||||
linear_offset /= size;
|
||||
}
|
||||
}
|
||||
AT_ASSERT(linear_offset == 0);
|
||||
}
|
||||
|
@ -101,14 +101,14 @@ struct PDist {
|
||||
|
||||
scalar_t * const res_start = result.data<scalar_t>();
|
||||
int64_t combs = result.numel(); // n * (n - 1) / 2
|
||||
const Vec pvec(p);
|
||||
|
||||
// We conceptually iterate over tuples of (i, j, k) where i is the first
|
||||
// vector from the input, j is the second, and k is the result index. This
|
||||
// parallelizes over the range of k and infers what i and j are from the
|
||||
// value of k.
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) {
|
||||
float n2 = n - .5;
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
double n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
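A hedged sketch (not part of this diff) checking the index algebra used here: the condensed index of the pair (i, j) with i < j is k = n*i - i*(i+1)/2 + (j - i - 1), and the kernel inverts it with the square-root expression above:

#include <cassert>
#include <cmath>
#include <cstdint>

void pdist_index_roundtrip(int64_t n) {
  int64_t k = 0;
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = i + 1; j < n; ++j, ++k) {
      double n2 = n - 0.5;
      int64_t i_rec = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
      int64_t j_rec = k - n * i_rec + i_rec * (i_rec + 1) / 2 + i_rec + 1;
      assert(i_rec == i && j_rec == j);   // same inversion the kernel relies on
    }
  }
}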
|
||||
@ -149,7 +149,7 @@ struct PDist {
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) {
|
||||
inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
|
||||
for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
|
||||
|
||||
const Vec self_vec_i = Vec::loadu(self_i, count);
|
||||
@ -177,7 +177,6 @@ struct PDist {
|
||||
const int64_t n = self.size(0);
|
||||
const int64_t m = self.size(1);
|
||||
const int64_t gs = grad.stride(0);
|
||||
const Vec pvec(p);
|
||||
|
||||
const scalar_t * const grad_start = grad.data<scalar_t>();
|
||||
const scalar_t * const dist_start = dist.data<scalar_t>();
|
||||
@ -187,17 +186,19 @@ struct PDist {
|
||||
// The only way to parallelize and avoid locking requires parallelizing
|
||||
// over the columns of the input, i.e. we compute the gradient for the
|
||||
// first section of each vector independently of the second section, etc.
|
||||
at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) {
|
||||
const scalar_t * self_l = self_start + l * Vec::size;
|
||||
scalar_t * res_l = res_start + l * Vec::size;
|
||||
at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
|
||||
for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) {
|
||||
const scalar_t * self_l = self_start + l * Vec::size();
|
||||
scalar_t * res_l = res_start + l * Vec::size();
|
||||
|
||||
for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) {
|
||||
backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs);
|
||||
}
|
||||
});
|
||||
const int64_t remainder = m % Vec::size;
|
||||
const int64_t remainder = m % Vec::size();
|
||||
if (remainder) {
|
||||
backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder);
|
||||
backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -308,7 +308,9 @@ static inline void
|
||||
mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
|
||||
const int_same_size_t<scalar_t> *offsets,
|
||||
const int_same_size_t<scalar_t> *mask, int64_t len) {
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
if (mask[i] & 0x01) {
|
||||
base_addr[offsets[i]] += src[i];
|
||||
@ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
|
||||
auto i_sw_offset = i_nw_offset + iVec(inp_sH);
|
||||
auto i_se_offset = i_sw_offset + iVec(inp_sW);
|
||||
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t c = 0; c < C; ++c) {
|
||||
auto inp_slice_C_ptr = inp_slice[c].data();
|
||||
|
||||
@ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
|
||||
// So we store the necessary vectors to temporary arrays and use the helper
|
||||
// mask_scatter_add defined above.
|
||||
|
||||
integer_t i_gInp_nw_offset_arr[iVec::size];
|
||||
integer_t i_gInp_ne_offset_arr[iVec::size];
|
||||
integer_t i_gInp_sw_offset_arr[iVec::size];
|
||||
integer_t i_gInp_se_offset_arr[iVec::size];
|
||||
integer_t i_gInp_nw_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_ne_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_sw_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_se_offset_arr[iVec::size()];
|
||||
i_gInp_nw_offset.store(i_gInp_nw_offset_arr);
|
||||
i_gInp_ne_offset.store(i_gInp_ne_offset_arr);
|
||||
i_gInp_sw_offset.store(i_gInp_sw_offset_arr);
|
||||
i_gInp_se_offset.store(i_gInp_se_offset_arr);
|
||||
|
||||
integer_t i_nw_mask_arr[iVec::size];
|
||||
integer_t i_ne_mask_arr[iVec::size];
|
||||
integer_t i_sw_mask_arr[iVec::size];
|
||||
integer_t i_se_mask_arr[iVec::size];
|
||||
integer_t i_nw_mask_arr[iVec::size()];
|
||||
integer_t i_ne_mask_arr[iVec::size()];
|
||||
integer_t i_sw_mask_arr[iVec::size()];
|
||||
integer_t i_se_mask_arr[iVec::size()];
|
||||
nw_mask.store(i_nw_mask_arr);
|
||||
ne_mask.store(i_ne_mask_arr);
|
||||
sw_mask.store(i_sw_mask_arr);
|
||||
se_mask.store(i_se_mask_arr);
|
||||
|
||||
scalar_t gInp_corner_arr[Vec::size];
|
||||
scalar_t gInp_corner_arr[Vec::size()];
|
||||
|
||||
auto gx = Vec(0), gy = Vec(0);
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t c = 0; c < C; ++c) {
|
||||
auto inp_slice_C_ptr = inp_slice[c].data();
|
||||
auto gInp_slice_C_ptr = gInp_slice[c].data();
|
||||
@ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
gx = gx * gx_mult;
gy = gy * gy_mult;

constexpr int64_t step = Vec::size;
constexpr int64_t step = Vec::size();
auto interleaved_gGrid = interleave2(gx, gy);
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
@ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
auto out_ptr = out_slice.data() + offset;
auto out_sC = out_slice.stride(0);
auto inp_slice_ptr = inp_slice.data();
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
// mask_gather zeros out the mask, so we need to make a copy
auto mask_copy = mask;
@ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>

auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous

integer_t mask_arr[iVec::size];
integer_t mask_arr[iVec::size()];
i_mask.store(mask_arr);
integer_t gInp_offset_arr[iVec::size];
integer_t gInp_offset_arr[iVec::size()];
i_gInp_offset.store(gInp_offset_arr);

#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t c = 0; c < C; ++c) {
mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
gInp_offset_arr, mask_arr, len);
@ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator(

using Vec = Vec256<scalar_t>;
using iVec = Vec256<int_same_size_t<scalar_t>>;
constexpr int64_t step = Vec::size;
constexpr int64_t step = Vec::size();

// Loop over each output pixel in grid.
// We consider the following three cases (after slicing out the batch
@ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator(
auto spatial_offset = 0;
auto i_offsets_delta = iVec(grid_sW * step);

#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t h = 0; h < out_H; h++) {
auto grid_ptr_x = grid_ptr + h * grid_sH;
auto grid_ptr_y = grid_ptr_x + grid_sCoor;
auto i_offsets = iVec::arange(0, grid_sW);
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t w = 0; w < out_W; w += step) {
auto len = std::min(step, out_W - w);
if (len < step) {
@ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) {
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a1, b1);
|
||||
auto out2 = vop(a2, b2);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op,
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
auto a = Vec(*(scalar_t*)in1_ptr);
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a, b1);
|
||||
auto out2 = vop(a, b2);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op,
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
auto b = Vec(*(scalar_t*)in2_ptr);
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a1, b);
|
||||
auto out2 = vop(a2, b);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o
|
||||
char* in_ptr = data[1];
|
||||
Vec acc[4];
|
||||
for (int j = 0; j < 4; j++) {
|
||||
acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t));
|
||||
acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t));
|
||||
}
|
||||
for (int64_t i = 1; i < n; i++) {
|
||||
const char* ptr = in_ptr + stride * i;
|
||||
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t))));
|
||||
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t))));
|
||||
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t))));
|
||||
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t))));
|
||||
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
|
||||
}
|
||||
if (reduce) {
|
||||
scalar_t buffer[Vec::size];
|
||||
scalar_t buffer[Vec::size()];
|
||||
acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
|
||||
acc[0].store(buffer);
|
||||
for (int j = 1; j < Vec::size; j++) {
|
||||
for (int j = 1; j < Vec::size(); j++) {
|
||||
buffer[0] = op(buffer[0], buffer[j]);
|
||||
}
|
||||
auto dst = (scalar_t*)out_ptr;
|
||||
*dst = op(*dst, buffer[0]);
|
||||
} else {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
auto dst = out_ptr + j * Vec::size * sizeof(scalar_t);
|
||||
auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
|
||||
acc[j] = vop(acc[j], Vec::loadu(dst));
|
||||
acc[j].store(dst);
|
||||
}
|
||||
@ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int
|
||||
template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
|
||||
VEC_HEADER(func_t)
|
||||
int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t);
|
||||
int64_t count = n / (4 * Vec::size);
|
||||
int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
int64_t count = n / (4 * Vec::size());
|
||||
if (count > 0) {
|
||||
reduction128(data, count, vector_stride, op, vop, /*reduce=*/true);
|
||||
}
|
||||
char* ptrs[3] = { data[0], data[0], data[1] };
|
||||
int64_t strides[] = { 0, 0, sizeof(scalar_t) };
|
||||
binary_loop(ptrs, strides, count * 4 * Vec::size, n, op);
|
||||
binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
|
||||
}
|
||||
|
||||
// computes the reduction out = op(out, in)
|
||||
@ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
|
||||
VEC_HEADER(func_t)
|
||||
|
||||
// reduce down each column of 4 * Vec::size elements (128 bytes)
|
||||
// reduce down each column of 4 * Vec::size() elements (128 bytes)
|
||||
int64_t outer_stride[2] = { 128, 128 };
|
||||
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] {
|
||||
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
|
||||
reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false);
|
||||
});
|
||||
|
||||
// reduce down the remaining columns
|
||||
int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
|
||||
int64_t remaining = size1 % (4 * Vec::size);
|
||||
int64_t remaining = size1 % (4 * Vec::size());
|
||||
UNARY_OUTER_LOOP(data, step, remaining, [&] {
|
||||
char* ptrs[3] = { data[0], data[0], data[1] };
|
||||
int64_t strides[] = { 0, 0, inner_stride };
|
||||
|
@ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) {
|
||||
/*identity=*/1);
|
||||
});
|
||||
}
|
||||
|
||||
static inline int64_t round_down(int64_t a, int64_t m) {
|
||||
return a - (a % m);
|
||||
}
|
||||
|
||||
template<typename scalar_t>
|
||||
struct NormReduction {
|
||||
// reduction width in number of scalar elements
|
||||
static constexpr int WIDTH = 128 / sizeof(scalar_t);
|
||||
using Vec = Vec256<scalar_t>;
|
||||
|
||||
static void apply(
|
||||
Tensor& res,
|
||||
const Tensor& self,
|
||||
Scalar p,
|
||||
c10::optional<int64_t> dim) {
|
||||
auto out_ = res.data<scalar_t>();
|
||||
auto data_ = self.data<scalar_t>();
|
||||
auto numel = self.numel();
|
||||
float pval = 0.0;
|
||||
if (p.isIntegral()){
|
||||
pval = p.to<int64_t>();
|
||||
} else if (p.isFloatingPoint()) {
|
||||
pval = p.to<float>();
|
||||
}
|
||||
if (!dim.has_value()) {
|
||||
*out_ = reduce_all(data_, numel, pval);
|
||||
return;
|
||||
}
|
||||
int64_t n = self.size(*dim);
|
||||
int64_t stride = self.stride(*dim);
|
||||
// A contiguous tensor does not need to hold a meaningful stride
|
||||
// if the corresponding size is 1
|
||||
if (n == 1) {
|
||||
stride = 1;
|
||||
for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
|
||||
stride *= self.size(i);
|
||||
}
|
||||
}
|
||||
int64_t batch = numel / n;
|
||||
parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
|
||||
for (int64_t bi = begin; bi < end; bi++) {
|
||||
int64_t b = bi / stride;
|
||||
int64_t i = bi % stride;
|
||||
const scalar_t* data = &data_[b * n * stride + i];
|
||||
out_[bi] = norm_reduce(data, n, stride, pval);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) {
|
||||
scalar_t sum = parallel_reduce(
|
||||
0,
|
||||
size,
|
||||
internal::GRAIN_SIZE,
|
||||
(scalar_t)0,
|
||||
[=](int64_t begin, int64_t end, scalar_t init) {
|
||||
const scalar_t* data = &data_[begin];
|
||||
int64_t n = end - begin;
|
||||
scalar_t result = norm_reduce(data, n, 1, pval);
|
||||
return result;
|
||||
},
|
||||
std::plus<scalar_t>());
|
||||
return sum;
|
||||
}
|
||||
|
||||
static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
int64_t n_rounded = round_down(n, WIDTH);
scalar_t result1 = norm_reduce128(data, n_rounded, pval);
scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
} else {
result = norm_reduce_sequential(data, n, stride, pval);
}
return result;
}
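norm_reduce splits the input into a WIDTH-aligned prefix handled by norm_reduce128 and a sequential tail, then recombines the two partial norms as (r1^p + r2^p)^(1/p). For p > 0 this recombination is exact; the standalone sketch below (illustrative only, not part of the diff) checks it numerically against a direct computation.

#include <cmath>
#include <cstdio>
#include <vector>

// direct L-p norm of a vector, for comparison
static double lp_norm(const std::vector<double>& v, double p) {
  double s = 0;
  for (double x : v) s += std::pow(std::abs(x), p);
  return std::pow(s, 1.0 / p);
}

int main() {
  std::vector<double> x = {1, -2, 3, 4, -5, 6, 7, -8};
  const double p = 3.0;
  // split into a "vectorized" head and a sequential tail, as norm_reduce does
  std::vector<double> head(x.begin(), x.begin() + 6), tail(x.begin() + 6, x.end());
  double combined = std::pow(std::pow(lp_norm(head, p), p) + std::pow(lp_norm(tail, p), p), 1.0 / p);
  std::printf("%.12f %.12f\n", lp_norm(x, p), combined);  // identical up to rounding
  return 0;
}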
|
||||
|
||||
static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
|
||||
scalar_t result = 0.0;
|
||||
if (pval == 0) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += (data[k * stride] != 0.0);
|
||||
}
|
||||
} else if (pval == 1) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::abs(data[k * stride]);
|
||||
}
|
||||
} else if (pval == 2) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += data[k * stride] * data[k * stride];
|
||||
}
|
||||
result = std::sqrt(result);
|
||||
} else if (pval == 3) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
|
||||
}
|
||||
result = std::pow(result, 1.0/3);
|
||||
} else if (pval == INFINITY) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
|
||||
}
|
||||
} else if (pval == -INFINITY) {
|
||||
result = INFINITY;
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result;
|
||||
}
|
||||
} else {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::pow(std::abs(data[k * stride]), pval);
|
||||
}
|
||||
result = std::pow(result, 1.0/pval);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Reduce down a column of WIDTH elements (128 bytes) with the given number n
|
||||
// n is already rounded by 128
|
||||
static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
|
||||
scalar_t result = 0.0;
|
||||
Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines)
|
||||
static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
|
||||
int64_t rows = n / WIDTH;
|
||||
if (pval == 1){
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + val.abs();
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pval == 2) {
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + val * val;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pval == 3) {
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + (val * val * val).abs();
|
||||
}
|
||||
}
|
||||
}
|
||||
scalar_t buf[WIDTH] = {0};
|
||||
for (int j = 0; j != 4; j++) {
|
||||
acc[j].store(&buf[j * Vec::size]);
|
||||
}
|
||||
for (int i = 0; i < WIDTH; i++) {
|
||||
result += buf[i];
|
||||
}
|
||||
result = std::pow(result, 1.0/pval);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
static void norm_kernel_impl(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
Scalar p,
|
||||
c10::optional<int64_t> dim) {
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
|
||||
NormReduction<scalar_t>::apply(result, self, p, dim);
|
||||
});
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
|
||||
REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
|
||||
REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
|
||||
|
||||
}} // namespace at::native
|
||||
|
@ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim(
|
||||
int64_t outer_size,
|
||||
int64_t dim_size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
|
||||
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
|
||||
int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
|
||||
if (grain_size < CHUNK_SIZE)
|
||||
grain_size = CHUNK_SIZE;
|
||||
|
@ -37,9 +37,9 @@ template <>
|
||||
int64_t _sigmoid(float* x, float* y, int64_t size) {
|
||||
using Vec = Vec256<float>;
|
||||
int64_t i = 0;
|
||||
for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
|
||||
for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
|
||||
Vec ret = Vec::loadu(y + i);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size());
|
||||
ret = ret.neg();
|
||||
ret2 = ret2.neg();
|
||||
#if defined(__AVX2__) && !defined(_MSC_VER)
|
||||
@ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) {
|
||||
ret = ret.reciprocal();
|
||||
ret2 = ret2.reciprocal();
|
||||
ret.store(x + i);
|
||||
ret2.store(x + i + Vec::size);
|
||||
ret2.store(x + i + Vec::size());
|
||||
}
|
||||
return i;
|
||||
}
|
||||
@ -63,9 +63,9 @@ template <>
|
||||
int64_t _sigmoid(double* x, double* y, int64_t size) {
|
||||
using Vec = Vec256<double>;
|
||||
int64_t i = 0;
|
||||
for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
|
||||
for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
|
||||
Vec ret = Vec::loadu(y + i);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size());
|
||||
ret = ret.neg();
|
||||
ret2 = ret2.neg();
|
||||
ret = ret.exp();
|
||||
@ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) {
|
||||
ret = ret.reciprocal();
|
||||
ret2 = ret2.reciprocal();
|
||||
ret.store(x + i);
|
||||
ret2.store(x + i + Vec::size);
|
||||
ret2.store(x + i + Vec::size());
|
||||
}
|
||||
return i;
|
||||
}
|
||||
@ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) {
|
||||
if (stridex == 1 && stridey == 1) {
|
||||
i = _sigmoid(x, y, size);
|
||||
}
|
||||
for (; i < size; i += Vec::size) {
|
||||
scalar_t buffer[Vec::size];
|
||||
int64_t width = Vec::size;
|
||||
for (; i < size; i += Vec::size()) {
|
||||
scalar_t buffer[Vec::size()];
|
||||
int64_t width = Vec::size();
|
||||
width = std::min(width, size - i);
|
||||
for (int64_t j = 0; j < width; j++) {
|
||||
buffer[j] = y[stridey * (i + j)];
|
||||
|
@ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
// config to run cuda kernel
|
||||
int64_t input_numel = input.numel();
|
||||
@ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
// config to run cuda kernel
|
||||
int64_t input_numel = input.numel();
|
||||
|
@ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool upper>
|
||||
__global__
|
||||
void triu_tril_kernel(
|
||||
scalar_t* result, scalar_t* self, int64_t k, int64_t N,
|
||||
int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride,
|
||||
int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) {
|
||||
int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (linear_idx >= N) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t self_batch_idx = blockIdx.y;
|
||||
int64_t row = linear_idx / self_ncol;
|
||||
int64_t col = linear_idx % self_ncol;
|
||||
|
||||
bool mask = upper ? (col - row >= k) : (col - row <= k);
|
||||
|
||||
// Now compute the offset for the self and result tensor
|
||||
int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride;
|
||||
int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride;
|
||||
result[res_offset] = mask ? self[self_offset] : scalar_t(0);
|
||||
}
|
||||
|
||||
template <bool upper>
|
||||
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
|
||||
int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2),
|
||||
res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1,
|
||||
res_row_stride = result.stride(-2), res_col_stride = result.stride(-1),
|
||||
self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1,
|
||||
self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
|
||||
dim3 dim_block = cuda::getApplyBlock();
|
||||
dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
|
||||
AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{
|
||||
triu_tril_kernel<scalar_t, upper>
|
||||
<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
|
||||
result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
|
||||
res_batch_stride, res_row_stride, res_col_stride,
|
||||
self_batch_stride, self_row_stride, self_col_stride, self.size(-1));
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& tril_cuda_(Tensor &self, int64_t k) {
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
return tril_cuda_out(self, self, k);
|
||||
}
|
||||
|
||||
Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
return triu_tril_cuda_template<false>(result, self_c, k, "tril");
|
||||
}
|
||||
|
||||
Tensor& triu_cuda_(Tensor &self, int64_t k) {
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
return triu_cuda_out(self, self, k);
|
||||
}
|
||||
|
||||
Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
return triu_tril_cuda_template<true>(result, self_c, k, "triu");
|
||||
}
|
||||
|
||||
}} // namespace at::native
|
||||
|
||||
#undef ALLOCATE_ARRAY
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "ATen/ATen.h"
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <THC/THCTensorMathReduce.cuh>
|
||||
#include <math.h>
|
||||
|
||||
@ -78,13 +79,13 @@ struct dists {
|
||||
};
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) {
|
||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p,
|
||||
const double n2, const double n2_squared_minus_1) {
|
||||
const int k = blockIdx.x;
|
||||
const int stride = blockDim.x;
|
||||
|
||||
float n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
|
||||
const scalar_t * const start = self + i * m;
|
||||
@ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) {
|
||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p,
|
||||
const double n2, const double n2_squared_minus_1) {
|
||||
const int k = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
@ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const
|
||||
return;
|
||||
}
|
||||
|
||||
float n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
int64_t ib = j - i - 1;
|
||||
int64_t jb = n - 2 - i;
|
||||
@ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) {
|
||||
const dim3 block(forward_threads);
|
||||
int64_t n = self.size(0);
|
||||
int64_t m = self.size(1);
|
||||
// https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
|
||||
// some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
|
||||
const double n2 = n - .5;
|
||||
const double n2_squared_minus_1 = n2 * n2 - 1;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
|
||||
if (p == 0.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (p == 1.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (p == 2.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (std::isinf(p)) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
}
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
|
||||
@ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
|
||||
const int64_t n = result.size(0);
|
||||
int64_t m = self.size(1);
|
||||
const int block_x = 64;
|
||||
const int block_y = 4;
|
||||
// NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16.
|
||||
// From binary search, block_y of 16 gives us max pdist dim0 of 1449,
|
||||
// block_y of 4 gives us max pdist dim0 of 725.
|
||||
const int block_y = 16;
|
||||
const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
|
||||
const int grid_y = (dist.numel() + block_y - 1) / block_y;
|
||||
const dim3 grid(grid_x, grid_y);
|
||||
const dim3 block(block_x, block_y);
|
||||
// https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
|
||||
// some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
|
||||
const double n2 = n - .5;
|
||||
const double n2_squared_minus_1 = n2 * n2 - 1;
|
||||
|
||||
Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
|
||||
if (p == 1.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (p < 2.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (p == 2.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (std::isinf(p)) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
}
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
at::sum_out(result, buffer, 0);
|
||||
}
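Both pdist kernels above recover the row pair (i, j) from the flat index k of the condensed distance matrix, doing that arithmetic in fp64 per the comment referencing issue #15511. A minimal standalone sketch (plain C++, illustrative names only, not part of the diff) that round-trips the mapping used by pdist_kernel_cuda_impl and pdist_backward_kernel_cuda_impl:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t n = 100;                           // rows of the pdist input
  const double n2 = n - 0.5;
  const double n2_squared_minus_1 = n2 * n2 - 1;   // the -1 guards against fp truncation
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = i + 1; j < n; ++j) {
      // condensed index of the pair (i, j), 0 <= i < j < n
      int64_t k = i * (2 * n - i - 1) / 2 + (j - i - 1);
      // inversion as in the kernels above
      int64_t ri = static_cast<int64_t>(n2 - std::sqrt(n2_squared_minus_1 - 2 * k));
      int64_t rj = k - n * ri + ri * (ri + 1) / 2 + ri + 1;
      assert(ri == i && rj == j);
    }
  }
  return 0;
}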
|
||||
|
@ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind
|
||||
|
||||
default:
|
||||
AT_ERROR(
|
||||
"Unknown mode for embedding_bag_backward_cuda %d", mode);
|
||||
"Unknown mode for embedding_bag_backward_cuda ", mode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
|
||||
+ log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime];
|
||||
|
||||
log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb;
|
||||
} else if ((s < 2*max_target_length+1) || (t >= input_length)) {
|
||||
} else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) {
|
||||
log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf;
|
||||
}
|
||||
}
|
||||
@ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
|
||||
if (targets.type().scalarType() == kLong) {
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
} else {
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_
|
||||
const Tensor& running_mean_, const Tensor& running_var_,
|
||||
bool train, double momentum, double epsilon) {
|
||||
|
||||
TensorArg input_arg{ input_, "input", 1 },
|
||||
weight_arg{ weight_, "weight", 2 },
|
||||
bias_arg{ bias_, "bias", 3 },
|
||||
run_mean_arg{ running_mean_, "running_mean", 4 },
|
||||
run_var_arg{ running_var_, "running_var", 5 };
|
||||
CheckedFrom c = "batch_norm_cuda";
|
||||
checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg});
|
||||
|
||||
using accscalar_t = at::acc_type<scalar_t, true>;
|
||||
int64_t n_input = input_.size(1);
|
||||
Tensor save_mean_;
|
||||
|
@ -7,28 +7,13 @@
|
||||
#include <tuple>
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/scan.h>
|
||||
#include <thrust/scatter.h>
|
||||
|
||||
namespace at {
|
||||
namespace native{
|
||||
|
||||
namespace {
|
||||
template <typename scalar_t>
|
||||
__global__ void inverse_indices_kernel(
|
||||
const scalar_t* input_data,
|
||||
const scalar_t* output_data,
|
||||
int64_t* inverse_indices_data,
|
||||
int64_t num_inp,
|
||||
int64_t num_out) {
|
||||
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int64_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (int64_t i = idx; i < num_inp * num_out; i += stride) {
|
||||
if (input_data[i / num_out] == output_data[i % num_out]){
|
||||
inverse_indices_data[i / num_out] = i % num_out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename scalar_t>
|
||||
std::tuple<Tensor, Tensor> _unique_cuda_template(
|
||||
@ -47,25 +32,29 @@ template <typename scalar_t>
|
||||
Tensor output = input.clone();
|
||||
output = output.view(-1);
|
||||
scalar_t* output_data = output.data<scalar_t>();
|
||||
thrust::sort(policy, output_data, output_data + num_inp);
|
||||
scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp);
|
||||
int64_t num_out = output_end - output_data;
|
||||
output.resize_(num_out);
|
||||
|
||||
Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
|
||||
|
||||
if (return_inverse) {
|
||||
inverse_indices.resize_(input.sizes());
|
||||
int64_t* inverse_indices_data = inverse_indices.data<int64_t>();
|
||||
int block = 512;
|
||||
int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L);
|
||||
inverse_indices_kernel<<<grid, block, 0, stream>>>(
|
||||
input_data, output_data, inverse_indices_data, num_inp, num_out);
|
||||
Tensor inverse_indices;
|
||||
if (!return_inverse) {
|
||||
inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
|
||||
thrust::sort(policy, output_data, output_data + num_inp);
|
||||
} else {
|
||||
Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong));
|
||||
int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>();
|
||||
thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr);
|
||||
Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong));
|
||||
inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong));
|
||||
int64_t* inv_loc_ptr = inv_loc.data<int64_t>();
|
||||
int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>();
|
||||
thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }});
|
||||
inv_loc[0] = 0;
|
||||
thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
|
||||
thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
|
||||
inverse_indices.resize_(input.sizes());
|
||||
}
|
||||
int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data;
|
||||
output.resize_(num_out);
|
||||
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return std::tuple<Tensor, Tensor>(output, inverse_indices);
|
||||
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
|
@ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> {
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
|
||||
#if CUDNN_VERSION >= 6000
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
|
||||
#endif
|
||||
};
|
||||
// NOTE: - 1 because ALGO_WINOGRAD is not implemented
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
|
||||
@ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
|
||||
THCCachingAllocator_emptyCache();
|
||||
}
|
||||
|
||||
|
||||
//hot fix for #16610
|
||||
//specializing algorithm_search would be cleaner, as it is specialized already, but that would require also specializing getBestAlgorithm for bwdData,
|
||||
//adding "strided" argument, so in the end this looks simpler.
|
||||
template<>
|
||||
void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) {
|
||||
using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>;
|
||||
auto& cache = search::cache();
|
||||
|
||||
if (cache.find(args.params, algo)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (args.params.deterministic && !benchmark) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
return;
|
||||
}
|
||||
|
||||
int stride_dim = args.input.dim() - 2;
|
||||
bool strided = false;
|
||||
for (int i = 0; i< stride_dim; i++) {
|
||||
if (args.params.stride[i] != 1) {
|
||||
strided = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!benchmark) {
|
||||
search::getAlgorithm(args, algo);
|
||||
if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (cache.find(args.params, algo)) {
|
||||
// re-check cache since another thread may have benchmarked the algorithm
|
||||
return;
|
||||
}
|
||||
|
||||
auto perfResults = search::findAlgorithm(args);
|
||||
// for deterministic algo, look at all the perf results and return the best
|
||||
// deterministic algo
|
||||
if (perfResults.status == CUDNN_STATUS_SUCCESS &&
|
||||
!(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
|
||||
*algo = perfResults.algo;
|
||||
} else {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
cache.insert(args.params, *algo);
|
||||
|
||||
// Free the cached blocks in our caching allocator. They are
|
||||
// needed here because the above benchmarking uses a huge amount of memory,
|
||||
// e.g. a few GBs.
|
||||
THCCachingAllocator_emptyCache();
|
||||
}
|
||||
|
||||
|
||||
template<typename algo_t>
|
||||
Workspace chooseAlgorithm(
|
||||
const ConvolutionArgs& args,
|
||||
@ -848,19 +907,9 @@ Tensor cudnn_convolution_forward(
|
||||
// See #4500
|
||||
Tensor weight_contig = weight->contiguous();
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_forward_out(
|
||||
narrowGroup(*output, output_channels_dim, i, groups),
|
||||
narrowGroup(*input, input_channels_dim, i, groups),
|
||||
narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
|
||||
padding, stride, dilation, 1, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_forward_out(
|
||||
*output, *input, weight_contig,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return *output;
|
||||
}
|
||||
@ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input(
|
||||
// See #4500
|
||||
Tensor weight_contig = weight->contiguous();
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_backward_input_out(
|
||||
narrowGroup(*grad_input, input_channels_dim, i, groups),
|
||||
narrowGroup(*grad_output, output_channels_dim, i, groups),
|
||||
narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
|
||||
padding, stride, dilation, 1, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_backward_input_out(
|
||||
*grad_input, *grad_output, weight_contig,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return *grad_input;
|
||||
}
|
||||
@ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight(
|
||||
TensorArg grad_weight{ grad_weight_t, "result", 0 };
|
||||
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_backward_weight_out(
|
||||
narrowGroup(*grad_weight, weight_output_channels_dim, i, groups),
|
||||
narrowGroup(*grad_output, output_channels_dim, i, groups),
|
||||
narrowGroup(*input, input_channels_dim, i, groups),
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_backward_weight_out(
|
||||
*grad_weight, *grad_output, *input,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return grad_weight_t;
|
||||
}
|
||||
|
@ -7,7 +7,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000)
|
||||
#if !AT_CUDNN_ENABLED()
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
|
@ -375,7 +375,7 @@ namespace {
|
||||
case CUDNN_RNN_TANH:
|
||||
return 2;
|
||||
default:
|
||||
AT_ERROR("unknown cuDNN RNN mode %d", mode);
|
||||
AT_ERROR("unknown cuDNN RNN mode ", mode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2555,9 +2555,15 @@
|
||||
|
||||
- func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: tril_cpu_
|
||||
CUDA: tril_cuda_
|
||||
|
||||
- func: triu_(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: triu_cpu_
|
||||
CUDA: triu_cuda_
|
||||
|
||||
- func: digamma_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
@ -2658,11 +2664,17 @@
|
||||
variants: method, function
|
||||
|
||||
- func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
|
||||
dispatch:
|
||||
CPU: triu_cpu_out
|
||||
CUDA: triu_cuda_out
|
||||
|
||||
- func: triu(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method, function
|
||||
|
||||
- func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
|
||||
dispatch:
|
||||
CPU: tril_cpu_out
|
||||
CUDA: tril_cuda_out
|
||||
|
||||
- func: tril(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method, function
|
||||
|
@ -11,18 +11,4 @@ using namespace at::native;
|
||||
TEST(CUDNNTest, CUDNNTestCUDA) {
|
||||
if (!at::cuda::is_available()) return;
|
||||
manual_seed(123);
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
auto handle = getCudnnHandle();
|
||||
DropoutDescriptor desc1, desc2;
|
||||
desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte));
|
||||
desc2.set(handle, 0.5, desc1.state);
|
||||
bool isEQ;
|
||||
isEQ = (desc1.desc()->dropout == desc2.desc()->dropout);
|
||||
ASSERT_TRUE(isEQ);
|
||||
isEQ = (desc1.desc()->nstates == desc2.desc()->nstates);
|
||||
ASSERT_TRUE(isEQ);
|
||||
isEQ = (desc1.desc()->states == desc2.desc()->states);
|
||||
ASSERT_TRUE(isEQ);
|
||||
#endif
|
||||
}
|
||||
|
@ -3,6 +3,8 @@ find_package(ATen REQUIRED)
|
||||
include_directories(${ATEN_INCLUDE_DIR})
|
||||
|
||||
# C++11
|
||||
set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
|
||||
if (not MSVC)
|
||||
set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
|
||||
endif()
|
||||
add_executable(main main.cpp)
|
||||
target_link_libraries(main ${ATEN_LIBRARIES})
|
||||
|
@ -247,10 +247,13 @@
|
||||
|
||||
#ifdef _OPENMP
|
||||
|
||||
#ifndef _WIN32
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#ifdef _WIN32
|
||||
// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
|
||||
#define PRAGMA_LOOP(P) // Noop
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#else
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#define PRAGMA_LOOP(P) _Pragma(#P)
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#endif
|
||||
|
||||
#include <omp.h>
|
||||
@ -369,7 +372,7 @@
|
||||
TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
|
||||
ptrdiff_t iter = 0; \
|
||||
if(tp != (TYPE2*)rp) { \
|
||||
PRAGMA(ivdep) \
|
||||
PRAGMA_LOOP(ivdep) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \
|
||||
for (iter = 0; iter < SIZE; iter++) { \
|
||||
TYPE2 *TENSOR2##_data = tp+iter; \
|
||||
@ -377,7 +380,7 @@
|
||||
CODE \
|
||||
}\
|
||||
} else {\
|
||||
PRAGMA(simd) \
|
||||
PRAGMA_LOOP(simd) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE2* TENSOR2##_data = tp+iter;\
|
||||
@ -449,7 +452,7 @@
|
||||
TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset(); \
|
||||
ptrdiff_t iter = 0;\
|
||||
if(tp != (TYPE2*)rp) { \
|
||||
PRAGMA(ivdep) \
|
||||
PRAGMA_LOOP(ivdep) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE1 *TENSOR1##_data = rp+iter;\
|
||||
@ -458,7 +461,7 @@
|
||||
CODE \
|
||||
} \
|
||||
} else {\
|
||||
PRAGMA(simd) \
|
||||
PRAGMA_LOOP(simd) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE1 *TENSOR1##_data = rp+iter;\
|
||||
|
@ -13,10 +13,13 @@
|
||||
|
||||
#ifdef _OPENMP
|
||||
|
||||
#ifndef _WIN32
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#ifdef _WIN32
|
||||
// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
|
||||
#define PRAGMA_LOOP(P) // Noop
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#else
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#define PRAGMA_LOOP(P) _Pragma(#P)
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#endif
|
||||
|
||||
#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
|
||||
|
@ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
|
||||
int free_b = 0;
|
||||
if (a == NULL) a = ra_;
|
||||
if (b == NULL) b = rb_;
|
||||
THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
|
||||
a->dim());
|
||||
THArgCheck(!a->is_empty(), 2, "A should not be empty");
|
||||
THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
|
||||
"dimensions, but has %d", b->dim());
|
||||
THArgCheck(!b->is_empty(), 2, "B should not be empty");
|
||||
THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld",
|
||||
a->size(0), a->size(1));
|
||||
THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld "
|
||||
"rows, B has %ld", a->size(0), b->size(0));
|
||||
|
||||
if (b->dim() == 1) {
|
||||
b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
|
||||
b->stride(0), 1, 0);
|
||||
free_b = 1;
|
||||
}
|
||||
|
||||
int n, nrhs, lda, ldb, info;
|
||||
THIntTensor *ipiv;
|
||||
@ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
|
||||
THTensor_(freeCopyTo)(ra__, ra_);
|
||||
THTensor_(freeCopyTo)(rb__, rb_);
|
||||
THIntTensor_free(ipiv);
|
||||
if (free_b) c10::raw::intrusive_ptr::decref(b);
|
||||
}
|
||||
|
||||
void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
|
||||
|
@ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n
|
||||
|
||||
TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
|
||||
TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
|
||||
TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k);
|
||||
TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
|
||||
TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
|
||||
TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
|
||||
|
@ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
|
||||
REAL_SWAP(ARR(III), ARR(JJJ)); \
|
||||
LONG_SWAP(IDX(III), IDX(JJJ))
|
||||
|
||||
/* Emulate NumPy behavior of putting NaNs
|
||||
* at the end of an ascending list. */
|
||||
#define GT_OR_NAN(x, y) \
|
||||
((x != x && y == y) || (x > y))
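GT_OR_NAN relies on NaN comparing false against everything: a NaN left operand is treated as greater than any non-NaN value, so ascending sorts push NaNs to the end, matching NumPy. A small standalone illustration (not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

#define GT_OR_NAN(x, y) \
  ((x != x && y == y) || (x > y))

int main() {
  std::vector<double> v = {3.0, NAN, 1.0, 2.0};
  // "a comes before b" when b is greater-or-NaN relative to a
  std::sort(v.begin(), v.end(), [](double a, double b) { return GT_OR_NAN(b, a); });
  for (double x : v) std::printf("%g ", x);  // prints: 1 2 3 nan
  std::printf("\n");
  return 0;
}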
|
||||
|
||||
static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
|
||||
{
|
||||
int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
|
||||
@ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
/* Use median of three for pivot choice */
|
||||
P=(L+R)>>1;
|
||||
BOTH_SWAP(P, L+1);
|
||||
if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
|
||||
if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
|
||||
if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
|
||||
if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
|
||||
if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
|
||||
if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
|
||||
|
||||
i = L+1; j = R; piv = ARR(L); pid = IDX(L);
|
||||
|
||||
do {
|
||||
do { i = i+1; } while(ARR(i) < piv);
|
||||
do { j = j-1; } while(ARR(j) > piv);
|
||||
do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
|
||||
do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
|
||||
if (j < i)
|
||||
break;
|
||||
BOTH_SWAP(i, j);
|
||||
@ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
} /* while not done */
|
||||
/* Now insertion sort on the concatenation of subfiles */
|
||||
for(i=elements-2; i>=0; i--) {
|
||||
if (ARR(i) > ARR(i+1)) {
|
||||
if (GT_OR_NAN(ARR(i),ARR(i+1))) {
|
||||
piv = ARR(i);
|
||||
pid = IDX(i);
|
||||
j = i+1;
|
||||
@ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
ARR(j-1) = ARR(j);
|
||||
IDX(j-1) = IDX(j);
|
||||
j = j+1;
|
||||
} while(j < elements && ARR(j) < piv);
|
||||
} while(j < elements && GT_OR_NAN(piv, ARR(j)));
|
||||
ARR(j-1) = piv;
|
||||
IDX(j-1) = pid;
|
||||
}
|
||||
@ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
/* Use median of three for pivot choice */
|
||||
P=(L+R)>>1;
|
||||
BOTH_SWAP(P, L+1);
|
||||
if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
|
||||
if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
|
||||
if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
|
||||
if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
|
||||
if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
|
||||
if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
|
||||
|
||||
i = L+1; j = R; piv = ARR(L); pid = IDX(L);
|
||||
|
||||
do {
|
||||
do { i = i+1; } while(ARR(i) > piv);
|
||||
do { j = j-1; } while(ARR(j) < piv);
|
||||
do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
|
||||
do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
|
||||
if (j < i)
|
||||
break;
|
||||
BOTH_SWAP(i, j);
|
||||
@ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
} /* while not done */
|
||||
/* Now insertion sort on the concatenation of subfiles */
|
||||
for(i=elements-2; i>=0; i--) {
|
||||
if (ARR(i) < ARR(i+1)) {
|
||||
if (GT_OR_NAN(ARR(i+1), ARR(i))) {
|
||||
piv = ARR(i);
|
||||
pid = IDX(i);
|
||||
j = i+1;
|
||||
@ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
ARR(j-1) = ARR(j);
|
||||
IDX(j-1) = IDX(j);
|
||||
j = j+1;
|
||||
} while(j < elements && ARR(j) > piv);
|
||||
} while(j < elements && GT_OR_NAN(ARR(j), piv));
|
||||
ARR(j-1) = piv;
|
||||
IDX(j-1) = pid;
|
||||
}
|
||||
@ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i
|
||||
THLongTensor_free(tmpIndices);
|
||||
}
|
||||
|
||||
void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k)
|
||||
{
|
||||
int64_t t_size_0, t_size_1;
|
||||
int64_t t_stride_0, t_stride_1;
|
||||
int64_t r__stride_0, r__stride_1;
|
||||
scalar_t *t_data, *r__data;
|
||||
int64_t r, c;
|
||||
|
||||
THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
|
||||
|
||||
THTensor_(resizeAs)(r_, t);
|
||||
|
||||
t_size_0 = THTensor_(size)(t, 0);
|
||||
t_size_1 = THTensor_(size)(t, 1);
|
||||
t_stride_0 = THTensor_(stride)(t, 0);
|
||||
t_stride_1 = THTensor_(stride)(t, 1);
|
||||
r__stride_0 = THTensor_(stride)(r_, 0);
|
||||
r__stride_1 = THTensor_(stride)(r_, 1);
|
||||
r__data = r_->data<scalar_t>();
|
||||
t_data = t->data<scalar_t>();
|
||||
|
||||
for(r = 0; r < t_size_0; r++)
|
||||
{
|
||||
int64_t sz = THMin(r+k+1, t_size_1);
|
||||
for(c = THMax(0, r+k+1); c < t_size_1; c++)
|
||||
r__data[r*r__stride_0+c*r__stride_1] = 0;
|
||||
for(c = 0; c < sz; c++)
|
||||
r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
|
||||
}
|
||||
}
|
||||
|
||||
void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
|
||||
{
|
||||
int64_t t_size_0, t_size_1;
|
||||
|
@ -6,17 +6,17 @@
|
||||
#include "THCNumerics.cuh"
|
||||
|
||||
// Collection of kernel sort routines
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct LTComp {
|
||||
__device__ inline bool operator()(const T& a, const T& b) const {
|
||||
return THCNumerics<T>::lt(a, b);
|
||||
return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct GTComp {
|
||||
__device__ inline bool operator()(const T& a, const T& b) const {
|
||||
return THCNumerics<T>::gt(a, b);
|
||||
return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ int binarySearchForMultinomial(T* dist,
|
||||
__device__ int binarySearchForMultinomial(T* cumdist,
|
||||
T* dist,
|
||||
int size,
|
||||
T val) {
|
||||
int start = 0;
|
||||
int end = size;
|
||||
// dist[size - 1] = 0 => all zero prob dist
|
||||
assert(THCNumerics<T>::gt(dist[size - 1], 0));
|
||||
// cumdist[size - 1] = 0 => all zero prob dist
|
||||
assert(THCNumerics<T>::gt(cumdist[size - 1], 0));
|
||||
|
||||
while (end - start > 0) {
|
||||
int mid = start + (end - start) / 2;
|
||||
|
||||
T midVal = dist[mid];
|
||||
T midVal = cumdist[mid];
|
||||
if (THCNumerics<T>::lt(midVal, val)) {
|
||||
start = mid + 1;
|
||||
} else {
|
||||
@ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist,
|
||||
start = size - 1;
|
||||
}
|
||||
|
||||
T curVal = dist[start];
|
||||
while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
|
||||
T curVal = cumdist[start];
|
||||
while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--;
|
||||
|
||||
return start;
|
||||
}
|
||||
@ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
|
||||
int64_t* dest,
|
||||
int64_t distributions,
|
||||
int categories,
|
||||
T* normDistPrefixSum) {
|
||||
T* normDistPrefixSum,
|
||||
T* normDist) {
|
||||
// At the moment, each warp computes one sample value in the binary
|
||||
// search due to divergence. It seems possible to compute multiple
|
||||
// values and limit divergence though later on. However, no matter
|
||||
@ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
|
||||
// Find the bucket that a uniform sample lies in
|
||||
int choice = binarySearchForMultinomial<T>(
|
||||
normDistPrefixSum + curDist * categories,
|
||||
normDist + curDist * categories,
|
||||
categories,
|
||||
r);
|
||||
|
||||
@ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
|
||||
// Find the bucket that a uniform sample lies in
|
||||
int choice = binarySearchForMultinomial<T>(
|
||||
normDistPrefixSum + curDist * categories,
|
||||
origDist + curDist * categories,
|
||||
categories,
|
||||
r);
|
||||
|
||||
|
@ -15,17 +15,17 @@
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct ThrustGTOp {
|
||||
__device__ bool operator()(const T& lhs, const T& rhs) const {
|
||||
return THCNumerics<T>::gt(lhs, rhs);
|
||||
return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct ThrustLTOp {
|
||||
__device__ bool operator()(const T& lhs, const T& rhs) const {
|
||||
return THCNumerics<T>::lt(lhs, rhs);
|
||||
return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
|
||||
void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
|
||||
{
|
||||
#ifdef USE_MAGMA
|
||||
THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
|
||||
THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional");
|
||||
THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square");
|
||||
THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible");
|
||||
|
||||
int64_t n = a_->size(0);
|
||||
int64_t nrhs = b_->size(1);
|
||||
|
||||
|
@ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
|
||||
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
|
||||
}
|
||||
} else {
|
||||
THCTensor_(resizeAs)(state, self_, src_);
|
||||
|
||||
if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) {
|
||||
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
|
||||
|
@ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state,
|
||||
n_sample,
|
||||
THCudaLongTensor_data(state, self),
|
||||
numDist, numCategories,
|
||||
THCTensor_(data)(state, prefixSum));
|
||||
THCTensor_(data)(state, prefixSum),
|
||||
THCTensor_(data)(state, normDist));
|
||||
} else {
|
||||
// Sample without replacement
|
||||
|
||||
|
@ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
dim3 block(blockSize); \
|
||||
\
|
||||
if (dir) { \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \
|
||||
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
|
||||
keyInfo, \
|
||||
keySlices, \
|
||||
@ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
(TYPE) keyInfo.strides[collapseKeyDim], \
|
||||
valueInfo, \
|
||||
(TYPE) valueInfo.strides[collapseValueDim], \
|
||||
GTComp<scalar_t>()); \
|
||||
GTComp<scalar_t, true>()); \
|
||||
} else { \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \
|
||||
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
|
||||
keyInfo, \
|
||||
keySlices, \
|
||||
@ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
(TYPE) keyInfo.strides[collapseKeyDim], \
|
||||
valueInfo, \
|
||||
(TYPE) valueInfo.strides[collapseValueDim], \
|
||||
LTComp<scalar_t>()); \
|
||||
LTComp<scalar_t, true>()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state,
|
||||
#if CUDA_VERSION >= 7000
|
||||
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
|
||||
#endif
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>());
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>());
|
||||
} else {
|
||||
thrust::stable_sort_by_key(
|
||||
#if CUDA_VERSION >= 7000
|
||||
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
|
||||
#endif
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>());
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>());
|
||||
}
|
||||
|
||||
// Then, re-sort according to slice that each index is
|
||||
|
12
c10/Half.h
12
c10/Half.h
@ -383,6 +383,14 @@ struct Converter<
|
||||
}
|
||||
};
|
||||
|
||||
// In some versions of MSVC, there will be a compiler error when building.
|
||||
// C4146: unary minus operator applied to unsigned type, result still unsigned
|
||||
// It can be addressed by disabling the following warning.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4146 )
|
||||
#endif
|
||||
|
||||
// skip isnan and isinf check for integral types
|
||||
template <typename To, typename From>
|
||||
typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
|
||||
@ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning( pop )
|
||||
#endif
|
||||
|
||||
template <typename To, typename From>
|
||||
typename std::enable_if<std::is_floating_point<From>::value, bool>::type
|
||||
overflows(From f) {
|
||||
|
@ -11,9 +11,11 @@ using c10::intrusive_ptr_target;
|
||||
using c10::make_intrusive;
|
||||
using c10::weak_intrusive_ptr;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wself-move"
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
class SomeClass0Parameters : public intrusive_ptr_target {};
|
||||
|
@ -25,7 +25,7 @@ Error::Error(
|
||||
// Caffe2-style error message
|
||||
Error::Error(
|
||||
const char* file,
|
||||
const int line,
|
||||
const uint32_t line,
|
||||
const char* condition,
|
||||
const std::string& msg,
|
||||
const std::string& backtrace,
|
||||
|
@ -49,7 +49,7 @@ class C10_API Error : public std::exception {
|
||||
Error(SourceLocation source_location, const std::string& msg);
|
||||
Error(
|
||||
const char* file,
|
||||
const int line,
|
||||
const uint32_t line,
|
||||
const char* condition,
|
||||
const std::string& msg,
|
||||
const std::string& backtrace,
|
||||
@ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e);
|
||||
// TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if
|
||||
// not met.
|
||||
|
||||
// In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t),
|
||||
// which is different from the definition of `SourceLocation` that requires
|
||||
// unsigned int (a.k.a uint32_t) and may cause a compile error with the message:
|
||||
// error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion
|
||||
// Here the static cast is used to pass the build.
|
||||
|
||||
#define AT_ERROR(...) \
|
||||
throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
|
||||
throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
|
||||
|
||||
#define AT_WARN(...) \
|
||||
::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
|
||||
::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
|
||||
|
||||
#define AT_ASSERT(cond) \
|
||||
if (!(cond)) { \
|
||||
|
@ -17,9 +17,10 @@
|
||||
#include <utility>
|
||||
#include <type_traits>
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wshadow"
|
||||
|
||||
#endif
|
||||
#ifdef _MSC_VER
|
||||
#define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
|
||||
#else
|
||||
@ -1457,4 +1458,6 @@ namespace ska
|
||||
|
||||
} // end namespace ska
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
@ -72,18 +72,27 @@ class C10_API intrusive_ptr_target {
|
||||
// We also have to disable -Wunknown-warning-option and -Wpragmas, because
|
||||
// some other compilers don't know about -Wterminate or -Wexceptions and
|
||||
// will show a warning about unknown warning options otherwise.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wterminate"
|
||||
#pragma GCC diagnostic ignored "-Wexceptions"
|
||||
#ifdef _MSC_VER
|
||||
# pragma warning(push)
|
||||
# pragma warning(disable: 4297) // function assumed not to throw an exception but does
|
||||
#else
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wpragmas"
|
||||
# pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
# pragma GCC diagnostic ignored "-Wterminate"
|
||||
# pragma GCC diagnostic ignored "-Wexceptions"
|
||||
#endif
|
||||
AT_ASSERTM(
|
||||
refcount_.load() == 0,
|
||||
"Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it");
|
||||
AT_ASSERTM(
|
||||
weakcount_.load() == 0,
|
||||
"Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
|
||||
#pragma GCC diagnostic pop
|
||||
#ifdef _MSC_VER
|
||||
# pragma warning(pop)
|
||||
#else
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
|
||||
|
@ -430,12 +430,16 @@ class C10_API TypeMeta {
|
||||
// variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be
|
||||
// disabled for compilers that don't know '-Wundefined-var-template' and
|
||||
// would error at our attempt to disable it.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wundefined-var-template"
|
||||
#ifndef _MSC_VER
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wpragmas"
|
||||
# pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
# pragma GCC diagnostic ignored "-Wundefined-var-template"
|
||||
#endif
|
||||
return TypeMeta(_typeMetaDataInstance<T>());
|
||||
#pragma GCC diagnostic pop
|
||||
#ifndef _MSC_VER
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY)
|
||||
else()
|
||||
target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
|
||||
endif()
|
||||
|
||||
#cmake only check for separate OpenMP library on AppleClang 7+
|
||||
#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
|
||||
if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
|
||||
target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
target_link_libraries(caffe2 PUBLIC c10)
|
||||
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
|
||||
@ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
|
||||
# Set standard properties on the target
|
||||
torch_set_target_props(caffe2)
|
||||
|
||||
if (MSVC)
|
||||
target_compile_options(caffe2 INTERFACE "-std=c++11")
|
||||
else()
|
||||
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
|
||||
if (NOT MSVC)
|
||||
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
|
||||
endif()
|
||||
|
||||
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
|
||||
|
@ -93,7 +93,7 @@ using std::vector;
|
||||
#define CAFFE2_NORETURN __attribute__((noreturn))
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#if (defined _MSC_VER && !defined NOMINMAX)
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
|
||||
|
@ -1,5 +1,8 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
from caffe2.proto import caffe2_pb2
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
# TODO: refactor & remove the following alias
|
||||
caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU
|
||||
caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA
|
||||
@ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
|
||||
caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
|
||||
caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
|
||||
caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
|
||||
|
||||
if platform.system() == 'Windows':
|
||||
IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ])
|
||||
|
||||
if IS_CONDA:
|
||||
from ctypes import windll, c_wchar_p
|
||||
from ctypes.wintypes import DWORD, HMODULE
|
||||
|
||||
AddDllDirectory = windll.kernel32.AddDllDirectory
|
||||
AddDllDirectory.restype = DWORD
|
||||
AddDllDirectory.argtypes = [c_wchar_p]
|
||||
|
||||
def add_extra_dll_dir(extra_dll_dir):
|
||||
if os.path.isdir(extra_dll_dir):
|
||||
os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH']
|
||||
|
||||
if IS_CONDA:
|
||||
AddDllDirectory(extra_dll_dir)
|
||||
|
||||
# first get nvToolsExt PATH
|
||||
def get_nvToolsExt_path():
|
||||
NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')
|
||||
|
||||
if os.path.exists(NVTOOLEXT_HOME):
|
||||
return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
|
||||
else:
|
||||
return ''
|
||||
|
||||
py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin')
|
||||
th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
|
||||
th_dll_path = os.path.join(th_root, 'lib')
|
||||
|
||||
dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()]
|
||||
|
||||
# then add the path to env
|
||||
for p in dll_paths:
|
||||
add_extra_dll_dir(p)
|
||||
|
@ -628,37 +628,12 @@ endif()
|
||||
|
||||
# ---[ OpenMP
|
||||
if(USE_OPENMP)
|
||||
set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
|
||||
if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
|
||||
string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
|
||||
message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
|
||||
if(DARWIN_VERSION GREATER 9)
|
||||
set(APPLE_OPENMP_SUCKS 1)
|
||||
endif(DARWIN_VERSION GREATER 9)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
|
||||
OUTPUT_VARIABLE GCC_VERSION)
|
||||
if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
|
||||
message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
|
||||
message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
|
||||
add_compile_options(-Wno-unknown-pragmas)
|
||||
set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_OPENMP AND NOT CHECKED_OPENMP)
|
||||
find_package(OpenMP)
|
||||
set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
|
||||
|
||||
# OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
|
||||
# see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
|
||||
set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
|
||||
endif()
|
||||
|
||||
find_package(OpenMP)
|
||||
if(OPENMP_FOUND)
|
||||
message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
|
||||
else()
|
||||
message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
|
||||
caffe2_update_option(USE_OPENMP OFF)
|
||||
@ -690,7 +665,12 @@ if(USE_CUDA)
|
||||
caffe2_update_option(USE_NVRTC OFF)
|
||||
endif()
|
||||
if(CAFFE2_USE_CUDNN)
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
IF(CUDNN_STATIC_LINKAGE)
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
ELSE()
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
ENDIF()
|
||||
else()
|
||||
caffe2_update_option(USE_CUDNN OFF)
|
||||
endif()
|
||||
@ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
|
||||
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
|
||||
ENDIF()
|
||||
|
||||
# OpenMP support?
|
||||
SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
|
||||
IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
|
||||
STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
|
||||
MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
|
||||
IF (DARWIN_VERSION GREATER 9)
|
||||
SET(APPLE_OPENMP_SUCKS 1)
|
||||
ENDIF (DARWIN_VERSION GREATER 9)
|
||||
EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
|
||||
OUTPUT_VARIABLE GCC_VERSION)
|
||||
IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
|
||||
MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
|
||||
MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
|
||||
add_compile_options(-Wno-unknown-pragmas)
|
||||
SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF (WITH_OPENMP AND NOT CHECKED_OPENMP)
|
||||
FIND_PACKAGE(OpenMP)
|
||||
SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
|
||||
|
||||
# OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
|
||||
# see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
|
||||
SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
|
||||
ENDIF()
|
||||
|
||||
IF (OPENMP_FOUND)
|
||||
MESSAGE(STATUS "Compiling with OpenMP support")
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
ENDIF()
|
||||
|
||||
|
||||
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
||||
|
||||
FIND_PACKAGE(MAGMA)
|
||||
@ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
SET(AT_CUDA_ENABLED 0)
|
||||
else()
|
||||
SET(AT_CUDA_ENABLED 1)
|
||||
find_package(CUDA 5.5 REQUIRED)
|
||||
endif()
|
||||
|
||||
IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND)
|
||||
@ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
SET(AT_MKLDNN_ENABLED 0)
|
||||
SET(CAFFE2_USE_MKLDNN OFF)
|
||||
IF (USE_MKLDNN)
|
||||
FIND_PACKAGE(MKLDNN)
|
||||
INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake)
|
||||
IF(MKLDNN_FOUND)
|
||||
SET(AT_MKLDNN_ENABLED 1)
|
||||
INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR})
|
||||
INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR})
|
||||
IF(BUILD_CAFFE2_OPS)
|
||||
SET(CAFFE2_USE_MKLDNN ON)
|
||||
LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn)
|
||||
|
@ -2,7 +2,6 @@
|
||||
#
|
||||
# The following variables are optionally searched for defaults
|
||||
# MKL_FOUND : set to true if a library implementing the CBLAS interface is found
|
||||
# USE_MKLDNN
|
||||
#
|
||||
# The following are set after configuration is done:
|
||||
# MKLDNN_FOUND : set to true if mkl-dnn is found.
|
||||
@ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND)
|
||||
SET(MKLDNN_LIBRARIES)
|
||||
SET(MKLDNN_INCLUDE_DIR)
|
||||
|
||||
IF (NOT USE_MKLDNN)
|
||||
RETURN()
|
||||
ENDIF(NOT USE_MKLDNN)
|
||||
|
||||
IF(MSVC)
|
||||
MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far")
|
||||
RETURN()
|
||||
@ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR})
|
||||
|
||||
IF(MKL_FOUND)
|
||||
# Append to mkldnn dependencies
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES})
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR})
|
||||
# The OMP-related variables of MKL-DNN have to be overwritten here,
|
||||
# if MKL is used, and the OMP version is defined by MKL.
|
||||
# MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL.
|
||||
# INTEL_MKL_DIR gives the MKL root path.
|
||||
IF (INTEL_MKL_DIR)
|
||||
SET(MKLROOT ${INTEL_MKL_DIR})
|
||||
IF(WIN32)
|
||||
SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
|
||||
ELSE(WIN32)
|
||||
IF (MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY})
|
||||
ELSE(MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY})
|
||||
ENDIF(MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
|
||||
ENDIF(WIN32)
|
||||
ELSE(INTEL_MKL_DIR)
|
||||
MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!")
|
||||
ENDIF(INTEL_MKL_DIR)
|
||||
|
||||
ELSE(MKL_FOUND)
|
||||
# If we cannot find MKL, we will use the Intel MKL Small library
|
||||
# comes with ${MKLDNN_ROOT}/external
|
||||
@ -75,60 +51,65 @@ ELSE(MKL_FOUND)
|
||||
ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external)
|
||||
|
||||
FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h)
|
||||
IF(MKLML_INNER_INCLUDE_DIR)
|
||||
# if user has multiple version under external/ then guess last
|
||||
# one alphabetically is "latest" and warn
|
||||
LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
|
||||
IF(MKLINCLEN GREATER 1)
|
||||
LIST(SORT MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
|
||||
SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
|
||||
ENDIF(MKLINCLEN GREATER 1)
|
||||
GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
|
||||
IF(NOT MKLML_INNER_INCLUDE_DIR)
|
||||
MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
|
||||
RETURN()
|
||||
ENDIF(NOT MKLML_INNER_INCLUDE_DIR)
|
||||
# if user has multiple version under external/ then guess last
|
||||
# one alphabetically is "latest" and warn
|
||||
LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
|
||||
IF(MKLINCLEN GREATER 1)
|
||||
LIST(SORT MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
|
||||
SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
|
||||
ENDIF(MKLINCLEN GREATER 1)
|
||||
GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
|
||||
|
||||
IF(APPLE)
|
||||
SET(__mklml_inner_libs mklml iomp5)
|
||||
ELSE(APPLE)
|
||||
SET(__mklml_inner_libs mklml_intel iomp5)
|
||||
ENDIF(APPLE)
|
||||
|
||||
FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
|
||||
STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
|
||||
FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
|
||||
NAMES ${__mklml_inner_lib}
|
||||
PATHS "${MKLML_INNER_INCLUDE_DIR}/../lib"
|
||||
DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
|
||||
MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
|
||||
ENDFOREACH(__mklml_inner_lib)
|
||||
ENDIF(MKLML_INNER_INCLUDE_DIR)
|
||||
IF(APPLE)
|
||||
SET(__mklml_inner_libs mklml iomp5)
|
||||
ELSE(APPLE)
|
||||
SET(__mklml_inner_libs mklml_intel iomp5)
|
||||
ENDIF(APPLE)
|
||||
FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
|
||||
STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
|
||||
FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
|
||||
NAMES ${__mklml_inner_lib}
|
||||
PATHS "${MKLML_INNER_INCLUDE_DIR}/../lib"
|
||||
DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
|
||||
MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
|
||||
IF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
|
||||
MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
|
||||
RETURN()
|
||||
ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
|
||||
ENDFOREACH(__mklml_inner_lib)
|
||||
ENDIF(MKL_FOUND)
|
||||
|
||||
LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES)
|
||||
LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR)
|
||||
INCLUDE(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for})
|
||||
IF(MKL_FOUND)
|
||||
SET(MKL_cmake_included TRUE)
|
||||
SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE)
|
||||
ENDIF(MKL_FOUND)
|
||||
SET(WITH_TEST FALSE CACHE BOOL "" FORCE)
|
||||
SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE)
|
||||
SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
|
||||
ADD_SUBDIRECTORY(${MKLDNN_ROOT})
|
||||
IF(NOT TARGET mkldnn)
|
||||
MESSAGE("Failed to include MKL-DNN target")
|
||||
RETURN()
|
||||
ENDIF(NOT TARGET mkldnn)
|
||||
IF(MKL_FOUND)
|
||||
TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL)
|
||||
ENDIF(MKL_FOUND)
|
||||
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow)
|
||||
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
LIST(APPEND MKLDNN_LIBRARIES mkldnn)
|
||||
|
||||
IF(MKLDNN_FOUND)
|
||||
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized)
|
||||
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE)
|
||||
SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE)
|
||||
ADD_SUBDIRECTORY(${MKLDNN_ROOT})
|
||||
SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
||||
IF(WIN32)
|
||||
LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}")
|
||||
ELSE(WIN32)
|
||||
LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}")
|
||||
ENDIF(WIN32)
|
||||
ELSE(MKLDNN_FOUND)
|
||||
MESSAGE(STATUS "MKLDNN source files not found!")
|
||||
ENDIF(MKLDNN_FOUND)
|
||||
|
||||
UNSET(__mklml_inner_libs)
|
||||
UNSET(__mkldnn_looked_for)
|
||||
SET(MKLDNN_FOUND TRUE)
|
||||
MESSAGE(STATUS "Found MKL-DNN: TRUE")
|
||||
|
||||
ENDIF(NOT MKLDNN_FOUND)
|
||||
|
@ -9,6 +9,12 @@ endif()
|
||||
# release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache.
|
||||
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix)
|
||||
|
||||
# we dont want to statically link cudart, because we rely on it's dynamic linkage in
|
||||
# python (follow along torch/cuda/__init__.py and usage of cudaGetErrorName).
|
||||
# Technically, we can link cudart here statically, and link libtorch_python.so
|
||||
# to a dynamic libcudart.so, but that's just wasteful
|
||||
SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
|
||||
|
||||
# Find CUDA.
|
||||
find_package(CUDA 7.0)
|
||||
if(NOT CUDA_FOUND)
|
||||
@ -89,6 +95,9 @@ endif()
|
||||
|
||||
if(DEFINED ENV{CUDNN_LIBRARY})
|
||||
set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY})
|
||||
if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a")
|
||||
SET(CUDNN_STATIC_LINKAGE ON)
|
||||
endif()
|
||||
else()
|
||||
find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
|
||||
HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
|
||||
@ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN)
|
||||
"${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
|
||||
endif()
|
||||
message(STATUS "Found cuDNN: v${CUDNN_VERSION} (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})")
|
||||
if(CUDNN_VERSION VERSION_LESS "7.0.0")
|
||||
message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ CUDA libraries wrapper
|
||||
@ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED)
|
||||
if(CAFFE2_STATIC_LINK_CUDA)
|
||||
set_property(
|
||||
TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt)
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl)
|
||||
else()
|
||||
set_property(
|
||||
TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
|
917
docs/source/community/contribution_guide.rst
Normal file
917
docs/source/community/contribution_guide.rst
Normal file
@ -0,0 +1,917 @@
|
||||
PyTorch Contribution Guide
|
||||
==========================
|
||||
|
||||
PyTorch is a GPU-accelerated Python tensor computation package for
|
||||
building deep neural networks built on tape-based autograd systems.
|
||||
|
||||
The PyTorch Contribution Process
|
||||
--------------------------------
|
||||
|
||||
The PyTorch organization is governed by `PyTorch
|
||||
Governance </docs/community/governance.html>`__.
|
||||
|
||||
The PyTorch development process involves a healthy amount of open
|
||||
discussions between the core development team and the community.
|
||||
|
||||
PyTorch operates similar to most open source projects on GitHub.
|
||||
However, if you've never contributed to an open source project before,
|
||||
here is the basic process.
|
||||
|
||||
- **Figure out what you're going to work on.** The majority of open
|
||||
source contributions come from people scratching their own itches.
|
||||
However, if you don't know what you want to work on, or are just
|
||||
looking to get more acquainted with the project, here are some tips
|
||||
for how to find appropriate tasks:
|
||||
|
||||
- Look through the `issue
|
||||
tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if
|
||||
there are any issues you know how to fix. Issues that are
|
||||
confirmed by other contributors tend to be better to investigate.
|
||||
We also maintain some labels for issues which are likely to be
|
||||
good for new people, e.g., **bootcamp** and **1hr**, although
|
||||
these labels are less well maintained.
|
||||
- Join us on Slack and let us know you're interested in getting to
|
||||
know PyTorch. We're very happy to help out researchers and
|
||||
partners get up to speed with the codebase.
|
||||
|
||||
- **Figure out the scope of your change and reach out for design
|
||||
comments on a GitHub issue if it's large.** The majority of pull
|
||||
requests are small; in that case, no need to let us know about what
|
||||
you want to do, just get cracking. But if the change is going to be
|
||||
large, it's usually a good idea to get some design comments about it
|
||||
first.
|
||||
|
||||
- If you don't know how big a change is going to be, we can help you
|
||||
figure it out! Just post about it on issues or Slack.
|
||||
- Some feature additions are very standardized; for example, lots of
|
||||
people add new operators or optimizers to PyTorch. Design
|
||||
discussion in these cases boils down mostly to, “Do we want this
|
||||
operator/optimizer?” Giving evidence for its utility, e.g., usage
|
||||
in peer reviewed papers, or existence in other frameworks, helps a
|
||||
bit when making this case.
|
||||
- Core changes and refactors can be quite difficult to coordinate,
|
||||
as the pace of development on PyTorch master is quite fast.
|
||||
Definitely reach out about fundamental or cross-cutting changes;
|
||||
we can often give guidance about how to stage such changes into
|
||||
more easily reviewable pieces.
|
||||
|
||||
- **Code it out!**
|
||||
|
||||
- See the technical guide for advice for working with PyTorch in a
|
||||
technical form.
|
||||
|
||||
- **Open a pull request.**
|
||||
|
||||
- If you are not ready for the pull request to be reviewed, tag it
|
||||
with [WIP]. We will ignore it when doing review passes. If you are
|
||||
working on a complex change, it's good to start things off as WIP,
|
||||
because you will need to spend time looking at CI results to see
|
||||
if things worked out or not.
|
||||
- Find an appropriate reviewer for your change. We have some folks
|
||||
who regularly go through the PR queue and try to review
|
||||
everything, but if you happen to know who the maintainer for a
|
||||
given subsystem affected by your patch is, feel free to include
|
||||
them directly on the pull request. You can learn more about this
|
||||
structure at PyTorch Subsystem Ownership.
|
||||
|
||||
- **Iterate on the pull request until it's accepted!**
|
||||
|
||||
- We'll try our best to minimize the number of review roundtrips and
|
||||
block PRs only when there are major issues. For the most common
|
||||
issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__.
|
||||
- Once a pull request is accepted and CI is passing, there is
|
||||
nothing else you need to do; we will merge the PR for you.
|
||||
|
||||
Getting Started
|
||||
---------------
|
||||
|
||||
Proposing new features
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
New feature ideas are best discussed on a specific issue. Please include
|
||||
as much information as you can, any accompanying data, and your proposed
|
||||
solution. The PyTorch team and community frequently reviews new issues
|
||||
and comments where they think they can help. If you feel confident in
|
||||
your solution, go ahead and implement it.
|
||||
|
||||
Reporting Issues
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
If you've identified an issue, first search through the `list of
|
||||
existing issues <https://github.com/pytorch/pytorch/issues>`__ on the
|
||||
repo. If you are unable to find a similar issue, then create a new one.
|
||||
Supply as much information you can to reproduce the problematic
|
||||
behavior. Also, include any additional insights like the behavior you
|
||||
expect.
|
||||
|
||||
Implementing Features or Fixing Bugs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you want to fix a specific issue, it's best to comment on the
|
||||
individual issue with your intent. However, we do not lock or assign
|
||||
issues except in cases where we have worked with the developer before.
|
||||
It's best to strike up a conversation on the issue and discuss your
|
||||
proposed solution. The PyTorch team can provide guidance that saves you
|
||||
time.
|
||||
|
||||
Issues that are labeled first-new-issue, low, or medium priority provide
|
||||
the best entrance point are great places to start.
|
||||
|
||||
Adding Tutorials
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__
|
||||
come from the community itself and we welcome additional contributions.
|
||||
To learn more about how to contribute a new tutorial you can learn more
|
||||
here: `PyTorch.org Tutorial Contribution Guide on
|
||||
Github <https://github.com/pytorch/tutorials/#contributing>`__
|
||||
|
||||
Improving Documentation & Tutorials
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We aim to produce high quality documentation and tutorials. On rare
|
||||
occasions that content includes typos or bugs. If you find something you
|
||||
can fix, send us a pull request for consideration.
|
||||
|
||||
Take a look at the `Documentation <#on-documentation>`__ section to learn how our system
|
||||
works.
|
||||
|
||||
Participating in online discussions
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can find active discussions happening on the PyTorch Discussion
|
||||
`forum <https://discuss.pytorch.org/>`__.
|
||||
|
||||
Submitting pull requests to fix open issues
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can view a list of all open issues
|
||||
`here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an
|
||||
issue is a great way to get the attention of the team. From here you can
|
||||
share your ideas and how you plan to resolve the issue.
|
||||
|
||||
For more challenging issues, the team will provide feedback and
|
||||
direction for how to best solve the issue.
|
||||
|
||||
If you're not able to fix the issue itself, commenting and sharing
|
||||
whether you can reproduce the issue can be useful for helping the team
|
||||
identify problem areas.
|
||||
|
||||
Reviewing open pull requests
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We appreciate your help reviewing and commenting on pull requests. Our
|
||||
team strives to keep the number of open pull requests at a manageable
|
||||
size, we respond quickly for more information if we need it, and we
|
||||
merge PRs that we think are useful. However, due to the high level of
|
||||
interest, additional eyes on pull requests is appreciated.
|
||||
|
||||
Improving code readability
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Improve code readability helps everyone. It is often better to submit a
|
||||
small number of pull requests that touch few files versus a large pull
|
||||
request that touches many files. Starting a discussion in the PyTorch
|
||||
forum `here <https://discuss.pytorch.org/>`__ or on an issue related to
|
||||
your improvement is the best way to get started.
|
||||
|
||||
Adding test cases to make the codebase more robust
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Additional test coverage is appreciated.
|
||||
|
||||
Promoting PyTorch
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Your use of PyTorch in your projects, research papers, write ups, blogs,
|
||||
or general discussions around the internet helps to raise awareness for
|
||||
PyTorch and our growing community. Please reach out to
|
||||
`pytorch-marketing@fb.com <http://mailto:pytorch-marketing@fb.com/>`__
|
||||
for marketing support.
|
||||
|
||||
Triaging issues
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
If you feel that an issue could benefit from a particular tag or level
|
||||
of complexity comment on the issue and share your opinion. If an you
|
||||
feel an issue isn't categorized properly comment and let the team know.
|
||||
|
||||
About open source development
|
||||
-----------------------------
|
||||
|
||||
If this is your first time contributing to an open source project, some
|
||||
aspects of the development process may seem unusual to you.
|
||||
|
||||
- **There is no way to “claim” issues.** People often want to “claim”
|
||||
an issue when they decide to work on it, to ensure that there isn't
|
||||
wasted work when someone else ends up working on it. This doesn't
|
||||
really work too well in open source, since someone may decide to work
|
||||
on something, and end up not having time to do it. Feel free to give
|
||||
information in an advisory fashion, but at the end of the day, we
|
||||
will take running code and rough consensus.
|
||||
- **There is a high bar for new functionality that is added.** Unlike
|
||||
in a corporate environment, where the person who wrote code
|
||||
implicitly “owns” it and can be expected to take care of it in the
|
||||
beginning of its lifetime, once a pull request is merged into an open
|
||||
source project, it immediately becomes the collective responsibility
|
||||
of all maintainers on the project. When we merge code, we are saying
|
||||
that we, the maintainers, are able to review subsequent changes and
|
||||
make a bugfix to the code. This naturally leads to a higher standard
|
||||
of contribution.
|
||||
|
||||
Common Mistakes To Avoid
|
||||
------------------------
|
||||
|
||||
- **Did you add tests?** (Or if the change is hard to test, did you
|
||||
describe how you tested your change?)
|
||||
|
||||
- We have a few motivations for why we ask for tests:
|
||||
|
||||
1. to help us tell if we break it later
|
||||
2. to help us tell if the patch is correct in the first place
|
||||
(yes, we did review it, but as Knuth says, “beware of the
|
||||
following code, for I have not run it, merely proven it
|
||||
correct”)
|
||||
|
||||
- When is it OK not to add a test? Sometimes a change can't be
|
||||
conveniently tested, or the change is so obviously correct (and
|
||||
unlikely to be broken) that it's OK not to test it. On the
|
||||
contrary, if a change is seems likely (or is known to be likely)
|
||||
to be accidentally broken, it's important to put in the time to
|
||||
work out a testing strategy.
|
||||
|
||||
- **Is your PR too long?**
|
||||
|
||||
- It's easier for us to review and merge small PRs. Difficulty of
|
||||
reviewing a PR scales nonlinearly with its size.
|
||||
- When is it OK to submit a large PR? It helps a lot if there was a
|
||||
corresponding design discussion in an issue, with sign off from
|
||||
the people who are going to review your diff. We can also help
|
||||
give advice about how to split up a large change into individually
|
||||
shippable parts. Similarly, it helps if there is a complete
|
||||
description of the contents of the PR: it's easier to review code
|
||||
if we know what's inside!
|
||||
|
||||
- **Comments for subtle things?** In cases where behavior of your code
|
||||
is nuanced, please include extra comments and documentation to allow
|
||||
us to better understand the intention of your code.
|
||||
- **Did you add a hack?** Sometimes a hack is the right answer. But
|
||||
usually we will have to discuss it.
|
||||
- **Do you want to touch a very core component?** In order to prevent
|
||||
major regressions, pull requests that touch core components receive
|
||||
extra scrutiny. Make sure you've discussed your changes with the team
|
||||
before undertaking major changes.
|
||||
- **Want to add a new feature?** If you want to add new features,
|
||||
comment your intention on the related issue. Our team tries to
|
||||
comment on and provide feedback to the community. It's better to have
|
||||
an open discussion with the team and the rest of the community prior
|
||||
to building new features. This helps us stay aware of what you're
|
||||
working on and increases the chance that it'll be merged.
|
||||
- **Did you touch unrelated code to the PR?** To aid in code review,
|
||||
please only include files in your pull request that are directly
|
||||
related to your changes.
|
||||
|
||||
Frequently asked questions
|
||||
|
||||
- **How can I contribute as a reviewer?** There is lots of value if
|
||||
community developer reproduce issues, try out new functionality, or
|
||||
otherwise help us identify or troubleshoot issues. Commenting on
|
||||
tasks or pull requests with your enviroment details is helpful and
|
||||
appreciated.
|
||||
- **CI tests failed, what does it mean?** Maybe you need to merge with
|
||||
master or rebase with latest changes. Pushing your changes should
|
||||
re-trigger CI tests. If the tests persist, you'll want to trace
|
||||
through the error messages and resolve the related issues.
|
||||
- **What are the most high risk changes?** Anything that tourhces build
|
||||
configuration is an risky area. Please avoid changing these unless
|
||||
you've had a discussion with the team beforehand.
|
||||
- **Hey, a commit showed up on my branch, what's up with that?**
|
||||
Sometimes another community member will provide a patch or fix to
|
||||
your pull request or branch. This is often needed for getting CI tests
|
||||
to pass.
|
||||
|
||||
On Documentation
|
||||
----------------
|
||||
|
||||
Python Docs
|
||||
~~~~~~~~~~~
|
||||
|
||||
PyTorch documentation is generated from python source using
|
||||
`Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is
|
||||
copied to the docs folder in the master branch of
|
||||
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
|
||||
and is served via GitHub pages.
|
||||
|
||||
- Site: http://pytorch.org/docs
|
||||
- GitHub: http://github.com/pytorch/pytorch/docs
|
||||
- Served from:
|
||||
`https://github.com/pytorch/pytorch.github.io/tree/master/doc <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
|
||||
|
||||
C++ Docs
|
||||
~~~~~~~~
|
||||
|
||||
For C++ code we use Doxygen to generate the content files. The C++ docs
|
||||
are built on a special server and the resulting files are copied to the
|
||||
https://github.com/pytorch/cppdocs repo, and are served from GitHub
|
||||
pages.
|
||||
|
||||
- Site: http://pytorch.org/cppdocs
|
||||
- GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp
|
||||
- Served from: https://github.com/pytorch/cppdocs
|
||||
|
||||
Tutorials
|
||||
---------
|
||||
|
||||
PyTorch tutorials are documents used to help understand using PyTorch to
|
||||
accomplish specific tasks or to understand more holistic concepts.
|
||||
Tutorials are built using
|
||||
`Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__
|
||||
from executable python sources files, or from restructured-text (rst)
|
||||
files.
|
||||
|
||||
- Site: http://pytorch.org/tutorials
|
||||
- GitHub: http://github.com/pytorch/tutorials
|
||||
|
||||
Tutorials Build Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For tutorials, `pull
|
||||
requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a
|
||||
rebuild the entire site using CircleCI to test the effects of the
|
||||
change. This build is sharded into 9 worker builds and takes around 40
|
||||
minutes total. At the same time, we do a Netlify build using *make
|
||||
html-noplot*, which builds the site without rendering the notebook
|
||||
output into pages for quick review.
|
||||
|
||||
After a PR is accepted, the site is rebuilt and deployed from CircleCI.
|
||||
|
||||
Contributing a new Tutorial
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`PyTorch.org Tutorial Contribution
|
||||
Guide <https://github.com/pytorch/tutorials/#contributing>`__
|
||||
|
||||
Code Style
|
||||
~~~~~~~~~~
|
||||
|
||||
**Python style**
|
||||
|
||||
**C++ style**
|
||||
|
||||
Submitting a Pull Request
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
PyTorch development happens publicly on our Github repo.
|
||||
|
||||
To have your feature or fix added to PyTorch, please submit a Pull
|
||||
Request.
|
||||
|
||||
Running Tests
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
Show examples for running all tests, just one individual...
|
||||
|
||||
Technical Process
|
||||
-----------------
|
||||
|
||||
Developing PyTorch
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To develop PyTorch on your machine, here are some tips:
|
||||
|
||||
1. Uninstall all existing PyTorch installs:
|
||||
|
||||
::
|
||||
|
||||
conda uninstall pytorch
|
||||
pip uninstall torch
|
||||
pip uninstall torch # run this command twice
|
||||
|
||||
2. Clone a copy of PyTorch from source:
|
||||
|
||||
::
|
||||
|
||||
git clone https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
|
||||
3. Install PyTorch in ``build develop`` mode:
|
||||
|
||||
A full set of instructions on installing PyTorch from source is here:
|
||||
https://github.com/pytorch/pytorch#from-source
|
||||
|
||||
The change you have to make is to replace
|
||||
|
||||
::
|
||||
|
||||
python setup.py install
|
||||
|
||||
with
|
||||
|
||||
::
|
||||
|
||||
python setup.py build develop
|
||||
|
||||
This is especially useful if you are only changing Python files.
|
||||
|
||||
This mode will symlink the Python files from the current local source
|
||||
tree into the Python install.
|
||||
|
||||
Hence, if you modify a Python file, you do not need to reinstall PyTorch
|
||||
again and again.
|
||||
|
||||
For example:
|
||||
|
||||
- Install local PyTorch in ``build develop`` mode
|
||||
- modify your Python file ``torch/__init__.py`` (for example)
|
||||
- test functionality
|
||||
- modify your Python file ``torch/__init__.py``
|
||||
- test functionality
|
||||
- modify your Python file ``torch/__init__.py``
|
||||
- test functionality
|
||||
|
||||
You do not need to repeatedly install after modifying Python files.
|
||||
|
||||
In case you want to reinstall, make sure that you uninstall PyTorch
|
||||
first by running ``pip uninstall torch`` and ``python setup.py clean``.
|
||||
Then you can install in ``build develop`` mode again.
|
||||
|
||||
Codebase structure
|
||||
------------------
|
||||
|
||||
- `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core
|
||||
library files that work everywhere, both server and mobile. We are
|
||||
slowly moving pieces from
|
||||
`ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
|
||||
here. This library is intended only to contain essential
|
||||
functionality, and appropriate to use in settings where binary size
|
||||
matters. (But you'll have a lot of missing functionality if you try
|
||||
to use it directly.)
|
||||
- `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++
|
||||
tensor library for PyTorch (no autograd support)
|
||||
|
||||
- `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__
|
||||
|
||||
- `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__
|
||||
`THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__
|
||||
`THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__
|
||||
`THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__
|
||||
- Legacy library code from the original Torch. Try not to add
|
||||
things here; we're slowly porting these to
|
||||
`native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__.
|
||||
|
||||
- generic - Contains actual implementations of operators,
|
||||
parametrized over ``scalar_t``. Files here get compiled N
|
||||
times per supported scalar type in PyTorch.
|
||||
|
||||
- `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__
|
||||
|
||||
- `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
|
||||
- Core functionality of ATen. This is migrating to top-level
|
||||
c10 folder.
|
||||
- `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__
|
||||
- Modern implementations of operators. If you want to write
|
||||
a new operator, here is where it should go. Most CPU
|
||||
operators go in the top level directory, except for
|
||||
operators which need to be compiled specially; see cpu
|
||||
below.
|
||||
|
||||
- `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__
|
||||
- Not actually CPU implementations of operators, but
|
||||
specifically implementations which are compiled with
|
||||
processor-specific instructions, like AVX. See the
|
||||
`README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__
|
||||
for more details.
|
||||
- `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__
|
||||
- CUDA implementations of operators.
|
||||
- `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__
|
||||
- CPU and CUDA implementations of COO sparse tensor
|
||||
operations
|
||||
- `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__
|
||||
`mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__
|
||||
`miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__
|
||||
`cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__
|
||||
|
||||
- implementations of operators which simply bind to some
|
||||
backend library.
|
||||
|
||||
- `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ -
|
||||
The actual PyTorch library. Everything that is not in
|
||||
`csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
|
||||
is a Python module, following the PyTorch Python frontend module
|
||||
structure.
|
||||
|
||||
- `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
|
||||
- C++ files composing the PyTorch library. Files in this directory
|
||||
tree are a mix of Python binding code, and C++ heavy lifting.
|
||||
Consult ``setup.py`` for the canonical list of Python binding
|
||||
files; conventionally, they are often prefixed with ``python_``.
|
||||
|
||||
- `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__
|
||||
- Compiler and frontend for TorchScript JIT frontend.
|
||||
- `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__
|
||||
- Implementation of reverse-mode automatic differentiation.
|
||||
- `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__
|
||||
- The PyTorch C++ frontend.
|
||||
- `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__
|
||||
- Distributed training support for PyTorch.
|
||||
|
||||
- `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ -
|
||||
Code generation scripts for the PyTorch library. See
|
||||
`README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__
|
||||
of this directory for more details.
|
||||
- `test <https://github.com/pytorch/pytorch/blob/master/tests>`__ -
|
||||
Python unit tests for PyTorch Python frontend.
|
||||
|
||||
- `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__
|
||||
- Basic tests for PyTorch functionality.
|
||||
- `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__
|
||||
- Tests for non-NN automatic differentiation support.
|
||||
- `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__
|
||||
- Tests for NN operators and their automatic differentiation.
|
||||
- `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__
|
||||
- Tests for the JIT compiler and TorchScript.
|
||||
- ...
|
||||
- `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__
|
||||
- C++ unit tests for PyTorch C++ frontend.
|
||||
- `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__
|
||||
- Automatically generated "expect" files which are used to compare
|
||||
against expected output.
|
||||
- `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__
|
||||
- Tests for ONNX export functionality, using both PyTorch and
|
||||
Caffe2.
|
||||
|
||||
- `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ -
|
||||
The Caffe2 library.
|
||||
|
||||
- `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__
|
||||
- Core files of Caffe2, e.g., tensor, workspace, blobs, etc.
|
||||
- `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__
|
||||
- Operators of Caffe2.
|
||||
- `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__
|
||||
- Python bindings to Caffe2.
|
||||
- ...
|
||||
|
||||
Unit Testing
|
||||
------------
|
||||
|
||||
PyTorch's testing is located under ``test/``. Run the entire test suite
|
||||
with
|
||||
|
||||
::
|
||||
|
||||
python test/run_test.py
|
||||
|
||||
or run individual test files, like ``python test/test_nn.py``, for
|
||||
individual test suites.
|
||||
|
||||
Better local unit tests with pytest
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We don't officially support ``pytest``, but it works well with our
|
||||
``unittest`` tests and offers a number of useful features for local
|
||||
developing. Install it via ``pip install pytest``.
|
||||
|
||||
If you want to just run tests that contain a specific substring, you can
|
||||
use the ``-k`` flag:
|
||||
|
||||
::
|
||||
|
||||
pytest test/test_nn.py -k Loss -v
|
||||
|
||||
The above is an example of testing a change to Loss functions: this
|
||||
command runs tests such as ``TestNN.test_BCELoss``\ and
|
||||
``TestNN.test_MSELoss`` and can be useful to save keystrokes.
|
||||
|
||||
Writing documentation
|
||||
---------------------
|
||||
|
||||
PyTorch uses `Google
|
||||
style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__
|
||||
for formatting docstrings. Lines inside docstring blocks must be
|
||||
limited to 80 characters so they fit into Jupyter documentation popups.
|
||||
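As a rough sketch of what a Google-style docstring looks like in practice (the
function and argument names here are made up purely for illustration)::

    import torch

    def grow(tensor, amount=1):
        r"""Adds ``amount`` to every element of ``tensor``.

        Arguments:
            tensor (Tensor): the input tensor.
            amount (int, optional): value added to each element. Default: 1.

        Returns:
            Tensor: a new tensor with ``amount`` added to every element.

        Example::

            >>> grow(torch.zeros(2), amount=3)
            tensor([3., 3.])
        """
        return tensor + amount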
|
||||
For C++ documentation (https://pytorch.org/cppdocs), we use
|
||||
`Doxygen <http://www.doxygen.nl/>`__ and then convert it to
|
||||
`Sphinx <http://www.sphinx-doc.org/>`__ via
|
||||
`Breathe <https://github.com/michaeljones/breathe>`__
|
||||
and `Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen
|
||||
reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__
|
||||
for more information on the documentation syntax. To build the
|
||||
documentation locally, ``cd`` into ``docs/cpp`` and then ``make html``.
|
||||
|
||||
We run Doxygen in CI (Travis) to verify that you do not use invalid
|
||||
Doxygen commands. To run this check locally, run ``./check-doxygen.sh``
|
||||
from inside ``docs/cpp``.
|
||||
|
||||
Managing multiple build trees
|
||||
-----------------------------
|
||||
|
||||
One downside to using ``python setup.py develop`` is that your
|
||||
development version of PyTorch will be installed globally on your
|
||||
account (e.g., if you run ``import torch`` anywhere else, the
|
||||
development version will be used).
|
||||
|
||||
If you want to manage multiple builds of PyTorch, you can make use of
|
||||
`conda environments <https://conda.io/docs/using/envs.html>`__ to
|
||||
maintain separate Python package environments, each of which can be tied
|
||||
to a specific build of PyTorch. To set one up:
|
||||
|
||||
::
|
||||
|
||||
conda create -n pytorch-myfeature
source activate pytorch-myfeature
# if you run python now, torch will NOT be installed
|
||||
python setup.py build develop
|
||||
|
||||
C++ Development tips
|
||||
--------------------
|
||||
|
||||
If you are working on the C++ code, there are a few important things
|
||||
that you will want to keep in mind:
|
||||
|
||||
1. How to rebuild only the code you are working on.
|
||||
2. How to make rebuilds in the absence of changes go faster.
|
||||
|
||||
Build only what you need.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``python setup.py build`` will build everything, but since our build
|
||||
system is not very optimized for incremental rebuilds, this will
|
||||
actually be very slow. Far better is to only request rebuilds of the
|
||||
parts of the project you are working on:
|
||||
|
||||
- Working on the Python bindings? Run ``python setup.py develop`` to
|
||||
rebuild (NB: no ``build`` here!)
|
||||
- Working on ``torch/csrc`` or ``aten``? Run
|
||||
``python setup.py rebuild_libtorch`` to rebuild and avoid having to
|
||||
rebuild the other libraries we depend on.
|
||||
- Working on one of the other dependent libraries? The other valid
|
||||
targets are listed in ``dep_libs`` in ``setup.py``. Prepend
|
||||
``build_`` to get a target, and run as e.g.
|
||||
``python setup.py build_gloo``.
|
||||
- Working on a test binary? Run
|
||||
``(cd build && ninja bin/test_binary_name)`` to rebuild only that
|
||||
test binary (without rerunning cmake). (Replace ``ninja`` with
|
||||
``make`` if you don't have ninja installed).
|
||||
|
||||
On the initial build, you can also speed things up with the environment
|
||||
variables ``DEBUG``, ``REL_WITH_DEB_INFO``, and ``NO_CUDA``.
|
||||
|
||||
- ``DEBUG=1`` will enable debug builds (-g -O0)
|
||||
- ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations
|
||||
(-g -O3)
|
||||
- ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing
|
||||
on something not CUDA related), to save compile time.
|
||||
|
||||
For example:
|
||||
|
||||
::
|
||||
|
||||
NO_CUDA=1 DEBUG=1 python setup.py build develop
|
||||
|
||||
Make sure you continue to pass these flags on subsequent builds.
|
||||
|
||||
Code completion and IDE support
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When using ``python setup.py develop``, PyTorch will generate a
|
||||
``compile_commands.json`` file that can be used by many editors to
|
||||
provide command completion and error highlighting for PyTorch's C++
|
||||
code. You need to ``pip install ninja`` to generate accurate information
|
||||
for the code in ``torch/csrc``. More information at:
|
||||
|
||||
- https://sarcasm.github.io/notes/dev/compilation-database.html
|
||||
|
||||
Make no-op build fast.
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use Ninja
|
||||
~~~~~~~~~
|
||||
|
||||
Python ``setuptools`` is pretty dumb, and always rebuilds every C file
|
||||
in a project. If you install the ninja build system with
|
||||
``pip install ninja``, then PyTorch will use it to track dependencies
|
||||
correctly. If PyTorch was already built, you will need to run
|
||||
``python setup.py clean`` once after installing ninja for builds to
|
||||
succeed.
|
||||
|
||||
Use CCache
|
||||
~~~~~~~~~~
|
||||
|
||||
Even when dependencies are tracked with file modification times, there are
|
||||
many situations where files get rebuilt even though a previous compilation was
|
||||
exactly the same.
|
||||
|
||||
Using ccache in a situation like this is a real time-saver. However, by
|
||||
default, ccache does not properly support CUDA stuff, so here are the
|
||||
instructions for installing a custom ccache fork that has CUDA support:
|
||||
|
||||
::
|
||||
|
||||
# install and export ccache
if ! ls ~/ccache/bin/ccache
then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y automake autoconf
|
||||
sudo apt-get install -y asciidoc
|
||||
mkdir -p ~/ccache
|
||||
pushd /tmp
|
||||
rm -rf ccache
|
||||
git clone https://github.com/colesbury/ccache -b ccbin
|
||||
pushd ccache
|
||||
./autogen.sh
|
||||
./configure
|
||||
make install prefix=~/ccache
|
||||
popd
popd
|
||||
|
||||
mkdir -p ~/ccache/lib
|
||||
mkdir -p ~/ccache/cuda
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc
|
||||
|
||||
~/ccache/bin/ccache -M 25Gi
fi

export PATH=~/ccache/lib:$PATH
export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
|
||||
|
||||
CUDA Development tips
|
||||
---------------------
|
||||
|
||||
If you are working on the CUDA code, here are some useful CUDA debugging
|
||||
tips:
|
||||
|
||||
1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug
|
||||
symbols (``-g -G``). This will be particularly helpful in debugging
|
||||
device code. However, it will slow down the build process by about
|
||||
50% (compared to only ``DEBUG=1``), so use wisely.
|
||||
2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging
|
||||
friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a
|
||||
CUDA tensor (rather than all zeros).
|
||||
|
||||
Hope this helps, and thanks for considering contributing.
|
||||
|
||||
Windows development tips
|
||||
------------------------
|
||||
|
||||
Occasionally, you will write a patch which works on Linux, but fails CI
|
||||
on Windows. There are a few aspects in which MSVC (the Windows compiler
|
||||
toolchain we use) is stricter than Linux, which are worth keeping in
|
||||
mind when fixing these problems.
|
||||
|
||||
1. Symbols are NOT exported by default on Windows; instead, you have to
|
||||
explicitly mark a symbol as exported/imported in a header file with
|
||||
``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have
|
||||
codified this pattern into a set of macros which follow the
|
||||
convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen.
|
||||
(Every separate shared library needs a unique macro name, because
|
||||
symbol visibility is on a per shared library basis. See
|
||||
c10/macros/Macros.h for more details.) The upshot is if you see an
|
||||
"unresolved external" error in your Windows build, this is probably
|
||||
because you forgot to mark a function with ``*_API``. However, there
|
||||
is one important counterexample to this principle: if you want a
|
||||
*templated* function to be instantiated at the call site, do NOT mark
|
||||
it with ``*_API`` (if you do mark it, you'll have to explicitly
|
||||
instantiate all of the specializations used by the call sites.)
|
||||
2. If you link against a library, this does not make its dependencies
|
||||
transitively visible. You must explicitly specify a link dependency
|
||||
against every library whose symbols you use. (This is different from
|
||||
Linux where in most environments, transitive dependencies can be used
|
||||
to fulfill unresolved symbols.)
|
||||
3. If you have a Windows box (we have a few on EC2 which you can request
|
||||
access to) and you want to run the build, the easiest way is to just
|
||||
run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run
|
||||
``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing
|
||||
away your Conda environment.)
|
||||
|
||||
Even if you don't know anything about MSVC, you can use cmake to build
|
||||
simple programs on Windows; this can be helpful if you want to learn
|
||||
more about some peculiar linking behavior by reproducing it on a small
|
||||
example. Here's a simple example cmake file that defines two dynamic
|
||||
libraries, one linking with the other:
|
||||
|
||||
::
|
||||
|
||||
project(myproject CXX)
set(CMAKE_CXX_STANDARD 11)
add_library(foo SHARED foo.cpp)
add_library(bar SHARED bar.cpp)
# NB: don't forget to __declspec(dllexport) at least one symbol from foo,
# otherwise foo.lib will not be created.
target_link_libraries(bar PUBLIC foo)
|
||||
|
||||
You can build it with:
|
||||
|
||||
::
|
||||
|
||||
mkdir build
cd build
|
||||
cmake ..
|
||||
cmake --build .
|
||||
|
||||
Known MSVC (and MSVC with NVCC) bugs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The PyTorch codebase sometimes likes to use exciting C++ features, and
|
||||
these exciting features lead to exciting bugs in Windows compilers. To
|
||||
add insult to injury, the error messages will often not tell you which
|
||||
line of code actually induced the erroring template instantiation. We've
|
||||
found the most effective way to debug these problems is to carefully
|
||||
read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few
|
||||
well known pitfalls and workarounds:
|
||||
|
||||
- This is not actually a bug per se, but in general, code generated by
|
||||
MSVC is more sensitive to memory errors; you may have written some
|
||||
code that does a use-after-free or stack overflows; on Linux the code
|
||||
might work, but on Windows your program will crash. ASAN may not
|
||||
catch all of these problems: stay vigilant to the possibility that
|
||||
your crash is due to a real memory problem.
|
||||
- (NVCC) ``c10::optional`` does not work when used from device code.
|
||||
Don't use it from kernels. Upstream issue:
|
||||
https://github.com/akrzemi1/Optional/issues/58 and our local issue
|
||||
#10329.
|
||||
- ``constexpr`` generally works less well on MSVC.
|
||||
|
||||
- The idiom ``static_assert(f() == f())`` to test if ``f`` is
|
||||
constexpr does not work; you'll get "error C2131: expression did
|
||||
not evaluate to a constant". Don't use these asserts on Windows.
|
||||
(Example: ``c10/util/intrusive_ptr.h``)
|
||||
|
||||
- (NVCC) Code you access inside a ``static_assert`` will eagerly be
|
||||
evaluated as if it were device code, and so you might get an error
|
||||
that the code is "not accessible".
|
||||
|
||||
::
|
||||
|
||||
class A {
|
||||
static A singleton_;
|
||||
static constexpr inline A* singleton() {
|
||||
return &singleton_;
|
||||
}
|
||||
};
static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm");
|
||||
|
||||
- The compiler will run out of heap space if you attempt to compile
|
||||
files that are too large. Splitting such files into separate files
|
||||
helps. (Example: ``THTensorMath``, ``THTensorMoreMath``,
|
||||
``THTensorEvenMoreMath``.)
|
||||
- MSVC's preprocessor (but not the standard compiler) has a bug where
|
||||
it incorrectly tokenizes raw string literals, ending when it sees a
|
||||
``"``. This causes preprocessor tokens inside the literal like
|
||||
an ``#endif`` to be incorrectly treated as preprocessor directives.
|
||||
See https://godbolt.org/z/eVTIJq as an example.
|
||||
|
||||
Running Clang-Tidy
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a
|
||||
C++ linter and static analysis tool based on the clang compiler. We run
|
||||
clang-tidy in our CI to make sure that new C++ code is safe, sane and
|
||||
efficient. See our
|
||||
`.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__
|
||||
file for the simple commands we use for this. To run clang-tidy locally,
|
||||
follow these steps:
|
||||
|
||||
1. Install clang-tidy. First, check if you already have clang-tidy by
|
||||
simply writing ``clang-tidy`` in your terminal. If you don't yet have
|
||||
clang-tidy, you should be able to install it easily with your package
|
||||
manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu.
|
||||
See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on
|
||||
how to install the latest version. Note that newer versions of
|
||||
clang-tidy will have more checks than older versions. In our CI, we
|
||||
run clang-tidy-6.0.
|
||||
2. Use our driver script to run clang-tidy over any changes relative to
|
||||
some git revision (you may want to replace ``HEAD~1`` with ``HEAD``
|
||||
to pick up uncommitted changes). Changes are picked up based on a
|
||||
``git diff`` with the given revision:
|
||||
|
||||
::
|
||||
|
||||
python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'
|
||||
|
||||
Above, it is assumed you are in the PyTorch root folder.
|
||||
``path/to/build`` should be the path to where you built PyTorch from
|
||||
source, e.g. ``build`` in the PyTorch root folder if you used
|
||||
``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change
|
||||
the clang-tidy binary this script uses. Make sure you have PyYAML installed,
|
||||
which is in PyTorch's ``requirements.txt``.
|
||||
|
||||
Pre-commit Tidy/Linting Hook
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We use clang-tidy and flake8 to perform additional formatting and
|
||||
semantic checking of code. We provide a pre-commit git hook for
|
||||
performing these checks, before a commit is created:
|
||||
|
||||
::
|
||||
|
||||
ln -s ../../tools/git-pre-commit .git/hooks/pre-commit
|
||||
|
||||
Caffe2 notes
|
||||
------------
|
||||
|
||||
In 2018, we merged Caffe2 into the PyTorch source repository. While the
|
||||
steady state aspiration is that Caffe2 and PyTorch share code freely, in
|
||||
the meantime there will be some separation. If you submit a PR to only
|
||||
PyTorch or only Caffe2 code, CI will only run for the project you
|
||||
edited. The logic for this is implemented in
|
||||
``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can
|
||||
look at this to see what path prefixes constitute changes. This also
|
||||
means if you ADD a new top-level path, or you start sharing code between
|
||||
projects, you need to modify these files. There are a few "unusual"
|
||||
directories which, for historical reasons, are Caffe2/PyTorch specific.
|
||||
Here they are:
|
||||
|
||||
- ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``,
|
||||
``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code
|
||||
in them without extra coordination.
|
||||
- ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools``
|
||||
are PyTorch-specific. Don't put Caffe2 code in them without extra
|
||||
coordination.
|
154
docs/source/community/governance.rst
Normal file
@ -0,0 +1,154 @@
|
||||
PyTorch Governance
|
||||
==========================
|
||||
|
||||
Governance Philosophy and Guiding Tenets
|
||||
-----------------------------------------
|
||||
|
||||
PyTorch adopts a governance structure with a small set of maintainers
|
||||
driving the overall project direction with a strong bias towards
|
||||
PyTorch's design philosophy where design and code contributions are
|
||||
valued. Beyond the core maintainers, there is also a slightly broader
|
||||
set of core developers that have the ability to directly merge pull
|
||||
requests and own various parts of the core code base.
|
||||
|
||||
Beyond the maintainers and core devs, the community is encouraged to
|
||||
contribute, file issues, make proposals, review pull requests and be
|
||||
present in the community. Given contributions and willingness to
|
||||
invest, anyone can be provided write access or ownership of parts of
|
||||
the codebase.
|
||||
|
||||
Based on this governance structure, the project has the following core
|
||||
operating tenets by which decisions are made and overall culture is
|
||||
derived:
|
||||
|
||||
1. **Code contributions** matter much more than corporate sponsorship
|
||||
and independent developers are highly valued.
|
||||
2. **Project influence** is gained through contributions (whether PRs,
|
||||
forum answers, code reviews or otherwise)
|
||||
|
||||
Key people and their functions
|
||||
------------------------------
|
||||
|
||||
Project Maintainers
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Project maintainers provide leadership and direction for the PyTorch
|
||||
project. Specifics include:
|
||||
|
||||
- Articulate a cohesive long-term vision for the project
|
||||
- Possess a deep understanding of the PyTorch code base
|
||||
- Negotiate and resolve contentious issues in ways acceptable to all
|
||||
parties involved
|
||||
|
||||
PyTorch Maintainers:
|
||||
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
|
||||
- (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
Core Developers
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
The PyTorch project is developed by a team of core developers. You can
|
||||
find the list of core developers at `PyTorch Governance \| Persons of
|
||||
Interest </docs/community/persons_of_interest.html>`__.
|
||||
|
||||
While membership is determined by presence in the "PyTorch core" team in
|
||||
the "PyTorch"
|
||||
`organization <https://github.com/orgs/pytorch/teams/facebook>`__ on
|
||||
GitHub, contribution takes many forms:
|
||||
|
||||
- committing changes to the repository;
|
||||
- reviewing pull requests by others;
|
||||
- triaging bug reports on the issue tracker;
|
||||
- discussing topics on official PyTorch communication channels.
|
||||
|
||||
Moderators
|
||||
~~~~~~~~~~
|
||||
|
||||
There is a group of people, some of whom are not core developers,
|
||||
responsible for ensuring that discussions on official communication
|
||||
channels adhere to the Code of Conduct. They take action in view of
|
||||
violations and help to support a healthy community. You can find the
|
||||
list of moderators `here <https://discuss.pytorch.org/about>`__.
|
||||
|
||||
Decision Making
|
||||
---------------
|
||||
|
||||
Uncontroversial Changes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Primary work happens through bug tracker issues and pull requests on
|
||||
GitHub. Core developers should avoid pushing their changes directly to
|
||||
the PyTorch repository, instead relying on pull requests. Approving a
|
||||
pull request by a core developer allows it to be merged without further
|
||||
process. Core Developers and Project Maintainers ultimately approve
|
||||
these changes.
|
||||
|
||||
Notifying relevant experts about a bug tracker issue or a pull request
|
||||
is important. Reviews from experts in the given interest area are
|
||||
strongly preferred, especially on pull request approvals. Failure to do
|
||||
so might end up with the change being reverted by the relevant expert.
|
||||
|
||||
Controversial decision process
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Substantial changes in a given interest area require a GitHub issue to
|
||||
be opened for discussion. This includes:
|
||||
|
||||
- Any semantic or syntactic change to the framework.
|
||||
- Backwards-incompatible changes to the Python or C++ API.
|
||||
- Additions to the core framework, including substantial new
|
||||
functionality within an existing library.
|
||||
- Removing core features
|
||||
|
||||
Project Maintainers ultimately approve these changes.
|
||||
|
||||
FAQ
|
||||
---
|
||||
|
||||
**Q: What if I would like to own (or partly own) a part of the project
|
||||
such as a domain API (e.g. Torch Vision)?** This is absolutely possible.
|
||||
The first step is to start contributing to the existing project area and
|
||||
contributing to its health and success. In addition to this, you can
|
||||
make a proposal through a GitHub issue for new functionality or changes
|
||||
to improve the project area.
|
||||
|
||||
**Q: What if I am a company looking to use PyTorch internally for
|
||||
development, can I be granted or purchase a board seat to drive the
|
||||
project direction?** No, the PyTorch project is strictly driven by the
|
||||
maintainer-driven project philosophy and does not have a board or
|
||||
vehicle to take financial contributions relating to gaining influence
|
||||
over technical direction.
|
||||
|
||||
**Q: Does the PyTorch project support grants or ways to support
|
||||
independent developers using or contributing to the project?** No, not
|
||||
at this point. We are however looking at ways to better support the
|
||||
community of independent developers around PyTorch. If you have
|
||||
suggestions or inputs, please reach out on the PyTorch forums to
|
||||
discuss.
|
||||
|
||||
**Q: How do I contribute code to the project?** If the change is
|
||||
relatively minor, a pull request on GitHub can be opened up immediately
|
||||
for review and merge by the project committers. For larger changes,
|
||||
please open an issue to make a proposal to discuss prior. Please also
|
||||
see the **`PyTorch Contributor
|
||||
Guide </docs/community/contribution_guide.html>`__** for contribution
|
||||
guidelines.
|
||||
|
||||
**Q: Can I become a committer on the project?** Unfortunately, the
|
||||
current commit process to PyTorch involves an interaction with Facebook
|
||||
infrastructure that can only be triggered by Facebook employees. We are
|
||||
however looking at ways to expand the committer base to individuals
|
||||
outside of Facebook and will provide an update when the tooling exists
|
||||
to allow this.
|
||||
|
||||
**Q: What if I would like to deliver a PyTorch tutorial at a conference
|
||||
or otherwise? Do I need to be 'officially' a committer to do this?** No,
|
||||
we encourage community members to showcase their work wherever and
|
||||
whenever they can. Please reach out to
|
||||
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
|
||||
for marketing support.
|
130
docs/source/community/persons_of_interest.rst
Normal file
@ -0,0 +1,130 @@
|
||||
PyTorch Governance | Persons of Interest
|
||||
=========================================
|
||||
|
||||
General Maintainers
|
||||
-------------------
|
||||
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
|
||||
- (sunsetting) Sam Gross
|
||||
(`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
Module-level maintainers
|
||||
------------------------
|
||||
|
||||
JIT
|
||||
~~~
|
||||
|
||||
- Zach Devito (`zdevito <https://github.com/zdevito>`__)
|
||||
- Michael Suo (`suo <https://github.com/suo>`__)
|
||||
|
||||
Distributed
|
||||
~~~~~~~~~~~
|
||||
|
||||
- Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
|
||||
- Shen Li (`mrshenli <https://github.com/mrshenli>`__)
|
||||
- (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__)
|
||||
|
||||
Autograd Engine
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
- Alban Desmaison (`alband <https://github.com/alband>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
|
||||
Multiprocessing and DataLoaders
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Simon Wang (`SsnL <https://github.com/SsnL>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- (proposed) Vitaly Fedyunin
|
||||
(`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
|
||||
|
||||
CUDA
|
||||
~~~~
|
||||
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
|
||||
|
||||
C++
|
||||
~~~
|
||||
|
||||
- Will Feng (`yf225 <https://github.com/yf225>`__)
|
||||
- (sunsetting) Peter Goldsborough
|
||||
(`goldsborough <https://github.com/goldsborough>`__)
|
||||
|
||||
Build + CI
|
||||
~~~~~~~~~~
|
||||
|
||||
- Will Feng (`yf225 <https://github.com/yf225>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- (sunsetting) Orion Reblitz-Richardson
|
||||
(`orionr <https://github.com/orionr>`__)
|
||||
|
||||
Distributions & RNG
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__)
|
||||
- Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__)
|
||||
- Alican Bozkurt (`alicanb <https://github.com/alicanb>`__)
|
||||
- Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)
|
||||
|
||||
C10
|
||||
~~~
|
||||
|
||||
- Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
|
||||
ONNX <-> PyTorch
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
- Lu Fang (`houseroad <https://github.com/houseroad>`__)
|
||||
|
||||
torch.nn
|
||||
~~~~~~~~
|
||||
|
||||
- Thomas Viehmann (`t-vi <https://github.com/t-vi>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
CPU Performance / SIMD
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
|
||||
- Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
- Richard Zou (`zou3519 <https://github.com/zou3519>`__)
|
||||
|
||||
AMD/ROCm/HIP
|
||||
~~~~~~~~~~~~
|
||||
|
||||
- Junjie Bai (`bddppq <https://github.com/bddppq>`__)
|
||||
- Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__)
|
||||
|
||||
Windows
|
||||
~~~~~~~
|
||||
|
||||
- Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)
|
||||
|
||||
MKLDNN
|
||||
~~~~~~
|
||||
|
||||
- Yinghai Lu (`yinghai <https://github.com/yinghai>`__)
|
||||
|
||||
XLA
|
||||
~~~
|
||||
|
||||
- Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
|
||||
- Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
|
||||
- Alex Suhan (`asuhan <https://github.com/asuhan>`__)
|
||||
|
||||
PPC
|
||||
~~~
|
||||
|
||||
- Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)
|
@ -1,6 +1,101 @@
|
||||
torch.hub
|
||||
===================================
|
||||
PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.
|
||||
|
||||
Publishing models
|
||||
-----------------
|
||||
|
||||
PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights)
|
||||
to a GitHub repository by adding a simple ``hubconf.py`` file.
|
||||
|
||||
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with
|
||||
the following signature.
|
||||
|
||||
::
|
||||
|
||||
def entrypoint_name(pretrained=False, *args, **kwargs):
|
||||
...
|
||||
|
||||
How to implement an entrypoint?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint
|
||||
for the ``resnet18`` model. You can see the full script in the
|
||||
`pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_
|
||||
|
||||
::
|
||||
|
||||
dependencies = ['torch', 'math']
|
||||
|
||||
def resnet18(pretrained=False, *args, **kwargs):
|
||||
"""
|
||||
Resnet18 model
|
||||
pretrained (bool): a recommended kwargs for all entrypoints
|
||||
args & kwargs are arguments for the function
|
||||
"""
|
||||
######## Call the model in the repo ###############
|
||||
from torchvision.models.resnet import resnet18 as _resnet18
|
||||
model = _resnet18(*args, **kwargs)
|
||||
######## End of call ##############################
|
||||
# The following logic is REQUIRED
|
||||
if pretrained:
|
||||
# For weights saved in local repo
|
||||
# model.load_state_dict(<path_to_saved_file>)
|
||||
|
||||
# For weights saved elsewhere
|
||||
checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
|
||||
model.load_state_dict(model_zoo.load_url(checkpoint, progress=False))
|
||||
return model
|
||||
|
||||
- The ``dependencies`` variable is a **list** of package names required to run the model.
|
||||
- Pretrained weights can either be stored locally in the GitHub repo, or be loadable by
|
||||
``model_zoo.load_url()``.
|
||||
- ``pretrained`` controls whether to load the pre-trained weights provided by repo owners.
|
||||
- ``args`` and ``kwargs`` are passed along to the real callable function.
|
||||
- The docstring of the function works as a help message, explaining what the model does and what
|
||||
arguments are allowed.
|
||||
- The entrypoint function should **ALWAYS** return a model (``nn.Module``).
|
||||
|
||||
Important Notice
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
- Published models must be on a branch or tag; they can't reference an arbitrary commit.
|
||||
|
||||
Loading models from Hub
|
||||
-----------------------
|
||||
|
||||
Users can load the pre-trained models using ``torch.hub.load()`` API.
|
||||
|
||||
|
||||
.. automodule:: torch.hub
|
||||
.. autofunction:: load
|
||||
|
||||
Here's an example loading ``resnet18`` entrypoint from ``pytorch/vision`` repo.
|
||||
|
||||
::
|
||||
|
||||
hub_model = hub.load(
|
||||
'pytorch/vision:master', # repo_owner/repo_name:branch
|
||||
'resnet18', # entrypoint
|
||||
1234, # args for callable [not applicable to resnet]
|
||||
pretrained=True) # kwargs for callable
|
||||
|
||||
Where are my downloaded model & weights saved?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The locations are searched in the following order:
|
||||
|
||||
- ``hub_dir``: a user-specified path. It can be set in the following ways:
|
||||
- Setting the environment variable ``TORCH_HUB_DIR``
|
||||
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
|
||||
- ``~/.torch/hub``
|
||||
|
||||
.. autofunction:: set_dir
|
||||
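For example, a minimal sketch of pointing the hub cache at a custom directory
(the path below is purely illustrative)::

    import torch.hub

    # Any writable directory works; repos and weights will be cached here.
    torch.hub.set_dir('/tmp/my_torch_hub_cache')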
|
||||
Caching logic
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
By default, we don't clean up files after loading them. Hub uses the cached copy by default if it already exists in ``hub_dir``.
|
||||
|
||||
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
|
||||
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
|
||||
when updates are published to the same branch, so users can keep up with the latest release.
|
||||
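A usage sketch, reusing the ``resnet18`` entrypoint from the example above::

    import torch.hub

    # force_reload=True discards the cached copy of the repo and the weights
    # and downloads everything again from scratch.
    model = torch.hub.load('pytorch/vision:master', 'resnet18',
                           pretrained=True, force_reload=True)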
|
@ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
|
||||
|
||||
notes/*
|
||||
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 1
|
||||
:caption: Community
|
||||
|
||||
community/*
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
@ -1,4 +1,4 @@
|
||||
Torch Script
|
||||
TorchScript
|
||||
============
|
||||
|
||||
.. contents:: :local:
|
||||
@ -6,17 +6,17 @@ Torch Script
|
||||
.. automodule:: torch.jit
|
||||
.. currentmodule:: torch.jit
|
||||
|
||||
Torch Script is a way to create serializable and optimizable models from PyTorch code.
|
||||
Any code written in Torch Script can be saved from your Python
|
||||
TorchScript is a way to create serializable and optimizable models from PyTorch code.
|
||||
Any code written in TorchScript can be saved from your Python
|
||||
process and loaded in a process where there is no Python dependency.
|
||||
|
||||
We provide tools to incrementally transition a model from being a pure Python program
|
||||
to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program.
|
||||
to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program.
|
||||
This makes it possible to train models in PyTorch using familiar tools and then export
|
||||
the model to a production environment where it is not a good idea to run models as Python programs
|
||||
for performance and multi-threading reasons.
|
||||
|
||||
Creating Torch Script Code
|
||||
Creating TorchScript Code
|
||||
--------------------------
|
||||
|
||||
|
||||
@ -117,26 +117,26 @@ Example:
|
||||
return self.resnet(input - self.means)
|
||||
|
||||
|
||||
Torch Script Language Reference
|
||||
TorchScript Language Reference
|
||||
-------------------------------
|
||||
|
||||
Torch Script is a subset of Python that can either be written directly (using
|
||||
TorchScript is a subset of Python that can either be written directly (using
|
||||
the @script annotations) or generated automatically from Python code via
|
||||
tracing. When using tracing, code is automatically converted into this subset of
|
||||
Python by recording only the actual operators on tensors and simply executing and
|
||||
discarding the other surrounding Python code.
|
||||
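As a minimal sketch of the tracing path (the traced function here is made up;
only the standard ``torch.jit.trace`` call is assumed)::

    import torch

    def scale_and_add(x, y):
        return x * 2 + y

    # Tracing runs the function on the example inputs and records only the
    # tensor operations that were actually executed.
    traced = torch.jit.trace(scale_and_add, (torch.rand(3), torch.rand(3)))
    print(traced.graph)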
|
||||
When writing Torch Script directly using @script annotations, the programmer must
|
||||
only use the subset of Python supported in Torch Script. This section documents
|
||||
what is supported in Torch Script as if it were a language reference for a stand
|
||||
When writing TorchScript directly using @script annotations, the programmer must
|
||||
only use the subset of Python supported in TorchScript. This section documents
|
||||
what is supported in TorchScript as if it were a language reference for a stand
|
||||
alone language. Any features of Python not mentioned in this reference are not
|
||||
part of Torch Script.
|
||||
part of TorchScript.
|
||||
|
||||
As a subset of Python any valid Torch Script function is also a valid Python
|
||||
As a subset of Python any valid TorchScript function is also a valid Python
|
||||
function. This makes it possible to remove the @script annotations and debug the
|
||||
function using standard Python tools like pdb. The reverse is not true: there
|
||||
are many valid python programs that are not valid Torch Script programs.
|
||||
Instead, Torch Script focuses specifically on the features of Python that are
|
||||
are many valid python programs that are not valid TorchScript programs.
|
||||
Instead, TorchScript focuses specifically on the features of Python that are
|
||||
needed to represent neural network models in Torch.
|
||||
|
||||
.. envvar:: PYTORCH_JIT=1
|
||||
@ -150,9 +150,9 @@ needed to represent neural network models in Torch.
|
||||
Types
|
||||
~~~~~
|
||||
|
||||
The largest difference between Torch Script and the full Python language is that
|
||||
Torch Script only support a small set of types that are needed to express neural
|
||||
net models. In particular Torch Script supports:
|
||||
The largest difference between TorchScript and the full Python language is that
|
||||
TorchScript only support a small set of types that are needed to express neural
|
||||
net models. In particular TorchScript supports:
|
||||
|
||||
``Tensor``
|
||||
A PyTorch tensor of any dtype, dimension, or backend.
|
||||
@ -169,8 +169,8 @@ net models. In particular Torch Script supports:
|
||||
``List[T]``
|
||||
A list of which all members are type ``T``
|
||||
|
||||
Unlike Python, each variable in Torch Script function must have a single static type.
|
||||
This makes it easier to optimize Torch Script functions.
|
||||
Unlike Python, each variable in TorchScript function must have a single static type.
|
||||
This makes it easier to optimize TorchScript functions.
|
||||
|
||||
Example::
|
||||
|
||||
@ -183,9 +183,9 @@ Example::
|
||||
return r # Type mismatch: r is set to type Tensor in the true branch
|
||||
# and type int in the false branch
|
||||
|
||||
By default, all parameters to a Torch Script function are assumed to be Tensor
|
||||
By default, all parameters to a TorchScript function are assumed to be Tensor
|
||||
because this is the most common type used in modules. To specify that an
|
||||
argument to a Torch Script function is another type, it is possible to use
|
||||
argument to a TorchScript function is another type, it is possible to use
|
||||
MyPy-style type annotations using the types listed above:
|
||||
|
||||
Example::
|
||||
@ -264,7 +264,7 @@ Subscripts
|
||||
``t[i:j, i]``
|
||||
|
||||
.. note::
|
||||
Torch Script currently does not support mutating tensors in place, so any
|
||||
TorchScript currently does not support mutating tensors in place, so any
|
||||
tensor indexing can only appear on the right-hand size of an expression.
|
||||
|
||||
Function calls
|
||||
@ -328,7 +328,7 @@ Accessing Module Parameters
|
||||
Statements
|
||||
~~~~~~~~~~
|
||||
|
||||
Torch Script supports the following types of statements:
|
||||
TorchScript supports the following types of statements:
|
||||
|
||||
Simple Assignments
|
||||
|
||||
@ -438,7 +438,7 @@ Return
|
||||
Variable Resolution
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Torch Script supports a subset of Python's variable resolution (i.e. scoping)
|
||||
TorchScript supports a subset of Python's variable resolution (i.e. scoping)
|
||||
rules. Local variables behave the same as in Python, except for the restriction
|
||||
that a variable must have the same type along all paths through a function.
|
||||
If a variable has a different type on different sides of an if statement, it
|
||||
@ -456,23 +456,23 @@ Example::
|
||||
print(y) # Error: undefined value y
|
||||
|
||||
Non-local variables are resolved to Python values at compile time when the
|
||||
function is defined. These values are then converted into Torch Script values using
|
||||
function is defined. These values are then converted into TorchScript values using
|
||||
the rules described in `Use of Python Values`_.
|
||||
|
||||
Use of Python Values
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To make writing Torch Script more convenient, we allow script code to refer
|
||||
To make writing TorchScript more convenient, we allow script code to refer
|
||||
to Python values in the surrounding scope. For instance, any time there is a
|
||||
reference to ``torch``, the Torch Script compiler is actually resolving it to the
|
||||
reference to ``torch``, the TorchScript compiler is actually resolving it to the
|
||||
``torch`` Python module when the function is declared. These Python values are
|
||||
not a first class part of Torch Script. Instead they are desugared at compile-time
|
||||
into the primitive types that Torch Script supports. This section describes the
|
||||
rules that are used when accessing Python values in Torch Script. They depend
|
||||
not a first class part of TorchScript. Instead they are desugared at compile-time
|
||||
into the primitive types that TorchScript supports. This section describes the
|
||||
rules that are used when accessing Python values in TorchScript. They depend
|
||||
on the dynamic type of the python valued referenced.
|
||||
|
||||
Functions
|
||||
Torch Script can call python functions. This functionality is very useful when
|
||||
TorchScript can call python functions. This functionality is very useful when
|
||||
incrementally converting a model into script. The model can be moved function-by-function
|
||||
to script, leaving calls to Python functions in place. This way you can incrementally
|
||||
check the correctness of the model as you go.
|
||||
@ -495,12 +495,12 @@ Functions
|
||||
|
||||
|
||||
Attribute Lookup On Python Modules
|
||||
Torch Script can lookup attributes on modules. Builtin functions like ``torch.add``
|
||||
are accessed this way. This allows Torch Script to call functions defined in
|
||||
TorchScript can lookup attributes on modules. Builtin functions like ``torch.add``
|
||||
are accessed this way. This allows TorchScript to call functions defined in
|
||||
other modules.
|
||||
|
||||
Python-defined Constants
|
||||
Torch Script also provides a way to use constants that are defined in Python.
|
||||
TorchScript also provides a way to use constants that are defined in Python.
|
||||
These can be used to hard-code hyper-parameters into the function, or to
|
||||
define universal constants. There are two ways of specifying that a Python
|
||||
value should be treated as a constant.
|
||||
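One common mechanism for this, sketched here under the assumption that the
module lists its constant attributes in ``__constants__`` on a
``ScriptModule`` (the module itself is made up)::

    import torch

    class Scaler(torch.jit.ScriptModule):
        # Names listed in __constants__ are baked into the compiled code as
        # constants instead of being looked up at run time.
        __constants__ = ['factor']

        def __init__(self):
            super(Scaler, self).__init__()
            self.factor = 4

        @torch.jit.script_method
        def forward(self, x):
            return x * self.factor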
@ -597,36 +597,35 @@ Interpreting Graphs
|
||||
|
||||
The example script above produces the graph::
|
||||
|
||||
graph(%len : int) {
|
||||
%13 : float = prim::Constant[value=1]()
|
||||
%10 : int = prim::Constant[value=10]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%3 : int[] = prim::ListConstruct(%1, %2)
|
||||
%4 : int = prim::Constant[value=6]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : int[] = prim::Constant[value=[0, -1]]()
|
||||
%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)
|
||||
%8 : int = prim::Constant[value=1]()
|
||||
%rv : Dynamic = prim::Loop(%len, %8, %rv.1)
|
||||
block0(%i : int, %12 : Dynamic) {
|
||||
%11 : int = aten::lt(%i, %10)
|
||||
%rv.4 : Dynamic = prim::If(%11)
|
||||
block0() {
|
||||
%14 : int = prim::Constant[value=1]()
|
||||
%rv.2 : Dynamic = aten::sub(%12, %13, %14)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%16 : int = prim::Constant[value=1]()
|
||||
%rv.3 : Dynamic = aten::add(%12, %13, %16)
|
||||
-> (%rv.3)
|
||||
}
|
||||
%19 : int = prim::Constant[value=1]()
|
||||
-> (%19, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
|
||||
|
||||
Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
|
||||
example. ``%rv.1 : Dynamic`` means we assign the output to a (unique)
|
||||
@ -676,34 +675,39 @@ Automatic Trace Checking
|
||||
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
|
||||
|
||||
Gives us the following diagnostic information::
|
||||
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff:
|
||||
graph(%0 : Dynamic) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%3 : Dynamic = aten::select(%0, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Dynamic = aten::select(%0, %4, %5)
|
||||
%7 : Dynamic = aten::mul(%3, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Dynamic = aten::select(%0, %8, %9)
|
||||
%11 : Dynamic = aten::mul(%7, %10)
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Dynamic = aten::select(%0, %12, %13)
|
||||
%15 : Dynamic = aten::mul(%11, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Dynamic = aten::select(%0, %16, %17)
|
||||
+ %19 : Dynamic = aten::mul(%15, %18)
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
|
||||
|
||||
This message indicates to us that the computation differed between when
|
||||
@ -733,23 +737,19 @@ Automatic Trace Checking
|
||||
|
||||
Which produces::
|
||||
|
||||
graph(%x : Dynamic) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Dynamic = aten::select(%x, %2, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%5 : int = prim::Constant[value=1]()
|
||||
%result : Dynamic = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Dynamic) {
|
||||
%9 : int = prim::Constant[value=0]()
|
||||
%10 : Dynamic = aten::select(%x, %9, %i)
|
||||
%result.2 : Dynamic = aten::mul(%7, %10)
|
||||
%12 : int = prim::Constant[value=1]()
|
||||
-> (%12, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
Tracer Warnings
|
||||
The tracer produces warnings for several problematic patterns in traced
|
||||
@ -789,14 +789,24 @@ Tracer Warnings
|
||||
Builtin Functions
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Torch Script supports a subset of the builtin tensor and neural network functions that
|
||||
PyTorch provides. Most methods on Tensor as well as functions in the ``torch``
|
||||
namespace are available. Many functions in ``torch.nn.functional`` are also availiable.
|
||||
Torch Script supports a subset of the builtin tensor and neural network
|
||||
functions that PyTorch provides. Most methods on Tensor as well as functions in
|
||||
the ``torch`` namespace, all functions in ``torch.nn.functional`` and all
|
||||
modules from ``torch.nn`` are supported in Torch Script, excluding those in the
|
||||
table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`.
|
||||
|
||||
Unsupported ``torch.nn`` Modules ::
|
||||
|
||||
torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss
|
||||
torch.nn.modules.normalization.CrossMapLRN2d
|
||||
torch.nn.modules.fold.Fold
|
||||
torch.nn.modules.fold.Unfold
|
||||
torch.nn.modules.rnn.GRU
|
||||
torch.nn.modules.rnn.LSTM
|
||||
torch.nn.modules.rnn.RNN
|
||||
torch.nn.modules.rnn.GRUCell
|
||||
torch.nn.modules.rnn.LSTMCell
|
||||
torch.nn.modules.rnn.RNNCell
|
||||
|
||||
We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or
|
||||
``Conv`` module. This functionality is something that will be developed in the future.
|
||||
For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn``
|
||||
modules into ScriptModules on construction.
|
||||
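A minimal sketch of that suggestion (the sizes and shapes are arbitrary)::

    import torch
    import torch.nn as nn

    # Tracing a standard nn.Linear with an example input yields a traced
    # module that can be called from script code.
    traced_linear = torch.jit.trace(nn.Linear(4, 2), torch.rand(1, 4))
    print(traced_linear(torch.rand(1, 4)))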
|
||||
.. automodule:: torch.jit.supported_ops
|
||||
|
@ -338,6 +338,7 @@ view of a storage and defines numeric operations on it.
|
||||
.. automethod:: reshape_as
|
||||
.. automethod:: resize_
|
||||
.. automethod:: resize_as_
|
||||
.. automethod:: roll
|
||||
.. automethod:: round
|
||||
.. automethod:: round_
|
||||
.. automethod:: rsqrt
|
||||
|
@ -269,6 +269,7 @@ Other Operations
|
||||
.. autofunction:: histc
|
||||
.. autofunction:: meshgrid
|
||||
.. autofunction:: renorm
|
||||
.. autofunction:: roll
|
||||
.. autofunction:: tensordot
|
||||
.. autofunction:: trace
|
||||
.. autofunction:: tril
|
||||
|
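For reference, a quick sketch of the behavior of the newly documented
``torch.roll``::

    import torch

    x = torch.arange(6).view(2, 3)
    # Shift every element one position along dim 1, wrapping around the end.
    print(torch.roll(x, shifts=1, dims=1))
    # tensor([[2, 0, 1],
    #         [5, 3, 4]])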
@ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
|
||||
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
|
||||
|
||||
if (BUILD_CAFFE2_OPS)
|
||||
#cmake only check for separate OpenMP library on AppleClang 7+
|
||||
#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
|
||||
if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
|
||||
Set(OpenMP_link ${OpenMP_libomp_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Note(ilijar): Since Detectron ops currently have no
|
||||
# CPU implementation, we only build GPU ops for now.
|
||||
if (USE_CUDA)
|
||||
@ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS)
|
||||
${Detectron_CPU_SRCS}
|
||||
${Detectron_GPU_SRCS})
|
||||
|
||||
target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link})
|
||||
target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu)
|
||||
install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
|
||||
elseif(NOT IOS_PLATFORM)
|
||||
add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
|
||||
target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
|
||||
target_link_libraries(caffe2_detectron_ops caffe2)
|
||||
install(TARGETS caffe2_detectron_ops DESTINATION lib)
|
||||
endif()
|
||||
endif()
|
||||
|
166
setup.py
@ -124,6 +124,7 @@
|
||||
# LD_LIBRARY_PATH
|
||||
# we will search for libraries in these paths
|
||||
|
||||
from __future__ import print_function
|
||||
from setuptools import setup, Extension, distutils, Command, find_packages
|
||||
import setuptools.command.build_ext
|
||||
import setuptools.command.install
|
||||
@ -144,86 +145,32 @@ import json
|
||||
import glob
|
||||
import importlib
|
||||
|
||||
from tools.setup_helpers.env import check_env_flag, check_negative_env_flag
|
||||
|
||||
|
||||
def hotpatch_var(var, prefix='USE_'):
|
||||
if check_env_flag('NO_' + var):
|
||||
os.environ[prefix + var] = '0'
|
||||
elif check_negative_env_flag('NO_' + var):
|
||||
os.environ[prefix + var] = '1'
|
||||
elif check_env_flag('WITH_' + var):
|
||||
os.environ[prefix + var] = '1'
|
||||
elif check_negative_env_flag('WITH_' + var):
|
||||
os.environ[prefix + var] = '0'
|
||||
|
||||
# Before we run the setup_helpers, let's look for NO_* and WITH_*
|
||||
# variables and hotpatch environment with the USE_* equivalent
|
||||
use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED',
|
||||
'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS']
|
||||
list(map(hotpatch_var, use_env_vars))
|
||||
|
||||
# Also hotpatch a few with BUILD_* equivalent
|
||||
build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS']
|
||||
[hotpatch_var(v, 'BUILD_') for v in build_env_vars]
|
||||
|
||||
from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION
|
||||
from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST,
|
||||
BUILD_CAFFE2_OPS, USE_LEVELDB,
|
||||
USE_LMDB, USE_OPENCV, USE_FFMPEG)
|
||||
from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION
|
||||
from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY,
|
||||
CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
|
||||
from tools.setup_helpers.fbgemm import USE_FBGEMM
|
||||
from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY,
|
||||
MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
|
||||
from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
|
||||
NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
|
||||
from tools.setup_helpers.nnpack import USE_NNPACK
|
||||
from tools.setup_helpers.qnnpack import USE_QNNPACK
|
||||
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
|
||||
# If you want to modify flags or environmental variables that is set when
|
||||
# building torch, you should do it in tools/setup_helpers/configure.py.
|
||||
# Please don't add it here unless it's only used in PyTorch.
|
||||
from tools.setup_helpers.configure import *
|
||||
from tools.setup_helpers.generate_code import generate_code
|
||||
from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext
|
||||
from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \
|
||||
USE_GLOO_IBVERBS
|
||||
|
||||
################################################################################
|
||||
# Parameters parsed from environment
|
||||
################################################################################
|
||||
|
||||
DEBUG = check_env_flag('DEBUG')
|
||||
REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO')
|
||||
IS_WINDOWS = (platform.system() == 'Windows')
|
||||
IS_DARWIN = (platform.system() == 'Darwin')
|
||||
IS_LINUX = (platform.system() == 'Linux')
|
||||
IS_PPC = (platform.machine() == 'ppc64le')
|
||||
IS_ARM = (platform.machine() == 'aarch64')
|
||||
VERBOSE_SCRIPT = True
|
||||
# see if the user passed a quiet flag to setup.py arguments and respect
|
||||
# that in our parts of the build
|
||||
for arg in sys.argv:
|
||||
if arg == "--":
|
||||
break
|
||||
if arg == '-q' or arg == '--quiet':
|
||||
VERBOSE_SCRIPT = False
|
||||
|
||||
BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
|
||||
# ppc64le and aarch64 do not support MKLDNN
|
||||
if IS_PPC or IS_ARM:
|
||||
USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF')
|
||||
if VERBOSE_SCRIPT:
|
||||
def report(*args):
|
||||
print(*args)
|
||||
else:
|
||||
USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON')
|
||||
|
||||
USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
|
||||
RERUN_CMAKE = True
|
||||
|
||||
NUM_JOBS = multiprocessing.cpu_count()
|
||||
max_jobs = os.getenv("MAX_JOBS")
|
||||
if max_jobs is not None:
|
||||
NUM_JOBS = min(NUM_JOBS, int(max_jobs))
|
||||
|
||||
ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE")
|
||||
if not ONNX_NAMESPACE:
|
||||
ONNX_NAMESPACE = "onnx_torch"
|
||||
|
||||
# Ninja
|
||||
try:
|
||||
import ninja
|
||||
USE_NINJA = True
|
||||
except ImportError:
|
||||
USE_NINJA = False
|
||||
def report(*args):
|
||||
pass
|
||||
|
||||
# Constant known variables used throughout this file
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -323,8 +270,9 @@ def build_libs(libs):
build_libs_cmd = ['tools\\build_pytorch_libs.bat']
else:
build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')]
my_env = os.environ.copy()
my_env["PYTORCH_PYTHON"] = sys.executable

my_env, extra_flags = get_pytorch_env_with_flags()
build_libs_cmd.extend(extra_flags)
my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library
my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir
my_env["PYTORCH_BUILD_VERSION"] = version
@@ -334,64 +282,8 @@ def build_libs(libs):
cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path
my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path

my_env["NUM_JOBS"] = str(NUM_JOBS)
my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE
if not IS_WINDOWS:
if USE_NINJA:
my_env["CMAKE_GENERATOR"] = '-GNinja'
my_env["CMAKE_INSTALL"] = 'ninja install'
else:
my_env['CMAKE_GENERATOR'] = ''
my_env['CMAKE_INSTALL'] = 'make install'
if USE_SYSTEM_NCCL:
my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
if USE_CUDA:
my_env["CUDA_BIN_PATH"] = CUDA_HOME
build_libs_cmd += ['--use-cuda']
if IS_WINDOWS:
my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME
if USE_CUDA_STATIC_LINK:
build_libs_cmd += ['--cuda-static-link']
if USE_FBGEMM:
build_libs_cmd += ['--use-fbgemm']
if USE_ROCM:
build_libs_cmd += ['--use-rocm']
if USE_NNPACK:
build_libs_cmd += ['--use-nnpack']
if USE_NUMPY:
my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR
if USE_CUDNN:
my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
if USE_MIOPEN:
my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR
my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY
my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR
if USE_MKLDNN:
build_libs_cmd += ['--use-mkldnn']
if USE_QNNPACK:
build_libs_cmd += ['--use-qnnpack']
if USE_GLOO_IBVERBS:
build_libs_cmd += ['--use-gloo-ibverbs']
if not RERUN_CMAKE:
build_libs_cmd += ['--dont-rerun-cmake']

my_env["BUILD_TORCH"] = "ON"
my_env["BUILD_PYTHON"] = "ON"
my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF"
my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF"
my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF"
my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF"
my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF"
my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF"
my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"

if VERBOSE_SCRIPT:
my_env['VERBOSE_SCRIPT'] = '1'
try:
os.mkdir('build')
except OSError:
@@ -660,6 +552,16 @@ class build_ext(build_ext_parent):
return outputs


# this is a subclass of build just to get access to self.build_lib
# as there does not seem to be an utility function getting this
class create_pyi(distutils.command.build.build):
def run(self):
print("-- Building .pyi --")
if sys.version_info[0] == 3:
from tools.pyi.gen_pyi import gen_pyi
gen_pyi(self.build_lib)


class build(distutils.command.build.build):
sub_commands = [
('build_deps', lambda self: True),
@@ -914,6 +816,7 @@ if USE_CUDA:

cmdclass = {
'create_version_file': create_version_file,
'create_pyi': create_pyi,
'build': build,
'build_py': build_py,
'build_ext': build_ext,
@@ -946,6 +849,7 @@ if __name__ == '__main__':
entry_points=entry_points,
package_data={
'torch': [
'__init__.pyi',
'lib/*.so*',
'lib/*.dylib*',
'lib/*.dll',
@@ -458,6 +458,10 @@ method_tests = [
NO_ARGS, [skipIfNoLapack]),
('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2",
NO_ARGS, [skipIfNoLapack]),
('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"),
('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"),
('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"),
('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"),
('addcmul', (S, S), ((S, S), (S, S))),
('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'),
('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'),
@@ -560,8 +564,14 @@ method_tests = [
('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
('tril', (M, M), NO_ARGS),
('tril', (M, M), (2,), 'idx'),
('tril', (S, M, M), NO_ARGS, 'batched'),
('tril', (S, M, M), (2,), 'batched_idx'),
('tril', (3, 3, S, S), NO_ARGS, 'more_batched'),
('triu', (M, M), NO_ARGS),
('triu', (M, M), (2,), 'idx'),
('triu', (S, M, M), NO_ARGS, 'batched'),
('triu', (S, M, M), (2,), 'batched_idx'),
('triu', (3, 3, S, S), NO_ARGS, 'more_batched'),
('trace', (M, M), NO_ARGS),
('cross', (S, 3), ((S, 3),)),
('cross', (S, 3, S), ((S, 3, S), 1), 'dim'),
@@ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs):
return torch.stack(all_matrices).reshape(*(batches + (l, l)))


def brute_pdist(inp, p=2):
    """Computes the same as torch.pdist using primitives"""
    n = inp.shape[-2]
    k = n * (n - 1) // 2
    if k == 0:
        # torch complains about empty indices
        return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device)
    square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1)
    unroll = square.view(square.shape[:-2] + (n * n,))
    inds = torch.ones(k, dtype=torch.int)
    inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int)
    return unroll[..., inds.cumsum(0)]


def do_test_dtypes(self, dtypes, layout, device):
    for dtype in dtypes:
        if dtype != torch.float16:
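As a usage note (editor's sketch, not from the diff): brute_pdist above is intended to agree with torch.pdist, which returns the flattened upper triangle of the pairwise-distance matrix. Assuming the helper is in scope, a minimal cross-check looks like this:

```python
import torch

# Hypothetical sanity check for the brute_pdist helper shown in the hunk above.
x = torch.randn(5, 3)
expected = torch.pdist(x, p=2)   # flattened upper-triangular pairwise distances
actual = brute_pdist(x, p=2)     # assumes brute_pdist is importable
assert torch.allclose(actual, expected)
```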
@@ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) {
ASSERT_EQ(batch[1].target, "2");
}

struct DummyTensorDataset
: datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> {
Example<torch::Tensor, int> get(size_t index) override {
const auto channels = static_cast<int64_t>(index);
torch::Tensor tensor =
(channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4});
return {tensor, static_cast<int>(channels)};
}

torch::optional<size_t> size() const override {
return 100;
}
};

TEST(DataTest, NormalizeTransform) {
auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1));

// Works for zero (one implicit) channels
std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0);
ASSERT_EQ(output.size(), 1);
// (1 - 0.5) / 0.1 = 5
ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5))
<< output[0].data;

// Works for one explicit channel
output = dataset.get_batch(1);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 1);
ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;

// Works for two channels with different moments
dataset = DummyTensorDataset().map(
transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2}));
output = dataset.get_batch(2);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 2);
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/1)
.allclose(torch::ones({1, 4, 4}) * -2.5))
<< output[0].data;

// Works for three channels with one moment value
dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2));
output = dataset.get_batch(3);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 3);
ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5))
<< output[0].data;

// Works for three channels with different moments
dataset = DummyTensorDataset().map(
transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2}));
output = dataset.get_batch(3);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 3);
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/1, /*end=*/2)
.allclose(torch::ones({1, 4, 4}) * -2.5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/2)
.allclose(torch::ones({1, 4, 4}) * 12.5))
<< output[0].data;
}
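For readers more used to the Python API, the arithmetic this NormalizeTransform test checks is plain per-channel standardization. An illustrative Python equivalent (an editor's sketch, not part of the C++ diff):

```python
import torch

# Per-channel (x - mean) / std with broadcasting over the trailing 4x4 plane,
# matching the expectations asserted in the C++ test above.
x = torch.ones(2, 4, 4)
mean = torch.tensor([0.5, 1.5]).view(2, 1, 1)
std = torch.tensor([0.1, 0.2]).view(2, 1, 1)
out = (x - mean) / std
assert torch.allclose(out[0], torch.full((4, 4), 5.0))   # (1 - 0.5) / 0.1
assert torch.allclose(out[1], torch.full((4, 4), -2.5))  # (1 - 1.5) / 0.2
```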
struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> {
UnCopyableDataset() = default;

@@ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) {
TEST_F(ModuleTest, ZeroGrad) {
Linear module(3, 4);
auto weight = torch::ones({8, 3}, torch::requires_grad());
auto loss = module->forward(weight).sum();
auto loss = module(weight).sum();
loss.backward();
for (auto& parameter : module->parameters()) {
auto grad = parameter.grad();
@@ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) {
ASSERT_NO_THROW(module->modules());
}
}

struct ModuleWithNonTensorForwardImpl : torch::nn::Module {
int64_t forward(torch::Tensor x) {
return x.numel();
}
};
TORCH_MODULE(ModuleWithNonTensorForward);

TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) {
ModuleWithNonTensorForward m;
ASSERT_EQ(m(torch::ones(123)), 123);
}
@@ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {};
TEST_F(ModulesTest, Conv1d) {
Conv1d model(Conv1dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) {
TEST_F(ModulesTest, Conv2dEven) {
Conv2d model(Conv2dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) {
TEST_F(ModulesTest, Conv2dUneven) {
Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2}));
auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) {
TEST_F(ModulesTest, Conv3d) {
Conv3d model(Conv3dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) {
TEST_F(ModulesTest, Linear) {
Linear model(5, 2);
auto x = torch::randn({10, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) {
auto l3 = model->add(Linear(5, 100), "l3");

auto x = torch::randn({1000, 10}, torch::requires_grad());
x = l1->forward(x).clamp_min(0);
x = l2->forward(x).clamp_min(0);
x = l3->forward(x).clamp_min(0);
x = l1(x).clamp_min(0);
x = l2(x).clamp_min(0);
x = l3(x).clamp_min(0);

x.backward();
ASSERT_EQ(x.ndimension(), 2);
@@ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
// Cannot get gradients to change indices (input) - only for embedding
// params
auto x = torch::full({10}, dict_size - 1, torch::kInt64);
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
TEST_F(ModulesTest, EmbeddingList) {
Embedding model(6, 4);
auto x = torch::full({2, 3}, 5, torch::kInt64);
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) {
TEST_F(ModulesTest, Dropout) {
Dropout dropout(0.5);
torch::Tensor x = torch::ones(100, torch::requires_grad());
torch::Tensor y = dropout->forward(x);
torch::Tensor y = dropout(x);

y.backward();
ASSERT_EQ(y.ndimension(), 1);
@@ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) {
ASSERT_GT(y.sum().item<float>(), 70); // Probably

dropout->eval();
y = dropout->forward(x);
y = dropout(x);
ASSERT_EQ(y.sum().item<float>(), 100);
}

@@ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) {
was_called = true;
return input;
});
auto output = functional->forward(torch::ones(5, torch::requires_grad()));
auto output = functional(torch::ones(5, torch::requires_grad()));
ASSERT_TRUE(was_called);
ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad())));

@@ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) {
ASSERT_FALSE(bn->bias.defined());

ASSERT_THROWS_WITH(
bn->forward(torch::ones({2, 5})),
bn(torch::ones({2, 5})),
"Calling BatchNorm::forward is only permitted "
"when the 'stateful' option is true (was false). "
"Use BatchNorm::pure_forward instead.");
@@ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) {
model->to(torch::kCUDA);
auto x =
torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true));
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) {
model->to(torch::kCUDA);
model->to(torch::kCPU);
auto x = torch::randn({10, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -215,7 +215,9 @@ TEST(SerializeTest, Optim) {
TEST(SerializeTest, XOR_CUDA) {
torch::manual_seed(0);
// We better be able to save and load a XOR model!
auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) {
auto getLoss = [](Sequential model,
uint32_t batch_size,
bool is_cuda = false) {
auto inputs = torch::empty({batch_size, 2});
auto labels = torch::empty({batch_size});
if (is_cuda) {
@@ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) {
loss = getLoss(model3, 100, true);
ASSERT_LT(loss.item<float>(), 0.1);
}

TEST(
SerializeTest,
CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) {
struct C : torch::nn::Module {
C() {
register_buffer("foo", torch::ones(5, torch::kInt32));
}
};
struct B : torch::nn::Module {};
struct A : torch::nn::Module {
A() {
register_module("b", std::make_shared<B>());
register_module("c", std::make_shared<C>());
}
};
struct M : torch::nn::Module {
M() {
register_module("a", std::make_shared<A>());
}
};

auto out = std::make_shared<M>();
std::stringstream ss;
torch::save(out, ss);
auto in = std::make_shared<M>();
torch::load(in, ss);

const int output = in->named_buffers()["a.c.foo"].sum().item<int>();
ASSERT_EQ(output, 5);
}
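The C++ serialization test above has a close Python analogue; as an illustrative sketch (not part of the diff), a buffer registered on a nested module survives a save/load round trip:

```python
import io
import torch

class Inner(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Non-parameter state is stored as a buffer, like register_buffer("foo", ...) above.
        self.register_buffer("foo", torch.ones(5, dtype=torch.int32))

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

buffer = io.BytesIO()
torch.save(Outer().state_dict(), buffer)
buffer.seek(0)
state = torch.load(buffer)
assert int(state["inner.foo"].sum()) == 5
```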
@@ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) {
ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>());
}

struct A : torch::nn::Module {
int forward() {
return 5;
}
};

struct B : torch::nn::Module {
std::string forward(torch::Tensor tensor) {
return "";
}
};

struct C : torch::nn::Module {
float forward(torch::Tensor& tensor) {
return 5.0;
}
};

struct D : torch::nn::Module {
char forward(torch::Tensor&& tensor) {
return 'x';
}
};

struct E : torch::nn::Module {};

// Put in a function because macros don't handle the comma between arguments to
// is_same well ...
template <typename Module, typename ExpectedType, typename... Args>
void assert_has_expected_type() {
using ReturnType =
typename torch::detail::return_type_of_forward<Module, Args...>::type;
constexpr bool is_expected_type =
std::is_same<ReturnType, ExpectedType>::value;
ASSERT_TRUE(is_expected_type) << Module().name();
}

TEST(TestStatic, ReturnTypeOfForward) {
assert_has_expected_type<A, int>();
assert_has_expected_type<B, std::string, torch::Tensor>();
assert_has_expected_type<C, float, torch::Tensor&>();
assert_has_expected_type<D, char, torch::Tensor&&>();
assert_has_expected_type<E, void>();
}

TEST(TestStatic, Apply) {
std::vector<int> v;
torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5);
@@ -10,12 +10,13 @@ graph(%x.1_data : Tensor
%x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims)
block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) {
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%16)
%alpha : float = prim::Float(%16)
%data.1 : Tensor = aten::add(%5_data, %y_data, %alpha)
%mask : Tensor = aten::mul(%5_mask, %y_mask)
%dims : Tensor = aten::__or__(%5_dims, %y_dims)
%data : Tensor = aten::where(%mask, %data.1, %5_data)
-> (%7, %data, %mask, %dims)
}
return (%x, %10, %11);
%22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11)
return (%22);
}

@@ -7,33 +7,31 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::gt(%a.1_data, %b_data)
%8 : Tensor = aten::mul(%a.1_mask, %b_mask)
%9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%10 : bool = prim::TensorToBool(%7)
%11 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::TensorToNum(%11)
%9 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::Float(%9)
%data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
%mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%16)
%14 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%14)
%data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%21 : bool = prim::Constant[value=1]()
%22 : int = prim::Constant[value=1]()
%23 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %23)
%25 : int = aten::dim(%data.2)
%26 : bool = aten::eq(%25, %22)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
%19 : bool = prim::Constant[value=1]()
%20 : int = prim::Constant[value=1]()
%21 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %21)
%23 : int = aten::dim(%data.2)
%24 : bool = aten::eq(%23, %20)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%24)
block0() {
%29 : int = aten::dim(%data.1)
%30 : int = aten::sub(%29, %22)
%data.4 : Tensor = prim::Loop(%30, %21, %data.2)
block0(%32 : int, %33 : Tensor) {
%34 : int = aten::dim(%33)
%data.3 : Tensor = aten::unsqueeze(%33, %34)
-> (%21, %data.3)
%27 : int = aten::dim(%data.1)
%28 : int = aten::sub(%27, %20)
%data.4 : Tensor = prim::Loop(%28, %19, %data.2)
block0(%30 : int, %31 : Tensor) {
%32 : int = aten::dim(%31)
%data.3 : Tensor = aten::unsqueeze(%31, %32)
-> (%19, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
@@ -45,5 +43,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data.1, %data)
%res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
%res_dims : Tensor = aten::__or__(%dims.1, %dims)
return (%res_data, %res_mask, %res_dims);
%39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%39);
}

@@ -7,34 +7,33 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : float = prim::Constant[value=0.1]()
%8 : Float() = prim::NumToTensor(%7)
%other : float = prim::TensorToNum(%8)
%other : float = prim::Float(%8)
%10 : Tensor = aten::gt(%a.1_data, %other)
%11 : bool = prim::TensorToBool(%10)
%12 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::TensorToNum(%12)
%11 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::Float(%11)
%data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
%mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%17 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%17)
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%16)
%data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%22 : bool = prim::Constant[value=1]()
%23 : int = prim::Constant[value=1]()
%24 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %24)
%26 : int = aten::dim(%data.2)
%27 : bool = aten::eq(%26, %23)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%27)
%21 : bool = prim::Constant[value=1]()
%22 : int = prim::Constant[value=1]()
%23 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %23)
%25 : int = aten::dim(%data.2)
%26 : bool = aten::eq(%25, %22)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
block0() {
%30 : int = aten::dim(%data.1)
%31 : int = aten::sub(%30, %23)
%data.4 : Tensor = prim::Loop(%31, %22, %data.2)
block0(%33 : int, %34 : Tensor) {
%35 : int = aten::dim(%34)
%data.3 : Tensor = aten::unsqueeze(%34, %35)
-> (%22, %data.3)
%29 : int = aten::dim(%data.1)
%30 : int = aten::sub(%29, %22)
%data.4 : Tensor = prim::Loop(%30, %21, %data.2)
block0(%32 : int, %33 : Tensor) {
%34 : int = aten::dim(%33)
%data.3 : Tensor = aten::unsqueeze(%33, %34)
-> (%21, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
@@ -46,5 +45,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data.1, %data)
%res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
%res_dims : Tensor = aten::__or__(%dims.1, %dims)
return (%res_data, %res_mask, %res_dims);
%41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%41);
}

@@ -7,28 +7,26 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::gt(%a.1_data, %b_data)
%8 : Tensor = aten::mul(%a.1_mask, %b_mask)
%9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%10 : bool = prim::TensorToBool(%7)
%11 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%11)
%9 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%9)
%data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%16 : bool = prim::Constant[value=1]()
%17 : int = prim::Constant[value=1]()
%18 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %18)
%20 : int = aten::dim(%data.2)
%21 : bool = aten::eq(%20, %17)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
%14 : bool = prim::Constant[value=1]()
%15 : int = prim::Constant[value=1]()
%16 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %16)
%18 : int = aten::dim(%data.2)
%19 : bool = aten::eq(%18, %15)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%19)
block0() {
%24 : int = aten::dim(%data)
%25 : int = aten::sub(%24, %17)
%data.4 : Tensor = prim::Loop(%25, %16, %data.2)
block0(%27 : int, %28 : Tensor) {
%29 : int = aten::dim(%28)
%data.3 : Tensor = aten::unsqueeze(%28, %29)
-> (%16, %data.3)
%22 : int = aten::dim(%data)
%23 : int = aten::sub(%22, %15)
%data.4 : Tensor = prim::Loop(%23, %14, %data.2)
block0(%25 : int, %26 : Tensor) {
%27 : int = aten::dim(%26)
%data.3 : Tensor = aten::unsqueeze(%26, %27)
-> (%14, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -40,5 +38,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
%res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
return (%res_data, %res_mask, %res_dims);
%34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%34);
}

@@ -7,29 +7,28 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : float = prim::Constant[value=0.1]()
%8 : Float() = prim::NumToTensor(%7)
%other : float = prim::TensorToNum(%8)
%other : float = prim::Float(%8)
%10 : Tensor = aten::gt(%a.1_data, %other)
%11 : bool = prim::TensorToBool(%10)
%12 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%12)
%11 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%11)
%data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%17 : bool = prim::Constant[value=1]()
%18 : int = prim::Constant[value=1]()
%19 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %19)
%21 : int = aten::dim(%data.2)
%22 : bool = aten::eq(%21, %18)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%22)
%16 : bool = prim::Constant[value=1]()
%17 : int = prim::Constant[value=1]()
%18 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %18)
%20 : int = aten::dim(%data.2)
%21 : bool = aten::eq(%20, %17)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
block0() {
%25 : int = aten::dim(%data)
%26 : int = aten::sub(%25, %18)
%data.4 : Tensor = prim::Loop(%26, %17, %data.2)
block0(%28 : int, %29 : Tensor) {
%30 : int = aten::dim(%29)
%data.3 : Tensor = aten::unsqueeze(%29, %30)
-> (%17, %data.3)
%24 : int = aten::dim(%data)
%25 : int = aten::sub(%24, %17)
%data.4 : Tensor = prim::Loop(%25, %16, %data.2)
block0(%27 : int, %28 : Tensor) {
%29 : int = aten::dim(%28)
%data.3 : Tensor = aten::unsqueeze(%28, %29)
-> (%16, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -41,5 +40,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
%res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
return (%res_data, %res_mask, %res_dims);
%36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%36);
}

@@ -9,38 +9,35 @@ graph(%a.1_data : Tensor
%8 : Tensor = aten::gt(%a.1_data, %b_data)
%9 : Tensor = aten::mul(%a.1_mask, %b_mask)
%10 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%11 : bool = prim::TensorToBool(%8)
%12 : int = prim::Constant[value=0]()
%13 : Tensor = aten::mul(%8, %9)
%14 : Tensor = aten::sum(%13)
%15 : Tensor = aten::gt(%14, %12)
%16 : bool = prim::TensorToBool(%15)
%17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims)
block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
%30 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%30)
%11 : int = prim::Constant[value=0]()
%12 : Tensor = aten::mul(%8, %9)
%13 : Tensor = aten::sum(%12)
%14 : Tensor = aten::gt(%13, %11)
%15 : bool = prim::Bool(%14)
%16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims)
block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
%27 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%27)
%data : Tensor = aten::sub(%6_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%6_mask, %b_mask)
%dims : Tensor = aten::__or__(%6_dims, %b_dims)
%35 : Tensor = aten::gt(%data, %b_data)
%36 : Tensor = aten::mul(%mask, %b_mask)
%37 : Tensor = aten::__or__(%dims, %b_dims)
%38 : bool = prim::TensorToBool(%35)
%39 : bool = prim::Constant[value=1]()
%40 : int = prim::Constant[value=1]()
%41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
%data.2 : Tensor = aten::mul(%cond_data.2, %41)
%43 : int = aten::dim(%data.2)
%44 : bool = aten::eq(%43, %40)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%44)
%32 : Tensor = aten::gt(%data, %b_data)
%33 : Tensor = aten::mul(%mask, %b_mask)
%34 : bool = prim::Constant[value=1]()
%35 : int = prim::Constant[value=1]()
%36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
%data.2 : Tensor = aten::mul(%cond_data.2, %36)
%38 : int = aten::dim(%data.2)
%39 : bool = aten::eq(%38, %35)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%39)
block0() {
%47 : int = aten::dim(%data)
%48 : int = aten::sub(%47, %40)
%data.4 : Tensor = prim::Loop(%48, %39, %data.2)
block0(%50 : int, %51 : Tensor) {
%52 : int = aten::dim(%51)
%data.3 : Tensor = aten::unsqueeze(%51, %52)
-> (%39, %data.3)
%42 : int = aten::dim(%data)
%43 : int = aten::sub(%42, %35)
%data.4 : Tensor = prim::Loop(%43, %34, %data.2)
block0(%45 : int, %46 : Tensor) {
%47 : int = aten::dim(%46)
%data.3 : Tensor = aten::unsqueeze(%46, %47)
-> (%34, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -52,12 +49,13 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %6_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask)
%res_dims : Tensor = aten::__or__(%dims, %6_dims)
%59 : int = prim::Constant[value=0]()
%60 : Tensor = aten::mul(%35, %36)
%61 : Tensor = aten::sum(%60)
%62 : Tensor = aten::gt(%61, %59)
%63 : bool = prim::TensorToBool(%62)
-> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims)
%54 : int = prim::Constant[value=0]()
%55 : Tensor = aten::mul(%32, %33)
%56 : Tensor = aten::sum(%55)
%57 : Tensor = aten::gt(%56, %54)
%58 : bool = prim::Bool(%57)
-> (%58, %32, %33, %res_data, %res_mask, %res_dims)
}
return (%a, %21, %22);
%59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20)
return (%59);
}
Some files were not shown because too many files have changed in this diff.