Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-31 20:34:54 +08:00

Compare commits: ciflow/tru ... v1.0.1 (101 commits)

	| Author | SHA1 | Date | |
|---|---|---|---|
| bb15580e88 | |||
| 743fdbdb19 | |||
| cdb9fd44dc | |||
| 83221655a8 | |||
| 48fcfdaccb | |||
| 18eef1d8d9 | |||
| 770462a5ff | |||
| 0f87ff6e38 | |||
| eb531da9a8 | |||
| 37c8a33b54 | |||
| 0e9bdcab80 | |||
| 1347a184ca | |||
| 1cb565fa34 | |||
| dab52a4a16 | |||
| 0a440da88b | |||
| cf11411d42 | |||
| fd8212cebd | |||
| ce37ec38f3 | |||
| 047231e1e1 | |||
| f748654e0e | |||
| 8fdcdc3c3f | |||
| 40fa56a5d1 | |||
| 2f9642010e | |||
| 3c10845036 | |||
| d85372f330 | |||
| 5fc2c8b115 | |||
| fc0c2252d2 | |||
| 304b932879 | |||
| e274158c72 | |||
| 8d1fc20c8b | |||
| af03dbb93b | |||
| b24edae29e | |||
| c99c8d8aa3 | |||
| eac4c5d901 | |||
| 231f1a4991 | |||
| b65b55a652 | |||
| c926cb4408 | |||
| a6f4538f31 | |||
| 7d3e2fa190 | |||
| 98bc784694 | |||
| 3c83026249 | |||
| 202909d601 | |||
| d4eea46dcd | |||
| cf0965736c | |||
| 274e755237 | |||
| c19b16cc99 | |||
| 228f73e7a9 | |||
| 1e61241227 | |||
| 9a9eae14d0 | |||
| fb92c3c7b5 | |||
| a9cf3f69ab | |||
| 6460628b25 | |||
| 74433436e8 | |||
| 57c685520f | |||
| ca1f9349dd | |||
| 6290587244 | |||
| 9c863c1952 | |||
| 84cf1660d2 | |||
| e8361887b1 | |||
| 9a7737146c | |||
| e27b4ba594 | |||
| 0384a0282b | |||
| f80dba92ae | |||
| 1b7113eaae | |||
| 7fec47f40d | |||
| d711595a07 | |||
| eef3be686e | |||
| ba4d1e8ca6 | |||
| ab1cd6241b | |||
| 1ff075b7df | |||
| b879d006f1 | |||
| 167f8e8314 | |||
| dfdf2376bb | |||
| 95fd0afed5 | |||
| 4e5b994ca7 | |||
| 5dbcbbf715 | |||
| 9067e9411d | |||
| 4c964dac7f | |||
| 7b40d9c7ff | |||
| e7767c1af3 | |||
| 982a8722cc | |||
| 3c1cbb8da8 | |||
| 5f51de77c9 | |||
| a4e2d27ddb | |||
| 4909529584 | |||
| 7b98af16ee | |||
| fe098a3605 | |||
| 3486cebd87 | |||
| a5a34fb5b1 | |||
| b2c4c55734 | |||
| b104068d24 | |||
| e0834ded98 | |||
| 30aed0237d | |||
| 033ae1598f | |||
| 8ca4fc3fd2 | |||
| 20296297ca | |||
| 72d27e3802 | |||
| 563d67087c | |||
| 7dc06810c2 | |||
| 07e4a5e069 | |||
| db5d3131d1 | |||
| @ -1,14 +1,14 @@ | ||||
| # IMPORTANT: To update Docker image version, please search and update ":{previous_version}" | ||||
| # in this file to the new version number, and **ALSO** update the version number below: | ||||
| # PyTorchDockerVersion:262 | ||||
| # Caffe2DockerVersion:230 | ||||
| # PyTorchDockerVersion:282 | ||||
| # Caffe2DockerVersion:238 | ||||
|  | ||||
| docker_config_defaults: &docker_config_defaults | ||||
|   user: jenkins | ||||
|   aws_auth: | ||||
|     # This IAM user only allows read-write access to ECR | ||||
|     aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2} | ||||
|     aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2} | ||||
|     aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3} | ||||
|     aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3} | ||||
|  | ||||
| # NOTE: We only perform the merge in build step and not in test step, because | ||||
| # all source files will be shared from build to test | ||||
| @ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client | ||||
|     sudo apt-get -qq update | ||||
|     sudo apt-get -qq install openssh-client git | ||||
|  | ||||
| install_doc_push_script: &install_doc_push_script | ||||
|   name: Install the doc push script | ||||
|   no_output_timeout: "2m" | ||||
|   command: | | ||||
|     cat >/home/circleci/project/doc_push_script.sh <<EOL | ||||
|     # =================== The following code **should** be executed inside Docker container =================== | ||||
|  | ||||
|     # This is where the local pytorch install in the docker image is located | ||||
|     pt_checkout="/var/lib/jenkins/workspace" | ||||
|  | ||||
|     # Since we're cat-ing this file, we need to escape all $'s | ||||
|     echo "doc_push_script.sh: Invoked with \$*" | ||||
|  | ||||
|     git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site | ||||
|     pushd pytorch.github.io | ||||
|  | ||||
|     set -ex | ||||
|  | ||||
|     # Argument 1: Where to copy the built documentation to | ||||
|     # (pytorch.github.io/$install_path) | ||||
|     install_path="\$1" | ||||
|     if [ -z "\$install_path" ]; then | ||||
|     echo "error: doc_push_script.sh: install_path (arg1) not specified" | ||||
|       exit 1 | ||||
|     fi | ||||
|  | ||||
|     # Argument 2: What version of the docs we are building. | ||||
|     version="\$2" | ||||
|     if [ -z "\$version" ]; then | ||||
|     echo "error: doc_push_script.sh: version (arg2) not specified" | ||||
|       exit 1 | ||||
|     fi | ||||
|  | ||||
|     is_master_doc=false | ||||
|     if [ "\$version" == "master" ]; then | ||||
|       is_master_doc=true | ||||
|     fi | ||||
|  | ||||
|     # Argument 3: (optional) If present, we will NOT do any pushing. Used for testing. | ||||
|     dry_run=false | ||||
|     if [ "\$3" != "" ]; then | ||||
|       dry_run=true | ||||
|     fi | ||||
|  | ||||
|     echo "install_path: \$install_path  version: \$version  dry_run: \$dry_run" | ||||
|  | ||||
|     export LC_ALL=C | ||||
|     export PATH=/opt/conda/bin:$PATH | ||||
|  | ||||
|     rm -rf pytorch || true | ||||
|  | ||||
|     # Get all the documentation sources, put them in one place | ||||
|     pushd "\$pt_checkout" | ||||
|     git clone https://github.com/pytorch/vision | ||||
|     pushd vision | ||||
|     conda install -q pillow | ||||
|     time python setup.py install | ||||
|     popd | ||||
|     pushd docs | ||||
|     rm -rf source/torchvision | ||||
|     cp -r ../vision/docs/source source/torchvision | ||||
|  | ||||
|     # Build the docs | ||||
|     pip -q install -r requirements.txt || true | ||||
|     if [ "\$is_master_doc" = true ]; then | ||||
|       make html | ||||
|     else | ||||
|       make html-stable | ||||
|     fi | ||||
|  | ||||
|     # Move them into the docs repo | ||||
|     popd | ||||
|     popd | ||||
|     git rm -rf "\$install_path" || true | ||||
|     mv "\$pt_checkout/docs/build/html" "\$install_path" | ||||
|  | ||||
|     # Add the version handler by search and replace. | ||||
|     # XXX: Consider moving this to the docs Makefile or site build | ||||
|     if [ "\$is_master_doc" = true ]; then | ||||
|       find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g" | ||||
|     else | ||||
|       find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g" | ||||
|     fi | ||||
|  | ||||
|     git add "\$install_path" || true | ||||
|     git status | ||||
|     git config user.email "soumith+bot@pytorch.org" | ||||
|     git config user.name "pytorchbot" | ||||
|     # If there aren't changes, don't make a commit; push is no-op | ||||
|     git commit -m "auto-generating sphinx docs" || true | ||||
|     git status | ||||
|  | ||||
|     if [ "\$dry_run" = false ]; then | ||||
|       echo "Pushing to pytorch.github.io:site" | ||||
|       git push origin site | ||||
|     else | ||||
|       echo "Skipping push due to dry_run" | ||||
|     fi | ||||
|  | ||||
|     popd | ||||
|     # =================== The above code **should** be executed inside Docker container =================== | ||||
|     EOL | ||||
|     chmod +x /home/circleci/project/doc_push_script.sh | ||||
|  | ||||
| setup_ci_environment: &setup_ci_environment | ||||
|   name: Set Up CI Environment | ||||
|   no_output_timeout: "1h" | ||||
| @ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment | ||||
|       echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env | ||||
|  | ||||
|       # This IAM user allows write access to S3 bucket for sccache | ||||
|       echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env | ||||
|       echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env | ||||
|       echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env | ||||
|       echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env | ||||
|     fi | ||||
|  | ||||
|     # This IAM user only allows read-write access to ECR | ||||
|     export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2} | ||||
|     export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2} | ||||
|     export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3} | ||||
|     export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3} | ||||
|     eval $(aws ecr get-login --region us-east-1 --no-include-email) | ||||
|  | ||||
| pytorch_linux_build_defaults: &pytorch_linux_build_defaults | ||||
| @ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults | ||||
|       <<: *setup_ci_environment | ||||
|   - run: | ||||
|       name: Test | ||||
|       no_output_timeout: "90m" | ||||
|       no_output_timeout: "1h" | ||||
|       command: | | ||||
|         set -e | ||||
|         export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} | ||||
| @ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults | ||||
|  | ||||
|           export IN_CIRCLECI=1 | ||||
|  | ||||
|           # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae | ||||
|           brew install moreutils --without-parallel | ||||
|           # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel` | ||||
|           # so we must unlink GNU `parallel` first, and relink it afterwards | ||||
|           brew unlink parallel | ||||
|           brew install moreutils | ||||
|           brew link parallel --overwrite | ||||
|           brew install cmake | ||||
|           brew install expect | ||||
|  | ||||
| @ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults | ||||
|           export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 | ||||
|  | ||||
|           # This IAM user allows write access to S3 bucket for sccache | ||||
|           export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|           export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|           export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|           export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|  | ||||
|           export SCCACHE_BIN=${PWD}/sccache_bin | ||||
|           mkdir -p ${SCCACHE_BIN} | ||||
| @ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults | ||||
|             sccache --show-stats | ||||
|           fi | ||||
|  | ||||
| ############################################################################## | ||||
| ############################################################################## | ||||
| # Job specifications | ||||
| ############################################################################## | ||||
| ############################################################################## | ||||
|  | ||||
| version: 2 | ||||
| jobs: | ||||
|   pytorch_linux_trusty_py2_7_9_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py2_7_9_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py2_7_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py2_7_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_5_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_5_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc4_8_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc4_8_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc5_4_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc5_4_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc7_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_py3_6_gcc7_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_pynightly_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_trusty_pynightly_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_py3_clang5_asan_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_py3_clang5_asan_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|     resource_class: large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_cuda8_cudnn6_py3_build: | ||||
|   pytorch_linux_xenial_cuda8_cudnn7_py3_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|       BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_cuda8_cudnn6_py3_test: | ||||
|   pytorch_linux_xenial_cuda8_cudnn7_py3_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|     resource_class: gpu.medium | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: | ||||
|   pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|       MULTI_GPU: "1" | ||||
|     resource_class: gpu.large | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test: | ||||
|   pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|     resource_class: gpu.medium | ||||
|     <<: *pytorch_linux_test_defaults | ||||
|  | ||||
|   pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test: | ||||
|   pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|     resource_class: gpu.medium | ||||
| @ -517,7 +631,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_cudnn7_py2_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282" | ||||
|       PYTHON_VERSION: "2.7" | ||||
|       CUDA_VERSION: "9" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
| @ -525,7 +639,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_cudnn7_py2_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282" | ||||
|       PYTHON_VERSION: "2.7" | ||||
|       CUDA_VERSION: "9" | ||||
|     resource_class: gpu.medium | ||||
| @ -534,7 +648,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_cudnn7_py3_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "9" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
| @ -542,7 +656,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_cudnn7_py3_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "9" | ||||
|     resource_class: gpu.medium | ||||
| @ -551,7 +665,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "9.2" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
| @ -559,7 +673,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "9.2" | ||||
|     resource_class: gpu.medium | ||||
| @ -568,7 +682,7 @@ jobs: | ||||
|   pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "10" | ||||
|     <<: *pytorch_linux_build_defaults | ||||
| @ -576,7 +690,7 @@ jobs: | ||||
|   pytorch_short_perf_test_gpu: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-short-perf-test-gpu | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|       PYTHON_VERSION: "3.6" | ||||
|       CUDA_VERSION: "8" | ||||
|     resource_class: gpu.medium | ||||
| @ -597,8 +711,8 @@ jobs: | ||||
|  | ||||
|           docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env | ||||
|           # This IAM user allows write access to S3 bucket for perf test numbers | ||||
|           echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env | ||||
|           echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env | ||||
|           echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env | ||||
|           echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env | ||||
|           docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env | ||||
|  | ||||
|           export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' | ||||
| @ -607,7 +721,7 @@ jobs: | ||||
|   pytorch_doc_push: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: pytorch-doc-push | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282" | ||||
|     resource_class: large | ||||
|     machine: | ||||
|       image: default | ||||
| @ -615,72 +729,39 @@ jobs: | ||||
|     - run: | ||||
|         <<: *setup_ci_environment | ||||
|     - run: | ||||
|         name: Doc Push | ||||
|         <<: *install_doc_push_script | ||||
|     - run: | ||||
|         name: Doc Build and Push | ||||
|         no_output_timeout: "1h" | ||||
|         command: | | ||||
|           set -e | ||||
|           if [[ "${CIRCLE_BRANCH}" != "master" ]]; then | ||||
|             echo "Skipping doc push..." | ||||
|             exit 0 | ||||
|           fi | ||||
|           export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} | ||||
|           echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} | ||||
|           docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null | ||||
|           export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) | ||||
|  | ||||
|           cat >/home/circleci/project/doc_push_script.sh <<EOL | ||||
|           # =================== The following code will be executed inside Docker container =================== | ||||
|           git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site | ||||
|           pushd pytorch.github.io | ||||
|  | ||||
|           set -ex | ||||
|  | ||||
|           export LC_ALL=C | ||||
|           export PATH=/opt/conda/bin:$PATH | ||||
|  | ||||
|           rm -rf pytorch || true | ||||
|  | ||||
|           # Get all the documentation sources, put them in one place | ||||
|           # TODO: These clones can race | ||||
|           git clone https://github.com/pytorch/pytorch | ||||
|           pushd pytorch | ||||
|           git clone https://github.com/pytorch/vision | ||||
|           pushd vision | ||||
|           conda install -q pillow | ||||
|           time python setup.py install | ||||
|           popd | ||||
|           pushd docs | ||||
|           rm -rf source/torchvision | ||||
|           cp -r ../vision/docs/source source/torchvision | ||||
|  | ||||
|           # Build the docs | ||||
|           pip -q install -r requirements.txt || true | ||||
|           make html | ||||
|  | ||||
|           # Move them into the docs repo | ||||
|           popd | ||||
|           popd | ||||
|           git rm -rf docs/master || true | ||||
|           mv pytorch/docs/build/html docs/master | ||||
|           find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g' | ||||
|           git add docs/master || true | ||||
|           git status | ||||
|           git config user.email "soumith+bot@pytorch.org" | ||||
|           git config user.name "pytorchbot" | ||||
|           # If there aren't changes, don't make a commit; push is no-op | ||||
|           git commit -m "auto-generating sphinx docs" || true | ||||
|           git status | ||||
|           git push origin site | ||||
|  | ||||
|           popd | ||||
|           # =================== The above code will be executed inside Docker container =================== | ||||
|           EOL | ||||
|           chmod +x /home/circleci/project/doc_push_script.sh | ||||
|           docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh | ||||
|  | ||||
|           export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' | ||||
|           # master branch docs push | ||||
|           if [[ "${CIRCLE_BRANCH}" == "master" ]]; then | ||||
|             export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1' | ||||
|  | ||||
|           # stable release docs push. We keep an eternal PR open for merging | ||||
|           # v1.0.1 -> master; everytime v1.0.1 is updated the following is run. | ||||
|           elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then | ||||
|             export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1' | ||||
|  | ||||
|           # For open PRs: Do a dry_run of the docs build, don't push build | ||||
|           else | ||||
|             export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1' | ||||
|           fi | ||||
|           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts | ||||
|  | ||||
|           # Save the docs build so we can debug any problems | ||||
|           export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug | ||||
|           docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE} | ||||
|           docker push ${DEBUG_COMMIT_DOCKER_IMAGE} | ||||
|  | ||||
|   pytorch_macos_10_13_py3_build: | ||||
|     macos: | ||||
|       xcode: "9.0" | ||||
| @ -696,8 +777,11 @@ jobs: | ||||
|             set -e | ||||
|  | ||||
|             export IN_CIRCLECI=1 | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae | ||||
|             brew install moreutils --without-parallel | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel` | ||||
|             # so we must unlink GNU `parallel` first, and relink it afterwards | ||||
|             brew unlink parallel | ||||
|             brew install moreutils | ||||
|             brew link parallel --overwrite | ||||
|             brew install expect | ||||
|  | ||||
|             # Install sccache | ||||
| @ -706,8 +790,8 @@ jobs: | ||||
|  | ||||
|             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 | ||||
|             # This IAM user allows write access to S3 bucket for sccache | ||||
|             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|  | ||||
|             git submodule sync && git submodule update -q --init | ||||
|             chmod a+x .jenkins/pytorch/macos-build.sh | ||||
| @ -740,8 +824,11 @@ jobs: | ||||
|           command: | | ||||
|             set -e | ||||
|             export IN_CIRCLECI=1 | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae | ||||
|             brew install moreutils --without-parallel | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel` | ||||
|             # so we must unlink GNU `parallel` first, and relink it afterwards | ||||
|             brew unlink parallel | ||||
|             brew install moreutils | ||||
|             brew link parallel --overwrite | ||||
|             brew install expect | ||||
|  | ||||
|             cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project | ||||
| @ -765,8 +852,11 @@ jobs: | ||||
|  | ||||
|             export IN_CIRCLECI=1 | ||||
|  | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae | ||||
|             brew install moreutils --without-parallel | ||||
|             # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel` | ||||
|             # so we must unlink GNU `parallel` first, and relink it afterwards | ||||
|             brew unlink parallel | ||||
|             brew install moreutils | ||||
|             brew link parallel --overwrite | ||||
|             brew install expect | ||||
|  | ||||
|             # Install CUDA 9.2 | ||||
| @ -790,30 +880,13 @@ jobs: | ||||
|             sudo chmod +x /usr/local/bin/sccache | ||||
|             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 | ||||
|             # This IAM user allows write access to S3 bucket for sccache | ||||
|             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2} | ||||
|             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3} | ||||
|  | ||||
|             git submodule sync && git submodule update -q --init | ||||
|             chmod a+x .jenkins/pytorch/macos-build.sh | ||||
|             unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts | ||||
|  | ||||
|   caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230" | ||||
|       CUDA_VERSION: "8" | ||||
|       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04" | ||||
|     <<: *caffe2_linux_build_defaults | ||||
|  | ||||
|   caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230" | ||||
|       CUDA_VERSION: "8" | ||||
|       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04" | ||||
|     resource_class: gpu.medium | ||||
|     <<: *caffe2_linux_test_defaults | ||||
|  | ||||
|   caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build | ||||
| @ -896,11 +969,20 @@ jobs: | ||||
|   caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230" | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238" | ||||
|       CUDA_VERSION: "8" | ||||
|       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04" | ||||
|       BUILD_ONLY: "1" | ||||
|     <<: *caffe2_linux_build_defaults | ||||
|  | ||||
|   caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test | ||||
|       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238" | ||||
|       CUDA_VERSION: "8" | ||||
|       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04" | ||||
|     resource_class: gpu.medium | ||||
|     <<: *caffe2_linux_test_defaults | ||||
|  | ||||
|   caffe2_py2_gcc4_9_ubuntu14_04_build: | ||||
|     environment: | ||||
|       JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build | ||||
| @ -1008,25 +1090,25 @@ workflows: | ||||
|       - pytorch_linux_xenial_py3_clang5_asan_test: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_py3_clang5_asan_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn6_py3_test: | ||||
|       - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn7_py3_test: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_short_perf_test_gpu: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_doc_push: | ||||
|           requires: | ||||
|             - pytorch_linux_xenial_cuda8_cudnn6_py3_build | ||||
|             - pytorch_linux_xenial_cuda8_cudnn7_py3_build | ||||
|       - pytorch_linux_xenial_cuda9_cudnn7_py2_build | ||||
|       - pytorch_linux_xenial_cuda9_cudnn7_py2_test: | ||||
|           requires: | ||||
| @ -1047,10 +1129,6 @@ workflows: | ||||
|             - pytorch_macos_10_13_py3_build | ||||
|       - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build | ||||
|  | ||||
|       - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build | ||||
|       - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: | ||||
|           requires: | ||||
|             - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build | ||||
|       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build | ||||
|       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: | ||||
|           requires: | ||||
| @ -1072,6 +1150,9 @@ workflows: | ||||
|           requires: | ||||
|             - caffe2_onnx_py2_gcc5_ubuntu16_04_build | ||||
|       - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build | ||||
|       - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: | ||||
|           requires: | ||||
|             - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build | ||||
|       - caffe2_py2_clang3_8_ubuntu16_04_build | ||||
|       - caffe2_py2_clang3_9_ubuntu16_04_build | ||||
|       - caffe2_py2_clang7_ubuntu16_04_build | ||||
|  | ||||
| @ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") | ||||
|  | ||||
| if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then | ||||
|   CMAKE_ARGS+=("-DBLAS=MKL") | ||||
|   CMAKE_ARGS+=("-DUSE_MKLDNN=ON") | ||||
| fi | ||||
| if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then | ||||
|   CMAKE_ARGS+=("-DUSE_CUDA=ON") | ||||
|  | ||||
| @ -14,18 +14,8 @@ clang --version | ||||
| # symbolize=1: Gives us much better errors when things go wrong | ||||
| export ASAN_OPTIONS=detect_leaks=0:symbolize=1 | ||||
|  | ||||
| # FIXME: Remove the hardcoded "-pthread" option. | ||||
| # With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will | ||||
| # succeed because "pthread_create" is in libasan.so. However, libasan doesn't | ||||
| # have the full pthread implementation. Other advanced pthread functions doesn't | ||||
| # exist in libasan.so[2]. If we need some pthread advanced functions, we still | ||||
| # need to link the pthread library. | ||||
| # [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135 | ||||
| # [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems | ||||
| # | ||||
| # TODO: Make the ASAN flags a more unified env var | ||||
| CC="clang" CXX="clang++" LDSHARED="clang --shared" \ | ||||
|   CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \ | ||||
|   CXX_FLAGS="-pthread" \ | ||||
|   CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \ | ||||
|   NO_CUDA=1 USE_MKLDNN=0 \ | ||||
|   python setup.py install | ||||
|  | ||||
| @ -129,7 +129,7 @@ fi | ||||
| git add -f build/bin | ||||
|  | ||||
| # Test documentation build | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then | ||||
|   pushd docs | ||||
|   # TODO: Don't run this here | ||||
|   pip install -q -r requirements.txt || true | ||||
| @ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then | ||||
| fi | ||||
|  | ||||
| # Test standalone c10 build | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then | ||||
|   mkdir -p c10/build | ||||
|   pushd c10/build | ||||
|   cmake .. | ||||
|  | ||||
| @ -122,7 +122,7 @@ fi | ||||
| # Use conda cmake in some CI build. Conda cmake will be newer than our supported | ||||
| # min version 3.5, so we only do it in two builds that we know should use conda. | ||||
| if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then | ||||
|   if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \ | ||||
|   if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \ | ||||
|      [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then | ||||
|     if ! which conda; then | ||||
|       echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty" | ||||
|  | ||||
| @ -5,9 +5,9 @@ | ||||
| # in this file will report a failure (so you don't forget to | ||||
| # reenable the tests on merge ;) | ||||
|  | ||||
| pytorch-linux-xenial-cuda8-cudnn6-py3-build | ||||
| pytorch-linux-xenial-cuda8-cudnn6-py3-test | ||||
| pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test | ||||
| pytorch-linux-xenial-cuda8-cudnn7-py3-build | ||||
| pytorch-linux-xenial-cuda8-cudnn7-py3-test | ||||
| pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test | ||||
| pytorch-linux-xenial-cuda9-cudnn7-py2-build | ||||
| pytorch-linux-xenial-cuda9-cudnn7-py2-test | ||||
| pytorch-linux-xenial-cuda9-cudnn7-py3-build | ||||
|  | ||||
| @ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" ( | ||||
|     sccache --show-stats | ||||
|     sccache --zero-stats | ||||
|     rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch | ||||
|     for /f "delims=" %%i in ('where /R caffe2\proto *.py') do ( | ||||
|       IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" ( | ||||
|         del /S /Q %%i | ||||
|       ) | ||||
|     ) | ||||
|     copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe | ||||
|   ) | ||||
|  | ||||
|  | ||||
| @ -34,10 +34,4 @@ matrix: | ||||
|         script: cd docs/cpp/source && ./check-doxygen.sh | ||||
|       - env: CLANG_TIDY | ||||
|         python: "3.6" | ||||
|         addons: | ||||
|           apt: | ||||
|             sources: | ||||
|               - ubuntu-toolchain-r-test | ||||
|               - llvm-toolchain-trusty | ||||
|             packages: clang-tidy | ||||
|         script: tools/run-clang-tidy-in-ci.sh | ||||
|  | ||||
| @ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM) | ||||
| 	--generate-code arch=compute_50,code=sm_50 | ||||
| 	--generate-code arch=compute_60,code=sm_60 | ||||
| 	--generate-code arch=compute_70,code=sm_70) | ||||
|     elseif(${CUDA_VERSION_MAJOR} EQUAL "10") | ||||
|       SET(CUFFT_FAKELINK_OPTIONS | ||||
| 	--generate-code arch=compute_35,code=sm_35 | ||||
| 	--generate-code arch=compute_50,code=sm_50 | ||||
| 	--generate-code arch=compute_60,code=sm_60 | ||||
| 	--generate-code arch=compute_70,code=sm_70) | ||||
|     else() | ||||
|       MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}") | ||||
|     endif() | ||||
|  | ||||
| @ -2122,55 +2122,6 @@ | ||||
|     - arg: THTensor* self | ||||
|     - arg: THTensor* tensor | ||||
| ]] | ||||
| [[ | ||||
|   name: _th_tril | ||||
|   cname: tril | ||||
|   variants: | ||||
|     - function | ||||
|   return: argument 0 | ||||
|   arguments: | ||||
|     - arg: THTensor* result | ||||
|       output: True | ||||
|     - THTensor* self | ||||
|     - arg: long diagonal | ||||
|       default: 0 | ||||
| ]] | ||||
| [[ | ||||
|   name: _th_tril_ | ||||
|   cname: tril | ||||
|   variants: function | ||||
|   return: self | ||||
|   arguments: | ||||
|     - THTensor* self | ||||
|     - THTensor* self | ||||
|     - arg: long diagonal | ||||
|       default: 0 | ||||
| ]] | ||||
| [[ | ||||
|   name: _th_triu | ||||
|   cname: triu | ||||
|   variants: | ||||
|     - function | ||||
|   return: argument 0 | ||||
|   arguments: | ||||
|     - arg: THTensor* result | ||||
|       output: True | ||||
|     - THTensor* self | ||||
|     - arg: long diagonal | ||||
|       default: 0 | ||||
| ]] | ||||
| [[ | ||||
|   name: _th_triu_ | ||||
|   cname: triu | ||||
|   variants: | ||||
|     - function | ||||
|   return: self | ||||
|   arguments: | ||||
|     - THTensor* self | ||||
|     - THTensor* self | ||||
|     - arg: long diagonal | ||||
|       default: 0 | ||||
| ]] | ||||
| [[ | ||||
|   name: _th_cross | ||||
|   cname: cross | ||||
|  | ||||
| @ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) { | ||||
|     reduce_dims.push_back(i); | ||||
|   } | ||||
|   for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) { | ||||
|     if (shape[i - leading_dims] == 1 && sizes[i] > 1) { | ||||
|     if (shape[i - leading_dims] == 1 && sizes[i] != 1) { | ||||
|       reduce_dims.push_back(i); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| @ -81,6 +81,39 @@ inline void parallel_for( | ||||
| #endif | ||||
| } | ||||
|  | ||||
| /* | ||||
| parallel_reduce | ||||
|  | ||||
| begin: index at which to start applying reduction | ||||
|  | ||||
| end: index at which to stop applying reduction | ||||
|  | ||||
| grain_size: number of elements per chunk. impacts number of elements in | ||||
| intermediate results tensor and degree of parallelization. | ||||
|  | ||||
| ident: identity for binary combination function sf. sf(ident, x) needs to return | ||||
| x. | ||||
|  | ||||
| f: function for reduction over a chunk. f needs to be of signature scalar_t | ||||
| f(int64_t partial_begin, int64_t partial_end, scalar_t identity) | ||||
|  | ||||
| sf: function to combine two partial results. sf needs to be of signature | ||||
| scalar_t sf(scalar_t x, scalar_t y) | ||||
|  | ||||
| For example, you might have a tensor of 10000 entries and want to sum together | ||||
| all the elements. Parallel_reduce with a grain_size of 2500 will then allocate | ||||
| an intermediate result tensor with 4 elements. Then it will execute the function | ||||
| "f" you provide and pass the beginning and end index of these chunks, so | ||||
| 0-2499, 2500-4999, etc. and the combination identity. It will then write out | ||||
| the result from each of these chunks into the intermediate result tensor. After | ||||
| that it'll reduce the partial results from each chunk into a single number using | ||||
| the combination function sf and the identity ident. For a total summation this | ||||
| would be "+" and 0 respectively. This is similar to tbb's approach [1], where | ||||
| you need to provide a function to accumulate a subrange, a function to combine | ||||
| two partial results and an identity. | ||||
|  | ||||
| [1] https://software.intel.com/en-us/node/506154 | ||||
| */ | ||||
| template <class scalar_t, class F, class SF> | ||||
| inline scalar_t parallel_reduce( | ||||
|     const int64_t begin, | ||||
|  | ||||
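The block comment above documents the new `parallel_reduce` helper and its parameters (begin, end, grain_size, ident, f, sf). As a minimal usage sketch, and assuming the helper is exposed as `at::parallel_reduce` in `ATen/Parallel.h` (the header path and namespace are assumptions; they are not shown in this hunk), summing a buffer of 10000 floats with a grain size of 2500 might look like:

```cpp
#include <ATen/Parallel.h>  // assumed location of parallel_reduce

#include <cstdint>
#include <vector>

// Hedged sketch of the reduction described in the comment above:
// with 10000 elements and grain_size 2500, up to four chunks
// ([0,2500), [2500,5000), ...) are reduced independently by `f`,
// and the partial sums are then combined pairwise by `sf`.
float parallel_sum(const std::vector<float>& data) {
  return at::parallel_reduce(
      /*begin=*/static_cast<int64_t>(0),
      /*end=*/static_cast<int64_t>(data.size()),
      /*grain_size=*/static_cast<int64_t>(2500),
      /*ident=*/0.0f,
      // f: reduce one chunk, starting from the identity value.
      [&](int64_t chunk_begin, int64_t chunk_end, float partial) {
        for (int64_t i = chunk_begin; i < chunk_end; ++i) {
          partial += data[i];
        }
        return partial;
      },
      // sf: combine two partial results; for a total sum this is "+" with identity 0.
      [](float x, float y) { return x + y; });
}
```

Here `f` matches the chunk-reduction signature given in the comment and `sf` is the pairwise combiner; for a total summation the identity is 0, exactly as the comment describes.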
| @ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) { | ||||
|  | ||||
| void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) { | ||||
|   AT_CHECK( | ||||
|     t.type().backend() == backend, | ||||
|     !t.defined() || t.type().backend() == backend, | ||||
|     "Expected tensor to have ", toString(backend), | ||||
|     " Backend, but got tensor with ", toString(t.type().backend()), " Backend ", | ||||
|     "(while checking arguments for ", c, ")"); | ||||
|  | ||||
| @ -52,14 +52,11 @@ namespace c10 { | ||||
|   _(prim, TupleSlice)              \ | ||||
|   _(prim, ListConstruct)           \ | ||||
|   _(prim, ListUnpack)              \ | ||||
|   _(prim, BoolToTensor)            \ | ||||
|   _(prim, NumToTensor)             \ | ||||
|   _(prim, TensorToNum)             \ | ||||
|   _(prim, ImplicitTensorToNum)     \ | ||||
|   _(prim, TensorToBool)            \ | ||||
|   _(prim, IntToFloat)              \ | ||||
|   _(prim, FloatToInt)              \ | ||||
|   _(prim, StringToFloat)           \ | ||||
|   _(prim, Bool)                    \ | ||||
|   _(prim, Int)                     \ | ||||
|   _(prim, Float)                   \ | ||||
|   _(prim, device)                  \ | ||||
|   _(prim, dtype)                   \ | ||||
|   _(prim, shape)                   \ | ||||
| @ -139,7 +136,8 @@ namespace c10 { | ||||
|   _(attr, name)                    \ | ||||
|   _(attr, a)                       \ | ||||
|   _(attr, b)                       \ | ||||
|   _(attr, beg) | ||||
|   _(attr, beg)                     \ | ||||
|   _(attr, idx) | ||||
| #else | ||||
| #define FORALL_NS_SYMBOLS(_) \ | ||||
|   _(namespaces, prim)              \ | ||||
|  | ||||
| @ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu | ||||
|     ss << "Future[" << getElementType()->python_str() << "]"; | ||||
|     return ss.str(); | ||||
|   } | ||||
|   TypePtr createWithContained(std::vector<TypePtr> contained_types) const override { | ||||
|     return create(contained_types.at(0)); | ||||
|   } | ||||
| private: | ||||
|   FutureType(TypePtr elem) : SingleElementType(elem) {} | ||||
| }; | ||||
| @ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) { | ||||
| } | ||||
|  | ||||
| inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) { | ||||
|   AT_ASSERT(typ->isSubtypeOf(NumberType::get())); | ||||
|   if (typ->isSubtypeOf(IntType::get())) { | ||||
|     return CompleteTensorType::create(at::kLong, at::kCPU, {}); | ||||
|   } else if (typ->isSubtypeOf(FloatType::get())) { | ||||
| @ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp | ||||
| template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); } | ||||
| template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); } | ||||
|  | ||||
| CAFFE2_API TypePtr inferTypeFrom(const IValue& value); | ||||
| CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value); | ||||
|  | ||||
| using TypeEnv = std::unordered_map<std::string, TypePtr>; | ||||
| struct MatchTypeReturn { | ||||
|  | ||||
| @ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() { | ||||
|   return value; | ||||
| } | ||||
|  | ||||
| TypePtr inferTypeFrom(const IValue& value) { | ||||
| // why incomplete? You cannot completely recover a type from | ||||
| // an IValue, List[List[int]] and List[List[Tensor]] will both | ||||
| // become ivalue.isGenericList() and cannot be recovered. | ||||
| // The only appropriate place to use this is where you know that | ||||
| // you are only dealing with a subset of objects where you can recover | ||||
| // the type, like in the tracer. | ||||
| TypePtr incompleteInferTypeFrom(const IValue& value) { | ||||
|   if (value.isTensor()) { | ||||
|     return CompleteTensorType::create(value.toTensor()); | ||||
|   } else if (value.isDouble()) { | ||||
| @ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) { | ||||
|   } else if (value.isDoubleList()) { | ||||
|     return ListType::ofFloats(); | ||||
|   } else if (value.isTuple()) { | ||||
|     return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom)); | ||||
|     return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom)); | ||||
|   } else if (value.isDevice()) { | ||||
|     return DeviceObjType::get(); | ||||
|   } | ||||
|   AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom"); | ||||
|   AT_ERROR("Type cannot be accurately recovered from this IValue."); | ||||
| } | ||||
|  | ||||
| c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) { | ||||
|  | ||||
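The comment on incompleteInferTypeFrom above describes a real loss of information at runtime. Below is a standalone sketch of the same situation, using a hypothetical tag enum rather than the real c10::IValue API: specialized list tags keep their element type, but anything erased to a generic list cannot be told apart.

#include <iostream>
#include <string>

// Hypothetical runtime tags, loosely mirroring isIntList()/isGenericList().
enum class Tag { Int, Tensor, IntList, GenericList };

std::string infer(Tag tag) {
  switch (tag) {
    case Tag::Int:         return "int";
    case Tag::Tensor:      return "Tensor";
    case Tag::IntList:     return "List[int]";   // specialized tag: recoverable
    case Tag::GenericList: return "List[?]";     // element type was erased
  }
  return "?";
}

int main() {
  // Both of these started life as differently-typed nested lists, but at
  // runtime they carry the same GenericList tag, so inference is "incomplete".
  Tag was_list_of_list_of_int    = Tag::GenericList;
  Tag was_list_of_list_of_tensor = Tag::GenericList;
  std::cout << infer(was_list_of_list_of_int) << " "
            << infer(was_list_of_list_of_tensor) << "\n";  // List[?] List[?]
}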
| @ -10,10 +10,10 @@ inline scalar_t vec_reduce_all( | ||||
|     vec256::Vec256<scalar_t> acc_vec, | ||||
|     int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   scalar_t acc_arr[Vec::size]; | ||||
|   scalar_t acc_arr[Vec::size()]; | ||||
|   acc_vec.store(acc_arr); | ||||
|   for (int64_t i = 1; i < size; i++) { | ||||
|     scalar_t acc_arr_next[Vec::size]; | ||||
|     scalar_t acc_arr_next[Vec::size()]; | ||||
|     acc_arr_next[0] = acc_arr[i]; | ||||
|     Vec acc_vec_next = Vec::loadu(acc_arr_next); | ||||
|     acc_vec = vec_fun(acc_vec, acc_vec_next); | ||||
| @ -25,11 +25,11 @@ inline scalar_t vec_reduce_all( | ||||
| template <typename scalar_t, typename Op> | ||||
| inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   if (size < Vec::size) | ||||
|   if (size < Vec::size()) | ||||
|     return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); | ||||
|   int64_t d = Vec::size; | ||||
|   int64_t d = Vec::size(); | ||||
|   Vec acc_vec = Vec::loadu(data); | ||||
|   for (; d < size - (size % Vec::size); d += Vec::size) { | ||||
|   for (; d < size - (size % Vec::size()); d += Vec::size()) { | ||||
|     Vec data_vec = Vec::loadu(data + d); | ||||
|     acc_vec = vec_fun(acc_vec, data_vec); | ||||
|   } | ||||
| @ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) { | ||||
|     Vec data_vec = Vec::loadu(data + d, size - d); | ||||
|     acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); | ||||
|   } | ||||
|   return vec_reduce_all(vec_fun, acc_vec, Vec::size); | ||||
|   return vec_reduce_all(vec_fun, acc_vec, Vec::size()); | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename MapOp, typename ReduceOp> | ||||
| @ -47,11 +47,11 @@ inline scalar_t map_reduce_all( | ||||
|     scalar_t* data, | ||||
|     int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   if (size < Vec::size) | ||||
|   if (size < Vec::size()) | ||||
|     return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size); | ||||
|   int64_t d = Vec::size; | ||||
|   int64_t d = Vec::size(); | ||||
|   Vec acc_vec = map_fun(Vec::loadu(data)); | ||||
|   for (; d < size - (size % Vec::size); d += Vec::size) { | ||||
|   for (; d < size - (size % Vec::size()); d += Vec::size()) { | ||||
|     Vec data_vec = Vec::loadu(data + d); | ||||
|     data_vec = map_fun(data_vec); | ||||
|     acc_vec = red_fun(acc_vec, data_vec); | ||||
| @ -61,7 +61,7 @@ inline scalar_t map_reduce_all( | ||||
|     data_vec = map_fun(data_vec); | ||||
|     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); | ||||
|   } | ||||
|   return vec_reduce_all(red_fun, acc_vec, Vec::size); | ||||
|   return vec_reduce_all(red_fun, acc_vec, Vec::size()); | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename MapOp, typename ReduceOp> | ||||
| @ -72,15 +72,15 @@ inline scalar_t map2_reduce_all( | ||||
|     const scalar_t* data2, | ||||
|     int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   if (size < Vec::size) { | ||||
|   if (size < Vec::size()) { | ||||
|     Vec data_vec = Vec::loadu(data, size); | ||||
|     Vec data2_vec = Vec::loadu(data2, size); | ||||
|     data_vec = map_fun(data_vec, data2_vec); | ||||
|     return vec_reduce_all(red_fun, data_vec, size); | ||||
|   } | ||||
|   int64_t d = Vec::size; | ||||
|   int64_t d = Vec::size(); | ||||
|   Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2)); | ||||
|   for (; d < size - (size % Vec::size); d += Vec::size) { | ||||
|   for (; d < size - (size % Vec::size()); d += Vec::size()) { | ||||
|     Vec data_vec = Vec::loadu(data + d); | ||||
|     Vec data2_vec = Vec::loadu(data2 + d); | ||||
|     data_vec = map_fun(data_vec, data2_vec); | ||||
| @ -92,7 +92,7 @@ inline scalar_t map2_reduce_all( | ||||
|     data_vec = map_fun(data_vec, data2_vec); | ||||
|     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); | ||||
|   } | ||||
|   return vec_reduce_all(red_fun, acc_vec, Vec::size); | ||||
|   return vec_reduce_all(red_fun, acc_vec, Vec::size()); | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename Op> | ||||
| @ -103,7 +103,7 @@ inline void map( | ||||
|     int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   int64_t d = 0; | ||||
|   for (; d < size - (size % Vec::size); d += Vec::size) { | ||||
|   for (; d < size - (size % Vec::size()); d += Vec::size()) { | ||||
|     Vec output_vec = vec_fun(Vec::loadu(input_data + d)); | ||||
|     output_vec.store(output_data + d); | ||||
|   } | ||||
| @ -122,7 +122,7 @@ inline void map2( | ||||
|     int64_t size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   int64_t d = 0; | ||||
|   for (; d < size - (size % Vec::size); d += Vec::size) { | ||||
|   for (; d < size - (size % Vec::size()); d += Vec::size()) { | ||||
|     Vec data_vec = Vec::loadu(input_data + d); | ||||
|     Vec data_vec2 = Vec::loadu(input_data2 + d); | ||||
|     Vec output_vec = vec_fun(data_vec, data_vec2); | ||||
|  | ||||
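Every loop touched above follows the same shape: a full-width body over size - (size % Vec::size()) elements, then a partial load for the tail. A scalar sketch of that pattern (plain doubles and a fixed kWidth stand in for at::vec256::Vec256 and Vec::size()):

#include <cstdint>

constexpr int64_t kWidth = 8;  // stand-in for Vec::size()

double reduce_sum(const double* data, int64_t size) {
  if (size < kWidth) {              // too small for even one full vector
    double acc = 0;
    for (int64_t i = 0; i < size; i++) acc += data[i];
    return acc;
  }
  double acc = 0;
  int64_t d = 0;
  // Full-width body: covers the first size - (size % kWidth) elements.
  for (; d < size - (size % kWidth); d += kWidth) {
    for (int64_t j = 0; j < kWidth; j++) acc += data[d + j];
  }
  // Ragged tail: the remaining size - d (< kWidth) elements, which the real
  // code handles with a partial Vec::loadu(data + d, size - d).
  for (; d < size; d++) acc += data[d];
  return acc;
}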
| @ -15,14 +15,24 @@ | ||||
|  | ||||
| namespace at { | ||||
| namespace vec256 { | ||||
|  | ||||
| // Note [Acceptable use of anonymous namespace in header] | ||||
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| // Yes, you saw right: this is an anonymous namespace in a header.  This header, | ||||
| // and all of its subheaders, REQUIRE their code to be entirely inlined into | ||||
| // the compilation unit that uses them.  It's important that these functions have | ||||
| // internal linkage so that kernels for different architectures don't get | ||||
| // combined during linking. It's sufficient to label functions "static", but | ||||
| // class methods must be in an unnamed namespace to have internal linkage (since | ||||
| // static means something different in the context of classes). | ||||
| namespace { | ||||
|  | ||||
| template <typename T> | ||||
| std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) { | ||||
|   T buf[Vec256<T>::size]; | ||||
|   T buf[Vec256<T>::size()]; | ||||
|   vec.store(buf); | ||||
|   stream << "vec["; | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     if (i != 0) { | ||||
|       stream << ", "; | ||||
|     } | ||||
|  | ||||
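The note on anonymous namespaces above turns on a linkage detail: "static" gives internal linkage to a free function but means something different inside a class, so whole classes go into an unnamed namespace instead. A small sketch with hypothetical names:

// Free functions can get internal linkage with "static" ...
static int free_helper(int x) { return x + 1; }

// ... but member functions cannot, so the class is placed in an anonymous
// namespace, giving every member internal linkage in this translation unit.
namespace {
struct PerArchKernel {
  int apply(int x) const { return free_helper(x); }
};
}  // namespace

int run_kernel(int x) { return PerArchKernel{}.apply(x); }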
| @ -20,6 +20,7 @@ | ||||
|  | ||||
| namespace at { | ||||
| namespace vec256 { | ||||
| // See Note [Acceptable use of anonymous namespace in header] | ||||
| namespace { | ||||
|  | ||||
| template<size_t n> struct int_of_size; | ||||
| @ -45,15 +46,49 @@ struct Vec256 { | ||||
| private: | ||||
|   T values[32 / sizeof(T)] = {0}; | ||||
| public: | ||||
|   static constexpr int size = 32 / sizeof(T); | ||||
|   // Note [constexpr static function to avoid odr-usage compiler bug] | ||||
|   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|   // Why, you might ask, is size defined to be a static constexpr function, | ||||
|   // rather than a more ordinary 'static constexpr int size;' variable? | ||||
|   // The problem lies within ODR rules for static constexpr members versus | ||||
|   // static constexpr functions.  First, recall that this class (along with all | ||||
|   // of its derivations) live in an anonymous namespace: they are intended to be | ||||
|   // *completely* inlined at their use-sites, because we need to compile it | ||||
|   // multiple times for different instruction sets. | ||||
|   // | ||||
|   // Because of this constraint, we CANNOT provide a single definition for | ||||
|   // any static members in this class; since we want to compile the class | ||||
|   // multiple times, there wouldn't actually be any good place to put the | ||||
|   // definition.  Now here is the problem: if we ODR-use a static constexpr | ||||
|   // member, we are *obligated* to provide a definition.  Without the | ||||
|   // definition, you get a compile error like: | ||||
|   // | ||||
|   //    relocation R_X86_64_PC32 against undefined symbol | ||||
|   //    `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making | ||||
|   //    a shared object; recompile with -fPIC | ||||
|   // | ||||
|   // If this were C++17, we could replace a static constexpr variable with | ||||
|   // an inline variable, which doesn't require a separate definition. But we are | ||||
|   // not C++17.  So the next best thing is to replace the member with a static | ||||
|   // constexpr (and therefore implicitly inline) function, which does not require | ||||
|   // a separate definition either. | ||||
|   // | ||||
|   // Also, technically according to the C++ standard, we don't have to define | ||||
|   // a constexpr variable if we never odr-use it.  But it seems that some | ||||
|   // versions of GCC/Clang have buggy determinations on whether or not an | ||||
|   // identifier is odr-used or not, and in any case it's hard to tell if | ||||
|   // a variable is odr-used or not.  So best to just cut the problem at the root. | ||||
|   static constexpr int size() { | ||||
|     return 32 / sizeof(T); | ||||
|   } | ||||
|   Vec256() {} | ||||
|   Vec256(T val) { | ||||
|     for (int i = 0; i != size; i++) { | ||||
|     for (int i = 0; i != size(); i++) { | ||||
|       values[i] = val; | ||||
|     } | ||||
|   } | ||||
|   template<typename... Args, | ||||
|            typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>> | ||||
|            typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>> | ||||
|   Vec256(Args... vals) { | ||||
|     values = { vals... }; | ||||
|   } | ||||
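A minimal reproduction of the distinction drawn in the note above, under pre-C++17 rules and with hypothetical types: a static constexpr data member still needs an out-of-class definition if it is ever odr-used (for example, bound to a const reference), while a static constexpr member function is implicitly inline and never does.

#include <algorithm>

struct WithMember {
  static constexpr int size = 4;   // odr-using this (e.g. passing it to
                                   // std::min, which takes const references)
                                   // requires "constexpr int WithMember::size;"
                                   // in exactly one translation unit
};

struct WithFunction {
  static constexpr int size() { return 4; }  // inline; no definition needed
};

int clamp_to_width(int n) {
  // The call below yields a prvalue, so nothing is odr-used and no separate
  // definition is required -- this is the form Vec256 adopts.
  return std::min(n, WithFunction::size());
}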
| @ -61,7 +96,7 @@ public: | ||||
|   static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) { | ||||
|     int64_t mask = mask_; | ||||
|     Vec256 vec; | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       if (mask & 0x01) { | ||||
|         vec[i] = b[i]; | ||||
|       } else { | ||||
| @ -74,9 +109,9 @@ public: | ||||
|   static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b, | ||||
|                           const Vec256<T>& mask) { | ||||
|     Vec256 vec; | ||||
|     int_same_size_t<T> buffer[size]; | ||||
|     int_same_size_t<T> buffer[size()]; | ||||
|     mask.store(buffer); | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       if (buffer[i] & 0x01) | ||||
|        { | ||||
|         vec[i] = b[i]; | ||||
| @ -88,14 +123,14 @@ public: | ||||
|   } | ||||
|   static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) { | ||||
|     Vec256 vec; | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       vec.values[i] = base + i * step; | ||||
|     } | ||||
|     return vec; | ||||
|   } | ||||
|   static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) { | ||||
|   static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) { | ||||
|     Vec256 vec; | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       if (i < count) { | ||||
|         vec[i] = b[i]; | ||||
|       } else { | ||||
| @ -114,7 +149,7 @@ public: | ||||
|     std::memcpy(vec.values, ptr, count * sizeof(T)); | ||||
|     return vec; | ||||
|   } | ||||
|   void store(void* ptr, int count = size) const { | ||||
|   void store(void* ptr, int count = size()) const { | ||||
|     std::memcpy(ptr, values, count * sizeof(T)); | ||||
|   } | ||||
|   const T& operator[](int idx) const { | ||||
| @ -125,14 +160,14 @@ public: | ||||
|   } | ||||
|   Vec256<T> map(T (*f)(T)) const { | ||||
|     Vec256<T> ret; | ||||
|     for (int64_t i = 0; i != size; i++) { | ||||
|     for (int64_t i = 0; i != size(); i++) { | ||||
|       ret[i] = f(values[i]); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|   Vec256<T> abs() const { | ||||
|     Vec256<T> ret; | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       ret[i] = values[i] < 0 ? -values[i] : values[i]; | ||||
|     } | ||||
|     return ret; | ||||
| @ -214,7 +249,7 @@ public: | ||||
|   } | ||||
|   Vec256<T> pow(const Vec256<T> &exp) const { | ||||
|     Vec256<T> ret; | ||||
|     for (int64_t i = 0; i < size; i++) { | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       ret[i] = std::pow(values[i], exp[i]); | ||||
|     } | ||||
|     return ret; | ||||
| @ -222,7 +257,7 @@ public: | ||||
| #define DEFINE_COMP(binary_pred)                                              \ | ||||
|   Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \ | ||||
|     Vec256<T> vec;                                                            \ | ||||
|     for (int64_t i = 0; i != size; i++) {                                     \ | ||||
|     for (int64_t i = 0; i != size(); i++) {                                     \ | ||||
|       if (values[i] binary_pred other.values[i]) {                            \ | ||||
|         std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \ | ||||
|       } else {                                                                \ | ||||
| @ -242,7 +277,7 @@ public: | ||||
|  | ||||
| template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = a[i] + b[i]; | ||||
|   } | ||||
|   return c; | ||||
| @ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T | ||||
|  | ||||
| template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = a[i] - b[i]; | ||||
|   } | ||||
|   return c; | ||||
| @ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T | ||||
|  | ||||
| template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = a[i] * b[i]; | ||||
|   } | ||||
|   return c; | ||||
| @ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T | ||||
|  | ||||
| template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = a[i] / b[i]; | ||||
|   } | ||||
|   return c; | ||||
| @ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T | ||||
| // either input is a NaN. | ||||
| template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = (a[i] > b[i]) ? a[i] : b[i]; | ||||
|     if (std::is_floating_point<T>::value && std::isnan(a[i])) { | ||||
|       // If either input is NaN, propagate a NaN. | ||||
| @ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) { | ||||
| // either input is a NaN. | ||||
| template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) { | ||||
|   Vec256<T> c = Vec256<T>(); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     c[i] = (a[i] < b[i]) ? a[i] : b[i]; | ||||
|     if (std::is_floating_point<T>::value && std::isnan(a[i])) { | ||||
|       // If either input is NaN, propagate a NaN. | ||||
| @ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) { | ||||
| template <class T>                                                          \ | ||||
| Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) {      \ | ||||
|   using iT = int_same_size_t<T>;                                            \ | ||||
|   iT buffer[Vec256<T>::size];                                               \ | ||||
|   for (int64_t i = 0; i != Vec256<T>::size; i++) {                          \ | ||||
|   iT buffer[Vec256<T>::size()];                                               \ | ||||
|   for (int64_t i = 0; i != Vec256<T>::size(); i++) {                          \ | ||||
|     auto a_val = a[i];                                                      \ | ||||
|     auto b_val = b[i];                                                      \ | ||||
|     iT *i_a_ptr = reinterpret_cast<iT*>(&a_val);                            \ | ||||
| @ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) { | ||||
| template <int64_t scale = 1, typename T = void> | ||||
| c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>> | ||||
| inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) { | ||||
|   static constexpr int size = Vec256<T>::size; | ||||
|   static constexpr int size = Vec256<T>::size(); | ||||
|   int_same_size_t<T> index_arr[size]; | ||||
|   vindex.store(static_cast<void*>(index_arr)); | ||||
|   T buffer[size]; | ||||
| @ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void> | ||||
| c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>> | ||||
| inline mask_gather(const Vec256<T>& src, T const* base_addr, | ||||
|                    const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) { | ||||
|   static constexpr int size = Vec256<T>::size; | ||||
|   static constexpr int size = Vec256<T>::size(); | ||||
|   T src_arr[size]; | ||||
|   int_same_size_t<T> mask_arr[size];  // use int type so we can logical and | ||||
|   int_same_size_t<T> index_arr[size]; | ||||
| @ -392,7 +427,7 @@ namespace { | ||||
|   template<typename dst_t, typename src_t> | ||||
|   struct CastImpl { | ||||
|     static inline Vec256<dst_t> apply(const Vec256<src_t>& src) { | ||||
|       src_t src_arr[Vec256<src_t>::size]; | ||||
|       src_t src_arr[Vec256<src_t>::size()]; | ||||
|       src.store(static_cast<void*>(src_arr)); | ||||
|       return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr)); | ||||
|     } | ||||
| @ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) { | ||||
|  | ||||
| template <typename T> | ||||
| inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) { | ||||
|   static constexpr int size = Vec256<T>::size; | ||||
|   static constexpr int size = Vec256<T>::size(); | ||||
|   T src_arr[size]; | ||||
|   src.store(static_cast<void*>(src_arr)); | ||||
|   int_same_size_t<T> buffer[size]; | ||||
| @ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s | ||||
| //       returns:            Vec256<float>   = {a0, a1, a2, a3, a4, a5, a6, a7} | ||||
| //                           Vec256<float>   = {b0, b1, b2, b3, b4, b5, b6, b7} | ||||
| template <typename T> | ||||
| inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>> | ||||
| inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>> | ||||
| deinterleave2(const Vec256<T>& a, const Vec256<T>& b) { | ||||
|   static constexpr int size = Vec256<T>::size; | ||||
|   static constexpr int size = Vec256<T>::size(); | ||||
|   static constexpr int half_size = size / 2; | ||||
|   T a_arr[size]; | ||||
|   T b_arr[size]; | ||||
| @ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) { | ||||
| //       returns:            Vec256<float>   = {a0, b0, a1, b1, a2, b2, a3, b3} | ||||
| //                           Vec256<float>   = {a4, b4, a5, b5, a6, b6, a7, b7} | ||||
| template <typename T> | ||||
| inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>> | ||||
| inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>> | ||||
| interleave2(const Vec256<T>& a, const Vec256<T>& b) { | ||||
|   static constexpr int size = Vec256<T>::size; | ||||
|   static constexpr int size = Vec256<T>::size(); | ||||
|   static constexpr int half_size = size / 2; | ||||
|   T a_arr[size]; | ||||
|   T b_arr[size]; | ||||
| @ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) { | ||||
|  | ||||
| template <typename src_T, typename dst_T> | ||||
| void convert(const src_T *src, dst_T *dst, int64_t n) { | ||||
| #pragma unroll | ||||
| #ifndef _MSC_VER   | ||||
| # pragma unroll   | ||||
| #endif | ||||
|   for (int64_t i = 0; i < n; i++) { | ||||
|     *dst = static_cast<dst_T>( | ||||
|         static_cast<at::native::inter_copy_type_t<dst_T>>(*src)); | ||||
|  | ||||
| @ -8,6 +8,7 @@ | ||||
|  | ||||
| namespace at { | ||||
| namespace vec256 { | ||||
| // See Note [Acceptable use of anonymous namespace in header] | ||||
| namespace { | ||||
|  | ||||
| #if defined(__AVX__) && !defined(_MSC_VER) | ||||
| @ -16,7 +17,9 @@ template <> class Vec256<double> { | ||||
| private: | ||||
|   __m256d values; | ||||
| public: | ||||
|   static constexpr int size = 4; | ||||
|   static constexpr int size() { | ||||
|     return 4; | ||||
|   } | ||||
|   Vec256() {} | ||||
|   Vec256(__m256d v) : values(v) {} | ||||
|   Vec256(double val) { | ||||
| @ -40,7 +43,7 @@ public: | ||||
|     return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step); | ||||
|   } | ||||
|   static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b, | ||||
|                             int64_t count = size) { | ||||
|                             int64_t count = size()) { | ||||
|     switch (count) { | ||||
|       case 0: | ||||
|         return a; | ||||
| @ -53,22 +56,22 @@ public: | ||||
|     } | ||||
|     return b; | ||||
|   } | ||||
|   static Vec256<double> loadu(const void* ptr, int64_t count = size) { | ||||
|     if (count == size) | ||||
|   static Vec256<double> loadu(const void* ptr, int64_t count = size()) { | ||||
|     if (count == size()) | ||||
|       return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr)); | ||||
|  | ||||
|     __at_align32__ double tmp_values[size]; | ||||
|     __at_align32__ double tmp_values[size()]; | ||||
|     std::memcpy( | ||||
|         tmp_values, | ||||
|         reinterpret_cast<const double*>(ptr), | ||||
|         count * sizeof(double)); | ||||
|     return _mm256_load_pd(tmp_values); | ||||
|   } | ||||
|   void store(void* ptr, int count = size) const { | ||||
|     if (count == size) { | ||||
|   void store(void* ptr, int count = size()) const { | ||||
|     if (count == size()) { | ||||
|       _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values); | ||||
|     } else if (count > 0) { | ||||
|       double tmp_values[size]; | ||||
|       double tmp_values[size()]; | ||||
|       _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values); | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(double)); | ||||
|     } | ||||
| @ -252,7 +255,7 @@ template <> | ||||
| void convert(const double* src, double* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #pragma unroll | ||||
|   for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) { | ||||
|   for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) { | ||||
|     _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i)); | ||||
|   } | ||||
| #pragma unroll | ||||
|  | ||||
| @ -8,6 +8,7 @@ | ||||
|  | ||||
| namespace at { | ||||
| namespace vec256 { | ||||
| // See Note [Acceptable use of anonymous namespace in header] | ||||
| namespace { | ||||
|  | ||||
| #if defined(__AVX__) && !defined(_MSC_VER) | ||||
| @ -16,7 +17,9 @@ template <> class Vec256<float> { | ||||
| private: | ||||
|   __m256 values; | ||||
| public: | ||||
|   static constexpr int size = 8; | ||||
|   static constexpr int size() { | ||||
|     return 8; | ||||
|   } | ||||
|   Vec256() {} | ||||
|   Vec256(__m256 v) : values(v) {} | ||||
|   Vec256(float val) { | ||||
| @ -43,7 +46,7 @@ public: | ||||
|       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); | ||||
|   } | ||||
|   static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b, | ||||
|                            int64_t count = size) { | ||||
|                            int64_t count = size()) { | ||||
|     switch (count) { | ||||
|       case 0: | ||||
|         return a; | ||||
| @ -64,19 +67,19 @@ public: | ||||
|     } | ||||
|     return b; | ||||
|   } | ||||
|   static Vec256<float> loadu(const void* ptr, int64_t count = size) { | ||||
|     if (count == size) | ||||
|   static Vec256<float> loadu(const void* ptr, int64_t count = size()) { | ||||
|     if (count == size()) | ||||
|       return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr)); | ||||
|     __at_align32__ float tmp_values[size]; | ||||
|     __at_align32__ float tmp_values[size()]; | ||||
|     std::memcpy( | ||||
|         tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float)); | ||||
|     return _mm256_loadu_ps(tmp_values); | ||||
|   } | ||||
|   void store(void* ptr, int64_t count = size) const { | ||||
|     if (count == size) { | ||||
|   void store(void* ptr, int64_t count = size()) const { | ||||
|     if (count == size()) { | ||||
|       _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values); | ||||
|     } else if (count > 0) { | ||||
|       float tmp_values[size]; | ||||
|       float tmp_values[size()]; | ||||
|       _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values); | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(float)); | ||||
|     } | ||||
| @ -260,7 +263,7 @@ template <> | ||||
| void convert(const float* src, float* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #pragma unroll | ||||
|   for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) { | ||||
|   for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) { | ||||
|     _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i)); | ||||
|   } | ||||
| #pragma unroll | ||||
|  | ||||
| @ -12,6 +12,11 @@ namespace { | ||||
| struct Vec256i { | ||||
| protected: | ||||
|   __m256i values; | ||||
|  | ||||
|   static inline __m256i invert(const __m256i& v) { | ||||
|     const auto ones = _mm256_set1_epi64x(-1); | ||||
|     return _mm256_xor_si256(ones, v); | ||||
|   } | ||||
| public: | ||||
|   Vec256i() {} | ||||
|   Vec256i(__m256i v) : values(v) {} | ||||
| @ -22,7 +27,9 @@ public: | ||||
|  | ||||
| template <> | ||||
| struct Vec256<int64_t> : public Vec256i { | ||||
|   static constexpr int size = 4; | ||||
|   static constexpr int size() { | ||||
|     return 4; | ||||
|   } | ||||
|   using Vec256i::Vec256i; | ||||
|   Vec256() {} | ||||
|   Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } | ||||
| @ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i { | ||||
|   } | ||||
|   template <int64_t mask> | ||||
|   static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) { | ||||
|     __at_align32__ int64_t tmp_values[size]; | ||||
|     __at_align32__ int64_t tmp_values[size()]; | ||||
|     a.store(tmp_values); | ||||
|     if (mask & 0x01) | ||||
|       tmp_values[0] = _mm256_extract_epi64(b.values, 0); | ||||
| @ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i { | ||||
|     return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step); | ||||
|   } | ||||
|   static Vec256<int64_t> | ||||
|   set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) { | ||||
|   set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) { | ||||
|     switch (count) { | ||||
|       case 0: | ||||
|         return a; | ||||
| @ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i { | ||||
|     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); | ||||
|   } | ||||
|   static Vec256<int64_t> loadu(const void* ptr, int64_t count) { | ||||
|     __at_align32__ int64_t tmp_values[size]; | ||||
|     __at_align32__ int64_t tmp_values[size()]; | ||||
|     std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); | ||||
|     return loadu(tmp_values); | ||||
|   } | ||||
|   void store(void* ptr, int count = size) const { | ||||
|     if (count == size) { | ||||
|   void store(void* ptr, int count = size()) const { | ||||
|     if (count == size()) { | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); | ||||
|     } else if (count > 0) { | ||||
|       __at_align32__ int64_t tmp_values[size]; | ||||
|       __at_align32__ int64_t tmp_values[size()]; | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); | ||||
|     } | ||||
| @ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i { | ||||
|     return _mm256_cmpeq_epi64(values, other.values); | ||||
|   } | ||||
|   Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto eq = _mm256_cmpeq_epi64(values, other.values); | ||||
|     return _mm256_xor_si256(zero, eq);  // invert | ||||
|     return invert(_mm256_cmpeq_epi64(values, other.values)); | ||||
|   } | ||||
|   Vec256<int64_t> operator<(const Vec256<int64_t>& other) const { | ||||
|     return _mm256_cmpgt_epi64(other.values, values); | ||||
|   } | ||||
|   Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto gt = _mm256_cmpgt_epi64(values, other.values); | ||||
|     return _mm256_xor_si256(zero, gt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi64(values, other.values)); | ||||
|   } | ||||
|   Vec256<int64_t> operator>(const Vec256<int64_t>& other) const { | ||||
|     return _mm256_cmpgt_epi64(values, other.values); | ||||
|   } | ||||
|   Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto lt = _mm256_cmpgt_epi64(other.values, values); | ||||
|     return _mm256_xor_si256(zero, lt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi64(other.values, values)); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <> | ||||
| struct Vec256<int32_t> : public Vec256i { | ||||
|   static constexpr int size = 8; | ||||
|   static constexpr int size() { | ||||
|     return 8; | ||||
|   } | ||||
|   using Vec256i::Vec256i; | ||||
|   Vec256() {} | ||||
|   Vec256(int32_t v) { values = _mm256_set1_epi32(v); } | ||||
| @ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i { | ||||
|       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); | ||||
|   } | ||||
|   static Vec256<int32_t> | ||||
|   set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) { | ||||
|   set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) { | ||||
|     switch (count) { | ||||
|       case 0: | ||||
|         return a; | ||||
| @ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i { | ||||
|     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); | ||||
|   } | ||||
|   static Vec256<int32_t> loadu(const void* ptr, int32_t count) { | ||||
|     __at_align32__ int32_t tmp_values[size]; | ||||
|     __at_align32__ int32_t tmp_values[size()]; | ||||
|     std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); | ||||
|     return loadu(tmp_values); | ||||
|   } | ||||
|   void store(void* ptr, int count = size) const { | ||||
|     if (count == size) { | ||||
|   void store(void* ptr, int count = size()) const { | ||||
|     if (count == size()) { | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); | ||||
|     } else if (count > 0) { | ||||
|       __at_align32__ int32_t tmp_values[size]; | ||||
|       __at_align32__ int32_t tmp_values[size()]; | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); | ||||
|     } | ||||
| @ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i { | ||||
|     return _mm256_cmpeq_epi32(values, other.values); | ||||
|   } | ||||
|   Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto eq = _mm256_cmpeq_epi32(values, other.values); | ||||
|     return _mm256_xor_si256(zero, eq);  // invert | ||||
|     return invert(_mm256_cmpeq_epi32(values, other.values)); | ||||
|   } | ||||
|   Vec256<int32_t> operator<(const Vec256<int32_t>& other) const { | ||||
|     return _mm256_cmpgt_epi32(other.values, values); | ||||
|   } | ||||
|   Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto gt = _mm256_cmpgt_epi32(values, other.values); | ||||
|     return _mm256_xor_si256(zero, gt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi32(values, other.values)); | ||||
|   } | ||||
|   Vec256<int32_t> operator>(const Vec256<int32_t>& other) const { | ||||
|     return _mm256_cmpgt_epi32(values, other.values); | ||||
|   } | ||||
|   Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto lt = _mm256_cmpgt_epi32(other.values, values); | ||||
|     return _mm256_xor_si256(zero, lt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi32(other.values, values)); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @ -212,13 +209,17 @@ template <> | ||||
| void convert(const int32_t *src, float *dst, int64_t n) { | ||||
|   int64_t i; | ||||
|   // int32_t and float have same size | ||||
| #pragma unroll | ||||
|   for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) { | ||||
| #ifndef _MSC_VER | ||||
| # pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) { | ||||
|     auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i)); | ||||
|     auto output_vec = _mm256_cvtepi32_ps(input_vec); | ||||
|     _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec); | ||||
|   } | ||||
| #pragma unroll | ||||
| #ifndef _MSC_VER | ||||
| # pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<float>(src[i]); | ||||
|   } | ||||
| @ -228,13 +229,17 @@ template <> | ||||
| void convert(const int32_t *src, double *dst, int64_t n) { | ||||
|   int64_t i; | ||||
|   // int32_t has half the size of double | ||||
| #pragma unroll | ||||
|   for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) { | ||||
| #ifndef _MSC_VER | ||||
| # pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) { | ||||
|     auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i)); | ||||
|     auto output_vec = _mm256_cvtepi32_pd(input_128_vec); | ||||
|     _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec); | ||||
|   } | ||||
| #pragma unroll | ||||
| #ifndef _MSC_VER | ||||
| # pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<double>(src[i]); | ||||
|   } | ||||
| @ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) { | ||||
|  | ||||
| template <> | ||||
| struct Vec256<int16_t> : public Vec256i { | ||||
|   static constexpr int size = 16; | ||||
|   static constexpr int size() { | ||||
|     return 16; | ||||
|   } | ||||
|   using Vec256i::Vec256i; | ||||
|   Vec256() {} | ||||
|   Vec256(int16_t v) { values = _mm256_set1_epi16(v); } | ||||
| @ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i { | ||||
|   } | ||||
|   template <int64_t mask> | ||||
|   static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) { | ||||
|     __at_align32__ int16_t tmp_values[size]; | ||||
|     __at_align32__ int16_t tmp_values[size()]; | ||||
|     a.store(tmp_values); | ||||
|     if (mask & 0x01) | ||||
|       tmp_values[0] = _mm256_extract_epi16(b.values, 0); | ||||
| @ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i { | ||||
|       base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); | ||||
|   } | ||||
|   static Vec256<int16_t> | ||||
|   set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) { | ||||
|   set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) { | ||||
|     switch (count) { | ||||
|       case 0: | ||||
|         return a; | ||||
| @ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i { | ||||
|     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); | ||||
|   } | ||||
|   static Vec256<int16_t> loadu(const void* ptr, int16_t count) { | ||||
|     __at_align32__ int16_t tmp_values[size]; | ||||
|     __at_align32__ int16_t tmp_values[size()]; | ||||
|     std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); | ||||
|     return loadu(tmp_values); | ||||
|   } | ||||
|   void store(void* ptr, int count = size) const { | ||||
|     if (count == size) { | ||||
|   void store(void* ptr, int count = size()) const { | ||||
|     if (count == size()) { | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); | ||||
|     } else if (count > 0) { | ||||
|       __at_align32__ int16_t tmp_values[size]; | ||||
|       __at_align32__ int16_t tmp_values[size()]; | ||||
|       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); | ||||
|     } | ||||
| @ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i { | ||||
|     return _mm256_cmpeq_epi16(values, other.values); | ||||
|   } | ||||
|   Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto eq = _mm256_cmpeq_epi16(values, other.values); | ||||
|     return _mm256_xor_si256(zero, eq);  // invert | ||||
|     return invert(_mm256_cmpeq_epi16(values, other.values)); | ||||
|   } | ||||
|   Vec256<int16_t> operator<(const Vec256<int16_t>& other) const { | ||||
|     return _mm256_cmpgt_epi16(other.values, values); | ||||
|   } | ||||
|   Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto gt = _mm256_cmpgt_epi16(values, other.values); | ||||
|     return _mm256_xor_si256(zero, gt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi16(values, other.values)); | ||||
|   } | ||||
|   Vec256<int16_t> operator>(const Vec256<int16_t>& other) const { | ||||
|     return _mm256_cmpgt_epi16(values, other.values); | ||||
|   } | ||||
|   Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const { | ||||
|     auto zero = _mm256_set1_epi64x(0); | ||||
|     auto lt = _mm256_cmpgt_epi16(other.values, values); | ||||
|     return _mm256_xor_si256(zero, lt);  // invert | ||||
|     return invert(_mm256_cmpgt_epi16(other.values, values)); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t> | ||||
|  | ||||
| template <typename T> | ||||
| Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) { | ||||
|   T values_a[Vec256<T>::size]; | ||||
|   T values_b[Vec256<T>::size]; | ||||
|   T values_a[Vec256<T>::size()]; | ||||
|   T values_b[Vec256<T>::size()]; | ||||
|   a.store(values_a); | ||||
|   b.store(values_b); | ||||
|   for (int i = 0; i != Vec256<T>::size; i++) { | ||||
|   for (int i = 0; i != Vec256<T>::size(); i++) { | ||||
|     values_a[i] /= values_b[i]; | ||||
|   } | ||||
|   return Vec256<T>::loadu(values_a); | ||||
|  | ||||
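The operator!=, <= and >= overloads above now go through a shared invert() helper that XORs the opposite predicate's lane mask with all ones; the earlier XOR against a zero vector left the mask unchanged, so nothing was actually inverted. A plain-integer sketch of the identity (no AVX intrinsics):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t eq_mask  = 0xFFFFFFFF00000000ull;  // pretend the upper lanes compared equal
  uint64_t all_ones = ~0ull;                  // what _mm256_set1_epi64x(-1) produces
  uint64_t ne_mask  = eq_mask ^ all_ones;     // what invert() computes
  assert(ne_mask == 0x00000000FFFFFFFFull);   // exactly the complementary lanes
  assert((eq_mask ^ 0ull) == eq_mask);        // XOR with zero is a no-op
  return 0;
}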
| @ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { | ||||
|       THCState_getCurrentDeviceProperties(globalContext().getTHCState()); | ||||
|   // NOTE: extra parentheses around numbers disable clang warnings about | ||||
|   // dead code | ||||
|   return ( | ||||
|       (CUDNN_VERSION >= (6021)) || | ||||
|       (CUDNN_VERSION >= (6000) && prop->major >= 5)); | ||||
|   return true; | ||||
| #else | ||||
|   return false; | ||||
| #endif | ||||
|  | ||||
| @ -9,45 +9,6 @@ | ||||
| #include "ATen/cuda/ATenCUDAGeneral.h" | ||||
| #include <cuda.h> | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|  | ||||
| #include <curand_kernel.h> | ||||
|  | ||||
| /* | ||||
| Note [cuDNN dropout descriptor initialization] | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| In most cases, setting descriptors in cuDNN is cheap (e.g., | ||||
| cudnnSetTensorNdDescriptor).  However, this is not the case for | ||||
| cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an | ||||
| expensive precomputation to initialize the random number generator states.  In | ||||
| cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor, | ||||
| which means that law-abiding clients were expected to generate a dropout | ||||
| descriptor once and cache it.  However, our ATen interface is (1) stateless (so | ||||
| we can't cache the descriptors) and (2) does not accept arbitrary user types in | ||||
| its interface (so we can't pass the descriptor in).  This puts us in a pickle. | ||||
|  | ||||
| In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which | ||||
| forgoes the expensive initialization process, and can initialize the | ||||
| descriptor with a pre-initialized state CUDA tensor.  This is great, because | ||||
| it means we can simply pass in the state tensor and then initialize the | ||||
| descriptor internally.  Unfortunately, this function is not available in | ||||
| cuDNN 6. | ||||
|  | ||||
| To work around this, we break the cuDNN abstraction barrier, and hard-code | ||||
| the struct layout of the underlying dropout descriptor.  With this struct, | ||||
| we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great! | ||||
| */ | ||||
|  | ||||
| // Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization] | ||||
| struct cudnnDropoutStruct { | ||||
|   float dropout; | ||||
|   int nstates; | ||||
|   void * states; | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| namespace at { namespace native { | ||||
|  | ||||
| // TODO: Add constructors for all of the descriptors | ||||
| @ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor | ||||
|     if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT; | ||||
|     AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, | ||||
|                                           CUDNN_CROSS_CORRELATION, mathType)); | ||||
| #if CUDNN_VERSION >= 7000 | ||||
|     AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups)); | ||||
|     AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); | ||||
|     if(dataType == CUDNN_DATA_HALF) | ||||
|       AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|  | ||||
| // See Note [cuDNN dropout descriptor initialization] | ||||
| inline cudnnStatus_t cudnnRestoreDropoutDescriptor( | ||||
|     cudnnDropoutDescriptor_t dropoutDesc, | ||||
|     cudnnHandle_t handle, | ||||
|     float dropout, | ||||
|     void *states, | ||||
|     size_t stateSizeInBytes, | ||||
|     unsigned long long seed) { | ||||
|   // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends. | ||||
|   // This is not entirely accurate but is good enough to catch some API | ||||
|   // uses which would not be compatible in cuDNN 7.  Feel free to fix | ||||
|   // this if you notice something is wrong. | ||||
|   if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE; | ||||
|   if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE; | ||||
|   size_t expectedStateSizeInBytes; | ||||
|   // State size will differ depending on size of GPU | ||||
|   auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes); | ||||
|   if (ret != CUDNN_STATUS_SUCCESS) return ret; | ||||
|   if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE; | ||||
|   dropoutDesc->dropout = dropout; | ||||
|   dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t); | ||||
|   dropoutDesc->states = states; | ||||
|   return CUDNN_STATUS_SUCCESS; | ||||
| } | ||||
|  | ||||
| #endif // CUDNN_VERSION | ||||
|  | ||||
| struct AT_CUDA_API DropoutDescriptor | ||||
|   : public Descriptor<cudnnDropoutStruct, | ||||
|                       &cudnnCreateDropoutDescriptor, | ||||
| @ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor | ||||
|           mode, | ||||
|           algo, | ||||
|           datatype)); | ||||
| #if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000 | ||||
| #if CUDA_VERSION >= 9000 | ||||
|     cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); | ||||
|     if (prop->major >= 7) { | ||||
|       if (datatype == CUDNN_DATA_HALF) { | ||||
| @ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #if CUDNN_VERSION >= 7000 | ||||
|  | ||||
| struct AT_CUDA_API CTCLossDescriptor | ||||
|   : public Descriptor<cudnnCTCLossStruct, | ||||
|                       &cudnnCreateCTCLossDescriptor, | ||||
| @ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| union Constant | ||||
| { | ||||
|   float f; | ||||
|  | ||||
| @ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { | ||||
|       input_stride1 = strides[1]; | ||||
|     } | ||||
|     AT_CHECK(channel_size == weight_num, | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.", | ||||
|       weight_num, channel_size); | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, | ||||
|       " and channel size = ", channel_size, "."); | ||||
|  | ||||
|     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] { | ||||
|       prelu_cpu_kernel_multi_weights<scalar_t>( | ||||
| @ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten | ||||
|       input_stride1 = strides[1]; | ||||
|     } | ||||
|     AT_CHECK(channel_size == weight_num, | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.", | ||||
|       weight_num, channel_size); | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, | ||||
|       " and channel size = ", channel_size, "."); | ||||
|  | ||||
|     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] { | ||||
|       prelu_cpu_backward_kernel_multi_weights<scalar_t>( | ||||
|  | ||||
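The AT_CHECK changes above swap printf-style "%d" specifiers for separate message arguments because the macro concatenates its trailing arguments into the error text rather than formatting them, so the old specifiers would have appeared literally. A toy stand-in for that convention (not the real c10 macro machinery):

#include <cstdint>
#include <initializer_list>
#include <sstream>
#include <stdexcept>
#include <string>

template <typename... Args>
std::string str_cat(const Args&... args) {
  std::ostringstream ss;
  (void)std::initializer_list<int>{(ss << args, 0)...};  // stream each argument
  return ss.str();
}

#define MY_CHECK(cond, ...) \
  do { if (!(cond)) throw std::runtime_error(str_cat(__VA_ARGS__)); } while (0)

void check_sizes(int64_t channel_size, int64_t weight_num) {
  MY_CHECK(channel_size == weight_num,
           "Mismatch of parameter numbers and input channel size. "
           "Found parameter numbers = ", weight_num,
           " and channel size = ", channel_size, ".");
}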
| @ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A) | ||||
|  | ||||
| // Supports arbitrary batch dimensions for self and A | ||||
| std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) { | ||||
|   if (self.dim() <= 2 && A.dim() <= 2) { | ||||
|   AT_CHECK(self.dim() >= 2, | ||||
|            "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); | ||||
|   AT_CHECK(A.dim() >= 2, | ||||
|            "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); | ||||
|   if (self.dim() == 2 && A.dim() == 2) { | ||||
|     // TODO: #7102: It's not necessary to have gesv (single) bindings for both | ||||
|     // TH and ATen. We should remove the TH gesv bindings, especially | ||||
|     // since the lapackGesv function is already in ATen. | ||||
|     linearSolveCheckInputs(self, A);  // Checks square shape of A, and compatibility of self and A | ||||
|     return at::_th_gesv_single(self, A); | ||||
|   } | ||||
|  | ||||
| @ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) { | ||||
|   } | ||||
|   squareCheckInputs(self); | ||||
|  | ||||
|   // TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors, | ||||
|   // this can be simplified. Currently, we are zero-ing out values in the | ||||
|   // batch of matrices by using a mask and the `where` function. | ||||
|   // The simplification with batched `triu` and `tril` would be this: | ||||
|   // if (upper) { | ||||
|   //   return raw_cholesky_output.triu(); | ||||
|   // } else { | ||||
|   //   return raw_cholesky_output.tril(); | ||||
|   // } | ||||
|   auto raw_cholesky_output = at::_cholesky_helper(self, upper); | ||||
|   int64_t n = self.size(-1); | ||||
|   auto indices = at::ones({n, n}, self.options().dtype(at::kByte)); | ||||
|   indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self); | ||||
|   return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output); | ||||
|   if (upper) { | ||||
|     return raw_cholesky_output.triu_(); | ||||
|   } else { | ||||
|     return raw_cholesky_output.tril_(); | ||||
|   } | ||||
| } | ||||
|  | ||||
| Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) { | ||||
| @ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) { | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, bool inplace, bool upper> | ||||
| static void apply_triu_tril_single( | ||||
|     scalar_t* result, scalar_t* self, | ||||
|     int64_t k, int64_t n, int64_t m, | ||||
|     int64_t res_row_stride, int64_t res_col_stride, | ||||
|     int64_t self_row_stride, int64_t self_col_stride) { | ||||
|  | ||||
|   constexpr int64_t zero = 0; | ||||
|   int64_t i; | ||||
|  | ||||
|   if (upper) { | ||||
|     #pragma omp parallel for private(i) | ||||
|     for (i = 0; i < n; i++) { | ||||
|       for (int64_t j = 0; j < std::min(m, i + k); j++) { | ||||
|         result[i * res_row_stride + j * res_col_stride] = 0; | ||||
|       } | ||||
|       if (!inplace) {  // copy the rest of self if not inplace | ||||
|         for (int64_t j = std::max(zero, i + k); j < m; j++) { | ||||
|           result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride]; | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   } else { | ||||
|     #pragma omp parallel for private(i) | ||||
|     for (i = 0; i < n; i++) { | ||||
|       for (int64_t j = std::max(zero, i + k + 1); j < m; j++) { | ||||
|         result[i * res_row_stride + j * res_col_stride] = 0; | ||||
|       } | ||||
|       if (!inplace) {  // copy the rest of self if not inplace | ||||
|         for (int64_t j = zero; j < std::min(m, i + k + 1); j++) { | ||||
|           result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride]; | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, bool inplace, bool upper> | ||||
| void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) { | ||||
|   auto n = self.size(-2); | ||||
|   auto m = self.size(-1); | ||||
|   auto self_data = self.data<scalar_t>(); | ||||
|   auto self_stride = self.dim() > 2 ? self.stride(-3) : 1; | ||||
|   auto batchsize = batchCount(self); | ||||
|   auto self_row_stride = self.stride(-2); | ||||
|   auto self_column_stride = self.stride(-1); | ||||
|  | ||||
|   auto result_data = result.data<scalar_t>(); | ||||
|   int64_t result_stride, result_row_stride, result_column_stride; | ||||
|   if (result_data != self_data) { | ||||
|     result_stride = result.dim() > 2 ? result.stride(-3) : 1; | ||||
|     result_row_stride = result.stride(-2); | ||||
|     result_column_stride = result.stride(-1); | ||||
|   } else { | ||||
|     result_stride = self_stride; | ||||
|     result_row_stride = self_row_stride; | ||||
|     result_column_stride = self_column_stride; | ||||
|   } | ||||
|  | ||||
|   int64_t b; | ||||
|   #pragma omp parallel for private(b) | ||||
|   for (b = 0; b < batchsize; b++) { | ||||
|     scalar_t* self_batch = &self_data[b * self_stride]; | ||||
|     scalar_t* result_batch = &result_data[b * result_stride]; | ||||
|     apply_triu_tril_single<scalar_t, inplace, upper>( | ||||
|         result_batch, self_batch, k, n, m, | ||||
|         result_row_stride, result_column_stride, self_row_stride, self_column_stride); | ||||
|   } | ||||
| } | ||||
|  | ||||
| Tensor tril(const Tensor& self, int64_t k) { | ||||
|   Tensor result = at::empty({0}, self.options()); | ||||
|   at::tril_out(result, self, k); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| Tensor& tril_cpu_(Tensor &self, int64_t k) { | ||||
|   if (self.numel() == 0) { | ||||
|     return self; | ||||
|   } | ||||
|   if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous(); | ||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{ | ||||
|     apply_triu_tril<scalar_t, true, false>(self, self, k); | ||||
|   }); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) { | ||||
|   if (result.sizes() != self.sizes()) { | ||||
|     result.resize_as_(self); | ||||
|   } | ||||
|   if (self.numel() == 0) { | ||||
|     return result; | ||||
|   } | ||||
|   Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous(); | ||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{ | ||||
|     apply_triu_tril<scalar_t, false, false>(result, self_c, k); | ||||
|   }); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| Tensor triu(const Tensor& self, int64_t k) { | ||||
|   Tensor result = at::empty({0}, self.options()); | ||||
|   at::triu_out(result, self, k); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| Tensor& triu_cpu_(Tensor &self, int64_t k) { | ||||
|   if (self.numel() == 0) { | ||||
|     return self; | ||||
|   } | ||||
|   if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous(); | ||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{ | ||||
|     apply_triu_tril<scalar_t, true, true>(self, self, k); | ||||
|   }); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) { | ||||
|   if (result.sizes() != self.sizes()) { | ||||
|     result.resize_as_(self); | ||||
|   } | ||||
|   if (self.numel() == 0) { | ||||
|     return result; | ||||
|   } | ||||
|   Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous(); | ||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{ | ||||
|     apply_triu_tril<scalar_t, false, true>(result, self_c, k); | ||||
|   }); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| }}  // namespace at::native | ||||
|  | ||||
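A small standalone worked example of the zeroing rule apply_triu_tril_single encodes (row-major storage, no dispatch machinery): triu with offset k zeroes columns j < i + k of each row, tril zeroes columns j > i + k.

#include <algorithm>
#include <cstdint>
#include <iostream>

// In-place triangular masking of an n x m row-major matrix, mirroring the
// "inplace" branch above with row stride m and column stride 1.
void triu_tril_inplace(double* a, int64_t n, int64_t m, int64_t k, bool upper) {
  for (int64_t i = 0; i < n; i++) {
    if (upper) {
      for (int64_t j = 0; j < std::min(m, i + k); j++) a[i * m + j] = 0;
    } else {
      for (int64_t j = std::max<int64_t>(0, i + k + 1); j < m; j++) a[i * m + j] = 0;
    }
  }
}

int main() {
  double a[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  triu_tril_inplace(a, 3, 3, /*k=*/0, /*upper=*/true);
  for (int64_t i = 0; i < 3; i++) {           // prints: 1 2 3 / 0 5 6 / 0 0 9
    for (int64_t j = 0; j < 3; j++) std::cout << a[i * 3 + j] << " ";
    std::cout << "\n";
  }
}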
| @ -378,8 +378,8 @@ at::Tensor _convolution( | ||||
|     AT_CHECK(!bias.defined() || (input.type() == bias.type()), | ||||
|              "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), | ||||
|              ") should be the same"); | ||||
|  | ||||
|     output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); | ||||
|     output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias, | ||||
|                                     params.padding, params.stride, params.dilation, params.groups); | ||||
| #endif | ||||
|   } else { | ||||
|     if (params.groups == 1) { | ||||
|  | ||||
| @ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) { | ||||
| } | ||||
|  | ||||
| Tensor & eq_(Tensor& self, const Tensor & other) { | ||||
|   return at::_th_ge_(self, other); | ||||
|   return at::_th_eq_(self, other); | ||||
| } | ||||
|  | ||||
| Tensor & ne_(Tensor& self, Scalar other) { | ||||
| @ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) { | ||||
|   return at::_th_atan2_(self, other); | ||||
| } | ||||
|  | ||||
| Tensor & tril_(Tensor& self, int64_t diagonal) { | ||||
|   return at::_th_tril_(self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor & triu_(Tensor& self, int64_t diagonal) { | ||||
|   return at::_th_triu_(self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor & digamma_(Tensor& self) { | ||||
|   return at::_th_digamma_(self); | ||||
| } | ||||
| @ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) { | ||||
|   return at::_th_cross(self, other, dim); | ||||
| } | ||||
|  | ||||
| Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) { | ||||
|   return at::_th_triu_out(result, self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor triu(const Tensor & self, int64_t diagonal) { | ||||
|   return at::_th_triu(self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) { | ||||
|   return at::_th_tril_out(result, self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor tril(const Tensor & self, int64_t diagonal) { | ||||
|   return at::_th_tril(self, diagonal); | ||||
| } | ||||
|  | ||||
| Tensor trace(const Tensor & self) { | ||||
|   return at::_th_trace(self); | ||||
| } | ||||
|  | ||||
| @ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) { | ||||
|   return batched_matrices.size(-1) * batched_matrices.size(-2); | ||||
| } | ||||
|  | ||||
| /* Checks a necessary property for the triu and tril implementations, hence the name. | ||||
|  * Batch contiguity is checked here for tensors with 4 or more dimensions; | ||||
|  * contiguous tensors and tensors with fewer than 4 dimensions pass this check trivially. | ||||
|  */ | ||||
| static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) { | ||||
|   // Complete contiguity is the most desired property, which is why | ||||
|   // we return true if the tensor is contiguous | ||||
|   if (tensor.is_contiguous()) return true; | ||||
|  | ||||
|   int64_t dims = tensor.dim(); | ||||
|  | ||||
|   // Tensors with fewer than 4 dimensions are handled by default | ||||
|   if (dims <= 3) return true; | ||||
|  | ||||
|   int64_t expected_stride = tensor.size(-1) * tensor.size(-2); | ||||
|   for (int64_t i = dims - 3; i >= 0; i--) { | ||||
|     if (expected_stride != tensor.stride(i)) return false; | ||||
|     expected_stride *= tensor.size(i); | ||||
|   } | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| // Returns the epsilon value for floating types except half | ||||
| static inline double _get_epsilon(const ScalarType& sc_type) { | ||||
|   switch (sc_type) { | ||||
|  | ||||
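A minimal sketch of the stride rule that checkTrilTriuBatchContiguous enforces, using plain vectors so it can be run outside ATen; batch_contiguous and the example shapes are hypothetical, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch of the batch-contiguity rule above: for a >=4-D tensor every
// batch dimension i <= dim - 3 must have stride(i) equal to the product of the
// sizes of all trailing dimensions; the last two (matrix) strides are unconstrained.
static bool batch_contiguous(const std::vector<int64_t>& sizes,
                             const std::vector<int64_t>& strides) {
  int64_t dims = static_cast<int64_t>(sizes.size());
  if (dims <= 3) return true;
  int64_t expected = sizes[dims - 1] * sizes[dims - 2];
  for (int64_t i = dims - 3; i >= 0; i--) {
    if (strides[i] != expected) return false;
    expected *= sizes[i];
  }
  return true;
}

int main() {
  // A contiguous (2, 3, 4, 5) tensor: strides (60, 20, 5, 1) -> passes (prints 1).
  std::printf("%d\n", batch_contiguous({2, 3, 4, 5}, {60, 20, 5, 1}));
  // Same sizes but a padded batch stride -> fails (prints 0).
  std::printf("%d\n", batch_contiguous({2, 3, 4, 5}, {120, 20, 5, 1}));
  return 0;
}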
| @ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, | ||||
| std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias, | ||||
|                                                   const Tensor& running_mean, const Tensor& running_var, | ||||
|                                                   bool train, double momentum, double eps) { | ||||
|   checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU); | ||||
|  | ||||
|   return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] { | ||||
|       return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps); | ||||
|     }); | ||||
|  | ||||
| @ -21,7 +21,6 @@ namespace native { | ||||
|  | ||||
| DEFINE_DISPATCH(sum_stub); | ||||
| DEFINE_DISPATCH(prod_stub); | ||||
| DEFINE_DISPATCH(norm_kernel); | ||||
|  | ||||
| static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) { | ||||
|   ScalarType scalarType = self.type().scalarType(); | ||||
| @ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_ | ||||
|   int64_t dim = maybe_wrap_dim(dim_, self.dim()); | ||||
|   if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) | ||||
|     return result; | ||||
|   if (self.is_contiguous() && result.is_contiguous()) { | ||||
|     _dimreduce_setup(result, self, dim); | ||||
|     norm_kernel(kCPU, result, self, p, dim); | ||||
|     if (!keepdim) { | ||||
|       result.squeeze_(dim); | ||||
|     } | ||||
|     return result; | ||||
|   } else { | ||||
|     return at::_th_norm_out(result, self, p, dim, keepdim); | ||||
|   } | ||||
|   return at::_th_norm_out(result, self, p, dim, keepdim); | ||||
| } | ||||
|  | ||||
| Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { | ||||
| @ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) { | ||||
|     AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, | ||||
|              "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); | ||||
|     AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); | ||||
|     if (self.is_cuda()) { | ||||
|       return at::_th_norm(self, p); | ||||
|     } else { | ||||
|       if (self.is_contiguous()) { | ||||
|         Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type()); | ||||
|         norm_kernel(kCPU, result, self, p, c10::nullopt); | ||||
|         return result; | ||||
|       } else { | ||||
|         return at::_th_norm(self, p); | ||||
|       } | ||||
|     } | ||||
|     return at::_th_norm(self, p); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -34,11 +34,11 @@ Tensor _bincount_cpu_template( | ||||
|   int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L; | ||||
|   nbins = std::max(nbins, minlength); // at least minlength # of bins | ||||
|  | ||||
|   const input_t* self_p = self.contiguous().data<input_t>(); | ||||
|   const input_t* self_p = self.data<input_t>(); | ||||
|   if (has_weights) { | ||||
|     output = native::zeros({nbins}, weights.options()); | ||||
|     weights_t* output_p = output.data<weights_t>(); | ||||
|     const weights_t* weights_p = weights.contiguous().data<weights_t>(); | ||||
|     const weights_t* weights_p = weights.data<weights_t>(); | ||||
|     for (int64_t i = 0; i < self.size(0); i++) { | ||||
|       output_p[self_p[i]] += weights_p[i]; | ||||
|     } | ||||
| @ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) { | ||||
|   return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] { | ||||
|     const auto scalar = weights.type().scalarType(); | ||||
|     if (scalar == ScalarType::Undefined || scalar == ScalarType::Float) | ||||
|       return _bincount_cpu_template<scalar_t, float>(self, weights, minlength); | ||||
|       return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength); | ||||
|     return _bincount_cpu_template<scalar_t, double>( | ||||
|         self, weights.toType(CPU(kDouble)), minlength); | ||||
|         self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength); | ||||
|   }); | ||||
| } | ||||
|  | ||||
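A rough sketch of the accumulation the bincount template above performs once the buffers are contiguous; the helper below is a simplified stand-in (no dtype dispatch, no error checks) and is only assumed for illustration. The patch moves the .contiguous() calls to the caller so the raw pointers indexed in the loop are densely laid out.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Each input value indexes a bin, optionally accumulating a per-element weight.
static std::vector<double> bincount(const std::vector<int64_t>& self,
                                    const std::vector<double>& weights,
                                    int64_t minlength) {
  int64_t nbins = std::max(minlength,
                           *std::max_element(self.begin(), self.end()) + 1);
  std::vector<double> out(nbins, 0.0);
  for (size_t i = 0; i < self.size(); i++) {
    out[self[i]] += weights.empty() ? 1.0 : weights[i];
  }
  return out;
}

int main() {
  auto out = bincount({0, 2, 2, 3}, {}, /*minlength=*/0);
  for (double v : out) std::printf("%g ", v);  // 1 0 2 1
  std::printf("\n");
  return 0;
}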
|  | ||||
| @ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const { | ||||
| } | ||||
|  | ||||
| void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const { | ||||
|   if (range.size() == 0) { | ||||
|     return; | ||||
|   } | ||||
|   auto strides = get_strides(); | ||||
|   while (strides.size() < 2 * ntensors()) { | ||||
|     strides.push_back(0); | ||||
| @ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range) | ||||
|   int64_t ndim = values.size(); | ||||
|   for (int dim = 0; dim < ndim; dim++) { | ||||
|     int64_t size = shape[dim]; | ||||
|     values[dim] = linear_offset % size; | ||||
|     linear_offset /= size; | ||||
|     if (size > 0) { | ||||
|       values[dim] = linear_offset % size; | ||||
|       linear_offset /= size; | ||||
|     } | ||||
|   } | ||||
|   AT_ASSERT(linear_offset == 0); | ||||
| } | ||||
|  | ||||
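The DimCounter change above guards the modulo/divide against zero-sized dimensions. A minimal sketch of the same decomposition on plain vectors (decompose is an assumed helper name, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <vector>

// A linear offset into an iteration space of the given shape is decomposed
// digit-by-digit, least significant dimension first. Skipping zero-sized
// dimensions (the fix above) avoids a division by zero on empty spaces.
static std::vector<int64_t> decompose(int64_t linear_offset,
                                      const std::vector<int64_t>& shape) {
  std::vector<int64_t> values(shape.size(), 0);
  for (size_t dim = 0; dim < shape.size(); dim++) {
    int64_t size = shape[dim];
    if (size > 0) {
      values[dim] = linear_offset % size;
      linear_offset /= size;
    }
  }
  return values;
}

int main() {
  // Offset 7 in a (3, 4) space -> per-dimension counters (1, 2): 7 == 1 + 2 * 3.
  auto v = decompose(7, {3, 4});
  std::printf("%lld %lld\n", static_cast<long long>(v[0]), static_cast<long long>(v[1]));
  return 0;
}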
| @ -101,14 +101,14 @@ struct PDist { | ||||
|  | ||||
|     scalar_t * const res_start = result.data<scalar_t>(); | ||||
|     int64_t combs = result.numel(); // n * (n - 1) / 2 | ||||
|     const Vec pvec(p); | ||||
|  | ||||
|     // We conceptually iterate over tuples of (i, j, k) where i is the first | ||||
|     // vector from the input, j is the second, and k is the result index. This | ||||
|     // parallelizes over the range of k and infers what i and j are from the | ||||
|     // value of k. | ||||
|     parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) { | ||||
|       float n2 = n - .5; | ||||
|     parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) { | ||||
|       const Vec pvec(p); | ||||
|       double n2 = n - .5; | ||||
|       // The -1 accounts for floating point truncation issues | ||||
|       int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); | ||||
|       int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; | ||||
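The (i, j)-from-k recovery used in the lambda above can be checked in isolation; this standalone sketch simply brute-force verifies the closed form for a small n and is not part of the patch. Pairs (i, j) with i < j are stored row-major, so row i starts at offset i*n - i*(i+1)/2; solving that quadratic for i gives the formula below, with the -1 guarding against floating point truncation when the root lands exactly on an integer.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 6;
  int64_t k = 0;
  bool ok = true;
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = i + 1; j < n; j++, k++) {
      double n2 = n - 0.5;
      // Recover (i, j) from the flat pair index k, as in the kernel above.
      int64_t ri = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
      int64_t rj = k - n * ri + ri * (ri + 1) / 2 + ri + 1;
      ok = ok && (ri == i) && (rj == j);
    }
  }
  std::printf("mapping consistent for n=6: %s\n", ok ? "yes" : "no");
  return 0;
}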
| @ -149,7 +149,7 @@ struct PDist { | ||||
|   } | ||||
|  | ||||
|   template <typename F> | ||||
|   inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) { | ||||
|   inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) { | ||||
|     for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) { | ||||
|  | ||||
|       const Vec self_vec_i = Vec::loadu(self_i, count); | ||||
| @ -177,7 +177,6 @@ struct PDist { | ||||
|     const int64_t n = self.size(0); | ||||
|     const int64_t m = self.size(1); | ||||
|     const int64_t gs = grad.stride(0); | ||||
|     const Vec pvec(p); | ||||
|  | ||||
|     const scalar_t * const grad_start = grad.data<scalar_t>(); | ||||
|     const scalar_t * const dist_start = dist.data<scalar_t>(); | ||||
| @ -187,17 +186,19 @@ struct PDist { | ||||
|     // The only way to parallelize and avoid locking requires parallelizing | ||||
|     // over the columns of the input, i.e. we compute the gradient for the | ||||
|     // first section of each vector independently of the second section, etc. | ||||
|     at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) { | ||||
|       const scalar_t * self_l = self_start + l * Vec::size; | ||||
|       scalar_t * res_l = res_start + l * Vec::size; | ||||
|     at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) { | ||||
|       const Vec pvec(p); | ||||
|  | ||||
|       for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) { | ||||
|       const scalar_t * self_l = self_start + l * Vec::size(); | ||||
|       scalar_t * res_l = res_start + l * Vec::size(); | ||||
|  | ||||
|       for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) { | ||||
|         backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs); | ||||
|       } | ||||
|     }); | ||||
|     const int64_t remainder = m % Vec::size; | ||||
|     const int64_t remainder = m % Vec::size(); | ||||
|     if (remainder) { | ||||
|       backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder); | ||||
|       backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -308,7 +308,9 @@ static inline void | ||||
| mask_scatter_add(const scalar_t *src, scalar_t* base_addr, | ||||
|                  const int_same_size_t<scalar_t> *offsets, | ||||
|                  const int_same_size_t<scalar_t> *mask, int64_t len) { | ||||
|   #pragma unroll | ||||
|   #ifndef _MSC_VER   | ||||
|   # pragma unroll   | ||||
|   #endif | ||||
|   for (int64_t i = 0; i < len; i++) { | ||||
|     if (mask[i] & 0x01) { | ||||
|       base_addr[offsets[i]] += src[i]; | ||||
| @ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding> | ||||
|     auto i_sw_offset = i_nw_offset + iVec(inp_sH); | ||||
|     auto i_se_offset = i_sw_offset + iVec(inp_sW); | ||||
|  | ||||
|     #pragma unroll | ||||
|     #ifndef _MSC_VER   | ||||
|     # pragma unroll   | ||||
|     #endif | ||||
|     for (int64_t c = 0; c < C; ++c) { | ||||
|       auto inp_slice_C_ptr = inp_slice[c].data(); | ||||
|  | ||||
| @ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding> | ||||
|     // So we store the necessary vectors to temporary arrays and use the helper | ||||
|     // mask_scatter_add defined above. | ||||
|  | ||||
|     integer_t i_gInp_nw_offset_arr[iVec::size]; | ||||
|     integer_t i_gInp_ne_offset_arr[iVec::size]; | ||||
|     integer_t i_gInp_sw_offset_arr[iVec::size]; | ||||
|     integer_t i_gInp_se_offset_arr[iVec::size]; | ||||
|     integer_t i_gInp_nw_offset_arr[iVec::size()]; | ||||
|     integer_t i_gInp_ne_offset_arr[iVec::size()]; | ||||
|     integer_t i_gInp_sw_offset_arr[iVec::size()]; | ||||
|     integer_t i_gInp_se_offset_arr[iVec::size()]; | ||||
|     i_gInp_nw_offset.store(i_gInp_nw_offset_arr); | ||||
|     i_gInp_ne_offset.store(i_gInp_ne_offset_arr); | ||||
|     i_gInp_sw_offset.store(i_gInp_sw_offset_arr); | ||||
|     i_gInp_se_offset.store(i_gInp_se_offset_arr); | ||||
|  | ||||
|     integer_t i_nw_mask_arr[iVec::size]; | ||||
|     integer_t i_ne_mask_arr[iVec::size]; | ||||
|     integer_t i_sw_mask_arr[iVec::size]; | ||||
|     integer_t i_se_mask_arr[iVec::size]; | ||||
|     integer_t i_nw_mask_arr[iVec::size()]; | ||||
|     integer_t i_ne_mask_arr[iVec::size()]; | ||||
|     integer_t i_sw_mask_arr[iVec::size()]; | ||||
|     integer_t i_se_mask_arr[iVec::size()]; | ||||
|     nw_mask.store(i_nw_mask_arr); | ||||
|     ne_mask.store(i_ne_mask_arr); | ||||
|     sw_mask.store(i_sw_mask_arr); | ||||
|     se_mask.store(i_se_mask_arr); | ||||
|  | ||||
|     scalar_t gInp_corner_arr[Vec::size]; | ||||
|     scalar_t gInp_corner_arr[Vec::size()]; | ||||
|  | ||||
|     auto gx = Vec(0), gy = Vec(0); | ||||
|     #pragma unroll | ||||
|     #ifndef _MSC_VER   | ||||
|     # pragma unroll   | ||||
|     #endif | ||||
|     for (int64_t c = 0; c < C; ++c) { | ||||
|       auto inp_slice_C_ptr = inp_slice[c].data(); | ||||
|       auto gInp_slice_C_ptr = gInp_slice[c].data(); | ||||
| @ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding> | ||||
|     gx = gx * gx_mult; | ||||
|     gy = gy * gy_mult; | ||||
|  | ||||
|     constexpr int64_t step = Vec::size; | ||||
|     constexpr int64_t step = Vec::size(); | ||||
|     auto interleaved_gGrid = interleave2(gx, gy); | ||||
|     auto gGrid_ptr = gGrid_slice.data() + offset * 2; | ||||
|     std::get<0>(interleaved_gGrid).store(gGrid_ptr, | ||||
| @ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding> | ||||
|     auto out_ptr = out_slice.data() + offset; | ||||
|     auto out_sC = out_slice.stride(0); | ||||
|     auto inp_slice_ptr = inp_slice.data(); | ||||
|     #pragma unroll | ||||
|     #ifndef _MSC_VER   | ||||
|     # pragma unroll   | ||||
|     #endif | ||||
|     for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) { | ||||
|       // mask_gather zeros out the mask, so we need to make a copy | ||||
|       auto mask_copy = mask; | ||||
| @ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding> | ||||
|  | ||||
|     auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest;  // gInp is contiguous | ||||
|  | ||||
|     integer_t mask_arr[iVec::size]; | ||||
|     integer_t mask_arr[iVec::size()]; | ||||
|     i_mask.store(mask_arr); | ||||
|     integer_t gInp_offset_arr[iVec::size]; | ||||
|     integer_t gInp_offset_arr[iVec::size()]; | ||||
|     i_gInp_offset.store(gInp_offset_arr); | ||||
|  | ||||
|     #pragma unroll | ||||
|     #ifndef _MSC_VER   | ||||
|     # pragma unroll   | ||||
|     #endif | ||||
|     for (int64_t c = 0; c < C; ++c) { | ||||
|       mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(), | ||||
|                        gInp_offset_arr, mask_arr, len); | ||||
| @ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator( | ||||
|  | ||||
|   using Vec = Vec256<scalar_t>; | ||||
|   using iVec = Vec256<int_same_size_t<scalar_t>>; | ||||
|   constexpr int64_t step = Vec::size; | ||||
|   constexpr int64_t step = Vec::size(); | ||||
|  | ||||
|   // Loop over each output pixel in grid. | ||||
|   // We consider the following three cases (after slicing out the batch | ||||
| @ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator( | ||||
|     auto spatial_offset = 0; | ||||
|     auto i_offsets_delta = iVec(grid_sW * step); | ||||
|  | ||||
|     #pragma unroll | ||||
|     #ifndef _MSC_VER   | ||||
|     # pragma unroll   | ||||
|     #endif | ||||
|     for (int64_t h = 0; h < out_H; h++) { | ||||
|       auto grid_ptr_x = grid_ptr + h * grid_sH; | ||||
|       auto grid_ptr_y = grid_ptr_x + grid_sCoor; | ||||
|       auto i_offsets = iVec::arange(0, grid_sW); | ||||
|       #pragma unroll | ||||
|       #ifndef _MSC_VER   | ||||
|       # pragma unroll   | ||||
|       #endif | ||||
|       for (int64_t w = 0; w < out_W; w += step) { | ||||
|         auto len = std::min(step, out_W - w); | ||||
|         if (len < step) { | ||||
|  | ||||
| @ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t> | ||||
| static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) { | ||||
|   VEC_LOOP_HEADER(func_t, data) | ||||
|   int64_t i = 0; | ||||
|   for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) { | ||||
|   for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { | ||||
|     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t)); | ||||
|     auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t)); | ||||
|     auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|     auto out1 = vop(a1, b1); | ||||
|     auto out2 = vop(a2, b2); | ||||
|     out1.store(out_ptr + i * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|   } | ||||
|   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) }; | ||||
|   binary_loop(data, strides, i, n, op); | ||||
| @ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op, | ||||
|   VEC_LOOP_HEADER(func_t, data) | ||||
|   int64_t i = 0; | ||||
|   auto a = Vec(*(scalar_t*)in1_ptr); | ||||
|   for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) { | ||||
|   for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { | ||||
|     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t)); | ||||
|     auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|     auto out1 = vop(a, b1); | ||||
|     auto out2 = vop(a, b2); | ||||
|     out1.store(out_ptr + i * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|   } | ||||
|   int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) }; | ||||
|   binary_loop(data, strides, i, n, op); | ||||
| @ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op, | ||||
|   VEC_LOOP_HEADER(func_t, data) | ||||
|   int64_t i = 0; | ||||
|   auto b = Vec(*(scalar_t*)in2_ptr); | ||||
|   for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) { | ||||
|   for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { | ||||
|     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t)); | ||||
|     auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|     auto out1 = vop(a1, b); | ||||
|     auto out2 = vop(a2, b); | ||||
|     out1.store(out_ptr + i * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t)); | ||||
|     out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t)); | ||||
|   } | ||||
|   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 }; | ||||
|   binary_loop(data, strides, i, n, op); | ||||
| @ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o | ||||
|   char* in_ptr = data[1]; | ||||
|   Vec acc[4]; | ||||
|   for  (int j = 0; j < 4; j++) { | ||||
|     acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t)); | ||||
|     acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t)); | ||||
|   } | ||||
|   for (int64_t i = 1; i < n; i++) { | ||||
|     const char* ptr = in_ptr + stride * i; | ||||
|     acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t)))); | ||||
|     acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t)))); | ||||
|     acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t)))); | ||||
|     acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t)))); | ||||
|     acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t)))); | ||||
|     acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t)))); | ||||
|     acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t)))); | ||||
|     acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t)))); | ||||
|   } | ||||
|   if (reduce) { | ||||
|     scalar_t buffer[Vec::size]; | ||||
|     scalar_t buffer[Vec::size()]; | ||||
|     acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3])); | ||||
|     acc[0].store(buffer); | ||||
|     for (int j = 1; j < Vec::size; j++) { | ||||
|     for (int j = 1; j < Vec::size(); j++) { | ||||
|       buffer[0] = op(buffer[0], buffer[j]); | ||||
|     } | ||||
|     auto dst = (scalar_t*)out_ptr; | ||||
|     *dst = op(*dst, buffer[0]); | ||||
|   } else { | ||||
|     for (int j = 0; j < 4; j++) { | ||||
|       auto dst = out_ptr + j * Vec::size * sizeof(scalar_t); | ||||
|       auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t); | ||||
|       acc[j] = vop(acc[j], Vec::loadu(dst)); | ||||
|       acc[j].store(dst); | ||||
|     } | ||||
| @ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int | ||||
| template <typename func_t, typename vec_func_t> | ||||
| static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) { | ||||
|   VEC_HEADER(func_t) | ||||
|   int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t); | ||||
|   int64_t count = n / (4 * Vec::size); | ||||
|   int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t); | ||||
|   int64_t count = n / (4 * Vec::size()); | ||||
|   if (count > 0) { | ||||
|     reduction128(data, count, vector_stride, op, vop, /*reduce=*/true); | ||||
|   } | ||||
|   char* ptrs[3] = { data[0], data[0], data[1] }; | ||||
|   int64_t strides[] = { 0, 0, sizeof(scalar_t) }; | ||||
|   binary_loop(ptrs, strides, count * 4 * Vec::size, n, op); | ||||
|   binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op); | ||||
| } | ||||
|  | ||||
| // computes the reduction out = op(out, in) | ||||
| @ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t> | ||||
| static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) { | ||||
|   VEC_HEADER(func_t) | ||||
|  | ||||
|   // reduce down each column of 4 * Vec::size elements (128 bytes) | ||||
|   // reduce down each column of 4 * Vec::size() elements (128 bytes) | ||||
|   int64_t outer_stride[2] = { 128, 128 }; | ||||
|   UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] { | ||||
|   UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] { | ||||
|     reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false); | ||||
|   }); | ||||
|  | ||||
|   // reduce down the remaining columns | ||||
|   int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) }; | ||||
|   int64_t remaining = size1 % (4 * Vec::size); | ||||
|   int64_t remaining = size1 % (4 * Vec::size()); | ||||
|   UNARY_OUTER_LOOP(data, step, remaining, [&] { | ||||
|     char* ptrs[3] = { data[0], data[0], data[1] }; | ||||
|     int64_t strides[] = { 0, 0, inner_stride }; | ||||
|  | ||||
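As a rough scalar analogue of the reduction128 pattern these hunks touch (four independent accumulators walked down the rows of a 128-byte column, folded together at the end), assuming a simple sum as the op; reduce_column is an illustrative name only.

#include <cstdint>
#include <cstdio>

// Each "column" of width 4 is reduced down the rows with independent
// accumulators (mirroring the four Vec256 lanes), then folded at the end.
static double reduce_column(const double* data, int64_t rows, int64_t stride) {
  double acc[4] = {data[0], data[1], data[2], data[3]};
  for (int64_t i = 1; i < rows; i++) {
    const double* row = data + i * stride;
    for (int j = 0; j < 4; j++) acc[j] += row[j];
  }
  return acc[0] + acc[1] + acc[2] + acc[3];
}

int main() {
  double data[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // 2 rows of width 4
  std::printf("%g\n", reduce_column(data, /*rows=*/2, /*stride=*/4));  // 36
  return 0;
}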
| @ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) { | ||||
|       /*identity=*/1); | ||||
|   }); | ||||
| } | ||||
|  | ||||
| static inline int64_t round_down(int64_t a, int64_t m) { | ||||
|   return a - (a % m); | ||||
| } | ||||
|  | ||||
| template<typename scalar_t> | ||||
| struct NormReduction { | ||||
|   // reduction width in number of scalar elements | ||||
|   static constexpr int WIDTH = 128 / sizeof(scalar_t); | ||||
|   using Vec = Vec256<scalar_t>; | ||||
|  | ||||
|   static void apply( | ||||
|       Tensor& res, | ||||
|       const Tensor& self, | ||||
|       Scalar p, | ||||
|       c10::optional<int64_t> dim) { | ||||
|     auto out_ = res.data<scalar_t>(); | ||||
|     auto data_ = self.data<scalar_t>(); | ||||
|     auto numel = self.numel(); | ||||
|     float pval = 0.0; | ||||
|     if (p.isIntegral()){ | ||||
|       pval = p.to<int64_t>(); | ||||
|     } else if (p.isFloatingPoint()) { | ||||
|       pval = p.to<float>(); | ||||
|     } | ||||
|     if (!dim.has_value()) { | ||||
|       *out_ = reduce_all(data_, numel,  pval); | ||||
|       return; | ||||
|     } | ||||
|     int64_t n = self.size(*dim); | ||||
|     int64_t stride = self.stride(*dim); | ||||
|     // A contiguous tensor does not need to hold a meaningful stride | ||||
|     // if the corresponding size is 1 | ||||
|     if (n == 1) { | ||||
|       stride = 1; | ||||
|       for (int64_t i = self.ndimension() - 1; i > *dim; i--) { | ||||
|         stride *= self.size(i); | ||||
|       } | ||||
|     } | ||||
|     int64_t batch = numel / n; | ||||
|     parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { | ||||
|       for (int64_t bi = begin; bi < end; bi++) { | ||||
|         int64_t b = bi / stride; | ||||
|         int64_t i = bi % stride; | ||||
|         const scalar_t* data = &data_[b * n * stride + i]; | ||||
|         out_[bi] = norm_reduce(data, n, stride, pval); | ||||
|       } | ||||
|     }); | ||||
|   } | ||||
|  | ||||
|   static scalar_t reduce_all(const scalar_t* data_, int64_t size,  float pval) { | ||||
|     scalar_t sum = parallel_reduce( | ||||
|       0, | ||||
|       size, | ||||
|       internal::GRAIN_SIZE, | ||||
|       (scalar_t)0, | ||||
|       [=](int64_t begin, int64_t end, scalar_t init) { | ||||
|         const scalar_t* data = &data_[begin]; | ||||
|         int64_t n = end - begin; | ||||
|         scalar_t result = norm_reduce(data, n, 1, pval); | ||||
|         return result; | ||||
|       }, | ||||
|       std::plus<scalar_t>()); | ||||
|     return sum; | ||||
|   } | ||||
|  | ||||
|   static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) { | ||||
|     scalar_t result = 0.0; | ||||
|     if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) { | ||||
|       int64_t n_rounded = round_down(n, WIDTH); | ||||
|       scalar_t result1 = norm_reduce128(data, n_rounded, pval); | ||||
|       scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval); | ||||
|       result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval); | ||||
|     } else { | ||||
|       result = norm_reduce_sequential(data, n, stride, pval); | ||||
|     } | ||||
|     return result; | ||||
|   } | ||||
|  | ||||
|   static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) { | ||||
|     scalar_t result = 0.0; | ||||
|     if (pval == 0) { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result += (data[k * stride] != 0.0); | ||||
|       } | ||||
|     } else if (pval == 1) { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result += std::abs(data[k * stride]); | ||||
|       } | ||||
|     } else if (pval == 2) { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result += data[k * stride] * data[k * stride]; | ||||
|       } | ||||
|       result = std::sqrt(result); | ||||
|     } else if (pval == 3) { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]); | ||||
|       } | ||||
|       result = std::pow(result, 1.0/3); | ||||
|     } else if (pval == INFINITY) { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result; | ||||
|       } | ||||
|     } else if (pval == -INFINITY) { | ||||
|       result = INFINITY; | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result; | ||||
|       } | ||||
|     } else { | ||||
|       for (int64_t k = 0; k < n; k++) { | ||||
|         result += std::pow(std::abs(data[k * stride]), pval); | ||||
|       } | ||||
|       result = std::pow(result, 1.0/pval); | ||||
|     } | ||||
|     return result; | ||||
|   } | ||||
|  | ||||
|   // Reduce down a column of WIDTH elements (128 bytes) with the given number n | ||||
|   // n is already rounded down to a multiple of WIDTH | ||||
|   static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) { | ||||
|     scalar_t result = 0.0; | ||||
|     Vec acc[4] = {0.0, 0.0, 0.0, 0.0};  // 128 bytes (two cache lines) | ||||
|     static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); | ||||
|     int64_t rows = n / WIDTH; | ||||
|     if (pval == 1){ | ||||
|       for (int row = 0; row < rows; row ++) { | ||||
|         for (int j = 0; j != 4; j++) { | ||||
|           auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); | ||||
|           acc[j] = acc[j] + val.abs(); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     else if (pval == 2) { | ||||
|       for (int row = 0; row < rows; row ++) { | ||||
|         for (int j = 0; j != 4; j++) { | ||||
|           auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); | ||||
|           acc[j] = acc[j] + val * val; | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     else if (pval == 3) { | ||||
|       for (int row = 0; row < rows; row ++) { | ||||
|         for (int j = 0; j != 4; j++) { | ||||
|           auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); | ||||
|           acc[j] = acc[j] + (val * val * val).abs(); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     scalar_t buf[WIDTH] = {0}; | ||||
|     for (int j = 0; j != 4; j++) { | ||||
|       acc[j].store(&buf[j * Vec::size]); | ||||
|     } | ||||
|     for (int i = 0; i < WIDTH; i++) { | ||||
|       result += buf[i]; | ||||
|     } | ||||
|     result = std::pow(result, 1.0/pval); | ||||
|     return result; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| static void norm_kernel_impl( | ||||
|     Tensor& result, | ||||
|     const Tensor& self, | ||||
|     Scalar p, | ||||
|     c10::optional<int64_t> dim) { | ||||
|   AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] { | ||||
|     NormReduction<scalar_t>::apply(result, self, p, dim); | ||||
|   }); | ||||
| } | ||||
|  | ||||
| }  // anonymous namespace | ||||
|  | ||||
| REGISTER_DISPATCH(sum_stub, &sum_kernel_impl); | ||||
| REGISTER_DISPATCH(prod_stub, &prod_kernel_impl); | ||||
| REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl); | ||||
|  | ||||
| }}  // namespace at::native | ||||
|  | ||||
| @ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim( | ||||
|     int64_t outer_size, | ||||
|     int64_t dim_size) { | ||||
|   using Vec = vec256::Vec256<scalar_t>; | ||||
|   static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size; | ||||
|   static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size(); | ||||
|   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); | ||||
|   if (grain_size < CHUNK_SIZE) | ||||
|     grain_size = CHUNK_SIZE; | ||||
|  | ||||
| @ -37,9 +37,9 @@ template <> | ||||
| int64_t _sigmoid(float* x, float* y, int64_t size) { | ||||
|   using Vec = Vec256<float>; | ||||
|   int64_t i = 0; | ||||
|   for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { | ||||
|   for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) { | ||||
|     Vec ret = Vec::loadu(y + i); | ||||
|     Vec ret2 = Vec::loadu(y + i + Vec::size); | ||||
|     Vec ret2 = Vec::loadu(y + i + Vec::size()); | ||||
|     ret = ret.neg(); | ||||
|     ret2 = ret2.neg(); | ||||
| #if defined(__AVX2__) && !defined(_MSC_VER) | ||||
| @ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) { | ||||
|     ret = ret.reciprocal(); | ||||
|     ret2 = ret2.reciprocal(); | ||||
|     ret.store(x + i); | ||||
|     ret2.store(x + i + Vec::size); | ||||
|     ret2.store(x + i + Vec::size()); | ||||
|   } | ||||
|   return i; | ||||
| } | ||||
| @ -63,9 +63,9 @@ template <> | ||||
| int64_t _sigmoid(double* x, double* y, int64_t size) { | ||||
|   using Vec = Vec256<double>; | ||||
|   int64_t i = 0; | ||||
|   for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) { | ||||
|   for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) { | ||||
|     Vec ret = Vec::loadu(y + i); | ||||
|     Vec ret2 = Vec::loadu(y + i + Vec::size); | ||||
|     Vec ret2 = Vec::loadu(y + i + Vec::size()); | ||||
|     ret = ret.neg(); | ||||
|     ret2 = ret2.neg(); | ||||
|     ret = ret.exp(); | ||||
| @ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) { | ||||
|     ret = ret.reciprocal(); | ||||
|     ret2 = ret2.reciprocal(); | ||||
|     ret.store(x + i); | ||||
|     ret2.store(x + i + Vec::size); | ||||
|     ret2.store(x + i + Vec::size()); | ||||
|   } | ||||
|   return i; | ||||
| } | ||||
| @ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) { | ||||
|           if (stridex == 1 && stridey == 1) { | ||||
|             i = _sigmoid(x, y, size); | ||||
|           } | ||||
|           for (; i < size; i += Vec::size) { | ||||
|             scalar_t buffer[Vec::size]; | ||||
|             int64_t width = Vec::size; | ||||
|           for (; i < size; i += Vec::size()) { | ||||
|             scalar_t buffer[Vec::size()]; | ||||
|             int64_t width = Vec::size(); | ||||
|             width = std::min(width, size - i); | ||||
|             for (int64_t j = 0; j < width; j++) { | ||||
|               buffer[j] = y[stridey * (i + j)]; | ||||
|  | ||||
| @ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { | ||||
|       input_stride1 = strides[1]; | ||||
|     } | ||||
|     AT_CHECK(channel_size == weight_num, | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.", | ||||
|       weight_num, channel_size); | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, | ||||
|       " and channel size = ", channel_size, "."); | ||||
|  | ||||
|     // config to run cuda kernel | ||||
|     int64_t input_numel = input.numel(); | ||||
| @ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te | ||||
|       input_stride1 = strides[1]; | ||||
|     } | ||||
|     AT_CHECK(channel_size == weight_num, | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.", | ||||
|       weight_num, channel_size); | ||||
|       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, | ||||
|       " and channel size = ", channel_size, "."); | ||||
|  | ||||
|     // config to run cuda kernel | ||||
|     int64_t input_numel = input.numel(); | ||||
|  | ||||
| @ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, bool upper> | ||||
| __global__ | ||||
| void triu_tril_kernel( | ||||
|     scalar_t* result, scalar_t* self, int64_t k, int64_t N, | ||||
|     int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride, | ||||
|     int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) { | ||||
|   int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   if (linear_idx >= N) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   int64_t self_batch_idx = blockIdx.y; | ||||
|   int64_t row = linear_idx / self_ncol; | ||||
|   int64_t col = linear_idx % self_ncol; | ||||
|  | ||||
|   bool mask = upper ? (col - row >= k) : (col - row <= k); | ||||
|  | ||||
|   // Now compute the offset for the self and result tensor | ||||
|   int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride; | ||||
|   int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride; | ||||
|   result[res_offset] = mask ? self[self_offset] : scalar_t(0); | ||||
| } | ||||
|  | ||||
| template <bool upper> | ||||
| Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) { | ||||
|   int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2), | ||||
|           res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1, | ||||
|           res_row_stride = result.stride(-2), res_col_stride = result.stride(-1), | ||||
|           self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1, | ||||
|           self_row_stride = self.stride(-2), self_col_stride = self.stride(-1); | ||||
|   dim3 dim_block = cuda::getApplyBlock(); | ||||
|   dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches); | ||||
|   AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{ | ||||
|     triu_tril_kernel<scalar_t, upper> | ||||
|       <<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>( | ||||
|         result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size, | ||||
|         res_batch_stride, res_row_stride, res_col_stride, | ||||
|         self_batch_stride, self_row_stride, self_col_stride, self.size(-1)); | ||||
|   }); | ||||
|   AT_CUDA_CHECK(cudaGetLastError()); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| Tensor& tril_cuda_(Tensor &self, int64_t k) { | ||||
|   if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous(); | ||||
|   return tril_cuda_out(self, self, k); | ||||
| } | ||||
|  | ||||
| Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) { | ||||
|   if (result.sizes() != self.sizes()) { | ||||
|     result.resize_as_(self); | ||||
|   } | ||||
|   if (self.numel() == 0) { | ||||
|     return result; | ||||
|   } | ||||
|   Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous(); | ||||
|   return triu_tril_cuda_template<false>(result, self_c, k, "tril"); | ||||
| } | ||||
|  | ||||
| Tensor& triu_cuda_(Tensor &self, int64_t k) { | ||||
|   if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous(); | ||||
|   return triu_cuda_out(self, self, k); | ||||
| } | ||||
|  | ||||
| Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) { | ||||
|   if (result.sizes() != self.sizes()) { | ||||
|     result.resize_as_(self); | ||||
|   } | ||||
|   if (self.numel() == 0) { | ||||
|     return result; | ||||
|   } | ||||
|   Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous(); | ||||
|   return triu_tril_cuda_template<true>(result, self_c, k, "triu"); | ||||
| } | ||||
|  | ||||
| }}  // namespace at::native | ||||
|  | ||||
| #undef ALLOCATE_ARRAY | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| #include "ATen/ATen.h" | ||||
| #include <ATen/ATen.h> | ||||
| #include <ATen/cuda/Exceptions.h> | ||||
| #include <THC/THCTensorMathReduce.cuh> | ||||
| #include <math.h> | ||||
|  | ||||
| @ -78,13 +79,13 @@ struct dists { | ||||
| }; | ||||
|  | ||||
| template <typename scalar_t, typename F> | ||||
| __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) { | ||||
| __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p, | ||||
|                                               const double n2, const double n2_squared_minus_1) { | ||||
|   const int k = blockIdx.x; | ||||
|   const int stride = blockDim.x; | ||||
|  | ||||
|   float n2 = n - .5; | ||||
|   // The -1 accounts for floating point truncation issues | ||||
|   int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1))); | ||||
|   int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k))); | ||||
|   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; | ||||
|  | ||||
|   const scalar_t * const start = self + i * m; | ||||
| @ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename F> | ||||
| __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) { | ||||
| __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p, | ||||
|                                                        const double n2, const double n2_squared_minus_1) { | ||||
|   const int k = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|   const int init = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   const int stride = blockDim.x * gridDim.x; | ||||
| @ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   float n2 = n - .5; | ||||
|   // The -1 accounts for floating point truncation issues | ||||
|   int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1))); | ||||
|   int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k))); | ||||
|   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; | ||||
|   int64_t ib = j - i - 1; | ||||
|   int64_t jb = n - 2 - i; | ||||
| @ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) { | ||||
|   const dim3 block(forward_threads); | ||||
|   int64_t n = self.size(0); | ||||
|   int64_t m = self.size(1); | ||||
|   // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do | ||||
|   // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device. | ||||
|   const double n2 = n - .5; | ||||
|   const double n2_squared_minus_1 = n2 * n2 - 1; | ||||
|  | ||||
|   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] { | ||||
|     if (p == 0.0) { | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p); | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1); | ||||
|     } else if (p == 1.0) { | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p); | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1); | ||||
|     } else if (p == 2.0) { | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p); | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1); | ||||
|     } else if (std::isinf(p)) { | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p); | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1); | ||||
|     } else { | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p); | ||||
|       pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1); | ||||
|     } | ||||
|   }); | ||||
|   AT_CUDA_CHECK(cudaGetLastError()); | ||||
| } | ||||
|  | ||||
| void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { | ||||
| @ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor | ||||
|   const int64_t n = result.size(0); | ||||
|   int64_t m = self.size(1); | ||||
|   const int block_x = 64; | ||||
|   const int block_y = 4; | ||||
|   // NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16. | ||||
|   // From binary search, block_y of 16 gives us max pdist dim0 of 1449, | ||||
|   //                     block_y of  4 gives us max pdist dim0 of  725. | ||||
|   const int block_y = 16; | ||||
|   const int grid_x = (m + block_x * 8 - 1) / (block_x * 8); | ||||
|   const int grid_y = (dist.numel() + block_y - 1) / block_y; | ||||
|   const dim3 grid(grid_x, grid_y); | ||||
|   const dim3 block(block_x, block_y); | ||||
|   // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do | ||||
|   // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device. | ||||
|   const double n2 = n - .5; | ||||
|   const double n2_squared_minus_1 = n2 * n2 - 1; | ||||
|  | ||||
|   Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options()); | ||||
|   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] { | ||||
|     if (p == 1.0) { | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p); | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); | ||||
|     } else if (p < 2.0) { | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p); | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); | ||||
|     } else if (p == 2.0) { | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p); | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); | ||||
|     } else if (std::isinf(p)) { | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p); | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); | ||||
|     } else { | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p); | ||||
|       pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); | ||||
|     } | ||||
|   }); | ||||
|   AT_CUDA_CHECK(cudaGetLastError()); | ||||
|  | ||||
|   at::sum_out(result, buffer, 0); | ||||
| } | ||||
|  | ||||
| @ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind | ||||
|  | ||||
|     default: | ||||
|       AT_ERROR( | ||||
|           "Unknown mode for embedding_bag_backward_cuda %d", mode); | ||||
|           "Unknown mode for embedding_bag_backward_cuda ", mode); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, | ||||
|           + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; | ||||
|  | ||||
|         log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; | ||||
|       } else if ((s < 2*max_target_length+1) || (t >= input_length)) { | ||||
|       } else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) { | ||||
|           log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; | ||||
|       } | ||||
|     } | ||||
| @ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const | ||||
|       if (targets.type().scalarType() == kLong) { | ||||
| 	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); | ||||
|       } else { | ||||
| 	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); | ||||
| 	return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); | ||||
|       } | ||||
|     }); | ||||
| } | ||||
|  | ||||
| @ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_ | ||||
|                                                             const Tensor& running_mean_, const Tensor& running_var_, | ||||
|                                                             bool train, double momentum, double epsilon) { | ||||
|  | ||||
|   TensorArg input_arg{ input_, "input", 1 }, | ||||
|             weight_arg{ weight_, "weight", 2 }, | ||||
|             bias_arg{ bias_, "bias", 3 }, | ||||
|             run_mean_arg{ running_mean_, "running_mean", 4 }, | ||||
|             run_var_arg{ running_var_, "running_var", 5 }; | ||||
|   CheckedFrom c = "batch_norm_cuda"; | ||||
|   checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg}); | ||||
|  | ||||
|   using accscalar_t = at::acc_type<scalar_t, true>; | ||||
|   int64_t n_input = input_.size(1); | ||||
|   Tensor save_mean_; | ||||
|  | ||||
| @ -7,28 +7,13 @@ | ||||
| #include <tuple> | ||||
| #include <thrust/unique.h> | ||||
| #include <thrust/sort.h> | ||||
| #include <thrust/scan.h> | ||||
| #include <thrust/scatter.h> | ||||
|  | ||||
| namespace at { | ||||
| namespace native{ | ||||
|  | ||||
| namespace { | ||||
| template <typename scalar_t> | ||||
| __global__ void inverse_indices_kernel( | ||||
|     const scalar_t* input_data, | ||||
|     const scalar_t* output_data, | ||||
|     int64_t* inverse_indices_data, | ||||
|     int64_t num_inp, | ||||
|     int64_t num_out) { | ||||
|     int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|     int64_t stride = blockDim.x * gridDim.x; | ||||
|  | ||||
|     for (int64_t i = idx; i < num_inp * num_out; i += stride) { | ||||
|       if (input_data[i / num_out] == output_data[i % num_out]){ | ||||
|         inverse_indices_data[i / num_out] = i % num_out; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| template <typename scalar_t> | ||||
|   std::tuple<Tensor, Tensor> _unique_cuda_template( | ||||
| @ -47,25 +32,29 @@ template <typename scalar_t> | ||||
|     Tensor output = input.clone(); | ||||
|     output = output.view(-1); | ||||
|     scalar_t* output_data = output.data<scalar_t>(); | ||||
|     thrust::sort(policy, output_data, output_data + num_inp); | ||||
|     scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp); | ||||
|     int64_t num_out = output_end - output_data; | ||||
|     output.resize_(num_out); | ||||
|  | ||||
|     Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); | ||||
|  | ||||
|     if (return_inverse) { | ||||
|       inverse_indices.resize_(input.sizes()); | ||||
|       int64_t* inverse_indices_data = inverse_indices.data<int64_t>(); | ||||
|       int block = 512; | ||||
|       int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L); | ||||
|       inverse_indices_kernel<<<grid, block, 0, stream>>>( | ||||
|         input_data, output_data, inverse_indices_data, num_inp, num_out); | ||||
|     Tensor inverse_indices; | ||||
|     if (!return_inverse) { | ||||
|         inverse_indices = at::empty({0},  self.type().toScalarType(kLong)); | ||||
|         thrust::sort(policy, output_data, output_data + num_inp); | ||||
|     } else { | ||||
|         Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong)); | ||||
|         int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>(); | ||||
|         thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr); | ||||
|         Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong)); | ||||
|         inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong)); | ||||
|         int64_t* inv_loc_ptr = inv_loc.data<int64_t>(); | ||||
|         int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>(); | ||||
|         thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }}); | ||||
|         inv_loc[0] = 0; | ||||
|         thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr); | ||||
|         thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr); | ||||
|         inverse_indices.resize_(input.sizes()); | ||||
|     } | ||||
|     int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data; | ||||
|     output.resize_(num_out); | ||||
|  | ||||
|     THCudaCheck(cudaGetLastError()); | ||||
|     return std::tuple<Tensor, Tensor>(output, inverse_indices); | ||||
|  | ||||
|   } | ||||
|  | ||||
| template <typename scalar_t> | ||||
|  | ||||
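The new inverse-index path above amounts to: sort the values together with their original positions, mark where each new value begins (adjacent difference), inclusive-scan the marks into bucket ids, and scatter the ids back to the original positions. Below is a minimal CPU sketch of the same idea; it is illustrative only, with plain C++ standard-library loops standing in for the thrust calls, and is not part of the diff.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> input = {3, 1, 3, 2, 1};
  const size_t n = input.size();

  // sort_by_key: sort the values and carry their original positions along
  std::vector<int64_t> order(n);
  std::iota(order.begin(), order.end(), int64_t{0});
  std::sort(order.begin(), order.end(),
            [&](int64_t a, int64_t b) { return input[a] < input[b]; });
  std::vector<int> sorted(n);
  for (size_t i = 0; i < n; ++i) sorted[i] = input[order[i]];

  // adjacent difference -> 1 where a new value starts, then an inclusive scan
  // turns the marks into the bucket id of each sorted element
  std::vector<int64_t> inv_loc(n, 0);
  for (size_t i = 1; i < n; ++i)
    inv_loc[i] = inv_loc[i - 1] + (sorted[i] != sorted[i - 1] ? 1 : 0);

  // scatter the bucket ids back to the original positions
  std::vector<int64_t> inverse(n);
  for (size_t i = 0; i < n; ++i) inverse[order[i]] = inv_loc[i];

  // unique() on the sorted copy gives the output values
  sorted.erase(std::unique(sorted.begin(), sorted.end()), sorted.end());

  for (int v : sorted) std::printf("%d ", v);                    // 1 2 3
  for (int64_t v : inverse) std::printf("%lld ", (long long)v);  // 2 0 2 1 0
  std::printf("\n");
  return 0;
}
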
| @ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> { | ||||
|         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, | ||||
|         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, | ||||
|         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, | ||||
| #if CUDNN_VERSION >= 6000 | ||||
|         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, | ||||
| #endif | ||||
|     }; | ||||
|     // NOTE: - 1 because ALGO_WINOGRAD is not implemented | ||||
|     static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; | ||||
| @ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { | ||||
|   THCCachingAllocator_emptyCache(); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Hot fix for #16610. | ||||
| // Specializing algorithm_search would be cleaner, as it is already specialized, but that would also require | ||||
| // specializing getBestAlgorithm for bwdData and adding a "strided" argument, so in the end this looks simpler. | ||||
| template<> | ||||
| void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) { | ||||
|   using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>; | ||||
|   auto& cache = search::cache(); | ||||
|  | ||||
|   if (cache.find(args.params, algo)) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   if (args.params.deterministic && !benchmark) { | ||||
|     *algo = search::DEFAULT_ALGO; | ||||
|     return; | ||||
|   } | ||||
|    | ||||
|   int stride_dim = args.input.dim() - 2; | ||||
|   bool strided = false; | ||||
|   for (int i = 0; i< stride_dim; i++) { | ||||
|       if (args.params.stride[i] != 1) { | ||||
|          strided = true; | ||||
|          break; | ||||
|       } | ||||
|   } | ||||
|  | ||||
|   if (!benchmark) { | ||||
|     search::getAlgorithm(args, algo); | ||||
|     if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { | ||||
|        *algo = search::DEFAULT_ALGO; | ||||
|     } | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   if (cache.find(args.params, algo)) { | ||||
|     // re-check cache since another thread may have benchmarked the algorithm | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   auto perfResults = search::findAlgorithm(args); | ||||
|   // for deterministic algo, look at all the perf results and return the best | ||||
|   // deterministic algo | ||||
|   if (perfResults.status == CUDNN_STATUS_SUCCESS && | ||||
|       !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) { | ||||
|       *algo = perfResults.algo; | ||||
|   } else { | ||||
|       *algo = search::DEFAULT_ALGO; | ||||
|   } | ||||
|   if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { | ||||
|      *algo = search::DEFAULT_ALGO; | ||||
|   } | ||||
|   cache.insert(args.params, *algo); | ||||
|  | ||||
|   // Free the cached blocks in our caching allocator. This is | ||||
|   // needed here because the benchmarking above uses a huge amount of memory, | ||||
|   // e.g. a few GBs. | ||||
|   THCCachingAllocator_emptyCache(); | ||||
| } | ||||
|  | ||||
|  | ||||
| template<typename algo_t> | ||||
| Workspace chooseAlgorithm( | ||||
|     const ConvolutionArgs& args, | ||||
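
The hot fix above reduces to a simple policy: detect whether any spatial stride differs from 1, and if the algorithm picked by the heuristic or the benchmark is FFT-based, fall back to the default backward-data algorithm instead. A standalone sketch of just that selection policy follows; the enum and function names are stand-ins for illustration, not the cuDNN API.

#include <algorithm>
#include <array>
#include <cstdio>

enum BwdDataAlgo { ALGO_DEFAULT, ALGO_FFT, ALGO_FFT_TILING, ALGO_WINOGRAD };

// If the convolution is strided, FFT-based backward-data algorithms are not
// safe to use (see #16610), so fall back to the default algorithm.
BwdDataAlgo pick_bwd_data_algo(BwdDataAlgo candidate,
                               const std::array<int, 2>& stride) {
  bool strided = std::any_of(stride.begin(), stride.end(),
                             [](int s) { return s != 1; });
  if (strided && (candidate == ALGO_FFT || candidate == ALGO_FFT_TILING)) {
    return ALGO_DEFAULT;
  }
  return candidate;
}

int main() {
  std::printf("%d\n", pick_bwd_data_algo(ALGO_FFT_TILING, {{2, 2}}));  // 0 (ALGO_DEFAULT)
  std::printf("%d\n", pick_bwd_data_algo(ALGO_FFT_TILING, {{1, 1}}));  // 2 (unchanged)
  return 0;
}
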
| @ -848,19 +907,9 @@ Tensor cudnn_convolution_forward( | ||||
|   // See #4500 | ||||
|   Tensor weight_contig = weight->contiguous(); | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|   for (int i = 0; i < groups; i++) { | ||||
|     raw_cudnn_convolution_forward_out( | ||||
|         narrowGroup(*output, output_channels_dim,        i, groups), | ||||
|         narrowGroup(*input,  input_channels_dim,         i, groups), | ||||
|         narrowGroup(weight_contig, weight_output_channels_dim, i, groups), | ||||
|         padding, stride, dilation, 1, benchmark, deterministic); | ||||
|   } | ||||
| #else | ||||
|   raw_cudnn_convolution_forward_out( | ||||
|       *output, *input, weight_contig, | ||||
|       padding, stride, dilation, groups, benchmark, deterministic); | ||||
| #endif | ||||
|  | ||||
|   return *output; | ||||
| } | ||||
| @ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input( | ||||
|   // See #4500 | ||||
|   Tensor weight_contig = weight->contiguous(); | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|   for (int i = 0; i < groups; i++) { | ||||
|     raw_cudnn_convolution_backward_input_out( | ||||
|         narrowGroup(*grad_input, input_channels_dim, i, groups), | ||||
|         narrowGroup(*grad_output, output_channels_dim, i, groups), | ||||
|         narrowGroup(weight_contig, weight_output_channels_dim, i, groups), | ||||
|         padding, stride, dilation, 1, benchmark, deterministic); | ||||
|   } | ||||
| #else | ||||
|   raw_cudnn_convolution_backward_input_out( | ||||
|       *grad_input, *grad_output, weight_contig, | ||||
|       padding, stride, dilation, groups, benchmark, deterministic); | ||||
| #endif | ||||
|  | ||||
|   return *grad_input; | ||||
| } | ||||
| @ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight( | ||||
|   TensorArg grad_weight{ grad_weight_t, "result", 0 }; | ||||
|   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|   for (int i = 0; i < groups; i++) { | ||||
|     raw_cudnn_convolution_backward_weight_out( | ||||
|         narrowGroup(*grad_weight, weight_output_channels_dim, i, groups), | ||||
|         narrowGroup(*grad_output, output_channels_dim, i, groups), | ||||
|         narrowGroup(*input, input_channels_dim, i, groups), | ||||
|         padding, stride, dilation, groups, benchmark, deterministic); | ||||
|   } | ||||
| #else | ||||
|   raw_cudnn_convolution_backward_weight_out( | ||||
|       *grad_weight, *grad_output, *input, | ||||
|       padding, stride, dilation, groups, benchmark, deterministic); | ||||
| #endif | ||||
|  | ||||
|   return grad_weight_t; | ||||
| } | ||||
|  | ||||
| @ -7,7 +7,7 @@ | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) | ||||
| #if !AT_CUDNN_ENABLED() | ||||
|  | ||||
| namespace at { namespace native { | ||||
|  | ||||
|  | ||||
| @ -375,7 +375,7 @@ namespace { | ||||
|       case CUDNN_RNN_TANH: | ||||
|         return 2; | ||||
|       default: | ||||
|         AT_ERROR("unknown cuDNN RNN mode %d", mode); | ||||
|         AT_ERROR("unknown cuDNN RNN mode ", mode); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -2555,9 +2555,15 @@ | ||||
|  | ||||
| - func: tril_(Tensor self, int64_t diagonal=0) -> Tensor | ||||
|   variants: method | ||||
|   dispatch: | ||||
|     CPU: tril_cpu_ | ||||
|     CUDA: tril_cuda_ | ||||
|  | ||||
| - func: triu_(Tensor self,  int64_t diagonal=0) -> Tensor | ||||
|   variants: method | ||||
|   dispatch: | ||||
|     CPU: triu_cpu_ | ||||
|     CUDA: triu_cuda_ | ||||
|  | ||||
| - func: digamma_(Tensor self) -> Tensor | ||||
|   variants: method | ||||
| @ -2658,11 +2664,17 @@ | ||||
|   variants: method, function | ||||
|  | ||||
| - func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor | ||||
|   dispatch: | ||||
|     CPU: triu_cpu_out | ||||
|     CUDA: triu_cuda_out | ||||
|  | ||||
| - func: triu(Tensor self, int64_t diagonal=0) -> Tensor | ||||
|   variants: method, function | ||||
|  | ||||
| - func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor | ||||
|   dispatch: | ||||
|     CPU: tril_cpu_out | ||||
|     CUDA: tril_cuda_out | ||||
|  | ||||
| - func: tril(Tensor self, int64_t diagonal=0) -> Tensor | ||||
|   variants: method, function | ||||
|  | ||||
| @ -11,18 +11,4 @@ using namespace at::native; | ||||
| TEST(CUDNNTest, CUDNNTestCUDA) { | ||||
|   if (!at::cuda::is_available()) return; | ||||
|   manual_seed(123); | ||||
|  | ||||
| #if CUDNN_VERSION < 7000 | ||||
|   auto handle = getCudnnHandle(); | ||||
|   DropoutDescriptor desc1, desc2; | ||||
|   desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte)); | ||||
|   desc2.set(handle, 0.5, desc1.state); | ||||
|   bool isEQ; | ||||
|   isEQ = (desc1.desc()->dropout == desc2.desc()->dropout); | ||||
|   ASSERT_TRUE(isEQ); | ||||
|   isEQ = (desc1.desc()->nstates == desc2.desc()->nstates); | ||||
|   ASSERT_TRUE(isEQ); | ||||
|   isEQ = (desc1.desc()->states == desc2.desc()->states); | ||||
|   ASSERT_TRUE(isEQ); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @ -3,6 +3,8 @@ find_package(ATen REQUIRED) | ||||
| include_directories(${ATEN_INCLUDE_DIR}) | ||||
|  | ||||
| # C++11 | ||||
| set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") | ||||
| if (NOT MSVC) | ||||
|     set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")  | ||||
| endif() | ||||
| add_executable(main main.cpp) | ||||
| target_link_libraries(main ${ATEN_LIBRARIES}) | ||||
|  | ||||
| @ -247,10 +247,13 @@ | ||||
|  | ||||
| #ifdef _OPENMP | ||||
|  | ||||
| #ifndef _WIN32 | ||||
| #define PRAGMA(P) _Pragma(#P) | ||||
| #ifdef _WIN32   | ||||
| // MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.   | ||||
| #define PRAGMA_LOOP(P)    // Noop   | ||||
| #define PRAGMA(P)         __pragma(P) | ||||
| #else | ||||
| #define PRAGMA(P) __pragma(P) | ||||
| #define PRAGMA_LOOP(P)    _Pragma(#P)   | ||||
| #define PRAGMA(P)         _Pragma(#P) | ||||
| #endif | ||||
|  | ||||
| #include <omp.h> | ||||
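
A literal #pragma line cannot appear inside a macro expansion, so PRAGMA wraps the operator form: __pragma(...) on MSVC and the C99 _Pragma(#P) operator elsewhere, while PRAGMA_LOOP drops loop hints such as ivdep and simd entirely on MSVC, which does not accept them. A small self-contained illustration follows; the SUM_LOOP macro is just an example, not TH code, and compilers without the ivdep hint may simply warn about an unknown pragma.

#include <cstdio>

#ifdef _WIN32
#define PRAGMA_LOOP(P)             // no-op: MSVC does not accept these loop pragmas
#define PRAGMA(P) __pragma(P)
#else
#define PRAGMA_LOOP(P) _Pragma(#P)
#define PRAGMA(P) _Pragma(#P)
#endif

// Example of emitting a loop pragma from inside a macro body.
#define SUM_LOOP(dst, src, n)                        \
  do {                                               \
    PRAGMA_LOOP(ivdep)                               \
    for (int i = 0; i < (n); ++i) (dst) += (src)[i]; \
  } while (0)

int main() {
  int data[4] = {1, 2, 3, 4};
  int sum = 0;
  SUM_LOOP(sum, data, 4);
  std::printf("%d\n", sum);  // 10
  return 0;
}
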
| @ -369,7 +372,7 @@ | ||||
|     TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset();                        \ | ||||
|     ptrdiff_t iter = 0;                                                                        \ | ||||
|     if(tp != (TYPE2*)rp) {                                                                             \ | ||||
|       PRAGMA(ivdep) \ | ||||
|       PRAGMA_LOOP(ivdep) \ | ||||
|       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \ | ||||
|       for (iter = 0; iter < SIZE; iter++) {                             \ | ||||
|         TYPE2 *TENSOR2##_data = tp+iter;                                \ | ||||
| @ -377,7 +380,7 @@ | ||||
|         CODE                                                            \ | ||||
|       }\ | ||||
|     } else {\ | ||||
|       PRAGMA(simd) \ | ||||
|       PRAGMA_LOOP(simd) \ | ||||
|       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) )  \ | ||||
|       for (iter = 0; iter < SIZE; iter++) {\ | ||||
|         TYPE2* TENSOR2##_data = tp+iter;\ | ||||
| @ -449,7 +452,7 @@ | ||||
|     TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset();                               \ | ||||
|     ptrdiff_t iter = 0;\ | ||||
|     if(tp != (TYPE2*)rp) {                                                                             \ | ||||
|       PRAGMA(ivdep) \ | ||||
|       PRAGMA_LOOP(ivdep) \ | ||||
|       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \ | ||||
|       for (iter = 0; iter < SIZE; iter++) {\ | ||||
|         TYPE1 *TENSOR1##_data = rp+iter;\ | ||||
| @ -458,7 +461,7 @@ | ||||
|         CODE                                \ | ||||
|       } \ | ||||
|     } else {\ | ||||
|       PRAGMA(simd) \ | ||||
|       PRAGMA_LOOP(simd) \ | ||||
|       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \ | ||||
|       for (iter = 0; iter < SIZE; iter++) {\ | ||||
|         TYPE1 *TENSOR1##_data = rp+iter;\ | ||||
|  | ||||
| @ -13,10 +13,13 @@ | ||||
|  | ||||
| #ifdef _OPENMP | ||||
|  | ||||
| #ifndef _WIN32 | ||||
| #define PRAGMA(P) _Pragma(#P) | ||||
| #ifdef _WIN32   | ||||
| // MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences. | ||||
| #define PRAGMA_LOOP(P)    // Noop   | ||||
| #define PRAGMA(P)         __pragma(P) | ||||
| #else | ||||
| #define PRAGMA(P) __pragma(P) | ||||
| #define PRAGMA_LOOP(P)    _Pragma(#P)   | ||||
| #define PRAGMA(P)         _Pragma(#P) | ||||
| #endif | ||||
|  | ||||
| #define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ | ||||
|  | ||||
| @ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) | ||||
|   int free_b = 0; | ||||
|   if (a == NULL) a = ra_; | ||||
|   if (b == NULL) b = rb_; | ||||
|   THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", | ||||
|       a->dim()); | ||||
|   THArgCheck(!a->is_empty(), 2, "A should not be empty"); | ||||
|   THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " | ||||
|       "dimensions, but has %d", b->dim()); | ||||
|   THArgCheck(!b->is_empty(), 2, "B should not be empty"); | ||||
|   THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", | ||||
|       a->size(0), a->size(1)); | ||||
|   THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " | ||||
|       "rows, B has %ld", a->size(0), b->size(0)); | ||||
|  | ||||
|   if (b->dim() == 1) { | ||||
|     b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), | ||||
|             b->stride(0), 1, 0); | ||||
|     free_b = 1; | ||||
|   } | ||||
|  | ||||
|   int n, nrhs, lda, ldb, info; | ||||
|   THIntTensor *ipiv; | ||||
| @ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) | ||||
|   THTensor_(freeCopyTo)(ra__, ra_); | ||||
|   THTensor_(freeCopyTo)(rb__, rb_); | ||||
|   THIntTensor_free(ipiv); | ||||
|   if (free_b) c10::raw::intrusive_ptr::decref(b); | ||||
| } | ||||
|  | ||||
| void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, | ||||
|  | ||||
| @ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n | ||||
|  | ||||
| TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder); | ||||
| TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted); | ||||
| TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k); | ||||
| TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k); | ||||
| TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension); | ||||
| TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension); | ||||
|  | ||||
| @ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n) | ||||
|   REAL_SWAP(ARR(III), ARR(JJJ)); \ | ||||
|   LONG_SWAP(IDX(III), IDX(JJJ)) | ||||
|  | ||||
| /* Emulate NumPy behavior of putting NaNs | ||||
|  * at the end of an ascending list. */ | ||||
| #define GT_OR_NAN(x, y) \ | ||||
|   ((x != x && y == y) || (x > y)) | ||||
|  | ||||
| static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride) | ||||
| { | ||||
|   int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; | ||||
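
GT_OR_NAN treats NaN as greater than every real value (and two NaNs as equivalent), which is what pushes NaNs to the end of an ascending sort. A small host-side sketch of the same ordering with std::sort follows; it is illustrative only, not the TH quicksort path.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Same predicate as GT_OR_NAN: "x is greater than y, counting NaN as largest".
static bool gt_or_nan(double x, double y) {
  return (std::isnan(x) && !std::isnan(y)) || (x > y);
}

int main() {
  std::vector<double> v = {2.0, NAN, -1.0, NAN, 0.5};
  // Ascending sort: "a comes before b" exactly when b is greater-or-NaN relative to a.
  std::sort(v.begin(), v.end(), [](double a, double b) { return gt_or_nan(b, a); });
  for (double x : v) std::printf("%g ", x);  // -1 0.5 2 nan nan
  std::printf("\n");
  return 0;
}
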
| @ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem | ||||
|       /* Use median of three for pivot choice */ | ||||
|     P=(L+R)>>1; | ||||
|     BOTH_SWAP(P, L+1); | ||||
|     if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } | ||||
|     if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } | ||||
|     if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } | ||||
|     if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); } | ||||
|     if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); } | ||||
|     if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); } | ||||
|  | ||||
|     i = L+1; j = R; piv = ARR(L); pid = IDX(L); | ||||
|  | ||||
|     do { | ||||
|       do { i = i+1; } while(ARR(i) < piv); | ||||
|       do { j = j-1; } while(ARR(j) > piv); | ||||
|       do { i = i+1; } while(GT_OR_NAN(piv, ARR(i))); | ||||
|       do { j = j-1; } while(GT_OR_NAN(ARR(j), piv)); | ||||
|       if (j < i) | ||||
|           break; | ||||
|       BOTH_SWAP(i, j); | ||||
| @ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem | ||||
|   } /* while not done */ | ||||
|   /* Now insertion sort on the concatenation of subfiles */ | ||||
|   for(i=elements-2; i>=0; i--) { | ||||
|     if (ARR(i) > ARR(i+1)) { | ||||
|     if (GT_OR_NAN(ARR(i),ARR(i+1))) { | ||||
|       piv = ARR(i); | ||||
|       pid = IDX(i); | ||||
|       j = i+1; | ||||
| @ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem | ||||
|         ARR(j-1) = ARR(j); | ||||
|         IDX(j-1) = IDX(j); | ||||
|         j = j+1; | ||||
|       } while(j < elements && ARR(j) < piv); | ||||
|       } while(j < elements && GT_OR_NAN(piv, ARR(j))); | ||||
|       ARR(j-1) = piv; | ||||
|       IDX(j-1) = pid; | ||||
|      } | ||||
| @ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele | ||||
|       /* Use median of three for pivot choice */ | ||||
|     P=(L+R)>>1; | ||||
|     BOTH_SWAP(P, L+1); | ||||
|     if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); } | ||||
|     if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); } | ||||
|     if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); } | ||||
|     if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); } | ||||
|     if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); } | ||||
|     if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); } | ||||
|  | ||||
|     i = L+1; j = R; piv = ARR(L); pid = IDX(L); | ||||
|  | ||||
|     do { | ||||
|       do { i = i+1; } while(ARR(i) > piv); | ||||
|       do { j = j-1; } while(ARR(j) < piv); | ||||
|       do { i = i+1; } while(GT_OR_NAN(ARR(i), piv)); | ||||
|       do { j = j-1; } while(GT_OR_NAN(piv, ARR(j))); | ||||
|       if (j < i) | ||||
|           break; | ||||
|       BOTH_SWAP(i, j); | ||||
| @ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele | ||||
|   } /* while not done */ | ||||
|   /* Now insertion sort on the concatenation of subfiles */ | ||||
|   for(i=elements-2; i>=0; i--) { | ||||
|     if (ARR(i) < ARR(i+1)) { | ||||
|     if (GT_OR_NAN(ARR(i+1), ARR(i))) { | ||||
|       piv = ARR(i); | ||||
|       pid = IDX(i); | ||||
|       j = i+1; | ||||
| @ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele | ||||
|         ARR(j-1) = ARR(j); | ||||
|         IDX(j-1) = IDX(j); | ||||
|         j = j+1; | ||||
|       } while(j < elements && ARR(j) > piv); | ||||
|       } while(j < elements && GT_OR_NAN(ARR(j), piv)); | ||||
|       ARR(j-1) = piv; | ||||
|       IDX(j-1) = pid; | ||||
|      } | ||||
| @ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i | ||||
|   THLongTensor_free(tmpIndices); | ||||
| } | ||||
|  | ||||
| void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k) | ||||
| { | ||||
|   int64_t t_size_0, t_size_1; | ||||
|   int64_t t_stride_0, t_stride_1; | ||||
|   int64_t r__stride_0, r__stride_1; | ||||
|   scalar_t *t_data, *r__data; | ||||
|   int64_t r, c; | ||||
|  | ||||
|   THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix"); | ||||
|  | ||||
|   THTensor_(resizeAs)(r_, t); | ||||
|  | ||||
|   t_size_0 = THTensor_(size)(t, 0); | ||||
|   t_size_1 = THTensor_(size)(t, 1); | ||||
|   t_stride_0 = THTensor_(stride)(t, 0); | ||||
|   t_stride_1 = THTensor_(stride)(t, 1); | ||||
|   r__stride_0 = THTensor_(stride)(r_, 0); | ||||
|   r__stride_1 = THTensor_(stride)(r_, 1); | ||||
|   r__data = r_->data<scalar_t>(); | ||||
|   t_data = t->data<scalar_t>(); | ||||
|  | ||||
|   for(r = 0; r < t_size_0; r++) | ||||
|   { | ||||
|     int64_t sz = THMin(r+k+1, t_size_1); | ||||
|     for(c = THMax(0, r+k+1); c < t_size_1; c++) | ||||
|       r__data[r*r__stride_0+c*r__stride_1] = 0; | ||||
|     for(c = 0; c < sz; c++) | ||||
|       r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; | ||||
|   } | ||||
| } | ||||
|  | ||||
| void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k) | ||||
| { | ||||
|   int64_t t_size_0, t_size_1; | ||||
|  | ||||
| @ -6,17 +6,17 @@ | ||||
| #include "THCNumerics.cuh" | ||||
|  | ||||
| // Collection of kernel sort routines | ||||
| template <typename T> | ||||
| template <typename T, bool handleNaN = false> | ||||
| struct LTComp { | ||||
|   __device__ inline bool operator()(const T& a, const T& b) const { | ||||
|     return THCNumerics<T>::lt(a, b); | ||||
|     return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| template <typename T, bool handleNaN = false> | ||||
| struct GTComp { | ||||
|   __device__ inline bool operator()(const T& a, const T& b) const { | ||||
|     return THCNumerics<T>::gt(a, b); | ||||
|     return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) { | ||||
| } | ||||
|  | ||||
| template <typename T> | ||||
| __device__ int binarySearchForMultinomial(T* dist, | ||||
| __device__ int binarySearchForMultinomial(T* cumdist, | ||||
|                                           T* dist, | ||||
|                                           int size, | ||||
|                                           T val) { | ||||
|   int start = 0; | ||||
|   int end = size; | ||||
|   // dist[size - 1] = 0 => all zero prob dist | ||||
|   assert(THCNumerics<T>::gt(dist[size - 1], 0)); | ||||
|   // cumdist[size - 1] = 0 => all zero prob dist | ||||
|   assert(THCNumerics<T>::gt(cumdist[size - 1], 0)); | ||||
|  | ||||
|   while (end - start > 0) { | ||||
|     int mid = start + (end - start) / 2; | ||||
|  | ||||
|     T midVal = dist[mid]; | ||||
|     T midVal = cumdist[mid]; | ||||
|     if (THCNumerics<T>::lt(midVal, val)) { | ||||
|       start = mid + 1; | ||||
|     } else { | ||||
| @ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist, | ||||
|     start = size - 1; | ||||
|   } | ||||
|  | ||||
|   T curVal = dist[start]; | ||||
|   while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--; | ||||
|   T curVal = cumdist[start]; | ||||
|   while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--; | ||||
|  | ||||
|   return start; | ||||
| } | ||||
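
The kernel now receives both the normalized prefix sum (cumdist) and the raw distribution (dist): the binary search still walks cumdist to find the first bucket whose cumulative mass reaches the uniform sample, but the final left-walk skips categories whose probability is exactly zero, so they can never be returned even when rounding in the prefix sum (or clamping at the end) would otherwise land on them. A host-side sketch of that lookup follows; the names are stand-ins, not the CUDA kernel.

#include <cstdio>
#include <vector>

int search_bucket(const std::vector<double>& cumdist,
                  const std::vector<double>& dist, double val) {
  int start = 0;
  int end = static_cast<int>(cumdist.size());
  // First index whose cumulative mass is >= val.
  while (end - start > 0) {
    int mid = start + (end - start) / 2;
    if (cumdist[mid] < val) start = mid + 1; else end = mid;
  }
  if (start == static_cast<int>(cumdist.size()))
    start = static_cast<int>(cumdist.size()) - 1;
  // Walk left past zero-probability categories (the change in this hunk).
  while (start >= 1 && dist[start] == 0) --start;
  return start;
}

int main() {
  std::vector<double> dist    = {0.5, 0.5, 0.0, 0.0};
  // Prefix sum whose last entry fell slightly below 1 due to rounding.
  std::vector<double> cumdist = {0.5, 0.9999999, 0.9999999, 0.9999999};
  // A sample of 1.0 overshoots the prefix sum and clamps to the last bucket;
  // the zero-probability tail is then skipped and bucket 1 is returned.
  std::printf("%d\n", search_bucket(cumdist, dist, 1.0));  // 1
  return 0;
}
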
| @ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state, | ||||
|                                  int64_t* dest, | ||||
|                                  int64_t distributions, | ||||
|                                  int categories, | ||||
|                                  T* normDistPrefixSum) { | ||||
|                                  T* normDistPrefixSum, | ||||
|                                  T* normDist) { | ||||
|   // At the moment, each warp computes one sample value in the binary | ||||
|   // search due to divergence. It seems possible to compute multiple | ||||
|   // values and limit divergence though later on. However, no matter | ||||
| @ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state, | ||||
|         // Find the bucket that a uniform sample lies in | ||||
|         int choice = binarySearchForMultinomial<T>( | ||||
|           normDistPrefixSum + curDist * categories, | ||||
|           normDist + curDist * categories, | ||||
|           categories, | ||||
|           r); | ||||
|  | ||||
| @ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state, | ||||
|       // Find the bucket that a uniform sample lies in | ||||
|       int choice = binarySearchForMultinomial<T>( | ||||
|         normDistPrefixSum + curDist * categories, | ||||
|         origDist + curDist * categories, | ||||
|         categories, | ||||
|         r); | ||||
|  | ||||
|  | ||||
| @ -15,17 +15,17 @@ | ||||
| #include <thrust/system/cuda/execution_policy.h> | ||||
| #endif | ||||
|  | ||||
| template <typename T> | ||||
| template <typename T, bool handleNaN = false> | ||||
| struct ThrustGTOp { | ||||
|   __device__ bool operator()(const T& lhs, const T& rhs) const { | ||||
|     return THCNumerics<T>::gt(lhs, rhs); | ||||
|     return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| template <typename T, bool handleNaN = false> | ||||
| struct ThrustLTOp { | ||||
|   __device__ bool operator()(const T& lhs, const T& rhs) const { | ||||
|     return THCNumerics<T>::lt(lhs, rhs); | ||||
|     return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T | ||||
| void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) | ||||
| { | ||||
| #ifdef USE_MAGMA | ||||
|   THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); | ||||
|   THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); | ||||
|   THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square"); | ||||
|   THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible"); | ||||
|  | ||||
|   int64_t n = a_->size(0); | ||||
|   int64_t nrhs = b_->size(1); | ||||
|  | ||||
|  | ||||
| @ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ | ||||
|       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | ||||
|     } | ||||
|   } else { | ||||
|     THCTensor_(resizeAs)(state, self_, src_); | ||||
|  | ||||
|     if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) { | ||||
|       THArgCheck(false, 2, CUTORCH_DIM_WARNING); | ||||
|  | ||||
| @ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state, | ||||
|           n_sample, | ||||
|           THCudaLongTensor_data(state, self), | ||||
|           numDist, numCategories, | ||||
|           THCTensor_(data)(state, prefixSum)); | ||||
|           THCTensor_(data)(state, prefixSum), | ||||
|           THCTensor_(data)(state, normDist)); | ||||
|     } else { | ||||
|       // Sample without replacement | ||||
|  | ||||
|  | ||||
| @ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, | ||||
|     dim3 block(blockSize);                                              \ | ||||
|                                                                         \ | ||||
|     if (dir) {                                                          \ | ||||
|       bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \ | ||||
|       bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \ | ||||
|         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \ | ||||
|           keyInfo,                                                      \ | ||||
|           keySlices,                                                    \ | ||||
| @ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, | ||||
|           (TYPE) keyInfo.strides[collapseKeyDim],                       \ | ||||
|           valueInfo,                                                    \ | ||||
|           (TYPE) valueInfo.strides[collapseValueDim],                   \ | ||||
|           GTComp<scalar_t>());                                              \ | ||||
|           GTComp<scalar_t, true>());                                    \ | ||||
|     } else {                                                            \ | ||||
|       bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \ | ||||
|       bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \ | ||||
|         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \ | ||||
|           keyInfo,                                                      \ | ||||
|           keySlices,                                                    \ | ||||
| @ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, | ||||
|           (TYPE) keyInfo.strides[collapseKeyDim],                       \ | ||||
|           valueInfo,                                                    \ | ||||
|           (TYPE) valueInfo.strides[collapseValueDim],                   \ | ||||
|           LTComp<scalar_t>());                                              \ | ||||
|           LTComp<scalar_t, true>());                                              \ | ||||
|     }                                                                   \ | ||||
|   } while (0) | ||||
|  | ||||
| @ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state, | ||||
| #if CUDA_VERSION >= 7000 | ||||
|       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), | ||||
| #endif | ||||
|       keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>()); | ||||
|       keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>()); | ||||
|   } else { | ||||
|     thrust::stable_sort_by_key( | ||||
| #if CUDA_VERSION >= 7000 | ||||
|       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), | ||||
| #endif | ||||
|       keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>()); | ||||
|       keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>()); | ||||
|   } | ||||
|  | ||||
|   // Then, re-sort according to slice that each index is | ||||
|  | ||||
c10/Half.h (12 changes)
							| @ -383,6 +383,14 @@ struct Converter< | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Some versions of MSVC fail to build with the following compiler error: | ||||
| // C4146: unary minus operator applied to unsigned type, result still unsigned | ||||
| // It can be addressed by disabling that warning around the code below. | ||||
| #ifdef _MSC_VER | ||||
| #pragma warning( push ) | ||||
| #pragma warning( disable : 4146 ) | ||||
| #endif | ||||
|  | ||||
| // skip isnan and isinf check for integral types | ||||
| template <typename To, typename From> | ||||
| typename std::enable_if<std::is_integral<From>::value, bool>::type overflows( | ||||
| @ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows( | ||||
|   } | ||||
| } | ||||
|  | ||||
| #ifdef _MSC_VER | ||||
| #pragma warning( pop ) | ||||
| #endif | ||||
|  | ||||
| template <typename To, typename From> | ||||
| typename std::enable_if<std::is_floating_point<From>::value, bool>::type | ||||
| overflows(From f) { | ||||
|  | ||||
| @ -11,9 +11,11 @@ using c10::intrusive_ptr_target; | ||||
| using c10::make_intrusive; | ||||
| using c10::weak_intrusive_ptr; | ||||
|  | ||||
| #ifndef _MSC_VER | ||||
| #pragma GCC diagnostic ignored "-Wpragmas" | ||||
| #pragma GCC diagnostic ignored "-Wunknown-warning-option" | ||||
| #pragma GCC diagnostic ignored "-Wself-move" | ||||
| #endif | ||||
|  | ||||
| namespace { | ||||
| class SomeClass0Parameters : public intrusive_ptr_target {}; | ||||
|  | ||||
| @ -25,7 +25,7 @@ Error::Error( | ||||
| // Caffe2-style error message | ||||
| Error::Error( | ||||
|     const char* file, | ||||
|     const int line, | ||||
|     const uint32_t line, | ||||
|     const char* condition, | ||||
|     const std::string& msg, | ||||
|     const std::string& backtrace, | ||||
|  | ||||
| @ -49,7 +49,7 @@ class C10_API Error : public std::exception { | ||||
|   Error(SourceLocation source_location, const std::string& msg); | ||||
|   Error( | ||||
|       const char* file, | ||||
|       const int line, | ||||
|       const uint32_t line, | ||||
|       const char* condition, | ||||
|       const std::string& msg, | ||||
|       const std::string& backtrace, | ||||
| @ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e); | ||||
| // TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if | ||||
| // not met. | ||||
|  | ||||
| // In debug builds with MSVC, __LINE__ might be of type long (a.k.a. int32_t), | ||||
| // which differs from the definition of `SourceLocation`, which requires | ||||
| // unsigned int (a.k.a. uint32_t), and may cause a compile error with the message: | ||||
| // error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion | ||||
| // The static cast is used here so the build passes. | ||||
|  | ||||
| #define AT_ERROR(...) \ | ||||
|   throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__)) | ||||
|   throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__)) | ||||
|  | ||||
| #define AT_WARN(...) \ | ||||
|   ::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__)) | ||||
|   ::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__)) | ||||
|  | ||||
| #define AT_ASSERT(cond)                       \ | ||||
|   if (!(cond)) {                              \ | ||||
|  | ||||
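The cast matters because SourceLocation is brace-initialized: with brace initialization, converting a non-constant signed __LINE__ to the uint32_t field is a narrowing conversion, which MSVC rejects as error C2397 in debug builds. A minimal illustration with a stand-in struct (not the real c10::SourceLocation) follows.

#include <cstdint>
#include <cstdio>

struct SourceLocation {
  const char* function;
  const char* file;
  uint32_t line;
};

// Without the cast, `SourceLocation{__func__, __FILE__, __LINE__}` can fail to
// compile on MSVC debug builds, where __LINE__ is not a constant expression and
// the signed-to-unsigned conversion therefore counts as narrowing.
#define MAKE_LOCATION() \
  SourceLocation { __func__, __FILE__, static_cast<uint32_t>(__LINE__) }

int main() {
  SourceLocation loc = MAKE_LOCATION();
  std::printf("%s:%u (%s)\n", loc.file, loc.line, loc.function);
  return 0;
}
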
| @ -17,9 +17,10 @@ | ||||
| #include <utility> | ||||
| #include <type_traits> | ||||
|  | ||||
| #ifndef _MSC_VER | ||||
| #pragma GCC diagnostic push | ||||
| #pragma GCC diagnostic ignored "-Wshadow" | ||||
|  | ||||
| #endif | ||||
| #ifdef _MSC_VER | ||||
| #define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__ | ||||
| #else | ||||
| @ -1457,4 +1458,6 @@ namespace ska | ||||
|  | ||||
| } // end namespace ska | ||||
|  | ||||
| #ifndef _MSC_VER | ||||
| #pragma GCC diagnostic pop | ||||
| #endif | ||||
|  | ||||
| @ -72,18 +72,27 @@ class C10_API intrusive_ptr_target { | ||||
| // We also have to disable -Wunknown-warning-option and -Wpragmas, because | ||||
| // some other compilers don't know about -Wterminate or -Wexceptions and | ||||
| // will show a warning about unknown warning options otherwise. | ||||
| #pragma GCC diagnostic push | ||||
| #pragma GCC diagnostic ignored "-Wpragmas" | ||||
| #pragma GCC diagnostic ignored "-Wunknown-warning-option" | ||||
| #pragma GCC diagnostic ignored "-Wterminate" | ||||
| #pragma GCC diagnostic ignored "-Wexceptions" | ||||
| #ifdef _MSC_VER | ||||
| #  pragma warning(push)   | ||||
| #  pragma warning(disable: 4297) // function assumed not to throw an exception but does   | ||||
| #else   | ||||
| #  pragma GCC diagnostic push   | ||||
| #  pragma GCC diagnostic ignored "-Wpragmas"   | ||||
| #  pragma GCC diagnostic ignored "-Wunknown-warning-option"   | ||||
| #  pragma GCC diagnostic ignored "-Wterminate"   | ||||
| #  pragma GCC diagnostic ignored "-Wexceptions"   | ||||
| #endif | ||||
|     AT_ASSERTM( | ||||
|         refcount_.load() == 0, | ||||
|         "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it"); | ||||
|     AT_ASSERTM( | ||||
|         weakcount_.load() == 0, | ||||
|         "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it"); | ||||
| #pragma GCC diagnostic pop | ||||
| #ifdef _MSC_VER | ||||
| #  pragma warning(pop)   | ||||
| #else   | ||||
| #  pragma GCC diagnostic pop   | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {} | ||||
|  | ||||
| @ -430,12 +430,16 @@ class C10_API TypeMeta { | ||||
|     // variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be | ||||
|     // disabled for compilers that don't know '-Wundefined-var-template' and | ||||
|     // would error at our attempt to disable it. | ||||
| #pragma GCC diagnostic push | ||||
| #pragma GCC diagnostic ignored "-Wpragmas" | ||||
| #pragma GCC diagnostic ignored "-Wunknown-warning-option" | ||||
| #pragma GCC diagnostic ignored "-Wundefined-var-template" | ||||
| #ifndef _MSC_VER   | ||||
| #  pragma GCC diagnostic push   | ||||
| #  pragma GCC diagnostic ignored "-Wpragmas"   | ||||
| #  pragma GCC diagnostic ignored "-Wunknown-warning-option"   | ||||
| #  pragma GCC diagnostic ignored "-Wundefined-var-template"   | ||||
| #endif | ||||
|     return TypeMeta(_typeMetaDataInstance<T>()); | ||||
| #pragma GCC diagnostic pop | ||||
| #ifndef _MSC_VER   | ||||
| #  pragma GCC diagnostic pop   | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|  private: | ||||
|  | ||||
| @ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY) | ||||
|   else() | ||||
|     target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) | ||||
|   endif() | ||||
|  | ||||
|   #cmake only check for separate OpenMP library on AppleClang 7+ | ||||
|   #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 | ||||
|   if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") | ||||
|     if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR | ||||
|         CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") | ||||
|       target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY}) | ||||
|     endif() | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| target_link_libraries(caffe2 PUBLIC c10) | ||||
| target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) | ||||
| target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) | ||||
| @ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") | ||||
| # Set standard properties on the target | ||||
| torch_set_target_props(caffe2) | ||||
|  | ||||
| if (MSVC) | ||||
| target_compile_options(caffe2 INTERFACE "-std=c++11") | ||||
| else() | ||||
| target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>") | ||||
| if (NOT MSVC)  | ||||
|   target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")  | ||||
| endif() | ||||
|  | ||||
| target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") | ||||
|  | ||||
| @ -93,7 +93,7 @@ using std::vector; | ||||
| #define CAFFE2_NORETURN __attribute__((noreturn)) | ||||
| #endif | ||||
|  | ||||
| #if defined(_MSC_VER) | ||||
| #if (defined _MSC_VER && !defined NOMINMAX) | ||||
| #define NOMINMAX | ||||
| #endif | ||||
|  | ||||
|  | ||||
| @ -1,5 +1,8 @@ | ||||
| from __future__ import absolute_import, division, print_function, unicode_literals | ||||
| from caffe2.proto import caffe2_pb2 | ||||
| import os | ||||
| import sys | ||||
| import platform | ||||
| # TODO: refactor & remove the following alias | ||||
| caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU | ||||
| caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA | ||||
| @ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP | ||||
| caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP | ||||
| caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES | ||||
| caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST | ||||
|  | ||||
| if platform.system() == 'Windows': | ||||
|     IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ]) | ||||
|  | ||||
|     if IS_CONDA: | ||||
|         from ctypes import windll, c_wchar_p | ||||
|         from ctypes.wintypes import DWORD, HMODULE | ||||
|  | ||||
|         AddDllDirectory = windll.kernel32.AddDllDirectory | ||||
|         AddDllDirectory.restype = DWORD | ||||
|         AddDllDirectory.argtypes = [c_wchar_p] | ||||
|  | ||||
|     def add_extra_dll_dir(extra_dll_dir): | ||||
|         if os.path.isdir(extra_dll_dir): | ||||
|             os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH'] | ||||
|  | ||||
|             if IS_CONDA: | ||||
|                 AddDllDirectory(extra_dll_dir) | ||||
|  | ||||
|     # first get nvToolsExt PATH | ||||
|     def get_nvToolsExt_path(): | ||||
|         NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt') | ||||
|  | ||||
|         if os.path.exists(NVTOOLEXT_HOME): | ||||
|             return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64') | ||||
|         else: | ||||
|             return '' | ||||
|  | ||||
|     py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin') | ||||
|     th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch') | ||||
|     th_dll_path = os.path.join(th_root, 'lib') | ||||
|  | ||||
|     dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()] | ||||
|  | ||||
|     # then add the path to env | ||||
|     for p in dll_paths: | ||||
|         add_extra_dll_dir(p) | ||||
|  | ||||
| @ -628,37 +628,12 @@ endif() | ||||
|  | ||||
| # ---[ OpenMP | ||||
| if(USE_OPENMP) | ||||
|   set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") | ||||
|   if(APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
|     exec_program(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION) | ||||
|     string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) | ||||
|     message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") | ||||
|     if(DARWIN_VERSION GREATER 9) | ||||
|       set(APPLE_OPENMP_SUCKS 1) | ||||
|     endif(DARWIN_VERSION GREATER 9) | ||||
|     execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion | ||||
|       OUTPUT_VARIABLE GCC_VERSION) | ||||
|     if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) | ||||
|       message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") | ||||
|       message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") | ||||
|       add_compile_options(-Wno-unknown-pragmas) | ||||
|       set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   if(WITH_OPENMP AND NOT CHECKED_OPENMP) | ||||
|     find_package(OpenMP) | ||||
|     set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") | ||||
|  | ||||
|     # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) | ||||
|     # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake | ||||
|     set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") | ||||
|   endif() | ||||
|  | ||||
|   find_package(OpenMP) | ||||
|   if(OPENMP_FOUND) | ||||
|     message(STATUS "Adding " ${OpenMP_CXX_FLAGS}) | ||||
|     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") | ||||
|     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | ||||
|     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") | ||||
|   else() | ||||
|     message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF") | ||||
|     caffe2_update_option(USE_OPENMP OFF) | ||||
| @ -690,7 +665,12 @@ if(USE_CUDA) | ||||
|       caffe2_update_option(USE_NVRTC OFF) | ||||
|     endif() | ||||
|     if(CAFFE2_USE_CUDNN) | ||||
|       list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) | ||||
|       IF(CUDNN_STATIC_LINKAGE) | ||||
|         LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS | ||||
|           caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl") | ||||
|       ELSE() | ||||
|         list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) | ||||
|       ENDIF() | ||||
|     else() | ||||
|       caffe2_update_option(USE_CUDNN OFF) | ||||
|     endif() | ||||
| @ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE) | ||||
|     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) | ||||
|     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) | ||||
|   ENDIF() | ||||
|  | ||||
|   # OpenMP support? | ||||
|   SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") | ||||
|   IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
|     EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION) | ||||
|     STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) | ||||
|     MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") | ||||
|     IF (DARWIN_VERSION GREATER 9) | ||||
|       SET(APPLE_OPENMP_SUCKS 1) | ||||
|     ENDIF (DARWIN_VERSION GREATER 9) | ||||
|     EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion | ||||
|       OUTPUT_VARIABLE GCC_VERSION) | ||||
|     IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) | ||||
|       MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") | ||||
|       MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") | ||||
|       add_compile_options(-Wno-unknown-pragmas) | ||||
|       SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) | ||||
|     ENDIF() | ||||
|   ENDIF() | ||||
|  | ||||
|   IF (WITH_OPENMP AND NOT CHECKED_OPENMP) | ||||
|     FIND_PACKAGE(OpenMP) | ||||
|     SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") | ||||
|  | ||||
|     # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) | ||||
|     # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake | ||||
|     SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") | ||||
|   ENDIF() | ||||
|  | ||||
|   IF (OPENMP_FOUND) | ||||
|     MESSAGE(STATUS "Compiling with OpenMP support") | ||||
|     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") | ||||
|     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | ||||
|   ENDIF() | ||||
|  | ||||
|  | ||||
|   SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) | ||||
|  | ||||
|   FIND_PACKAGE(MAGMA) | ||||
| @ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE) | ||||
|     SET(AT_CUDA_ENABLED 0) | ||||
|   else() | ||||
|     SET(AT_CUDA_ENABLED 1) | ||||
|     find_package(CUDA 5.5 REQUIRED) | ||||
|   endif() | ||||
|  | ||||
|   IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND) | ||||
| @ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE) | ||||
|   SET(AT_MKLDNN_ENABLED 0) | ||||
|   SET(CAFFE2_USE_MKLDNN OFF) | ||||
|   IF (USE_MKLDNN) | ||||
|     FIND_PACKAGE(MKLDNN) | ||||
|     INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake) | ||||
|     IF(MKLDNN_FOUND) | ||||
|       SET(AT_MKLDNN_ENABLED 1) | ||||
|       INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR}) | ||||
|       INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR}) | ||||
|       IF(BUILD_CAFFE2_OPS) | ||||
|         SET(CAFFE2_USE_MKLDNN ON) | ||||
|         LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn) | ||||
|  | ||||
| @ -2,7 +2,6 @@ | ||||
| # | ||||
| # The following variables are optionally searched for defaults | ||||
| #  MKL_FOUND             : set to true if a library implementing the CBLAS interface is found | ||||
| #  USE_MKLDNN | ||||
| # | ||||
| # The following are set after configuration is done: | ||||
| #  MKLDNN_FOUND          : set to true if mkl-dnn is found. | ||||
| @ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND) | ||||
| SET(MKLDNN_LIBRARIES) | ||||
| SET(MKLDNN_INCLUDE_DIR) | ||||
|  | ||||
| IF (NOT USE_MKLDNN) | ||||
|   RETURN() | ||||
| ENDIF(NOT USE_MKLDNN) | ||||
|  | ||||
| IF(MSVC) | ||||
|   MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far") | ||||
|   RETURN() | ||||
| @ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) | ||||
| LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR}) | ||||
|  | ||||
| IF(MKL_FOUND) | ||||
|   # Append to mkldnn dependencies | ||||
|   LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES}) | ||||
|   LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR}) | ||||
|   # The OMP-related variables of MKL-DNN have to be overwritten here, | ||||
|   # if MKL is used, and the OMP version is defined by MKL. | ||||
|   # MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL. | ||||
|   # INTEL_MKL_DIR gives the MKL root path. | ||||
|   IF (INTEL_MKL_DIR) | ||||
|     SET(MKLROOT ${INTEL_MKL_DIR}) | ||||
|     IF(WIN32) | ||||
|       SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE) | ||||
|     ELSE(WIN32) | ||||
|       IF (MKL_LIBRARIES_gomp_LIBRARY) | ||||
|         SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY}) | ||||
|       ELSE(MKL_LIBRARIES_gomp_LIBRARY) | ||||
|         SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY}) | ||||
|       ENDIF(MKL_LIBRARIES_gomp_LIBRARY) | ||||
|       SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE) | ||||
|     ENDIF(WIN32) | ||||
|   ELSE(INTEL_MKL_DIR) | ||||
|     MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!") | ||||
|   ENDIF(INTEL_MKL_DIR) | ||||
|  | ||||
| ELSE(MKL_FOUND) | ||||
|   # If we cannot find MKL, we will use the Intel MKL Small library | ||||
|   # comes with ${MKLDNN_ROOT}/external | ||||
| @ -75,60 +51,65 @@ ELSE(MKL_FOUND) | ||||
|   ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external) | ||||
|  | ||||
|   FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h) | ||||
|   IF(MKLML_INNER_INCLUDE_DIR) | ||||
|     # if user has multiple version under external/ then guess last | ||||
|     # one alphabetically is "latest" and warn | ||||
|     LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN) | ||||
|     IF(MKLINCLEN GREATER 1) | ||||
|       LIST(SORT MKLML_INNER_INCLUDE_DIR) | ||||
|       LIST(REVERSE MKLML_INNER_INCLUDE_DIR) | ||||
|       LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST) | ||||
|       SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}") | ||||
|     ENDIF(MKLINCLEN GREATER 1) | ||||
|     GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY) | ||||
|     LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR}) | ||||
|   IF(NOT MKLML_INNER_INCLUDE_DIR) | ||||
|     MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support") | ||||
|     RETURN() | ||||
|   ENDIF(NOT MKLML_INNER_INCLUDE_DIR) | ||||
|   # if user has multiple version under external/ then guess last | ||||
|   # one alphabetically is "latest" and warn | ||||
|   LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN) | ||||
|   IF(MKLINCLEN GREATER 1) | ||||
|     LIST(SORT MKLML_INNER_INCLUDE_DIR) | ||||
|     LIST(REVERSE MKLML_INNER_INCLUDE_DIR) | ||||
|     LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST) | ||||
|     SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}") | ||||
|   ENDIF(MKLINCLEN GREATER 1) | ||||
|   GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY) | ||||
|   LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR}) | ||||
|  | ||||
|     IF(APPLE) | ||||
|       SET(__mklml_inner_libs mklml iomp5) | ||||
|     ELSE(APPLE) | ||||
|       SET(__mklml_inner_libs mklml_intel iomp5) | ||||
|     ENDIF(APPLE) | ||||
|  | ||||
|     FOREACH(__mklml_inner_lib ${__mklml_inner_libs}) | ||||
|       STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper) | ||||
|       FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY | ||||
|             NAMES ${__mklml_inner_lib} | ||||
|             PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib" | ||||
|             DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library") | ||||
|       MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY) | ||||
|       LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY}) | ||||
|     ENDFOREACH(__mklml_inner_lib) | ||||
|   ENDIF(MKLML_INNER_INCLUDE_DIR) | ||||
|   IF(APPLE) | ||||
|     SET(__mklml_inner_libs mklml iomp5) | ||||
|   ELSE(APPLE) | ||||
|     SET(__mklml_inner_libs mklml_intel iomp5) | ||||
|   ENDIF(APPLE) | ||||
|   FOREACH(__mklml_inner_lib ${__mklml_inner_libs}) | ||||
|     STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper) | ||||
|     FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY | ||||
|           NAMES ${__mklml_inner_lib} | ||||
|           PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib" | ||||
|           DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library") | ||||
|     MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY) | ||||
|     IF(NOT ${__mklml_inner_lib_upper}_LIBRARY) | ||||
|       MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support") | ||||
|       RETURN() | ||||
|     ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY) | ||||
|     LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY}) | ||||
|   ENDFOREACH(__mklml_inner_lib) | ||||
| ENDIF(MKL_FOUND) | ||||
|  | ||||
| LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES) | ||||
| LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR) | ||||
| INCLUDE(FindPackageHandleStandardArgs) | ||||
| find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for}) | ||||
| IF(MKL_FOUND) | ||||
|   SET(MKL_cmake_included TRUE) | ||||
|   SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE) | ||||
| ENDIF(MKL_FOUND) | ||||
| SET(WITH_TEST FALSE CACHE BOOL "" FORCE) | ||||
| SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE) | ||||
| SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) | ||||
| ADD_SUBDIRECTORY(${MKLDNN_ROOT}) | ||||
| IF(NOT TARGET mkldnn) | ||||
|   MESSAGE("Failed to include MKL-DNN target") | ||||
|   RETURN() | ||||
| ENDIF(NOT TARGET mkldnn) | ||||
| IF(MKL_FOUND) | ||||
|   TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL) | ||||
| ENDIF(MKL_FOUND) | ||||
| IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
|   TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized) | ||||
|   TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow) | ||||
|   TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow) | ||||
| ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
| LIST(APPEND MKLDNN_LIBRARIES mkldnn) | ||||
|  | ||||
| IF(MKLDNN_FOUND) | ||||
|   IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
|     ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized) | ||||
|   ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) | ||||
|   SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE) | ||||
|   SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE) | ||||
|   ADD_SUBDIRECTORY(${MKLDNN_ROOT}) | ||||
|   SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}") | ||||
|   IF(WIN32) | ||||
|     LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}") | ||||
|   ELSE(WIN32) | ||||
|     LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}") | ||||
|   ENDIF(WIN32) | ||||
| ELSE(MKLDNN_FOUND) | ||||
|   MESSAGE(STATUS "MKLDNN source files not found!") | ||||
| ENDIF(MKLDNN_FOUND) | ||||
|  | ||||
| UNSET(__mklml_inner_libs) | ||||
| UNSET(__mkldnn_looked_for) | ||||
| SET(MKLDNN_FOUND TRUE) | ||||
| MESSAGE(STATUS "Found MKL-DNN: TRUE") | ||||
|  | ||||
| ENDIF(NOT MKLDNN_FOUND) | ||||
|  | ||||
| @ -9,6 +9,12 @@ endif() | ||||
| # release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache. | ||||
| list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix) | ||||
|  | ||||
| # We don't want to statically link cudart, because we rely on its dynamic linkage in | ||||
| # python (follow along torch/cuda/__init__.py and the usage of cudaGetErrorName). | ||||
| # Technically, we could link cudart here statically and link libtorch_python.so | ||||
| # against a dynamic libcudart.so, but that's just wasteful. | ||||
| SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "") | ||||
|  | ||||
| # Find CUDA. | ||||
| find_package(CUDA 7.0) | ||||
| if(NOT CUDA_FOUND) | ||||
| @ -89,6 +95,9 @@ endif() | ||||
|  | ||||
| if(DEFINED ENV{CUDNN_LIBRARY}) | ||||
|   set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY}) | ||||
|   if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a") | ||||
|     SET(CUDNN_STATIC_LINKAGE ON) | ||||
|   endif() | ||||
| else() | ||||
|   find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME} | ||||
|     HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} | ||||
| @ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN) | ||||
|         "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") | ||||
|   endif() | ||||
|   message(STATUS "Found cuDNN: v${CUDNN_VERSION}  (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})") | ||||
|   if(CUDNN_VERSION VERSION_LESS "7.0.0") | ||||
|     message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.") | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| # ---[ CUDA libraries wrapper | ||||
| @ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED) | ||||
| if(CAFFE2_STATIC_LINK_CUDA) | ||||
|     set_property( | ||||
|         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt) | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl) | ||||
| else() | ||||
|     set_property( | ||||
|         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|  | ||||
							
								
								
									
docs/source/community/contribution_guide.rst (new file, 917 lines)
| @ -0,0 +1,917 @@ | ||||
| PyTorch Contribution Guide | ||||
| ========================== | ||||
|  | ||||
| PyTorch is a GPU-accelerated Python tensor computation package for | ||||
| building deep neural networks, built on a tape-based autograd system. | ||||
|  | ||||
| The PyTorch Contribution Process | ||||
| -------------------------------- | ||||
|  | ||||
| The PyTorch organization is governed by `PyTorch | ||||
| Governance </docs/community/governance.html>`__. | ||||
|  | ||||
| The PyTorch development process involves a healthy amount of open | ||||
| discussions between the core development team and the community. | ||||
|  | ||||
| PyTorch operates similar to most open source projects on GitHub. | ||||
| However, if you've never contributed to an open source project before, | ||||
| here is the basic process. | ||||
|  | ||||
| -  **Figure out what you're going to work on.** The majority of open | ||||
|    source contributions come from people scratching their own itches. | ||||
|    However, if you don't know what you want to work on, or are just | ||||
|    looking to get more acquainted with the project, here are some tips | ||||
|    for how to find appropriate tasks: | ||||
|  | ||||
|    -  Look through the `issue | ||||
|       tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if | ||||
|       there are any issues you know how to fix. Issues that are | ||||
|       confirmed by other contributors tend to be better to investigate. | ||||
|       We also maintain some labels for issues which are likely to be | ||||
|       good for new people, e.g., **bootcamp** and **1hr**, although | ||||
|       these labels are less well maintained. | ||||
|    -  Join us on Slack and let us know you're interested in getting to | ||||
|       know PyTorch. We're very happy to help out researchers and | ||||
|       partners get up to speed with the codebase. | ||||
|  | ||||
| -  **Figure out the scope of your change and reach out for design | ||||
|    comments on a GitHub issue if it's large.** The majority of pull | ||||
|    requests are small; in that case, no need to let us know about what | ||||
|    you want to do, just get cracking. But if the change is going to be | ||||
|    large, it's usually a good idea to get some design comments about it | ||||
|    first. | ||||
|  | ||||
|    -  If you don't know how big a change is going to be, we can help you | ||||
|       figure it out! Just post about it on issues or Slack. | ||||
|    -  Some feature additions are very standardized; for example, lots of | ||||
|       people add new operators or optimizers to PyTorch. Design | ||||
|       discussion in these cases boils down mostly to, “Do we want this | ||||
|       operator/optimizer?” Giving evidence for its utility, e.g., usage | ||||
|       in peer reviewed papers, or existence in other frameworks, helps a | ||||
|       bit when making this case. | ||||
|    -  Core changes and refactors can be quite difficult to coordinate, | ||||
|       as the pace of development on PyTorch master is quite fast. | ||||
|       Definitely reach out about fundamental or cross-cutting changes; | ||||
|       we can often give guidance about how to stage such changes into | ||||
|       more easily reviewable pieces. | ||||
|  | ||||
| -  **Code it out!** | ||||
|  | ||||
|    -  See the technical guide for advice for working with PyTorch in a | ||||
|       technical form. | ||||
|  | ||||
| -  **Open a pull request.** | ||||
|  | ||||
|    -  If you are not ready for the pull request to be reviewed, tag it | ||||
|       with [WIP]. We will ignore it when doing review passes. If you are | ||||
|       working on a complex change, it's good to start things off as WIP, | ||||
|       because you will need to spend time looking at CI results to see | ||||
|       if things worked out or not. | ||||
|    -  Find an appropriate reviewer for your change. We have some folks | ||||
|       who regularly go through the PR queue and try to review | ||||
|       everything, but if you happen to know who the maintainer for a | ||||
|       given subsystem affected by your patch is, feel free to include | ||||
|       them directly on the pull request. You can learn more about this | ||||
|       structure at PyTorch Subsystem Ownership. | ||||
|  | ||||
| -  **Iterate on the pull request until it's accepted!** | ||||
|  | ||||
|    -  We'll try our best to minimize the number of review roundtrips and | ||||
|       block PRs only when there are major issues. For the most common | ||||
|       issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__. | ||||
|    -  Once a pull request is accepted and CI is passing, there is | ||||
|       nothing else you need to do; we will merge the PR for you. | ||||
|  | ||||
| Getting Started | ||||
| --------------- | ||||
|  | ||||
| Proposing new features | ||||
| ~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| New feature ideas are best discussed on a specific issue. Please include | ||||
| as much information as you can, any accompanying data, and your proposed | ||||
| solution. The PyTorch team and community frequently review new issues | ||||
| and comment where they think they can help. If you feel confident in | ||||
| your solution, go ahead and implement it. | ||||
|  | ||||
| Reporting Issues | ||||
| ~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| If you've identified an issue, first search through the `list of | ||||
| existing issues <https://github.com/pytorch/pytorch/issues>`__ on the | ||||
| repo. If you are unable to find a similar issue, then create a new one. | ||||
| Supply as much information as you can to reproduce the problematic | ||||
| behavior. Also, include any additional insights like the behavior you | ||||
| expect. | ||||
|  | ||||
| Implementing Features or Fixing Bugs | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| If you want to fix a specific issue, it's best to comment on the | ||||
| individual issue with your intent. However, we do not lock or assign | ||||
| issues except in cases where we have worked with the developer before. | ||||
| It's best to strike up a conversation on the issue and discuss your | ||||
| proposed solution. The PyTorch team can provide guidance that saves you | ||||
| time. | ||||
|  | ||||
| Issues that are labeled first-new-issue, low, or medium priority provide | ||||
| the best entry points and are great places to start. | ||||
|  | ||||
| Adding Tutorials | ||||
| ~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__ | ||||
| come from the community itself, and we welcome additional contributions. | ||||
| To learn how to contribute a new tutorial, see the `PyTorch.org Tutorial | ||||
| Contribution Guide on | ||||
| Github <https://github.com/pytorch/tutorials/#contributing>`__. | ||||
|  | ||||
| Improving Documentation & Tutorials | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| We aim to produce high-quality documentation and tutorials. On rare | ||||
| occasions, that content may include typos or bugs. If you find something | ||||
| you can fix, send us a pull request for consideration. | ||||
|  | ||||
| Take a look at the `Documentation <#on-documentation>`__ section to learn how our system | ||||
| works. | ||||
|  | ||||
| Participating in online discussions | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| You can find active discussions happening on the PyTorch Discussion | ||||
| `forum <https://discuss.pytorch.org/>`__. | ||||
|  | ||||
| Submitting pull requests to fix open issues | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| You can view a list of all open issues | ||||
| `here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an | ||||
| issue is a great way to get the attention of the team. From here you can | ||||
| share your ideas and how you plan to resolve the issue. | ||||
|  | ||||
| For more challenging issues, the team will provide feedback and | ||||
| direction for how to best solve the issue. | ||||
|  | ||||
| If you're not able to fix the issue itself, commenting and sharing | ||||
| whether you can reproduce the issue can be useful for helping the team | ||||
| identify problem areas. | ||||
|  | ||||
| Reviewing open pull requests | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| We appreciate your help reviewing and commenting on pull requests. Our | ||||
| team strives to keep the number of open pull requests at a manageable | ||||
| size: we respond quickly when we need more information, and we merge PRs | ||||
| that we think are useful. However, due to the high level of interest, | ||||
| additional eyes on pull requests are appreciated. | ||||
|  | ||||
| Improving code readability | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Improving code readability helps everyone. It is often better to submit a | ||||
| small number of pull requests that touch few files versus a large pull | ||||
| request that touches many files. Starting a discussion in the PyTorch | ||||
| forum `here <https://discuss.pytorch.org/>`__ or on an issue related to | ||||
| your improvement is the best way to get started. | ||||
|  | ||||
| Adding test cases to make the codebase more robust | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Additional test coverage is appreciated. | ||||
|  | ||||
| Promoting PyTorch | ||||
| ~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Your use of PyTorch in your projects, research papers, write ups, blogs, | ||||
| or general discussions around the internet helps to raise awareness for | ||||
| PyTorch and our growing community. Please reach out to | ||||
| `pytorch-marketing@fb.com <http://mailto:pytorch-marketing@fb.com/>`__ | ||||
| for marketing support. | ||||
|  | ||||
| Triaging issues | ||||
| ~~~~~~~~~~~~~~~ | ||||
|  | ||||
| If you feel that an issue could benefit from a particular tag or level | ||||
| of complexity, comment on the issue and share your opinion. If you feel | ||||
| an issue isn't categorized properly, comment and let the team know. | ||||
|  | ||||
| About open source development | ||||
| ----------------------------- | ||||
|  | ||||
| If this is your first time contributing to an open source project, some | ||||
| aspects of the development process may seem unusual to you. | ||||
|  | ||||
| -  **There is no way to “claim” issues.** People often want to “claim” | ||||
|    an issue when they decide to work on it, to ensure that there isn't | ||||
|    wasted work when someone else ends up working on it. This doesn't | ||||
|    really work too well in open source, since someone may decide to work | ||||
|    on something, and end up not having time to do it. Feel free to give | ||||
|    information in an advisory fashion, but at the end of the day, we | ||||
|    will take running code and rough consensus. | ||||
| -  **There is a high bar for new functionality that is added.** Unlike | ||||
|    in a corporate environment, where the person who wrote code | ||||
|    implicitly “owns” it and can be expected to take care of it in the | ||||
|    beginning of its lifetime, once a pull request is merged into an open | ||||
|    source project, it immediately becomes the collective responsibility | ||||
|    of all maintainers on the project. When we merge code, we are saying | ||||
|    that we, the maintainers, are able to review subsequent changes and | ||||
|    make a bugfix to the code. This naturally leads to a higher standard | ||||
|    of contribution. | ||||
|  | ||||
| Common Mistakes To Avoid | ||||
| ------------------------ | ||||
|  | ||||
| -  **Did you add tests?** (Or if the change is hard to test, did you | ||||
|    describe how you tested your change?) | ||||
|  | ||||
|    -  We have a few motivations for why we ask for tests: | ||||
|  | ||||
|       1. to help us tell if we break it later | ||||
|       2. to help us tell if the patch is correct in the first place | ||||
|          (yes, we did review it, but as Knuth says, “beware of the | ||||
|          following code, for I have not run it, merely proven it | ||||
|          correct”) | ||||
|  | ||||
|    -  When is it OK not to add a test? Sometimes a change can't be | ||||
|       conveniently tested, or the change is so obviously correct (and | ||||
|       unlikely to be broken) that it's OK not to test it. Conversely, | ||||
|       if a change seems likely (or is known to be likely) | ||||
|       to be accidentally broken, it's important to put in the time to | ||||
|       work out a testing strategy. | ||||
|  | ||||
| -  **Is your PR too long?** | ||||
|  | ||||
|    -  It's easier for us to review and merge small PRs. Difficulty of | ||||
|       reviewing a PR scales nonlinearly with its size. | ||||
|    -  When is it OK to submit a large PR? It helps a lot if there was a | ||||
|       corresponding design discussion in an issue, with sign off from | ||||
|       the people who are going to review your diff. We can also help | ||||
|       give advice about how to split up a large change into individually | ||||
|       shippable parts. Similarly, it helps if there is a complete | ||||
|       description of the contents of the PR: it's easier to review code | ||||
|       if we know what's inside! | ||||
|  | ||||
| -  **Comments for subtle things?** In cases where behavior of your code | ||||
|    is nuanced, please include extra comments and documentation to allow | ||||
|    us to better understand the intention of your code. | ||||
| -  **Did you add a hack?** Sometimes a hack is the right answer. But | ||||
|    usually we will have to discuss it. | ||||
| -  **Do you want to touch a very core component?** In order to prevent | ||||
|    major regressions, pull requests that touch core components receive | ||||
|    extra scrutiny. Make sure you've discussed your changes with the team | ||||
|    before undertaking major changes. | ||||
| -  **Want to add a new feature?** If you want to add new features, | ||||
|    comment your intention on the related issue. Our team tries to | ||||
|    comment on and provide feedback to the community. It's better to have | ||||
|    an open discussion with the team and the rest of the community prior | ||||
|    to building new features. This helps us stay aware of what you're | ||||
|    working on and increases the chance that it'll be merged. | ||||
| -  **Did you touch unrelated code to the PR?** To aid in code review, | ||||
|    please only include files in your pull request that are directly | ||||
|    related to your changes. | ||||
|  | ||||
| Frequently asked questions | ||||
| -------------------------- | ||||
|  | ||||
| -  **How can I contribute as a reviewer?** There is a lot of value in | ||||
|    community developers reproducing issues, trying out new functionality, | ||||
|    or otherwise helping us identify or troubleshoot issues. Commenting on | ||||
|    tasks or pull requests with your environment details is helpful and | ||||
|    appreciated. | ||||
| -  **CI tests failed, what does it mean?** Maybe you need to merge with | ||||
|    master or rebase on the latest changes. Pushing your changes should | ||||
|    re-trigger CI tests. If the failures persist, you'll want to trace | ||||
|    through the error messages and resolve the related issues. | ||||
| -  **What are the most high-risk changes?** Anything that touches the | ||||
|    build configuration is a risky area. Please avoid changing it unless | ||||
|    you've had a discussion with the team beforehand. | ||||
| -  **Hey, a commit showed up on my branch, what's up with that?** | ||||
|    Sometimes another community member will provide a patch or fix to | ||||
|    your pull request or branch. This is often needed for getting CI tests | ||||
|    to pass. | ||||
|  | ||||
| On Documentation | ||||
| ---------------- | ||||
|  | ||||
| Python Docs | ||||
| ~~~~~~~~~~~ | ||||
|  | ||||
| PyTorch documentation is generated from python source using | ||||
| `Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is | ||||
| copied to the docs folder in the master branch of | ||||
| `pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__, | ||||
| and is served via GitHub pages. | ||||
|  | ||||
| -  Site: http://pytorch.org/docs | ||||
| -  GitHub: http://github.com/pytorch/pytorch/docs | ||||
| -  Served from: | ||||
|    `https://github.com/pytorch/pytorch.github.io/tree/master/docs <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__ | ||||
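|  | ||||
| To preview documentation changes locally, you can typically build the | ||||
| HTML with Sphinx from the ``docs/`` folder. A minimal sketch, assuming | ||||
| the ``Makefile`` and ``requirements.txt`` that ship under ``docs/``: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     cd docs | ||||
|     pip install -r requirements.txt | ||||
|     make html   # built pages land in the Sphinx build directory | ||||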
|  | ||||
| C++ Docs | ||||
| ~~~~~~~~ | ||||
|  | ||||
| For C++ code we use Doxygen to generate the content files. The C++ docs | ||||
| are built on a special server and the resulting files are copied to the | ||||
| https://github.com/pytorch/cppdocs repo, and are served from GitHub | ||||
| pages. | ||||
|  | ||||
| -  Site: http://pytorch.org/cppdocs | ||||
| -  GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp | ||||
| -  Served from: https://github.com/pytorch/cppdocs | ||||
|  | ||||
| Tutorials | ||||
| --------- | ||||
|  | ||||
| PyTorch tutorials are documents used to help understand using PyTorch to | ||||
| accomplish specific tasks or to understand more holistic concepts. | ||||
| Tutorials are built using | ||||
| `Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__ | ||||
| from executable Python source files, or from reStructuredText (rst) | ||||
| files. | ||||
|  | ||||
| -  Site: http://pytorch.org/tutorials | ||||
| -  GitHub: http://github.com/pytorch/tutorials | ||||
|  | ||||
| Tutorials Build Overview | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| For tutorials, `pull | ||||
| requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a | ||||
| rebuild of the entire site using CircleCI to test the effects of the | ||||
| change. This build is sharded into 9 worker builds and takes around 40 | ||||
| minutes total. At the same time, we do a Netlify build using *make | ||||
| html-noplot*, which builds the site without rendering the notebook | ||||
| output into pages for quick review. | ||||
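|  | ||||
| If you want a similarly quick preview while writing a tutorial, a | ||||
| minimal local sketch (assuming the ``Makefile`` and ``requirements.txt`` | ||||
| in the tutorials repo) is: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     git clone https://github.com/pytorch/tutorials | ||||
|     cd tutorials | ||||
|     pip install -r requirements.txt | ||||
|     make html-noplot   # builds the site without executing the notebooks | ||||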
|  | ||||
| After a PR is accepted, the site is rebuilt and deployed from CircleCI. | ||||
|  | ||||
| Contributing a new Tutorial | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| `PyTorch.org Tutorial Contribution | ||||
| Guide <https://github.com/pytorch/tutorials/#contributing>`__ | ||||
|  | ||||
| Code Style | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
| **Python style** | ||||
|  | ||||
| **C++ style** | ||||
|  | ||||
| Submitting a Pull Request | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| PyTorch development happens publicly on our Github repo. | ||||
|  | ||||
| To have your feature or fix added to PyTorch, please submit a Pull | ||||
| Request. | ||||
|  | ||||
| Running Tests | ||||
| ~~~~~~~~~~~~~ | ||||
|  | ||||
| For example (these commands are covered in more detail in the Unit | ||||
| Testing section later in this guide): | ||||
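|  | ||||
| :: | ||||
|  | ||||
|     python test/run_test.py             # run the entire test suite | ||||
|     python test/test_nn.py              # run a single test file | ||||
|     pytest test/test_nn.py -k Loss -v   # optionally filter by name with pytest | ||||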
|  | ||||
| Technical Process | ||||
| ----------------- | ||||
|  | ||||
| Developing PyTorch | ||||
| ~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| To develop PyTorch on your machine, here are some tips: | ||||
|  | ||||
| 1. Uninstall all existing PyTorch installs: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     conda uninstall pytorch | ||||
|     pip uninstall torch | ||||
|     pip uninstall torch # run this command twice | ||||
|  | ||||
| 2. Clone a copy of PyTorch from source: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     git clone https://github.com/pytorch/pytorch | ||||
|     cd pytorch | ||||
|  | ||||
| 3. Install PyTorch in ``build develop`` mode: | ||||
|  | ||||
| A full set of instructions on installing PyTorch from source is here: | ||||
| https://github.com/pytorch/pytorch#from-source | ||||
|  | ||||
| The change you have to make is to replace | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python setup.py install | ||||
|  | ||||
| with | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python setup.py build develop | ||||
|  | ||||
| This is especially useful if you are only changing Python files. | ||||
|  | ||||
| This mode will symlink the Python files from the current local source | ||||
| tree into the Python install. | ||||
|  | ||||
| Hence, if you modify a Python file, you do not need to reinstall PyTorch | ||||
| again and again. | ||||
|  | ||||
| For example: | ||||
|  | ||||
| -  Install local PyTorch in ``build develop`` mode | ||||
| -  modify your Python file ``torch/__init__.py`` (for example) | ||||
| -  test functionality | ||||
| -  modify your Python file ``torch/__init__.py`` | ||||
| -  test functionality | ||||
| -  modify your Python file ``torch/__init__.py`` | ||||
| -  test functionality | ||||
|  | ||||
| You do not need to repeatedly install after modifying Python files. | ||||
|  | ||||
| In case you want to reinstall, make sure that you uninstall PyTorch | ||||
| first by running ``pip uninstall torch`` and ``python setup.py clean``. | ||||
| Then you can install in ``build develop`` mode again. | ||||
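|  | ||||
| In other words, a clean reinstall looks like: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     pip uninstall torch | ||||
|     python setup.py clean | ||||
|     python setup.py build develop | ||||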
|  | ||||
| Codebase structure | ||||
| ------------------ | ||||
|  | ||||
| -  `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core | ||||
|    library files that work everywhere, both server and mobile. We are | ||||
|    slowly moving pieces from | ||||
|    `ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__ | ||||
|    here. This library is intended only to contain essential | ||||
|    functionality, and appropriate to use in settings where binary size | ||||
|    matters. (But you'll have a lot of missing functionality if you try | ||||
|    to use it directly.) | ||||
| -  `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++ | ||||
|    tensor library for PyTorch (no autograd support) | ||||
|  | ||||
|    -  `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__ | ||||
|  | ||||
|       -  `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__ | ||||
|          `THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__ | ||||
|          `THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__ | ||||
|          `THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__ | ||||
|          - Legacy library code from the original Torch. Try not to add | ||||
|          things here; we're slowly porting these to | ||||
|          `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__. | ||||
|  | ||||
|          -  generic - Contains actual implementations of operators, | ||||
|             parametrized over ``scalar_t``. Files here get compiled N | ||||
|             times per supported scalar type in PyTorch. | ||||
|  | ||||
|       -  `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__ | ||||
|  | ||||
|          -  `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__ | ||||
|             - Core functionality of ATen. This is migrating to top-level | ||||
|             c10 folder. | ||||
|          -  `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__ | ||||
|             - Modern implementations of operators. If you want to write | ||||
|             a new operator, here is where it should go. Most CPU | ||||
|             operators go in the top level directory, except for | ||||
|             operators which need to be compiled specially; see cpu | ||||
|             below. | ||||
|  | ||||
|             -  `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__ | ||||
|                - Not actually CPU implementations of operators, but | ||||
|                specifically implementations which are compiled with | ||||
|                processor-specific instructions, like AVX. See the | ||||
|                `README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__ | ||||
|                for more details. | ||||
|             -  `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__ | ||||
|                - CUDA implementations of operators. | ||||
|             -  `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__ | ||||
|                - CPU and CUDA implementations of COO sparse tensor | ||||
|                operations | ||||
|             -  `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__ | ||||
|                `mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__ | ||||
|                `miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__ | ||||
|                `cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__ | ||||
|  | ||||
|                -  implementations of operators which simply bind to some | ||||
|                   backend library. | ||||
|  | ||||
| -  `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ - | ||||
|    The actual PyTorch library. Everything that is not in | ||||
|    `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__ | ||||
|    is a Python module, following the PyTorch Python frontend module | ||||
|    structure. | ||||
|  | ||||
|    -  `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__ | ||||
|       - C++ files composing the PyTorch library. Files in this directory | ||||
|       tree are a mix of Python binding code, and C++ heavy lifting. | ||||
|       Consult ``setup.py`` for the canonical list of Python binding | ||||
|       files; conventionally, they are often prefixed with ``python_``. | ||||
|  | ||||
|       -  `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__ | ||||
|          - Compiler and frontend for the TorchScript JIT. | ||||
|       -  `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__ | ||||
|          - Implementation of reverse-mode automatic differentiation. | ||||
|       -  `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__ | ||||
|          - The PyTorch C++ frontend. | ||||
|       -  `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__ | ||||
|          - Distributed training support for PyTorch. | ||||
|  | ||||
| -  `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ - | ||||
|    Code generation scripts for the PyTorch library. See | ||||
|    `README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__ | ||||
|    of this directory for more details. | ||||
| -  `test <https://github.com/pytorch/pytorch/blob/master/test>`__ - | ||||
|    Python unit tests for PyTorch Python frontend. | ||||
|  | ||||
|    -  `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__ | ||||
|       - Basic tests for PyTorch functionality. | ||||
|    -  `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__ | ||||
|       - Tests for non-NN automatic differentiation support. | ||||
|    -  `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__ | ||||
|       - Tests for NN operators and their automatic differentiation. | ||||
|    -  `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__ | ||||
|       - Tests for the JIT compiler and TorchScript. | ||||
|    -  ... | ||||
|    -  `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__ | ||||
|       - C++ unit tests for PyTorch C++ frontend. | ||||
|    -  `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__ | ||||
|       - Automatically generated "expect" files which are used to compare | ||||
|       against expected output. | ||||
|    -  `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__ | ||||
|       - Tests for ONNX export functionality, using both PyTorch and | ||||
|       Caffe2. | ||||
|  | ||||
| -  `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ - | ||||
|    The Caffe2 library. | ||||
|  | ||||
|    -  `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__ | ||||
|       - Core files of Caffe2, e.g., tensor, workspace, blobs, etc. | ||||
|    -  `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__ | ||||
|       - Operators of Caffe2. | ||||
|    -  `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__ | ||||
|       - Python bindings to Caffe2. | ||||
|    -  ... | ||||
|  | ||||
| Unit Testing | ||||
| ------------ | ||||
|  | ||||
| PyTorch's testing is located under ``test/``. Run the entire test suite | ||||
| with | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python test/run_test.py | ||||
|  | ||||
| or run individual test files, like ``python test/test_nn.py``, for | ||||
| individual test suites. | ||||
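|  | ||||
| Because the test files are plain ``unittest`` suites, you can also pass | ||||
| a test class or method name on the command line to run just that test | ||||
| (the names below are illustrative): | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python test/test_torch.py TestTorch              # one test class | ||||
|     python test/test_torch.py TestTorch.test_add -v  # one test method, verbose | ||||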
|  | ||||
| Better local unit tests with pytest | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| We don't officially support ``pytest``, but it works well with our | ||||
| ``unittest`` tests and offers a number of useful features for local | ||||
| development. Install it via ``pip install pytest``. | ||||
|  | ||||
| If you want to just run tests that contain a specific substring, you can | ||||
| use the ``-k`` flag: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     pytest test/test_nn.py -k Loss -v | ||||
|  | ||||
| The above is an example of testing a change to Loss functions: this | ||||
| command runs tests such as ``TestNN.test_BCELoss`` and | ||||
| ``TestNN.test_MSELoss``, and can be useful to save keystrokes. | ||||
|  | ||||
| Writing documentation | ||||
| --------------------- | ||||
|  | ||||
| PyTorch uses `Google | ||||
| style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__ | ||||
| for formatting docstrings. Lines inside docstring blocks must be | ||||
| limited to 80 characters so they fit into Jupyter documentation popups. | ||||
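|  | ||||
| As a rough sketch, a Google-style docstring looks like the following | ||||
| (the function is purely illustrative, not an actual PyTorch API): | ||||
|  | ||||
| :: | ||||
|  | ||||
|     def scale(tensor, factor): | ||||
|         r"""Multiplies every element of ``tensor`` by ``factor``. | ||||
|  | ||||
|         Arguments: | ||||
|             tensor (Tensor): the input tensor. | ||||
|             factor (float): the multiplier applied to each element. | ||||
|  | ||||
|         Returns: | ||||
|             Tensor: a new tensor with each element scaled by ``factor``. | ||||
|  | ||||
|         Example:: | ||||
|  | ||||
|             >>> scale(torch.ones(2), 3.0) | ||||
|             tensor([3., 3.]) | ||||
|         """ | ||||
|         return tensor * factor | ||||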
|  | ||||
| For C++ documentation (https://pytorch.org/cppdocs), we use | ||||
| `Doxygen <http://www.doxygen.nl/>`__ and then convert it to | ||||
| `Sphinx <http://www.sphinx-doc.org/>`__ via | ||||
| `Breathe <https://github.com/michaeljones/breathe>`__ | ||||
| and `Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen | ||||
| reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__ | ||||
| for more information on the documentation syntax. To build the | ||||
| documentation locally, ``cd`` into ``docs/cpp`` and then ``make html``. | ||||
|  | ||||
| We run Doxygen in CI (Travis) to verify that you do not use invalid | ||||
| Doxygen commands. To run this check locally, run ``./check-doxygen.sh`` | ||||
| from inside ``docs/cpp``. | ||||
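|  | ||||
| Putting those commands together, a typical local check of the C++ docs | ||||
| looks like: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     cd docs/cpp | ||||
|     ./check-doxygen.sh   # the same Doxygen validation that CI runs | ||||
|     make html            # build the C++ docs locally | ||||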
|  | ||||
| Managing multiple build trees | ||||
| ----------------------------- | ||||
|  | ||||
| One downside to using ``python setup.py develop`` is that your | ||||
| development version of PyTorch will be installed globally on your | ||||
| account (e.g., if you run ``import torch`` anywhere else, the | ||||
| development version will be used). | ||||
|  | ||||
| If you want to manage multiple builds of PyTorch, you can make use of | ||||
| `conda environments <https://conda.io/docs/using/envs.html>`__ to | ||||
| maintain separate Python package environments, each of which can be tied | ||||
| to a specific build of PyTorch. To set one up: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     conda create -n pytorch-myfeature | ||||
|     source activate pytorch-myfeature | ||||
|     # if you run python now, torch will NOT be installed | ||||
|     python setup.py build develop | ||||
|  | ||||
| C++ Development tips | ||||
| -------------------- | ||||
|  | ||||
| If you are working on the C++ code, there are a few important things | ||||
| that you will want to keep in mind: | ||||
|  | ||||
| 1. How to rebuild only the code you are working on. | ||||
| 2. How to make rebuilds in the absence of changes go faster. | ||||
|  | ||||
| Build only what you need. | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| ``python setup.py build`` will build everything, but since our build | ||||
| system is not very optimized for incremental rebuilds, this will | ||||
| actually be very slow. Far better is to only request rebuilds of the | ||||
| parts of the project you are working on: | ||||
|  | ||||
| -  Working on the Python bindings? Run ``python setup.py develop`` to | ||||
|    rebuild (NB: no ``build`` here!) | ||||
| -  Working on ``torch/csrc`` or ``aten``? Run | ||||
|    ``python setup.py rebuild_libtorch`` to rebuild and avoid having to | ||||
|    rebuild other dependent libraries we depend on. | ||||
| -  Working on one of the other dependent libraries? The other valid | ||||
|    targets are listed in ``dep_libs`` in ``setup.py``. Prepend | ||||
|    ``build_`` to get a target, and run it as, e.g., | ||||
|    ``python setup.py build_gloo``. | ||||
| -  Working on a test binary? Run | ||||
|    ``(cd build && ninja bin/test_binary_name)`` to rebuild only that | ||||
|    test binary (without rerunning cmake). (Replace ``ninja`` with | ||||
|    ``make`` if you don't have ninja installed). | ||||
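|  | ||||
| As a consolidated sketch of the targets listed above (the test binary | ||||
| name is a placeholder): | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python setup.py develop                    # Python binding changes only | ||||
|     python setup.py rebuild_libtorch           # changes under torch/csrc or aten | ||||
|     python setup.py build_gloo                 # one dependent library | ||||
|     (cd build && ninja bin/test_binary_name)   # a single C++ test binary | ||||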
|  | ||||
| On the initial build, you can also speed things up with the environment | ||||
| variables ``DEBUG``, ``REL_WITH_DEB_INFO``, and ``NO_CUDA``. | ||||
|  | ||||
| -  ``DEBUG=1`` will enable debug builds (-g -O0) | ||||
| -  ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations | ||||
|    (-g -O3) | ||||
| -  ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing | ||||
|    on something not CUDA related), to save compile time. | ||||
|  | ||||
| For example: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     NO_CUDA=1 DEBUG=1 python setup.py build develop | ||||
|  | ||||
| Make sure you continue to pass these flags on subsequent builds. | ||||
|  | ||||
| Code completion and IDE support | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| When using ``python setup.py develop``, PyTorch will generate a | ||||
| ``compile_commands.json`` file that can be used by many editors to | ||||
| provide command completion and error highlighting for PyTorch's C++ | ||||
| code. You need to ``pip install ninja`` to generate accurate information | ||||
| for the code in ``torch/csrc``. More information at: | ||||
|  | ||||
| -  https://sarcasm.github.io/notes/dev/compilation-database.html | ||||
|  | ||||
| Make no-op build fast. | ||||
| ~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Use Ninja | ||||
| ~~~~~~~~~ | ||||
|  | ||||
| Python ``setuptools`` is pretty dumb, and always rebuilds every C file | ||||
| in a project. If you install the ninja build system with | ||||
| ``pip install ninja``, then PyTorch will use it to track dependencies | ||||
| correctly. If PyTorch was already built, you will need to run | ||||
| ``python setup.py clean`` once after installing ninja for builds to | ||||
| succeed. | ||||
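|  | ||||
| For example, to switch an existing checkout over to ninja: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     pip install ninja | ||||
|     python setup.py clean          # needed once if PyTorch was already built | ||||
|     python setup.py build develop | ||||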
|  | ||||
| Use CCache | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
| Even when dependencies are tracked with file modification times, there | ||||
| are many situations where files get rebuilt even though a previous | ||||
| compilation was exactly the same. | ||||
|  | ||||
| Using ccache in a situation like this is a real time-saver. However, by | ||||
| default, ccache does not properly support CUDA stuff, so here are the | ||||
| instructions for installing a custom ccache fork that has CUDA support: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # install and export ccache | ||||
|     if ! ls ~/ccache/bin/ccache | ||||
|     then | ||||
|         sudo apt-get update | ||||
|         sudo apt-get install -y automake autoconf | ||||
|         sudo apt-get install -y asciidoc | ||||
|         mkdir -p ~/ccache | ||||
|         pushd /tmp | ||||
|         rm -rf ccache | ||||
|         git clone https://github.com/colesbury/ccache -b ccbin | ||||
|         pushd ccache | ||||
|         ./autogen.sh | ||||
|         ./configure | ||||
|         make install prefix=~/ccache | ||||
|         popd | ||||
|         popd | ||||
|  | ||||
|         mkdir -p ~/ccache/lib | ||||
|         mkdir -p ~/ccache/cuda | ||||
|         ln -s ~/ccache/bin/ccache ~/ccache/lib/cc | ||||
|         ln -s ~/ccache/bin/ccache ~/ccache/lib/c++ | ||||
|         ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc | ||||
|         ln -s ~/ccache/bin/ccache ~/ccache/lib/g++ | ||||
|         ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc | ||||
|  | ||||
|         ~/ccache/bin/ccache -M 25Gi | ||||
|     fi | ||||
|  | ||||
|     export PATH=~/ccache/lib:$PATH | ||||
|     export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc | ||||
|  | ||||
| CUDA Development tips | ||||
| --------------------- | ||||
|  | ||||
| If you are working on the CUDA code, here are some useful CUDA debugging | ||||
| tips: | ||||
|  | ||||
| 1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug | ||||
|    symbols (``-g -G``). This will be particularly helpful in debugging | ||||
|    device code. However, it will slow down the build process by about | ||||
|    50% (compared to only ``DEBUG=1``), so use it wisely. | ||||
| 2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging | ||||
|    friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a | ||||
|    CUDA tensor (rather than all zeros); a minimal invocation sketch is | ||||
|    given below. | ||||
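|  | ||||
| A minimal invocation sketch (the test name is illustrative, not a | ||||
| specific PyTorch test): | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # look for out-of-bounds accesses and other device memory errors | ||||
|     cuda-memcheck python test/test_cuda.py TestCuda.test_my_kernel | ||||
|  | ||||
|     # rebuild with device debug symbols, then step through kernels | ||||
|     CUDA_DEVICE_DEBUG=1 python setup.py build develop | ||||
|     cuda-gdb --args python test/test_cuda.py TestCuda.test_my_kernel | ||||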
|  | ||||
| We hope this helps, and thanks for considering contributing. | ||||
|  | ||||
| Windows development tips | ||||
| ------------------------ | ||||
|  | ||||
| Occasionally, you will write a patch which works on Linux, but fails CI | ||||
| on Windows. There are a few aspects in which MSVC (the Windows compiler | ||||
| toolchain we use) is stricter than Linux, which are worth keeping in | ||||
| mind when fixing these problems. | ||||
|  | ||||
| 1. Symbols are NOT exported by default on Windows; instead, you have to | ||||
|    explicitly mark a symbol as exported/imported in a header file with | ||||
|    ``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have | ||||
|    codified this pattern into a set of macros which follow the | ||||
|    convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen. | ||||
|    (Every separate shared library needs a unique macro name, because | ||||
|    symbol visibility is on a per shared library basis. See | ||||
|    c10/macros/Macros.h for more details.) The upshot is if you see an | ||||
|    "unresolved external" error in your Windows build, this is probably | ||||
|    because you forgot to mark a function with ``*_API``. However, there | ||||
|    is one important counterexample to this principle: if you want a | ||||
|    *templated* function to be instantiated at the call site, do NOT mark | ||||
|    it with ``*_API`` (if you do mark it, you'll have to explicitly | ||||
|    instantiate all of the specializations used by the call sites.) | ||||
| 2. If you link against a library, this does not make its dependencies | ||||
|    transitively visible. You must explicitly specify a link dependency | ||||
|    against every library whose symbols you use. (This is different from | ||||
|    Linux where in most environments, transitive dependencies can be used | ||||
|    to fulfill unresolved symbols.) | ||||
| 3. If you have a Windows box (we have a few on EC2 which you can request | ||||
|    access to) and you want to run the build, the easiest way is to just | ||||
|    run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run | ||||
|    ``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing | ||||
|    away your Conda environment.) | ||||
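|  | ||||
| To make the first point concrete, here is a minimal sketch of the | ||||
| export/import macro pattern (the macro and library names are | ||||
| illustrative, not the actual PyTorch macros): | ||||
|  | ||||
| :: | ||||
|  | ||||
|     // foo_export.h | ||||
|     #ifdef _WIN32 | ||||
|     #  ifdef FOO_BUILD_MAIN_LIB | ||||
|     #    define FOO_API __declspec(dllexport)  // set while building foo.dll | ||||
|     #  else | ||||
|     #    define FOO_API __declspec(dllimport)  // set for consumers of foo.dll | ||||
|     #  endif | ||||
|     #else | ||||
|     #  define FOO_API                          // no-op on Linux/macOS | ||||
|     #endif | ||||
|  | ||||
|     FOO_API int add(int a, int b);   // exported: resolvable from other DLLs | ||||
|  | ||||
|     template <typename T> | ||||
|     T square(T x) { return x * x; }  // not marked: instantiated at call sites | ||||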
|  | ||||
| Even if you don't know anything about MSVC, you can use cmake to build | ||||
| simple programs on Windows; this can be helpful if you want to learn | ||||
| more about some peculiar linking behavior by reproducing it on a small | ||||
| example. Here's a simple example cmake file that defines two dynamic | ||||
| libraries, one linking with the other: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     project(myproject CXX) | ||||
|     set(CMAKE_CXX_STANDARD 11) | ||||
|     add_library(foo SHARED foo.cpp) | ||||
|     add_library(bar SHARED bar.cpp) | ||||
|     # NB: don't forget to __declspec(dllexport) at least one symbol from foo, | ||||
|     # otherwise foo.lib will not be created. | ||||
|     target_link_libraries(bar PUBLIC foo) | ||||
|  | ||||
| You can build it with: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     mkdir build | ||||
|     cd build | ||||
|     cmake .. | ||||
|     cmake --build . | ||||
|  | ||||
| Known MSVC (and MSVC with NVCC) bugs | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| The PyTorch codebase sometimes likes to use exciting C++ features, and | ||||
| these exciting features lead to exciting bugs in Windows compilers. To | ||||
| add insult to injury, the error messages will often not tell you which | ||||
| line of code actually induced the erroring template instantiation. We've | ||||
| found the most effective way to debug these problems is to carefully | ||||
| read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few | ||||
| well known pitfalls and workarounds: | ||||
|  | ||||
| -  This is not actually a bug per se, but in general, code generated by | ||||
|    MSVC is more sensitive to memory errors; you may have written some | ||||
|    code that does a use-after-free or stack overflows; on Linux the code | ||||
|    might work, but on Windows your program will crash. ASAN may not | ||||
|    catch all of these problems: stay vigilant to the possibility that | ||||
|    your crash is due to a real memory problem. | ||||
| -  (NVCC) ``c10::optional`` does not work when used from device code. | ||||
|    Don't use it from kernels. Upstream issue: | ||||
|    https://github.com/akrzemi1/Optional/issues/58 and our local issue | ||||
|    #10329. | ||||
| -  ``constexpr`` generally works less well on MSVC. | ||||
|  | ||||
|    -  The idiom ``static_assert(f() == f())`` to test if ``f`` is | ||||
|       constexpr does not work; you'll get "error C2131: expression did | ||||
|       not evaluate to a constant". Don't use these asserts on Windows. | ||||
|       (Example: ``c10/util/intrusive_ptr.h``) | ||||
|  | ||||
| -  (NVCC) Code you access inside a ``static_assert`` will eagerly be | ||||
|    evaluated as if it were device code, and so you might get an error | ||||
|    that the code is "not accessible". | ||||
|  | ||||
| :: | ||||
|  | ||||
|     class A { | ||||
|       static A singleton_; | ||||
|       static constexpr inline A* singleton() { | ||||
|         return &singleton_; | ||||
|       } | ||||
|     }; | ||||
|  | ||||
|     static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm"); | ||||
|  | ||||
| -  The compiler will run out of heap space if you attempt to compile | ||||
|    files that are too large. Splitting such files into separate files | ||||
|    helps. (Example: ``THTensorMath``, ``THTensorMoreMath``, | ||||
|    ``THTensorEvenMoreMath``.) | ||||
| -  MSVC's preprocessor (but not the standard compiler) has a bug where | ||||
|    it incorrectly tokenizes raw string literals, ending when it sees a | ||||
|    ``"``. This causes preprocessor tokens inside the literal like | ||||
|    an ``#endif`` to be incorrectly treated as a preprocessor directive. | ||||
|    See https://godbolt.org/z/eVTIJq as an example. | ||||
|  | ||||
| Running Clang-Tidy | ||||
| ~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| `Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a | ||||
| C++ linter and static analysis tool based on the clang compiler. We run | ||||
| clang-tidy in our CI to make sure that new C++ code is safe, sane and | ||||
| efficient. See our | ||||
| `.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__ | ||||
| file for the simple commands we use for this. To run clang-tidy locally, | ||||
| follow these steps: | ||||
|  | ||||
| 1. Install clang-tidy. First, check if you already have clang-tidy by | ||||
|    simply writing ``clang-tidy`` in your terminal. If you don't yet have | ||||
|    clang-tidy, you should be able to install it easily with your package | ||||
|    manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu. | ||||
|    See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on | ||||
|    how to install the latest version. Note that newer versions of | ||||
|    clang-tidy will have more checks than older versions. In our CI, we | ||||
|    run clang-tidy-6.0. | ||||
| 2. Use our driver script to run clang-tidy over any changes relative to | ||||
|    some git revision (you may want to replace ``HEAD~1`` with ``HEAD`` | ||||
|    to pick up uncommitted changes). Changes are picked up based on a | ||||
|    ``git diff`` with the given revision: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1' | ||||
|  | ||||
| Above, it is assumed you are in the PyTorch root folder. | ||||
| ``path/to/build`` should be the path to where you built PyTorch from | ||||
| source, e.g. ``build`` in the PyTorch root folder if you used | ||||
| ``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change | ||||
| the clang-tidy this script uses. Make sure you have PyYaml installed, | ||||
| which is in PyTorch's ``requirements.txt``. | ||||
|  | ||||
| Pre-commit Tidy/Linting Hook | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| We use clang-tidy and flake8 to perform additional formatting and | ||||
| semantic checking of code. We provide a pre-commit git hook for | ||||
| performing these checks, before a commit is created: | ||||
|  | ||||
| :: | ||||
|  | ||||
|     ln -s ../../tools/git-pre-commit .git/hooks/pre-commit | ||||
|  | ||||
| Caffe2 notes | ||||
| ------------ | ||||
|  | ||||
| In 2018, we merged Caffe2 into the PyTorch source repository. While the | ||||
| steady state aspiration is that Caffe2 and PyTorch share code freely, in | ||||
| the meantime there will be some separation. If you submit a PR to only | ||||
| PyTorch or only Caffe2 code, CI will only run for the project you | ||||
| edited. The logic for this is implemented in | ||||
| ``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can | ||||
| look at this to see what path prefixes constitute changes. This also | ||||
| means if you ADD a new top-level path, or you start sharing code between | ||||
| projects, you need to modify these files. There are a few "unusual" | ||||
| directories which, for historical reasons, are Caffe2/PyTorch specific. | ||||
| Here they are: | ||||
|  | ||||
| -  ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``, | ||||
|    ``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code | ||||
|    in them without extra coordination. | ||||
| -  ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools`` | ||||
|    are PyTorch-specific. Don't put Caffe2 code in them without extra | ||||
|    coordination. | ||||
							
								
								
									
docs/source/community/governance.rst (new file, 154 lines)
| @ -0,0 +1,154 @@ | ||||
| PyTorch Governance | ||||
| ========================== | ||||
|  | ||||
| Governance Philosophy and Guiding Tenets | ||||
| ----------------------------------------- | ||||
|  | ||||
| PyTorch adopts a governance structure with a small set of maintainers | ||||
| driving the overall project direction with a strong bias towards | ||||
| PyTorch's design philosophy where design and code contributions are | ||||
| valued. Beyond the core maintainers, there is also a slightly broader | ||||
| set of core developers that have the ability to directly merge pull | ||||
| requests and own various parts of the core code base. | ||||
|  | ||||
| Beyond the maintainers and core devs, the community is encouraged to | ||||
| contribute, file issues, make proposals, review pull requests and be | ||||
| present in the community. Given contributions and willingness to | ||||
| invest, anyone can be provided write access or ownership of parts of | ||||
| the codebase. | ||||
|  | ||||
| Based on this governance structure, the project has the following core | ||||
| operating tenets by which decisions are made and overall culture is | ||||
| derived: | ||||
|  | ||||
| 1. **Code contributions** matter much more than corporate sponsorship | ||||
|    and independent developers are highly valued. | ||||
| 2. **Project influence** is gained through contributions (whether PRs, | ||||
|    forum answers, code reviews or otherwise) | ||||
|  | ||||
| Key people and their functions | ||||
| ------------------------------ | ||||
|  | ||||
| Project Maintainers | ||||
| ~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Project maintainers provide leadership and direction for the PyTorch | ||||
| project. Specifics include: | ||||
|  | ||||
| -  Articulate a cohesive long-term vision for the project | ||||
| -  Possess a deep understanding of the PyTorch code base | ||||
| -  Negotiate and resolve contentious issues in ways acceptable to all | ||||
|    parties involved | ||||
|  | ||||
| PyTorch Maintainers: | ||||
|  | ||||
| -  Adam Paszke (`apaszke <https://github.com/apaszke>`__) | ||||
| -  Soumith Chintala (`soumith <https://github.com/soumith>`__) | ||||
| -  Edward Yang (`ezyang <https://github.com/ezyang>`__) | ||||
| -  Greg Chanan (`gchanan <https://github.com/gchanan>`__) | ||||
| -  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__) | ||||
| -  (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__) | ||||
|  | ||||
| Core Developers | ||||
| ~~~~~~~~~~~~~~~ | ||||
|  | ||||
| The PyTorch project is developed by a team of core developers. You can | ||||
| find the list of core developers at `PyTorch Governance \| Persons of | ||||
| Interest </docs/community/persons_of_interest.html>`__. | ||||
|  | ||||
| While membership is determined by presence in the "PyTorch core" team in | ||||
| the "PyTorch" | ||||
| `organization <https://github.com/orgs/pytorch/teams/facebook>`__ on | ||||
| GitHub, contribution takes many forms: | ||||
|  | ||||
| -  committing changes to the repository; | ||||
| -  reviewing pull requests by others; | ||||
| -  triaging bug reports on the issue tracker; | ||||
| -  discussing topics on official PyTorch communication channels. | ||||
|  | ||||
| Moderators | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
| There is a group of people, some of whom are not core developers, | ||||
| responsible for ensuring that discussions on official communication | ||||
| channels adhere to the Code of Conduct. They take action in view of | ||||
| violations and help to support a healthy community. You can find the | ||||
| list of moderators `here <https://discuss.pytorch.org/about>`__. | ||||
|  | ||||
| Decision Making | ||||
| --------------- | ||||
|  | ||||
| Uncontroversial Changes | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Primary work happens through bug tracker issues and pull requests on | ||||
| GitHub. Core developers should avoid pushing their changes directly to | ||||
| the PyTorch repository, instead relying on pull requests. Approving a | ||||
| pull request by a core developer allows it to be merged without further | ||||
| process. Core Developers and Project Maintainers ultimately approve | ||||
| these changes. | ||||
|  | ||||
| Notifying relevant experts about a bug tracker issue or a pull request | ||||
| is important. Reviews from experts in the given interest area are | ||||
| strongly preferred, especially on pull request approvals. Failure to do | ||||
| so might end up with the change being reverted by the relevant expert. | ||||
|  | ||||
| Controversial decision process | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Substantial changes in a given interest area require a GitHub issue to | ||||
| be opened for discussion. This includes: | ||||
|  | ||||
| -  Any semantic or syntactic change to the framework. | ||||
| -  Backwards-incompatible changes to the Python or Cpp API. | ||||
| -  Additions to the core framework, including substantial new | ||||
|    functionality within an existing library. | ||||
| -  Removing core features | ||||
|  | ||||
| Project Maintainers ultimately approve these changes. | ||||
|  | ||||
| FAQ | ||||
| --- | ||||
|  | ||||
| **Q: What if I would like to own (or partly own) a part of the project | ||||
| such as a domain API (e.g. Torch Vision)?** This is absolutely possible. | ||||
| The first step is to start contributing to the existing project area and | ||||
| contributing to its health and success. In addition to this, you can | ||||
| make a proposal through a GitHub issue for new functionality or changes | ||||
| to improve the project area. | ||||
|  | ||||
| **Q: What if I am a company looking to use PyTorch internally for | ||||
| development, can I be granted or purchase a board seat to drive the | ||||
| project direction?** No, the PyTorch project is strictly driven by the | ||||
| maintainer-driven project philosophy and does not have a board or | ||||
| vehicle to take financial contributions relating to gaining influence | ||||
| over technical direction. | ||||
|  | ||||
| **Q: Does the PyTorch project support grants or ways to support | ||||
| independent developers using or contributing to the project?** No, not | ||||
| at this point. We are however looking at ways to better support the | ||||
| community of independent developers around PyTorch. If you have | ||||
| suggestions or inputs, please reach out on the PyTorch forums to | ||||
| discuss. | ||||
|  | ||||
| **Q: How do I contribute code to the project?** If the change is | ||||
| relatively minor, a pull request on GitHub can be opened up immediately | ||||
| for review and merge by the project committers. For larger changes, | ||||
| please open an issue to make a proposal to discuss prior. Please also | ||||
| see the **`PyTorch Contributor | ||||
| Guide </docs/community/contribution_guide.html>`__** for contribution | ||||
| guidelines. | ||||
|  | ||||
| **Q: Can I become a committer on the project?** Unfortunately, the | ||||
| current commit process to PyTorch involves an interaction with Facebook | ||||
| infrastructure that can only be triggered by Facebook employees. We are | ||||
| however looking at ways to expand the committer base to individuals | ||||
| outside of Facebook and will provide an update when the tooling exists | ||||
| to allow this. | ||||
|  | ||||
| **Q: What if I would like to deliver a PyTorch tutorial at a conference | ||||
| or otherwise? Do I need to be 'officially' a committer to do this?** No, | ||||
| we encourage community members to showcase their work wherever and | ||||
| whenever they can. Please reach out to | ||||
| `pytorch-marketing@fb.com <http://mailto:pytorch-marketing@fb.com/>`__ | ||||
| for marketing support. | ||||
							
								
								
									
docs/source/community/persons_of_interest.rst (new file, 130 lines)
| @ -0,0 +1,130 @@ | ||||
| PyTorch Governance | Persons of Interest | ||||
| ========================================= | ||||
|  | ||||
| General Maintainers | ||||
| ------------------- | ||||
|  | ||||
| -  Adam Paszke (`apaszke <https://github.com/apaszke>`__) | ||||
| -  Soumith Chintala (`soumith <https://github.com/soumith>`__) | ||||
| -  Edward Yang (`ezyang <https://github.com/ezyang>`__) | ||||
| -  Greg Chanan (`gchanan <https://github.com/gchanan>`__) | ||||
| -  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__) | ||||
| -  (sunsetting) Sam Gross | ||||
|    (`colesbury <https://github.com/colesbury>`__) | ||||
|  | ||||
| Module-level maintainers | ||||
| ------------------------ | ||||
|  | ||||
| JIT | ||||
| ~~~ | ||||
|  | ||||
| -  Zach Devito (`zdevito <https://github.com/zdevito>`__) | ||||
| -  Michael Suo (`suo <https://github.com/suo>`__) | ||||
|  | ||||
| Distributed | ||||
| ~~~~~~~~~~~ | ||||
|  | ||||
| -  Pieter Noordhuis (`pietern <https://github.com/pietern>`__) | ||||
| -  Shen Li (`mrshenli <https://github.com/mrshenli>`__) | ||||
| -  (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__) | ||||
|  | ||||
| Autograd Engine | ||||
| ~~~~~~~~~~~~~~~ | ||||
|  | ||||
| -  Alban Desmaison (`alband <https://github.com/alband>`__) | ||||
| -  Adam Paszke (`apaszke <https://github.com/apaszke>`__) | ||||
|  | ||||
| Multiprocessing and DataLoaders | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| -  Simon Wang (`SsnL <https://github.com/SsnL>`__) | ||||
| -  Adam Paszke (`apaszke <https://github.com/apaszke>`__) | ||||
| -  (proposed) Vitaly Fedyunin | ||||
|    (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__) | ||||
|  | ||||
| CUDA | ||||
| ~~~~ | ||||
|  | ||||
| -  Edward Yang (`ezyang <https://github.com/ezyang>`__) | ||||
| -  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__) | ||||
|  | ||||
| C++ | ||||
| ~~~ | ||||
|  | ||||
| -  Will Feng (`yf225 <https://github.com/yf225>`__) | ||||
| -  (sunsetting) Peter Goldsborough | ||||
|    (`goldsborough <https://github.com/goldsborough>`__) | ||||
|  | ||||
| Build + CI | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
| -  Will Feng (`yf225 <https://github.com/yf225>`__) | ||||
| -  Edward Yang (`ezyang <https://github.com/ezyang>`__) | ||||
| -  Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__) | ||||
| -  Soumith Chintala (`soumith <https://github.com/soumith>`__) | ||||
| -  (sunsetting) Orion Reblitz-Richardson | ||||
|    (`orionr <https://github.com/orionr>`__) | ||||
|  | ||||
| Distributions & RNG | ||||
| ~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| -  Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__) | ||||
| -  Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__) | ||||
| -  Alican Bozkurt (`alicanb <https://github.com/alicanb>`__) | ||||
| -  Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__) | ||||
|  | ||||
| C10 | ||||
| ~~~ | ||||
|  | ||||
| -  Sebastian Messmer (`smessmer <https://github.com/smessmer>`__) | ||||
| -  Edward Yang (`ezyang <https://github.com/ezyang>`__) | ||||
|  | ||||
| ONNX <-> PyTorch | ||||
| ~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| -  Lu Fang (`houseroad <https://github.com/houseroad>`__) | ||||
|  | ||||
| torch.nn | ||||
| ~~~~~~~~ | ||||
|  | ||||
| -  Thomas Viehmann (`t-vi <https://github.com/t-vi>`__) | ||||
| -  Adam Paszke (`apaszke <https://github.com/apaszke>`__) | ||||
| -  Greg Chanan (`gchanan <https://github.com/gchanan>`__) | ||||
| -  Soumith Chintala (`soumith <https://github.com/soumith>`__) | ||||
| -  Sam Gross (`colesbury <https://github.com/colesbury>`__) | ||||
|  | ||||
| CPU Performance / SIMD | ||||
| ~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| -  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__) | ||||
| -  Sam Gross (`colesbury <https://github.com/colesbury>`__) | ||||
| -  Richard Zou (`zou3519 <https://github.com/zou3519>`__) | ||||
|  | ||||
| AMD/ROCm/HIP | ||||
| ~~~~~~~~~~~~ | ||||
|  | ||||
| -  Junjie Bai (`bddppq <https://github.com/bddppq>`__) | ||||
| -  Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__) | ||||
|  | ||||
| Windows | ||||
| ~~~~~~~ | ||||
|  | ||||
| -  Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__) | ||||
|  | ||||
| MKLDNN | ||||
| ~~~~~~ | ||||
|  | ||||
| -  Yinghai Lu (`yinghai <https://github.com/yinghai>`__) | ||||
|  | ||||
| XLA | ||||
| ~~~ | ||||
|  | ||||
| -  Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__) | ||||
| -  Gregory Chanan (`gchanan <https://github.com/gchanan>`__) | ||||
| -  Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__) | ||||
| -  Alex Suhan (`asuhan <https://github.com/asuhan>`__) | ||||
|  | ||||
| PPC | ||||
| ~~~ | ||||
|  | ||||
| -  Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__) | ||||
| @ -1,6 +1,101 @@ | ||||
| torch.hub | ||||
| =================================== | ||||
| PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility. | ||||
|  | ||||
| Publishing models | ||||
| ----------------- | ||||
|  | ||||
| PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights) | ||||
| to a GitHub repository by adding a simple ``hubconf.py`` file. | ||||
|  | ||||
| ``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with | ||||
| the following signature. | ||||
|  | ||||
| :: | ||||
|  | ||||
|     def entrypoint_name(pretrained=False, *args, **kwargs): | ||||
|         ... | ||||
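|  | ||||
| For instance, a minimal ``hubconf.py`` sketch exposing two entrypoints (the | ||||
| entrypoint names and model definitions below are purely illustrative):: | ||||
|  | ||||
|     dependencies = ['torch'] | ||||
|  | ||||
|     def tiny_linear(pretrained=False, *args, **kwargs): | ||||
|         """An illustrative entrypoint returning a single Linear layer.""" | ||||
|         import torch | ||||
|         model = torch.nn.Linear(10, 2) | ||||
|         if pretrained: | ||||
|             # pre-trained weights would be loaded here, e.g. via model_zoo.load_url(...) | ||||
|             pass | ||||
|         return model | ||||
|  | ||||
|     def tiny_mlp(pretrained=False, *args, **kwargs): | ||||
|         """A second entrypoint, showing that one file may expose several.""" | ||||
|         import torch | ||||
|         return torch.nn.Sequential(torch.nn.Linear(10, 16), | ||||
|                                    torch.nn.ReLU(), | ||||
|                                    torch.nn.Linear(16, 2)) | ||||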
|  | ||||
| How to implement an entrypoint? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint | ||||
| for the ``resnet18`` model. You can see the full script in the | ||||
| `pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_. | ||||
|  | ||||
| :: | ||||
|  | ||||
|     dependencies = ['torch', 'math'] | ||||
|  | ||||
|     def resnet18(pretrained=False, *args, **kwargs): | ||||
|         """ | ||||
|         Resnet18 model | ||||
|         pretrained (bool): a recommended kwarg for all entrypoints | ||||
|         args & kwargs are arguments for the function | ||||
|         """ | ||||
|         ######## Call the model in the repo ############### | ||||
|         from torchvision.models.resnet import resnet18 as _resnet18 | ||||
|         model = _resnet18(*args, **kwargs) | ||||
|         ######## End of call ############################## | ||||
|         # The following logic is REQUIRED | ||||
|         if pretrained: | ||||
|             # For weights saved in the local repo: | ||||
|             # model.load_state_dict(torch.load(<path_to_saved_file>)) | ||||
|  | ||||
|             # For weights saved elsewhere: | ||||
|             from torch.utils import model_zoo | ||||
|             checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth' | ||||
|             model.load_state_dict(model_zoo.load_url(checkpoint, progress=False)) | ||||
|         return model | ||||
|  | ||||
| - The ``dependencies`` variable is a **list** of package names required to run the model. | ||||
| - Pretrained weights can either be stored locally in the GitHub repo, or be loadable by | ||||
|   ``model_zoo.load_url()``. | ||||
| - ``pretrained`` controls whether to load the pre-trained weights provided by the repo owners. | ||||
| - ``args`` and ``kwargs`` are passed along to the real callable function. | ||||
| - The docstring of the function works as a help message, explaining what the model does and | ||||
|   what the allowed arguments are. | ||||
| - The entrypoint function should **ALWAYS** return a model (``nn.Module``). | ||||
|  | ||||
| Important Notice | ||||
| ^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| - The published models should live in a branch or tag; they can't point to an arbitrary commit. | ||||
|  | ||||
| Loading models from Hub | ||||
| ----------------------- | ||||
|  | ||||
| Users can load the pre-trained models using the ``torch.hub.load()`` API. | ||||
|  | ||||
|  | ||||
| .. automodule:: torch.hub | ||||
| .. autofunction:: load | ||||
|  | ||||
| Here's an example loading the ``resnet18`` entrypoint from the ``pytorch/vision`` repo. | ||||
|  | ||||
| :: | ||||
|  | ||||
|     hub_model = torch.hub.load( | ||||
|         'pytorch/vision:master', # repo_owner/repo_name:branch | ||||
|         'resnet18', # entrypoint | ||||
|         1234, # args for callable [not applicable to resnet] | ||||
|         pretrained=True) # kwargs for callable | ||||
|  | ||||
| Where are my downloaded model & weights saved? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| The locations are used in the following order: | ||||
|  | ||||
| - ``hub_dir``: a user-specified path. It can be set in the following ways: | ||||
|  | ||||
|   - Setting the environment variable ``TORCH_HUB_DIR`` | ||||
|   - Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)`` | ||||
|  | ||||
| - ``~/.torch/hub`` | ||||
|  | ||||
| .. autofunction:: set_dir | ||||
|  | ||||
| Caching logic | ||||
| ^^^^^^^^^^^^^ | ||||
|  | ||||
| By default, we don't clean up files after loading them. Hub reuses the cache if a download already exists in ``hub_dir``. | ||||
|  | ||||
| Users can force a reload by calling ``hub.load(..., force_reload=True)``. This deletes | ||||
| the existing GitHub folder and downloaded weights and re-initializes a fresh download. This is | ||||
| useful when updates are published to the same branch, so users can keep up with the latest release. | ||||
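|  | ||||
| A rough end-to-end sketch, assuming the ``resnet18`` entrypoint shown above | ||||
| (the cache path below is purely illustrative):: | ||||
|  | ||||
|     import torch | ||||
|  | ||||
|     # Redirect the hub cache; any writable directory works. | ||||
|     torch.hub.set_dir('/tmp/my_hub_cache') | ||||
|  | ||||
|     # The first call clones pytorch/vision at the given branch and downloads the weights. | ||||
|     model = torch.hub.load('pytorch/vision:master', 'resnet18', pretrained=True) | ||||
|  | ||||
|     # Discard the cached clone and weights, then download everything afresh. | ||||
|     model = torch.hub.load('pytorch/vision:master', 'resnet18', | ||||
|                            pretrained=True, force_reload=True) | ||||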
|  | ||||
| @ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. | ||||
|  | ||||
|    notes/* | ||||
|  | ||||
| .. toctree:: | ||||
|    :glob: | ||||
|    :maxdepth: 1 | ||||
|    :caption: Community | ||||
|  | ||||
|    community/* | ||||
|  | ||||
| .. toctree:: | ||||
|    :maxdepth: 1 | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| Torch Script | ||||
| TorchScript | ||||
| ============ | ||||
|  | ||||
| .. contents:: :local: | ||||
| @ -6,17 +6,17 @@ Torch Script | ||||
| .. automodule:: torch.jit | ||||
| .. currentmodule:: torch.jit | ||||
|  | ||||
| Torch Script is a way to create serializable and optimizable models from PyTorch code. | ||||
| Any code written in Torch Script can be saved from your Python | ||||
| TorchScript is a way to create serializable and optimizable models from PyTorch code. | ||||
| Any code written in TorchScript can be saved from your Python | ||||
| process and loaded in a process where there is no Python dependency. | ||||
|  | ||||
| We provide tools to incrementally transition a model from being a pure Python program | ||||
| to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program. | ||||
| to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program. | ||||
| This makes it possible to train models in PyTorch using familiar tools and then export | ||||
| the model to a production environment where it is not a good idea to run models as Python programs | ||||
| for performance and multi-threading reasons. | ||||
|  | ||||
| Creating Torch Script Code | ||||
| Creating TorchScript Code | ||||
| -------------------------- | ||||
|  | ||||
|  | ||||
| @ -117,26 +117,26 @@ Example: | ||||
|             return self.resnet(input - self.means) | ||||
|  | ||||
|  | ||||
| Torch Script Language Reference | ||||
| TorchScript Language Reference | ||||
| ------------------------------- | ||||
|  | ||||
| Torch Script is a subset of Python that can either be written directly (using | ||||
| TorchScript is a subset of Python that can either be written directly (using | ||||
| the @script annotations) or generated automatically from Python code via | ||||
| tracing. When using tracing, code is automatically converted into this subset of | ||||
| Python by recording only the actual operators on tensors and simply executing and | ||||
| discarding the other surrounding Python code. | ||||
|  | ||||
| When writing Torch Script directly using @script annotations, the programmer must | ||||
| only use the subset of Python supported in Torch Script. This section documents | ||||
| what is supported in Torch Script as if it were a language reference for a stand | ||||
| When writing TorchScript directly using @script annotations, the programmer must | ||||
| only use the subset of Python supported in TorchScript. This section documents | ||||
| what is supported in TorchScript as if it were a language reference for a stand | ||||
| alone language. Any features of Python not mentioned in this reference are not | ||||
| part of Torch Script. | ||||
| part of TorchScript. | ||||
|  | ||||
| As a subset of Python any valid Torch Script function is also a valid Python | ||||
| As a subset of Python, any valid TorchScript function is also a valid Python | ||||
| function. This makes it possible to remove the @script annotations and debug the | ||||
| function using standard Python tools like pdb. The reverse is not true: there | ||||
| are many valid python programs that are not valid Torch Script programs. | ||||
| Instead, Torch Script focuses specifically on the features of Python that are | ||||
| are many valid Python programs that are not valid TorchScript programs. | ||||
| Instead, TorchScript focuses specifically on the features of Python that are | ||||
| needed to represent neural network models in Torch. | ||||
|  | ||||
| .. envvar:: PYTORCH_JIT=1 | ||||
| @ -150,9 +150,9 @@ needed to represent neural network models in Torch. | ||||
| Types | ||||
| ~~~~~ | ||||
|  | ||||
| The largest difference between Torch Script and the full Python language is that | ||||
| Torch Script only support a small set of types that are needed to express neural | ||||
| net models. In particular Torch Script supports: | ||||
| The largest difference between TorchScript and the full Python language is that | ||||
| TorchScript only supports a small set of types that are needed to express neural | ||||
| net models. In particular TorchScript supports: | ||||
|  | ||||
| ``Tensor`` | ||||
|     A PyTorch tensor of any dtype, dimension, or backend. | ||||
| @ -169,8 +169,8 @@ net models. In particular Torch Script supports: | ||||
| ``List[T]`` | ||||
|     A list of which all members are type ``T`` | ||||
|  | ||||
| Unlike Python, each variable in Torch Script function must have a single static type. | ||||
| This makes it easier to optimize Torch Script functions. | ||||
| Unlike Python, each variable in a TorchScript function must have a single static type. | ||||
| This makes it easier to optimize TorchScript functions. | ||||
|  | ||||
| Example:: | ||||
|  | ||||
| @ -183,9 +183,9 @@ Example:: | ||||
|         return r # Type mismatch: r is set to type Tensor in the true branch | ||||
|                  # and type int in the false branch | ||||
|  | ||||
| By default, all parameters to a Torch Script function are assumed to be Tensor | ||||
| By default, all parameters to a TorchScript function are assumed to be Tensor | ||||
| because this is the most common type used in modules. To specify that an | ||||
| argument to a Torch Script function is another type, it is possible to use | ||||
| argument to a TorchScript function is another type, it is possible to use | ||||
| MyPy-style type annotations using the types listed above: | ||||
|  | ||||
| Example:: | ||||
| @ -264,7 +264,7 @@ Subscripts | ||||
|   ``t[i:j, i]`` | ||||
|  | ||||
|   .. note:: | ||||
|     Torch Script currently does not support mutating tensors in place, so any | ||||
|     TorchScript currently does not support mutating tensors in place, so any | ||||
|     tensor indexing can only appear on the right-hand side of an expression. | ||||
|  | ||||
| Function calls | ||||
| @ -328,7 +328,7 @@ Accessing Module Parameters | ||||
| Statements | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
| Torch Script supports the following types of statements: | ||||
| TorchScript supports the following types of statements: | ||||
|  | ||||
| Simple Assignments | ||||
|  | ||||
| @ -438,7 +438,7 @@ Return | ||||
| Variable Resolution | ||||
| ~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Torch Script supports a subset of Python's variable resolution (i.e. scoping) | ||||
| TorchScript supports a subset of Python's variable resolution (i.e. scoping) | ||||
| rules. Local variables behave the same as in Python, except for the restriction | ||||
| that a variable must have the same type along all paths through a function. | ||||
| If a variable has a different type on different sides of an if statement, it | ||||
| @ -456,23 +456,23 @@ Example:: | ||||
|         print(y) # Error: undefined value y | ||||
|  | ||||
| Non-local variables are resolved to Python values at compile time when the | ||||
| function is defined. These values are then converted into Torch Script values using | ||||
| function is defined. These values are then converted into TorchScript values using | ||||
| the rules described in `Use of Python Values`_. | ||||
|  | ||||
| Use of Python Values | ||||
| ~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| To make writing Torch Script more convenient, we allow script code to refer | ||||
| To make writing TorchScript more convenient, we allow script code to refer | ||||
| to Python values in the surrounding scope. For instance, any time there is a | ||||
| reference to ``torch``, the Torch Script compiler is actually resolving it to the | ||||
| reference to ``torch``, the TorchScript compiler is actually resolving it to the | ||||
| ``torch`` Python module when the function is declared.  These Python values are | ||||
| not a first class part of Torch Script. Instead they are desugared at compile-time | ||||
| into the primitive types that Torch Script supports. This section describes the | ||||
| rules that are used when accessing Python values in Torch Script. They depend | ||||
| not a first class part of TorchScript. Instead they are desugared at compile-time | ||||
| into the primitive types that TorchScript supports. This section describes the | ||||
| rules that are used when accessing Python values in TorchScript. They depend | ||||
| on the dynamic type of the Python value referenced. | ||||
|  | ||||
| Functions | ||||
|   Torch Script can call python functions. This functionality is very useful when | ||||
|   TorchScript can call Python functions. This functionality is very useful when | ||||
|   incrementally converting a model into script. The model can be moved function-by-function | ||||
|   to script, leaving calls to Python functions in place. This way you can incrementally | ||||
|   check the correctness of the model as you go. | ||||
| @ -495,12 +495,12 @@ Functions | ||||
|  | ||||
|  | ||||
| Attribute Lookup On Python Modules | ||||
|     Torch Script can lookup attributes on modules. Builtin functions like ``torch.add`` | ||||
|     are accessed this way. This allows Torch Script to call functions defined in | ||||
|     TorchScript can look up attributes on modules. Builtin functions like ``torch.add`` | ||||
|     are accessed this way. This allows TorchScript to call functions defined in | ||||
|     other modules. | ||||
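|  | ||||
|     A minimal sketch of this resolution (the function name is illustrative):: | ||||
|  | ||||
|         import torch | ||||
|  | ||||
|         @torch.jit.script | ||||
|         def relu_plus_one(x): | ||||
|             # ``torch`` resolves to the Python module at compile time, and | ||||
|             # ``torch.relu`` desugars to the corresponding builtin operator. | ||||
|             return torch.relu(x) + 1 | ||||
|  | ||||
|         print(relu_plus_one(torch.randn(3)))  # executes via the TorchScript interpreter | ||||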
|  | ||||
| Python-defined Constants | ||||
|     Torch Script also provides a way to use constants that are defined in Python. | ||||
|     TorchScript also provides a way to use constants that are defined in Python. | ||||
|     These can be used to hard-code hyper-parameters into the function, or to | ||||
|     define universal constants. There are two ways of specifying that a Python | ||||
|     value should be treated as a constant. | ||||
| @ -597,36 +597,35 @@ Interpreting Graphs | ||||
|  | ||||
|     The example script above produces the graph:: | ||||
|  | ||||
|         graph(%len : int) { | ||||
|           %13 : float = prim::Constant[value=1]() | ||||
|           %10 : int = prim::Constant[value=10]() | ||||
|           %2 : int = prim::Constant[value=4]() | ||||
|           %1 : int = prim::Constant[value=3]() | ||||
|           %3 : int[] = prim::ListConstruct(%1, %2) | ||||
|           %4 : int = prim::Constant[value=6]() | ||||
|           %5 : int = prim::Constant[value=0]() | ||||
|           %6 : int[] = prim::Constant[value=[0, -1]]() | ||||
|           %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6) | ||||
|           %8 : int = prim::Constant[value=1]() | ||||
|           %rv : Dynamic = prim::Loop(%len, %8, %rv.1) | ||||
|             block0(%i : int, %12 : Dynamic) { | ||||
|               %11 : int = aten::lt(%i, %10) | ||||
|               %rv.4 : Dynamic = prim::If(%11) | ||||
|                 block0() { | ||||
|                   %14 : int = prim::Constant[value=1]() | ||||
|                   %rv.2 : Dynamic = aten::sub(%12, %13, %14) | ||||
|                   -> (%rv.2) | ||||
|                 } | ||||
|                 block1() { | ||||
|                   %16 : int = prim::Constant[value=1]() | ||||
|                   %rv.3 : Dynamic = aten::add(%12, %13, %16) | ||||
|                   -> (%rv.3) | ||||
|                 } | ||||
|               %19 : int = prim::Constant[value=1]() | ||||
|               -> (%19, %rv.4) | ||||
|             } | ||||
|           return (%rv); | ||||
|         } | ||||
|         graph(%len : int) { | ||||
|           %15 : int = prim::Constant[value=1]() | ||||
|           %9 : bool = prim::Constant[value=1]() | ||||
|           %7 : Device = prim::Constant[value="cpu"]() | ||||
|           %6 : int = prim::Constant[value=0]() | ||||
|           %5 : int = prim::Constant[value=6]() | ||||
|           %1 : int = prim::Constant[value=3]() | ||||
|           %2 : int = prim::Constant[value=4]() | ||||
|           %11 : int = prim::Constant[value=10]() | ||||
|           %14 : float = prim::Constant[value=1]() | ||||
|           %4 : int[] = prim::ListConstruct(%1, %2) | ||||
|           %rv.1 : Tensor = aten::zeros(%4, %5, %6, %7) | ||||
|           %rv : Tensor = prim::Loop(%len, %9, %rv.1) | ||||
|             block0(%i : int, %13 : Tensor) { | ||||
|               %12 : bool = aten::lt(%i, %11) | ||||
|               %rv.4 : Tensor = prim::If(%12) | ||||
|                 block0() { | ||||
|                   %rv.2 : Tensor = aten::sub(%13, %14, %15) | ||||
|                   -> (%rv.2) | ||||
|                 } | ||||
|                 block1() { | ||||
|                   %rv.3 : Tensor = aten::add(%13, %14, %15) | ||||
|                   -> (%rv.3) | ||||
|                 } | ||||
|               -> (%9, %rv.4) | ||||
|             } | ||||
|           return (%rv); | ||||
|         } | ||||
|  | ||||
|  | ||||
|     Take the instruction ``%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)`` for | ||||
|     example. ``%rv.1 : Tensor`` means we assign the output to a (unique) | ||||
| @ -676,34 +675,39 @@ Automatic Trace Checking | ||||
|         traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs) | ||||
|  | ||||
|     Gives us the following diagnostic information:: | ||||
|  | ||||
|         ERROR: Graphs differed across invocations! | ||||
|         Graph diff: | ||||
|             graph(%0 : Dynamic) { | ||||
|                   %1 : int = prim::Constant[value=0]() | ||||
|                   %2 : int = prim::Constant[value=0]() | ||||
|                   %3 : Dynamic = aten::select(%0, %1, %2) | ||||
|                   %4 : int = prim::Constant[value=0]() | ||||
|                   %5 : int = prim::Constant[value=0]() | ||||
|                   %6 : Dynamic = aten::select(%0, %4, %5) | ||||
|                   %7 : Dynamic = aten::mul(%3, %6) | ||||
|                   %8 : int = prim::Constant[value=0]() | ||||
|                   %9 : int = prim::Constant[value=1]() | ||||
|                   %10 : Dynamic = aten::select(%0, %8, %9) | ||||
|                   %11 : Dynamic = aten::mul(%7, %10) | ||||
|                   %12 : int = prim::Constant[value=0]() | ||||
|                   %13 : int = prim::Constant[value=2]() | ||||
|                   %14 : Dynamic = aten::select(%0, %12, %13) | ||||
|                   %15 : Dynamic = aten::mul(%11, %14) | ||||
|               +   %16 : int = prim::Constant[value=0]() | ||||
|               +   %17 : int = prim::Constant[value=3]() | ||||
|               +   %18 : Dynamic = aten::select(%0, %16, %17) | ||||
|               +   %19 : Dynamic = aten::mul(%15, %18) | ||||
|               -   return (%15); | ||||
|               ?             ^ | ||||
|               +   return (%19); | ||||
|               ?             ^ | ||||
|             } | ||||
| 	ERROR: Graphs differed across invocations! | ||||
| 	Graph diff:: | ||||
|    | ||||
| 		  graph(%x : Tensor) { | ||||
| 		    %1 : int = prim::Constant[value=0]() | ||||
| 		    %2 : int = prim::Constant[value=0]() | ||||
| 		    %result.1 : Tensor = aten::select(%x, %1, %2) | ||||
| 		    %4 : int = prim::Constant[value=0]() | ||||
| 		    %5 : int = prim::Constant[value=0]() | ||||
| 		    %6 : Tensor = aten::select(%x, %4, %5) | ||||
| 		    %result.2 : Tensor = aten::mul(%result.1, %6) | ||||
| 		    %8 : int = prim::Constant[value=0]() | ||||
| 		    %9 : int = prim::Constant[value=1]() | ||||
| 		    %10 : Tensor = aten::select(%x, %8, %9) | ||||
| 		-   %result : Tensor = aten::mul(%result.2, %10) | ||||
| 		+   %result.3 : Tensor = aten::mul(%result.2, %10) | ||||
| 		?          ++ | ||||
| 		    %12 : int = prim::Constant[value=0]() | ||||
| 		    %13 : int = prim::Constant[value=2]() | ||||
| 		    %14 : Tensor = aten::select(%x, %12, %13) | ||||
| 		+   %result : Tensor = aten::mul(%result.3, %14) | ||||
| 		+   %16 : int = prim::Constant[value=0]() | ||||
| 		+   %17 : int = prim::Constant[value=3]() | ||||
| 		+   %18 : Tensor = aten::select(%x, %16, %17) | ||||
| 		-   %15 : Tensor = aten::mul(%result, %14) | ||||
| 		?     ^                                 ^ | ||||
| 		+   %19 : Tensor = aten::mul(%result, %18) | ||||
| 		?     ^                                 ^ | ||||
| 		-   return (%15); | ||||
| 		?             ^ | ||||
| 		+   return (%19); | ||||
| 		?             ^ | ||||
| 		  } | ||||
|  | ||||
|  | ||||
|     This message indicates to us that the computation differed between when | ||||
| @ -733,23 +737,19 @@ Automatic Trace Checking | ||||
|  | ||||
|     Which produces:: | ||||
|  | ||||
|         graph(%x : Dynamic) { | ||||
|           %1 : int = prim::Constant[value=0]() | ||||
|           %2 : int = prim::Constant[value=0]() | ||||
|           %result.1 : Dynamic = aten::select(%x, %2, %1) | ||||
|           %4 : int = aten::size(%x, %1) | ||||
|           %5 : int = prim::Constant[value=1]() | ||||
|           %result : Dynamic = prim::Loop(%4, %5, %result.1) | ||||
|             block0(%i : int, %7 : Dynamic) { | ||||
|               %9 : int = prim::Constant[value=0]() | ||||
|               %10 : Dynamic = aten::select(%x, %9, %i) | ||||
|               %result.2 : Dynamic = aten::mul(%7, %10) | ||||
|               %12 : int = prim::Constant[value=1]() | ||||
|               -> (%12, %result.2) | ||||
|             } | ||||
|           return (%result); | ||||
|         } | ||||
|  | ||||
|         graph(%x : Tensor) { | ||||
|           %5 : bool = prim::Constant[value=1]() | ||||
|           %1 : int = prim::Constant[value=0]() | ||||
|           %result.1 : Tensor = aten::select(%x, %1, %1) | ||||
|           %4 : int = aten::size(%x, %1) | ||||
|           %result : Tensor = prim::Loop(%4, %5, %result.1) | ||||
|             block0(%i : int, %7 : Tensor) { | ||||
|               %10 : Tensor = aten::select(%x, %1, %i) | ||||
|               %result.2 : Tensor = aten::mul(%7, %10) | ||||
|               -> (%5, %result.2) | ||||
|             } | ||||
|           return (%result); | ||||
|         } | ||||
|  | ||||
| Tracer Warnings | ||||
|     The tracer produces warnings for several problematic patterns in traced | ||||
| @ -789,14 +789,24 @@ Tracer Warnings | ||||
| Builtin Functions | ||||
| ~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| Torch Script supports a subset of the builtin tensor and neural network functions that | ||||
| PyTorch provides. Most methods on Tensor as well as functions in the ``torch`` | ||||
| namespace are available. Many functions in ``torch.nn.functional`` are also availiable. | ||||
| TorchScript supports a subset of the builtin tensor and neural network | ||||
| functions that PyTorch provides. Most methods on Tensor as well as functions in | ||||
| the ``torch`` namespace, all functions in ``torch.nn.functional`` and all | ||||
| modules from ``torch.nn`` are supported in TorchScript, excluding those in the | ||||
| table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`, | ||||
| as sketched below the table. | ||||
|  | ||||
| Unsupported ``torch.nn`` Modules  :: | ||||
|  | ||||
|     torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss | ||||
|     torch.nn.modules.normalization.CrossMapLRN2d | ||||
|     torch.nn.modules.fold.Fold | ||||
|     torch.nn.modules.fold.Unfold | ||||
|     torch.nn.modules.rnn.GRU | ||||
|     torch.nn.modules.rnn.LSTM | ||||
|     torch.nn.modules.rnn.RNN | ||||
|     torch.nn.modules.rnn.GRUCell | ||||
|     torch.nn.modules.rnn.LSTMCell | ||||
|     torch.nn.modules.rnn.RNNCell | ||||
|  | ||||
| We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or | ||||
| ``Conv`` module. This functionality is something that will be developed in the future. | ||||
| For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn`` | ||||
| modules into ScriptModules on construction. | ||||
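|  | ||||
| As a rough sketch of the tracing workaround suggested above (``Unfold`` is taken | ||||
| from the table; the shapes are only illustrative):: | ||||
|  | ||||
|     import torch | ||||
|  | ||||
|     unfold = torch.nn.Unfold(kernel_size=3) | ||||
|     example = torch.randn(1, 2, 8, 8) | ||||
|  | ||||
|     # Tracing records the operators run on the example input and returns a | ||||
|     # ScriptModule that behaves like the original module. | ||||
|     traced_unfold = torch.jit.trace(unfold, example) | ||||
|     print(traced_unfold(example).shape) | ||||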
|  | ||||
| .. automodule:: torch.jit.supported_ops | ||||
|  | ||||
| @ -338,6 +338,7 @@ view of a storage and defines numeric operations on it. | ||||
|    .. automethod:: reshape_as | ||||
|    .. automethod:: resize_ | ||||
|    .. automethod:: resize_as_ | ||||
|    .. automethod:: roll | ||||
|    .. automethod:: round | ||||
|    .. automethod:: round_ | ||||
|    .. automethod:: rsqrt | ||||
|  | ||||
| @ -269,6 +269,7 @@ Other Operations | ||||
| .. autofunction:: histc | ||||
| .. autofunction:: meshgrid | ||||
| .. autofunction:: renorm | ||||
| .. autofunction:: roll | ||||
| .. autofunction:: tensordot | ||||
| .. autofunction:: trace | ||||
| .. autofunction:: tril | ||||
|  | ||||
| @ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) | ||||
| file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) | ||||
|  | ||||
| if (BUILD_CAFFE2_OPS) | ||||
|   #cmake only check for separate OpenMP library on AppleClang 7+ | ||||
|   #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 | ||||
|   if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") | ||||
|     if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR | ||||
|         CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") | ||||
|       Set(OpenMP_link ${OpenMP_libomp_LIBRARY}) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   # Note(ilijar): Since Detectron ops currently have no | ||||
|   # CPU implementation, we only build GPU ops for now. | ||||
|   if (USE_CUDA) | ||||
| @ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS) | ||||
|         ${Detectron_CPU_SRCS} | ||||
|         ${Detectron_GPU_SRCS}) | ||||
|  | ||||
|     target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link}) | ||||
|     target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) | ||||
|     install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) | ||||
|   elseif(NOT IOS_PLATFORM) | ||||
|     add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) | ||||
|     target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link}) | ||||
|     target_link_libraries(caffe2_detectron_ops caffe2) | ||||
|     install(TARGETS caffe2_detectron_ops DESTINATION lib) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||

setup.py
| @ -124,6 +124,7 @@ | ||||
| #   LD_LIBRARY_PATH | ||||
| #     we will search for libraries in these paths | ||||
|  | ||||
| from __future__ import print_function | ||||
| from setuptools import setup, Extension, distutils, Command, find_packages | ||||
| import setuptools.command.build_ext | ||||
| import setuptools.command.install | ||||
| @ -144,86 +145,32 @@ import json | ||||
| import glob | ||||
| import importlib | ||||
|  | ||||
| from tools.setup_helpers.env import check_env_flag, check_negative_env_flag | ||||
|  | ||||
|  | ||||
| def hotpatch_var(var, prefix='USE_'): | ||||
|     if check_env_flag('NO_' + var): | ||||
|         os.environ[prefix + var] = '0' | ||||
|     elif check_negative_env_flag('NO_' + var): | ||||
|         os.environ[prefix + var] = '1' | ||||
|     elif check_env_flag('WITH_' + var): | ||||
|         os.environ[prefix + var] = '1' | ||||
|     elif check_negative_env_flag('WITH_' + var): | ||||
|         os.environ[prefix + var] = '0' | ||||
|  | ||||
| # Before we run the setup_helpers, let's look for NO_* and WITH_* | ||||
| # variables and hotpatch environment with the USE_* equivalent | ||||
| use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED', | ||||
|                 'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS'] | ||||
| list(map(hotpatch_var, use_env_vars)) | ||||
|  | ||||
| # Also hotpatch a few with BUILD_* equivalent | ||||
| build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS'] | ||||
| [hotpatch_var(v, 'BUILD_') for v in build_env_vars] | ||||
|  | ||||
| from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION | ||||
| from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST, | ||||
|                                        BUILD_CAFFE2_OPS, USE_LEVELDB, | ||||
|                                        USE_LMDB, USE_OPENCV, USE_FFMPEG) | ||||
| from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION | ||||
| from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY, | ||||
|                                        CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) | ||||
| from tools.setup_helpers.fbgemm import USE_FBGEMM | ||||
| from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY, | ||||
|                                         MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR) | ||||
| from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \ | ||||
|     NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB | ||||
| from tools.setup_helpers.nnpack import USE_NNPACK | ||||
| from tools.setup_helpers.qnnpack import USE_QNNPACK | ||||
| from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME | ||||
| # If you want to modify flags or environmental variables that is set when | ||||
| # building torch, you should do it in tools/setup_helpers/configure.py. | ||||
| # Please don't add it here unless it's only used in PyTorch. | ||||
| from tools.setup_helpers.configure import * | ||||
| from tools.setup_helpers.generate_code import generate_code | ||||
| from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext | ||||
| from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \ | ||||
|     USE_GLOO_IBVERBS | ||||
|  | ||||
| ################################################################################ | ||||
| # Parameters parsed from environment | ||||
| ################################################################################ | ||||
|  | ||||
| DEBUG = check_env_flag('DEBUG') | ||||
| REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO') | ||||
| IS_WINDOWS = (platform.system() == 'Windows') | ||||
| IS_DARWIN = (platform.system() == 'Darwin') | ||||
| IS_LINUX = (platform.system() == 'Linux') | ||||
| IS_PPC = (platform.machine() == 'ppc64le') | ||||
| IS_ARM = (platform.machine() == 'aarch64') | ||||
| VERBOSE_SCRIPT = True | ||||
| # see if the user passed a quiet flag to setup.py arguments and respect | ||||
| # that in our parts of the build | ||||
| for arg in sys.argv: | ||||
|     if arg == "--": | ||||
|         break | ||||
|     if arg == '-q' or arg == '--quiet': | ||||
|         VERBOSE_SCRIPT = False | ||||
|  | ||||
| BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') | ||||
| # ppc64le and aarch64 do not support MKLDNN | ||||
| if IS_PPC or IS_ARM: | ||||
|     USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF') | ||||
| if VERBOSE_SCRIPT: | ||||
|     def report(*args): | ||||
|         print(*args) | ||||
| else: | ||||
|     USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON') | ||||
|  | ||||
| USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK') | ||||
| RERUN_CMAKE = True | ||||
|  | ||||
| NUM_JOBS = multiprocessing.cpu_count() | ||||
| max_jobs = os.getenv("MAX_JOBS") | ||||
| if max_jobs is not None: | ||||
|     NUM_JOBS = min(NUM_JOBS, int(max_jobs)) | ||||
|  | ||||
| ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE") | ||||
| if not ONNX_NAMESPACE: | ||||
|     ONNX_NAMESPACE = "onnx_torch" | ||||
|  | ||||
| # Ninja | ||||
| try: | ||||
|     import ninja | ||||
|     USE_NINJA = True | ||||
| except ImportError: | ||||
|     USE_NINJA = False | ||||
|     def report(*args): | ||||
|         pass | ||||
|  | ||||
| # Constant known variables used throughout this file | ||||
| cwd = os.path.dirname(os.path.abspath(__file__)) | ||||
| @ -323,8 +270,9 @@ def build_libs(libs): | ||||
|         build_libs_cmd = ['tools\\build_pytorch_libs.bat'] | ||||
|     else: | ||||
|         build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')] | ||||
|     my_env = os.environ.copy() | ||||
|     my_env["PYTORCH_PYTHON"] = sys.executable | ||||
|  | ||||
|     my_env, extra_flags = get_pytorch_env_with_flags() | ||||
|     build_libs_cmd.extend(extra_flags) | ||||
|     my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library | ||||
|     my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir | ||||
|     my_env["PYTORCH_BUILD_VERSION"] = version | ||||
| @ -334,64 +282,8 @@ def build_libs(libs): | ||||
|         cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path | ||||
|     my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path | ||||
|  | ||||
|     my_env["NUM_JOBS"] = str(NUM_JOBS) | ||||
|     my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE | ||||
|     if not IS_WINDOWS: | ||||
|         if USE_NINJA: | ||||
|             my_env["CMAKE_GENERATOR"] = '-GNinja' | ||||
|             my_env["CMAKE_INSTALL"] = 'ninja install' | ||||
|         else: | ||||
|             my_env['CMAKE_GENERATOR'] = '' | ||||
|             my_env['CMAKE_INSTALL'] = 'make install' | ||||
|     if USE_SYSTEM_NCCL: | ||||
|         my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR | ||||
|         my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR | ||||
|         my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB | ||||
|     if USE_CUDA: | ||||
|         my_env["CUDA_BIN_PATH"] = CUDA_HOME | ||||
|         build_libs_cmd += ['--use-cuda'] | ||||
|         if IS_WINDOWS: | ||||
|             my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME | ||||
|     if USE_CUDA_STATIC_LINK: | ||||
|         build_libs_cmd += ['--cuda-static-link'] | ||||
|     if USE_FBGEMM: | ||||
|         build_libs_cmd += ['--use-fbgemm'] | ||||
|     if USE_ROCM: | ||||
|         build_libs_cmd += ['--use-rocm'] | ||||
|     if USE_NNPACK: | ||||
|         build_libs_cmd += ['--use-nnpack'] | ||||
|     if USE_NUMPY: | ||||
|         my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR | ||||
|     if USE_CUDNN: | ||||
|         my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR | ||||
|         my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY | ||||
|         my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR | ||||
|     if USE_MIOPEN: | ||||
|         my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR | ||||
|         my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY | ||||
|         my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR | ||||
|     if USE_MKLDNN: | ||||
|         build_libs_cmd += ['--use-mkldnn'] | ||||
|     if USE_QNNPACK: | ||||
|         build_libs_cmd += ['--use-qnnpack'] | ||||
|     if USE_GLOO_IBVERBS: | ||||
|         build_libs_cmd += ['--use-gloo-ibverbs'] | ||||
|     if not RERUN_CMAKE: | ||||
|         build_libs_cmd += ['--dont-rerun-cmake'] | ||||
|  | ||||
|     my_env["BUILD_TORCH"] = "ON" | ||||
|     my_env["BUILD_PYTHON"] = "ON" | ||||
|     my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF" | ||||
|     my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF" | ||||
|     my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF" | ||||
|     my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF" | ||||
|     my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF" | ||||
|     my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF" | ||||
|     my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF" | ||||
|     my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF" | ||||
|     my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF" | ||||
|     my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF" | ||||
|  | ||||
|     if VERBOSE_SCRIPT: | ||||
|         my_env['VERBOSE_SCRIPT'] = '1' | ||||
|     try: | ||||
|         os.mkdir('build') | ||||
|     except OSError: | ||||
| @ -660,6 +552,16 @@ class build_ext(build_ext_parent): | ||||
|         return outputs | ||||
|  | ||||
|  | ||||
| # this is a subclass of build just to get access to self.build_lib | ||||
| # as there does not seem to be a utility function for getting this | ||||
| class create_pyi(distutils.command.build.build): | ||||
|     def run(self): | ||||
|         print("-- Building .pyi --") | ||||
|         if sys.version_info[0] == 3: | ||||
|             from tools.pyi.gen_pyi import gen_pyi | ||||
|             gen_pyi(self.build_lib) | ||||
|  | ||||
|  | ||||
| class build(distutils.command.build.build): | ||||
|     sub_commands = [ | ||||
|         ('build_deps', lambda self: True), | ||||
| @ -914,6 +816,7 @@ if USE_CUDA: | ||||
|  | ||||
| cmdclass = { | ||||
|     'create_version_file': create_version_file, | ||||
|     'create_pyi': create_pyi, | ||||
|     'build': build, | ||||
|     'build_py': build_py, | ||||
|     'build_ext': build_ext, | ||||
| @ -946,6 +849,7 @@ if __name__ == '__main__': | ||||
|         entry_points=entry_points, | ||||
|         package_data={ | ||||
|             'torch': [ | ||||
|                 '__init__.pyi', | ||||
|                 'lib/*.so*', | ||||
|                 'lib/*.dylib*', | ||||
|                 'lib/*.dll', | ||||
|  | ||||
| @ -458,6 +458,10 @@ method_tests = [ | ||||
|      NO_ARGS, [skipIfNoLapack]), | ||||
|     ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2", | ||||
|      NO_ARGS, [skipIfNoLapack]), | ||||
|     ('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"), | ||||
|     ('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"), | ||||
|     ('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"), | ||||
|     ('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"), | ||||
|     ('addcmul', (S, S), ((S, S), (S, S))), | ||||
|     ('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'), | ||||
|     ('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'), | ||||
| @ -560,8 +564,14 @@ method_tests = [ | ||||
|     ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'), | ||||
|     ('tril', (M, M), NO_ARGS), | ||||
|     ('tril', (M, M), (2,), 'idx'), | ||||
|     ('tril', (S, M, M), NO_ARGS, 'batched'), | ||||
|     ('tril', (S, M, M), (2,), 'batched_idx'), | ||||
|     ('tril', (3, 3, S, S), NO_ARGS, 'more_batched'), | ||||
|     ('triu', (M, M), NO_ARGS), | ||||
|     ('triu', (M, M), (2,), 'idx'), | ||||
|     ('triu', (S, M, M), NO_ARGS, 'batched'), | ||||
|     ('triu', (S, M, M), (2,), 'batched_idx'), | ||||
|     ('triu', (3, 3, S, S), NO_ARGS, 'more_batched'), | ||||
|     ('trace', (M, M), NO_ARGS), | ||||
|     ('cross', (S, 3), ((S, 3),)), | ||||
|     ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'), | ||||
|  | ||||
| @ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs): | ||||
|         return torch.stack(all_matrices).reshape(*(batches + (l, l))) | ||||
|  | ||||
|  | ||||
| def brute_pdist(inp, p=2): | ||||
|     """Computes the same as torch.pdist using primitives""" | ||||
|     n = inp.shape[-2] | ||||
|     k = n * (n - 1) // 2 | ||||
|     if k == 0: | ||||
|         # torch complains about empty indices | ||||
|         return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device) | ||||
|     square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1) | ||||
|     unroll = square.view(square.shape[:-2] + (n * n,)) | ||||
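|     # "inds" holds the step between consecutive flattened upper-triangular | ||||
|     # indices (1 within a row, a larger jump when wrapping to the next row), so | ||||
|     # its cumulative sum selects exactly the strict upper triangle of the | ||||
|     # n x n distance matrix, i.e. the pdist entries. | ||||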
|     inds = torch.ones(k, dtype=torch.int) | ||||
|     inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int) | ||||
|     return unroll[..., inds.cumsum(0)] | ||||
|  | ||||
|  | ||||
| def do_test_dtypes(self, dtypes, layout, device): | ||||
|     for dtype in dtypes: | ||||
|         if dtype != torch.float16: | ||||
|  | ||||
| @ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) { | ||||
|   ASSERT_EQ(batch[1].target, "2"); | ||||
| } | ||||
|  | ||||
| struct DummyTensorDataset | ||||
|     : datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> { | ||||
|   Example<torch::Tensor, int> get(size_t index) override { | ||||
|     const auto channels = static_cast<int64_t>(index); | ||||
|     torch::Tensor tensor = | ||||
|         (channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4}); | ||||
|     return {tensor, static_cast<int>(channels)}; | ||||
|   } | ||||
|  | ||||
|   torch::optional<size_t> size() const override { | ||||
|     return 100; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| TEST(DataTest, NormalizeTransform) { | ||||
|   auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1)); | ||||
|  | ||||
|   // Works for zero (one implicit) channels | ||||
|   std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0); | ||||
|   ASSERT_EQ(output.size(), 1); | ||||
|   // (1 - 0.5) / 0.1 = 5 | ||||
|   ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5)) | ||||
|       << output[0].data; | ||||
|  | ||||
|   // Works for one explicit channel | ||||
|   output = dataset.get_batch(1); | ||||
|   ASSERT_EQ(output.size(), 1); | ||||
|   ASSERT_EQ(output[0].data.size(0), 1); | ||||
|   ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5)) | ||||
|       << output[0].data; | ||||
|  | ||||
|   // Works for two channels with different moments | ||||
|   dataset = DummyTensorDataset().map( | ||||
|       transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2})); | ||||
|   output = dataset.get_batch(2); | ||||
|   ASSERT_EQ(output.size(), 1); | ||||
|   ASSERT_EQ(output[0].data.size(0), 2); | ||||
|   ASSERT_TRUE(output[0] | ||||
|                   .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1) | ||||
|                   .allclose(torch::ones({1, 4, 4}) * 5)) | ||||
|       << output[0].data; | ||||
|   ASSERT_TRUE(output[0] | ||||
|                   .data.slice(/*dim=*/0, /*start=*/1) | ||||
|                   .allclose(torch::ones({1, 4, 4}) * -2.5)) | ||||
|       << output[0].data; | ||||
|  | ||||
|   // Works for three channels with one moment value | ||||
|   dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2)); | ||||
|   output = dataset.get_batch(3); | ||||
|   ASSERT_EQ(output.size(), 1); | ||||
|   ASSERT_EQ(output[0].data.size(0), 3); | ||||
|   ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5)) | ||||
|       << output[0].data; | ||||
|  | ||||
|   // Works for three channels with different moments | ||||
|   dataset = DummyTensorDataset().map( | ||||
|       transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2})); | ||||
|   output = dataset.get_batch(3); | ||||
|   ASSERT_EQ(output.size(), 1); | ||||
|   ASSERT_EQ(output[0].data.size(0), 3); | ||||
|   ASSERT_TRUE(output[0] | ||||
|                   .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1) | ||||
|                   .allclose(torch::ones({1, 4, 4}) * 5)) | ||||
|       << output[0].data; | ||||
|   ASSERT_TRUE(output[0] | ||||
|                   .data.slice(/*dim=*/0, /*start=*/1, /*end=*/2) | ||||
|                   .allclose(torch::ones({1, 4, 4}) * -2.5)) | ||||
|       << output[0].data; | ||||
|   ASSERT_TRUE(output[0] | ||||
|                   .data.slice(/*dim=*/0, /*start=*/2) | ||||
|                   .allclose(torch::ones({1, 4, 4}) * 12.5)) | ||||
|       << output[0].data; | ||||
| } | ||||
|  | ||||
| struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> { | ||||
|   UnCopyableDataset() = default; | ||||
|  | ||||
|  | ||||
| @ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) { | ||||
| TEST_F(ModuleTest, ZeroGrad) { | ||||
|   Linear module(3, 4); | ||||
|   auto weight = torch::ones({8, 3}, torch::requires_grad()); | ||||
|   auto loss = module->forward(weight).sum(); | ||||
|   auto loss = module(weight).sum(); | ||||
|   loss.backward(); | ||||
|   for (auto& parameter : module->parameters()) { | ||||
|     auto grad = parameter.grad(); | ||||
| @ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) { | ||||
|     ASSERT_NO_THROW(module->modules()); | ||||
|   } | ||||
| } | ||||
|  | ||||
| struct ModuleWithNonTensorForwardImpl : torch::nn::Module { | ||||
|   int64_t forward(torch::Tensor x) { | ||||
|     return x.numel(); | ||||
|   } | ||||
| }; | ||||
| TORCH_MODULE(ModuleWithNonTensorForward); | ||||
|  | ||||
| TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) { | ||||
|   ModuleWithNonTensorForward m; | ||||
|   ASSERT_EQ(m(torch::ones(123)), 123); | ||||
| } | ||||
|  | ||||
| @ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {}; | ||||
| TEST_F(ModulesTest, Conv1d) { | ||||
|   Conv1d model(Conv1dOptions(3, 2, 3).stride(2)); | ||||
|   auto x = torch::randn({2, 3, 5}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) { | ||||
| TEST_F(ModulesTest, Conv2dEven) { | ||||
|   Conv2d model(Conv2dOptions(3, 2, 3).stride(2)); | ||||
|   auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) { | ||||
| TEST_F(ModulesTest, Conv2dUneven) { | ||||
|   Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2})); | ||||
|   auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) { | ||||
| TEST_F(ModulesTest, Conv3d) { | ||||
|   Conv3d model(Conv3dOptions(3, 2, 3).stride(2)); | ||||
|   auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) { | ||||
| TEST_F(ModulesTest, Linear) { | ||||
|   Linear model(5, 2); | ||||
|   auto x = torch::randn({10, 5}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) { | ||||
|   auto l3 = model->add(Linear(5, 100), "l3"); | ||||
|  | ||||
|   auto x = torch::randn({1000, 10}, torch::requires_grad()); | ||||
|   x = l1->forward(x).clamp_min(0); | ||||
|   x = l2->forward(x).clamp_min(0); | ||||
|   x = l3->forward(x).clamp_min(0); | ||||
|   x = l1(x).clamp_min(0); | ||||
|   x = l2(x).clamp_min(0); | ||||
|   x = l3(x).clamp_min(0); | ||||
|  | ||||
|   x.backward(); | ||||
|   ASSERT_EQ(x.ndimension(), 2); | ||||
| @ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) { | ||||
|   // Cannot get gradients to change indices (input) - only for embedding | ||||
|   // params | ||||
|   auto x = torch::full({10}, dict_size - 1, torch::kInt64); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) { | ||||
| TEST_F(ModulesTest, EmbeddingList) { | ||||
|   Embedding model(6, 4); | ||||
|   auto x = torch::full({2, 3}, 5, torch::kInt64); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) { | ||||
| TEST_F(ModulesTest, Dropout) { | ||||
|   Dropout dropout(0.5); | ||||
|   torch::Tensor x = torch::ones(100, torch::requires_grad()); | ||||
|   torch::Tensor y = dropout->forward(x); | ||||
|   torch::Tensor y = dropout(x); | ||||
|  | ||||
|   y.backward(); | ||||
|   ASSERT_EQ(y.ndimension(), 1); | ||||
| @ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) { | ||||
|   ASSERT_GT(y.sum().item<float>(), 70); // Probably | ||||
|  | ||||
|   dropout->eval(); | ||||
|   y = dropout->forward(x); | ||||
|   y = dropout(x); | ||||
|   ASSERT_EQ(y.sum().item<float>(), 100); | ||||
| } | ||||
|  | ||||
| @ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) { | ||||
|     was_called = true; | ||||
|     return input; | ||||
|   }); | ||||
|   auto output = functional->forward(torch::ones(5, torch::requires_grad())); | ||||
|   auto output = functional(torch::ones(5, torch::requires_grad())); | ||||
|   ASSERT_TRUE(was_called); | ||||
|   ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad()))); | ||||
|  | ||||
| @ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) { | ||||
|   ASSERT_FALSE(bn->bias.defined()); | ||||
|  | ||||
|   ASSERT_THROWS_WITH( | ||||
|       bn->forward(torch::ones({2, 5})), | ||||
|       bn(torch::ones({2, 5})), | ||||
|       "Calling BatchNorm::forward is only permitted " | ||||
|       "when the 'stateful' option is true (was false). " | ||||
|       "Use BatchNorm::pure_forward instead."); | ||||
| @ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) { | ||||
|   model->to(torch::kCUDA); | ||||
|   auto x = | ||||
|       torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true)); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
| @ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) { | ||||
|   model->to(torch::kCUDA); | ||||
|   model->to(torch::kCPU); | ||||
|   auto x = torch::randn({10, 5}, torch::requires_grad()); | ||||
|   auto y = model->forward(x); | ||||
|   auto y = model(x); | ||||
|   torch::Tensor s = y.sum(); | ||||
|  | ||||
|   s.backward(); | ||||
|  | ||||
| @ -215,7 +215,9 @@ TEST(SerializeTest, Optim) { | ||||
| TEST(SerializeTest, XOR_CUDA) { | ||||
|   torch::manual_seed(0); | ||||
|   // We better be able to save and load a XOR model! | ||||
|   auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) { | ||||
|   auto getLoss = [](Sequential model, | ||||
|                     uint32_t batch_size, | ||||
|                     bool is_cuda = false) { | ||||
|     auto inputs = torch::empty({batch_size, 2}); | ||||
|     auto labels = torch::empty({batch_size}); | ||||
|     if (is_cuda) { | ||||
| @ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) { | ||||
|   loss = getLoss(model3, 100, true); | ||||
|   ASSERT_LT(loss.item<float>(), 0.1); | ||||
| } | ||||
|  | ||||
| TEST( | ||||
|     SerializeTest, | ||||
|     CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) { | ||||
|   struct C : torch::nn::Module { | ||||
|     C() { | ||||
|       register_buffer("foo", torch::ones(5, torch::kInt32)); | ||||
|     } | ||||
|   }; | ||||
|   struct B : torch::nn::Module {}; | ||||
|   struct A : torch::nn::Module { | ||||
|     A() { | ||||
|       register_module("b", std::make_shared<B>()); | ||||
|       register_module("c", std::make_shared<C>()); | ||||
|     } | ||||
|   }; | ||||
|   struct M : torch::nn::Module { | ||||
|     M() { | ||||
|       register_module("a", std::make_shared<A>()); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   auto out = std::make_shared<M>(); | ||||
|   std::stringstream ss; | ||||
|   torch::save(out, ss); | ||||
|   auto in = std::make_shared<M>(); | ||||
|   torch::load(in, ss); | ||||
|  | ||||
|   const int output = in->named_buffers()["a.c.foo"].sum().item<int>(); | ||||
|   ASSERT_EQ(output, 5); | ||||
| } | ||||
|  | ||||
| @ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) { | ||||
|   ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>()); | ||||
| } | ||||
|  | ||||
| struct A : torch::nn::Module { | ||||
|   int forward() { | ||||
|     return 5; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| struct B : torch::nn::Module { | ||||
|   std::string forward(torch::Tensor tensor) { | ||||
|     return ""; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| struct C : torch::nn::Module { | ||||
|   float forward(torch::Tensor& tensor) { | ||||
|     return 5.0; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| struct D : torch::nn::Module { | ||||
|   char forward(torch::Tensor&& tensor) { | ||||
|     return 'x'; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| struct E : torch::nn::Module {}; | ||||
|  | ||||
| // Put in a function because macros don't handle the comma between arguments to | ||||
| // is_same well ... | ||||
| template <typename Module, typename ExpectedType, typename... Args> | ||||
| void assert_has_expected_type() { | ||||
|   using ReturnType = | ||||
|       typename torch::detail::return_type_of_forward<Module, Args...>::type; | ||||
|   constexpr bool is_expected_type = | ||||
|       std::is_same<ReturnType, ExpectedType>::value; | ||||
|   ASSERT_TRUE(is_expected_type) << Module().name(); | ||||
| } | ||||
|  | ||||
| TEST(TestStatic, ReturnTypeOfForward) { | ||||
|   assert_has_expected_type<A, int>(); | ||||
|   assert_has_expected_type<B, std::string, torch::Tensor>(); | ||||
|   assert_has_expected_type<C, float, torch::Tensor&>(); | ||||
|   assert_has_expected_type<D, char, torch::Tensor&&>(); | ||||
|   assert_has_expected_type<E, void>(); | ||||
| } | ||||
|  | ||||
| TEST(TestStatic, Apply) { | ||||
|   std::vector<int> v; | ||||
|   torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); | ||||
|  | ||||
| @@ -10,12 +10,13 @@ graph(%x.1_data : Tensor | ||||
|   %x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims) | ||||
|     block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) { | ||||
|       %16 : Long() = prim::NumToTensor(%6) | ||||
|       %alpha : float = prim::TensorToNum(%16) | ||||
|       %alpha : float = prim::Float(%16) | ||||
|       %data.1 : Tensor = aten::add(%5_data, %y_data, %alpha) | ||||
|       %mask : Tensor = aten::mul(%5_mask, %y_mask) | ||||
|       %dims : Tensor = aten::__or__(%5_dims, %y_dims) | ||||
|       %data : Tensor = aten::where(%mask, %data.1, %5_data) | ||||
|       -> (%7, %data, %mask, %dims) | ||||
|     } | ||||
|   return (%x, %10, %11); | ||||
|   %22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11) | ||||
|   return (%22); | ||||
| } | ||||
|  | ||||
| @@ -7,33 +7,31 @@ graph(%a.1_data : Tensor | ||||
|   %6 : int = prim::Constant[value=1]() | ||||
|   %7 : Tensor = aten::gt(%a.1_data, %b_data) | ||||
|   %8 : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %9 : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %10 : bool = prim::TensorToBool(%7) | ||||
|   %11 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha.1 : float = prim::TensorToNum(%11) | ||||
|   %9 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha.1 : float = prim::Float(%9) | ||||
|   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1) | ||||
|   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %16 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::TensorToNum(%16) | ||||
|   %14 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::Float(%14) | ||||
|   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha) | ||||
|   %mask : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %21 : bool = prim::Constant[value=1]() | ||||
|   %22 : int = prim::Constant[value=1]() | ||||
|   %23 : Tensor = aten::type_as(%8, %7) | ||||
|   %data.2 : Tensor = aten::mul(%7, %23) | ||||
|   %25 : int = aten::dim(%data.2) | ||||
|   %26 : bool = aten::eq(%25, %22) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26) | ||||
|   %19 : bool = prim::Constant[value=1]() | ||||
|   %20 : int = prim::Constant[value=1]() | ||||
|   %21 : Tensor = aten::type_as(%8, %7) | ||||
|   %data.2 : Tensor = aten::mul(%7, %21) | ||||
|   %23 : int = aten::dim(%data.2) | ||||
|   %24 : bool = aten::eq(%23, %20) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%24) | ||||
|     block0() { | ||||
|       %29 : int = aten::dim(%data.1) | ||||
|       %30 : int = aten::sub(%29, %22) | ||||
|       %data.4 : Tensor = prim::Loop(%30, %21, %data.2) | ||||
|         block0(%32 : int, %33 : Tensor) { | ||||
|           %34 : int = aten::dim(%33) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%33, %34) | ||||
|           -> (%21, %data.3) | ||||
|       %27 : int = aten::dim(%data.1) | ||||
|       %28 : int = aten::sub(%27, %20) | ||||
|       %data.4 : Tensor = prim::Loop(%28, %19, %data.2) | ||||
|         block0(%30 : int, %31 : Tensor) { | ||||
|           %32 : int = aten::dim(%31) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%31, %32) | ||||
|           -> (%19, %data.3) | ||||
|         } | ||||
|       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1) | ||||
|       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1) | ||||
| @@ -45,5 +43,6 @@ graph(%a.1_data : Tensor | ||||
|   %res_data : Tensor = aten::where(%cond_data, %data.1, %data) | ||||
|   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask) | ||||
|   %res_dims : Tensor = aten::__or__(%dims.1, %dims) | ||||
|   return (%res_data, %res_mask, %res_dims); | ||||
|   %39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims) | ||||
|   return (%39); | ||||
| } | ||||
|  | ||||
| @@ -7,34 +7,33 @@ graph(%a.1_data : Tensor | ||||
|   %6 : int = prim::Constant[value=1]() | ||||
|   %7 : float = prim::Constant[value=0.1]() | ||||
|   %8 : Float() = prim::NumToTensor(%7) | ||||
|   %other : float = prim::TensorToNum(%8) | ||||
|   %other : float = prim::Float(%8) | ||||
|   %10 : Tensor = aten::gt(%a.1_data, %other) | ||||
|   %11 : bool = prim::TensorToBool(%10) | ||||
|   %12 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha.1 : float = prim::TensorToNum(%12) | ||||
|   %11 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha.1 : float = prim::Float(%11) | ||||
|   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1) | ||||
|   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %17 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::TensorToNum(%17) | ||||
|   %16 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::Float(%16) | ||||
|   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha) | ||||
|   %mask : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %22 : bool = prim::Constant[value=1]() | ||||
|   %23 : int = prim::Constant[value=1]() | ||||
|   %24 : Tensor = aten::type_as(%a.1_mask, %10) | ||||
|   %data.2 : Tensor = aten::mul(%10, %24) | ||||
|   %26 : int = aten::dim(%data.2) | ||||
|   %27 : bool = aten::eq(%26, %23) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%27) | ||||
|   %21 : bool = prim::Constant[value=1]() | ||||
|   %22 : int = prim::Constant[value=1]() | ||||
|   %23 : Tensor = aten::type_as(%a.1_mask, %10) | ||||
|   %data.2 : Tensor = aten::mul(%10, %23) | ||||
|   %25 : int = aten::dim(%data.2) | ||||
|   %26 : bool = aten::eq(%25, %22) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26) | ||||
|     block0() { | ||||
|       %30 : int = aten::dim(%data.1) | ||||
|       %31 : int = aten::sub(%30, %23) | ||||
|       %data.4 : Tensor = prim::Loop(%31, %22, %data.2) | ||||
|         block0(%33 : int, %34 : Tensor) { | ||||
|           %35 : int = aten::dim(%34) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%34, %35) | ||||
|           -> (%22, %data.3) | ||||
|       %29 : int = aten::dim(%data.1) | ||||
|       %30 : int = aten::sub(%29, %22) | ||||
|       %data.4 : Tensor = prim::Loop(%30, %21, %data.2) | ||||
|         block0(%32 : int, %33 : Tensor) { | ||||
|           %34 : int = aten::dim(%33) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%33, %34) | ||||
|           -> (%21, %data.3) | ||||
|         } | ||||
|       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1) | ||||
|       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1) | ||||
| @@ -46,5 +45,6 @@ graph(%a.1_data : Tensor | ||||
|   %res_data : Tensor = aten::where(%cond_data, %data.1, %data) | ||||
|   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask) | ||||
|   %res_dims : Tensor = aten::__or__(%dims.1, %dims) | ||||
|   return (%res_data, %res_mask, %res_dims); | ||||
|   %41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims) | ||||
|   return (%41); | ||||
| } | ||||
|  | ||||
| @@ -7,28 +7,26 @@ graph(%a.1_data : Tensor | ||||
|   %6 : int = prim::Constant[value=1]() | ||||
|   %7 : Tensor = aten::gt(%a.1_data, %b_data) | ||||
|   %8 : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %9 : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %10 : bool = prim::TensorToBool(%7) | ||||
|   %11 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::TensorToNum(%11) | ||||
|   %9 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::Float(%9) | ||||
|   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha) | ||||
|   %mask : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %16 : bool = prim::Constant[value=1]() | ||||
|   %17 : int = prim::Constant[value=1]() | ||||
|   %18 : Tensor = aten::type_as(%8, %7) | ||||
|   %data.2 : Tensor = aten::mul(%7, %18) | ||||
|   %20 : int = aten::dim(%data.2) | ||||
|   %21 : bool = aten::eq(%20, %17) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21) | ||||
|   %14 : bool = prim::Constant[value=1]() | ||||
|   %15 : int = prim::Constant[value=1]() | ||||
|   %16 : Tensor = aten::type_as(%8, %7) | ||||
|   %data.2 : Tensor = aten::mul(%7, %16) | ||||
|   %18 : int = aten::dim(%data.2) | ||||
|   %19 : bool = aten::eq(%18, %15) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%19) | ||||
|     block0() { | ||||
|       %24 : int = aten::dim(%data) | ||||
|       %25 : int = aten::sub(%24, %17) | ||||
|       %data.4 : Tensor = prim::Loop(%25, %16, %data.2) | ||||
|         block0(%27 : int, %28 : Tensor) { | ||||
|           %29 : int = aten::dim(%28) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%28, %29) | ||||
|           -> (%16, %data.3) | ||||
|       %22 : int = aten::dim(%data) | ||||
|       %23 : int = aten::sub(%22, %15) | ||||
|       %data.4 : Tensor = prim::Loop(%23, %14, %data.2) | ||||
|         block0(%25 : int, %26 : Tensor) { | ||||
|           %27 : int = aten::dim(%26) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%26, %27) | ||||
|           -> (%14, %data.3) | ||||
|         } | ||||
|       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data) | ||||
|       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask) | ||||
| @@ -40,5 +38,6 @@ graph(%a.1_data : Tensor | ||||
|   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data) | ||||
|   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask) | ||||
|   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims) | ||||
|   return (%res_data, %res_mask, %res_dims); | ||||
|   %34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims) | ||||
|   return (%34); | ||||
| } | ||||
|  | ||||
| @@ -7,29 +7,28 @@ graph(%a.1_data : Tensor | ||||
|   %6 : int = prim::Constant[value=1]() | ||||
|   %7 : float = prim::Constant[value=0.1]() | ||||
|   %8 : Float() = prim::NumToTensor(%7) | ||||
|   %other : float = prim::TensorToNum(%8) | ||||
|   %other : float = prim::Float(%8) | ||||
|   %10 : Tensor = aten::gt(%a.1_data, %other) | ||||
|   %11 : bool = prim::TensorToBool(%10) | ||||
|   %12 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::TensorToNum(%12) | ||||
|   %11 : Long() = prim::NumToTensor(%6) | ||||
|   %alpha : float = prim::Float(%11) | ||||
|   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha) | ||||
|   %mask : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %17 : bool = prim::Constant[value=1]() | ||||
|   %18 : int = prim::Constant[value=1]() | ||||
|   %19 : Tensor = aten::type_as(%a.1_mask, %10) | ||||
|   %data.2 : Tensor = aten::mul(%10, %19) | ||||
|   %21 : int = aten::dim(%data.2) | ||||
|   %22 : bool = aten::eq(%21, %18) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%22) | ||||
|   %16 : bool = prim::Constant[value=1]() | ||||
|   %17 : int = prim::Constant[value=1]() | ||||
|   %18 : Tensor = aten::type_as(%a.1_mask, %10) | ||||
|   %data.2 : Tensor = aten::mul(%10, %18) | ||||
|   %20 : int = aten::dim(%data.2) | ||||
|   %21 : bool = aten::eq(%20, %17) | ||||
|   %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21) | ||||
|     block0() { | ||||
|       %25 : int = aten::dim(%data) | ||||
|       %26 : int = aten::sub(%25, %18) | ||||
|       %data.4 : Tensor = prim::Loop(%26, %17, %data.2) | ||||
|         block0(%28 : int, %29 : Tensor) { | ||||
|           %30 : int = aten::dim(%29) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%29, %30) | ||||
|           -> (%17, %data.3) | ||||
|       %24 : int = aten::dim(%data) | ||||
|       %25 : int = aten::sub(%24, %17) | ||||
|       %data.4 : Tensor = prim::Loop(%25, %16, %data.2) | ||||
|         block0(%27 : int, %28 : Tensor) { | ||||
|           %29 : int = aten::dim(%28) | ||||
|           %data.3 : Tensor = aten::unsqueeze(%28, %29) | ||||
|           -> (%16, %data.3) | ||||
|         } | ||||
|       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data) | ||||
|       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask) | ||||
| @@ -41,5 +40,6 @@ graph(%a.1_data : Tensor | ||||
|   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data) | ||||
|   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask) | ||||
|   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims) | ||||
|   return (%res_data, %res_mask, %res_dims); | ||||
|   %36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims) | ||||
|   return (%36); | ||||
| } | ||||
|  | ||||
| @@ -9,38 +9,35 @@ graph(%a.1_data : Tensor | ||||
|   %8 : Tensor = aten::gt(%a.1_data, %b_data) | ||||
|   %9 : Tensor = aten::mul(%a.1_mask, %b_mask) | ||||
|   %10 : Tensor = aten::__or__(%a.1_dims, %b_dims) | ||||
|   %11 : bool = prim::TensorToBool(%8) | ||||
|   %12 : int = prim::Constant[value=0]() | ||||
|   %13 : Tensor = aten::mul(%8, %9) | ||||
|   %14 : Tensor = aten::sum(%13) | ||||
|   %15 : Tensor = aten::gt(%14, %12) | ||||
|   %16 : bool = prim::TensorToBool(%15) | ||||
|   %17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims) | ||||
|     block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) { | ||||
|       %30 : Long() = prim::NumToTensor(%6) | ||||
|       %alpha : float = prim::TensorToNum(%30) | ||||
|   %11 : int = prim::Constant[value=0]() | ||||
|   %12 : Tensor = aten::mul(%8, %9) | ||||
|   %13 : Tensor = aten::sum(%12) | ||||
|   %14 : Tensor = aten::gt(%13, %11) | ||||
|   %15 : bool = prim::Bool(%14) | ||||
|   %16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) | ||||
|     block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) { | ||||
|       %27 : Long() = prim::NumToTensor(%6) | ||||
|       %alpha : float = prim::Float(%27) | ||||
|       %data : Tensor = aten::sub(%6_data, %b_data, %alpha) | ||||
|       %mask : Tensor = aten::mul(%6_mask, %b_mask) | ||||
|       %dims : Tensor = aten::__or__(%6_dims, %b_dims) | ||||
|       %35 : Tensor = aten::gt(%data, %b_data) | ||||
|       %36 : Tensor = aten::mul(%mask, %b_mask) | ||||
|       %37 : Tensor = aten::__or__(%dims, %b_dims) | ||||
|       %38 : bool = prim::TensorToBool(%35) | ||||
|       %39 : bool = prim::Constant[value=1]() | ||||
|       %40 : int = prim::Constant[value=1]() | ||||
|       %41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2) | ||||
|       %data.2 : Tensor = aten::mul(%cond_data.2, %41) | ||||
|       %43 : int = aten::dim(%data.2) | ||||
|       %44 : bool = aten::eq(%43, %40) | ||||
|       %cond_data : Tensor, %cond_mask : Tensor = prim::If(%44) | ||||
|       %32 : Tensor = aten::gt(%data, %b_data) | ||||
|       %33 : Tensor = aten::mul(%mask, %b_mask) | ||||
|       %34 : bool = prim::Constant[value=1]() | ||||
|       %35 : int = prim::Constant[value=1]() | ||||
|       %36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2) | ||||
|       %data.2 : Tensor = aten::mul(%cond_data.2, %36) | ||||
|       %38 : int = aten::dim(%data.2) | ||||
|       %39 : bool = aten::eq(%38, %35) | ||||
|       %cond_data : Tensor, %cond_mask : Tensor = prim::If(%39) | ||||
|         block0() { | ||||
|           %47 : int = aten::dim(%data) | ||||
|           %48 : int = aten::sub(%47, %40) | ||||
|           %data.4 : Tensor = prim::Loop(%48, %39, %data.2) | ||||
|             block0(%50 : int, %51 : Tensor) { | ||||
|               %52 : int = aten::dim(%51) | ||||
|               %data.3 : Tensor = aten::unsqueeze(%51, %52) | ||||
|               -> (%39, %data.3) | ||||
|           %42 : int = aten::dim(%data) | ||||
|           %43 : int = aten::sub(%42, %35) | ||||
|           %data.4 : Tensor = prim::Loop(%43, %34, %data.2) | ||||
|             block0(%45 : int, %46 : Tensor) { | ||||
|               %47 : int = aten::dim(%46) | ||||
|               %data.3 : Tensor = aten::unsqueeze(%46, %47) | ||||
|               -> (%34, %data.3) | ||||
|             } | ||||
|           %cond_data.1 : Tensor = aten::expand_as(%data.4, %data) | ||||
|           %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask) | ||||
| @@ -52,12 +49,13 @@ graph(%a.1_data : Tensor | ||||
|       %res_data : Tensor = aten::where(%cond_data, %data, %6_data) | ||||
|       %res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask) | ||||
|       %res_dims : Tensor = aten::__or__(%dims, %6_dims) | ||||
|       %59 : int = prim::Constant[value=0]() | ||||
|       %60 : Tensor = aten::mul(%35, %36) | ||||
|       %61 : Tensor = aten::sum(%60) | ||||
|       %62 : Tensor = aten::gt(%61, %59) | ||||
|       %63 : bool = prim::TensorToBool(%62) | ||||
|       -> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims) | ||||
|       %54 : int = prim::Constant[value=0]() | ||||
|       %55 : Tensor = aten::mul(%32, %33) | ||||
|       %56 : Tensor = aten::sum(%55) | ||||
|       %57 : Tensor = aten::gt(%56, %54) | ||||
|       %58 : bool = prim::Bool(%57) | ||||
|       -> (%58, %32, %33, %res_data, %res_mask, %res_dims) | ||||
|     } | ||||
|   return (%a, %21, %22); | ||||
|   %59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20) | ||||
|   return (%59); | ||||
| } | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff