Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-04 16:04:58 +08:00)

Compare commits: ciflow/tru... v1.0.1 (101 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| bb15580e88 | |||
| 743fdbdb19 | |||
| cdb9fd44dc | |||
| 83221655a8 | |||
| 48fcfdaccb | |||
| 18eef1d8d9 | |||
| 770462a5ff | |||
| 0f87ff6e38 | |||
| eb531da9a8 | |||
| 37c8a33b54 | |||
| 0e9bdcab80 | |||
| 1347a184ca | |||
| 1cb565fa34 | |||
| dab52a4a16 | |||
| 0a440da88b | |||
| cf11411d42 | |||
| fd8212cebd | |||
| ce37ec38f3 | |||
| 047231e1e1 | |||
| f748654e0e | |||
| 8fdcdc3c3f | |||
| 40fa56a5d1 | |||
| 2f9642010e | |||
| 3c10845036 | |||
| d85372f330 | |||
| 5fc2c8b115 | |||
| fc0c2252d2 | |||
| 304b932879 | |||
| e274158c72 | |||
| 8d1fc20c8b | |||
| af03dbb93b | |||
| b24edae29e | |||
| c99c8d8aa3 | |||
| eac4c5d901 | |||
| 231f1a4991 | |||
| b65b55a652 | |||
| c926cb4408 | |||
| a6f4538f31 | |||
| 7d3e2fa190 | |||
| 98bc784694 | |||
| 3c83026249 | |||
| 202909d601 | |||
| d4eea46dcd | |||
| cf0965736c | |||
| 274e755237 | |||
| c19b16cc99 | |||
| 228f73e7a9 | |||
| 1e61241227 | |||
| 9a9eae14d0 | |||
| fb92c3c7b5 | |||
| a9cf3f69ab | |||
| 6460628b25 | |||
| 74433436e8 | |||
| 57c685520f | |||
| ca1f9349dd | |||
| 6290587244 | |||
| 9c863c1952 | |||
| 84cf1660d2 | |||
| e8361887b1 | |||
| 9a7737146c | |||
| e27b4ba594 | |||
| 0384a0282b | |||
| f80dba92ae | |||
| 1b7113eaae | |||
| 7fec47f40d | |||
| d711595a07 | |||
| eef3be686e | |||
| ba4d1e8ca6 | |||
| ab1cd6241b | |||
| 1ff075b7df | |||
| b879d006f1 | |||
| 167f8e8314 | |||
| dfdf2376bb | |||
| 95fd0afed5 | |||
| 4e5b994ca7 | |||
| 5dbcbbf715 | |||
| 9067e9411d | |||
| 4c964dac7f | |||
| 7b40d9c7ff | |||
| e7767c1af3 | |||
| 982a8722cc | |||
| 3c1cbb8da8 | |||
| 5f51de77c9 | |||
| a4e2d27ddb | |||
| 4909529584 | |||
| 7b98af16ee | |||
| fe098a3605 | |||
| 3486cebd87 | |||
| a5a34fb5b1 | |||
| b2c4c55734 | |||
| b104068d24 | |||
| e0834ded98 | |||
| 30aed0237d | |||
| 033ae1598f | |||
| 8ca4fc3fd2 | |||
| 20296297ca | |||
| 72d27e3802 | |||
| 563d67087c | |||
| 7dc06810c2 | |||
| 07e4a5e069 | |||
| db5d3131d1 | 
@@ -1,14 +1,14 @@
# IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
# in this file to the new version number, and **ALSO** update the version number below:
# PyTorchDockerVersion:262
# Caffe2DockerVersion:230
# PyTorchDockerVersion:282
# Caffe2DockerVersion:238

docker_config_defaults: &docker_config_defaults
  user: jenkins
  aws_auth:
    # This IAM user only allows read-write access to ECR
    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}

# NOTE: We only perform the merge in build step and not in test step, because
# all source files will be shared from build to test

@@ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client
    sudo apt-get -qq update
    sudo apt-get -qq install openssh-client git

install_doc_push_script: &install_doc_push_script
  name: Install the doc push script
  no_output_timeout: "2m"
  command: |
    cat >/home/circleci/project/doc_push_script.sh <<EOL
    # =================== The following code **should** be executed inside Docker container ===================

    # This is where the local pytorch install in the docker image is located
    pt_checkout="/var/lib/jenkins/workspace"

    # Since we're cat-ing this file, we need to escape all $'s
    echo "doc_push_script.sh: Invoked with \$*"

    git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
    pushd pytorch.github.io

    set -ex

    # Argument 1: Where to copy the built documentation to
    # (pytorch.github.io/$install_path)
    install_path="\$1"
    if [ -z "\$install_path" ]; then
    echo "error: doc_push_script.sh: install_path (arg1) not specified"
      exit 1
    fi

    # Argument 2: What version of the docs we are building.
    version="\$2"
    if [ -z "\$version" ]; then
    echo "error: doc_push_script.sh: version (arg2) not specified"
      exit 1
    fi

    is_master_doc=false
    if [ "\$version" == "master" ]; then
      is_master_doc=true
    fi

    # Argument 3: (optional) If present, we will NOT do any pushing. Used for testing.
    dry_run=false
    if [ "\$3" != "" ]; then
      dry_run=true
    fi

    echo "install_path: \$install_path  version: \$version  dry_run: \$dry_run"

    export LC_ALL=C
    export PATH=/opt/conda/bin:$PATH

    rm -rf pytorch || true

    # Get all the documentation sources, put them in one place
    pushd "\$pt_checkout"
    git clone https://github.com/pytorch/vision
    pushd vision
    conda install -q pillow
    time python setup.py install
    popd
    pushd docs
    rm -rf source/torchvision
    cp -r ../vision/docs/source source/torchvision

    # Build the docs
    pip -q install -r requirements.txt || true
    if [ "\$is_master_doc" = true ]; then
      make html
    else
      make html-stable
    fi

    # Move them into the docs repo
    popd
    popd
    git rm -rf "\$install_path" || true
    mv "\$pt_checkout/docs/build/html" "\$install_path"

    # Add the version handler by search and replace.
    # XXX: Consider moving this to the docs Makefile or site build
    if [ "\$is_master_doc" = true ]; then
      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
    else
      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g"
    fi

    git add "\$install_path" || true
    git status
    git config user.email "soumith+bot@pytorch.org"
    git config user.name "pytorchbot"
    # If there aren't changes, don't make a commit; push is no-op
    git commit -m "auto-generating sphinx docs" || true
    git status

    if [ "\$dry_run" = false ]; then
      echo "Pushing to pytorch.github.io:site"
      git push origin site
    else
      echo "Skipping push due to dry_run"
    fi

    popd
    # =================== The above code **should** be executed inside Docker container ===================
    EOL
    chmod +x /home/circleci/project/doc_push_script.sh

setup_ci_environment: &setup_ci_environment
  name: Set Up CI Environment
  no_output_timeout: "1h"
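Reviewer note on the hunk above: the `\$` escapes exist because the heredoc delimiter (`EOL`) is unquoted, so the shell that writes `doc_push_script.sh` expands anything left unescaped at install time, while escaped dollars survive for the generated script to use later. A minimal standalone sketch of that behaviour (a throwaway example, not part of the config itself):

```bash
#!/usr/bin/env bash
# Unquoted heredoc delimiter: $HOME expands while the file is written,
# but \$1 is written out as a literal $1 for the generated script.
cat > /tmp/generated.sh <<EOL
echo "written by: $HOME"      # expanded now, at generation time
echo "first arg: \$1"         # expanded later, when generated.sh runs
EOL
chmod +x /tmp/generated.sh
/tmp/generated.sh docs/master   # prints the generating user's HOME, then "first arg: docs/master"
```

Elsewhere in this config the generated script is invoked as `./doc_push_script.sh docs/master master` on master, `./doc_push_script.sh docs/stable 1.0.1` on the v1.0.1 branch, and with a trailing `dry_run` argument for open pull requests.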
@@ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment
      echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env

      # This IAM user allows write access to S3 bucket for sccache
      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
    fi

    # This IAM user only allows read-write access to ECR
    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}
    eval $(aws ecr get-login --region us-east-1 --no-include-email)

pytorch_linux_build_defaults: &pytorch_linux_build_defaults
@@ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
      <<: *setup_ci_environment
  - run:
      name: Test
      no_output_timeout: "90m"
      no_output_timeout: "1h"
      command: |
        set -e
        export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
@@ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults

          export IN_CIRCLECI=1

          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
          brew install moreutils --without-parallel
          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
          # so we must unlink GNU `parallel` first, and relink it afterwards
          brew unlink parallel
          brew install moreutils
          brew link parallel --overwrite
          brew install cmake
          brew install expect

@@ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
          export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2

          # This IAM user allows write access to S3 bucket for sccache
          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

          export SCCACHE_BIN=${PWD}/sccache_bin
          mkdir -p ${SCCACHE_BIN}
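Reviewer note: the unlink/relink sequence is needed because the GNU `parallel` formula and `moreutils` both ship a `parallel` binary, so installing `moreutils` directly would fail on the link conflict. A small sketch of the same workaround plus a sanity check, assuming Homebrew with both formulas available:

```bash
#!/usr/bin/env bash
set -e
# GNU parallel and moreutils both install a `parallel` executable.
brew unlink parallel            # drop GNU parallel's symlinks (the formula stays installed)
brew install moreutils          # brings ts, sponge, ... plus its own `parallel`
brew link parallel --overwrite  # put GNU parallel's `parallel` back on PATH
# Sanity check: confirm which binary ended up winning.
command -v parallel
parallel --version | head -n 1
```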
@@ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
            sccache --show-stats
          fi

##############################################################################
##############################################################################
# Job specifications
##############################################################################
##############################################################################

version: 2
jobs:
  pytorch_linux_trusty_py2_7_9_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py2_7_9_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py2_7_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py2_7_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_5_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_5_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc4_8_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc4_8_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc5_4_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc5_4_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc7_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_py3_6_gcc7_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_pynightly_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_trusty_pynightly_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_py3_clang5_asan_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_py3_clang5_asan_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn6_py3_build:
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn7_py3_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn6_py3_test:
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn7_py3_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      MULTI_GPU: "1"
 | 
			
		||||
    resource_class: gpu.large
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
    <<: *pytorch_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
 | 
			
		||||
  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
@ -517,7 +631,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_cudnn7_py2_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
 | 
			
		||||
      PYTHON_VERSION: "2.7"
 | 
			
		||||
      CUDA_VERSION: "9"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
@ -525,7 +639,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_cudnn7_py2_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
 | 
			
		||||
      PYTHON_VERSION: "2.7"
 | 
			
		||||
      CUDA_VERSION: "9"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
@ -534,7 +648,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_cudnn7_py3_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "9"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
@ -542,7 +656,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_cudnn7_py3_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "9"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
@ -551,7 +665,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "9.2"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
@ -559,7 +673,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "9.2"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
@ -568,7 +682,7 @@ jobs:
 | 
			
		||||
  pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "10"
 | 
			
		||||
    <<: *pytorch_linux_build_defaults
 | 
			
		||||
@ -576,7 +690,7 @@ jobs:
 | 
			
		||||
  pytorch_short_perf_test_gpu:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-short-perf-test-gpu
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
      PYTHON_VERSION: "3.6"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
@ -597,8 +711,8 @@ jobs:
 | 
			
		||||
 | 
			
		||||
          docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env
 | 
			
		||||
          # This IAM user allows write access to S3 bucket for perf test numbers
 | 
			
		||||
          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
 | 
			
		||||
          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
 | 
			
		||||
          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
 | 
			
		||||
          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
 | 
			
		||||
          docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env
 | 
			
		||||
 | 
			
		||||
          export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
 | 
			
		||||
@ -607,7 +721,7 @@ jobs:
 | 
			
		||||
  pytorch_doc_push:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: pytorch-doc-push
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
 | 
			
		||||
    resource_class: large
 | 
			
		||||
    machine:
 | 
			
		||||
      image: default
 | 
			
		||||
@ -615,72 +729,39 @@ jobs:
 | 
			
		||||
    - run:
 | 
			
		||||
        <<: *setup_ci_environment
 | 
			
		||||
    - run:
 | 
			
		||||
        name: Doc Push
 | 
			
		||||
        <<: *install_doc_push_script
 | 
			
		||||
    - run:
 | 
			
		||||
        name: Doc Build and Push
 | 
			
		||||
        no_output_timeout: "1h"
 | 
			
		||||
        command: |
 | 
			
		||||
          set -e
 | 
			
		||||
          if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
 | 
			
		||||
            echo "Skipping doc push..."
 | 
			
		||||
            exit 0
 | 
			
		||||
          fi
 | 
			
		||||
          export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
 | 
			
		||||
          echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
 | 
			
		||||
          docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
 | 
			
		||||
          export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
 | 
			
		||||
 | 
			
		||||
          cat >/home/circleci/project/doc_push_script.sh <<EOL
 | 
			
		||||
          # =================== The following code will be executed inside Docker container ===================
 | 
			
		||||
          git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
 | 
			
		||||
          pushd pytorch.github.io
 | 
			
		||||
 | 
			
		||||
          set -ex
 | 
			
		||||
 | 
			
		||||
          export LC_ALL=C
 | 
			
		||||
          export PATH=/opt/conda/bin:$PATH
 | 
			
		||||
 | 
			
		||||
          rm -rf pytorch || true
 | 
			
		||||
 | 
			
		||||
          # Get all the documentation sources, put them in one place
 | 
			
		||||
          # TODO: These clones can race
 | 
			
		||||
          git clone https://github.com/pytorch/pytorch
 | 
			
		||||
          pushd pytorch
 | 
			
		||||
          git clone https://github.com/pytorch/vision
 | 
			
		||||
          pushd vision
 | 
			
		||||
          conda install -q pillow
 | 
			
		||||
          time python setup.py install
 | 
			
		||||
          popd
 | 
			
		||||
          pushd docs
 | 
			
		||||
          rm -rf source/torchvision
 | 
			
		||||
          cp -r ../vision/docs/source source/torchvision
 | 
			
		||||
 | 
			
		||||
          # Build the docs
 | 
			
		||||
          pip -q install -r requirements.txt || true
 | 
			
		||||
          make html
 | 
			
		||||
 | 
			
		||||
          # Move them into the docs repo
 | 
			
		||||
          popd
 | 
			
		||||
          popd
 | 
			
		||||
          git rm -rf docs/master || true
 | 
			
		||||
          mv pytorch/docs/build/html docs/master
 | 
			
		||||
          find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g'
 | 
			
		||||
          git add docs/master || true
 | 
			
		||||
          git status
 | 
			
		||||
          git config user.email "soumith+bot@pytorch.org"
 | 
			
		||||
          git config user.name "pytorchbot"
 | 
			
		||||
          # If there aren't changes, don't make a commit; push is no-op
 | 
			
		||||
          git commit -m "auto-generating sphinx docs" || true
 | 
			
		||||
          git status
 | 
			
		||||
          git push origin site
 | 
			
		||||
 | 
			
		||||
          popd
 | 
			
		||||
          # =================== The above code will be executed inside Docker container ===================
 | 
			
		||||
          EOL
 | 
			
		||||
          chmod +x /home/circleci/project/doc_push_script.sh
 | 
			
		||||
          docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh
 | 
			
		||||
 | 
			
		||||
          export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
 | 
			
		||||
          # master branch docs push
 | 
			
		||||
          if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
 | 
			
		||||
            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
 | 
			
		||||
 | 
			
		||||
          # stable release docs push. We keep an eternal PR open for merging
 | 
			
		||||
          # v1.0.1 -> master; everytime v1.0.1 is updated the following is run.
 | 
			
		||||
          elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then
 | 
			
		||||
            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1'
 | 
			
		||||
 | 
			
		||||
          # For open PRs: Do a dry_run of the docs build, don't push build
 | 
			
		||||
          else
 | 
			
		||||
            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1'
 | 
			
		||||
          fi
 | 
			
		||||
          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
 | 
			
		||||
 | 
			
		||||
          # Save the docs build so we can debug any problems
 | 
			
		||||
          export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug
 | 
			
		||||
          docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE}
 | 
			
		||||
          docker push ${DEBUG_COMMIT_DOCKER_IMAGE}
 | 
			
		||||
 | 
			
		||||
  pytorch_macos_10_13_py3_build:
 | 
			
		||||
    macos:
 | 
			
		||||
      xcode: "9.0"
 | 
			
		||||
@ -696,8 +777,11 @@ jobs:
 | 
			
		||||
            set -e
 | 
			
		||||
 | 
			
		||||
            export IN_CIRCLECI=1
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
 | 
			
		||||
            brew install moreutils --without-parallel
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
 | 
			
		||||
            # so we must unlink GNU `parallel` first, and relink it afterwards
 | 
			
		||||
            brew unlink parallel
 | 
			
		||||
            brew install moreutils
 | 
			
		||||
            brew link parallel --overwrite
 | 
			
		||||
            brew install expect
 | 
			
		||||
 | 
			
		||||
            # Install sccache
 | 
			
		||||
@ -706,8 +790,8 @@ jobs:
 | 
			
		||||
 | 
			
		||||
            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
 | 
			
		||||
            # This IAM user allows write access to S3 bucket for sccache
 | 
			
		||||
            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
 | 
			
		||||
            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
 | 
			
		||||
            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
 | 
			
		||||
            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
 | 
			
		||||
 | 
			
		||||
            git submodule sync && git submodule update -q --init
 | 
			
		||||
            chmod a+x .jenkins/pytorch/macos-build.sh
 | 
			
		||||
@ -740,8 +824,11 @@ jobs:
 | 
			
		||||
          command: |
 | 
			
		||||
            set -e
 | 
			
		||||
            export IN_CIRCLECI=1
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
 | 
			
		||||
            brew install moreutils --without-parallel
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
 | 
			
		||||
            # so we must unlink GNU `parallel` first, and relink it afterwards
 | 
			
		||||
            brew unlink parallel
 | 
			
		||||
            brew install moreutils
 | 
			
		||||
            brew link parallel --overwrite
 | 
			
		||||
            brew install expect
 | 
			
		||||
 | 
			
		||||
            cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project
 | 
			
		||||
@ -765,8 +852,11 @@ jobs:
 | 
			
		||||
 | 
			
		||||
            export IN_CIRCLECI=1
 | 
			
		||||
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
 | 
			
		||||
            brew install moreutils --without-parallel
 | 
			
		||||
            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
 | 
			
		||||
            # so we must unlink GNU `parallel` first, and relink it afterwards
 | 
			
		||||
            brew unlink parallel
 | 
			
		||||
            brew install moreutils
 | 
			
		||||
            brew link parallel --overwrite
 | 
			
		||||
            brew install expect
 | 
			
		||||
 | 
			
		||||
            # Install CUDA 9.2
 | 
			
		||||
@ -790,30 +880,13 @@ jobs:
 | 
			
		||||
            sudo chmod +x /usr/local/bin/sccache
 | 
			
		||||
            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
 | 
			
		||||
            # This IAM user allows write access to S3 bucket for sccache
 | 
			
		||||
            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
 | 
			
		||||
            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
 | 
			
		||||
            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
 | 
			
		||||
            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
 | 
			
		||||
 | 
			
		||||
            git submodule sync && git submodule update -q --init
 | 
			
		||||
            chmod a+x .jenkins/pytorch/macos-build.sh
 | 
			
		||||
            unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
 | 
			
		||||
 | 
			
		||||
  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
 | 
			
		||||
    <<: *caffe2_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
    <<: *caffe2_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
 | 
			
		||||
@ -896,11 +969,20 @@ jobs:
 | 
			
		||||
  caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
 | 
			
		||||
      BUILD_ONLY: "1"
 | 
			
		||||
    <<: *caffe2_linux_build_defaults
 | 
			
		||||
 | 
			
		||||
  caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test
 | 
			
		||||
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
 | 
			
		||||
      CUDA_VERSION: "8"
 | 
			
		||||
      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
 | 
			
		||||
    resource_class: gpu.medium
 | 
			
		||||
    <<: *caffe2_linux_test_defaults
 | 
			
		||||
 | 
			
		||||
  caffe2_py2_gcc4_9_ubuntu14_04_build:
 | 
			
		||||
    environment:
 | 
			
		||||
      JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
 | 
			
		||||
@ -1008,25 +1090,25 @@ workflows:
 | 
			
		||||
      - pytorch_linux_xenial_py3_clang5_asan_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_py3_clang5_asan_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn6_py3_test:
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn7_py3_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_short_perf_test_gpu:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_doc_push:
 | 
			
		||||
          requires:
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
 | 
			
		||||
            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda9_cudnn7_py2_build
 | 
			
		||||
      - pytorch_linux_xenial_cuda9_cudnn7_py2_test:
 | 
			
		||||
          requires:
 | 
			
		||||
@ -1047,10 +1129,6 @@ workflows:
 | 
			
		||||
            - pytorch_macos_10_13_py3_build
 | 
			
		||||
      - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build
 | 
			
		||||
 | 
			
		||||
      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
 | 
			
		||||
          requires:
 | 
			
		||||
@ -1072,6 +1150,9 @@ workflows:
 | 
			
		||||
          requires:
 | 
			
		||||
            - caffe2_onnx_py2_gcc5_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
 | 
			
		||||
          requires:
 | 
			
		||||
            - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_clang3_8_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_clang3_9_ubuntu16_04_build
 | 
			
		||||
      - caffe2_py2_clang7_ubuntu16_04_build
 | 
			
		||||
 | 
			
		||||
@@ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")

if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
  CMAKE_ARGS+=("-DBLAS=MKL")
  CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
fi
if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
  CMAKE_ARGS+=("-DUSE_CUDA=ON")

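Reviewer note: this build script collects CMake options in a bash array and only appends flags such as `-DUSE_MKLDNN=ON` when the job name matches. A minimal sketch of that pattern, with a hypothetical job name and assuming you run it from a CMake build directory (not the real caffe2 build script):

```bash
#!/usr/bin/env bash
set -e
BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT:-py2-mkl-ubuntu16.04}   # hypothetical job name
CMAKE_ARGS=("-DCMAKE_BUILD_TYPE=Release")                     # start from defaults
if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then                    # feature flags keyed off the job name
  CMAKE_ARGS+=("-DBLAS=MKL" "-DUSE_MKLDNN=ON")
fi
# Quoted array expansion keeps every -D flag a single argument,
# even if a value ever contains spaces.
cmake .. "${CMAKE_ARGS[@]}"
```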
@@ -14,18 +14,8 @@ clang --version
# symbolize=1: Gives us much better errors when things go wrong
export ASAN_OPTIONS=detect_leaks=0:symbolize=1

# FIXME: Remove the hardcoded "-pthread" option.
# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
# have the full pthread implementation. Other advanced pthread functions doesn't
# exist in libasan.so[2]. If we need some pthread advanced functions, we still
# need to link the pthread library.
# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
#
# TODO: Make the ASAN flags a more unified env var
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
  CXX_FLAGS="-pthread" \
  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
  NO_CUDA=1 USE_MKLDNN=0 \
  python setup.py install

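Reviewer note: for readers unfamiliar with these flags, here is a tiny out-of-tree reproduction (a throwaway C file, nothing to do with the PyTorch build) that uses the same sanitizer options and the same `ASAN_OPTIONS` the script exports, assuming `clang` is installed:

```bash
#!/usr/bin/env bash
set -e
cat > /tmp/oob.c <<'EOF'
int main(void) {
  int a[4];
  return a[4];   /* out-of-bounds read: the sanitizers abort the program */
}
EOF
# Same family of flags as the ASAN build above: enable ASan/UBSan and abort on any report.
clang -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -g /tmp/oob.c -o /tmp/oob
# Same ASAN_OPTIONS the CI script exports: skip leak checking, symbolize stack traces.
ASAN_OPTIONS=detect_leaks=0:symbolize=1 /tmp/oob || echo "sanitizer caught the bug (non-zero exit)"
```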
@@ -129,7 +129,7 @@ fi
git add -f build/bin

# Test documentation build
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
  pushd docs
  # TODO: Don't run this here
  pip install -q -r requirements.txt || true
@@ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
fi

# Test standalone c10 build
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
  mkdir -p c10/build
  pushd c10/build
  cmake ..

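Reviewer note: these `[[ ... == *pattern* ]]` tests are plain bash glob matches against the job name, which is why the pattern has to be renamed together with the cudnn6 to cudnn7 job rename. A tiny sketch:

```bash
#!/usr/bin/env bash
BUILD_ENVIRONMENT="pytorch-linux-xenial-cuda8-cudnn7-py3-test"
# With an unquoted right-hand side, `*` acts as a glob, so this is a substring test.
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
  echo "this job also builds the docs and the standalone c10 project"
fi
```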
@@ -122,7 +122,7 @@ fi
# Use conda cmake in some CI build. Conda cmake will be newer than our supported
# min version 3.5, so we only do it in two builds that we know should use conda.
if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then
  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \
  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \
     [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then
    if ! which conda; then
      echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"

@@ -5,9 +5,9 @@
# in this file will report a failure (so you don't forget to
# reenable the tests on merge ;)

pytorch-linux-xenial-cuda8-cudnn6-py3-build
pytorch-linux-xenial-cuda8-cudnn6-py3-test
pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
pytorch-linux-xenial-cuda8-cudnn7-py3-build
pytorch-linux-xenial-cuda8-cudnn7-py3-test
pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
pytorch-linux-xenial-cuda9-cudnn7-py2-build
pytorch-linux-xenial-cuda9-cudnn7-py2-test
pytorch-linux-xenial-cuda9-cudnn7-py3-build

@@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
    sccache --show-stats
    sccache --zero-stats
    rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
    for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
      IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
        del /S /Q %%i
      )
    )
    copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
  )

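Reviewer note: the batch loop above deletes every generated `.py` under `caffe2\proto` except `__init__.py`. For readers more used to POSIX shells, an equivalent sketch of the same cleanup (assuming GNU findutils; not part of the Windows script):

```bash
#!/usr/bin/env bash
set -e
# Remove generated protobuf Python files but keep the package marker,
# mirroring the `for /f ... del /S /Q` loop in the .bat script above.
find caffe2/proto -name '*.py' ! -name '__init__.py' -print -delete
```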
@@ -34,10 +34,4 @@ matrix:
        script: cd docs/cpp/source && ./check-doxygen.sh
      - env: CLANG_TIDY
        python: "3.6"
        addons:
          apt:
            sources:
              - ubuntu-toolchain-r-test
              - llvm-toolchain-trusty
            packages: clang-tidy
        script: tools/run-clang-tidy-in-ci.sh

@@ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
	--generate-code arch=compute_50,code=sm_50
	--generate-code arch=compute_60,code=sm_60
	--generate-code arch=compute_70,code=sm_70)
    elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
      SET(CUFFT_FAKELINK_OPTIONS
	--generate-code arch=compute_35,code=sm_35
	--generate-code arch=compute_50,code=sm_50
	--generate-code arch=compute_60,code=sm_60
	--generate-code arch=compute_70,code=sm_70)
    else()
      MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
    endif()

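Reviewer note: the CMake branch above dispatches on `CUDA_VERSION_MAJOR` and fails hard on anything it does not know. A bash sketch of the same idea, assuming `nvcc` is on `PATH` and prints the usual `release X.Y` line in its version banner:

```bash
#!/usr/bin/env bash
command -v nvcc >/dev/null || { echo "nvcc not found" >&2; exit 1; }
# nvcc --version ends with a line like "Cuda compilation tools, release 10.0, V10.0.130".
CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9.]*\),.*$/\1/p')
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
echo "CUDA ${CUDA_VERSION} (major ${CUDA_VERSION_MAJOR})"
case "$CUDA_VERSION_MAJOR" in
  8|9|10) echo "known major version, would pick the matching arch list" ;;
  *)      echo "Unhandled major cuda version ${CUDA_VERSION_MAJOR}" >&2; exit 1 ;;
esac
```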
@@ -2122,55 +2122,6 @@
    - arg: THTensor* self
    - arg: THTensor* tensor
]]
[[
  name: _th_tril
  cname: tril
  variants:
    - function
  return: argument 0
  arguments:
    - arg: THTensor* result
      output: True
    - THTensor* self
    - arg: long diagonal
      default: 0
]]
[[
  name: _th_tril_
  cname: tril
  variants: function
  return: self
  arguments:
    - THTensor* self
    - THTensor* self
    - arg: long diagonal
      default: 0
]]
[[
  name: _th_triu
  cname: triu
  variants:
    - function
  return: argument 0
  arguments:
    - arg: THTensor* result
      output: True
    - THTensor* self
    - arg: long diagonal
      default: 0
]]
[[
  name: _th_triu_
  cname: triu
  variants:
    - function
  return: self
  arguments:
    - THTensor* self
    - THTensor* self
    - arg: long diagonal
      default: 0
]]
[[
  name: _th_cross
  cname: cross

@@ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) {
    reduce_dims.push_back(i);
  }
  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
    if (shape[i - leading_dims] == 1 && sizes[i] > 1) {
    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
      reduce_dims.push_back(i);
    }
  }

@ -81,6 +81,39 @@ inline void parallel_for(
#endif
}

/*
parallel_reduce

begin: index at which to start applying reduction

end: index at which to stop applying reduction

grain_size: number of elements per chunk. impacts number of elements in
intermediate results tensor and degree of parallelization.

ident: identity for binary combination function sf. sf(ident, x) needs to return
x.

f: function for reduction over a chunk. f needs to be of signature scalar_t
f(int64_t partial_begin, int64_t partial_end, scalar_t identity)

sf: function to combine two partial results. sf needs to be of signature
scalar_t sf(scalar_t x, scalar_t y)

For example, you might have a tensor of 10000 entries and want to sum together
all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
an intermediate result tensor with 4 elements. Then it will execute the function
"f" you provide and pass the beginning and end index of these chunks, so
0-2499, 2500-4999, etc. and the combination identity. It will then write out
the result from each of these chunks into the intermediate result tensor. After
that it'll reduce the partial results from each chunk into a single number using
the combination function sf and the identity ident. For a total summation this
would be "+" and 0 respectively. This is similar to tbb's approach [1], where
you need to provide a function to accumulate a subrange, a function to combine
two partial results and an identity.

[1] https://software.intel.com/en-us/node/506154
*/
template <class scalar_t, class F, class SF>
inline scalar_t parallel_reduce(
    const int64_t begin,

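A minimal call-site sketch of the parallel_reduce interface documented above (not part of the diff; the buffer, sizes, and includes are illustrative assumptions only):

    // Sum 10000 floats in chunks of 2500; partial sums are combined with "+"
    // and identity 0, matching the worked example in the comment.
    std::vector<float> data(10000, 1.0f);
    float total = at::parallel_reduce(
        /*begin=*/0, /*end=*/10000, /*grain_size=*/2500, /*ident=*/0.0f,
        [&](int64_t chunk_begin, int64_t chunk_end, float ident) {
          float partial = ident;
          for (int64_t i = chunk_begin; i < chunk_end; ++i) {
            partial += data[i];
          }
          return partial;        // one partial result per chunk
        },
        std::plus<float>());     // sf: combine partial results
    // total == 10000.0f
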
@ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) {

void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) {
  AT_CHECK(
    t.type().backend() == backend,
    !t.defined() || t.type().backend() == backend,
    "Expected tensor to have ", toString(backend),
    " Backend, but got tensor with ", toString(t.type().backend()), " Backend ",
    "(while checking arguments for ", c, ")");

@ -52,14 +52,11 @@ namespace c10 {
  _(prim, TupleSlice)              \
  _(prim, ListConstruct)           \
  _(prim, ListUnpack)              \
  _(prim, BoolToTensor)            \
  _(prim, NumToTensor)             \
  _(prim, TensorToNum)             \
  _(prim, ImplicitTensorToNum)     \
  _(prim, TensorToBool)            \
  _(prim, IntToFloat)              \
  _(prim, FloatToInt)              \
  _(prim, StringToFloat)           \
  _(prim, Bool)                    \
  _(prim, Int)                     \
  _(prim, Float)                   \
  _(prim, device)                  \
  _(prim, dtype)                   \
  _(prim, shape)                   \
@ -139,7 +136,8 @@ namespace c10 {
  _(attr, name)                    \
  _(attr, a)                       \
  _(attr, b)                       \
  _(attr, beg)
  _(attr, beg)                     \
  _(attr, idx)
#else
#define FORALL_NS_SYMBOLS(_) \
  _(namespaces, prim)              \

@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
    ss << "Future[" << getElementType()->python_str() << "]";
    return ss.str();
  }
  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
    return create(contained_types.at(0));
  }
private:
  FutureType(TypePtr elem) : SingleElementType(elem) {}
};
@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
}

inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
  AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
  if (typ->isSubtypeOf(IntType::get())) {
    return CompleteTensorType::create(at::kLong, at::kCPU, {});
  } else if (typ->isSubtypeOf(FloatType::get())) {
@ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }

CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);

using TypeEnv = std::unordered_map<std::string, TypePtr>;
struct MatchTypeReturn {

@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
  return value;
}

TypePtr inferTypeFrom(const IValue& value) {
// why incomplete? You cannot completely recover a type from
// an IValue, List[List[int]] and List[List[Tensor]] will both
// become ivalue.isGenericList() and cannot be recovered.
// The only appropriate place to use this is where you know that
// you are only dealing with a subset of objects where you can recover
// the type, like in the tracer.
TypePtr incompleteInferTypeFrom(const IValue& value) {
  if (value.isTensor()) {
    return CompleteTensorType::create(value.toTensor());
  } else if (value.isDouble()) {
@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
  } else if (value.isDoubleList()) {
    return ListType::ofFloats();
  } else if (value.isTuple()) {
    return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
    return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
  } else if (value.isDevice()) {
    return DeviceObjType::get();
  }
  AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
  AT_ERROR("Type cannot be accurately recovered from this IValue.");
}

c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {

@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
    vec256::Vec256<scalar_t> acc_vec,
    int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  scalar_t acc_arr[Vec::size];
  scalar_t acc_arr[Vec::size()];
  acc_vec.store(acc_arr);
  for (int64_t i = 1; i < size; i++) {
    scalar_t acc_arr_next[Vec::size];
    scalar_t acc_arr_next[Vec::size()];
    acc_arr_next[0] = acc_arr[i];
    Vec acc_vec_next = Vec::loadu(acc_arr_next);
    acc_vec = vec_fun(acc_vec, acc_vec_next);
@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
template <typename scalar_t, typename Op>
inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  if (size < Vec::size)
  if (size < Vec::size())
    return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
  int64_t d = Vec::size;
  int64_t d = Vec::size();
  Vec acc_vec = Vec::loadu(data);
  for (; d < size - (size % Vec::size); d += Vec::size) {
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    Vec data_vec = Vec::loadu(data + d);
    acc_vec = vec_fun(acc_vec, data_vec);
  }
@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
    Vec data_vec = Vec::loadu(data + d, size - d);
    acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
  }
  return vec_reduce_all(vec_fun, acc_vec, Vec::size);
  return vec_reduce_all(vec_fun, acc_vec, Vec::size());
}

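A usage sketch of the reduce_all helper shown above (illustrative only, not from the diff; assumes the functional vec256 header and the usual standard includes are available). The lambda operates on whole Vec256 lanes and the scalar tail is handled by the helper itself:

    // Reduce a float buffer to its maximum with the vectorized helper above.
    float buf[100];
    std::fill_n(buf, 100, 1.0f);
    buf[42] = 3.0f;
    float result = at::vec256::reduce_all<float>(
        [](at::vec256::Vec256<float> x, at::vec256::Vec256<float> y) {
          return at::vec256::maximum(x, y);   // combine two lanes element-wise
        },
        buf, 100);
    // result == 3.0f
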
template <typename scalar_t, typename MapOp, typename ReduceOp>
@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
    scalar_t* data,
    int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  if (size < Vec::size)
  if (size < Vec::size())
    return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
  int64_t d = Vec::size;
  int64_t d = Vec::size();
  Vec acc_vec = map_fun(Vec::loadu(data));
  for (; d < size - (size % Vec::size); d += Vec::size) {
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    Vec data_vec = Vec::loadu(data + d);
    data_vec = map_fun(data_vec);
    acc_vec = red_fun(acc_vec, data_vec);
@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
    data_vec = map_fun(data_vec);
    acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
  }
  return vec_reduce_all(red_fun, acc_vec, Vec::size);
  return vec_reduce_all(red_fun, acc_vec, Vec::size());
}

template <typename scalar_t, typename MapOp, typename ReduceOp>
@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
    const scalar_t* data2,
    int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  if (size < Vec::size) {
  if (size < Vec::size()) {
    Vec data_vec = Vec::loadu(data, size);
    Vec data2_vec = Vec::loadu(data2, size);
    data_vec = map_fun(data_vec, data2_vec);
    return vec_reduce_all(red_fun, data_vec, size);
  }
  int64_t d = Vec::size;
  int64_t d = Vec::size();
  Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
  for (; d < size - (size % Vec::size); d += Vec::size) {
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    Vec data_vec = Vec::loadu(data + d);
    Vec data2_vec = Vec::loadu(data2 + d);
    data_vec = map_fun(data_vec, data2_vec);
@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
    data_vec = map_fun(data_vec, data2_vec);
    acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
  }
  return vec_reduce_all(red_fun, acc_vec, Vec::size);
  return vec_reduce_all(red_fun, acc_vec, Vec::size());
}

template <typename scalar_t, typename Op>
@ -103,7 +103,7 @@ inline void map(
    int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  int64_t d = 0;
  for (; d < size - (size % Vec::size); d += Vec::size) {
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    Vec output_vec = vec_fun(Vec::loadu(input_data + d));
    output_vec.store(output_data + d);
  }
@ -122,7 +122,7 @@ inline void map2(
    int64_t size) {
  using Vec = vec256::Vec256<scalar_t>;
  int64_t d = 0;
  for (; d < size - (size % Vec::size); d += Vec::size) {
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    Vec data_vec = Vec::loadu(input_data + d);
    Vec data_vec2 = Vec::loadu(input_data2 + d);
    Vec output_vec = vec_fun(data_vec, data_vec2);

@ -15,14 +15,24 @@

namespace at {
namespace vec256 {

// Note [Acceptable use of anonymous namespace in header]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Yes you saw right, this is an anonymous namespace in a header.  This header,
// and all of its subheaders, REQUIRE their code to be entirely inlined into
// the compilation unit that uses them.  It's important that these functions have
// internal linkage so that kernels for different architectures don't get
// combined during linking. It's sufficient to label functions "static", but
// class methods must be an unnamed namespace to have internal linkage (since
// static means something different in the context of classes).
namespace {

template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
  T buf[Vec256<T>::size];
  T buf[Vec256<T>::size()];
  vec.store(buf);
  stream << "vec[";
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    if (i != 0) {
      stream << ", ";
    }

@ -20,6 +20,7 @@

namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

template<size_t n> struct int_of_size;
@ -45,15 +46,49 @@ struct Vec256 {
private:
  T values[32 / sizeof(T)] = {0};
public:
  static constexpr int size = 32 / sizeof(T);
  // Note [constexpr static function to avoid odr-usage compiler bug]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Why, you might ask, is size defined to be a static constexpr function,
  // rather than a more ordinary 'static constexpr int size;' variable?
  // The problem lies within ODR rules for static constexpr members versus
  // static constexpr functions.  First, recall that this class (along with all
  // of its derivations) live in an anonymous namespace: they are intended to be
  // *completely* inlined at their use-sites, because we need to compile it
  // multiple times for different instruction sets.
  //
  // Because of this constraint, we CANNOT provide a single definition for
  // any static members in this class; since we want to compile the class
  // multiple times, there wouldn't actually be any good place to put the
  // definition.  Now here is the problem: if we ODR-use a static constexpr
  // member, we are *obligated* to provide a definition.  Without the
  // definition, you get a compile error like:
  //
  //    relocation R_X86_64_PC32 against undefined symbol
  //    `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
  //    a shared object; recompile with -fPIC
  //
  // If this were C++17, we could replace a static constexpr variable with
  // an inline variable which doesn't require one definition. But we are not
  // C++17.  So the next best thing is to replace the member with a static
  // constexpr (and therefore inline) function, which does not require ODR
  // either.
  //
  // Also, technically according to the C++ standard, we don't have to define
  // a constexpr variable if we never odr-use it.  But it seems that some
  // versions of GCC/Clang have buggy determinations on whether or not an
  // identifier is odr-used or not, and in any case it's hard to tell if
  // a variable is odr-used or not.  So best to just cut the problem at the root.
  static constexpr int size() {
    return 32 / sizeof(T);
  }
  Vec256() {}
  Vec256(T val) {
    for (int i = 0; i != size; i++) {
    for (int i = 0; i != size(); i++) {
      values[i] = val;
    }
  }
  template<typename... Args,
           typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
           typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
  Vec256(Args... vals) {
    values = { vals... };
  }
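A condensed sketch of the ODR distinction the note above describes (illustrative only, not taken from the diff):

    struct A { static constexpr int size = 8; };              // data member
    struct B { static constexpr int size() { return 8; } };   // function

    int take_ref(const int& x) { return x; }

    int fa() { return take_ref(A::size); }    // odr-uses A::size: pre-C++17 this
                                              // requires "constexpr int A::size;"
                                              // defined in exactly one place
    int fb() { return take_ref(B::size()); }  // only the returned value is used;
                                              // nothing needs an out-of-line definition
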
@ -61,7 +96,7 @@ public:
  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
    int64_t mask = mask_;
    Vec256 vec;
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      if (mask & 0x01) {
        vec[i] = b[i];
      } else {
@ -74,9 +109,9 @@ public:
  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
                          const Vec256<T>& mask) {
    Vec256 vec;
    int_same_size_t<T> buffer[size];
    int_same_size_t<T> buffer[size()];
    mask.store(buffer);
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      if (buffer[i] & 0x01)
       {
        vec[i] = b[i];
@ -88,14 +123,14 @@ public:
  }
  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
    Vec256 vec;
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      vec.values[i] = base + i * step;
    }
    return vec;
  }
  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
    Vec256 vec;
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      if (i < count) {
        vec[i] = b[i];
      } else {
@ -114,7 +149,7 @@ public:
    std::memcpy(vec.values, ptr, count * sizeof(T));
    return vec;
  }
  void store(void* ptr, int count = size) const {
  void store(void* ptr, int count = size()) const {
    std::memcpy(ptr, values, count * sizeof(T));
  }
  const T& operator[](int idx) const {
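A small sketch of the partial-count loadu/store API shown above (illustrative only; lanes beyond count should not be relied upon):

    float src[3] = {1.0f, -2.0f, 3.0f};
    float dst[3];
    // Load only 3 of the Vec256<float>::size() lanes, double them, and
    // write exactly 3 values back out.
    auto v = at::vec256::Vec256<float>::loadu(src, 3);
    auto doubled = v * at::vec256::Vec256<float>(2.0f);
    doubled.store(dst, 3);  // dst == {2.0f, -4.0f, 6.0f}
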
@ -125,14 +160,14 @@ public:
  }
  Vec256<T> map(T (*f)(T)) const {
    Vec256<T> ret;
    for (int64_t i = 0; i != size; i++) {
    for (int64_t i = 0; i != size(); i++) {
      ret[i] = f(values[i]);
    }
    return ret;
  }
  Vec256<T> abs() const {
    Vec256<T> ret;
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      ret[i] = values[i] < 0 ? -values[i] : values[i];
    }
    return ret;
@ -214,7 +249,7 @@ public:
  }
  Vec256<T> pow(const Vec256<T> &exp) const {
    Vec256<T> ret;
    for (int64_t i = 0; i < size; i++) {
    for (int64_t i = 0; i < size(); i++) {
      ret[i] = std::pow(values[i], exp[i]);
    }
    return ret;
@ -222,7 +257,7 @@ public:
#define DEFINE_COMP(binary_pred)                                              \
  Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \
    Vec256<T> vec;                                                            \
    for (int64_t i = 0; i != size; i++) {                                     \
    for (int64_t i = 0; i != size(); i++) {                                     \
      if (values[i] binary_pred other.values[i]) {                            \
        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \
      } else {                                                                \
@ -242,7 +277,7 @@ public:

template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] + b[i];
  }
  return c;
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T

template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] - b[i];
  }
  return c;
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T

template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] * b[i];
  }
  return c;
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T

template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] / b[i];
  }
  return c;
@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
// either input is a NaN.
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = (a[i] > b[i]) ? a[i] : b[i];
    if (std::is_floating_point<T>::value && std::isnan(a[i])) {
      // If either input is NaN, propagate a NaN.
@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
// either input is a NaN.
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = (a[i] < b[i]) ? a[i] : b[i];
    if (std::is_floating_point<T>::value && std::isnan(a[i])) {
      // If either input is NaN, propagate a NaN.
@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
template <class T>                                                          \
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) {      \
  using iT = int_same_size_t<T>;                                            \
  iT buffer[Vec256<T>::size];                                               \
  for (int64_t i = 0; i != Vec256<T>::size; i++) {                          \
  iT buffer[Vec256<T>::size()];                                               \
  for (int64_t i = 0; i != Vec256<T>::size(); i++) {                          \
    auto a_val = a[i];                                                      \
    auto b_val = b[i];                                                      \
    iT *i_a_ptr = reinterpret_cast<iT*>(&a_val);                            \
@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
  static constexpr int size = Vec256<T>::size;
  static constexpr int size = Vec256<T>::size();
  int_same_size_t<T> index_arr[size];
  vindex.store(static_cast<void*>(index_arr));
  T buffer[size];
@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline mask_gather(const Vec256<T>& src, T const* base_addr,
                   const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
  static constexpr int size = Vec256<T>::size;
  static constexpr int size = Vec256<T>::size();
  T src_arr[size];
  int_same_size_t<T> mask_arr[size];  // use int type so we can logical and
  int_same_size_t<T> index_arr[size];
@ -392,7 +427,7 @@ namespace {
  template<typename dst_t, typename src_t>
  struct CastImpl {
    static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
      src_t src_arr[Vec256<src_t>::size];
      src_t src_arr[Vec256<src_t>::size()];
      src.store(static_cast<void*>(src_arr));
      return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
    }
@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {

template <typename T>
inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
  static constexpr int size = Vec256<T>::size;
  static constexpr int size = Vec256<T>::size();
  T src_arr[size];
  src.store(static_cast<void*>(src_arr));
  int_same_size_t<T> buffer[size];
@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
//       returns:            Vec256<float>   = {a0, a1, a2, a3, a4, a5, a6, a7}
//                           Vec256<float>   = {b0, b1, b2, b3, b4, b5, b6, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
  static constexpr int size = Vec256<T>::size;
  static constexpr int size = Vec256<T>::size();
  static constexpr int half_size = size / 2;
  T a_arr[size];
  T b_arr[size];
@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
//       returns:            Vec256<float>   = {a0, b0, a1, b1, a2, b2, a3, b3}
//                           Vec256<float>   = {a4, b4, a5, b5, a6, b6, a7, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
interleave2(const Vec256<T>& a, const Vec256<T>& b) {
  static constexpr int size = Vec256<T>::size;
  static constexpr int size = Vec256<T>::size();
  static constexpr int half_size = size / 2;
  T a_arr[size];
  T b_arr[size];
@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {

template <typename src_T, typename dst_T>
void convert(const src_T *src, dst_T *dst, int64_t n) {
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
  for (int64_t i = 0; i < n; i++) {
    *dst = static_cast<dst_T>(
        static_cast<at::native::inter_copy_type_t<dst_T>>(*src));

@ -8,6 +8,7 @@

namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

#if defined(__AVX__) && !defined(_MSC_VER)
@ -16,7 +17,9 @@ template <> class Vec256<double> {
private:
  __m256d values;
public:
  static constexpr int size = 4;
  static constexpr int size() {
    return 4;
  }
  Vec256() {}
  Vec256(__m256d v) : values(v) {}
  Vec256(double val) {
@ -40,7 +43,7 @@ public:
    return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
  }
  static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
                            int64_t count = size) {
                            int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
@ -53,22 +56,22 @@ public:
    }
    return b;
  }
  static Vec256<double> loadu(const void* ptr, int64_t count = size) {
    if (count == size)
  static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
    if (count == size())
      return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));

    __at_align32__ double tmp_values[size];
    __at_align32__ double tmp_values[size()];
    std::memcpy(
        tmp_values,
        reinterpret_cast<const double*>(ptr),
        count * sizeof(double));
    return _mm256_load_pd(tmp_values);
  }
  void store(void* ptr, int count = size) const {
    if (count == size) {
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
    } else if (count > 0) {
      double tmp_values[size];
      double tmp_values[size()];
      _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(double));
    }
@ -252,7 +255,7 @@ template <>
void convert(const double* src, double* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
    _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
  }
#pragma unroll

@ -8,6 +8,7 @@

namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

#if defined(__AVX__) && !defined(_MSC_VER)
@ -16,7 +17,9 @@ template <> class Vec256<float> {
private:
  __m256 values;
public:
  static constexpr int size = 8;
  static constexpr int size() {
    return 8;
  }
  Vec256() {}
  Vec256(__m256 v) : values(v) {}
  Vec256(float val) {
@ -43,7 +46,7 @@ public:
      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }
  static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
                           int64_t count = size) {
                           int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
@ -64,19 +67,19 @@ public:
    }
    return b;
  }
  static Vec256<float> loadu(const void* ptr, int64_t count = size) {
    if (count == size)
  static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
    if (count == size())
      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
    __at_align32__ float tmp_values[size];
    __at_align32__ float tmp_values[size()];
    std::memcpy(
        tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
    return _mm256_loadu_ps(tmp_values);
  }
  void store(void* ptr, int64_t count = size) const {
    if (count == size) {
  void store(void* ptr, int64_t count = size()) const {
    if (count == size()) {
      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
    } else if (count > 0) {
      float tmp_values[size];
      float tmp_values[size()];
      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(float));
    }
@ -260,7 +263,7 @@ template <>
void convert(const float* src, float* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
  for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
    _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
  }
#pragma unroll

@ -12,6 +12,11 @@ namespace {
struct Vec256i {
protected:
  __m256i values;

  static inline __m256i invert(const __m256i& v) {
    const auto ones = _mm256_set1_epi64x(-1);
    return _mm256_xor_si256(ones, v);
  }
public:
  Vec256i() {}
  Vec256i(__m256i v) : values(v) {}
@ -22,7 +27,9 @@ public:

template <>
struct Vec256<int64_t> : public Vec256i {
  static constexpr int size = 4;
  static constexpr int size() {
    return 4;
  }
  using Vec256i::Vec256i;
  Vec256() {}
  Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
@ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i {
  }
  template <int64_t mask>
  static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
    __at_align32__ int64_t tmp_values[size];
    __at_align32__ int64_t tmp_values[size()];
    a.store(tmp_values);
    if (mask & 0x01)
      tmp_values[0] = _mm256_extract_epi64(b.values, 0);
@ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i {
    return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
  }
  static Vec256<int64_t>
  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
@ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
  }
  static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
    __at_align32__ int64_t tmp_values[size];
    __at_align32__ int64_t tmp_values[size()];
    std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
    return loadu(tmp_values);
  }
  void store(void* ptr, int count = size) const {
    if (count == size) {
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
    } else if (count > 0) {
      __at_align32__ int64_t tmp_values[size];
      __at_align32__ int64_t tmp_values[size()];
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
    }
@ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i {
    return _mm256_cmpeq_epi64(values, other.values);
  }
  Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto eq = _mm256_cmpeq_epi64(values, other.values);
    return _mm256_xor_si256(zero, eq);  // invert
    return invert(_mm256_cmpeq_epi64(values, other.values));
  }
  Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
    return _mm256_cmpgt_epi64(other.values, values);
  }
  Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto gt = _mm256_cmpgt_epi64(values, other.values);
    return _mm256_xor_si256(zero, gt);  // invert
    return invert(_mm256_cmpgt_epi64(values, other.values));
  }
  Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
    return _mm256_cmpgt_epi64(values, other.values);
  }
  Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto lt = _mm256_cmpgt_epi64(other.values, values);
    return _mm256_xor_si256(zero, lt);  // invert
    return invert(_mm256_cmpgt_epi64(other.values, values));
  }
};

template <>
struct Vec256<int32_t> : public Vec256i {
  static constexpr int size = 8;
  static constexpr int size() {
    return 8;
  }
  using Vec256i::Vec256i;
  Vec256() {}
  Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
@ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i {
      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }
  static Vec256<int32_t>
  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
    switch (count) {
      case 0:
        return a;
@ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
  }
  static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
    __at_align32__ int32_t tmp_values[size];
    __at_align32__ int32_t tmp_values[size()];
    std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
    return loadu(tmp_values);
  }
  void store(void* ptr, int count = size) const {
    if (count == size) {
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
    } else if (count > 0) {
      __at_align32__ int32_t tmp_values[size];
      __at_align32__ int32_t tmp_values[size()];
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
    }
@ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i {
    return _mm256_cmpeq_epi32(values, other.values);
  }
  Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto eq = _mm256_cmpeq_epi32(values, other.values);
    return _mm256_xor_si256(zero, eq);  // invert
    return invert(_mm256_cmpeq_epi32(values, other.values));
  }
  Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
    return _mm256_cmpgt_epi32(other.values, values);
  }
  Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto gt = _mm256_cmpgt_epi32(values, other.values);
    return _mm256_xor_si256(zero, gt);  // invert
    return invert(_mm256_cmpgt_epi32(values, other.values));
  }
  Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
    return _mm256_cmpgt_epi32(values, other.values);
  }
  Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto lt = _mm256_cmpgt_epi32(other.values, values);
    return _mm256_xor_si256(zero, lt);  // invert
    return invert(_mm256_cmpgt_epi32(other.values, values));
  }
};

@ -212,13 +209,17 @@ template <>
void convert(const int32_t *src, float *dst, int64_t n) {
  int64_t i;
  // int32_t and float have same size
#pragma unroll
  for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
#ifndef _MSC_VER
# pragma unroll
#endif
  for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
    auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
    auto output_vec = _mm256_cvtepi32_ps(input_vec);
    _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
  }
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
  for (; i < n; i++) {
    dst[i] = static_cast<float>(src[i]);
  }
@ -228,13 +229,17 @@ template <>
void convert(const int32_t *src, double *dst, int64_t n) {
  int64_t i;
  // int32_t has half the size of double
#pragma unroll
  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
#ifndef _MSC_VER
# pragma unroll
#endif
  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
    auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
    auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
    _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
  }
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
  for (; i < n; i++) {
    dst[i] = static_cast<double>(src[i]);
  }
@ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {

template <>
struct Vec256<int16_t> : public Vec256i {
  static constexpr int size = 16;
  static constexpr int size() {
    return 16;
  }
  using Vec256i::Vec256i;
  Vec256() {}
  Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
@ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i {
  }
  template <int64_t mask>
  static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
    __at_align32__ int16_t tmp_values[size];
    __at_align32__ int16_t tmp_values[size()];
    a.store(tmp_values);
    if (mask & 0x01)
      tmp_values[0] = _mm256_extract_epi16(b.values, 0);
@ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i {
      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
  }
  static Vec256<int16_t>
  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
    switch (count) {
      case 0:
        return a;
@ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
  }
  static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
    __at_align32__ int16_t tmp_values[size];
    __at_align32__ int16_t tmp_values[size()];
    std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
    return loadu(tmp_values);
  }
  void store(void* ptr, int count = size) const {
    if (count == size) {
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
    } else if (count > 0) {
      __at_align32__ int16_t tmp_values[size];
      __at_align32__ int16_t tmp_values[size()];
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
    }
@ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i {
    return _mm256_cmpeq_epi16(values, other.values);
  }
  Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto eq = _mm256_cmpeq_epi16(values, other.values);
    return _mm256_xor_si256(zero, eq);  // invert
    return invert(_mm256_cmpeq_epi16(values, other.values));
  }
  Vec256<int16_t> operator<(const Vec256<int16_t>& other) const {
    return _mm256_cmpgt_epi16(other.values, values);
  }
  Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto gt = _mm256_cmpgt_epi16(values, other.values);
    return _mm256_xor_si256(zero, gt);  // invert
    return invert(_mm256_cmpgt_epi16(values, other.values));
  }
  Vec256<int16_t> operator>(const Vec256<int16_t>& other) const {
    return _mm256_cmpgt_epi16(values, other.values);
  }
  Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const {
    auto zero = _mm256_set1_epi64x(0);
    auto lt = _mm256_cmpgt_epi16(other.values, values);
    return _mm256_xor_si256(zero, lt);  // invert
    return invert(_mm256_cmpgt_epi16(other.values, values));
  }
};

@ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>

template <typename T>
Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
  T values_a[Vec256<T>::size];
  T values_b[Vec256<T>::size];
  T values_a[Vec256<T>::size()];
  T values_b[Vec256<T>::size()];
  a.store(values_a);
  b.store(values_b);
  for (int i = 0; i != Vec256<T>::size; i++) {
  for (int i = 0; i != Vec256<T>::size(); i++) {
    values_a[i] /= values_b[i];
  }
  return Vec256<T>::loadu(values_a);

@ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
      THCState_getCurrentDeviceProperties(globalContext().getTHCState());
  // NOTE: extra parenthesis around numbers disable clang warnings about
  // dead code
  return (
      (CUDNN_VERSION >= (6021)) ||
      (CUDNN_VERSION >= (6000) && prop->major >= 5));
  return true;
#else
  return false;
#endif

@ -9,45 +9,6 @@
#include "ATen/cuda/ATenCUDAGeneral.h"
#include <cuda.h>

#if CUDNN_VERSION < 7000

#include <curand_kernel.h>

/*
Note [cuDNN dropout descriptor initialization]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In most cases, setting descriptors in cuDNN is cheap (e.g.,
cudnnSetTensorNdDescriptor).  However, this is not the case for
cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an
expensive precomputation to initialize the random number generator states.  In
cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor,
which means that law-abiding clients were expected to generate a dropout
descriptor once and cache it.  However, our ATen interface is (1) stateless (so
we can't cache the descriptors) and (2) does not accept arbitrary user types in
its interface (so we can't pass the descriptor in).  This puts us in a pickle.

In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which
forgoes the expensive initialization process, and can initialize the
descriptor with a pre-initialized state CUDA tensor.  This is great, because
it means we can simply pass in the state tensor and then initialize the
descriptor internally.  Unfortunately, this function is not available in
cuDNN 6.

To work around this, we break the cuDNN abstraction barrier, and hard-code
the struct layout of the underlying dropout descriptor.  With this struct,
we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great!
*/

// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization]
struct cudnnDropoutStruct {
  float dropout;
  int nstates;
  void * states;
};

#endif

namespace at { namespace native {

// TODO: Add constructors for all of the descriptors
@ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor
    if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
    AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
                                          CUDNN_CROSS_CORRELATION, mathType));
#if CUDNN_VERSION >= 7000
    AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
    AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
    if(dataType == CUDNN_DATA_HALF)
      AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
#endif
  }
};

@ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor
  }
};

#if CUDNN_VERSION < 7000

// See Note [cuDNN dropout descriptor initialization]
inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
    cudnnDropoutDescriptor_t dropoutDesc,
    cudnnHandle_t handle,
    float dropout,
    void *states,
    size_t stateSizeInBytes,
    unsigned long long seed) {
  // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends.
  // This is not entirely accurate but is good enough to catch some API
  // uses which would not be compatible in cuDNN 7.  Feel free to fix
  // this if you notice something is wrong.
  if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE;
  if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE;
  size_t expectedStateSizeInBytes;
  // State size will differ depending on size of GPU
  auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes);
  if (ret != CUDNN_STATUS_SUCCESS) return ret;
  if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
  dropoutDesc->dropout = dropout;
  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
  dropoutDesc->states = states;
  return CUDNN_STATUS_SUCCESS;
}

#endif // CUDNN_VERSION

struct AT_CUDA_API DropoutDescriptor
  : public Descriptor<cudnnDropoutStruct,
                      &cudnnCreateDropoutDescriptor,
@ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor
 | 
			
		||||
          mode,
 | 
			
		||||
          algo,
 | 
			
		||||
          datatype));
 | 
			
		||||
#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000
 | 
			
		||||
#if CUDA_VERSION >= 9000
 | 
			
		||||
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
 | 
			
		||||
    if (prop->major >= 7) {
 | 
			
		||||
      if (datatype == CUDNN_DATA_HALF) {
 | 
			
		||||
@ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#if CUDNN_VERSION >= 7000
 | 
			
		||||
 | 
			
		||||
struct AT_CUDA_API CTCLossDescriptor
 | 
			
		||||
  : public Descriptor<cudnnCTCLossStruct,
 | 
			
		||||
                      &cudnnCreateCTCLossDescriptor,
 | 
			
		||||
@ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
union Constant
 | 
			
		||||
{
 | 
			
		||||
  float f;
 | 
			
		||||
 | 
			
		||||
@ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
 | 
			
		||||
      input_stride1 = strides[1];
 | 
			
		||||
    }
 | 
			
		||||
    AT_CHECK(channel_size == weight_num,
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
 | 
			
		||||
      weight_num, channel_size);
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
 | 
			
		||||
      " and channel size = ", channel_size, ".");
 | 
			
		||||
 | 
			
		||||
    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
 | 
			
		||||
      prelu_cpu_kernel_multi_weights<scalar_t>(
 | 
			
		||||
@ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
 | 
			
		||||
      input_stride1 = strides[1];
 | 
			
		||||
    }
 | 
			
		||||
    AT_CHECK(channel_size == weight_num,
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
 | 
			
		||||
      weight_num, channel_size);
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
 | 
			
		||||
      " and channel size = ", channel_size, ".");
 | 
			
		||||
 | 
			
		||||
    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
 | 
			
		||||
      prelu_cpu_backward_kernel_multi_weights<scalar_t>(
 | 
			
		||||
 | 
			
		||||
@ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
 | 
			
		||||
 | 
			
		||||
// Supports arbitrary batch dimensions for self and A
 | 
			
		||||
std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
 | 
			
		||||
  if (self.dim() <= 2 && A.dim() <= 2) {
 | 
			
		||||
  AT_CHECK(self.dim() >= 2,
 | 
			
		||||
           "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
 | 
			
		||||
  AT_CHECK(A.dim() >= 2,
 | 
			
		||||
           "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
 | 
			
		||||
  if (self.dim() == 2 && A.dim() == 2) {
 | 
			
		||||
    // TODO: #7102: It's not necessary to have gesv (single) bindings for both
 | 
			
		||||
    // TH and ATen. We should remove the TH gesv bindings, especially
 | 
			
		||||
    // since the lapackGesv function is already in ATen.
 | 
			
		||||
    linearSolveCheckInputs(self, A);  // Checks square shape of A, and compatibility of self and A
 | 
			
		||||
    return at::_th_gesv_single(self, A);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
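To make the new dimension checks concrete, here is a small hedged sketch of how the gesv entry point behaves after this change (hypothetical shapes; not part of the diff):

#include <ATen/ATen.h>
#include <tuple>

void gesv_shape_examples() {
  at::Tensor A = at::randn({3, 2, 4, 4});    // batch of 4x4 coefficient matrices
  at::Tensor b = at::randn({3, 2, 4, 6});    // matching batch of right-hand sides
  at::Tensor solution, lu;
  std::tie(solution, lu) = at::gesv(b, A);   // takes the batched helper path

  at::Tensor A2 = at::randn({4, 4});
  at::Tensor b2 = at::randn({4, 6});
  std::tie(solution, lu) = at::gesv(b2, A2); // both exactly 2-D: single-matrix TH path

  // A 1-D right-hand side such as at::randn({4}) now fails the
  // "b should have at least 2 dimensions" check instead of being routed
  // silently to the single-matrix binding.
}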
@ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) {
 | 
			
		||||
  }
 | 
			
		||||
  squareCheckInputs(self);
 | 
			
		||||
 | 
			
		||||
  // TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors,
 | 
			
		||||
  // this can be simplified. Currently, we are zero-ing out values in the
 | 
			
		||||
  // batch of matrices by using a mask and the `where` function.
 | 
			
		||||
  // The simplification with batched `triu` and `tril` would be this:
 | 
			
		||||
  // if (upper) {
 | 
			
		||||
  //   return raw_cholesky_output.triu();
 | 
			
		||||
  // } else {
 | 
			
		||||
  //   return raw_cholesky_output.tril();
 | 
			
		||||
  // }
 | 
			
		||||
  auto raw_cholesky_output = at::_cholesky_helper(self, upper);
 | 
			
		||||
  int64_t n = self.size(-1);
 | 
			
		||||
  auto indices = at::ones({n, n}, self.options().dtype(at::kByte));
 | 
			
		||||
  indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self);
 | 
			
		||||
  return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output);
 | 
			
		||||
  if (upper) {
 | 
			
		||||
    return raw_cholesky_output.triu_();
 | 
			
		||||
  } else {
 | 
			
		||||
    return raw_cholesky_output.tril_();
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
 | 
			
		||||
@ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t, bool inplace, bool upper>
 | 
			
		||||
static void apply_triu_tril_single(
 | 
			
		||||
    scalar_t* result, scalar_t* self,
 | 
			
		||||
    int64_t k, int64_t n, int64_t m,
 | 
			
		||||
    int64_t res_row_stride, int64_t res_col_stride,
 | 
			
		||||
    int64_t self_row_stride, int64_t self_col_stride) {
 | 
			
		||||
 | 
			
		||||
  constexpr int64_t zero = 0;
 | 
			
		||||
  int64_t i;
 | 
			
		||||
 | 
			
		||||
  if (upper) {
 | 
			
		||||
    #pragma omp parallel for private(i)
 | 
			
		||||
    for (i = 0; i < n; i++) {
 | 
			
		||||
      for (int64_t j = 0; j < std::min(m, i + k); j++) {
 | 
			
		||||
        result[i * res_row_stride + j * res_col_stride] = 0;
 | 
			
		||||
      }
 | 
			
		||||
      if (!inplace) {  // copy the rest of the self if not inplace
 | 
			
		||||
        for (int64_t j = std::max(zero, i + k); j < m; j++) {
 | 
			
		||||
          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    #pragma omp parallel for private(i)
 | 
			
		||||
    for (i = 0; i < n; i++) {
 | 
			
		||||
      for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
 | 
			
		||||
        result[i * res_row_stride + j * res_col_stride] = 0;
 | 
			
		||||
      }
 | 
			
		||||
      if (!inplace) {  // copy the rest of the self if not inplace
 | 
			
		||||
        for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
 | 
			
		||||
          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t, bool inplace, bool upper>
 | 
			
		||||
void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) {
 | 
			
		||||
  auto n = self.size(-2);
 | 
			
		||||
  auto m = self.size(-1);
 | 
			
		||||
  auto self_data = self.data<scalar_t>();
 | 
			
		||||
  auto self_stride = self.dim() > 2 ? self.stride(-3) : 1;
 | 
			
		||||
  auto batchsize = batchCount(self);
 | 
			
		||||
  auto self_row_stride = self.stride(-2);
 | 
			
		||||
  auto self_column_stride = self.stride(-1);
 | 
			
		||||
 | 
			
		||||
  auto result_data = result.data<scalar_t>();
 | 
			
		||||
  int64_t result_stride, result_row_stride, result_column_stride;
 | 
			
		||||
  if (result_data != self_data) {
 | 
			
		||||
    result_stride = result.dim() > 2 ? result.stride(-3) : 1;
 | 
			
		||||
    result_row_stride = result.stride(-2);
 | 
			
		||||
    result_column_stride = result.stride(-1);
 | 
			
		||||
  } else {
 | 
			
		||||
    result_stride = self_stride;
 | 
			
		||||
    result_row_stride = self_row_stride;
 | 
			
		||||
    result_column_stride = self_column_stride;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int64_t b;
 | 
			
		||||
  #pragma omp parallel for private(b)
 | 
			
		||||
  for (b = 0; b < batchsize; b++) {
 | 
			
		||||
    scalar_t* self_batch = &self_data[b * self_stride];
 | 
			
		||||
    scalar_t* result_batch = &result_data[b * result_stride];
 | 
			
		||||
    apply_triu_tril_single<scalar_t, inplace, upper>(
 | 
			
		||||
        result_batch, self_batch, k, n, m,
 | 
			
		||||
        result_row_stride, result_column_stride, self_row_stride, self_column_stride);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor tril(const Tensor& self, int64_t k) {
 | 
			
		||||
  Tensor result = at::empty({0}, self.options());
 | 
			
		||||
  at::tril_out(result, self, k);
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& tril_cpu_(Tensor &self, int64_t k) {
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return self;
 | 
			
		||||
  }
 | 
			
		||||
  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
 | 
			
		||||
  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
 | 
			
		||||
    apply_triu_tril<scalar_t, true, false>(self, self, k);
 | 
			
		||||
  });
 | 
			
		||||
  return self;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
 | 
			
		||||
  if (result.sizes() != self.sizes()) {
 | 
			
		||||
    result.resize_as_(self);
 | 
			
		||||
  }
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
 | 
			
		||||
  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
 | 
			
		||||
    apply_triu_tril<scalar_t, false, false>(result, self_c, k);
 | 
			
		||||
  });
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor triu(const Tensor& self, int64_t k) {
 | 
			
		||||
  Tensor result = at::empty({0}, self.options());
 | 
			
		||||
  at::triu_out(result, self, k);
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& triu_cpu_(Tensor &self, int64_t k) {
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return self;
 | 
			
		||||
  }
 | 
			
		||||
  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
 | 
			
		||||
  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
 | 
			
		||||
    apply_triu_tril<scalar_t, true, true>(self, self, k);
 | 
			
		||||
  });
 | 
			
		||||
  return self;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
 | 
			
		||||
  if (result.sizes() != self.sizes()) {
 | 
			
		||||
    result.resize_as_(self);
 | 
			
		||||
  }
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
 | 
			
		||||
  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
 | 
			
		||||
    apply_triu_tril<scalar_t, false, true>(result, self_c, k);
 | 
			
		||||
  });
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}}  // namespace at::native
 | 
			
		||||
 | 
			
		||||
@ -378,8 +378,8 @@ at::Tensor _convolution(
 | 
			
		||||
    AT_CHECK(!bias.defined() || (input.type() == bias.type()),
 | 
			
		||||
             "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(),
 | 
			
		||||
             ") should be the same");
 | 
			
		||||
 | 
			
		||||
    output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
 | 
			
		||||
    output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias,
 | 
			
		||||
                                    params.padding, params.stride, params.dilation, params.groups);
 | 
			
		||||
#endif
 | 
			
		||||
  } else {
 | 
			
		||||
    if (params.groups == 1) {
 | 
			
		||||
 | 
			
		||||
@ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & eq_(Tensor& self, const Tensor & other) {
 | 
			
		||||
  return at::_th_ge_(self, other);
 | 
			
		||||
  return at::_th_eq_(self, other);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & ne_(Tensor& self, Scalar other) {
 | 
			
		||||
@ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) {
 | 
			
		||||
  return at::_th_atan2_(self, other);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & tril_(Tensor& self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_tril_(self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & triu_(Tensor& self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_triu_(self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & digamma_(Tensor& self) {
 | 
			
		||||
  return at::_th_digamma_(self);
 | 
			
		||||
}
 | 
			
		||||
@ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) {
 | 
			
		||||
  return at::_th_cross(self, other, dim);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_triu_out(result, self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor triu(const Tensor & self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_triu(self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_tril_out(result, self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor tril(const Tensor & self, int64_t diagonal) {
 | 
			
		||||
  return at::_th_tril(self, diagonal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor trace(const Tensor & self) {
 | 
			
		||||
  return at::_th_trace(self);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) {
 | 
			
		||||
  return batched_matrices.size(-1) * batched_matrices.size(-2);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Checks a necessary property for the triu and tril implementations, hence the name.
 * Batch contiguity is checked here for tensors with more than 3 dimensions.
 * Contiguous tensors and tensors with 3 or fewer dimensions pass this check.
 */
static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) {
 | 
			
		||||
  // Complete contiguity is the most desired property, which is why
 | 
			
		||||
  // we return true if the tensor is contiguous
 | 
			
		||||
  if (tensor.is_contiguous()) return true;
 | 
			
		||||
 | 
			
		||||
  int64_t dims = tensor.dim();
 | 
			
		||||
 | 
			
		||||
  // Tensors with dimension less than 4 are handled by default
 | 
			
		||||
  if (dims <= 3) return true;
 | 
			
		||||
 | 
			
		||||
  int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
 | 
			
		||||
  for (int64_t i = dims - 3; i >= 0; i--) {
 | 
			
		||||
    if (expected_stride != tensor.stride(i)) return false;
 | 
			
		||||
    expected_stride *= tensor.size(i);
 | 
			
		||||
  }
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
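For intuition, the following standalone sketch restates the same stride rule on plain ATen tensors and shows one tensor that passes and one that fails (hypothetical shapes and helper name; not part of the diff). A failing tensor is simply copied with .contiguous() by the tril/triu entry points before the kernel runs.

#include <ATen/ATen.h>
#include <cstdio>

// Re-statement of the batch-contiguity rule above, for illustration only.
static bool batch_contiguous(const at::Tensor& t) {
  if (t.is_contiguous()) return true;
  if (t.dim() <= 3) return true;
  int64_t expected = t.size(-1) * t.size(-2);
  for (int64_t i = t.dim() - 3; i >= 0; i--) {
    if (t.stride(i) != expected) return false;
    expected *= t.size(i);
  }
  return true;
}

int main() {
  at::Tensor ok  = at::randn({2, 3, 4, 5, 5});  // contiguous 5-D batch: passes
  at::Tensor bad = at::randn({4, 5, 2, 3, 5}).permute({2, 3, 0, 4, 1});
  // `bad` has sizes (2, 3, 4, 5, 5) but permuted strides, so the check fails.
  std::printf("ok: %d, bad: %d\n", (int)batch_contiguous(ok), (int)batch_contiguous(bad));
  return 0;
}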
 | 
			
		||||
// Returns the epsilon value for floating types except half
 | 
			
		||||
static inline double _get_epsilon(const ScalarType& sc_type) {
 | 
			
		||||
  switch (sc_type) {
 | 
			
		||||
 | 
			
		||||
@ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
 | 
			
		||||
std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias,
 | 
			
		||||
                                                  const Tensor& running_mean, const Tensor& running_var,
 | 
			
		||||
                                                  bool train, double momentum, double eps) {
 | 
			
		||||
  checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
 | 
			
		||||
 | 
			
		||||
  return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
 | 
			
		||||
      return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps);
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
@ -21,7 +21,6 @@ namespace native {
 | 
			
		||||
 | 
			
		||||
DEFINE_DISPATCH(sum_stub);
 | 
			
		||||
DEFINE_DISPATCH(prod_stub);
 | 
			
		||||
DEFINE_DISPATCH(norm_kernel);
 | 
			
		||||
 | 
			
		||||
static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
 | 
			
		||||
  ScalarType scalarType = self.type().scalarType();
 | 
			
		||||
@ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_
 | 
			
		||||
  int64_t dim = maybe_wrap_dim(dim_, self.dim());
 | 
			
		||||
  if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
 | 
			
		||||
    return result;
 | 
			
		||||
  if (self.is_contiguous() && result.is_contiguous()) {
 | 
			
		||||
    _dimreduce_setup(result, self, dim);
 | 
			
		||||
    norm_kernel(kCPU, result, self, p, dim);
 | 
			
		||||
    if (!keepdim) {
 | 
			
		||||
      result.squeeze_(dim);
 | 
			
		||||
    }
 | 
			
		||||
    return result;
 | 
			
		||||
  } else {
 | 
			
		||||
    return at::_th_norm_out(result, self, p, dim, keepdim);
 | 
			
		||||
  }
 | 
			
		||||
  return at::_th_norm_out(result, self, p, dim, keepdim);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
 | 
			
		||||
@ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) {
 | 
			
		||||
    AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
 | 
			
		||||
             "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
 | 
			
		||||
    AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
 | 
			
		||||
    if (self.is_cuda()) {
 | 
			
		||||
      return at::_th_norm(self, p);
 | 
			
		||||
    } else {
 | 
			
		||||
      if (self.is_contiguous()) {
 | 
			
		||||
        Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
 | 
			
		||||
        norm_kernel(kCPU, result, self, p, c10::nullopt);
 | 
			
		||||
        return result;
 | 
			
		||||
      } else {
 | 
			
		||||
        return at::_th_norm(self, p);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    return at::_th_norm(self, p);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -34,11 +34,11 @@ Tensor _bincount_cpu_template(
 | 
			
		||||
  int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L;
 | 
			
		||||
  nbins = std::max(nbins, minlength); // at least minlength # of bins
 | 
			
		||||
 | 
			
		||||
  const input_t* self_p = self.contiguous().data<input_t>();
 | 
			
		||||
  const input_t* self_p = self.data<input_t>();
 | 
			
		||||
  if (has_weights) {
 | 
			
		||||
    output = native::zeros({nbins}, weights.options());
 | 
			
		||||
    weights_t* output_p = output.data<weights_t>();
 | 
			
		||||
    const weights_t* weights_p = weights.contiguous().data<weights_t>();
 | 
			
		||||
    const weights_t* weights_p = weights.data<weights_t>();
 | 
			
		||||
    for (int64_t i = 0; i < self.size(0); i++) {
 | 
			
		||||
      output_p[self_p[i]] += weights_p[i];
 | 
			
		||||
    }
 | 
			
		||||
@ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
 | 
			
		||||
  return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
 | 
			
		||||
    const auto scalar = weights.type().scalarType();
 | 
			
		||||
    if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
 | 
			
		||||
      return _bincount_cpu_template<scalar_t, float>(self, weights, minlength);
 | 
			
		||||
      return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
 | 
			
		||||
    return _bincount_cpu_template<scalar_t, double>(
 | 
			
		||||
        self, weights.toType(CPU(kDouble)), minlength);
 | 
			
		||||
        self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const {
 | 
			
		||||
  if (range.size() == 0) {
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
  auto strides = get_strides();
 | 
			
		||||
  while (strides.size() < 2 * ntensors()) {
 | 
			
		||||
    strides.push_back(0);
 | 
			
		||||
@ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range)
 | 
			
		||||
  int64_t ndim = values.size();
 | 
			
		||||
  for (int dim = 0; dim < ndim; dim++) {
 | 
			
		||||
    int64_t size = shape[dim];
 | 
			
		||||
    values[dim] = linear_offset % size;
 | 
			
		||||
    linear_offset /= size;
 | 
			
		||||
    if (size > 0) {
 | 
			
		||||
      values[dim] = linear_offset % size;
 | 
			
		||||
      linear_offset /= size;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  AT_ASSERT(linear_offset == 0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -101,14 +101,14 @@ struct PDist {
 | 
			
		||||
 | 
			
		||||
    scalar_t * const res_start = result.data<scalar_t>();
 | 
			
		||||
    int64_t combs = result.numel(); // n * (n - 1) / 2
 | 
			
		||||
    const Vec pvec(p);
 | 
			
		||||
 | 
			
		||||
    // We conceptually iterate over tuples of (i, j, k) where i is the first
 | 
			
		||||
    // vector from the input, j is the second, and k is the result index. This
 | 
			
		||||
    // parallelizes over the range of k and infers what i and j are from the
 | 
			
		||||
    // value of k.
 | 
			
		||||
    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) {
 | 
			
		||||
      float n2 = n - .5;
 | 
			
		||||
    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) {
 | 
			
		||||
      const Vec pvec(p);
 | 
			
		||||
      double n2 = n - .5;
 | 
			
		||||
      // The -1 accounts for floating point truncation issues
 | 
			
		||||
      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
 | 
			
		||||
      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
 | 
			
		||||
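The index arithmetic in the comment above can be checked in isolation: the loop body recovers the pair (i, j) from the condensed index k, where k = n*i - i*(i+1)/2 + (j - i - 1). A minimal standalone sketch using the same formulas (hypothetical n; not part of the diff):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 5;                    // number of input vectors
  const int64_t combs = n * (n - 1) / 2;  // number of pairwise distances
  const double n2 = n - .5;
  for (int64_t k = 0; k < combs; ++k) {
    // Same expressions as in the parallel_for body above.
    int64_t i = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
    int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
    std::printf("k=%2lld -> (i=%lld, j=%lld)\n",
                (long long)k, (long long)i, (long long)j);
  }
  return 0;
}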
@ -149,7 +149,7 @@ struct PDist {
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <typename F>
 | 
			
		||||
  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) {
 | 
			
		||||
  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
 | 
			
		||||
    for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
 | 
			
		||||
 | 
			
		||||
      const Vec self_vec_i = Vec::loadu(self_i, count);
 | 
			
		||||
@ -177,7 +177,6 @@ struct PDist {
 | 
			
		||||
    const int64_t n = self.size(0);
 | 
			
		||||
    const int64_t m = self.size(1);
 | 
			
		||||
    const int64_t gs = grad.stride(0);
 | 
			
		||||
    const Vec pvec(p);
 | 
			
		||||
 | 
			
		||||
    const scalar_t * const grad_start = grad.data<scalar_t>();
 | 
			
		||||
    const scalar_t * const dist_start = dist.data<scalar_t>();
 | 
			
		||||
@ -187,17 +186,19 @@ struct PDist {
 | 
			
		||||
    // The only way to parallelize and avoid locking requires parallelizing
 | 
			
		||||
    // over the columns of the input, i.e. we compute the gradient for the
 | 
			
		||||
    // first section of each vector independently of the second section, etc.
 | 
			
		||||
    at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) {
 | 
			
		||||
      const scalar_t * self_l = self_start + l * Vec::size;
 | 
			
		||||
      scalar_t * res_l = res_start + l * Vec::size;
 | 
			
		||||
    at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) {
 | 
			
		||||
      const Vec pvec(p);
 | 
			
		||||
 | 
			
		||||
      for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) {
 | 
			
		||||
      const scalar_t * self_l = self_start + l * Vec::size();
 | 
			
		||||
      scalar_t * res_l = res_start + l * Vec::size();
 | 
			
		||||
 | 
			
		||||
      for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) {
 | 
			
		||||
        backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
    const int64_t remainder = m % Vec::size;
 | 
			
		||||
    const int64_t remainder = m % Vec::size();
 | 
			
		||||
    if (remainder) {
 | 
			
		||||
      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder);
 | 
			
		||||
      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -308,7 +308,9 @@ static inline void
 | 
			
		||||
mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
 | 
			
		||||
                 const int_same_size_t<scalar_t> *offsets,
 | 
			
		||||
                 const int_same_size_t<scalar_t> *mask, int64_t len) {
 | 
			
		||||
  #pragma unroll
 | 
			
		||||
  #ifndef _MSC_VER  
 | 
			
		||||
  # pragma unroll  
 | 
			
		||||
  #endif
 | 
			
		||||
  for (int64_t i = 0; i < len; i++) {
 | 
			
		||||
    if (mask[i] & 0x01) {
 | 
			
		||||
      base_addr[offsets[i]] += src[i];
 | 
			
		||||
@ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
 | 
			
		||||
    auto i_sw_offset = i_nw_offset + iVec(inp_sH);
 | 
			
		||||
    auto i_se_offset = i_sw_offset + iVec(inp_sW);
 | 
			
		||||
 | 
			
		||||
    #pragma unroll
 | 
			
		||||
    #ifndef _MSC_VER  
 | 
			
		||||
    # pragma unroll  
 | 
			
		||||
    #endif
 | 
			
		||||
    for (int64_t c = 0; c < C; ++c) {
 | 
			
		||||
      auto inp_slice_C_ptr = inp_slice[c].data();
 | 
			
		||||
 | 
			
		||||
@ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
 | 
			
		||||
    // So we store the necessary vectors to temporary arrays and use the helper
 | 
			
		||||
    // mask_scatter_add defined above.
 | 
			
		||||
 | 
			
		||||
    integer_t i_gInp_nw_offset_arr[iVec::size];
 | 
			
		||||
    integer_t i_gInp_ne_offset_arr[iVec::size];
 | 
			
		||||
    integer_t i_gInp_sw_offset_arr[iVec::size];
 | 
			
		||||
    integer_t i_gInp_se_offset_arr[iVec::size];
 | 
			
		||||
    integer_t i_gInp_nw_offset_arr[iVec::size()];
 | 
			
		||||
    integer_t i_gInp_ne_offset_arr[iVec::size()];
 | 
			
		||||
    integer_t i_gInp_sw_offset_arr[iVec::size()];
 | 
			
		||||
    integer_t i_gInp_se_offset_arr[iVec::size()];
 | 
			
		||||
    i_gInp_nw_offset.store(i_gInp_nw_offset_arr);
 | 
			
		||||
    i_gInp_ne_offset.store(i_gInp_ne_offset_arr);
 | 
			
		||||
    i_gInp_sw_offset.store(i_gInp_sw_offset_arr);
 | 
			
		||||
    i_gInp_se_offset.store(i_gInp_se_offset_arr);
 | 
			
		||||
 | 
			
		||||
    integer_t i_nw_mask_arr[iVec::size];
 | 
			
		||||
    integer_t i_ne_mask_arr[iVec::size];
 | 
			
		||||
    integer_t i_sw_mask_arr[iVec::size];
 | 
			
		||||
    integer_t i_se_mask_arr[iVec::size];
 | 
			
		||||
    integer_t i_nw_mask_arr[iVec::size()];
 | 
			
		||||
    integer_t i_ne_mask_arr[iVec::size()];
 | 
			
		||||
    integer_t i_sw_mask_arr[iVec::size()];
 | 
			
		||||
    integer_t i_se_mask_arr[iVec::size()];
 | 
			
		||||
    nw_mask.store(i_nw_mask_arr);
 | 
			
		||||
    ne_mask.store(i_ne_mask_arr);
 | 
			
		||||
    sw_mask.store(i_sw_mask_arr);
 | 
			
		||||
    se_mask.store(i_se_mask_arr);
 | 
			
		||||
 | 
			
		||||
    scalar_t gInp_corner_arr[Vec::size];
 | 
			
		||||
    scalar_t gInp_corner_arr[Vec::size()];
 | 
			
		||||
 | 
			
		||||
    auto gx = Vec(0), gy = Vec(0);
 | 
			
		||||
    #pragma unroll
 | 
			
		||||
    #ifndef _MSC_VER  
 | 
			
		||||
    # pragma unroll  
 | 
			
		||||
    #endif
 | 
			
		||||
    for (int64_t c = 0; c < C; ++c) {
 | 
			
		||||
      auto inp_slice_C_ptr = inp_slice[c].data();
 | 
			
		||||
      auto gInp_slice_C_ptr = gInp_slice[c].data();
 | 
			
		||||
@ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
 | 
			
		||||
    gx = gx * gx_mult;
 | 
			
		||||
    gy = gy * gy_mult;
 | 
			
		||||
 | 
			
		||||
    constexpr int64_t step = Vec::size;
 | 
			
		||||
    constexpr int64_t step = Vec::size();
 | 
			
		||||
    auto interleaved_gGrid = interleave2(gx, gy);
 | 
			
		||||
    auto gGrid_ptr = gGrid_slice.data() + offset * 2;
 | 
			
		||||
    std::get<0>(interleaved_gGrid).store(gGrid_ptr,
 | 
			
		||||
@ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
 | 
			
		||||
    auto out_ptr = out_slice.data() + offset;
 | 
			
		||||
    auto out_sC = out_slice.stride(0);
 | 
			
		||||
    auto inp_slice_ptr = inp_slice.data();
 | 
			
		||||
    #pragma unroll
 | 
			
		||||
    #ifndef _MSC_VER  
 | 
			
		||||
    # pragma unroll  
 | 
			
		||||
    #endif
 | 
			
		||||
    for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
 | 
			
		||||
      // mask_gather zeros out the mask, so we need to make a copy
 | 
			
		||||
      auto mask_copy = mask;
 | 
			
		||||
@ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
 | 
			
		||||
 | 
			
		||||
    auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest;  // gInp is contiguous
 | 
			
		||||
 | 
			
		||||
    integer_t mask_arr[iVec::size];
 | 
			
		||||
    integer_t mask_arr[iVec::size()];
 | 
			
		||||
    i_mask.store(mask_arr);
 | 
			
		||||
    integer_t gInp_offset_arr[iVec::size];
 | 
			
		||||
    integer_t gInp_offset_arr[iVec::size()];
 | 
			
		||||
    i_gInp_offset.store(gInp_offset_arr);
 | 
			
		||||
 | 
			
		||||
    #pragma unroll
 | 
			
		||||
    #ifndef _MSC_VER  
 | 
			
		||||
    # pragma unroll  
 | 
			
		||||
    #endif
 | 
			
		||||
    for (int64_t c = 0; c < C; ++c) {
 | 
			
		||||
      mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
 | 
			
		||||
                       gInp_offset_arr, mask_arr, len);
 | 
			
		||||
@ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator(
 | 
			
		||||
 | 
			
		||||
  using Vec = Vec256<scalar_t>;
 | 
			
		||||
  using iVec = Vec256<int_same_size_t<scalar_t>>;
 | 
			
		||||
  constexpr int64_t step = Vec::size;
 | 
			
		||||
  constexpr int64_t step = Vec::size();
 | 
			
		||||
 | 
			
		||||
  // Loop over each output pixel in grid.
 | 
			
		||||
  // We consider the following three cases (after slicing out the batch
 | 
			
		||||
@ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator(
 | 
			
		||||
    auto spatial_offset = 0;
 | 
			
		||||
    auto i_offsets_delta = iVec(grid_sW * step);
 | 
			
		||||
 | 
			
		||||
    #pragma unroll
 | 
			
		||||
    #ifndef _MSC_VER  
 | 
			
		||||
    # pragma unroll  
 | 
			
		||||
    #endif
 | 
			
		||||
    for (int64_t h = 0; h < out_H; h++) {
 | 
			
		||||
      auto grid_ptr_x = grid_ptr + h * grid_sH;
 | 
			
		||||
      auto grid_ptr_y = grid_ptr_x + grid_sCoor;
 | 
			
		||||
      auto i_offsets = iVec::arange(0, grid_sW);
 | 
			
		||||
      #pragma unroll
 | 
			
		||||
      #ifndef _MSC_VER  
 | 
			
		||||
      # pragma unroll  
 | 
			
		||||
      #endif
 | 
			
		||||
      for (int64_t w = 0; w < out_W; w += step) {
 | 
			
		||||
        auto len = std::min(step, out_W - w);
 | 
			
		||||
        if (len < step) {
 | 
			
		||||
 | 
			
		||||
@ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t>
 | 
			
		||||
static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) {
 | 
			
		||||
  VEC_LOOP_HEADER(func_t, data)
 | 
			
		||||
  int64_t i = 0;
 | 
			
		||||
  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
 | 
			
		||||
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
 | 
			
		||||
    auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
    auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
    auto out1 = vop(a1, b1);
 | 
			
		||||
    auto out2 = vop(a2, b2);
 | 
			
		||||
    out1.store(out_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
  }
 | 
			
		||||
  int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) };
 | 
			
		||||
  binary_loop(data, strides, i, n, op);
 | 
			
		||||
@ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op,
 | 
			
		||||
  VEC_LOOP_HEADER(func_t, data)
 | 
			
		||||
  int64_t i = 0;
 | 
			
		||||
  auto a = Vec(*(scalar_t*)in1_ptr);
 | 
			
		||||
  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
 | 
			
		||||
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
 | 
			
		||||
    auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
    auto out1 = vop(a, b1);
 | 
			
		||||
    auto out2 = vop(a, b2);
 | 
			
		||||
    out1.store(out_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
  }
 | 
			
		||||
  int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) };
 | 
			
		||||
  binary_loop(data, strides, i, n, op);
 | 
			
		||||
@ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op,
 | 
			
		||||
  VEC_LOOP_HEADER(func_t, data)
 | 
			
		||||
  int64_t i = 0;
 | 
			
		||||
  auto b = Vec(*(scalar_t*)in2_ptr);
 | 
			
		||||
  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
 | 
			
		||||
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
 | 
			
		||||
    auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
    auto out1 = vop(a1, b);
 | 
			
		||||
    auto out2 = vop(a2, b);
 | 
			
		||||
    out1.store(out_ptr + i * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
 | 
			
		||||
    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
 | 
			
		||||
  }
 | 
			
		||||
  int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 };
 | 
			
		||||
  binary_loop(data, strides, i, n, op);
 | 
			
		||||
@ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o
 | 
			
		||||
  char* in_ptr = data[1];
 | 
			
		||||
  Vec acc[4];
 | 
			
		||||
  for  (int j = 0; j < 4; j++) {
 | 
			
		||||
    acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t));
 | 
			
		||||
    acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t));
 | 
			
		||||
  }
 | 
			
		||||
  for (int64_t i = 1; i < n; i++) {
 | 
			
		||||
    const char* ptr = in_ptr + stride * i;
 | 
			
		||||
    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t))));
 | 
			
		||||
    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t))));
 | 
			
		||||
    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t))));
 | 
			
		||||
    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t))));
 | 
			
		||||
    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
 | 
			
		||||
    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
 | 
			
		||||
    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
 | 
			
		||||
    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
 | 
			
		||||
  }
 | 
			
		||||
  if (reduce) {
 | 
			
		||||
    scalar_t buffer[Vec::size];
 | 
			
		||||
    scalar_t buffer[Vec::size()];
 | 
			
		||||
    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
 | 
			
		||||
    acc[0].store(buffer);
 | 
			
		||||
    for (int j = 1; j < Vec::size; j++) {
 | 
			
		||||
    for (int j = 1; j < Vec::size(); j++) {
 | 
			
		||||
      buffer[0] = op(buffer[0], buffer[j]);
 | 
			
		||||
    }
 | 
			
		||||
    auto dst = (scalar_t*)out_ptr;
 | 
			
		||||
    *dst = op(*dst, buffer[0]);
 | 
			
		||||
  } else {
 | 
			
		||||
    for (int j = 0; j < 4; j++) {
 | 
			
		||||
      auto dst = out_ptr + j * Vec::size * sizeof(scalar_t);
 | 
			
		||||
      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
 | 
			
		||||
      acc[j] = vop(acc[j], Vec::loadu(dst));
 | 
			
		||||
      acc[j].store(dst);
 | 
			
		||||
    }
 | 
			
		||||
@ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int
 | 
			
		||||
template <typename func_t, typename vec_func_t>
 | 
			
		||||
static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
 | 
			
		||||
  VEC_HEADER(func_t)
 | 
			
		||||
  int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t);
 | 
			
		||||
  int64_t count = n / (4 * Vec::size);
 | 
			
		||||
  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
 | 
			
		||||
  int64_t count = n / (4 * Vec::size());
 | 
			
		||||
  if (count > 0) {
 | 
			
		||||
    reduction128(data, count, vector_stride, op, vop, /*reduce=*/true);
 | 
			
		||||
  }
 | 
			
		||||
  char* ptrs[3] = { data[0], data[0], data[1] };
 | 
			
		||||
  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
 | 
			
		||||
  binary_loop(ptrs, strides, count * 4 * Vec::size, n, op);
 | 
			
		||||
  binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// computes the reduction out = op(out, in)
 | 
			
		||||
@ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t>
 | 
			
		||||
static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
 | 
			
		||||
  VEC_HEADER(func_t)
 | 
			
		||||
 | 
			
		||||
  // reduce down each column of 4 * Vec::size elements (128 bytes)
 | 
			
		||||
  // reduce down each column of 4 * Vec::size() elements (128 bytes)
 | 
			
		||||
  int64_t outer_stride[2] = { 128, 128 };
 | 
			
		||||
  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] {
 | 
			
		||||
  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
 | 
			
		||||
    reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false);
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  // reduce down the remaining columns
 | 
			
		||||
  int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
 | 
			
		||||
  int64_t remaining = size1 % (4 * Vec::size);
 | 
			
		||||
  int64_t remaining = size1 % (4 * Vec::size());
 | 
			
		||||
  UNARY_OUTER_LOOP(data, step, remaining, [&] {
 | 
			
		||||
    char* ptrs[3] = { data[0], data[0], data[1] };
 | 
			
		||||
    int64_t strides[] = { 0, 0, inner_stride };
 | 
			
		||||
 | 
			
		||||
@ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) {
 | 
			
		||||
      /*identity=*/1);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline int64_t round_down(int64_t a, int64_t m) {
 | 
			
		||||
  return a - (a % m);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<typename scalar_t>
 | 
			
		||||
struct NormReduction {
 | 
			
		||||
  // reduction width in number of scalar elements
 | 
			
		||||
  static constexpr int WIDTH = 128 / sizeof(scalar_t);
 | 
			
		||||
  using Vec = Vec256<scalar_t>;
 | 
			
		||||
 | 
			
		||||
  static void apply(
 | 
			
		||||
      Tensor& res,
 | 
			
		||||
      const Tensor& self,
 | 
			
		||||
      Scalar p,
 | 
			
		||||
      c10::optional<int64_t> dim) {
 | 
			
		||||
    auto out_ = res.data<scalar_t>();
 | 
			
		||||
    auto data_ = self.data<scalar_t>();
 | 
			
		||||
    auto numel = self.numel();
 | 
			
		||||
    float pval = 0.0;
 | 
			
		||||
    if (p.isIntegral()){
 | 
			
		||||
      pval = p.to<int64_t>();
 | 
			
		||||
    } else if (p.isFloatingPoint()) {
 | 
			
		||||
      pval = p.to<float>();
 | 
			
		||||
    }
 | 
			
		||||
    if (!dim.has_value()) {
 | 
			
		||||
      *out_ = reduce_all(data_, numel,  pval);
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
    int64_t n = self.size(*dim);
 | 
			
		||||
    int64_t stride = self.stride(*dim);
 | 
			
		||||
    // A contiguous tensor does not need to hold a meaningful stride
 | 
			
		||||
    // if the corresponding size is 1
 | 
			
		||||
    if (n == 1) {
 | 
			
		||||
      stride = 1;
 | 
			
		||||
      for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
 | 
			
		||||
        stride *= self.size(i);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    int64_t batch = numel / n;
 | 
			
		||||
    parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
 | 
			
		||||
      for (int64_t bi = begin; bi < end; bi++) {
 | 
			
		||||
        int64_t b = bi / stride;
 | 
			
		||||
        int64_t i = bi % stride;
 | 
			
		||||
        const scalar_t* data = &data_[b * n * stride + i];
 | 
			
		||||
        out_[bi] = norm_reduce(data, n, stride, pval);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static scalar_t reduce_all(const scalar_t* data_, int64_t size,  float pval) {
 | 
			
		||||
    scalar_t sum = parallel_reduce(
 | 
			
		||||
      0,
 | 
			
		||||
      size,
 | 
			
		||||
      internal::GRAIN_SIZE,
 | 
			
		||||
      (scalar_t)0,
 | 
			
		||||
      [=](int64_t begin, int64_t end, scalar_t init) {
 | 
			
		||||
        const scalar_t* data = &data_[begin];
 | 
			
		||||
        int64_t n = end - begin;
 | 
			
		||||
        scalar_t result = norm_reduce(data, n, 1, pval);
 | 
			
		||||
        return result;
 | 
			
		||||
      },
 | 
			
		||||
      std::plus<scalar_t>());
 | 
			
		||||
    return sum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
 | 
			
		||||
    scalar_t result = 0.0;
 | 
			
		||||
    if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
 | 
			
		||||
      int64_t n_rounded = round_down(n, WIDTH);
 | 
			
		||||
      scalar_t result1 = norm_reduce128(data, n_rounded, pval);
 | 
			
		||||
      scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
 | 
			
		||||
      result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
 | 
			
		||||
    } else {
 | 
			
		||||
      result = norm_reduce_sequential(data, n, stride, pval);
 | 
			
		||||
    }
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
 | 
			
		||||
    scalar_t result = 0.0;
 | 
			
		||||
    if (pval == 0) {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result += (data[k * stride] != 0.0);
 | 
			
		||||
      }
 | 
			
		||||
    } else if (pval == 1) {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result += std::abs(data[k * stride]);
 | 
			
		||||
      }
 | 
			
		||||
    } else if (pval == 2) {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result += data[k * stride] * data[k * stride];
 | 
			
		||||
      }
 | 
			
		||||
      result = std::sqrt(result);
 | 
			
		||||
    } else if (pval == 3) {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
 | 
			
		||||
      }
 | 
			
		||||
      result = std::pow(result, 1.0/3);
 | 
			
		||||
    } else if (pval == INFINITY) {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
 | 
			
		||||
      }
 | 
			
		||||
    } else if (pval == -INFINITY) {
 | 
			
		||||
      result = INFINITY;
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result;
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      for (int64_t k = 0; k < n; k++) {
 | 
			
		||||
        result += std::pow(std::abs(data[k * stride]), pval);
 | 
			
		||||
      }
 | 
			
		||||
      result = std::pow(result, 1.0/pval);
 | 
			
		||||
    }
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Reduce down a column of WIDTH elements (128 bytes) with the given number n
 | 
			
		||||
  // n is already rounded by 128
 | 
			
		||||
  static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
 | 
			
		||||
    scalar_t result = 0.0;
 | 
			
		||||
    Vec acc[4] = {0.0, 0.0, 0.0, 0.0};  // 128 bytes (two cache lines)
 | 
			
		||||
    static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
 | 
			
		||||
    int64_t rows = n / WIDTH;
 | 
			
		||||
    if (pval == 1){
 | 
			
		||||
      for (int row = 0; row < rows; row ++) {
 | 
			
		||||
        for (int j = 0; j != 4; j++) {
 | 
			
		||||
          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
 | 
			
		||||
          acc[j] = acc[j] + val.abs();
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    else if (pval == 2) {
 | 
			
		||||
      for (int row = 0; row < rows; row ++) {
 | 
			
		||||
        for (int j = 0; j != 4; j++) {
 | 
			
		||||
          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
 | 
			
		||||
          acc[j] = acc[j] + val * val;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    else if (pval == 3) {
 | 
			
		||||
      for (int row = 0; row < rows; row ++) {
 | 
			
		||||
        for (int j = 0; j != 4; j++) {
 | 
			
		||||
          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
 | 
			
		||||
          acc[j] = acc[j] + (val * val * val).abs();
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    scalar_t buf[WIDTH] = {0};
 | 
			
		||||
    for (int j = 0; j != 4; j++) {
 | 
			
		||||
      acc[j].store(&buf[j * Vec::size]);
 | 
			
		||||
    }
 | 
			
		||||
    for (int i = 0; i < WIDTH; i++) {
 | 
			
		||||
      result += buf[i];
 | 
			
		||||
    }
 | 
			
		||||
    result = std::pow(result, 1.0/pval);
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static void norm_kernel_impl(
 | 
			
		||||
    Tensor& result,
 | 
			
		||||
    const Tensor& self,
 | 
			
		||||
    Scalar p,
 | 
			
		||||
    c10::optional<int64_t> dim) {
 | 
			
		||||
  AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
 | 
			
		||||
    NormReduction<scalar_t>::apply(result, self, p, dim);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}  // anonymous namespace
 | 
			
		||||
 | 
			
		||||
REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
 | 
			
		||||
REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
 | 
			
		||||
REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
 | 
			
		||||
 | 
			
		||||
}}  // namespace at::native
 | 
			
		||||
 | 
			
		||||
@ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim(
 | 
			
		||||
    int64_t outer_size,
 | 
			
		||||
    int64_t dim_size) {
 | 
			
		||||
  using Vec = vec256::Vec256<scalar_t>;
 | 
			
		||||
  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
 | 
			
		||||
  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
 | 
			
		||||
  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
 | 
			
		||||
  if (grain_size < CHUNK_SIZE)
 | 
			
		||||
    grain_size = CHUNK_SIZE;
 | 
			
		||||
 | 
			
		||||
@ -37,9 +37,9 @@ template <>
 | 
			
		||||
int64_t _sigmoid(float* x, float* y, int64_t size) {
 | 
			
		||||
  using Vec = Vec256<float>;
 | 
			
		||||
  int64_t i = 0;
 | 
			
		||||
  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
 | 
			
		||||
  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
 | 
			
		||||
    Vec ret = Vec::loadu(y + i);
 | 
			
		||||
    Vec ret2 = Vec::loadu(y + i + Vec::size);
 | 
			
		||||
    Vec ret2 = Vec::loadu(y + i + Vec::size());
 | 
			
		||||
    ret = ret.neg();
 | 
			
		||||
    ret2 = ret2.neg();
 | 
			
		||||
#if defined(__AVX2__) && !defined(_MSC_VER)
 | 
			
		||||
@ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) {
 | 
			
		||||
    ret = ret.reciprocal();
 | 
			
		||||
    ret2 = ret2.reciprocal();
 | 
			
		||||
    ret.store(x + i);
 | 
			
		||||
    ret2.store(x + i + Vec::size);
 | 
			
		||||
    ret2.store(x + i + Vec::size());
 | 
			
		||||
  }
 | 
			
		||||
  return i;
 | 
			
		||||
}
 | 
			
		||||
@ -63,9 +63,9 @@ template <>
 | 
			
		||||
int64_t _sigmoid(double* x, double* y, int64_t size) {
 | 
			
		||||
  using Vec = Vec256<double>;
 | 
			
		||||
  int64_t i = 0;
 | 
			
		||||
  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
 | 
			
		||||
  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
 | 
			
		||||
    Vec ret = Vec::loadu(y + i);
 | 
			
		||||
    Vec ret2 = Vec::loadu(y + i + Vec::size);
 | 
			
		||||
    Vec ret2 = Vec::loadu(y + i + Vec::size());
 | 
			
		||||
    ret = ret.neg();
 | 
			
		||||
    ret2 = ret2.neg();
 | 
			
		||||
    ret = ret.exp();
 | 
			
		||||
@ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) {
 | 
			
		||||
    ret = ret.reciprocal();
 | 
			
		||||
    ret2 = ret2.reciprocal();
 | 
			
		||||
    ret.store(x + i);
 | 
			
		||||
    ret2.store(x + i + Vec::size);
 | 
			
		||||
    ret2.store(x + i + Vec::size());
 | 
			
		||||
  }
 | 
			
		||||
  return i;
 | 
			
		||||
}
 | 
			
		||||
@ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) {
 | 
			
		||||
          if (stridex == 1 && stridey == 1) {
 | 
			
		||||
            i = _sigmoid(x, y, size);
 | 
			
		||||
          }
 | 
			
		||||
          for (; i < size; i += Vec::size) {
 | 
			
		||||
            scalar_t buffer[Vec::size];
 | 
			
		||||
            int64_t width = Vec::size;
 | 
			
		||||
          for (; i < size; i += Vec::size()) {
 | 
			
		||||
            scalar_t buffer[Vec::size()];
 | 
			
		||||
            int64_t width = Vec::size();
 | 
			
		||||
            width = std::min(width, size - i);
 | 
			
		||||
            for (int64_t j = 0; j < width; j++) {
 | 
			
		||||
              buffer[j] = y[stridey * (i + j)];
 | 
			
		||||
 | 
			
		||||
@ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
 | 
			
		||||
      input_stride1 = strides[1];
 | 
			
		||||
    }
 | 
			
		||||
    AT_CHECK(channel_size == weight_num,
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
 | 
			
		||||
      weight_num, channel_size);
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
 | 
			
		||||
      " and channel size = ", channel_size, ".");
 | 
			
		||||
 | 
			
		||||
    // config to run cuda kernel
 | 
			
		||||
    int64_t input_numel = input.numel();
 | 
			
		||||
@ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
 | 
			
		||||
      input_stride1 = strides[1];
 | 
			
		||||
    }
 | 
			
		||||
    AT_CHECK(channel_size == weight_num,
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
 | 
			
		||||
      weight_num, channel_size);
 | 
			
		||||
      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
 | 
			
		||||
      " and channel size = ", channel_size, ".");
 | 
			
		||||
 | 
			
		||||
    // config to run cuda kernel
 | 
			
		||||
    int64_t input_numel = input.numel();
 | 
			
		||||
 | 
			
		||||
@ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) {
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t, bool upper>
 | 
			
		||||
__global__
 | 
			
		||||
void triu_tril_kernel(
 | 
			
		||||
    scalar_t* result, scalar_t* self, int64_t k, int64_t N,
 | 
			
		||||
    int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride,
 | 
			
		||||
    int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) {
 | 
			
		||||
  int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  if (linear_idx >= N) {
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int64_t self_batch_idx = blockIdx.y;
 | 
			
		||||
  int64_t row = linear_idx / self_ncol;
 | 
			
		||||
  int64_t col = linear_idx % self_ncol;
 | 
			
		||||
 | 
			
		||||
  bool mask = upper ? (col - row >= k) : (col - row <= k);
 | 
			
		||||
 | 
			
		||||
  // Now compute the offset for the self and result tensor
 | 
			
		||||
  int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride;
 | 
			
		||||
  int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride;
 | 
			
		||||
  result[res_offset] = mask ? self[self_offset] : scalar_t(0);
 | 
			
		||||
}
 | 
			
		||||
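As a quick illustration of the masking predicate in the kernel above, this host-side sketch (assumed 3x3 shape and k = 0; not part of the diff) prints the keep-mask for the triu case; flipping `upper` gives the tril mask.

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t k = 0, rows = 3, cols = 3;
  const bool upper = true;  // set to false for the tril mask
  for (int64_t row = 0; row < rows; ++row) {
    for (int64_t col = 0; col < cols; ++col) {
      // Same predicate as triu_tril_kernel: keep elements on or above (triu)
      // / on or below (tril) the k-th diagonal; everything else becomes 0.
      bool keep = upper ? (col - row >= k) : (col - row <= k);
      std::printf("%d ", keep ? 1 : 0);
    }
    std::printf("\n");
  }
  return 0;
}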
 | 
			
		||||
template <bool upper>
 | 
			
		||||
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
 | 
			
		||||
  int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2),
 | 
			
		||||
          res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1,
 | 
			
		||||
          res_row_stride = result.stride(-2), res_col_stride = result.stride(-1),
 | 
			
		||||
          self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1,
 | 
			
		||||
          self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
 | 
			
		||||
  dim3 dim_block = cuda::getApplyBlock();
 | 
			
		||||
  dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
 | 
			
		||||
  AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{
 | 
			
		||||
    triu_tril_kernel<scalar_t, upper>
 | 
			
		||||
      <<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
 | 
			
		||||
        result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
 | 
			
		||||
        res_batch_stride, res_row_stride, res_col_stride,
 | 
			
		||||
        self_batch_stride, self_row_stride, self_col_stride, self.size(-1));
 | 
			
		||||
  });
 | 
			
		||||
  AT_CUDA_CHECK(cudaGetLastError());
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& tril_cuda_(Tensor &self, int64_t k) {
 | 
			
		||||
  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
 | 
			
		||||
  return tril_cuda_out(self, self, k);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
 | 
			
		||||
  if (result.sizes() != self.sizes()) {
 | 
			
		||||
    result.resize_as_(self);
 | 
			
		||||
  }
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
 | 
			
		||||
  return triu_tril_cuda_template<false>(result, self_c, k, "tril");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& triu_cuda_(Tensor &self, int64_t k) {
 | 
			
		||||
  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
 | 
			
		||||
  return triu_cuda_out(self, self, k);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
 | 
			
		||||
  if (result.sizes() != self.sizes()) {
 | 
			
		||||
    result.resize_as_(self);
 | 
			
		||||
  }
 | 
			
		||||
  if (self.numel() == 0) {
 | 
			
		||||
    return result;
 | 
			
		||||
  }
 | 
			
		||||
  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
 | 
			
		||||
  return triu_tril_cuda_template<true>(result, self_c, k, "triu");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}}  // namespace at::native
 | 
			
		||||
 | 
			
		||||
#undef ALLOCATE_ARRAY
 | 
			
		||||
 | 
			
		||||
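The new triu_tril_kernel above decides, per element, whether (row, col) lies on the requested side of the k-th diagonal and zeroes everything else. A small host-side reference of the same masking rule, assuming a row-major contiguous matrix (triu_tril_reference is an illustrative name, not part of the diff):

#include <cstdint>
#include <vector>

// Keep an element when it lies on the requested side of the k-th diagonal,
// zero it otherwise; upper=true corresponds to triu, upper=false to tril.
template <bool upper>
void triu_tril_reference(const std::vector<float>& self, std::vector<float>& result,
                         int64_t rows, int64_t cols, int64_t k) {
  for (int64_t linear_idx = 0; linear_idx < rows * cols; ++linear_idx) {
    int64_t row = linear_idx / cols;
    int64_t col = linear_idx % cols;
    bool mask = upper ? (col - row >= k) : (col - row <= k);
    result[linear_idx] = mask ? self[linear_idx] : 0.0f;
  }
}
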
@ -1,4 +1,5 @@
 | 
			
		||||
#include "ATen/ATen.h"
 | 
			
		||||
#include <ATen/ATen.h>
 | 
			
		||||
#include <ATen/cuda/Exceptions.h>
 | 
			
		||||
#include <THC/THCTensorMathReduce.cuh>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
 | 
			
		||||
@ -78,13 +79,13 @@ struct dists {
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t, typename F>
 | 
			
		||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) {
 | 
			
		||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p,
 | 
			
		||||
                                              const double n2, const double n2_squared_minus_1) {
 | 
			
		||||
  const int k = blockIdx.x;
 | 
			
		||||
  const int stride = blockDim.x;
 | 
			
		||||
 | 
			
		||||
  float n2 = n - .5;
 | 
			
		||||
  // The -1 accounts for floating point truncation issues
 | 
			
		||||
  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
 | 
			
		||||
  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
 | 
			
		||||
  int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
 | 
			
		||||
 | 
			
		||||
  const scalar_t * const start = self + i * m;
 | 
			
		||||
@ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t, typename F>
 | 
			
		||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) {
 | 
			
		||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p,
 | 
			
		||||
                                                       const double n2, const double n2_squared_minus_1) {
 | 
			
		||||
  const int k = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
  const int init = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  const int stride = blockDim.x * gridDim.x;
 | 
			
		||||
@ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  float n2 = n - .5;
 | 
			
		||||
  // The -1 accounts for floating point truncation issues
 | 
			
		||||
  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
 | 
			
		||||
  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
 | 
			
		||||
  int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
 | 
			
		||||
  int64_t ib = j - i - 1;
 | 
			
		||||
  int64_t jb = n - 2 - i;
 | 
			
		||||
@ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) {
 | 
			
		||||
  const dim3 block(forward_threads);
 | 
			
		||||
  int64_t n = self.size(0);
 | 
			
		||||
  int64_t m = self.size(1);
 | 
			
		||||
  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
 | 
			
		||||
  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
 | 
			
		||||
  const double n2 = n - .5;
 | 
			
		||||
  const double n2_squared_minus_1 = n2 * n2 - 1;
 | 
			
		||||
 | 
			
		||||
  AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
 | 
			
		||||
    if (p == 0.0) {
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (p == 1.0) {
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (p == 2.0) {
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (std::isinf(p)) {
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else {
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
 | 
			
		||||
      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
  AT_CUDA_CHECK(cudaGetLastError());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
 | 
			
		||||
@ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
 | 
			
		||||
  const int64_t n = result.size(0);
 | 
			
		||||
  int64_t m = self.size(1);
 | 
			
		||||
  const int block_x = 64;
 | 
			
		||||
  const int block_y = 4;
 | 
			
		||||
  // NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16.
 | 
			
		||||
  // From binary search, block_y of 16 gives us max pdist dim0 of 1449,
 | 
			
		||||
  //                     block_y of  4 gives us max pdist dim0 of  725.
 | 
			
		||||
  const int block_y = 16;
 | 
			
		||||
  const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
 | 
			
		||||
  const int grid_y = (dist.numel() + block_y - 1) / block_y;
 | 
			
		||||
  const dim3 grid(grid_x, grid_y);
 | 
			
		||||
  const dim3 block(block_x, block_y);
 | 
			
		||||
  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
 | 
			
		||||
  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
 | 
			
		||||
  const double n2 = n - .5;
 | 
			
		||||
  const double n2_squared_minus_1 = n2 * n2 - 1;
 | 
			
		||||
 | 
			
		||||
  Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
 | 
			
		||||
  AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
 | 
			
		||||
    if (p == 1.0) {
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (p < 2.0) {
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (p == 2.0) {
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else if (std::isinf(p)) {
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
 | 
			
		||||
    } else {
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
 | 
			
		||||
      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
  AT_CUDA_CHECK(cudaGetLastError());
 | 
			
		||||
 | 
			
		||||
  at::sum_out(result, buffer, 0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
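The pdist changes above thread two precomputed doubles (n2 and n2_squared_minus_1) into the kernels so the condensed-index inversion happens in fp64, per the issue cited in the comments (#15511). A stand-alone sketch of that index math, using the same formulas as the new kernels (condensed_to_pair is an illustrative helper, not part of the diff):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Map the k-th entry of the flattened upper triangle of an n x n distance
// matrix back to the pair (i, j) with i < j, using the fp64 constants that
// the patched kernels now receive as arguments.
void condensed_to_pair(int64_t k, int64_t n, int64_t* i_out, int64_t* j_out) {
  const double n2 = n - .5;
  const double n2_squared_minus_1 = n2 * n2 - 1;
  int64_t i = static_cast<int64_t>(n2 - std::sqrt(n2_squared_minus_1 - 2 * k));
  int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
  *i_out = i;
  *j_out = j;
}

int main() {
  // For n = 4 the pairs come out as (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
  for (int64_t k = 0; k < 6; ++k) {
    int64_t i, j;
    condensed_to_pair(k, 4, &i, &j);
    std::printf("k=%lld -> (%lld, %lld)\n", (long long)k, (long long)i, (long long)j);
  }
  return 0;
}
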
@@ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind

    default:
      AT_ERROR(
          "Unknown mode for embedding_bag_backward_cuda %d", mode);
          "Unknown mode for embedding_bag_backward_cuda ", mode);
  }
}

@@ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
          + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime];

        log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb;
      } else if ((s < 2*max_target_length+1) || (t >= input_length)) {
      } else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) {
          log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf;
      }
    }
@@ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
      if (targets.type().scalarType() == kLong) {
	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
      } else {
	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
	return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
      }
    });
}

@@ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_
                                                            const Tensor& running_mean_, const Tensor& running_var_,
                                                            bool train, double momentum, double epsilon) {

  TensorArg input_arg{ input_, "input", 1 },
            weight_arg{ weight_, "weight", 2 },
            bias_arg{ bias_, "bias", 3 },
            run_mean_arg{ running_mean_, "running_mean", 4 },
            run_var_arg{ running_var_, "running_var", 5 };
  CheckedFrom c = "batch_norm_cuda";
  checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg});

  using accscalar_t = at::acc_type<scalar_t, true>;
  int64_t n_input = input_.size(1);
  Tensor save_mean_;

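The batch_norm_cuda hunk adds named TensorArg wrappers and a checkAllSameGPU call before any kernel work, so a mixed-device call fails with the offending argument named. As a rough illustration of what such a guard does, here is a generic same-device check in plain C++; Arg and check_all_same_gpu are hypothetical stand-ins for ATen's TensorArg/checkAllSameGPU, not the real API.

#include <stdexcept>
#include <string>
#include <vector>

struct Arg {
  std::string name;
  int device;  // -1 for "undefined / not provided"
};

// Verify that every defined argument lives on the same GPU and report the
// first mismatch by argument name, in the spirit of the added check above.
void check_all_same_gpu(const std::string& caller, const std::vector<Arg>& args) {
  int expected = -1;
  for (const Arg& a : args) {
    if (a.device < 0) continue;                       // optional tensors are skipped
    if (expected < 0) { expected = a.device; continue; }
    if (a.device != expected) {
      throw std::runtime_error(caller + ": expected argument '" + a.name +
                               "' to be on GPU " + std::to_string(expected) +
                               " but it is on GPU " + std::to_string(a.device));
    }
  }
}
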
@ -7,28 +7,13 @@
 | 
			
		||||
#include <tuple>
 | 
			
		||||
#include <thrust/unique.h>
 | 
			
		||||
#include <thrust/sort.h>
 | 
			
		||||
#include <thrust/scan.h>
 | 
			
		||||
#include <thrust/scatter.h>
 | 
			
		||||
 | 
			
		||||
namespace at {
 | 
			
		||||
namespace native{
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
template <typename scalar_t>
 | 
			
		||||
__global__ void inverse_indices_kernel(
 | 
			
		||||
    const scalar_t* input_data,
 | 
			
		||||
    const scalar_t* output_data,
 | 
			
		||||
    int64_t* inverse_indices_data,
 | 
			
		||||
    int64_t num_inp,
 | 
			
		||||
    int64_t num_out) {
 | 
			
		||||
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    int64_t stride = blockDim.x * gridDim.x;
 | 
			
		||||
 | 
			
		||||
    for (int64_t i = idx; i < num_inp * num_out; i += stride) {
 | 
			
		||||
      if (input_data[i / num_out] == output_data[i % num_out]){
 | 
			
		||||
        inverse_indices_data[i / num_out] = i % num_out;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t>
 | 
			
		||||
  std::tuple<Tensor, Tensor> _unique_cuda_template(
 | 
			
		||||
@ -47,25 +32,29 @@ template <typename scalar_t>
 | 
			
		||||
    Tensor output = input.clone();
 | 
			
		||||
    output = output.view(-1);
 | 
			
		||||
    scalar_t* output_data = output.data<scalar_t>();
 | 
			
		||||
    thrust::sort(policy, output_data, output_data + num_inp);
 | 
			
		||||
    scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp);
 | 
			
		||||
    int64_t num_out = output_end - output_data;
 | 
			
		||||
    output.resize_(num_out);
 | 
			
		||||
 | 
			
		||||
    Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
 | 
			
		||||
 | 
			
		||||
    if (return_inverse) {
 | 
			
		||||
      inverse_indices.resize_(input.sizes());
 | 
			
		||||
      int64_t* inverse_indices_data = inverse_indices.data<int64_t>();
 | 
			
		||||
      int block = 512;
 | 
			
		||||
      int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L);
 | 
			
		||||
      inverse_indices_kernel<<<grid, block, 0, stream>>>(
 | 
			
		||||
        input_data, output_data, inverse_indices_data, num_inp, num_out);
 | 
			
		||||
    Tensor inverse_indices;
 | 
			
		||||
    if (!return_inverse) {
 | 
			
		||||
        inverse_indices = at::empty({0},  self.type().toScalarType(kLong));
 | 
			
		||||
        thrust::sort(policy, output_data, output_data + num_inp);
 | 
			
		||||
    } else {
 | 
			
		||||
        Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong));
 | 
			
		||||
        int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>();
 | 
			
		||||
        thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr);
 | 
			
		||||
        Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong));
 | 
			
		||||
        inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong));
 | 
			
		||||
        int64_t* inv_loc_ptr = inv_loc.data<int64_t>();
 | 
			
		||||
        int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>();
 | 
			
		||||
        thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }});
 | 
			
		||||
        inv_loc[0] = 0;
 | 
			
		||||
        thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
 | 
			
		||||
        thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
 | 
			
		||||
        inverse_indices.resize_(input.sizes());
 | 
			
		||||
    }
 | 
			
		||||
    int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data;
 | 
			
		||||
    output.resize_(num_out);
 | 
			
		||||
 | 
			
		||||
    THCudaCheck(cudaGetLastError());
 | 
			
		||||
    return std::tuple<Tensor, Tensor>(output, inverse_indices);
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
template <typename scalar_t>
 | 
			
		||||
 | 
			
		||||
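The rewritten _unique_cuda_template above builds inverse indices with thrust: sort_by_key carries the original positions along with the values, adjacent_difference marks where a new unique value starts, inclusive_scan turns those marks into bucket ids, and scatter writes each id back to its original slot. Below is a CPU sketch of the same pipeline with standard algorithms; unique_with_inverse is an illustrative name and only mirrors the thrust version in spirit.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

void unique_with_inverse(const std::vector<int64_t>& input,
                         std::vector<int64_t>& output,
                         std::vector<int64_t>& inverse_indices) {
  const int64_t n = static_cast<int64_t>(input.size());

  // Sort positions by their values (the sort_by_key step).
  std::vector<int64_t> sorted_indices(n);
  std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
                   [&](int64_t a, int64_t b) { return input[a] < input[b]; });

  std::vector<int64_t> sorted(n);
  for (int64_t i = 0; i < n; ++i) sorted[i] = input[sorted_indices[i]];

  // inv_loc[i] = 1 where sorted[i] differs from its predecessor, then a
  // prefix sum turns the marks into unique-bucket ids (adjacent_difference
  // plus inclusive_scan in the thrust version).
  std::vector<int64_t> inv_loc(n, 0);
  for (int64_t i = 1; i < n; ++i) inv_loc[i] = (sorted[i] != sorted[i - 1]) ? 1 : 0;
  std::partial_sum(inv_loc.begin(), inv_loc.end(), inv_loc.begin());

  // Scatter each bucket id back to the element's original position.
  inverse_indices.assign(n, 0);
  for (int64_t i = 0; i < n; ++i) inverse_indices[sorted_indices[i]] = inv_loc[i];

  // Deduplicate the sorted values to get the unique output.
  output = sorted;
  output.erase(std::unique(output.begin(), output.end()), output.end());
}
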
@ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> {
 | 
			
		||||
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
 | 
			
		||||
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
 | 
			
		||||
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
 | 
			
		||||
#if CUDNN_VERSION >= 6000
 | 
			
		||||
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
 | 
			
		||||
#endif
 | 
			
		||||
    };
 | 
			
		||||
    // NOTE: - 1 because ALGO_WINOGRAD is not implemented
 | 
			
		||||
    static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
 | 
			
		||||
@ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
 | 
			
		||||
  THCCachingAllocator_emptyCache();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//hot fix for #16610
 | 
			
		||||
//specializing algorithm_search would be cleaner, as it is specialized already, but that would require also specializing getBestAlgorithm for bwdData, 
 | 
			
		||||
//adding "strided" argument, so in the end this looks simpler.
 | 
			
		||||
template<>
 | 
			
		||||
void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) {
 | 
			
		||||
  using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>;
 | 
			
		||||
  auto& cache = search::cache();
 | 
			
		||||
 | 
			
		||||
  if (cache.find(args.params, algo)) {
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (args.params.deterministic && !benchmark) {
 | 
			
		||||
    *algo = search::DEFAULT_ALGO;
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  int stride_dim = args.input.dim() - 2;
 | 
			
		||||
  bool strided = false;
 | 
			
		||||
  for (int i = 0; i< stride_dim; i++) {
 | 
			
		||||
      if (args.params.stride[i] != 1) {
 | 
			
		||||
         strided = true;
 | 
			
		||||
         break;
 | 
			
		||||
      }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (!benchmark) {
 | 
			
		||||
    search::getAlgorithm(args, algo);
 | 
			
		||||
    if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
 | 
			
		||||
       *algo = search::DEFAULT_ALGO;
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (cache.find(args.params, algo)) {
 | 
			
		||||
    // re-check cache since another thread may have benchmarked the algorithm
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto perfResults = search::findAlgorithm(args);
 | 
			
		||||
  // for deterministic algo, look at all the perf results and return the best
 | 
			
		||||
  // deterministic algo
 | 
			
		||||
  if (perfResults.status == CUDNN_STATUS_SUCCESS &&
 | 
			
		||||
      !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
 | 
			
		||||
      *algo = perfResults.algo;
 | 
			
		||||
  } else {
 | 
			
		||||
      *algo = search::DEFAULT_ALGO;
 | 
			
		||||
  }
 | 
			
		||||
  if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
 | 
			
		||||
     *algo = search::DEFAULT_ALGO;
 | 
			
		||||
  }
 | 
			
		||||
  cache.insert(args.params, *algo);
 | 
			
		||||
 | 
			
		||||
  // Free the cached blocks in our caching allocator. They are
 | 
			
		||||
  // needed here because the above benchmarking uses a huge amount of memory,
 | 
			
		||||
  // e.g. a few GBs.
 | 
			
		||||
  THCCachingAllocator_emptyCache();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<typename algo_t>
 | 
			
		||||
Workspace chooseAlgorithm(
 | 
			
		||||
    const ConvolutionArgs& args,
 | 
			
		||||
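The hot fix above special-cases findAlgorithm for cudnnConvolutionBwdDataAlgo_t: when any spatial stride differs from 1, FFT-based backward-data algorithms are rejected in favour of the default. A compressed sketch of that fallback rule, with a hypothetical enum standing in for the cuDNN algorithm type:

#include <cstdint>
#include <vector>

enum class BwdDataAlgo { Algo0, Algo1, FFT, FFT_TILING, Winograd };
constexpr BwdDataAlgo kDefaultAlgo = BwdDataAlgo::Algo1;

// Reject FFT-style algorithms whenever the convolution is strided, mirroring
// the check the specialization above applies after getAlgorithm/findAlgorithm.
BwdDataAlgo sanitize_bwd_data_algo(BwdDataAlgo picked, const std::vector<int64_t>& stride) {
  bool strided = false;
  for (int64_t s : stride) {
    if (s != 1) { strided = true; break; }
  }
  if (strided && (picked == BwdDataAlgo::FFT || picked == BwdDataAlgo::FFT_TILING)) {
    return kDefaultAlgo;
  }
  return picked;
}
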
@ -848,19 +907,9 @@ Tensor cudnn_convolution_forward(
 | 
			
		||||
  // See #4500
 | 
			
		||||
  Tensor weight_contig = weight->contiguous();
 | 
			
		||||
 | 
			
		||||
#if CUDNN_VERSION < 7000
 | 
			
		||||
  for (int i = 0; i < groups; i++) {
 | 
			
		||||
    raw_cudnn_convolution_forward_out(
 | 
			
		||||
        narrowGroup(*output, output_channels_dim,        i, groups),
 | 
			
		||||
        narrowGroup(*input,  input_channels_dim,         i, groups),
 | 
			
		||||
        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
 | 
			
		||||
        padding, stride, dilation, 1, benchmark, deterministic);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  raw_cudnn_convolution_forward_out(
 | 
			
		||||
      *output, *input, weight_contig,
 | 
			
		||||
      padding, stride, dilation, groups, benchmark, deterministic);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  return *output;
 | 
			
		||||
}
 | 
			
		||||
@ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input(
 | 
			
		||||
  // See #4500
 | 
			
		||||
  Tensor weight_contig = weight->contiguous();
 | 
			
		||||
 | 
			
		||||
#if CUDNN_VERSION < 7000
 | 
			
		||||
  for (int i = 0; i < groups; i++) {
 | 
			
		||||
    raw_cudnn_convolution_backward_input_out(
 | 
			
		||||
        narrowGroup(*grad_input, input_channels_dim, i, groups),
 | 
			
		||||
        narrowGroup(*grad_output, output_channels_dim, i, groups),
 | 
			
		||||
        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
 | 
			
		||||
        padding, stride, dilation, 1, benchmark, deterministic);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  raw_cudnn_convolution_backward_input_out(
 | 
			
		||||
      *grad_input, *grad_output, weight_contig,
 | 
			
		||||
      padding, stride, dilation, groups, benchmark, deterministic);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  return *grad_input;
 | 
			
		||||
}
 | 
			
		||||
@ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight(
 | 
			
		||||
  TensorArg grad_weight{ grad_weight_t, "result", 0 };
 | 
			
		||||
  convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
 | 
			
		||||
 | 
			
		||||
#if CUDNN_VERSION < 7000
 | 
			
		||||
  for (int i = 0; i < groups; i++) {
 | 
			
		||||
    raw_cudnn_convolution_backward_weight_out(
 | 
			
		||||
        narrowGroup(*grad_weight, weight_output_channels_dim, i, groups),
 | 
			
		||||
        narrowGroup(*grad_output, output_channels_dim, i, groups),
 | 
			
		||||
        narrowGroup(*input, input_channels_dim, i, groups),
 | 
			
		||||
        padding, stride, dilation, groups, benchmark, deterministic);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  raw_cudnn_convolution_backward_weight_out(
 | 
			
		||||
      *grad_weight, *grad_output, *input,
 | 
			
		||||
      padding, stride, dilation, groups, benchmark, deterministic);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  return grad_weight_t;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -7,7 +7,7 @@
#endif


#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000)
#if !AT_CUDNN_ENABLED()

namespace at { namespace native {


@@ -375,7 +375,7 @@ namespace {
      case CUDNN_RNN_TANH:
        return 2;
      default:
        AT_ERROR("unknown cuDNN RNN mode %d", mode);
        AT_ERROR("unknown cuDNN RNN mode ", mode);
    }
  }


@@ -2555,9 +2555,15 @@

- func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method
  dispatch:
    CPU: tril_cpu_
    CUDA: tril_cuda_

- func: triu_(Tensor self,  int64_t diagonal=0) -> Tensor
  variants: method
  dispatch:
    CPU: triu_cpu_
    CUDA: triu_cuda_

- func: digamma_(Tensor self) -> Tensor
  variants: method
@@ -2658,11 +2664,17 @@
  variants: method, function

- func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
  dispatch:
    CPU: triu_cpu_out
    CUDA: triu_cuda_out

- func: triu(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method, function

- func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
  dispatch:
    CPU: tril_cpu_out
    CUDA: tril_cuda_out

- func: tril(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method, function

@@ -11,18 +11,4 @@ using namespace at::native;
TEST(CUDNNTest, CUDNNTestCUDA) {
  if (!at::cuda::is_available()) return;
  manual_seed(123);

#if CUDNN_VERSION < 7000
  auto handle = getCudnnHandle();
  DropoutDescriptor desc1, desc2;
  desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte));
  desc2.set(handle, 0.5, desc1.state);
  bool isEQ;
  isEQ = (desc1.desc()->dropout == desc2.desc()->dropout);
  ASSERT_TRUE(isEQ);
  isEQ = (desc1.desc()->nstates == desc2.desc()->nstates);
  ASSERT_TRUE(isEQ);
  isEQ = (desc1.desc()->states == desc2.desc()->states);
  ASSERT_TRUE(isEQ);
#endif
}

@@ -3,6 +3,8 @@ find_package(ATen REQUIRED)
include_directories(${ATEN_INCLUDE_DIR})

# C++11
set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
if (not MSVC)
    set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
endif()
add_executable(main main.cpp)
target_link_libraries(main ${ATEN_LIBRARIES})

@ -247,10 +247,13 @@
 | 
			
		||||
 | 
			
		||||
#ifdef _OPENMP
 | 
			
		||||
 | 
			
		||||
#ifndef _WIN32
 | 
			
		||||
#define PRAGMA(P) _Pragma(#P)
 | 
			
		||||
#ifdef _WIN32  
 | 
			
		||||
// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.  
 | 
			
		||||
#define PRAGMA_LOOP(P)    // Noop  
 | 
			
		||||
#define PRAGMA(P)         __pragma(P)
 | 
			
		||||
#else
 | 
			
		||||
#define PRAGMA(P) __pragma(P)
 | 
			
		||||
#define PRAGMA_LOOP(P)    _Pragma(#P)  
 | 
			
		||||
#define PRAGMA(P)         _Pragma(#P)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
@ -369,7 +372,7 @@
 | 
			
		||||
    TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset();                        \
 | 
			
		||||
    ptrdiff_t iter = 0;                                                                        \
 | 
			
		||||
    if(tp != (TYPE2*)rp) {                                                                             \
 | 
			
		||||
      PRAGMA(ivdep) \
 | 
			
		||||
      PRAGMA_LOOP(ivdep) \
 | 
			
		||||
      PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \
 | 
			
		||||
      for (iter = 0; iter < SIZE; iter++) {                             \
 | 
			
		||||
        TYPE2 *TENSOR2##_data = tp+iter;                                \
 | 
			
		||||
@ -377,7 +380,7 @@
 | 
			
		||||
        CODE                                                            \
 | 
			
		||||
      }\
 | 
			
		||||
    } else {\
 | 
			
		||||
      PRAGMA(simd) \
 | 
			
		||||
      PRAGMA_LOOP(simd) \
 | 
			
		||||
      PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) )  \
 | 
			
		||||
      for (iter = 0; iter < SIZE; iter++) {\
 | 
			
		||||
        TYPE2* TENSOR2##_data = tp+iter;\
 | 
			
		||||
@ -449,7 +452,7 @@
 | 
			
		||||
    TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset();                               \
 | 
			
		||||
    ptrdiff_t iter = 0;\
 | 
			
		||||
    if(tp != (TYPE2*)rp) {                                                                             \
 | 
			
		||||
      PRAGMA(ivdep) \
 | 
			
		||||
      PRAGMA_LOOP(ivdep) \
 | 
			
		||||
      PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
 | 
			
		||||
      for (iter = 0; iter < SIZE; iter++) {\
 | 
			
		||||
        TYPE1 *TENSOR1##_data = rp+iter;\
 | 
			
		||||
@ -458,7 +461,7 @@
 | 
			
		||||
        CODE                                \
 | 
			
		||||
      } \
 | 
			
		||||
    } else {\
 | 
			
		||||
      PRAGMA(simd) \
 | 
			
		||||
      PRAGMA_LOOP(simd) \
 | 
			
		||||
      PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
 | 
			
		||||
      for (iter = 0; iter < SIZE; iter++) {\
 | 
			
		||||
        TYPE1 *TENSOR1##_data = rp+iter;\
 | 
			
		||||
 | 
			
		||||
@ -13,10 +13,13 @@
 | 
			
		||||
 | 
			
		||||
#ifdef _OPENMP
 | 
			
		||||
 | 
			
		||||
#ifndef _WIN32
 | 
			
		||||
#define PRAGMA(P) _Pragma(#P)
 | 
			
		||||
#ifdef _WIN32  
 | 
			
		||||
// MSVC doesing support loop pragmas, but does support others. Create a new macro to account for those differences.  
 | 
			
		||||
#define PRAGMA_LOOP(P)    // Noop  
 | 
			
		||||
#define PRAGMA(P)         __pragma(P)
 | 
			
		||||
#else
 | 
			
		||||
#define PRAGMA(P) __pragma(P)
 | 
			
		||||
#define PRAGMA_LOOP(P)    _Pragma(#P)  
 | 
			
		||||
#define PRAGMA(P)         _Pragma(#P)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
 | 
			
		||||
 | 
			
		||||
@ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 | 
			
		||||
  int free_b = 0;
 | 
			
		||||
  if (a == NULL) a = ra_;
 | 
			
		||||
  if (b == NULL) b = rb_;
 | 
			
		||||
  THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
 | 
			
		||||
      a->dim());
 | 
			
		||||
  THArgCheck(!a->is_empty(), 2, "A should not be empty");
 | 
			
		||||
  THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
 | 
			
		||||
      "dimensions, but has %d", b->dim());
 | 
			
		||||
  THArgCheck(!b->is_empty(), 2, "B should not be empty");
 | 
			
		||||
  THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld",
 | 
			
		||||
      a->size(0), a->size(1));
 | 
			
		||||
  THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld "
 | 
			
		||||
      "rows, B has %ld", a->size(0), b->size(0));
 | 
			
		||||
 | 
			
		||||
  if (b->dim() == 1) {
 | 
			
		||||
    b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
 | 
			
		||||
            b->stride(0), 1, 0);
 | 
			
		||||
    free_b = 1;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int n, nrhs, lda, ldb, info;
 | 
			
		||||
  THIntTensor *ipiv;
 | 
			
		||||
@ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 | 
			
		||||
  THTensor_(freeCopyTo)(ra__, ra_);
 | 
			
		||||
  THTensor_(freeCopyTo)(rb__, rb_);
 | 
			
		||||
  THIntTensor_free(ipiv);
 | 
			
		||||
  if (free_b) c10::raw::intrusive_ptr::decref(b);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
 | 
			
		||||
 | 
			
		||||
@@ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n

TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k);
TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);

@ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
 | 
			
		||||
  REAL_SWAP(ARR(III), ARR(JJJ)); \
 | 
			
		||||
  LONG_SWAP(IDX(III), IDX(JJJ))
 | 
			
		||||
 | 
			
		||||
/* Emulate NumPy behavior of putting NaNs
 | 
			
		||||
 * at the end of an ascending list. */
 | 
			
		||||
#define GT_OR_NAN(x, y) \
 | 
			
		||||
  ((x != x && y == y) || (x > y))
 | 
			
		||||
 | 
			
		||||
static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
 | 
			
		||||
{
 | 
			
		||||
  int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
 | 
			
		||||
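GT_OR_NAN, added above, orders x after y when x is NaN and y is not, which is what pushes NaN to the end of an ascending sort (the NumPy behaviour named in the comment). A tiny self-contained check of that ordering, using std::sort in place of the TH quicksort being patched:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// GT_OR_NAN(x, y) is true when x is NaN and y is not, or when x > y.
static bool gt_or_nan(double x, double y) {
  return (std::isnan(x) && !std::isnan(y)) || (x > y);
}

int main() {
  std::vector<double> v = {3.0, std::nan(""), 1.0, 2.0};
  // "a comes before b" exactly when b is greater-or-NaN, so NaN sinks to the end.
  std::sort(v.begin(), v.end(), [](double a, double b) { return gt_or_nan(b, a); });
  for (double x : v) std::printf("%g ", x);  // prints: 1 2 3 nan
  std::printf("\n");
  return 0;
}
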
@ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
 | 
			
		||||
      /* Use median of three for pivot choice */
 | 
			
		||||
    P=(L+R)>>1;
 | 
			
		||||
    BOTH_SWAP(P, L+1);
 | 
			
		||||
    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
 | 
			
		||||
    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
 | 
			
		||||
    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
 | 
			
		||||
 | 
			
		||||
    i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 | 
			
		||||
 | 
			
		||||
    do {
 | 
			
		||||
      do { i = i+1; } while(ARR(i) < piv);
 | 
			
		||||
      do { j = j-1; } while(ARR(j) > piv);
 | 
			
		||||
      do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
 | 
			
		||||
      do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
 | 
			
		||||
      if (j < i)
 | 
			
		||||
          break;
 | 
			
		||||
      BOTH_SWAP(i, j);
 | 
			
		||||
@ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
 | 
			
		||||
  } /* while not done */
 | 
			
		||||
  /* Now insertion sort on the concatenation of subfiles */
 | 
			
		||||
  for(i=elements-2; i>=0; i--) {
 | 
			
		||||
    if (ARR(i) > ARR(i+1)) {
 | 
			
		||||
    if (GT_OR_NAN(ARR(i),ARR(i+1))) {
 | 
			
		||||
      piv = ARR(i);
 | 
			
		||||
      pid = IDX(i);
 | 
			
		||||
      j = i+1;
 | 
			
		||||
@ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
 | 
			
		||||
        ARR(j-1) = ARR(j);
 | 
			
		||||
        IDX(j-1) = IDX(j);
 | 
			
		||||
        j = j+1;
 | 
			
		||||
      } while(j < elements && ARR(j) < piv);
 | 
			
		||||
      } while(j < elements && GT_OR_NAN(piv, ARR(j)));
 | 
			
		||||
      ARR(j-1) = piv;
 | 
			
		||||
      IDX(j-1) = pid;
 | 
			
		||||
     }
 | 
			
		||||
@ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
 | 
			
		||||
      /* Use median of three for pivot choice */
 | 
			
		||||
    P=(L+R)>>1;
 | 
			
		||||
    BOTH_SWAP(P, L+1);
 | 
			
		||||
    if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
 | 
			
		||||
    if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
 | 
			
		||||
    if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
 | 
			
		||||
    if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
 | 
			
		||||
 | 
			
		||||
    i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 | 
			
		||||
 | 
			
		||||
    do {
 | 
			
		||||
      do { i = i+1; } while(ARR(i) > piv);
 | 
			
		||||
      do { j = j-1; } while(ARR(j) < piv);
 | 
			
		||||
      do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
 | 
			
		||||
      do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
 | 
			
		||||
      if (j < i)
 | 
			
		||||
          break;
 | 
			
		||||
      BOTH_SWAP(i, j);
 | 
			
		||||
@ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
 | 
			
		||||
  } /* while not done */
 | 
			
		||||
  /* Now insertion sort on the concatenation of subfiles */
 | 
			
		||||
  for(i=elements-2; i>=0; i--) {
 | 
			
		||||
    if (ARR(i) < ARR(i+1)) {
 | 
			
		||||
    if (GT_OR_NAN(ARR(i+1), ARR(i))) {
 | 
			
		||||
      piv = ARR(i);
 | 
			
		||||
      pid = IDX(i);
 | 
			
		||||
      j = i+1;
 | 
			
		||||
@ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
 | 
			
		||||
        ARR(j-1) = ARR(j);
 | 
			
		||||
        IDX(j-1) = IDX(j);
 | 
			
		||||
        j = j+1;
 | 
			
		||||
      } while(j < elements && ARR(j) > piv);
 | 
			
		||||
      } while(j < elements && GT_OR_NAN(ARR(j), piv));
 | 
			
		||||
      ARR(j-1) = piv;
 | 
			
		||||
      IDX(j-1) = pid;
 | 
			
		||||
     }
 | 
			
		||||
@ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i
 | 
			
		||||
  THLongTensor_free(tmpIndices);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k)
 | 
			
		||||
{
 | 
			
		||||
  int64_t t_size_0, t_size_1;
 | 
			
		||||
  int64_t t_stride_0, t_stride_1;
 | 
			
		||||
  int64_t r__stride_0, r__stride_1;
 | 
			
		||||
  scalar_t *t_data, *r__data;
 | 
			
		||||
  int64_t r, c;
 | 
			
		||||
 | 
			
		||||
  THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
 | 
			
		||||
 | 
			
		||||
  THTensor_(resizeAs)(r_, t);
 | 
			
		||||
 | 
			
		||||
  t_size_0 = THTensor_(size)(t, 0);
 | 
			
		||||
  t_size_1 = THTensor_(size)(t, 1);
 | 
			
		||||
  t_stride_0 = THTensor_(stride)(t, 0);
 | 
			
		||||
  t_stride_1 = THTensor_(stride)(t, 1);
 | 
			
		||||
  r__stride_0 = THTensor_(stride)(r_, 0);
 | 
			
		||||
  r__stride_1 = THTensor_(stride)(r_, 1);
 | 
			
		||||
  r__data = r_->data<scalar_t>();
 | 
			
		||||
  t_data = t->data<scalar_t>();
 | 
			
		||||
 | 
			
		||||
  for(r = 0; r < t_size_0; r++)
 | 
			
		||||
  {
 | 
			
		||||
    int64_t sz = THMin(r+k+1, t_size_1);
 | 
			
		||||
    for(c = THMax(0, r+k+1); c < t_size_1; c++)
 | 
			
		||||
      r__data[r*r__stride_0+c*r__stride_1] = 0;
 | 
			
		||||
    for(c = 0; c < sz; c++)
 | 
			
		||||
      r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
 | 
			
		||||
{
 | 
			
		||||
  int64_t t_size_0, t_size_1;
 | 
			
		||||
 | 
			
		||||
@ -6,17 +6,17 @@
 | 
			
		||||
#include "THCNumerics.cuh"
 | 
			
		||||
 | 
			
		||||
// Collection of kernel sort routines
 | 
			
		||||
template <typename T>
 | 
			
		||||
template <typename T, bool handleNaN = false>
 | 
			
		||||
struct LTComp {
 | 
			
		||||
  __device__ inline bool operator()(const T& a, const T& b) const {
 | 
			
		||||
    return THCNumerics<T>::lt(a, b);
 | 
			
		||||
    return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
template <typename T, bool handleNaN = false>
 | 
			
		||||
struct GTComp {
 | 
			
		||||
  __device__ inline bool operator()(const T& a, const T& b) const {
 | 
			
		||||
    return THCNumerics<T>::gt(a, b);
 | 
			
		||||
    return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
__device__ int binarySearchForMultinomial(T* dist,
 | 
			
		||||
__device__ int binarySearchForMultinomial(T* cumdist,
 | 
			
		||||
                                          T* dist,
 | 
			
		||||
                                          int size,
 | 
			
		||||
                                          T val) {
 | 
			
		||||
  int start = 0;
 | 
			
		||||
  int end = size;
 | 
			
		||||
  // dist[size - 1] = 0 => all zero prob dist
 | 
			
		||||
  assert(THCNumerics<T>::gt(dist[size - 1], 0));
 | 
			
		||||
  // cumdist[size - 1] = 0 => all zero prob dist
 | 
			
		||||
  assert(THCNumerics<T>::gt(cumdist[size - 1], 0));
 | 
			
		||||
 | 
			
		||||
  while (end - start > 0) {
 | 
			
		||||
    int mid = start + (end - start) / 2;
 | 
			
		||||
 | 
			
		||||
    T midVal = dist[mid];
 | 
			
		||||
    T midVal = cumdist[mid];
 | 
			
		||||
    if (THCNumerics<T>::lt(midVal, val)) {
 | 
			
		||||
      start = mid + 1;
 | 
			
		||||
    } else {
 | 
			
		||||
@ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist,
 | 
			
		||||
    start = size - 1;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  T curVal = dist[start];
 | 
			
		||||
  while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
 | 
			
		||||
  T curVal = cumdist[start];
 | 
			
		||||
  while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--;
 | 
			
		||||
 | 
			
		||||
  return start;
 | 
			
		||||
}
 | 
			
		||||
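The binarySearchForMultinomial change above searches the inclusive prefix sums (now explicitly called cumdist) and then walks the result back while the original probability of the chosen bucket is zero, instead of comparing neighbouring cumulative values. A host-side sketch of why that matters when floating-point rounding lets the draw land past a zero-probability tail; pick_bucket is an illustrative helper, not the device code.

#include <cstdio>
#include <vector>

// Binary-search the cumulative distribution for the first bucket whose mass
// reaches val, clamp to the last bucket, then skip buckets whose original
// probability is exactly zero.
int pick_bucket(const std::vector<double>& cumdist, const std::vector<double>& dist, double val) {
  int start = 0;
  int end = static_cast<int>(cumdist.size());
  while (end - start > 0) {
    int mid = start + (end - start) / 2;
    if (cumdist[mid] < val) start = mid + 1; else end = mid;
  }
  if (start == static_cast<int>(cumdist.size())) start = static_cast<int>(cumdist.size()) - 1;
  while (start >= 1 && dist[start] == 0) start--;
  return start;
}

int main() {
  std::vector<double> dist    = {0.5, 0.5, 0.0};
  std::vector<double> cumdist = {0.5, 0.99999999, 0.99999999};  // rounded prefix sums
  // A draw of 1.0 falls past every cumulative value, is clamped to the last
  // bucket, and the walk-back skips the zero-probability tail: prints 1.
  std::printf("%d\n", pick_bucket(cumdist, dist, 1.0));
  return 0;
}
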
@ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
 | 
			
		||||
                                 int64_t* dest,
 | 
			
		||||
                                 int64_t distributions,
 | 
			
		||||
                                 int categories,
 | 
			
		||||
                                 T* normDistPrefixSum) {
 | 
			
		||||
                                 T* normDistPrefixSum,
 | 
			
		||||
                                 T* normDist) {
 | 
			
		||||
  // At the moment, each warp computes one sample value in the binary
 | 
			
		||||
  // search due to divergence. It seems possible to compute multiple
 | 
			
		||||
  // values and limit divergence though later on. However, no matter
 | 
			
		||||
@ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
 | 
			
		||||
        // Find the bucket that a uniform sample lies in
 | 
			
		||||
        int choice = binarySearchForMultinomial<T>(
 | 
			
		||||
          normDistPrefixSum + curDist * categories,
 | 
			
		||||
          normDist + curDist * categories,
 | 
			
		||||
          categories,
 | 
			
		||||
          r);
 | 
			
		||||
 | 
			
		||||
@ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
 | 
			
		||||
      // Find the bucket that a uniform sample lies in
 | 
			
		||||
      int choice = binarySearchForMultinomial<T>(
 | 
			
		||||
        normDistPrefixSum + curDist * categories,
 | 
			
		||||
        origDist + curDist * categories,
 | 
			
		||||
        categories,
 | 
			
		||||
        r);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -15,17 +15,17 @@
 | 
			
		||||
#include <thrust/system/cuda/execution_policy.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
template <typename T, bool handleNaN = false>
 | 
			
		||||
struct ThrustGTOp {
 | 
			
		||||
  __device__ bool operator()(const T& lhs, const T& rhs) const {
 | 
			
		||||
    return THCNumerics<T>::gt(lhs, rhs);
 | 
			
		||||
    return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
template <typename T, bool handleNaN = false>
 | 
			
		||||
struct ThrustLTOp {
 | 
			
		||||
  __device__ bool operator()(const T& lhs, const T& rhs) const {
 | 
			
		||||
    return THCNumerics<T>::lt(lhs, rhs);
 | 
			
		||||
    return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
{
#ifdef USE_MAGMA
  THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
  THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional");
  THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square");
  THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible");

  int64_t n = a_->size(0);
  int64_t nrhs = b_->size(1);


@@ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
    }
  } else {
    THCTensor_(resizeAs)(state, self_, src_);

    if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) {
      THArgCheck(false, 2, CUTORCH_DIM_WARNING);

@ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state,
 | 
			
		||||
          n_sample,
 | 
			
		||||
          THCudaLongTensor_data(state, self),
 | 
			
		||||
          numDist, numCategories,
 | 
			
		||||
          THCTensor_(data)(state, prefixSum));
 | 
			
		||||
          THCTensor_(data)(state, prefixSum),
 | 
			
		||||
	  THCTensor_(data)(state, normDist));
 | 
			
		||||
    } else {
 | 
			
		||||
      // Sample without replacement
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
 | 
			
		||||
    dim3 block(blockSize);                                              \
 | 
			
		||||
                                                                        \
 | 
			
		||||
    if (dir) {                                                          \
 | 
			
		||||
      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \
 | 
			
		||||
      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \
 | 
			
		||||
        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
 | 
			
		||||
          keyInfo,                                                      \
 | 
			
		||||
          keySlices,                                                    \
 | 
			
		||||
@ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
 | 
			
		||||
          (TYPE) keyInfo.strides[collapseKeyDim],                       \
 | 
			
		||||
          valueInfo,                                                    \
 | 
			
		||||
          (TYPE) valueInfo.strides[collapseValueDim],                   \
 | 
			
		||||
          GTComp<scalar_t>());                                              \
 | 
			
		||||
          GTComp<scalar_t, true>());                                    \
 | 
			
		||||
    } else {                                                            \
 | 
			
		||||
      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \
 | 
			
		||||
      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \
 | 
			
		||||
        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
 | 
			
		||||
          keyInfo,                                                      \
 | 
			
		||||
          keySlices,                                                    \
 | 
			
		||||
@ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
 | 
			
		||||
          (TYPE) keyInfo.strides[collapseKeyDim],                       \
 | 
			
		||||
          valueInfo,                                                    \
 | 
			
		||||
          (TYPE) valueInfo.strides[collapseValueDim],                   \
 | 
			
		||||
          LTComp<scalar_t>());                                              \
 | 
			
		||||
          LTComp<scalar_t, true>());                                              \
 | 
			
		||||
    }                                                                   \
 | 
			
		||||
  } while (0)
 | 
			
		||||
 | 
			
		||||
@ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state,
 | 
			
		||||
#if CUDA_VERSION >= 7000
 | 
			
		||||
      thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 | 
			
		||||
#endif
 | 
			
		||||
      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>());
 | 
			
		||||
      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>());
 | 
			
		||||
  } else {
 | 
			
		||||
    thrust::stable_sort_by_key(
 | 
			
		||||
#if CUDA_VERSION >= 7000
 | 
			
		||||
      thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 | 
			
		||||
#endif
 | 
			
		||||
      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>());
 | 
			
		||||
      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>());
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Then, re-sort according to slice that each index is
 | 
			
		||||
 | 
			
c10/Half.h (12 changed lines)
@@ -383,6 +383,14 @@ struct Converter<
  }
};

// In some versions of MSVC, there will be a compiler error when building.
// C4146: unary minus operator applied to unsigned type, result still unsigned
// It can be addressed by disabling the following warning.
#ifdef _MSC_VER
#pragma warning( push )
#pragma warning( disable : 4146 )
#endif

// skip isnan and isinf check for integral types
template <typename To, typename From>
typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
@@ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
  }
}

#ifdef _MSC_VER
#pragma warning( pop )
#endif

template <typename To, typename From>
typename std::enable_if<std::is_floating_point<From>::value, bool>::type
overflows(From f) {

@@ -11,9 +11,11 @@ using c10::intrusive_ptr_target;
using c10::make_intrusive;
using c10::weak_intrusive_ptr;

#ifndef _MSC_VER
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
#pragma GCC diagnostic ignored "-Wself-move"
#endif

namespace {
class SomeClass0Parameters : public intrusive_ptr_target {};

@@ -25,7 +25,7 @@ Error::Error(
// Caffe2-style error message
Error::Error(
    const char* file,
    const int line,
    const uint32_t line,
    const char* condition,
    const std::string& msg,
    const std::string& backtrace,

@@ -49,7 +49,7 @@ class C10_API Error : public std::exception {
  Error(SourceLocation source_location, const std::string& msg);
  Error(
      const char* file,
      const int line,
      const uint32_t line,
      const char* condition,
      const std::string& msg,
      const std::string& backtrace,
@@ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e);
// TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if
// not met.

// In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t),
// which is different from the definition of `SourceLocation` that requires
// unsigned int (a.k.a uint32_t) and may cause a compile error with the message:
// error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion
// Here the static cast is used to pass the build.

#define AT_ERROR(...) \
  throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
  throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))

#define AT_WARN(...) \
  ::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
  ::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))

#define AT_ASSERT(cond)                       \
  if (!(cond)) {                              \

@ -17,9 +17,10 @@
#include <utility>
#include <type_traits>

#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"

#endif
#ifdef _MSC_VER
#define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
#else
@ -1457,4 +1458,6 @@ namespace ska

} // end namespace ska

#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif

@ -72,18 +72,27 @@ class C10_API intrusive_ptr_target {
// We also have to disable -Wunknown-warning-option and -Wpragmas, because
// some other compilers don't know about -Wterminate or -Wexceptions and
// will show a warning about unknown warning options otherwise.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
#pragma GCC diagnostic ignored "-Wterminate"
#pragma GCC diagnostic ignored "-Wexceptions"
#ifdef _MSC_VER
#  pragma warning(push)
#  pragma warning(disable: 4297) // function assumed not to throw an exception but does
#else
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wpragmas"
#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
#  pragma GCC diagnostic ignored "-Wterminate"
#  pragma GCC diagnostic ignored "-Wexceptions"
#endif
    AT_ASSERTM(
        refcount_.load() == 0,
        "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it");
    AT_ASSERTM(
        weakcount_.load() == 0,
        "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
#pragma GCC diagnostic pop
#ifdef _MSC_VER
#  pragma warning(pop)
#else
#  pragma GCC diagnostic pop
#endif
  }

  constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}

@ -430,12 +430,16 @@ class C10_API TypeMeta {
    // variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be
    // disabled for compilers that don't know '-Wundefined-var-template' and
    // would error at our attempt to disable it.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
#pragma GCC diagnostic ignored "-Wundefined-var-template"
#ifndef _MSC_VER
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wpragmas"
#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
#  pragma GCC diagnostic ignored "-Wundefined-var-template"
#endif
    return TypeMeta(_typeMetaDataInstance<T>());
#pragma GCC diagnostic pop
#ifndef _MSC_VER
#  pragma GCC diagnostic pop
#endif
  }

 private:

@ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY)
  else()
    target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
  endif()

  #cmake only check for separate OpenMP library on AppleClang 7+
  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
      target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY})
    endif()
  endif()
endif()

target_link_libraries(caffe2 PUBLIC c10)
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
@ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
# Set standard properties on the target
torch_set_target_props(caffe2)

if (MSVC)
target_compile_options(caffe2 INTERFACE "-std=c++11")
else()
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
if (NOT MSVC)
  target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
endif()

target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")

@ -93,7 +93,7 @@ using std::vector;
#define CAFFE2_NORETURN __attribute__((noreturn))
#endif

#if defined(_MSC_VER)
#if (defined _MSC_VER && !defined NOMINMAX)
#define NOMINMAX
#endif


@ -1,5 +1,8 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from caffe2.proto import caffe2_pb2
import os
import sys
import platform
# TODO: refactor & remove the following alias
caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU
caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA
@ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST

if platform.system() == 'Windows':
    IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ])

    if IS_CONDA:
        from ctypes import windll, c_wchar_p
        from ctypes.wintypes import DWORD, HMODULE

        AddDllDirectory = windll.kernel32.AddDllDirectory
        AddDllDirectory.restype = DWORD
        AddDllDirectory.argtypes = [c_wchar_p]

    def add_extra_dll_dir(extra_dll_dir):
        if os.path.isdir(extra_dll_dir):
            os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH']

            if IS_CONDA:
                AddDllDirectory(extra_dll_dir)

    # first get nvToolsExt PATH
    def get_nvToolsExt_path():
        NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')

        if os.path.exists(NVTOOLEXT_HOME):
            return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
        else:
            return ''

    py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin')
    th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
    th_dll_path = os.path.join(th_root, 'lib')

    dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()]

    # then add the path to env
    for p in dll_paths:
        add_extra_dll_dir(p)

@ -628,37 +628,12 @@ endif()

# ---[ OpenMP
if(USE_OPENMP)
  set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
  if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
    exec_program(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
    string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
    message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
    if(DARWIN_VERSION GREATER 9)
      set(APPLE_OPENMP_SUCKS 1)
    endif(DARWIN_VERSION GREATER 9)
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
      OUTPUT_VARIABLE GCC_VERSION)
    if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
      message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
      message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
      add_compile_options(-Wno-unknown-pragmas)
      set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
    endif()
  endif()

  if(WITH_OPENMP AND NOT CHECKED_OPENMP)
    find_package(OpenMP)
    set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")

    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
    set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
  endif()

  find_package(OpenMP)
  if(OPENMP_FOUND)
    message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
  else()
    message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
    caffe2_update_option(USE_OPENMP OFF)
@ -690,7 +665,12 @@ if(USE_CUDA)
      caffe2_update_option(USE_NVRTC OFF)
    endif()
    if(CAFFE2_USE_CUDNN)
      list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
      IF(CUDNN_STATIC_LINKAGE)
	LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
	  caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
      ELSE()
	list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
      ENDIF()
    else()
      caffe2_update_option(USE_CUDNN OFF)
    endif()
@ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE)
    STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
    STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
  ENDIF()

  # OpenMP support?
  SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
  IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
    EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
    STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
    MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
    IF (DARWIN_VERSION GREATER 9)
      SET(APPLE_OPENMP_SUCKS 1)
    ENDIF (DARWIN_VERSION GREATER 9)
    EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
      OUTPUT_VARIABLE GCC_VERSION)
    IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
      MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
      MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
      add_compile_options(-Wno-unknown-pragmas)
      SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
    ENDIF()
  ENDIF()

  IF (WITH_OPENMP AND NOT CHECKED_OPENMP)
    FIND_PACKAGE(OpenMP)
    SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")

    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
    SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
  ENDIF()

  IF (OPENMP_FOUND)
    MESSAGE(STATUS "Compiling with OpenMP support")
    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
  ENDIF()


  SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)

  FIND_PACKAGE(MAGMA)
@ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE)
    SET(AT_CUDA_ENABLED 0)
  else()
    SET(AT_CUDA_ENABLED 1)
    find_package(CUDA 5.5 REQUIRED)
  endif()

  IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND)
@ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE)
  SET(AT_MKLDNN_ENABLED 0)
  SET(CAFFE2_USE_MKLDNN OFF)
  IF (USE_MKLDNN)
    FIND_PACKAGE(MKLDNN)
    INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake)
    IF(MKLDNN_FOUND)
      SET(AT_MKLDNN_ENABLED 1)
      INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR})
      INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR})
      IF(BUILD_CAFFE2_OPS)
        SET(CAFFE2_USE_MKLDNN ON)
        LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn)

@ -2,7 +2,6 @@
#
# The following variables are optionally searched for defaults
#  MKL_FOUND             : set to true if a library implementing the CBLAS interface is found
#  USE_MKLDNN
#
# The following are set after configuration is done:
#  MKLDNN_FOUND          : set to true if mkl-dnn is found.
@ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND)
SET(MKLDNN_LIBRARIES)
SET(MKLDNN_INCLUDE_DIR)

IF (NOT USE_MKLDNN)
  RETURN()
ENDIF(NOT USE_MKLDNN)

IF(MSVC)
  MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far")
  RETURN()
@ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR)
LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR})

IF(MKL_FOUND)
  # Append to mkldnn dependencies
  LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES})
  LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR})
  # The OMP-related variables of MKL-DNN have to be overwritten here,
  # if MKL is used, and the OMP version is defined by MKL.
  # MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL.
  # INTEL_MKL_DIR gives the MKL root path.
  IF (INTEL_MKL_DIR)
    SET(MKLROOT ${INTEL_MKL_DIR})
    IF(WIN32)
      SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
    ELSE(WIN32)
      IF (MKL_LIBRARIES_gomp_LIBRARY)
        SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY})
      ELSE(MKL_LIBRARIES_gomp_LIBRARY)
        SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY})
      ENDIF(MKL_LIBRARIES_gomp_LIBRARY)
      SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
    ENDIF(WIN32)
  ELSE(INTEL_MKL_DIR)
    MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!")
  ENDIF(INTEL_MKL_DIR)

ELSE(MKL_FOUND)
  # If we cannot find MKL, we will use the Intel MKL Small library
  # comes with ${MKLDNN_ROOT}/external
@ -75,60 +51,65 @@ ELSE(MKL_FOUND)
  ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external)

  FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h)
  IF(MKLML_INNER_INCLUDE_DIR)
    # if user has multiple version under external/ then guess last
    # one alphabetically is "latest" and warn
    LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
    IF(MKLINCLEN GREATER 1)
      LIST(SORT MKLML_INNER_INCLUDE_DIR)
      LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
      LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
      SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
    ENDIF(MKLINCLEN GREATER 1)
    GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
    LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
  IF(NOT MKLML_INNER_INCLUDE_DIR)
    MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
    RETURN()
  ENDIF(NOT MKLML_INNER_INCLUDE_DIR)
  # if user has multiple version under external/ then guess last
  # one alphabetically is "latest" and warn
  LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
  IF(MKLINCLEN GREATER 1)
    LIST(SORT MKLML_INNER_INCLUDE_DIR)
    LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
    LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
    SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
  ENDIF(MKLINCLEN GREATER 1)
  GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
  LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})

    IF(APPLE)
      SET(__mklml_inner_libs mklml iomp5)
    ELSE(APPLE)
      SET(__mklml_inner_libs mklml_intel iomp5)
    ENDIF(APPLE)

    FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
      STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
      FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
            NAMES ${__mklml_inner_lib}
            PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
            DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
      MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
      LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
    ENDFOREACH(__mklml_inner_lib)
  ENDIF(MKLML_INNER_INCLUDE_DIR)
  IF(APPLE)
    SET(__mklml_inner_libs mklml iomp5)
  ELSE(APPLE)
    SET(__mklml_inner_libs mklml_intel iomp5)
  ENDIF(APPLE)
  FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
    STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
    FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
          NAMES ${__mklml_inner_lib}
          PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
          DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
    MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
    IF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
      MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
      RETURN()
    ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
    LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
  ENDFOREACH(__mklml_inner_lib)
ENDIF(MKL_FOUND)

LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES)
LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR)
INCLUDE(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for})
IF(MKL_FOUND)
  SET(MKL_cmake_included TRUE)
  SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE)
ENDIF(MKL_FOUND)
SET(WITH_TEST FALSE CACHE BOOL "" FORCE)
SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE)
SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
ADD_SUBDIRECTORY(${MKLDNN_ROOT})
IF(NOT TARGET mkldnn)
  MESSAGE("Failed to include MKL-DNN target")
  RETURN()
ENDIF(NOT TARGET mkldnn)
IF(MKL_FOUND)
  TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL)
ENDIF(MKL_FOUND)
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized)
  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow)
  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow)
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
LIST(APPEND MKLDNN_LIBRARIES mkldnn)

IF(MKLDNN_FOUND)
  IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
    ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized)
  ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
  SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE)
  SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE)
  ADD_SUBDIRECTORY(${MKLDNN_ROOT})
  SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}")
  IF(WIN32)
    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}")
  ELSE(WIN32)
    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}")
  ENDIF(WIN32)
ELSE(MKLDNN_FOUND)
  MESSAGE(STATUS "MKLDNN source files not found!")
ENDIF(MKLDNN_FOUND)

UNSET(__mklml_inner_libs)
UNSET(__mkldnn_looked_for)
SET(MKLDNN_FOUND TRUE)
MESSAGE(STATUS "Found MKL-DNN: TRUE")

ENDIF(NOT MKLDNN_FOUND)

@ -9,6 +9,12 @@ endif()
# release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix)

 # we dont want to statically link cudart, because we rely on it's dynamic linkage in
 # python (follow along torch/cuda/__init__.py and usage of cudaGetErrorName).
 # Technically, we can link cudart here statically, and link libtorch_python.so
 # to a dynamic libcudart.so, but that's just wasteful
SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")

# Find CUDA.
find_package(CUDA 7.0)
if(NOT CUDA_FOUND)
@ -89,6 +95,9 @@ endif()

if(DEFINED ENV{CUDNN_LIBRARY})
  set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY})
  if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a")
    SET(CUDNN_STATIC_LINKAGE ON)
  endif()
else()
  find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
    HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
@ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN)
        "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
  endif()
  message(STATUS "Found cuDNN: v${CUDNN_VERSION}  (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})")
  if(CUDNN_VERSION VERSION_LESS "7.0.0")
    message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
  endif()
endif()

# ---[ CUDA libraries wrapper
@ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED)
if(CAFFE2_STATIC_LINK_CUDA)
    set_property(
        TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt)
        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl)
else()
    set_property(
        TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES

917	docs/source/community/contribution_guide.rst (new file)
@ -0,0 +1,917 @@
PyTorch Contribution Guide
==========================

PyTorch is a GPU-accelerated Python tensor computation package for
building deep neural networks built on tape-based autograd systems.

The PyTorch Contribution Process
--------------------------------

The PyTorch organization is governed by `PyTorch
Governance </docs/community/governance.html>`__.

The PyTorch development process involves a healthy amount of open
discussions between the core development team and the community.

PyTorch operates similarly to most open source projects on GitHub.
However, if you've never contributed to an open source project before,
here is the basic process.

-  **Figure out what you're going to work on.** The majority of open
   source contributions come from people scratching their own itches.
   However, if you don't know what you want to work on, or are just
   looking to get more acquainted with the project, here are some tips
   for how to find appropriate tasks:

   -  Look through the `issue
      tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if
      there are any issues you know how to fix. Issues that are
      confirmed by other contributors tend to be better to investigate.
      We also maintain some labels for issues which are likely to be
      good for new people, e.g., **bootcamp** and **1hr**, although
      these labels are less well maintained.
   -  Join us on Slack and let us know you're interested in getting to
      know PyTorch. We're very happy to help out researchers and
      partners get up to speed with the codebase.

-  **Figure out the scope of your change and reach out for design
   comments on a GitHub issue if it's large.** The majority of pull
   requests are small; in that case, no need to let us know about what
   you want to do, just get cracking. But if the change is going to be
   large, it's usually a good idea to get some design comments about it
   first.

   -  If you don't know how big a change is going to be, we can help you
      figure it out! Just post about it on issues or Slack.
   -  Some feature additions are very standardized; for example, lots of
      people add new operators or optimizers to PyTorch. Design
      discussion in these cases boils down mostly to, “Do we want this
      operator/optimizer?” Giving evidence for its utility, e.g., usage
      in peer reviewed papers, or existence in other frameworks, helps a
      bit when making this case.
   -  Core changes and refactors can be quite difficult to coordinate,
      as the pace of development on PyTorch master is quite fast.
      Definitely reach out about fundamental or cross-cutting changes;
      we can often give guidance about how to stage such changes into
      more easily reviewable pieces.

-  **Code it out!**

   -  See the technical guide for advice on working with PyTorch in a
      technical form.

-  **Open a pull request.**

   -  If you are not ready for the pull request to be reviewed, tag it
      with [WIP]. We will ignore it when doing review passes. If you are
      working on a complex change, it's good to start things off as WIP,
      because you will need to spend time looking at CI results to see
      if things worked out or not.
   -  Find an appropriate reviewer for your change. We have some folks
      who regularly go through the PR queue and try to review
      everything, but if you happen to know who the maintainer for a
      given subsystem affected by your patch is, feel free to include
      them directly on the pull request. You can learn more about this
      structure at PyTorch Subsystem Ownership.

-  **Iterate on the pull request until it's accepted!**

   -  We'll try our best to minimize the number of review roundtrips and
      block PRs only when there are major issues. For the most common
      issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__.
   -  Once a pull request is accepted and CI is passing, there is
      nothing else you need to do; we will merge the PR for you.

Getting Started
---------------

Proposing new features
~~~~~~~~~~~~~~~~~~~~~~

New feature ideas are best discussed on a specific issue. Please include
as much information as you can, any accompanying data, and your proposed
solution. The PyTorch team and community frequently review new issues
and comment where they think they can help. If you feel confident in
your solution, go ahead and implement it.

Reporting Issues
~~~~~~~~~~~~~~~~

If you've identified an issue, first search through the `list of
existing issues <https://github.com/pytorch/pytorch/issues>`__ on the
repo. If you are unable to find a similar issue, then create a new one.
Supply as much information as you can to reproduce the problematic
behavior. Also, include any additional insights like the behavior you
expect.

Implementing Features or Fixing Bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to fix a specific issue, it's best to comment on the
individual issue with your intent. However, we do not lock or assign
issues except in cases where we have worked with the developer before.
It's best to strike up a conversation on the issue and discuss your
proposed solution. The PyTorch team can provide guidance that saves you
time.

Issues that are labeled first-new-issue, low, or medium priority are
great places to start.

Adding Tutorials
~~~~~~~~~~~~~~~~

A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__
come from the community itself and we welcome additional contributions.
To learn how to contribute a new tutorial, see the `PyTorch.org Tutorial
Contribution Guide on
GitHub <https://github.com/pytorch/tutorials/#contributing>`__.

Improving Documentation & Tutorials
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We aim to produce high quality documentation and tutorials. On rare
occasions, that content may include typos or bugs. If you find something
you can fix, send us a pull request for consideration.

Take a look at the `Documentation <#on-documentation>`__ section to learn how our system
works.

Participating in online discussions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can find active discussions happening on the PyTorch Discussion
`forum <https://discuss.pytorch.org/>`__.

Submitting pull requests to fix open issues
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can view a list of all open issues
`here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an
issue is a great way to get the attention of the team. From here you can
share your ideas and how you plan to resolve the issue.

For more challenging issues, the team will provide feedback and
direction for how to best solve the issue.

If you're not able to fix the issue itself, commenting and sharing
whether you can reproduce the issue can be useful for helping the team
identify problem areas.

Reviewing open pull requests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We appreciate your help reviewing and commenting on pull requests. Our
team strives to keep the number of open pull requests at a manageable
size, we respond quickly when we need more information, and we
merge PRs that we think are useful. However, due to the high level of
interest, additional eyes on pull requests are appreciated.

Improving code readability
~~~~~~~~~~~~~~~~~~~~~~~~~~

Improving code readability helps everyone. It is often better to submit a
small number of pull requests that touch few files versus a large pull
request that touches many files. Starting a discussion in the PyTorch
forum `here <https://discuss.pytorch.org/>`__ or on an issue related to
your improvement is the best way to get started.

Adding test cases to make the codebase more robust
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Additional test coverage is appreciated.

Promoting PyTorch
~~~~~~~~~~~~~~~~~

Your use of PyTorch in your projects, research papers, write-ups, blogs,
or general discussions around the internet helps to raise awareness for
PyTorch and our growing community. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

Triaging issues
~~~~~~~~~~~~~~~

If you feel that an issue could benefit from a particular tag or level
of complexity, comment on the issue and share your opinion. If you
feel an issue isn't categorized properly, comment and let the team know.

About open source development
-----------------------------

If this is your first time contributing to an open source project, some
aspects of the development process may seem unusual to you.

-  **There is no way to “claim” issues.** People often want to “claim”
   an issue when they decide to work on it, to ensure that there isn't
   wasted work when someone else ends up working on it. This doesn't
   really work too well in open source, since someone may decide to work
   on something, and end up not having time to do it. Feel free to give
   information in an advisory fashion, but at the end of the day, we
   will take running code and rough consensus.
-  **There is a high bar for new functionality that is added.** Unlike
   in a corporate environment, where the person who wrote code
   implicitly “owns” it and can be expected to take care of it in the
   beginning of its lifetime, once a pull request is merged into an open
   source project, it immediately becomes the collective responsibility
   of all maintainers on the project. When we merge code, we are saying
   that we, the maintainers, are able to review subsequent changes and
   make a bugfix to the code. This naturally leads to a higher standard
   of contribution.

Common Mistakes To Avoid
------------------------

-  **Did you add tests?** (Or if the change is hard to test, did you
   describe how you tested your change?)

   -  We have a few motivations for why we ask for tests:

      1. to help us tell if we break it later
      2. to help us tell if the patch is correct in the first place
         (yes, we did review it, but as Knuth says, “beware of the
         following code, for I have not run it, merely proven it
         correct”)

   -  When is it OK not to add a test? Sometimes a change can't be
      conveniently tested, or the change is so obviously correct (and
      unlikely to be broken) that it's OK not to test it. On the
      contrary, if a change seems likely (or is known to be likely)
      to be accidentally broken, it's important to put in the time to
      work out a testing strategy.

-  **Is your PR too long?**

   -  It's easier for us to review and merge small PRs. Difficulty of
      reviewing a PR scales nonlinearly with its size.
   -  When is it OK to submit a large PR? It helps a lot if there was a
      corresponding design discussion in an issue, with sign off from
      the people who are going to review your diff. We can also help
      give advice about how to split up a large change into individually
      shippable parts. Similarly, it helps if there is a complete
      description of the contents of the PR: it's easier to review code
      if we know what's inside!

-  **Comments for subtle things?** In cases where behavior of your code
   is nuanced, please include extra comments and documentation to allow
   us to better understand the intention of your code.
-  **Did you add a hack?** Sometimes a hack is the right answer. But
   usually we will have to discuss it.
-  **Do you want to touch a very core component?** In order to prevent
   major regressions, pull requests that touch core components receive
   extra scrutiny. Make sure you've discussed your changes with the team
   before undertaking major changes.
-  **Want to add a new feature?** If you want to add new features,
   comment your intention on the related issue. Our team tries to
   comment on and provide feedback to the community. It's better to have
   an open discussion with the team and the rest of the community prior
   to building new features. This helps us stay aware of what you're
   working on and increases the chance that it'll be merged.
-  **Did you touch code unrelated to the PR?** To aid in code review,
   please only include files in your pull request that are directly
   related to your changes.

Frequently asked questions
--------------------------

-  **How can I contribute as a reviewer?** There is a lot of value in
   community developers reproducing issues, trying out new functionality,
   or otherwise helping us identify or troubleshoot issues. Commenting on
   tasks or pull requests with your environment details is helpful and
   appreciated.
-  **CI tests failed, what does it mean?** Maybe you need to merge with
   master or rebase with latest changes. Pushing your changes should
   re-trigger CI tests. If the failures persist, you'll want to trace
   through the error messages and resolve the related issues.
-  **What are the most high risk changes?** Anything that touches build
   configuration is a risky area. Please avoid changing these unless
   you've had a discussion with the team beforehand.
-  **Hey, a commit showed up on my branch, what's up with that?**
   Sometimes another community member will provide a patch or fix to
   your pull request or branch. This is often needed for getting CI tests
   to pass.

On Documentation
----------------

Python Docs
~~~~~~~~~~~

PyTorch documentation is generated from Python source using
`Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is
copied to the docs folder in the master branch of
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
and is served via GitHub pages.

-  Site: http://pytorch.org/docs
-  GitHub: http://github.com/pytorch/pytorch/docs
-  Served from:
   `https://github.com/pytorch/pytorch.github.io/tree/master/docs <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__

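To preview documentation changes locally, one workflow is to build the
Sphinx site from the ``docs/`` folder of the repo. This is only a sketch:
the requirements file path and the ``make html`` target are assumed to be
present in your checkout, so check ``docs/`` if the commands differ.

::

    cd docs
    pip install -r requirements.txt   # assumed path to the docs requirements
    make html                         # Sphinx output typically lands in docs/build/html
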
C++ Docs
~~~~~~~~

For C++ code we use Doxygen to generate the content files. The C++ docs
are built on a special server and the resulting files are copied to the
https://github.com/pytorch/cppdocs repo, and are served from GitHub
pages.

-  Site: http://pytorch.org/cppdocs
-  GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp
-  Served from: https://github.com/pytorch/cppdocs

Tutorials
---------

PyTorch tutorials are documents that help users understand how to use
PyTorch to accomplish specific tasks or to understand more holistic
concepts. Tutorials are built using
`Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__
from executable Python source files, or from reStructuredText (rst)
files.

-  Site: http://pytorch.org/tutorials
-  GitHub: http://github.com/pytorch/tutorials

Tutorials Build Overview
~~~~~~~~~~~~~~~~~~~~~~~~

For tutorials, `pull
requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a
rebuild of the entire site using CircleCI to test the effects of the
change. This build is sharded into 9 worker builds and takes around 40
minutes total. At the same time, we do a Netlify build using *make
html-noplot*, which builds the site without rendering the notebook
output into pages for quick review.

After a PR is accepted, the site is rebuilt and deployed from CircleCI.

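For a quick local preview of a tutorial change, the same *make html-noplot*
target mentioned above can be run in a checkout of the tutorials repo. This
is a sketch; the dependency installation step assumes the repo ships a
``requirements.txt``, so adapt it to whatever the repo's README specifies.

::

    git clone https://github.com/pytorch/tutorials
    cd tutorials
    pip install -r requirements.txt   # assumed dependency file
    make html-noplot                  # build the site without executing notebooks
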
Contributing a new Tutorial
~~~~~~~~~~~~~~~~~~~~~~~~~~~

See the `PyTorch.org Tutorial Contribution
Guide <https://github.com/pytorch/tutorials/#contributing>`__.

Code Style
~~~~~~~~~~

**Python style**

**C++ style**

Submitting a Pull Request
~~~~~~~~~~~~~~~~~~~~~~~~~

PyTorch development happens publicly on our GitHub repo.

To have your feature or fix added to PyTorch, please submit a Pull
Request.

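If you have not opened a pull request on GitHub before, one common flow
looks roughly like the sketch below. The branch name is only an example,
and ``<your-username>`` stands in for your own GitHub fork.

::

    # fork pytorch/pytorch on GitHub first, then:
    git clone https://github.com/<your-username>/pytorch
    cd pytorch
    git checkout -b my-fix          # example branch name
    # ... make and commit your changes ...
    git push origin my-fix
    # open a pull request against pytorch/pytorch from the GitHub UI
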
Running Tests
~~~~~~~~~~~~~

Examples of running the full test suite and individual tests are shown
below.

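A minimal sketch (the test file names are real files under ``test/``, and
the ``-v`` flag follows standard ``unittest`` conventions):

::

    python test/run_test.py        # run the entire test suite
    python test/test_torch.py      # run a single test file
    python test/test_nn.py -v      # verbose output for one suite
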
Technical Process
-----------------

Developing PyTorch
~~~~~~~~~~~~~~~~~~

To develop PyTorch on your machine, here are some tips:

1. Uninstall all existing PyTorch installs:

::

    conda uninstall pytorch
    pip uninstall torch
    pip uninstall torch # run this command twice

2. Clone a copy of PyTorch from source:

::

    git clone https://github.com/pytorch/pytorch
    cd pytorch

3. Install PyTorch in ``build develop`` mode:

A full set of instructions on installing PyTorch from source is here:
https://github.com/pytorch/pytorch#from-source

The change you have to make is to replace

::

    python setup.py install

with

::

    python setup.py build develop

This is especially useful if you are only changing Python files.

This mode will symlink the Python files from the current local source
tree into the Python install.

Hence, if you modify a Python file, you do not need to reinstall PyTorch
again and again.

For example:

-  Install local PyTorch in ``build develop`` mode
-  modify your Python file ``torch/__init__.py`` (for example)
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality

You do not need to repeatedly install after modifying Python files; a
typical edit-and-test loop is sketched below.

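For instance (the file and test names below are only examples of the loop,
not a prescription):

::

    # edit a Python file in the source tree
    vim torch/nn/modules/linear.py    # example file to modify

    # re-run the relevant tests immediately; no reinstall is needed
    python test/test_nn.py
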
In case you want to reinstall, make sure that you uninstall PyTorch
first by running ``pip uninstall torch`` and ``python setup.py clean``.
Then you can install in ``build develop`` mode again.

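Put together, a clean reinstall uses exactly the commands above:

::

    pip uninstall torch
    python setup.py clean
    python setup.py build develop
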
Codebase structure
 | 
			
		||||
------------------
 | 
			
		||||
 | 
			
		||||
-  `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core
 | 
			
		||||
   library files that work everywhere, both server and mobile. We are
 | 
			
		||||
   slowly moving pieces from
 | 
			
		||||
   `ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
 | 
			
		||||
   here. This library is intended only to contain essential
 | 
			
		||||
   functionality, and appropriate to use in settings where binary size
 | 
			
		||||
   matters. (But you'll have a lot of missing functionality if you try
 | 
			
		||||
   to use it directly.)
 | 
			
		||||
-  `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++
 | 
			
		||||
   tensor library for PyTorch (no autograd support)
 | 
			
		||||
 | 
			
		||||
   -  `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__
 | 
			
		||||
 | 
			
		||||
      -  `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__
 | 
			
		||||
         `THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__
 | 
			
		||||
         `THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__
 | 
			
		||||
         `THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__
 | 
			
		||||
         - Legacy library code from the original Torch. Try not to add
 | 
			
		||||
         things here; we're slowly porting these to
 | 
			
		||||
         `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__.
 | 
			
		||||
 | 
			
		||||
         -  generic - Contains actual implementations of operators,
            parametrized over ``scalar_t``. Files here get compiled N
            times per supported scalar type in PyTorch.

      -  `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__

         -  `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
            - Core functionality of ATen. This is migrating to the top-level
            c10 folder.
         -  `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__
            - Modern implementations of operators. If you want to write
            a new operator, here is where it should go. Most CPU
            operators go in the top level directory, except for
            operators which need to be compiled specially; see cpu
            below.

            -  `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__
               - Not actually CPU implementations of operators, but
               specifically implementations which are compiled with
               processor-specific instructions, like AVX. See the
               `README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__
               for more details.
            -  `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__
               - CUDA implementations of operators.
            -  `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__
               - CPU and CUDA implementations of COO sparse tensor
               operations.
            -  `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__
               `mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__
               `miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__
               `cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__

               -  implementations of operators which simply bind to some
                  backend library.

-  `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ -
   The actual PyTorch library. Everything that is not in
   `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
   is a Python module, following the PyTorch Python frontend module
   structure.

   -  `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
      - C++ files composing the PyTorch library. Files in this directory
      tree are a mix of Python binding code and C++ heavy lifting.
      Consult ``setup.py`` for the canonical list of Python binding
      files; conventionally, they are often prefixed with ``python_``.

      -  `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__
         - Compiler and frontend for the TorchScript JIT.
      -  `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__
         - Implementation of reverse-mode automatic differentiation.
      -  `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__
         - The PyTorch C++ frontend.
      -  `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__
         - Distributed training support for PyTorch.

-  `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ -
   Code generation scripts for the PyTorch library. See the
   `README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__
   of this directory for more details.
-  `test <https://github.com/pytorch/pytorch/blob/master/test>`__ -
   Python unit tests for the PyTorch Python frontend.

   -  `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__
      - Basic tests for PyTorch functionality.
   -  `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__
      - Tests for non-NN automatic differentiation support.
   -  `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__
      - Tests for NN operators and their automatic differentiation.
   -  `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__
      - Tests for the JIT compiler and TorchScript.
   -  ...
   -  `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__
      - C++ unit tests for the PyTorch C++ frontend.
   -  `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__
      - Automatically generated "expect" files which are used to compare
      against expected output.
   -  `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__
      - Tests for ONNX export functionality, using both PyTorch and
      Caffe2.

-  `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ -
   The Caffe2 library.

   -  `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__
      - Core files of Caffe2, e.g., tensor, workspace, blobs, etc.
   -  `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__
      - Operators of Caffe2.
   -  `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__
      - Python bindings to Caffe2.
   -  ...

Unit Testing
------------

PyTorch's testing is located under ``test/``. Run the entire test suite
with

::

    python test/run_test.py

or run individual test files, like ``python test/test_nn.py``, for
individual test suites.
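
Because these files are standard ``unittest`` programs, you can usually
also pass a single test class or test name on the command line to run
just that test (the test name below is only an example):

::

    python test/test_nn.py TestNN.test_BCELoss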

Better local unit tests with pytest
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We don't officially support ``pytest``, but it works well with our
``unittest`` tests and offers a number of useful features for local
development. Install it via ``pip install pytest``.

If you want to just run tests that contain a specific substring, you can
use the ``-k`` flag:

::

    pytest test/test_nn.py -k Loss -v

The above is an example of testing a change to Loss functions: this
command runs tests such as ``TestNN.test_BCELoss`` and
``TestNN.test_MSELoss`` and can be useful to save keystrokes.

Writing documentation
---------------------

PyTorch uses `Google
style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__
for formatting docstrings. Line length inside docstring blocks must
be limited to 80 characters to fit into Jupyter documentation popups.
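
As an illustration, a minimal Google-style docstring might look like the
sketch below (the function and argument names are only illustrative, not
an existing API):

::

    def unsqueeze_twice(input, dim):
        r"""Inserts two singleton dimensions into :attr:`input` at :attr:`dim`.

        Arguments:
            input (Tensor): the tensor to reshape.
            dim (int): the index at which to insert the singleton dimensions.

        Returns:
            Tensor: the reshaped tensor.
        """
        return input.unsqueeze(dim).unsqueeze(dim)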

For C++ documentation (https://pytorch.org/cppdocs), we use
`Doxygen <http://www.doxygen.nl/>`__ and then convert it to
`Sphinx <http://www.sphinx-doc.org/>`__ via
`Breathe <https://github.com/michaeljones/breathe>`__
and `Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen
reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__
for more information on the documentation syntax. To build the
documentation locally, ``cd`` into ``docs/cpp`` and then ``make html``.

We run Doxygen in CI (Travis) to verify that you do not use invalid
Doxygen commands. To run this check locally, run ``./check-doxygen.sh``
from inside ``docs/cpp``.

Managing multiple build trees
-----------------------------

One downside to using ``python setup.py develop`` is that your
development version of PyTorch will be installed globally on your
account (e.g., if you run ``import torch`` anywhere else, the
development version will be used).

If you want to manage multiple builds of PyTorch, you can make use of
`conda environments <https://conda.io/docs/using/envs.html>`__ to
maintain separate Python package environments, each of which can be tied
to a specific build of PyTorch. To set one up:

::

    conda create -n pytorch-myfeature
    source activate pytorch-myfeature
    # if you run python now, torch will NOT be installed
    python setup.py build develop

C++ Development tips
--------------------

If you are working on the C++ code, there are a few important things
that you will want to keep in mind:

1. How to rebuild only the code you are working on.
2. How to make rebuilds in the absence of changes go faster.

Build only what you need.
~~~~~~~~~~~~~~~~~~~~~~~~~

``python setup.py build`` will build everything, but since our build
system is not very optimized for incremental rebuilds, this will
actually be very slow. Far better is to only request rebuilds of the
parts of the project you are working on:

-  Working on the Python bindings? Run ``python setup.py develop`` to
   rebuild (NB: no ``build`` here!)
-  Working on ``torch/csrc`` or ``aten``? Run
   ``python setup.py rebuild_libtorch`` to rebuild and avoid having to
   rebuild the other libraries we depend on.
-  Working on one of the other dependent libraries? The other valid
   targets are listed in ``dep_libs`` in ``setup.py``. Prepend
   ``build_`` to get a target, and run as e.g.
   ``python setup.py build_gloo``.
-  Working on a test binary? Run
   ``(cd build && ninja bin/test_binary_name)`` to rebuild only that
   test binary (without rerunning cmake). (Replace ``ninja`` with
   ``make`` if you don't have ninja installed.)

On the initial build, you can also speed things up with the environment
variables ``DEBUG``, ``REL_WITH_DEB_INFO``, and ``NO_CUDA``.

-  ``DEBUG=1`` will enable debug builds (-g -O0)
-  ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations
   (-g -O3)
-  ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing
   on something not CUDA related), to save compile time.

For example:

::

    NO_CUDA=1 DEBUG=1 python setup.py build develop

Make sure you continue to pass these flags on subsequent builds.

Code completion and IDE support
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using ``python setup.py develop``, PyTorch will generate a
``compile_commands.json`` file that can be used by many editors to
provide command completion and error highlighting for PyTorch's C++
code. You need to ``pip install ninja`` to generate accurate information
for the code in ``torch/csrc``. More information at:

-  https://sarcasm.github.io/notes/dev/compilation-database.html

Make no-op build fast.
~~~~~~~~~~~~~~~~~~~~~~

Use Ninja
~~~~~~~~~

Python ``setuptools`` is pretty dumb, and always rebuilds every C file
in a project. If you install the ninja build system with
``pip install ninja``, then PyTorch will use it to track dependencies
correctly. If PyTorch was already built, you will need to run
``python setup.py clean`` once after installing ninja for builds to
succeed.

Use CCache
~~~~~~~~~~

Even when dependencies are tracked with file modification times, there are
many situations where files get rebuilt when a previous compilation was
exactly the same.

Using ccache in a situation like this is a real time-saver. However, by
default, ccache does not properly support CUDA, so here are the
instructions for installing a custom ccache fork that has CUDA support:

::

    # install and export ccache
    if ! ls ~/ccache/bin/ccache
    then
        sudo apt-get update
        sudo apt-get install -y automake autoconf
        sudo apt-get install -y asciidoc
        mkdir -p ~/ccache
        pushd /tmp
        rm -rf ccache
        git clone https://github.com/colesbury/ccache -b ccbin
        pushd ccache
        ./autogen.sh
        ./configure
        make install prefix=~/ccache
        popd
        popd

        mkdir -p ~/ccache/lib
        mkdir -p ~/ccache/cuda
        ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
        ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
        ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc

        ~/ccache/bin/ccache -M 25Gi
    fi

    export PATH=~/ccache/lib:$PATH
    export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc

CUDA Development tips
---------------------

If you are working on the CUDA code, here are some useful CUDA debugging
tips:

1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug
   symbols (``-g -G``). This will be particularly helpful in debugging
   device code. However, it will slow down the build process by about
   50% (compared to only ``DEBUG=1``), so use wisely.
2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging
   friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a
   CUDA tensor (rather than all zeros).

Hope this helps, and thanks for considering contributing.

Windows development tips
------------------------

Occasionally, you will write a patch which works on Linux, but fails CI
on Windows. There are a few aspects in which MSVC (the Windows compiler
toolchain we use) is stricter than Linux, which are worth keeping in
mind when fixing these problems.

1. Symbols are NOT exported by default on Windows; instead, you have to
   explicitly mark a symbol as exported/imported in a header file with
   ``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have
   codified this pattern into a set of macros which follow the
   convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen.
   (Every separate shared library needs a unique macro name, because
   symbol visibility is on a per shared library basis. See
   c10/macros/Macros.h for more details.) The upshot is if you see an
   "unresolved external" error in your Windows build, this is probably
   because you forgot to mark a function with ``*_API``. However, there
   is one important counterexample to this principle: if you want a
   *templated* function to be instantiated at the call site, do NOT mark
   it with ``*_API`` (if you do mark it, you'll have to explicitly
   instantiate all of the specializations used by the call sites.)
2. If you link against a library, this does not make its dependencies
   transitively visible. You must explicitly specify a link dependency
   against every library whose symbols you use. (This is different from
   Linux where in most environments, transitive dependencies can be used
   to fulfill unresolved symbols.)
3. If you have a Windows box (we have a few on EC2 which you can request
   access to) and you want to run the build, the easiest way is to just
   run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run
   ``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing
   away your Conda environment.)

Even if you don't know anything about MSVC, you can use cmake to build
simple programs on Windows; this can be helpful if you want to learn
more about some peculiar linking behavior by reproducing it on a small
example. Here's a simple example cmake file that defines two dynamic
libraries, one linking with the other:

::

    project(myproject CXX)
    set(CMAKE_CXX_STANDARD 11)
    add_library(foo SHARED foo.cpp)
    add_library(bar SHARED bar.cpp)
    # NB: don't forget to __declspec(dllexport) at least one symbol from foo,
    # otherwise foo.lib will not be created.
    target_link_libraries(bar PUBLIC foo)

You can build it with:

::

    mkdir build
    cd build
    cmake ..
    cmake --build .

Known MSVC (and MSVC with NVCC) bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The PyTorch codebase sometimes likes to use exciting C++ features, and
these exciting features lead to exciting bugs in Windows compilers. To
add insult to injury, the error messages will often not tell you which
line of code actually induced the erroring template instantiation. We've
found the most effective way to debug these problems is to carefully
read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few
well known pitfalls and workarounds:

-  This is not actually a bug per se, but in general, code generated by
   MSVC is more sensitive to memory errors; you may have written some
   code that does a use-after-free or stack overflows; on Linux the code
   might work, but on Windows your program will crash. ASAN may not
   catch all of these problems: stay vigilant to the possibility that
   your crash is due to a real memory problem.
-  (NVCC) ``c10::optional`` does not work when used from device code.
   Don't use it from kernels. Upstream issue:
   https://github.com/akrzemi1/Optional/issues/58 and our local issue
   #10329.
-  ``constexpr`` generally works less well on MSVC.

   -  The idiom ``static_assert(f() == f())`` to test if ``f`` is
      constexpr does not work; you'll get "error C2131: expression did
      not evaluate to a constant". Don't use these asserts on Windows.
      (Example: ``c10/util/intrusive_ptr.h``)

-  (NVCC) Code you access inside a ``static_assert`` will eagerly be
   evaluated as if it were device code, and so you might get an error
   that the code is "not accessible".

::

    class A {
      static A singleton_;
      static constexpr inline A* singleton() {
        return &singleton_;
      }
    };
    static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm");

-  The compiler will run out of heap space if you attempt to compile
   files that are too large. Splitting such files into separate files
   helps. (Example: ``THTensorMath``, ``THTensorMoreMath``,
   ``THTensorEvenMoreMath``.)
-  MSVC's preprocessor (but not the standard compiler) has a bug where
   it incorrectly tokenizes raw string literals, ending when it sees a
   ``"``. This causes preprocessor tokens inside the literal like
   an ``#endif`` to be incorrectly treated as preprocessor directives.
   See https://godbolt.org/z/eVTIJq as an example.

Running Clang-Tidy
~~~~~~~~~~~~~~~~~~

`Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a
C++ linter and static analysis tool based on the clang compiler. We run
clang-tidy in our CI to make sure that new C++ code is safe, sane and
efficient. See our
`.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__
file for the simple commands we use for this. To run clang-tidy locally,
follow these steps:

1. Install clang-tidy. First, check if you already have clang-tidy by
   simply writing ``clang-tidy`` in your terminal. If you don't yet have
   clang-tidy, you should be able to install it easily with your package
   manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu.
   See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on
   how to install the latest version. Note that newer versions of
   clang-tidy will have more checks than older versions. In our CI, we
   run clang-tidy-6.0.
2. Use our driver script to run clang-tidy over any changes relative to
   some git revision (you may want to replace ``HEAD~1`` with ``HEAD``
   to pick up uncommitted changes). Changes are picked up based on a
   ``git diff`` with the given revision:

::

    python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'

Above, it is assumed you are in the PyTorch root folder. The argument to
``-d`` should be the path to where you built PyTorch from
source, e.g. ``build`` in the PyTorch root folder if you used
``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change
the clang-tidy this script uses. Make sure you have PyYAML installed,
which is in PyTorch's ``requirements.txt``.

Pre-commit Tidy/Linting Hook
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We use clang-tidy and flake8 to perform additional formatting and
semantic checking of code. We provide a pre-commit git hook for
performing these checks, before a commit is created:

::

    ln -s ../../tools/git-pre-commit .git/hooks/pre-commit

Caffe2 notes
------------

In 2018, we merged Caffe2 into the PyTorch source repository. While the
steady state aspiration is that Caffe2 and PyTorch share code freely, in
the meantime there will be some separation. If you submit a PR to only
PyTorch or only Caffe2 code, CI will only run for the project you
edited. The logic for this is implemented in
``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can
look at these to see what path prefixes constitute changes. This also
means if you ADD a new top-level path, or you start sharing code between
projects, you need to modify these files. There are a few "unusual"
directories which, for historical reasons, are Caffe2/PyTorch specific.
Here they are:

-  ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``,
   ``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code
   in them without extra coordination.
-  ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools``
   are PyTorch-specific. Don't put Caffe2 code in them without extra
   coordination.

154 docs/source/community/governance.rst Normal file
@ -0,0 +1,154 @@
PyTorch Governance
==========================

Governance Philosophy and Guiding Tenets
-----------------------------------------

PyTorch adopts a governance structure with a small set of maintainers
driving the overall project direction with a strong bias towards
PyTorch's design philosophy where design and code contributions are
valued. Beyond the core maintainers, there is also a slightly broader
set of core developers that have the ability to directly merge pull
requests and own various parts of the core code base.

Beyond the maintainers and core devs, the community is encouraged to
contribute, file issues, make proposals, review pull requests and be
present in the community. Given contributions and willingness to
invest, anyone can be provided write access or ownership of parts of
the codebase.

Based on this governance structure, the project has the following core
operating tenets by which decisions are made and overall culture is
derived:

1. **Code contributions** matter much more than corporate sponsorship,
   and independent developers are highly valued.
2. **Project influence** is gained through contributions (whether PRs,
   forum answers, code reviews or otherwise).

Key people and their functions
------------------------------

Project Maintainers
~~~~~~~~~~~~~~~~~~~

Project maintainers provide leadership and direction for the PyTorch
project. Specifics include:

-  Articulate a cohesive long-term vision for the project
-  Possess a deep understanding of the PyTorch code base
-  Negotiate and resolve contentious issues in ways acceptable to all
   parties involved

PyTorch Maintainers:

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__)

Core Developers
~~~~~~~~~~~~~~~

The PyTorch project is developed by a team of core developers. You can
find the list of core developers at `PyTorch Governance \| Persons of
Interest </docs/community/persons_of_interest.html>`__.

While membership is determined by presence in the "PyTorch core" team in
the "PyTorch"
`organization <https://github.com/orgs/pytorch/teams/facebook>`__ on
GitHub, contribution takes many forms:

-  committing changes to the repository;
-  reviewing pull requests by others;
-  triaging bug reports on the issue tracker;
-  discussing topics on official PyTorch communication channels.

Moderators
~~~~~~~~~~

There is a group of people, some of whom are not core developers,
responsible for ensuring that discussions on official communication
channels adhere to the Code of Conduct. They take action in view of
violations and help to support a healthy community. You can find the
list of moderators `here <https://discuss.pytorch.org/about>`__.

Decision Making
---------------

Uncontroversial Changes
~~~~~~~~~~~~~~~~~~~~~~~

Primary work happens through bug tracker issues and pull requests on
GitHub. Core developers should avoid pushing their changes directly to
the PyTorch repository, instead relying on pull requests. Approving a
pull request by a core developer allows it to be merged without further
process. Core Developers and Project Maintainers ultimately approve
these changes.

Notifying relevant experts about a bug tracker issue or a pull request
is important. Reviews from experts in the given interest area are
strongly preferred, especially on pull request approvals. Failure to do
so might end up with the change being reverted by the relevant expert.

Controversial decision process
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Substantial changes in a given interest area require a GitHub issue to
be opened for discussion. This includes:

-  Any semantic or syntactic change to the framework.
-  Backwards-incompatible changes to the Python or Cpp API.
-  Additions to the core framework, including substantial new
   functionality within an existing library.
-  Removing core features.

Project Maintainers ultimately approve these changes.

FAQ
---

**Q: What if I would like to own (or partly own) a part of the project
such as a domain API (e.g., torchvision)?** This is absolutely possible.
The first step is to start contributing to the existing project area and
contributing to its health and success. In addition to this, you can
make a proposal through a GitHub issue for new functionality or changes
to improve the project area.

**Q: What if I am a company looking to use PyTorch internally for
development, can I be granted or purchase a board seat to drive the
project direction?** No, the PyTorch project is strictly driven by the
maintainer-driven project philosophy and does not have a board or
vehicle to take financial contributions relating to gaining influence
over technical direction.

**Q: Does the PyTorch project support grants or ways to support
independent developers using or contributing to the project?** No, not
at this point. We are however looking at ways to better support the
community of independent developers around PyTorch. If you have
suggestions or inputs, please reach out on the PyTorch forums to
discuss.

**Q: How do I contribute code to the project?** If the change is
relatively minor, a pull request on GitHub can be opened up immediately
for review and merge by the project committers. For larger changes,
please open an issue first to make a proposal and discuss it. Please also
see the **`PyTorch Contributor
Guide </docs/community/contribution_guide.html>`__** for contribution
guidelines.

**Q: Can I become a committer on the project?** Unfortunately, the
current commit process to PyTorch involves an interaction with Facebook
infrastructure that can only be triggered by Facebook employees. We are
however looking at ways to expand the committer base to individuals
outside of Facebook and will provide an update when the tooling exists
to allow this.

**Q: What if I would like to deliver a PyTorch tutorial at a conference
or otherwise? Do I need to be 'officially' a committer to do this?** No,
we encourage community members to showcase their work wherever and
whenever they can. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

130 docs/source/community/persons_of_interest.rst Normal file
@ -0,0 +1,130 @@
PyTorch Governance | Persons of Interest
=========================================

General Maintainers
-------------------

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross
   (`colesbury <https://github.com/colesbury>`__)

Module-level maintainers
------------------------

JIT
~~~

-  Zach Devito (`zdevito <https://github.com/zdevito>`__)
-  Michael Suo (`suo <https://github.com/suo>`__)

Distributed
~~~~~~~~~~~

-  Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
-  Shen Li (`mrshenli <https://github.com/mrshenli>`__)
-  (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__)

Autograd Engine
~~~~~~~~~~~~~~~

-  Alban Desmaison (`alband <https://github.com/alband>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Multiprocessing and DataLoaders
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  Simon Wang (`SsnL <https://github.com/SsnL>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  (proposed) Vitaly Fedyunin
   (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)

CUDA
~~~~

-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

C++
~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  (sunsetting) Peter Goldsborough
   (`goldsborough <https://github.com/goldsborough>`__)

Build + CI
~~~~~~~~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  (sunsetting) Orion Reblitz-Richardson
   (`orionr <https://github.com/orionr>`__)

Distributions & RNG
~~~~~~~~~~~~~~~~~~~

-  Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__)
-  Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__)
-  Alican Bozkurt (`alicanb <https://github.com/alicanb>`__)
-  Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)

C10
~~~

-  Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)

ONNX <-> PyTorch
~~~~~~~~~~~~~~~~

-  Lu Fang (`houseroad <https://github.com/houseroad>`__)

torch.nn
~~~~~~~~

-  Thomas Viehmann (`t-vi <https://github.com/t-vi>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)

CPU Performance / SIMD
~~~~~~~~~~~~~~~~~~~~~~

-  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)
-  Richard Zou (`zou3519 <https://github.com/zou3519>`__)

AMD/ROCm/HIP
~~~~~~~~~~~~

-  Junjie Bai (`bddppq <https://github.com/bddppq>`__)
-  Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__)

Windows
~~~~~~~

-  Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)

MKLDNN
~~~~~~

-  Yinghai Lu (`yinghai <https://github.com/yinghai>`__)

XLA
~~~

-  Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
-  Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
-  Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
-  Alex Suhan (`asuhan <https://github.com/asuhan>`__)

PPC
~~~

-  Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)

@ -1,6 +1,101 @@
torch.hub
===================================
PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.

Publishing models
-----------------

PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights)
to a GitHub repository by adding a simple ``hubconf.py`` file.

``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with
the following signature.

::

    def entrypoint_name(pretrained=False, *args, **kwargs):
        ...

How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint
for the ``resnet18`` model. You can see a full script in the
`pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_

::

    dependencies = ['torch', 'math']

    def resnet18(pretrained=False, *args, **kwargs):
        """
        Resnet18 model
        pretrained (bool): a recommended kwarg for all entrypoints
        args & kwargs are arguments for the function
        """
        ######## Call the model in the repo ###############
        from torchvision.models.resnet import resnet18 as _resnet18
        model = _resnet18(*args, **kwargs)
        ######## End of call ##############################
        # The following logic is REQUIRED
        if pretrained:
            # For weights saved in the local repo
            # model.load_state_dict(<path_to_saved_file>)

            # For weights saved elsewhere
            checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
            model.load_state_dict(model_zoo.load_url(checkpoint, progress=False))
        return model

- The ``dependencies`` variable is a **list** of package names required to run the model.
- Pretrained weights can either be stored locally in the GitHub repo, or loadable by
  ``model_zoo.load_url()``.
- ``pretrained`` controls whether to load the pre-trained weights provided by repo owners.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- The docstring of the function works as a help message, explaining what the model does and what
  arguments are allowed.
- An entrypoint function should **ALWAYS** return a model (``nn.Module``).
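
For a model that does not ship downloadable weights, an entrypoint can be as small as the
following sketch (the entrypoint name and architecture are hypothetical, not part of any real
repo):

::

    dependencies = ['torch']

    def tiny_mlp(pretrained=False, *args, **kwargs):
        """
        A tiny example model; no pre-trained weights are provided.
        """
        import torch.nn as nn
        model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 1))
        return model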

Important Notice
^^^^^^^^^^^^^^^^

- The published models should be at least in a branch/tag. It can't be a random commit.

Loading models from Hub
-----------------------

Users can load the pre-trained models using the ``torch.hub.load()`` API.


.. automodule:: torch.hub
.. autofunction:: load

Here's an example loading the ``resnet18`` entrypoint from the ``pytorch/vision`` repo.

::

    hub_model = hub.load(
        'pytorch/vision:master', # repo_owner/repo_name:branch
        'resnet18', # entrypoint
        1234, # args for callable [not applicable to resnet]
        pretrained=True) # kwargs for callable

Where are my downloaded model & weights saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The locations are used in the following order:

- ``hub_dir``: a user-specified path. It can be set in the following ways:

  - Setting the environment variable ``TORCH_HUB_DIR``
  - Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``

- ``~/.torch/hub``

.. autofunction:: set_dir
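
For example, pointing Hub at a scratch directory might look like this (the path below is just
an example):

::

    import torch.hub

    # downloaded repos and weights will now be cached under this directory
    torch.hub.set_dir('/data/my_hub_cache')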

Caching logic
^^^^^^^^^^^^^

By default, we don't clean up files after loading them. Hub uses the cache by default if it already exists in ``hub_dir``.

Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
when updates are published to the same branch, so users can keep up with the latest release.
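
For instance, to pick up the latest weights for the ``resnet18`` example above, bypassing any
cached copy:

::

    import torch.hub

    model = torch.hub.load('pytorch/vision:master', 'resnet18',
                           pretrained=True, force_reload=True)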

@ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.

   notes/*

.. toctree::
  :glob:
  :maxdepth: 1
  :caption: Community

  community/*

.. toctree::
   :maxdepth: 1

@ -1,4 +1,4 @@
Torch Script
TorchScript
============

.. contents:: :local:
@ -6,17 +6,17 @@ Torch Script
.. automodule:: torch.jit
.. currentmodule:: torch.jit

Torch Script is a way to create serializable and optimizable models from PyTorch code.
Any code written in Torch Script can be saved from your Python
TorchScript is a way to create serializable and optimizable models from PyTorch code.
Any code written in TorchScript can be saved from your Python
process and loaded in a process where there is no Python dependency.

We provide tools to incrementally transition a model from being a pure Python program
to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program.
to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program.
This makes it possible to train models in PyTorch using familiar tools and then export
the model to a production environment where it is not a good idea to run models as Python programs
for performance and multi-threading reasons.

Creating Torch Script Code
Creating TorchScript Code
--------------------------


@ -117,26 +117,26 @@ Example:
            return self.resnet(input - self.means)


Torch Script Language Reference
TorchScript Language Reference
-------------------------------

Torch Script is a subset of Python that can either be written directly (using
TorchScript is a subset of Python that can either be written directly (using
the @script annotations) or generated automatically from Python code via
tracing. When using tracing, code is automatically converted into this subset of
Python by recording only the actual operators on tensors and simply executing and
discarding the other surrounding Python code.

When writing Torch Script directly using @script annotations, the programmer must
only use the subset of Python supported in Torch Script. This section documents
what is supported in Torch Script as if it were a language reference for a stand
When writing TorchScript directly using @script annotations, the programmer must
only use the subset of Python supported in TorchScript. This section documents
what is supported in TorchScript as if it were a language reference for a stand
alone language. Any features of Python not mentioned in this reference are not
part of Torch Script.
part of TorchScript.

As a subset of Python any valid Torch Script function is also a valid Python
As a subset of Python any valid TorchScript function is also a valid Python
function. This makes it possible to remove the @script annotations and debug the
function using standard Python tools like pdb. The reverse is not true: there
are many valid python programs that are not valid Torch Script programs.
Instead, Torch Script focuses specifically on the features of Python that are
are many valid python programs that are not valid TorchScript programs.
Instead, TorchScript focuses specifically on the features of Python that are
needed to represent neural network models in Torch.

.. envvar:: PYTORCH_JIT=1
@ -150,9 +150,9 @@ needed to represent neural network models in Torch.
Types
~~~~~

The largest difference between Torch Script and the full Python language is that
Torch Script only support a small set of types that are needed to express neural
net models. In particular Torch Script supports:
The largest difference between TorchScript and the full Python language is that
TorchScript only support a small set of types that are needed to express neural
net models. In particular TorchScript supports:

``Tensor``
    A PyTorch tensor of any dtype, dimension, or backend.
@ -169,8 +169,8 @@ net models. In particular Torch Script supports:
``List[T]``
    A list of which all members are type ``T``

Unlike Python, each variable in Torch Script function must have a single static type.
This makes it easier to optimize Torch Script functions.
Unlike Python, each variable in TorchScript function must have a single static type.
This makes it easier to optimize TorchScript functions.

Example::

@ -183,9 +183,9 @@ Example::
        return r # Type mismatch: r is set to type Tensor in the true branch
                 # and type int in the false branch

By default, all parameters to a Torch Script function are assumed to be Tensor
By default, all parameters to a TorchScript function are assumed to be Tensor
because this is the most common type used in modules. To specify that an
argument to a Torch Script function is another type, it is possible to use
argument to a TorchScript function is another type, it is possible to use
MyPy-style type annotations using the types listed above:

Example::
@ -264,7 +264,7 @@ Subscripts
  ``t[i:j, i]``

  .. note::
    Torch Script currently does not support mutating tensors in place, so any
    TorchScript currently does not support mutating tensors in place, so any
    tensor indexing can only appear on the right-hand size of an expression.

Function calls
@ -328,7 +328,7 @@ Accessing Module Parameters
Statements
~~~~~~~~~~

Torch Script supports the following types of statements:
TorchScript supports the following types of statements:

Simple Assignments

@ -438,7 +438,7 @@ Return
Variable Resolution
~~~~~~~~~~~~~~~~~~~

Torch Script supports a subset of Python's variable resolution (i.e. scoping)
TorchScript supports a subset of Python's variable resolution (i.e. scoping)
rules. Local variables behave the same as in Python, except for the restriction
that a variable must have the same type along all paths through a function.
If a variable has a different type on different sides of an if statement, it
@ -456,23 +456,23 @@ Example::
        print(y) # Error: undefined value y

Non-local variables are resolved to Python values at compile time when the
function is defined. These values are then converted into Torch Script values using
function is defined. These values are then converted into TorchScript values using
the rules described in `Use of Python Values`_.

Use of Python Values
~~~~~~~~~~~~~~~~~~~~

To make writing Torch Script more convenient, we allow script code to refer
To make writing TorchScript more convenient, we allow script code to refer
to Python values in the surrounding scope. For instance, any time there is a
reference to ``torch``, the Torch Script compiler is actually resolving it to the
reference to ``torch``, the TorchScript compiler is actually resolving it to the
``torch`` Python module when the function is declared.  These Python values are
not a first class part of Torch Script. Instead they are desugared at compile-time
into the primitive types that Torch Script supports. This section describes the
rules that are used when accessing Python values in Torch Script. They depend
not a first class part of TorchScript. Instead they are desugared at compile-time
into the primitive types that TorchScript supports. This section describes the
rules that are used when accessing Python values in TorchScript. They depend
on the dynamic type of the python valued referenced.

Functions
  Torch Script can call python functions. This functionality is very useful when
  TorchScript can call python functions. This functionality is very useful when
  incrementally converting a model into script. The model can be moved function-by-function
  to script, leaving calls to Python functions in place. This way you can incrementally
  check the correctness of the model as you go.
@ -495,12 +495,12 @@ Functions


Attribute Lookup On Python Modules
    Torch Script can lookup attributes on modules. Builtin functions like ``torch.add``
    are accessed this way. This allows Torch Script to call functions defined in
    TorchScript can lookup attributes on modules. Builtin functions like ``torch.add``
    are accessed this way. This allows TorchScript to call functions defined in
    other modules.

Python-defined Constants
    Torch Script also provides a way to use constants that are defined in Python.
    TorchScript also provides a way to use constants that are defined in Python.
    These can be used to hard-code hyper-parameters into the function, or to
    define universal constants. There are two ways of specifying that a Python
    value should be treated as a constant.
@ -597,36 +597,35 @@ Interpreting Graphs

    The example script above produces the graph::
 | 
			
		||||
        graph(%len : int) {
 | 
			
		||||
          %13 : float = prim::Constant[value=1]()
 | 
			
		||||
          %10 : int = prim::Constant[value=10]()
 | 
			
		||||
          %2 : int = prim::Constant[value=4]()
 | 
			
		||||
          %1 : int = prim::Constant[value=3]()
 | 
			
		||||
          %3 : int[] = prim::ListConstruct(%1, %2)
 | 
			
		||||
          %4 : int = prim::Constant[value=6]()
 | 
			
		||||
          %5 : int = prim::Constant[value=0]()
 | 
			
		||||
          %6 : int[] = prim::Constant[value=[0, -1]]()
 | 
			
		||||
          %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)
 | 
			
		||||
          %8 : int = prim::Constant[value=1]()
 | 
			
		||||
          %rv : Dynamic = prim::Loop(%len, %8, %rv.1)
 | 
			
		||||
            block0(%i : int, %12 : Dynamic) {
 | 
			
		||||
              %11 : int = aten::lt(%i, %10)
 | 
			
		||||
              %rv.4 : Dynamic = prim::If(%11)
 | 
			
		||||
                block0() {
 | 
			
		||||
                  %14 : int = prim::Constant[value=1]()
 | 
			
		||||
                  %rv.2 : Dynamic = aten::sub(%12, %13, %14)
 | 
			
		||||
                  -> (%rv.2)
 | 
			
		||||
                }
 | 
			
		||||
                block1() {
 | 
			
		||||
                  %16 : int = prim::Constant[value=1]()
 | 
			
		||||
                  %rv.3 : Dynamic = aten::add(%12, %13, %16)
 | 
			
		||||
                  -> (%rv.3)
 | 
			
		||||
                }
 | 
			
		||||
              %19 : int = prim::Constant[value=1]()
 | 
			
		||||
              -> (%19, %rv.4)
 | 
			
		||||
            }
 | 
			
		||||
          return (%rv);
 | 
			
		||||
        }
 | 
			
		||||
	graph(%len : int) {
 | 
			
		||||
	  %15 : int = prim::Constant[value=1]()
 | 
			
		||||
	  %9 : bool = prim::Constant[value=1]()
 | 
			
		||||
	  %7 : Device = prim::Constant[value="cpu"]()
 | 
			
		||||
	  %6 : int = prim::Constant[value=0]()
 | 
			
		||||
	  %5 : int = prim::Constant[value=6]()
 | 
			
		||||
	  %1 : int = prim::Constant[value=3]()
 | 
			
		||||
	  %2 : int = prim::Constant[value=4]()
 | 
			
		||||
	  %11 : int = prim::Constant[value=10]()
 | 
			
		||||
	  %14 : float = prim::Constant[value=1]()
 | 
			
		||||
	  %4 : int[] = prim::ListConstruct(%1, %2)
 | 
			
		||||
	  %rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
 | 
			
		||||
	  %rv : Tensor = prim::Loop(%len, %9, %rv.1)
 | 
			
		||||
	    block0(%i : int, %13 : Tensor) {
 | 
			
		||||
	      %12 : bool = aten::lt(%i, %11)
 | 
			
		||||
	      %rv.4 : Tensor = prim::If(%12)
 | 
			
		||||
		block0() {
 | 
			
		||||
		  %rv.2 : Tensor = aten::sub(%13, %14, %15)
 | 
			
		||||
		  -> (%rv.2)
 | 
			
		||||
		}
 | 
			
		||||
		block1() {
 | 
			
		||||
		  %rv.3 : Tensor = aten::add(%13, %14, %15)
 | 
			
		||||
		  -> (%rv.3)
 | 
			
		||||
		}
 | 
			
		||||
	      -> (%9, %rv.4)
 | 
			
		||||
	    }
 | 
			
		||||
	  return (%rv);
 | 
			
		||||
	}
 | 
			
		||||
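
    For reference, the script that produces the graphs above looks roughly like
    this (reconstructed from the graph shown, so treat it as a sketch rather than
    the exact listing)::

        import torch

        @torch.jit.script
        def foo(len):
            # type: (int) -> torch.Tensor
            rv = torch.zeros(3, 4)
            for i in range(len):
                if i < 10:
                    rv = rv - 1.0
                else:
                    rv = rv + 1.0
            return rv

        print(foo.graph)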


    Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
    example. ``%rv.1 : Dynamic`` means we assign the output to a (unique)
@@ -676,34 +675,39 @@ Automatic Trace Checking
        traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)

    Gives us the following diagnostic information::

        ERROR: Graphs differed across invocations!
 | 
			
		||||
        Graph diff:
 | 
			
		||||
            graph(%0 : Dynamic) {
 | 
			
		||||
                  %1 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %2 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %3 : Dynamic = aten::select(%0, %1, %2)
 | 
			
		||||
                  %4 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %5 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %6 : Dynamic = aten::select(%0, %4, %5)
 | 
			
		||||
                  %7 : Dynamic = aten::mul(%3, %6)
 | 
			
		||||
                  %8 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %9 : int = prim::Constant[value=1]()
 | 
			
		||||
                  %10 : Dynamic = aten::select(%0, %8, %9)
 | 
			
		||||
                  %11 : Dynamic = aten::mul(%7, %10)
 | 
			
		||||
                  %12 : int = prim::Constant[value=0]()
 | 
			
		||||
                  %13 : int = prim::Constant[value=2]()
 | 
			
		||||
                  %14 : Dynamic = aten::select(%0, %12, %13)
 | 
			
		||||
                  %15 : Dynamic = aten::mul(%11, %14)
 | 
			
		||||
              +   %16 : int = prim::Constant[value=0]()
 | 
			
		||||
              +   %17 : int = prim::Constant[value=3]()
 | 
			
		||||
              +   %18 : Dynamic = aten::select(%0, %16, %17)
 | 
			
		||||
              +   %19 : Dynamic = aten::mul(%15, %18)
 | 
			
		||||
              -   return (%15);
 | 
			
		||||
              ?             ^
 | 
			
		||||
              +   return (%19);
 | 
			
		||||
              ?             ^
 | 
			
		||||
            }
 | 
			
		||||
	ERROR: Graphs differed across invocations!
 | 
			
		||||
	Graph diff::
 | 
			
		||||
  
 | 
			
		||||
		  graph(%x : Tensor) {
 | 
			
		||||
		    %1 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %2 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %result.1 : Tensor = aten::select(%x, %1, %2)
 | 
			
		||||
		    %4 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %5 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %6 : Tensor = aten::select(%x, %4, %5)
 | 
			
		||||
		    %result.2 : Tensor = aten::mul(%result.1, %6)
 | 
			
		||||
		    %8 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %9 : int = prim::Constant[value=1]()
 | 
			
		||||
		    %10 : Tensor = aten::select(%x, %8, %9)
 | 
			
		||||
		-   %result : Tensor = aten::mul(%result.2, %10)
 | 
			
		||||
		+   %result.3 : Tensor = aten::mul(%result.2, %10)
 | 
			
		||||
		?          ++
 | 
			
		||||
		    %12 : int = prim::Constant[value=0]()
 | 
			
		||||
		    %13 : int = prim::Constant[value=2]()
 | 
			
		||||
		    %14 : Tensor = aten::select(%x, %12, %13)
 | 
			
		||||
		+   %result : Tensor = aten::mul(%result.3, %14)
 | 
			
		||||
		+   %16 : int = prim::Constant[value=0]()
 | 
			
		||||
		+   %17 : int = prim::Constant[value=3]()
 | 
			
		||||
		+   %18 : Tensor = aten::select(%x, %16, %17)
 | 
			
		||||
		-   %15 : Tensor = aten::mul(%result, %14)
 | 
			
		||||
		?     ^                                 ^
 | 
			
		||||
		+   %19 : Tensor = aten::mul(%result, %18)
 | 
			
		||||
		?     ^                                 ^
 | 
			
		||||
		-   return (%15);
 | 
			
		||||
		?             ^
 | 
			
		||||
		+   return (%19);
 | 
			
		||||
		?             ^
 | 
			
		||||
		  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    This message indicates to us that the computation differed between when
@@ -733,23 +737,19 @@ Automatic Trace Checking

    Which produces::

        graph(%x : Dynamic) {
 | 
			
		||||
          %1 : int = prim::Constant[value=0]()
 | 
			
		||||
          %2 : int = prim::Constant[value=0]()
 | 
			
		||||
          %result.1 : Dynamic = aten::select(%x, %2, %1)
 | 
			
		||||
          %4 : int = aten::size(%x, %1)
 | 
			
		||||
          %5 : int = prim::Constant[value=1]()
 | 
			
		||||
          %result : Dynamic = prim::Loop(%4, %5, %result.1)
 | 
			
		||||
            block0(%i : int, %7 : Dynamic) {
 | 
			
		||||
              %9 : int = prim::Constant[value=0]()
 | 
			
		||||
              %10 : Dynamic = aten::select(%x, %9, %i)
 | 
			
		||||
              %result.2 : Dynamic = aten::mul(%7, %10)
 | 
			
		||||
              %12 : int = prim::Constant[value=1]()
 | 
			
		||||
              -> (%12, %result.2)
 | 
			
		||||
            }
 | 
			
		||||
          return (%result);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
	graph(%x : Tensor) {
 | 
			
		||||
	  %5 : bool = prim::Constant[value=1]()
 | 
			
		||||
	  %1 : int = prim::Constant[value=0]()
 | 
			
		||||
	  %result.1 : Tensor = aten::select(%x, %1, %1)
 | 
			
		||||
	  %4 : int = aten::size(%x, %1)
 | 
			
		||||
	  %result : Tensor = prim::Loop(%4, %5, %result.1)
 | 
			
		||||
	    block0(%i : int, %7 : Tensor) {
 | 
			
		||||
	      %10 : Tensor = aten::select(%x, %1, %i)
 | 
			
		||||
	      %result.2 : Tensor = aten::mul(%7, %10)
 | 
			
		||||
	      -> (%5, %result.2)
 | 
			
		||||
	    }
 | 
			
		||||
	  return (%result);
 | 
			
		||||
	}
 | 
			
		||||
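
    The fixed graph above comes from compiling the loop with ``torch.jit.script``
    instead of tracing it; a rough reconstruction (inferred from the graph, not
    the documentation's verbatim listing) is::

        import torch

        @torch.jit.script
        def fixed_fn(x):
            result = x[0]
            # the data-dependent loop is now captured as a prim::Loop node
            for i in range(x.size(0)):
                result = result * x[i]
            return result

        print(fixed_fn.graph)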

Tracer Warnings
    The tracer produces warnings for several problematic patterns in traced
@@ -789,14 +789,24 @@ Tracer Warnings
Builtin Functions
~~~~~~~~~~~~~~~~~

Torch Script supports a subset of the builtin tensor and neural network functions that
PyTorch provides. Most methods on Tensor as well as functions in the ``torch``
namespace are available. Many functions in ``torch.nn.functional`` are also available.
Torch Script supports a subset of the builtin tensor and neural network
functions that PyTorch provides. Most methods on Tensor as well as functions in
the ``torch`` namespace, all functions in ``torch.nn.functional`` and all
modules from ``torch.nn`` are supported in Torch Script, excluding those in the
table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`.

Unsupported ``torch.nn`` Modules  ::

    torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss
    torch.nn.modules.normalization.CrossMapLRN2d
    torch.nn.modules.fold.Fold
    torch.nn.modules.fold.Unfold
    torch.nn.modules.rnn.GRU
    torch.nn.modules.rnn.LSTM
    torch.nn.modules.rnn.RNN
    torch.nn.modules.rnn.GRUCell
    torch.nn.modules.rnn.LSTMCell
    torch.nn.modules.rnn.RNNCell

We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or
``Conv`` module. This functionality is something that will be developed in the future.
For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn``
modules into ScriptModules on construction.

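A minimal sketch of that suggestion (an assumed example, not taken from the
documentation)::

    import torch

    # torch.nn.LSTM is in the unsupported list above, so trace it instead
    lstm = torch.nn.LSTM(input_size=4, hidden_size=5, num_layers=1)
    example_input = torch.randn(7, 3, 4)  # (seq_len, batch, input_size)
    traced_lstm = torch.jit.trace(lstm, (example_input,))

    output, hidden = traced_lstm(example_input)
    print(output.shape)  # torch.Size([7, 3, 5])
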
.. automodule:: torch.jit.supported_ops
 | 
			
		||||
 | 
			
		||||
@ -338,6 +338,7 @@ view of a storage and defines numeric operations on it.
 | 
			
		||||
   .. automethod:: reshape_as
 | 
			
		||||
   .. automethod:: resize_
 | 
			
		||||
   .. automethod:: resize_as_
 | 
			
		||||
   .. automethod:: roll
 | 
			
		||||
   .. automethod:: round
 | 
			
		||||
   .. automethod:: round_
 | 
			
		||||
   .. automethod:: rsqrt
 | 
			
		||||
 | 
			
		||||
@ -269,6 +269,7 @@ Other Operations
 | 
			
		||||
.. autofunction:: histc
 | 
			
		||||
.. autofunction:: meshgrid
 | 
			
		||||
.. autofunction:: renorm
 | 
			
		||||
.. autofunction:: roll
 | 
			
		||||
.. autofunction:: tensordot
 | 
			
		||||
.. autofunction:: trace
 | 
			
		||||
.. autofunction:: tril
 | 
			
		||||
 | 
			
		||||
@ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 | 
			
		||||
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
 | 
			
		||||
 | 
			
		||||
if (BUILD_CAFFE2_OPS)
 | 
			
		||||
  #cmake only check for separate OpenMP library on AppleClang 7+
 | 
			
		||||
  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
 | 
			
		||||
  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
 | 
			
		||||
    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
 | 
			
		||||
        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
 | 
			
		||||
      Set(OpenMP_link ${OpenMP_libomp_LIBRARY})
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # Note(ilijar): Since Detectron ops currently have no
 | 
			
		||||
  # CPU implementation, we only build GPU ops for now.
 | 
			
		||||
  if (USE_CUDA)
 | 
			
		||||
@ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS)
 | 
			
		||||
        ${Detectron_CPU_SRCS}
 | 
			
		||||
        ${Detectron_GPU_SRCS})
 | 
			
		||||
 | 
			
		||||
    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link})
 | 
			
		||||
    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu)
 | 
			
		||||
    install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
 | 
			
		||||
  elseif(NOT IOS_PLATFORM)
 | 
			
		||||
    add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
 | 
			
		||||
    target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
 | 
			
		||||
    target_link_libraries(caffe2_detectron_ops caffe2)
 | 
			
		||||
    install(TARGETS caffe2_detectron_ops DESTINATION lib)
 | 
			
		||||
  endif()
 | 
			
		||||
endif()
 | 
			
		||||

setup.py
@@ -124,6 +124,7 @@
 | 
			
		||||
#   LD_LIBRARY_PATH
 | 
			
		||||
#     we will search for libraries in these paths
 | 
			
		||||
 | 
			
		||||
from __future__ import print_function
 | 
			
		||||
from setuptools import setup, Extension, distutils, Command, find_packages
 | 
			
		||||
import setuptools.command.build_ext
 | 
			
		||||
import setuptools.command.install
 | 
			
		||||
@ -144,86 +145,32 @@ import json
 | 
			
		||||
import glob
 | 
			
		||||
import importlib
 | 
			
		||||
 | 
			
		||||
from tools.setup_helpers.env import check_env_flag, check_negative_env_flag
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def hotpatch_var(var, prefix='USE_'):
 | 
			
		||||
    if check_env_flag('NO_' + var):
 | 
			
		||||
        os.environ[prefix + var] = '0'
 | 
			
		||||
    elif check_negative_env_flag('NO_' + var):
 | 
			
		||||
        os.environ[prefix + var] = '1'
 | 
			
		||||
    elif check_env_flag('WITH_' + var):
 | 
			
		||||
        os.environ[prefix + var] = '1'
 | 
			
		||||
    elif check_negative_env_flag('WITH_' + var):
 | 
			
		||||
        os.environ[prefix + var] = '0'
 | 
			
		||||
 | 
			
		||||
# Before we run the setup_helpers, let's look for NO_* and WITH_*
 | 
			
		||||
# variables and hotpatch environment with the USE_* equivalent
 | 
			
		||||
use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED',
 | 
			
		||||
                'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS']
 | 
			
		||||
list(map(hotpatch_var, use_env_vars))
 | 
			
		||||
 | 
			
		||||
# Also hotpatch a few with BUILD_* equivalent
 | 
			
		||||
build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS']
 | 
			
		||||
[hotpatch_var(v, 'BUILD_') for v in build_env_vars]
 | 
			
		||||
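# Illustration only (not part of setup.py): after the hotpatching above, legacy
# NO_*/WITH_* flags are reflected in their USE_*/BUILD_* equivalents, e.g.
#
#   NO_CUDA=1 python setup.py install      ->  os.environ['USE_CUDA'] == '0'
#   WITH_MKLDNN=1 python setup.py install  ->  os.environ['USE_MKLDNN'] == '1'
#   NO_TEST=1 python setup.py install      ->  os.environ['BUILD_TEST'] == '0'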
 | 
			
		||||
from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION
 | 
			
		||||
from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST,
 | 
			
		||||
                                       BUILD_CAFFE2_OPS, USE_LEVELDB,
 | 
			
		||||
                                       USE_LMDB, USE_OPENCV, USE_FFMPEG)
 | 
			
		||||
from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION
 | 
			
		||||
from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY,
 | 
			
		||||
                                       CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
 | 
			
		||||
from tools.setup_helpers.fbgemm import USE_FBGEMM
 | 
			
		||||
from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY,
 | 
			
		||||
                                        MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
 | 
			
		||||
from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
 | 
			
		||||
    NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
 | 
			
		||||
from tools.setup_helpers.nnpack import USE_NNPACK
 | 
			
		||||
from tools.setup_helpers.qnnpack import USE_QNNPACK
 | 
			
		||||
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
 | 
			
		||||
# If you want to modify flags or environment variables that are set when
 | 
			
		||||
# building torch, you should do it in tools/setup_helpers/configure.py.
 | 
			
		||||
# Please don't add it here unless it's only used in PyTorch.
 | 
			
		||||
from tools.setup_helpers.configure import *
 | 
			
		||||
from tools.setup_helpers.generate_code import generate_code
 | 
			
		||||
from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext
 | 
			
		||||
from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \
 | 
			
		||||
    USE_GLOO_IBVERBS
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
# Parameters parsed from environment
 | 
			
		||||
################################################################################
 | 
			
		||||
 | 
			
		||||
DEBUG = check_env_flag('DEBUG')
 | 
			
		||||
REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO')
 | 
			
		||||
IS_WINDOWS = (platform.system() == 'Windows')
 | 
			
		||||
IS_DARWIN = (platform.system() == 'Darwin')
 | 
			
		||||
IS_LINUX = (platform.system() == 'Linux')
 | 
			
		||||
IS_PPC = (platform.machine() == 'ppc64le')
 | 
			
		||||
IS_ARM = (platform.machine() == 'aarch64')
 | 
			
		||||
VERBOSE_SCRIPT = True
 | 
			
		||||
# see if the user passed a quiet flag to setup.py arguments and respect
 | 
			
		||||
# that in our parts of the build
 | 
			
		||||
for arg in sys.argv:
 | 
			
		||||
    if arg == "--":
 | 
			
		||||
        break
 | 
			
		||||
    if arg == '-q' or arg == '--quiet':
 | 
			
		||||
        VERBOSE_SCRIPT = False
 | 
			
		||||
 | 
			
		||||
BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
 | 
			
		||||
# ppc64le and aarch64 do not support MKLDNN
 | 
			
		||||
if IS_PPC or IS_ARM:
 | 
			
		||||
    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF')
 | 
			
		||||
if VERBOSE_SCRIPT:
 | 
			
		||||
    def report(*args):
 | 
			
		||||
        print(*args)
 | 
			
		||||
else:
 | 
			
		||||
    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON')
 | 
			
		||||
 | 
			
		||||
USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
 | 
			
		||||
RERUN_CMAKE = True
 | 
			
		||||
 | 
			
		||||
NUM_JOBS = multiprocessing.cpu_count()
 | 
			
		||||
max_jobs = os.getenv("MAX_JOBS")
 | 
			
		||||
if max_jobs is not None:
 | 
			
		||||
    NUM_JOBS = min(NUM_JOBS, int(max_jobs))
 | 
			
		||||
 | 
			
		||||
ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE")
 | 
			
		||||
if not ONNX_NAMESPACE:
 | 
			
		||||
    ONNX_NAMESPACE = "onnx_torch"
 | 
			
		||||
 | 
			
		||||
# Ninja
 | 
			
		||||
try:
 | 
			
		||||
    import ninja
 | 
			
		||||
    USE_NINJA = True
 | 
			
		||||
except ImportError:
 | 
			
		||||
    USE_NINJA = False
 | 
			
		||||
    def report(*args):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
# Constant known variables used throughout this file
 | 
			
		||||
cwd = os.path.dirname(os.path.abspath(__file__))
 | 
			
		||||
@ -323,8 +270,9 @@ def build_libs(libs):
 | 
			
		||||
        build_libs_cmd = ['tools\\build_pytorch_libs.bat']
 | 
			
		||||
    else:
 | 
			
		||||
        build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')]
 | 
			
		||||
    my_env = os.environ.copy()
 | 
			
		||||
    my_env["PYTORCH_PYTHON"] = sys.executable
 | 
			
		||||
 | 
			
		||||
    my_env, extra_flags = get_pytorch_env_with_flags()
 | 
			
		||||
    build_libs_cmd.extend(extra_flags)
 | 
			
		||||
    my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library
 | 
			
		||||
    my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir
 | 
			
		||||
    my_env["PYTORCH_BUILD_VERSION"] = version
 | 
			
		||||
@ -334,64 +282,8 @@ def build_libs(libs):
 | 
			
		||||
        cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path
 | 
			
		||||
    my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path
 | 
			
		||||
 | 
			
		||||
    my_env["NUM_JOBS"] = str(NUM_JOBS)
 | 
			
		||||
    my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE
 | 
			
		||||
    if not IS_WINDOWS:
 | 
			
		||||
        if USE_NINJA:
 | 
			
		||||
            my_env["CMAKE_GENERATOR"] = '-GNinja'
 | 
			
		||||
            my_env["CMAKE_INSTALL"] = 'ninja install'
 | 
			
		||||
        else:
 | 
			
		||||
            my_env['CMAKE_GENERATOR'] = ''
 | 
			
		||||
            my_env['CMAKE_INSTALL'] = 'make install'
 | 
			
		||||
    if USE_SYSTEM_NCCL:
 | 
			
		||||
        my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
 | 
			
		||||
        my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
 | 
			
		||||
        my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
 | 
			
		||||
    if USE_CUDA:
 | 
			
		||||
        my_env["CUDA_BIN_PATH"] = CUDA_HOME
 | 
			
		||||
        build_libs_cmd += ['--use-cuda']
 | 
			
		||||
        if IS_WINDOWS:
 | 
			
		||||
            my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME
 | 
			
		||||
    if USE_CUDA_STATIC_LINK:
 | 
			
		||||
        build_libs_cmd += ['--cuda-static-link']
 | 
			
		||||
    if USE_FBGEMM:
 | 
			
		||||
        build_libs_cmd += ['--use-fbgemm']
 | 
			
		||||
    if USE_ROCM:
 | 
			
		||||
        build_libs_cmd += ['--use-rocm']
 | 
			
		||||
    if USE_NNPACK:
 | 
			
		||||
        build_libs_cmd += ['--use-nnpack']
 | 
			
		||||
    if USE_NUMPY:
 | 
			
		||||
        my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR
 | 
			
		||||
    if USE_CUDNN:
 | 
			
		||||
        my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
 | 
			
		||||
        my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
 | 
			
		||||
        my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
 | 
			
		||||
    if USE_MIOPEN:
 | 
			
		||||
        my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR
 | 
			
		||||
        my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY
 | 
			
		||||
        my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR
 | 
			
		||||
    if USE_MKLDNN:
 | 
			
		||||
        build_libs_cmd += ['--use-mkldnn']
 | 
			
		||||
    if USE_QNNPACK:
 | 
			
		||||
        build_libs_cmd += ['--use-qnnpack']
 | 
			
		||||
    if USE_GLOO_IBVERBS:
 | 
			
		||||
        build_libs_cmd += ['--use-gloo-ibverbs']
 | 
			
		||||
    if not RERUN_CMAKE:
 | 
			
		||||
        build_libs_cmd += ['--dont-rerun-cmake']
 | 
			
		||||
 | 
			
		||||
    my_env["BUILD_TORCH"] = "ON"
 | 
			
		||||
    my_env["BUILD_PYTHON"] = "ON"
 | 
			
		||||
    my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF"
 | 
			
		||||
    my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF"
 | 
			
		||||
    my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF"
 | 
			
		||||
    my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF"
 | 
			
		||||
    my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF"
 | 
			
		||||
    my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF"
 | 
			
		||||
    my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
 | 
			
		||||
    my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
 | 
			
		||||
    my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
 | 
			
		||||
    my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"
 | 
			
		||||
 | 
			
		||||
    if VERBOSE_SCRIPT:
 | 
			
		||||
        my_env['VERBOSE_SCRIPT'] = '1'
 | 
			
		||||
    try:
 | 
			
		||||
        os.mkdir('build')
 | 
			
		||||
    except OSError:
 | 
			
		||||
@ -660,6 +552,16 @@ class build_ext(build_ext_parent):
 | 
			
		||||
        return outputs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# this is a subclass of build just to get access to self.build_lib
 | 
			
		||||
# as there does not seem to be a utility function getting this
 | 
			
		||||
class create_pyi(distutils.command.build.build):
 | 
			
		||||
    def run(self):
 | 
			
		||||
        print("-- Building .pyi --")
 | 
			
		||||
        if sys.version_info[0] == 3:
 | 
			
		||||
            from tools.pyi.gen_pyi import gen_pyi
 | 
			
		||||
            gen_pyi(self.build_lib)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class build(distutils.command.build.build):
 | 
			
		||||
    sub_commands = [
 | 
			
		||||
        ('build_deps', lambda self: True),
 | 
			
		||||
@ -914,6 +816,7 @@ if USE_CUDA:
 | 
			
		||||
 | 
			
		||||
cmdclass = {
 | 
			
		||||
    'create_version_file': create_version_file,
 | 
			
		||||
    'create_pyi': create_pyi,
 | 
			
		||||
    'build': build,
 | 
			
		||||
    'build_py': build_py,
 | 
			
		||||
    'build_ext': build_ext,
 | 
			
		||||
@ -946,6 +849,7 @@ if __name__ == '__main__':
 | 
			
		||||
        entry_points=entry_points,
 | 
			
		||||
        package_data={
 | 
			
		||||
            'torch': [
 | 
			
		||||
                '__init__.pyi',
 | 
			
		||||
                'lib/*.so*',
 | 
			
		||||
                'lib/*.dylib*',
 | 
			
		||||
                'lib/*.dll',
 | 
			
		||||
 | 
			
		||||
@ -458,6 +458,10 @@ method_tests = [
 | 
			
		||||
     NO_ARGS, [skipIfNoLapack]),
 | 
			
		||||
    ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2",
 | 
			
		||||
     NO_ARGS, [skipIfNoLapack]),
 | 
			
		||||
    ('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"),
 | 
			
		||||
    ('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"),
 | 
			
		||||
    ('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"),
 | 
			
		||||
    ('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"),
 | 
			
		||||
    ('addcmul', (S, S), ((S, S), (S, S))),
 | 
			
		||||
    ('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'),
 | 
			
		||||
    ('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'),
 | 
			
		||||
@ -560,8 +564,14 @@ method_tests = [
 | 
			
		||||
    ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
 | 
			
		||||
    ('tril', (M, M), NO_ARGS),
 | 
			
		||||
    ('tril', (M, M), (2,), 'idx'),
 | 
			
		||||
    ('tril', (S, M, M), NO_ARGS, 'batched'),
 | 
			
		||||
    ('tril', (S, M, M), (2,), 'batched_idx'),
 | 
			
		||||
    ('tril', (3, 3, S, S), NO_ARGS, 'more_batched'),
 | 
			
		||||
    ('triu', (M, M), NO_ARGS),
 | 
			
		||||
    ('triu', (M, M), (2,), 'idx'),
 | 
			
		||||
    ('triu', (S, M, M), NO_ARGS, 'batched'),
 | 
			
		||||
    ('triu', (S, M, M), (2,), 'batched_idx'),
 | 
			
		||||
    ('triu', (3, 3, S, S), NO_ARGS, 'more_batched'),
 | 
			
		||||
    ('trace', (M, M), NO_ARGS),
 | 
			
		||||
    ('cross', (S, 3), ((S, 3),)),
 | 
			
		||||
    ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'),
 | 
			
		||||
 | 
			
		||||
@ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs):
 | 
			
		||||
        return torch.stack(all_matrices).reshape(*(batches + (l, l)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def brute_pdist(inp, p=2):
 | 
			
		||||
    """Computes the same as torch.pdist using primitives"""
 | 
			
		||||
    n = inp.shape[-2]
 | 
			
		||||
    k = n * (n - 1) // 2
 | 
			
		||||
    if k == 0:
 | 
			
		||||
        # torch complains about empty indices
 | 
			
		||||
        return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device)
 | 
			
		||||
    square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1)
 | 
			
		||||
    unroll = square.view(square.shape[:-2] + (n * n,))
 | 
			
		||||
    inds = torch.ones(k, dtype=torch.int)
 | 
			
		||||
    inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int)
 | 
			
		||||
    return unroll[..., inds.cumsum(0)]
 | 
			
		||||
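# Illustrative usage (not part of the test file): brute_pdist is meant to agree
# with torch.pdist on an ordinary 2-D input.
def _brute_pdist_example():
    x = torch.randn(5, 3)
    assert torch.allclose(brute_pdist(x, p=2), torch.pdist(x, p=2), atol=1e-6)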
 | 
			
		||||
 | 
			
		||||
def do_test_dtypes(self, dtypes, layout, device):
 | 
			
		||||
    for dtype in dtypes:
 | 
			
		||||
        if dtype != torch.float16:
 | 
			
		||||
 | 
			
		||||
@ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) {
 | 
			
		||||
  ASSERT_EQ(batch[1].target, "2");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct DummyTensorDataset
 | 
			
		||||
    : datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> {
 | 
			
		||||
  Example<torch::Tensor, int> get(size_t index) override {
 | 
			
		||||
    const auto channels = static_cast<int64_t>(index);
 | 
			
		||||
    torch::Tensor tensor =
 | 
			
		||||
        (channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4});
 | 
			
		||||
    return {tensor, static_cast<int>(channels)};
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  torch::optional<size_t> size() const override {
 | 
			
		||||
    return 100;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
TEST(DataTest, NormalizeTransform) {
 | 
			
		||||
  auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1));
 | 
			
		||||
 | 
			
		||||
  // Works for zero (one implicit) channels
 | 
			
		||||
  std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0);
 | 
			
		||||
  ASSERT_EQ(output.size(), 1);
 | 
			
		||||
  // (1 - 0.5) / 0.1 = 5
 | 
			
		||||
  ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
 | 
			
		||||
  // Works for one explicit channel
 | 
			
		||||
  output = dataset.get_batch(1);
 | 
			
		||||
  ASSERT_EQ(output.size(), 1);
 | 
			
		||||
  ASSERT_EQ(output[0].data.size(0), 1);
 | 
			
		||||
  ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
 | 
			
		||||
  // Works for two channels with different moments
 | 
			
		||||
  dataset = DummyTensorDataset().map(
 | 
			
		||||
      transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2}));
 | 
			
		||||
  output = dataset.get_batch(2);
 | 
			
		||||
  ASSERT_EQ(output.size(), 1);
 | 
			
		||||
  ASSERT_EQ(output[0].data.size(0), 2);
 | 
			
		||||
  ASSERT_TRUE(output[0]
 | 
			
		||||
                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
 | 
			
		||||
                  .allclose(torch::ones({1, 4, 4}) * 5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
  ASSERT_TRUE(output[0]
 | 
			
		||||
                  .data.slice(/*dim=*/0, /*start=*/1)
 | 
			
		||||
                  .allclose(torch::ones({1, 4, 4}) * -2.5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
 | 
			
		||||
  // Works for three channels with one moment value
 | 
			
		||||
  dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2));
 | 
			
		||||
  output = dataset.get_batch(3);
 | 
			
		||||
  ASSERT_EQ(output.size(), 1);
 | 
			
		||||
  ASSERT_EQ(output[0].data.size(0), 3);
 | 
			
		||||
  ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
 | 
			
		||||
  // Works for three channels with different moments
 | 
			
		||||
  dataset = DummyTensorDataset().map(
 | 
			
		||||
      transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2}));
 | 
			
		||||
  output = dataset.get_batch(3);
 | 
			
		||||
  ASSERT_EQ(output.size(), 1);
 | 
			
		||||
  ASSERT_EQ(output[0].data.size(0), 3);
 | 
			
		||||
  ASSERT_TRUE(output[0]
 | 
			
		||||
                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
 | 
			
		||||
                  .allclose(torch::ones({1, 4, 4}) * 5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
  ASSERT_TRUE(output[0]
 | 
			
		||||
                  .data.slice(/*dim=*/0, /*start=*/1, /*end=*/2)
 | 
			
		||||
                  .allclose(torch::ones({1, 4, 4}) * -2.5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
  ASSERT_TRUE(output[0]
 | 
			
		||||
                  .data.slice(/*dim=*/0, /*start=*/2)
 | 
			
		||||
                  .allclose(torch::ones({1, 4, 4}) * 12.5))
 | 
			
		||||
      << output[0].data;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> {
 | 
			
		||||
  UnCopyableDataset() = default;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) {
 | 
			
		||||
TEST_F(ModuleTest, ZeroGrad) {
 | 
			
		||||
  Linear module(3, 4);
 | 
			
		||||
  auto weight = torch::ones({8, 3}, torch::requires_grad());
 | 
			
		||||
  auto loss = module->forward(weight).sum();
 | 
			
		||||
  auto loss = module(weight).sum();
 | 
			
		||||
  loss.backward();
 | 
			
		||||
  for (auto& parameter : module->parameters()) {
 | 
			
		||||
    auto grad = parameter.grad();
 | 
			
		||||
@ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) {
 | 
			
		||||
    ASSERT_NO_THROW(module->modules());
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct ModuleWithNonTensorForwardImpl : torch::nn::Module {
 | 
			
		||||
  int64_t forward(torch::Tensor x) {
 | 
			
		||||
    return x.numel();
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
TORCH_MODULE(ModuleWithNonTensorForward);
 | 
			
		||||
 | 
			
		||||
TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) {
 | 
			
		||||
  ModuleWithNonTensorForward m;
 | 
			
		||||
  ASSERT_EQ(m(torch::ones(123)), 123);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {};
 | 
			
		||||
TEST_F(ModulesTest, Conv1d) {
 | 
			
		||||
  Conv1d model(Conv1dOptions(3, 2, 3).stride(2));
 | 
			
		||||
  auto x = torch::randn({2, 3, 5}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) {
 | 
			
		||||
TEST_F(ModulesTest, Conv2dEven) {
 | 
			
		||||
  Conv2d model(Conv2dOptions(3, 2, 3).stride(2));
 | 
			
		||||
  auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) {
 | 
			
		||||
TEST_F(ModulesTest, Conv2dUneven) {
 | 
			
		||||
  Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2}));
 | 
			
		||||
  auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) {
 | 
			
		||||
TEST_F(ModulesTest, Conv3d) {
 | 
			
		||||
  Conv3d model(Conv3dOptions(3, 2, 3).stride(2));
 | 
			
		||||
  auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) {
 | 
			
		||||
TEST_F(ModulesTest, Linear) {
 | 
			
		||||
  Linear model(5, 2);
 | 
			
		||||
  auto x = torch::randn({10, 5}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) {
 | 
			
		||||
  auto l3 = model->add(Linear(5, 100), "l3");
 | 
			
		||||
 | 
			
		||||
  auto x = torch::randn({1000, 10}, torch::requires_grad());
 | 
			
		||||
  x = l1->forward(x).clamp_min(0);
 | 
			
		||||
  x = l2->forward(x).clamp_min(0);
 | 
			
		||||
  x = l3->forward(x).clamp_min(0);
 | 
			
		||||
  x = l1(x).clamp_min(0);
 | 
			
		||||
  x = l2(x).clamp_min(0);
 | 
			
		||||
  x = l3(x).clamp_min(0);
 | 
			
		||||
 | 
			
		||||
  x.backward();
 | 
			
		||||
  ASSERT_EQ(x.ndimension(), 2);
 | 
			
		||||
@ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
 | 
			
		||||
  // Cannot get gradients to change indices (input) - only for embedding
 | 
			
		||||
  // params
 | 
			
		||||
  auto x = torch::full({10}, dict_size - 1, torch::kInt64);
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
 | 
			
		||||
TEST_F(ModulesTest, EmbeddingList) {
 | 
			
		||||
  Embedding model(6, 4);
 | 
			
		||||
  auto x = torch::full({2, 3}, 5, torch::kInt64);
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) {
 | 
			
		||||
TEST_F(ModulesTest, Dropout) {
 | 
			
		||||
  Dropout dropout(0.5);
 | 
			
		||||
  torch::Tensor x = torch::ones(100, torch::requires_grad());
 | 
			
		||||
  torch::Tensor y = dropout->forward(x);
 | 
			
		||||
  torch::Tensor y = dropout(x);
 | 
			
		||||
 | 
			
		||||
  y.backward();
 | 
			
		||||
  ASSERT_EQ(y.ndimension(), 1);
 | 
			
		||||
@ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) {
 | 
			
		||||
  ASSERT_GT(y.sum().item<float>(), 70); // Probably
 | 
			
		||||
 | 
			
		||||
  dropout->eval();
 | 
			
		||||
  y = dropout->forward(x);
 | 
			
		||||
  y = dropout(x);
 | 
			
		||||
  ASSERT_EQ(y.sum().item<float>(), 100);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) {
 | 
			
		||||
    was_called = true;
 | 
			
		||||
    return input;
 | 
			
		||||
  });
 | 
			
		||||
  auto output = functional->forward(torch::ones(5, torch::requires_grad()));
 | 
			
		||||
  auto output = functional(torch::ones(5, torch::requires_grad()));
 | 
			
		||||
  ASSERT_TRUE(was_called);
 | 
			
		||||
  ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad())));
 | 
			
		||||
 | 
			
		||||
@ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) {
 | 
			
		||||
  ASSERT_FALSE(bn->bias.defined());
 | 
			
		||||
 | 
			
		||||
  ASSERT_THROWS_WITH(
 | 
			
		||||
      bn->forward(torch::ones({2, 5})),
 | 
			
		||||
      bn(torch::ones({2, 5})),
 | 
			
		||||
      "Calling BatchNorm::forward is only permitted "
 | 
			
		||||
      "when the 'stateful' option is true (was false). "
 | 
			
		||||
      "Use BatchNorm::pure_forward instead.");
 | 
			
		||||
@ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) {
 | 
			
		||||
  model->to(torch::kCUDA);
 | 
			
		||||
  auto x =
 | 
			
		||||
      torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true));
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
@ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) {
 | 
			
		||||
  model->to(torch::kCUDA);
 | 
			
		||||
  model->to(torch::kCPU);
 | 
			
		||||
  auto x = torch::randn({10, 5}, torch::requires_grad());
 | 
			
		||||
  auto y = model->forward(x);
 | 
			
		||||
  auto y = model(x);
 | 
			
		||||
  torch::Tensor s = y.sum();
 | 
			
		||||
 | 
			
		||||
  s.backward();
 | 
			
		||||
 | 
			
		||||
@ -215,7 +215,9 @@ TEST(SerializeTest, Optim) {
 | 
			
		||||
TEST(SerializeTest, XOR_CUDA) {
 | 
			
		||||
  torch::manual_seed(0);
 | 
			
		||||
  // We better be able to save and load a XOR model!
 | 
			
		||||
  auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) {
 | 
			
		||||
  auto getLoss = [](Sequential model,
 | 
			
		||||
                    uint32_t batch_size,
 | 
			
		||||
                    bool is_cuda = false) {
 | 
			
		||||
    auto inputs = torch::empty({batch_size, 2});
 | 
			
		||||
    auto labels = torch::empty({batch_size});
 | 
			
		||||
    if (is_cuda) {
 | 
			
		||||
@ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) {
 | 
			
		||||
  loss = getLoss(model3, 100, true);
 | 
			
		||||
  ASSERT_LT(loss.item<float>(), 0.1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST(
 | 
			
		||||
    SerializeTest,
 | 
			
		||||
    CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) {
 | 
			
		||||
  struct C : torch::nn::Module {
 | 
			
		||||
    C() {
 | 
			
		||||
      register_buffer("foo", torch::ones(5, torch::kInt32));
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  struct B : torch::nn::Module {};
 | 
			
		||||
  struct A : torch::nn::Module {
 | 
			
		||||
    A() {
 | 
			
		||||
      register_module("b", std::make_shared<B>());
 | 
			
		||||
      register_module("c", std::make_shared<C>());
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  struct M : torch::nn::Module {
 | 
			
		||||
    M() {
 | 
			
		||||
      register_module("a", std::make_shared<A>());
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  auto out = std::make_shared<M>();
 | 
			
		||||
  std::stringstream ss;
 | 
			
		||||
  torch::save(out, ss);
 | 
			
		||||
  auto in = std::make_shared<M>();
 | 
			
		||||
  torch::load(in, ss);
 | 
			
		||||
 | 
			
		||||
  const int output = in->named_buffers()["a.c.foo"].sum().item<int>();
 | 
			
		||||
  ASSERT_EQ(output, 5);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) {
 | 
			
		||||
  ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct A : torch::nn::Module {
 | 
			
		||||
  int forward() {
 | 
			
		||||
    return 5;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct B : torch::nn::Module {
 | 
			
		||||
  std::string forward(torch::Tensor tensor) {
 | 
			
		||||
    return "";
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct C : torch::nn::Module {
 | 
			
		||||
  float forward(torch::Tensor& tensor) {
 | 
			
		||||
    return 5.0;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct D : torch::nn::Module {
 | 
			
		||||
  char forward(torch::Tensor&& tensor) {
 | 
			
		||||
    return 'x';
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct E : torch::nn::Module {};
 | 
			
		||||
 | 
			
		||||
// Put in a function because macros don't handle the comma between arguments to
 | 
			
		||||
// is_same well ...
 | 
			
		||||
template <typename Module, typename ExpectedType, typename... Args>
 | 
			
		||||
void assert_has_expected_type() {
 | 
			
		||||
  using ReturnType =
 | 
			
		||||
      typename torch::detail::return_type_of_forward<Module, Args...>::type;
 | 
			
		||||
  constexpr bool is_expected_type =
 | 
			
		||||
      std::is_same<ReturnType, ExpectedType>::value;
 | 
			
		||||
  ASSERT_TRUE(is_expected_type) << Module().name();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST(TestStatic, ReturnTypeOfForward) {
 | 
			
		||||
  assert_has_expected_type<A, int>();
 | 
			
		||||
  assert_has_expected_type<B, std::string, torch::Tensor>();
 | 
			
		||||
  assert_has_expected_type<C, float, torch::Tensor&>();
 | 
			
		||||
  assert_has_expected_type<D, char, torch::Tensor&&>();
 | 
			
		||||
  assert_has_expected_type<E, void>();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST(TestStatic, Apply) {
 | 
			
		||||
  std::vector<int> v;
 | 
			
		||||
  torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5);
 | 
			
		||||
 | 
			
		||||
@ -10,12 +10,13 @@ graph(%x.1_data : Tensor
 | 
			
		||||
  %x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims)
 | 
			
		||||
    block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) {
 | 
			
		||||
      %16 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
      %alpha : float = prim::TensorToNum(%16)
 | 
			
		||||
      %alpha : float = prim::Float(%16)
 | 
			
		||||
      %data.1 : Tensor = aten::add(%5_data, %y_data, %alpha)
 | 
			
		||||
      %mask : Tensor = aten::mul(%5_mask, %y_mask)
 | 
			
		||||
      %dims : Tensor = aten::__or__(%5_dims, %y_dims)
 | 
			
		||||
      %data : Tensor = aten::where(%mask, %data.1, %5_data)
 | 
			
		||||
      -> (%7, %data, %mask, %dims)
 | 
			
		||||
    }
 | 
			
		||||
  return (%x, %10, %11);
 | 
			
		||||
  %22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11)
 | 
			
		||||
  return (%22);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -7,33 +7,31 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %6 : int = prim::Constant[value=1]()
 | 
			
		||||
  %7 : Tensor = aten::gt(%a.1_data, %b_data)
 | 
			
		||||
  %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %10 : bool = prim::TensorToBool(%7)
 | 
			
		||||
  %11 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha.1 : float = prim::TensorToNum(%11)
 | 
			
		||||
  %9 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha.1 : float = prim::Float(%9)
 | 
			
		||||
  %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
 | 
			
		||||
  %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %16 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::TensorToNum(%16)
 | 
			
		||||
  %14 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::Float(%14)
 | 
			
		||||
  %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
 | 
			
		||||
  %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %21 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %22 : int = prim::Constant[value=1]()
 | 
			
		||||
  %23 : Tensor = aten::type_as(%8, %7)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%7, %23)
 | 
			
		||||
  %25 : int = aten::dim(%data.2)
 | 
			
		||||
  %26 : bool = aten::eq(%25, %22)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
 | 
			
		||||
  %19 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %20 : int = prim::Constant[value=1]()
 | 
			
		||||
  %21 : Tensor = aten::type_as(%8, %7)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%7, %21)
 | 
			
		||||
  %23 : int = aten::dim(%data.2)
 | 
			
		||||
  %24 : bool = aten::eq(%23, %20)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%24)
 | 
			
		||||
    block0() {
 | 
			
		||||
      %29 : int = aten::dim(%data.1)
 | 
			
		||||
      %30 : int = aten::sub(%29, %22)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
 | 
			
		||||
        block0(%32 : int, %33 : Tensor) {
 | 
			
		||||
          %34 : int = aten::dim(%33)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%33, %34)
 | 
			
		||||
          -> (%21, %data.3)
 | 
			
		||||
      %27 : int = aten::dim(%data.1)
 | 
			
		||||
      %28 : int = aten::sub(%27, %20)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%28, %19, %data.2)
 | 
			
		||||
        block0(%30 : int, %31 : Tensor) {
 | 
			
		||||
          %32 : int = aten::dim(%31)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%31, %32)
 | 
			
		||||
          -> (%19, %data.3)
 | 
			
		||||
        }
 | 
			
		||||
      %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
 | 
			
		||||
      %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
 | 
			
		||||
@ -45,5 +43,6 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
 | 
			
		||||
  %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
 | 
			
		||||
  %res_dims : Tensor = aten::__or__(%dims.1, %dims)
 | 
			
		||||
  return (%res_data, %res_mask, %res_dims);
 | 
			
		||||
  %39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
 | 
			
		||||
  return (%39);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -7,34 +7,33 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %6 : int = prim::Constant[value=1]()
 | 
			
		||||
  %7 : float = prim::Constant[value=0.1]()
 | 
			
		||||
  %8 : Float() = prim::NumToTensor(%7)
 | 
			
		||||
  %other : float = prim::TensorToNum(%8)
 | 
			
		||||
  %other : float = prim::Float(%8)
 | 
			
		||||
  %10 : Tensor = aten::gt(%a.1_data, %other)
 | 
			
		||||
  %11 : bool = prim::TensorToBool(%10)
 | 
			
		||||
  %12 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha.1 : float = prim::TensorToNum(%12)
 | 
			
		||||
  %11 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha.1 : float = prim::Float(%11)
 | 
			
		||||
  %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
 | 
			
		||||
  %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %17 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::TensorToNum(%17)
 | 
			
		||||
  %16 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::Float(%16)
 | 
			
		||||
  %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
 | 
			
		||||
  %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %22 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %23 : int = prim::Constant[value=1]()
 | 
			
		||||
  %24 : Tensor = aten::type_as(%a.1_mask, %10)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%10, %24)
 | 
			
		||||
  %26 : int = aten::dim(%data.2)
 | 
			
		||||
  %27 : bool = aten::eq(%26, %23)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%27)
 | 
			
		||||
  %21 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %22 : int = prim::Constant[value=1]()
 | 
			
		||||
  %23 : Tensor = aten::type_as(%a.1_mask, %10)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%10, %23)
 | 
			
		||||
  %25 : int = aten::dim(%data.2)
 | 
			
		||||
  %26 : bool = aten::eq(%25, %22)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
 | 
			
		||||
    block0() {
 | 
			
		||||
      %30 : int = aten::dim(%data.1)
 | 
			
		||||
      %31 : int = aten::sub(%30, %23)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%31, %22, %data.2)
 | 
			
		||||
        block0(%33 : int, %34 : Tensor) {
 | 
			
		||||
          %35 : int = aten::dim(%34)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%34, %35)
 | 
			
		||||
          -> (%22, %data.3)
 | 
			
		||||
      %29 : int = aten::dim(%data.1)
 | 
			
		||||
      %30 : int = aten::sub(%29, %22)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
 | 
			
		||||
        block0(%32 : int, %33 : Tensor) {
 | 
			
		||||
          %34 : int = aten::dim(%33)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%33, %34)
 | 
			
		||||
          -> (%21, %data.3)
 | 
			
		||||
        }
 | 
			
		||||
      %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
 | 
			
		||||
      %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
 | 
			
		||||
@ -46,5 +45,6 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
 | 
			
		||||
  %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
 | 
			
		||||
  %res_dims : Tensor = aten::__or__(%dims.1, %dims)
 | 
			
		||||
  return (%res_data, %res_mask, %res_dims);
 | 
			
		||||
  %41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
 | 
			
		||||
  return (%41);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -7,28 +7,26 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %6 : int = prim::Constant[value=1]()
 | 
			
		||||
  %7 : Tensor = aten::gt(%a.1_data, %b_data)
 | 
			
		||||
  %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %10 : bool = prim::TensorToBool(%7)
 | 
			
		||||
  %11 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::TensorToNum(%11)
 | 
			
		||||
  %9 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::Float(%9)
 | 
			
		||||
  %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
 | 
			
		||||
  %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %16 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %17 : int = prim::Constant[value=1]()
 | 
			
		||||
  %18 : Tensor = aten::type_as(%8, %7)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%7, %18)
 | 
			
		||||
  %20 : int = aten::dim(%data.2)
 | 
			
		||||
  %21 : bool = aten::eq(%20, %17)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
 | 
			
		||||
  %14 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %15 : int = prim::Constant[value=1]()
 | 
			
		||||
  %16 : Tensor = aten::type_as(%8, %7)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%7, %16)
 | 
			
		||||
  %18 : int = aten::dim(%data.2)
 | 
			
		||||
  %19 : bool = aten::eq(%18, %15)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%19)
 | 
			
		||||
    block0() {
 | 
			
		||||
      %24 : int = aten::dim(%data)
 | 
			
		||||
      %25 : int = aten::sub(%24, %17)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
 | 
			
		||||
        block0(%27 : int, %28 : Tensor) {
 | 
			
		||||
          %29 : int = aten::dim(%28)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%28, %29)
 | 
			
		||||
          -> (%16, %data.3)
 | 
			
		||||
      %22 : int = aten::dim(%data)
 | 
			
		||||
      %23 : int = aten::sub(%22, %15)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%23, %14, %data.2)
 | 
			
		||||
        block0(%25 : int, %26 : Tensor) {
 | 
			
		||||
          %27 : int = aten::dim(%26)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%26, %27)
 | 
			
		||||
          -> (%14, %data.3)
 | 
			
		||||
        }
 | 
			
		||||
      %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
 | 
			
		||||
      %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
 | 
			
		||||
@ -40,5 +38,6 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
 | 
			
		||||
  %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
 | 
			
		||||
  %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
 | 
			
		||||
  return (%res_data, %res_mask, %res_dims);
 | 
			
		||||
  %34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
 | 
			
		||||
  return (%34);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -7,29 +7,28 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %6 : int = prim::Constant[value=1]()
 | 
			
		||||
  %7 : float = prim::Constant[value=0.1]()
 | 
			
		||||
  %8 : Float() = prim::NumToTensor(%7)
 | 
			
		||||
  %other : float = prim::TensorToNum(%8)
 | 
			
		||||
  %other : float = prim::Float(%8)
 | 
			
		||||
  %10 : Tensor = aten::gt(%a.1_data, %other)
 | 
			
		||||
  %11 : bool = prim::TensorToBool(%10)
 | 
			
		||||
  %12 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::TensorToNum(%12)
 | 
			
		||||
  %11 : Long() = prim::NumToTensor(%6)
 | 
			
		||||
  %alpha : float = prim::Float(%11)
 | 
			
		||||
  %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
 | 
			
		||||
  %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
 | 
			
		||||
  %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
 | 
			
		||||
  %17 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %18 : int = prim::Constant[value=1]()
 | 
			
		||||
  %19 : Tensor = aten::type_as(%a.1_mask, %10)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%10, %19)
 | 
			
		||||
  %21 : int = aten::dim(%data.2)
 | 
			
		||||
  %22 : bool = aten::eq(%21, %18)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%22)
 | 
			
		||||
  %16 : bool = prim::Constant[value=1]()
 | 
			
		||||
  %17 : int = prim::Constant[value=1]()
 | 
			
		||||
  %18 : Tensor = aten::type_as(%a.1_mask, %10)
 | 
			
		||||
  %data.2 : Tensor = aten::mul(%10, %18)
 | 
			
		||||
  %20 : int = aten::dim(%data.2)
 | 
			
		||||
  %21 : bool = aten::eq(%20, %17)
 | 
			
		||||
  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
 | 
			
		||||
    block0() {
 | 
			
		||||
      %25 : int = aten::dim(%data)
 | 
			
		||||
      %26 : int = aten::sub(%25, %18)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%26, %17, %data.2)
 | 
			
		||||
        block0(%28 : int, %29 : Tensor) {
 | 
			
		||||
          %30 : int = aten::dim(%29)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%29, %30)
 | 
			
		||||
          -> (%17, %data.3)
 | 
			
		||||
      %24 : int = aten::dim(%data)
 | 
			
		||||
      %25 : int = aten::sub(%24, %17)
 | 
			
		||||
      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
 | 
			
		||||
        block0(%27 : int, %28 : Tensor) {
 | 
			
		||||
          %29 : int = aten::dim(%28)
 | 
			
		||||
          %data.3 : Tensor = aten::unsqueeze(%28, %29)
 | 
			
		||||
          -> (%16, %data.3)
 | 
			
		||||
        }
 | 
			
		||||
      %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
 | 
			
		||||
      %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
 | 
			
		||||
@ -41,5 +40,6 @@ graph(%a.1_data : Tensor
 | 
			
		||||
  %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
 | 
			
		||||
  %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
 | 
			
		||||
  %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
 | 
			
		||||
  return (%res_data, %res_mask, %res_dims);
 | 
			
		||||
  %36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
 | 
			
		||||
  return (%36);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -9,38 +9,35 @@ graph(%a.1_data : Tensor
  %8 : Tensor = aten::gt(%a.1_data, %b_data)
  %9 : Tensor = aten::mul(%a.1_mask, %b_mask)
  %10 : Tensor = aten::__or__(%a.1_dims, %b_dims)
  %11 : bool = prim::TensorToBool(%8)
  %12 : int = prim::Constant[value=0]()
  %13 : Tensor = aten::mul(%8, %9)
  %14 : Tensor = aten::sum(%13)
  %15 : Tensor = aten::gt(%14, %12)
  %16 : bool = prim::TensorToBool(%15)
  %17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims)
    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
      %30 : Long() = prim::NumToTensor(%6)
      %alpha : float = prim::TensorToNum(%30)
  %11 : int = prim::Constant[value=0]()
  %12 : Tensor = aten::mul(%8, %9)
  %13 : Tensor = aten::sum(%12)
  %14 : Tensor = aten::gt(%13, %11)
  %15 : bool = prim::Bool(%14)
  %16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims)
    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
      %27 : Long() = prim::NumToTensor(%6)
      %alpha : float = prim::Float(%27)
      %data : Tensor = aten::sub(%6_data, %b_data, %alpha)
      %mask : Tensor = aten::mul(%6_mask, %b_mask)
      %dims : Tensor = aten::__or__(%6_dims, %b_dims)
      %35 : Tensor = aten::gt(%data, %b_data)
      %36 : Tensor = aten::mul(%mask, %b_mask)
      %37 : Tensor = aten::__or__(%dims, %b_dims)
      %38 : bool = prim::TensorToBool(%35)
      %39 : bool = prim::Constant[value=1]()
      %40 : int = prim::Constant[value=1]()
      %41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
      %data.2 : Tensor = aten::mul(%cond_data.2, %41)
      %43 : int = aten::dim(%data.2)
      %44 : bool = aten::eq(%43, %40)
      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%44)
      %32 : Tensor = aten::gt(%data, %b_data)
      %33 : Tensor = aten::mul(%mask, %b_mask)
      %34 : bool = prim::Constant[value=1]()
      %35 : int = prim::Constant[value=1]()
      %36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
      %data.2 : Tensor = aten::mul(%cond_data.2, %36)
      %38 : int = aten::dim(%data.2)
      %39 : bool = aten::eq(%38, %35)
      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%39)
        block0() {
          %47 : int = aten::dim(%data)
          %48 : int = aten::sub(%47, %40)
          %data.4 : Tensor = prim::Loop(%48, %39, %data.2)
            block0(%50 : int, %51 : Tensor) {
              %52 : int = aten::dim(%51)
              %data.3 : Tensor = aten::unsqueeze(%51, %52)
              -> (%39, %data.3)
          %42 : int = aten::dim(%data)
          %43 : int = aten::sub(%42, %35)
          %data.4 : Tensor = prim::Loop(%43, %34, %data.2)
            block0(%45 : int, %46 : Tensor) {
              %47 : int = aten::dim(%46)
              %data.3 : Tensor = aten::unsqueeze(%46, %47)
              -> (%34, %data.3)
            }
          %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
          %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@ -52,12 +49,13 @@ graph(%a.1_data : Tensor
      %res_data : Tensor = aten::where(%cond_data, %data, %6_data)
      %res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask)
      %res_dims : Tensor = aten::__or__(%dims, %6_dims)
      %59 : int = prim::Constant[value=0]()
      %60 : Tensor = aten::mul(%35, %36)
      %61 : Tensor = aten::sum(%60)
      %62 : Tensor = aten::gt(%61, %59)
      %63 : bool = prim::TensorToBool(%62)
      -> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims)
      %54 : int = prim::Constant[value=0]()
      %55 : Tensor = aten::mul(%32, %33)
      %56 : Tensor = aten::sum(%55)
      %57 : Tensor = aten::gt(%56, %54)
      %58 : bool = prim::Bool(%57)
      -> (%58, %32, %33, %res_data, %res_mask, %res_dims)
    }
  return (%a, %21, %22);
  %59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20)
  return (%59);
}

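The other recurring change in these expect files is that graphs now pack their outputs with prim::TupleConstruct and return a single tuple value instead of returning several values directly. A minimal sketch (not part of this diff, assuming a recent PyTorch) that reproduces this shape of graph:

import torch

@torch.jit.script
def masked_update(data, mask):
    new_data = data + 1.0
    new_mask = mask * mask
    # a multi-value return is packed by a prim::TupleConstruct node in the graph
    return new_data, new_mask

print(masked_update.graph)
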
Some files were not shown because too many files have changed in this diff.