Mirror of https://github.com/pytorch/pytorch.git
Compare commits: v2.2.2-rc1 ... v1.0.1 (101 commits)
@@ -1,14 +1,14 @@
 # IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
 # in this file to the new version number, and **ALSO** update the version number below:
-# PyTorchDockerVersion:262
-# Caffe2DockerVersion:230
+# PyTorchDockerVersion:282
+# Caffe2DockerVersion:238

 docker_config_defaults: &docker_config_defaults
   user: jenkins
   aws_auth:
     # This IAM user only allows read-write access to ECR
-    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
-    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
+    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
+    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}

 # NOTE: We only perform the merge in build step and not in test step, because
 # all source files will be shared from build to test
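The comment block above spells out the image-bump procedure in words. As an illustration only (not part of the config, and assuming GNU sed; the config path is an assumption), the ":262" to ":282" retagging in this diff could be scripted roughly like this:

```bash
# Hypothetical helper for the "search and update :{previous_version}" step described above.
# Assumes GNU sed; the file path is an assumption for illustration.
old=262
new=282
sed -i "s/:${old}\"/:${new}\"/g" .circleci/config.yml            # bump every DOCKER_IMAGE tag
sed -i "s/PyTorchDockerVersion:${old}/PyTorchDockerVersion:${new}/" .circleci/config.yml
```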
@@ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client
     sudo apt-get -qq update
     sudo apt-get -qq install openssh-client git

+install_doc_push_script: &install_doc_push_script
+  name: Install the doc push script
+  no_output_timeout: "2m"
+  command: |
+    cat >/home/circleci/project/doc_push_script.sh <<EOL
+    # =================== The following code **should** be executed inside Docker container ===================
+
+    # This is where the local pytorch install in the docker image is located
+    pt_checkout="/var/lib/jenkins/workspace"
+
+    # Since we're cat-ing this file, we need to escape all $'s
+    echo "doc_push_script.sh: Invoked with \$*"
+
+    git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
+    pushd pytorch.github.io
+
+    set -ex
+
+    # Argument 1: Where to copy the built documentation to
+    # (pytorch.github.io/$install_path)
+    install_path="\$1"
+    if [ -z "\$install_path" ]; then
+    echo "error: doc_push_script.sh: install_path (arg1) not specified"
+      exit 1
+    fi
+
+    # Argument 2: What version of the docs we are building.
+    version="\$2"
+    if [ -z "\$version" ]; then
+    echo "error: doc_push_script.sh: version (arg2) not specified"
+      exit 1
+    fi
+
+    is_master_doc=false
+    if [ "\$version" == "master" ]; then
+      is_master_doc=true
+    fi
+
+    # Argument 3: (optional) If present, we will NOT do any pushing. Used for testing.
+    dry_run=false
+    if [ "\$3" != "" ]; then
+      dry_run=true
+    fi
+
+    echo "install_path: \$install_path  version: \$version  dry_run: \$dry_run"
+
+    export LC_ALL=C
+    export PATH=/opt/conda/bin:$PATH
+
+    rm -rf pytorch || true
+
+    # Get all the documentation sources, put them in one place
+    pushd "\$pt_checkout"
+    git clone https://github.com/pytorch/vision
+    pushd vision
+    conda install -q pillow
+    time python setup.py install
+    popd
+    pushd docs
+    rm -rf source/torchvision
+    cp -r ../vision/docs/source source/torchvision
+
+    # Build the docs
+    pip -q install -r requirements.txt || true
+    if [ "\$is_master_doc" = true ]; then
+      make html
+    else
+      make html-stable
+    fi
+
+    # Move them into the docs repo
+    popd
+    popd
+    git rm -rf "\$install_path" || true
+    mv "\$pt_checkout/docs/build/html" "\$install_path"
+
+    # Add the version handler by search and replace.
+    # XXX: Consider moving this to the docs Makefile or site build
+    if [ "\$is_master_doc" = true ]; then
+      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
+    else
+      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g"
+    fi
+
+    git add "\$install_path" || true
+    git status
+    git config user.email "soumith+bot@pytorch.org"
+    git config user.name "pytorchbot"
+    # If there aren't changes, don't make a commit; push is no-op
+    git commit -m "auto-generating sphinx docs" || true
+    git status
+
+    if [ "\$dry_run" = false ]; then
+      echo "Pushing to pytorch.github.io:site"
+      git push origin site
+    else
+      echo "Skipping push due to dry_run"
+    fi
+
+    popd
+    # =================== The above code **should** be executed inside Docker container ===================
+    EOL
+    chmod +x /home/circleci/project/doc_push_script.sh
+
 setup_ci_environment: &setup_ci_environment
   name: Set Up CI Environment
   no_output_timeout: "1h"
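For reference, the doc_push_script.sh generated above takes three positional arguments: the path to install the built docs into, the docs version being built, and an optional third argument that turns the run into a dry run. The invocations below are the ones used by the pytorch_doc_push job later in this diff:

```bash
# Usage of the generated script inside the Docker container (taken from the doc-push job below):
./doc_push_script.sh docs/master master          # build master docs and push them
./doc_push_script.sh docs/stable 1.0.1           # build stable (v1.0.1) docs and push them
./doc_push_script.sh docs/master master dry_run  # build only; skip the push (used for open PRs)
```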
@@ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment
       echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env

       # This IAM user allows write access to S3 bucket for sccache
-      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
-      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
+      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
+      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
     fi

     # This IAM user only allows read-write access to ECR
-    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
-    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
+    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
+    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}
     eval $(aws ecr get-login --region us-east-1 --no-include-email)

 pytorch_linux_build_defaults: &pytorch_linux_build_defaults
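The `eval $(aws ecr get-login ...)` line relies on AWS CLI v1 behavior: `get-login` prints a complete `docker login` command, which the surrounding eval then executes. Roughly (token elided, registry URL shown only as an example matching the DOCKER_IMAGE values in this file):

```bash
# Approximate output of `aws ecr get-login --region us-east-1 --no-include-email` (AWS CLI v1);
# the authorization token is elided and the registry URL is illustrative.
docker login -u AWS -p <authorization-token> https://308535385114.dkr.ecr.us-east-1.amazonaws.com
```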
@@ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
       <<: *setup_ci_environment
   - run:
       name: Test
-      no_output_timeout: "90m"
+      no_output_timeout: "1h"
       command: |
         set -e
         export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
@@ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults

           export IN_CIRCLECI=1

-          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-          brew install moreutils --without-parallel
+          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+          # so we must unlink GNU `parallel` first, and relink it afterwards
+          brew unlink parallel
+          brew install moreutils
+          brew link parallel --overwrite
           brew install cmake
           brew install expect

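The unlink/relink sequence above replaces the `--without-parallel` install option, which newer Homebrew no longer accepts. A purely illustrative sanity check of the result (not part of the CI config) could be:

```bash
# Illustrative check, not part of the CI config: after relinking, the `parallel`
# on PATH should be GNU parallel rather than the moreutils wrapper.
parallel --version | head -n 1     # GNU parallel prints "GNU parallel <release>"
ls -l "$(command -v parallel)"     # shows which Homebrew keg the symlink points to
```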
@@ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
           export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2

           # This IAM user allows write access to S3 bucket for sccache
-          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

           export SCCACHE_BIN=${PWD}/sccache_bin
           mkdir -p ${SCCACHE_BIN}
@@ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
             sccache --show-stats
           fi

+##############################################################################
+##############################################################################
+# Job specifications
+##############################################################################
+##############################################################################
+
 version: 2
 jobs:
   pytorch_linux_trusty_py2_7_9_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py2_7_9_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py2_7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py2_7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_5_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_5_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc4_8_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc4_8_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc5_4_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc5_4_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_pynightly_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_pynightly_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_xenial_py3_clang5_asan_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
       PYTHON_VERSION: "3.6"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_xenial_py3_clang5_asan_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
       PYTHON_VERSION: "3.6"
     resource_class: large
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_build:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_build:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
+      BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3"
     <<: *pytorch_linux_build_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
       MULTI_GPU: "1"
     resource_class: gpu.large
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
@@ -517,7 +631,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py2_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
       PYTHON_VERSION: "2.7"
       CUDA_VERSION: "9"
     <<: *pytorch_linux_build_defaults
@@ -525,7 +639,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py2_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
       PYTHON_VERSION: "2.7"
       CUDA_VERSION: "9"
     resource_class: gpu.medium
@@ -534,7 +648,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py3_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9"
     <<: *pytorch_linux_build_defaults
@@ -542,7 +656,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py3_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9"
     resource_class: gpu.medium
@@ -551,7 +665,7 @@ jobs:
   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9.2"
     <<: *pytorch_linux_build_defaults
@@ -559,7 +673,7 @@ jobs:
   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9.2"
     resource_class: gpu.medium
@@ -568,7 +682,7 @@ jobs:
   pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "10"
     <<: *pytorch_linux_build_defaults
@@ -576,7 +690,7 @@ jobs:
   pytorch_short_perf_test_gpu:
     environment:
       JOB_BASE_NAME: pytorch-short-perf-test-gpu
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
@@ -597,8 +711,8 @@ jobs:

           docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env
           # This IAM user allows write access to S3 bucket for perf test numbers
-          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
-          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
+          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
+          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
           docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env

           export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
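The COMMAND construction above (also used by the doc-push job below) works by echoing a small script into an interactive `docker exec` session and capturing its output. A stripped-down sketch of the same pattern, with a hypothetical inner script name, is:

```bash
# Stripped-down sketch of the COMMAND pattern used in these jobs. "$id" is the container
# started earlier with `docker run -t -d`; some_ci_step.sh is a made-up name for illustration.
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "cd workspace && ./some_ci_step.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts   # unbuffer comes from expect, ts from moreutils
```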
@@ -607,7 +721,7 @@ jobs:
   pytorch_doc_push:
     environment:
       JOB_BASE_NAME: pytorch-doc-push
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
     resource_class: large
     machine:
       image: default
@@ -615,72 +729,39 @@ jobs:
     - run:
         <<: *setup_ci_environment
     - run:
-        name: Doc Push
+        <<: *install_doc_push_script
+    - run:
+        name: Doc Build and Push
         no_output_timeout: "1h"
         command: |
           set -e
-          if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
-            echo "Skipping doc push..."
-            exit 0
-          fi
           export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
           echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
           docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
           export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})

-          cat >/home/circleci/project/doc_push_script.sh <<EOL
-          # =================== The following code will be executed inside Docker container ===================
-          git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
-          pushd pytorch.github.io
-
-          set -ex
-
-          export LC_ALL=C
-          export PATH=/opt/conda/bin:$PATH
-
-          rm -rf pytorch || true
-
-          # Get all the documentation sources, put them in one place
-          # TODO: These clones can race
-          git clone https://github.com/pytorch/pytorch
-          pushd pytorch
-          git clone https://github.com/pytorch/vision
-          pushd vision
-          conda install -q pillow
-          time python setup.py install
-          popd
-          pushd docs
-          rm -rf source/torchvision
-          cp -r ../vision/docs/source source/torchvision
-
-          # Build the docs
-          pip -q install -r requirements.txt || true
-          make html
-
-          # Move them into the docs repo
-          popd
-          popd
-          git rm -rf docs/master || true
-          mv pytorch/docs/build/html docs/master
-          find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g'
-          git add docs/master || true
-          git status
-          git config user.email "soumith+bot@pytorch.org"
-          git config user.name "pytorchbot"
-          # If there aren't changes, don't make a commit; push is no-op
-          git commit -m "auto-generating sphinx docs" || true
-          git status
-          git push origin site
-
-          popd
-          # =================== The above code will be executed inside Docker container ===================
-          EOL
-          chmod +x /home/circleci/project/doc_push_script.sh
           docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh

-          export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          # master branch docs push
+          if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          # stable release docs push. We keep an eternal PR open for merging
+          # v1.0.1 -> master; everytime v1.0.1 is updated the following is run.
+          elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          # For open PRs: Do a dry_run of the docs build, don't push build
+          else
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          fi
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # Save the docs build so we can debug any problems
+          export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug
+          docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE}
+          docker push ${DEBUG_COMMIT_DOCKER_IMAGE}

   pytorch_macos_10_13_py3_build:
     macos:
       xcode: "9.0"
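Because the job commits and pushes a `-debug` image of the finished container, a failed docs build can be inspected afterwards. A hypothetical follow-up session (the tag below is made up; the real one is `${DOCKER_IMAGE}-${CIRCLE_SHA1}-debug`):

```bash
# Hypothetical debugging session against the image pushed by the doc-push job; the tag is illustrative only.
DEBUG_IMAGE="308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282-<sha1>-debug"
docker pull "${DEBUG_IMAGE}"
docker run -it -w /var/lib/jenkins "${DEBUG_IMAGE}" bash   # inspect the workspace and the docs build output
```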
@@ -696,8 +777,11 @@ jobs:
             set -e

             export IN_CIRCLECI=1
-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             # Install sccache
@@ -706,8 +790,8 @@ jobs:

             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
             # This IAM user allows write access to S3 bucket for sccache
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

             git submodule sync && git submodule update -q --init
             chmod a+x .jenkins/pytorch/macos-build.sh
@@ -740,8 +824,11 @@ jobs:
           command: |
             set -e
             export IN_CIRCLECI=1
-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project
@@ -765,8 +852,11 @@ jobs:

             export IN_CIRCLECI=1

-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             # Install CUDA 9.2
@@ -790,30 +880,13 @@ jobs:
             sudo chmod +x /usr/local/bin/sccache
             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
             # This IAM user allows write access to S3 bucket for sccache
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

             git submodule sync && git submodule update -q --init
             chmod a+x .jenkins/pytorch/macos-build.sh
             unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts

-  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
-    environment:
-      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
-      CUDA_VERSION: "8"
-      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
-    <<: *caffe2_linux_build_defaults
-
-  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
-    environment:
-      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
-      CUDA_VERSION: "8"
-      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
-    resource_class: gpu.medium
-    <<: *caffe2_linux_test_defaults
-
   caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
@@ -896,11 +969,20 @@ jobs:
   caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
+      CUDA_VERSION: "8"
       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
-      BUILD_ONLY: "1"
     <<: *caffe2_linux_build_defaults

+  caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
+    environment:
+      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
+      CUDA_VERSION: "8"
+      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
+    resource_class: gpu.medium
+    <<: *caffe2_linux_test_defaults
+
   caffe2_py2_gcc4_9_ubuntu14_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
@@ -1008,25 +1090,25 @@ workflows:
       - pytorch_linux_xenial_py3_clang5_asan_test:
          requires:
            - pytorch_linux_xenial_py3_clang5_asan_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_test:
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_short_perf_test_gpu:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_doc_push:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_linux_xenial_cuda9_cudnn7_py2_build
       - pytorch_linux_xenial_cuda9_cudnn7_py2_test:
          requires:
@@ -1047,10 +1129,6 @@ workflows:
            - pytorch_macos_10_13_py3_build
       - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build

-      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
-      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
-          requires:
-            - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
          requires:
@@ -1072,6 +1150,9 @@ workflows:
          requires:
            - caffe2_onnx_py2_gcc5_ubuntu16_04_build
       - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
+      - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
+          requires:
+            - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
       - caffe2_py2_clang3_8_ubuntu16_04_build
       - caffe2_py2_clang3_9_ubuntu16_04_build
       - caffe2_py2_clang7_ubuntu16_04_build
@@ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")

 if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
   CMAKE_ARGS+=("-DBLAS=MKL")
+  CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
 fi
 if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
   CMAKE_ARGS+=("-DUSE_CUDA=ON")
@@ -14,18 +14,8 @@ clang --version
 # symbolize=1: Gives us much better errors when things go wrong
 export ASAN_OPTIONS=detect_leaks=0:symbolize=1

-# FIXME: Remove the hardcoded "-pthread" option.
-# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
-# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
-# have the full pthread implementation. Other advanced pthread functions doesn't
-# exist in libasan.so[2]. If we need some pthread advanced functions, we still
-# need to link the pthread library.
-# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
-# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
-#
 # TODO: Make the ASAN flags a more unified env var
 CC="clang" CXX="clang++" LDSHARED="clang --shared" \
-  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
-  CXX_FLAGS="-pthread" \
+  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
   NO_CUDA=1 USE_MKLDNN=0 \
   python setup.py install
@@ -129,7 +129,7 @@ fi
 git add -f build/bin

 # Test documentation build
-if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
   pushd docs
   # TODO: Don't run this here
   pip install -q -r requirements.txt || true
@@ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
 fi

 # Test standalone c10 build
-if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
   mkdir -p c10/build
   pushd c10/build
   cmake ..
@@ -122,7 +122,7 @@ fi
 # Use conda cmake in some CI build. Conda cmake will be newer than our supported
 # min version 3.5, so we only do it in two builds that we know should use conda.
 if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \
+  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \
      [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then
     if ! which conda; then
       echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
@@ -5,9 +5,9 @@
 # in this file will report a failure (so you don't forget to
 # reenable the tests on merge ;)

-pytorch-linux-xenial-cuda8-cudnn6-py3-build
-pytorch-linux-xenial-cuda8-cudnn6-py3-test
-pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
+pytorch-linux-xenial-cuda8-cudnn7-py3-build
+pytorch-linux-xenial-cuda8-cudnn7-py3-test
+pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
 pytorch-linux-xenial-cuda9-cudnn7-py2-build
 pytorch-linux-xenial-cuda9-cudnn7-py2-test
 pytorch-linux-xenial-cuda9-cudnn7-py3-build
@@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
     sccache --show-stats
     sccache --zero-stats
     rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
+    for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
+      IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
+        del /S /Q %%i
+      )
+    )
     copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
   )

@@ -34,10 +34,4 @@ matrix:
         script: cd docs/cpp/source && ./check-doxygen.sh
       - env: CLANG_TIDY
         python: "3.6"
-        addons:
-          apt:
-            sources:
-              - ubuntu-toolchain-r-test
-              - llvm-toolchain-trusty
-            packages: clang-tidy
         script: tools/run-clang-tidy-in-ci.sh
@@ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
 	--generate-code arch=compute_50,code=sm_50
 	--generate-code arch=compute_60,code=sm_60
 	--generate-code arch=compute_70,code=sm_70)
+    elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
+      SET(CUFFT_FAKELINK_OPTIONS
+	--generate-code arch=compute_35,code=sm_35
+	--generate-code arch=compute_50,code=sm_50
+	--generate-code arch=compute_60,code=sm_60
+	--generate-code arch=compute_70,code=sm_70)
     else()
       MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
     endif()
@@ -2122,55 +2122,6 @@
     - arg: THTensor* self
     - arg: THTensor* tensor
 ]]
-[[
-  name: _th_tril
-  cname: tril
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THTensor* result
-      output: True
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_tril_
-  cname: tril
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_triu
-  cname: triu
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THTensor* result
-      output: True
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_triu_
-  cname: triu
-  variants:
-    - function
-  return: self
-  arguments:
-    - THTensor* self
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
 [[
   name: _th_cross
   cname: cross
@@ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) {
     reduce_dims.push_back(i);
   }
   for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
-    if (shape[i - leading_dims] == 1 && sizes[i] > 1) {
+    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
       reduce_dims.push_back(i);
     }
   }
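
The hunk above widens the reduction test from sizes[i] > 1 to sizes[i] != 1, so a zero-sized dimension being broadcast against a target size of 1 is now also summed away. A self-contained sketch of that dimension-selection logic (illustrative names only, not the ATen implementation):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative: pick the dims to reduce when summing `sizes` down to the
// broadcast target `shape`, mirroring the loop in the hunk above.
std::vector<int64_t> reduce_dims_for_sum_to(const std::vector<int64_t>& sizes,
                                            const std::vector<int64_t>& shape) {
  std::vector<int64_t> reduce_dims;
  const int64_t leading_dims = static_cast<int64_t>(sizes.size() - shape.size());
  for (int64_t i = 0; i < leading_dims; ++i) {
    reduce_dims.push_back(i);  // extra leading dims are always reduced
  }
  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
    // `!= 1` (rather than `> 1`) also catches size-0 dims broadcast against 1.
    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
      reduce_dims.push_back(i);
    }
  }
  return reduce_dims;
}

int main() {
  for (int64_t d : reduce_dims_for_sum_to({4, 0, 3}, {1, 3})) {
    std::cout << d << ' ';  // prints "0 1": the leading dim and the size-0 dim
  }
  std::cout << '\n';
}
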
@@ -81,6 +81,39 @@ inline void parallel_for(
 #endif
 }

+/*
+parallel_reduce
+
+begin: index at which to start applying reduction
+
+end: index at which to stop applying reduction
+
+grain_size: number of elements per chunk. impacts number of elements in
+intermediate results tensor and degree of parallelization.
+
+ident: identity for binary combination function sf. sf(ident, x) needs to return
+x.
+
+f: function for reduction over a chunk. f needs to be of signature scalar_t
+f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy)
+
+sf: function to combine two partial results. sf needs to be of signature
+scalar_t sf(scalar_t x, scalar_t y)
+
+For example, you might have a tensor of 10000 entires and want to sum together
+all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
+an intermediate result tensor with 4 elements. Then it will execute the function
+"f" you provide and pass the beginning and end index of these chunks, so
+0-24999, 2500-4999, etc. and the combination identity. It will then write out
+the result from each of these chunks into the intermediate result tensor. After
+that it'll reduce the partial results from each chunk into a single number using
+the combination function sf and the identity ident. For a total summation this
+would be "+" and 0 respectively. This is similar to tbb's approach [1], where
+you need to provide a function to accumulate a subrange, a function to combine
+two partial results and an identity.
+
+[1] https://software.intel.com/en-us/node/506154
+*/
 template <class scalar_t, class F, class SF>
 inline scalar_t parallel_reduce(
     const int64_t begin,
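
To make the documented contract concrete, here is a minimal, self-contained sketch of the same chunk-then-combine scheme. It is serial and purely illustrative (not ATen's implementation; all names are assumptions), but it follows the roles of f, sf, ident and grain_size described in the new comment:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative serial analogue: `f` reduces one chunk starting from `ident`,
// `sf` combines partial results, `ident` is the identity of `sf`.
template <class scalar_t, class F, class SF>
scalar_t chunked_reduce(int64_t begin, int64_t end, int64_t grain_size,
                        scalar_t ident, const F& f, const SF& sf) {
  std::vector<scalar_t> partials;
  for (int64_t b = begin; b < end; b += grain_size) {
    int64_t e = std::min(b + grain_size, end);
    partials.push_back(f(b, e, ident));     // one partial result per chunk
  }
  scalar_t result = ident;
  for (scalar_t p : partials) {
    result = sf(result, p);                 // combine the partials
  }
  return result;
}

int main() {
  std::vector<double> data(10000, 1.0);
  // Sum 10000 elements with grain_size 2500 -> 4 chunks; "+" and 0 as sf/ident.
  double total = chunked_reduce<double>(
      0, static_cast<int64_t>(data.size()), 2500, 0.0,
      [&](int64_t b, int64_t e, double id) {
        double acc = id;
        for (int64_t i = b; i < e; ++i) acc += data[i];
        return acc;
      },
      [](double x, double y) { return x + y; });
  std::cout << total << '\n';  // 10000
}
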
@@ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) {

 void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) {
   AT_CHECK(
-    t.type().backend() == backend,
+    !t.defined() || t.type().backend() == backend,
     "Expected tensor to have ", toString(backend),
     " Backend, but got tensor with ", toString(t.type().backend()), " Backend ",
     "(while checking arguments for ", c, ")");
@@ -52,14 +52,11 @@ namespace c10 {
   _(prim, TupleSlice)              \
   _(prim, ListConstruct)           \
   _(prim, ListUnpack)              \
-  _(prim, BoolToTensor)            \
   _(prim, NumToTensor)             \
-  _(prim, TensorToNum)             \
   _(prim, ImplicitTensorToNum)     \
-  _(prim, TensorToBool)            \
-  _(prim, IntToFloat)              \
-  _(prim, FloatToInt)              \
-  _(prim, StringToFloat)           \
+  _(prim, Bool)                    \
+  _(prim, Int)                     \
+  _(prim, Float)                   \
   _(prim, device)                  \
   _(prim, dtype)                   \
   _(prim, shape)                   \
@@ -139,7 +136,8 @@ namespace c10 {
   _(attr, name)                    \
   _(attr, a)                       \
   _(attr, b)                       \
-  _(attr, beg)
+  _(attr, beg)                     \
+  _(attr, idx)
 #else
 #define FORALL_NS_SYMBOLS(_) \
   _(namespaces, prim)              \
@@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
     ss << "Future[" << getElementType()->python_str() << "]";
     return ss.str();
   }
+  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
+    return create(contained_types.at(0));
+  }
 private:
   FutureType(TypePtr elem) : SingleElementType(elem) {}
 };
@@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
 }

 inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
-  AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
   if (typ->isSubtypeOf(IntType::get())) {
     return CompleteTensorType::create(at::kLong, at::kCPU, {});
   } else if (typ->isSubtypeOf(FloatType::get())) {
@@ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
 template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
 template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }

-CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
+CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);

 using TypeEnv = std::unordered_map<std::string, TypePtr>;
 struct MatchTypeReturn {
@@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
   return value;
 }

-TypePtr inferTypeFrom(const IValue& value) {
+// why incomplete? You cannot completely recover a type from
+// an IValue, List[List[int]] and List[List[Tensor]] will both
+// become ivalue.isGenericList() and cannot be recovered.
+// The only appropriate place to use this is where you know that
+// you are only dealing with a subset of objects where you can recover
+// the type, like in the tracer.
+TypePtr incompleteInferTypeFrom(const IValue& value) {
   if (value.isTensor()) {
     return CompleteTensorType::create(value.toTensor());
   } else if (value.isDouble()) {
@@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
   } else if (value.isDoubleList()) {
     return ListType::ofFloats();
   } else if (value.isTuple()) {
-    return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
+    return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
   } else if (value.isDevice()) {
     return DeviceObjType::get();
   }
-  AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
+  AT_ERROR("Type cannot be accurately recovered from this IValue.");
 }

 c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {
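
The new comment explains why the renamed helper is only a partial inverse: a runtime value does not carry enough information to pin down nested container element types. A small, self-contained sketch of that limitation, with a toy value model (illustrative types, not the JIT's IValue/Type classes):

#include <iostream>
#include <string>
#include <vector>

// Illustrative value model: a generic list does not remember its element type.
struct Value {
  enum class Kind { Int, Double, GenericList } kind;
  std::vector<Value> elements;  // only meaningful for GenericList
};

// "Incomplete" inference: for an empty generic list there is no way to say
// whether it was built as List[int], List[Tensor], or anything else.
std::string infer_type(const Value& v) {
  switch (v.kind) {
    case Value::Kind::Int:    return "int";
    case Value::Kind::Double: return "float";
    case Value::Kind::GenericList:
      if (v.elements.empty()) {
        return "List[?]";  // element type unrecoverable from the value alone
      }
      return "List[" + infer_type(v.elements.front()) + "]";
  }
  return "?";
}

int main() {
  Value empty_list{Value::Kind::GenericList, {}};
  std::cout << infer_type(empty_list) << '\n';  // List[?]
}
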
@@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
     vec256::Vec256<scalar_t> acc_vec,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  scalar_t acc_arr[Vec::size];
+  scalar_t acc_arr[Vec::size()];
   acc_vec.store(acc_arr);
   for (int64_t i = 1; i < size; i++) {
-    scalar_t acc_arr_next[Vec::size];
+    scalar_t acc_arr_next[Vec::size()];
     acc_arr_next[0] = acc_arr[i];
     Vec acc_vec_next = Vec::loadu(acc_arr_next);
     acc_vec = vec_fun(acc_vec, acc_vec_next);
@@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
 template <typename scalar_t, typename Op>
 inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size)
+  if (size < Vec::size())
     return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = Vec::loadu(data);
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     acc_vec = vec_fun(acc_vec, data_vec);
   }
@@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
     Vec data_vec = Vec::loadu(data + d, size - d);
     acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(vec_fun, acc_vec, Vec::size);
+  return vec_reduce_all(vec_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
     scalar_t* data,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size)
+  if (size < Vec::size())
     return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = map_fun(Vec::loadu(data));
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     data_vec = map_fun(data_vec);
     acc_vec = red_fun(acc_vec, data_vec);
@@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
     data_vec = map_fun(data_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size);
+  return vec_reduce_all(red_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
     const scalar_t* data2,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size) {
+  if (size < Vec::size()) {
     Vec data_vec = Vec::loadu(data, size);
     Vec data2_vec = Vec::loadu(data2, size);
     data_vec = map_fun(data_vec, data2_vec);
     return vec_reduce_all(red_fun, data_vec, size);
   }
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     Vec data2_vec = Vec::loadu(data2 + d);
     data_vec = map_fun(data_vec, data2_vec);
@@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
     data_vec = map_fun(data_vec, data2_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size);
+  return vec_reduce_all(red_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename Op>
@@ -103,7 +103,7 @@ inline void map(
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
   int64_t d = 0;
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec output_vec = vec_fun(Vec::loadu(input_data + d));
     output_vec.store(output_data + d);
   }
@@ -122,7 +122,7 @@ inline void map2(
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
   int64_t d = 0;
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(input_data + d);
     Vec data_vec2 = Vec::loadu(input_data2 + d);
     Vec output_vec = vec_fun(data_vec, data_vec2);
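
All of the loops above share one shape, unchanged by this diff apart from size becoming size(): process full Vec-width lanes in the main loop, then handle the remainder with a partial load and masked set, and finish with a horizontal reduction. A scalar sketch of that main-loop-plus-tail structure (plain arrays, W standing in for Vec256<T>::size(); not the Vec256 API):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative scalar analogue of reduce_all's loop structure.
double reduce_sum(const double* data, int64_t size) {
  constexpr int64_t W = 4;  // pretend lane width
  if (size < W) {
    double acc = 0;
    for (int64_t i = 0; i < size; ++i) acc += data[i];
    return acc;
  }
  double lanes[W] = {0, 0, 0, 0};
  int64_t d = 0;
  for (; d < size - (size % W); d += W) {        // full-width main loop
    for (int64_t l = 0; l < W; ++l) lanes[l] += data[d + l];
  }
  for (; d < size; ++d) lanes[0] += data[d];     // tail, like the partial loadu/set
  double acc = 0;
  for (int64_t l = 0; l < W; ++l) acc += lanes[l];  // horizontal reduce
  return acc;
}

int main() {
  std::vector<double> v(10, 1.0);
  std::cout << reduce_sum(v.data(), static_cast<int64_t>(v.size())) << '\n';  // 10
}
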
@@ -15,14 +15,24 @@

 namespace at {
 namespace vec256 {

+// Note [Acceptable use of anonymous namespace in header]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Yes you saw right, this is an anonymous namespace in a header.  This header,
+// and all of its subheaders, REQUIRE their code to be entirely inlined into
+// the compilation unit that uses them.  It's important that these functions have
+// internal linkage so that kernels for different architectures don't get
+// combined during linking. It's sufficient to label functions "static", but
+// class methods must be an unnamed namespace to have internal linkage (since
+// static means something different in the context of classes).
 namespace {

 template <typename T>
 std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
-  T buf[Vec256<T>::size];
+  T buf[Vec256<T>::size()];
   vec.store(buf);
   stream << "vec[";
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     if (i != 0) {
       stream << ", ";
     }
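
As the new note explains, the unnamed namespace gives every function and class in these headers internal linkage, so each translation unit that includes them keeps its own copy and kernels compiled for different instruction sets cannot be merged at link time. A toy, single-file sketch of the mechanism (hypothetical names; in a real build the function would sit in a shared header included by per-ISA .cpp files):

#include <iostream>

namespace {  // internal linkage: each including TU would get its own sum4,
             // so an AVX2-compiled copy and a default copy never collide.
inline float sum4(const float* p) {
  return p[0] + p[1] + p[2] + p[3];
}
}  // namespace

int main() {
  const float x[4] = {1, 2, 3, 4};
  std::cout << sum4(x) << '\n';  // 10
}
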
@@ -20,6 +20,7 @@

 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {

 template<size_t n> struct int_of_size;
@@ -45,15 +46,49 @@ struct Vec256 {
 private:
   T values[32 / sizeof(T)] = {0};
 public:
-  static constexpr int size = 32 / sizeof(T);
+  // Note [constexpr static function to avoid odr-usage compiler bug]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Why, you might ask, is size defined to be a static constexpr function,
+  // rather than a more ordinary 'static constexpr int size;' variable?
+  // The problem lies within ODR rules for static constexpr members versus
+  // static constexpr functions.  First, recall that this class (along with all
+  // of its derivations) live in an anonymous namespace: they are intended to be
+  // *completely* inlined at their use-sites, because we need to compile it
+  // multiple times for different instruction sets.
+  //
+  // Because of this constraint, we CANNOT provide a single definition for
+  // any static members in this class; since we want to compile the class
+  // multiple times, there wouldn't actually be any good place to put the
+  // definition.  Now here is the problem: if we ODR-use a static constexpr
+  // member, we are *obligated* to provide a definition.  Without the
+  // definition, you get a compile error like:
+  //
+  //    relocation R_X86_64_PC32 against undefined symbol
+  //    `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
+  //    a shared object; recompile with -fPIC
+  //
+  // If this were C++17, we could replace a static constexpr variable with
+  // an inline variable which doesn't require one definition. But we are not
+  // C++17.  So the next best thing is to replace the member with a static
+  // constexpr (and therefore inline) function, which does not require ODR
+  // either.
+  //
+  // Also, technically according to the C++ standard, we don't have to define
+  // a constexpr variable if we never odr-use it.  But it seems that some
+  // versions GCC/Clang have buggy determinations on whether or not an
+  // identifier is odr-used or not, and in any case it's hard to tell if
+  // a variable is odr-used or not.  So best to just cut the problem at the root.
+  static constexpr int size() {
+    return 32 / sizeof(T);
+  }
   Vec256() {}
   Vec256(T val) {
-    for (int i = 0; i != size; i++) {
+    for (int i = 0; i != size(); i++) {
       values[i] = val;
     }
   }
   template<typename... Args,
-           typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
+           typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
   Vec256(Args... vals) {
     values = { vals... };
   }
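
A minimal illustration of the distinction the note describes: under C++14, odr-using a static constexpr data member (for example, passing it by const reference) requires an out-of-class definition, while a static constexpr member function does not. Hypothetical class names, not the real Vec256:

#include <algorithm>
#include <iostream>

struct WithFunc {
  // constexpr function: implicitly inline, no out-of-class definition needed
  // even when the returned value is bound to a reference parameter.
  static constexpr int size() { return 8; }
};

struct WithVar {
  // constexpr data member: fine for pure compile-time use, but odr-using it
  // (std::min takes its arguments by const reference) would require
  //   constexpr int WithVar::size;
  // in exactly one translation unit under C++14 -- impossible for a class that
  // must be recompiled per instruction set inside an anonymous namespace.
  static constexpr int size = 8;
};

int main() {
  // Safe: size() yields a prvalue, nothing is odr-used.
  std::cout << std::min(4, WithFunc::size()) << '\n';  // 4
  // Under C++14 the line below would odr-use WithVar::size and need the
  // out-of-class definition mentioned above in order to link:
  // std::cout << std::min(4, WithVar::size) << '\n';
  std::cout << WithVar::size << '\n';                   // 8 (not odr-used here)
}
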
@ -61,7 +96,7 @@ public:
 | 
				
			|||||||
  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
 | 
					  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
 | 
				
			||||||
    int64_t mask = mask_;
 | 
					    int64_t mask = mask_;
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (mask & 0x01) {
 | 
					      if (mask & 0x01) {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
@ -74,9 +109,9 @@ public:
 | 
				
			|||||||
  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
 | 
					  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
 | 
				
			||||||
                          const Vec256<T>& mask) {
 | 
					                          const Vec256<T>& mask) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    int_same_size_t<T> buffer[size];
 | 
					    int_same_size_t<T> buffer[size()];
 | 
				
			||||||
    mask.store(buffer);
 | 
					    mask.store(buffer);
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (buffer[i] & 0x01)
 | 
					      if (buffer[i] & 0x01)
 | 
				
			||||||
       {
 | 
					       {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
@ -88,14 +123,14 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
 | 
					  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      vec.values[i] = base + i * step;
 | 
					      vec.values[i] = base + i * step;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return vec;
 | 
					    return vec;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
 | 
					  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (i < count) {
 | 
					      if (i < count) {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
@ -114,7 +149,7 @@ public:
 | 
				
			|||||||
    std::memcpy(vec.values, ptr, count * sizeof(T));
 | 
					    std::memcpy(vec.values, ptr, count * sizeof(T));
 | 
				
			||||||
    return vec;
 | 
					    return vec;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  void store(void* ptr, int count = size) const {
 | 
					  void store(void* ptr, int count = size()) const {
 | 
				
			||||||
    std::memcpy(ptr, values, count * sizeof(T));
 | 
					    std::memcpy(ptr, values, count * sizeof(T));
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  const T& operator[](int idx) const {
 | 
					  const T& operator[](int idx) const {
 | 
				
			||||||
@ -125,14 +160,14 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> map(T (*f)(T)) const {
 | 
					  Vec256<T> map(T (*f)(T)) const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i != size; i++) {
 | 
					    for (int64_t i = 0; i != size(); i++) {
 | 
				
			||||||
      ret[i] = f(values[i]);
 | 
					      ret[i] = f(values[i]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> abs() const {
 | 
					  Vec256<T> abs() const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      ret[i] = values[i] < 0 ? -values[i] : values[i];
 | 
					      ret[i] = values[i] < 0 ? -values[i] : values[i];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
@ -214,7 +249,7 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> pow(const Vec256<T> &exp) const {
 | 
					  Vec256<T> pow(const Vec256<T> &exp) const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      ret[i] = std::pow(values[i], exp[i]);
 | 
					      ret[i] = std::pow(values[i], exp[i]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
@ -222,7 +257,7 @@ public:
 | 
				
			|||||||
#define DEFINE_COMP(binary_pred)                                              \
 | 
					#define DEFINE_COMP(binary_pred)                                              \
 | 
				
			||||||
  Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \
 | 
					  Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \
 | 
				
			||||||
    Vec256<T> vec;                                                            \
 | 
					    Vec256<T> vec;                                                            \
 | 
				
			||||||
    for (int64_t i = 0; i != size; i++) {                                     \
 | 
					    for (int64_t i = 0; i != size(); i++) {                                     \
 | 
				
			||||||
      if (values[i] binary_pred other.values[i]) {                            \
 | 
					      if (values[i] binary_pred other.values[i]) {                            \
 | 
				
			||||||
        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \
 | 
					        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \
 | 
				
			||||||
      } else {                                                                \
 | 
					      } else {                                                                \
 | 
				
			||||||
@ -242,7 +277,7 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] + b[i];
 | 
					    c[i] = a[i] + b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] - b[i];
 | 
					    c[i] = a[i] - b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] * b[i];
 | 
					    c[i] = a[i] * b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
 | 
					template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] / b[i];
 | 
					    c[i] = a[i] / b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
 // either input is a NaN.
 template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
   Vec256<T> c = Vec256<T>();
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     c[i] = (a[i] > b[i]) ? a[i] : b[i];
     if (std::is_floating_point<T>::value && std::isnan(a[i])) {
       // If either input is NaN, propagate a NaN.
@@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
 // either input is a NaN.
 template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
   Vec256<T> c = Vec256<T>();
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     c[i] = (a[i] < b[i]) ? a[i] : b[i];
     if (std::is_floating_point<T>::value && std::isnan(a[i])) {
       // If either input is NaN, propagate a NaN.
@@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
 template <class T>                                                          \
 Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) {      \
   using iT = int_same_size_t<T>;                                            \
-  iT buffer[Vec256<T>::size];                                               \
-  for (int64_t i = 0; i != Vec256<T>::size; i++) {                          \
+  iT buffer[Vec256<T>::size()];                                               \
+  for (int64_t i = 0; i != Vec256<T>::size(); i++) {                          \
     auto a_val = a[i];                                                      \
     auto b_val = b[i];                                                      \
     iT *i_a_ptr = reinterpret_cast<iT*>(&a_val);                            \
@@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
 template <int64_t scale = 1, typename T = void>
 c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
 inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   int_same_size_t<T> index_arr[size];
   vindex.store(static_cast<void*>(index_arr));
   T buffer[size];
@@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
 c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
 inline mask_gather(const Vec256<T>& src, T const* base_addr,
                    const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   T src_arr[size];
   int_same_size_t<T> mask_arr[size];  // use int type so we can logical and
   int_same_size_t<T> index_arr[size];
@@ -392,7 +427,7 @@ namespace {
   template<typename dst_t, typename src_t>
   struct CastImpl {
     static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
-      src_t src_arr[Vec256<src_t>::size];
+      src_t src_arr[Vec256<src_t>::size()];
       src.store(static_cast<void*>(src_arr));
       return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
     }
@@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {
 
 template <typename T>
 inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   T src_arr[size];
   src.store(static_cast<void*>(src_arr));
   int_same_size_t<T> buffer[size];
@@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
 //       returns:            Vec256<float>   = {a0, a1, a2, a3, a4, a5, a6, a7}
 //                           Vec256<float>   = {b0, b1, b2, b3, b4, b5, b6, b7}
 template <typename T>
-inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
+inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
 deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   static constexpr int half_size = size / 2;
   T a_arr[size];
   T b_arr[size];
@@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
 //       returns:            Vec256<float>   = {a0, b0, a1, b1, a2, b2, a3, b3}
 //                           Vec256<float>   = {a4, b4, a5, b5, a6, b6, a7, b7}
 template <typename T>
-inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
+inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
 interleave2(const Vec256<T>& a, const Vec256<T>& b) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   static constexpr int half_size = size / 2;
   T a_arr[size];
   T b_arr[size];
@@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
 
 template <typename src_T, typename dst_T>
 void convert(const src_T *src, dst_T *dst, int64_t n) {
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (int64_t i = 0; i < n; i++) {
     *dst = static_cast<dst_T>(
         static_cast<at::native::inter_copy_type_t<dst_T>>(*src));
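The convert() hunk above replaces a bare `#pragma unroll` with a guard, since MSVC does not recognize that pragma and warns on it. A small free-standing sketch of the same pattern (hypothetical function, not the ATen source):

    #include <cstdint>

    void scale(const float* src, float* dst, int64_t n, float a) {
    #ifndef _MSC_VER
    # pragma unroll   // loop-unrolling hint for compilers that accept it
    #endif
      for (int64_t i = 0; i < n; i++) {
        dst[i] = a * src[i];
      }
    }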
@@ -8,6 +8,7 @@
 
 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {
 
 #if defined(__AVX__) && !defined(_MSC_VER)
@@ -16,7 +17,9 @@ template <> class Vec256<double> {
 private:
   __m256d values;
 public:
-  static constexpr int size = 4;
+  static constexpr int size() {
+    return 4;
+  }
   Vec256() {}
   Vec256(__m256d v) : values(v) {}
   Vec256(double val) {
@@ -40,7 +43,7 @@ public:
     return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
   }
   static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
-                            int64_t count = size) {
+                            int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -53,22 +56,22 @@ public:
     }
     return b;
   }
-  static Vec256<double> loadu(const void* ptr, int64_t count = size) {
-    if (count == size)
+  static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
       return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
 
-    __at_align32__ double tmp_values[size];
+    __at_align32__ double tmp_values[size()];
     std::memcpy(
         tmp_values,
         reinterpret_cast<const double*>(ptr),
         count * sizeof(double));
     return _mm256_load_pd(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
     } else if (count > 0) {
-      double tmp_values[size];
+      double tmp_values[size()];
       _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(double));
     }
@@ -252,7 +255,7 @@ template <>
 void convert(const double* src, double* dst, int64_t n) {
   int64_t i;
 #pragma unroll
-  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
     _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
   }
 #pragma unroll
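For orientation, a hedged usage sketch of the Vec256<double> interface as it looks after this hunk: size() is now a call, and loadu/store take an element count so a ragged tail can be handled without over-reading. The include path and the presence of AVX are assumptions of the sketch, not facts stated by the diff.

    #include <ATen/cpu/vec256/vec256.h>  // assumed header location
    #include <cstdint>

    // dst[i] = src[i] + 1.0 over n doubles, full vectors first, then the tail.
    void add_one(const double* src, double* dst, int64_t n) {
      using Vec = at::vec256::Vec256<double>;
      int64_t i = 0;
      for (; i + Vec::size() <= n; i += Vec::size()) {
        (Vec::loadu(src + i) + Vec(1.0)).store(dst + i);
      }
      if (i < n) {
        // Partial load/store of the last (n - i) lanes.
        auto v = Vec::loadu(src + i, n - i) + Vec(1.0);
        v.store(dst + i, n - i);
      }
    }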
@@ -8,6 +8,7 @@
 
 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {
 
 #if defined(__AVX__) && !defined(_MSC_VER)
@@ -16,7 +17,9 @@ template <> class Vec256<float> {
 private:
   __m256 values;
 public:
-  static constexpr int size = 8;
+  static constexpr int size() {
+    return 8;
+  }
   Vec256() {}
   Vec256(__m256 v) : values(v) {}
   Vec256(float val) {
@@ -43,7 +46,7 @@ public:
       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
   }
   static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
-                           int64_t count = size) {
+                           int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -64,19 +67,19 @@ public:
     }
     return b;
   }
-  static Vec256<float> loadu(const void* ptr, int64_t count = size) {
-    if (count == size)
+  static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
       return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
-    __at_align32__ float tmp_values[size];
+    __at_align32__ float tmp_values[size()];
     std::memcpy(
         tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
     return _mm256_loadu_ps(tmp_values);
   }
-  void store(void* ptr, int64_t count = size) const {
-    if (count == size) {
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
       _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
     } else if (count > 0) {
-      float tmp_values[size];
+      float tmp_values[size()];
       _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(float));
     }
@@ -260,7 +263,7 @@ template <>
 void convert(const float* src, float* dst, int64_t n) {
   int64_t i;
 #pragma unroll
-  for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
+  for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
     _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
   }
 #pragma unroll
@@ -12,6 +12,11 @@ namespace {
 struct Vec256i {
 protected:
   __m256i values;
+
+  static inline __m256i invert(const __m256i& v) {
+    const auto ones = _mm256_set1_epi64x(-1);
+    return _mm256_xor_si256(ones, v);
+  }
 public:
   Vec256i() {}
   Vec256i(__m256i v) : values(v) {}
@@ -22,7 +27,9 @@ public:
 
 template <>
 struct Vec256<int64_t> : public Vec256i {
-  static constexpr int size = 4;
+  static constexpr int size() {
+    return 4;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
@@ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i {
   }
   template <int64_t mask>
   static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
-    __at_align32__ int64_t tmp_values[size];
+    __at_align32__ int64_t tmp_values[size()];
     a.store(tmp_values);
     if (mask & 0x01)
       tmp_values[0] = _mm256_extract_epi64(b.values, 0);
@@ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i {
     return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
   }
   static Vec256<int64_t>
-  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
+  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
-    __at_align32__ int64_t tmp_values[size];
+    __at_align32__ int64_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int64_t tmp_values[size];
+      __at_align32__ int64_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
     }
@@ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i {
     return _mm256_cmpeq_epi64(values, other.values);
   }
   Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi64(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi64(values, other.values));
   }
   Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
     return _mm256_cmpgt_epi64(other.values, values);
   }
   Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi64(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi64(values, other.values));
   }
   Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
     return _mm256_cmpgt_epi64(values, other.values);
   }
   Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi64(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi64(other.values, values));
   }
 };
 
 template <>
 struct Vec256<int32_t> : public Vec256i {
-  static constexpr int size = 8;
+  static constexpr int size() {
+    return 8;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
@@ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i {
       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
   }
   static Vec256<int32_t>
-  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
+  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
-    __at_align32__ int32_t tmp_values[size];
+    __at_align32__ int32_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int32_t tmp_values[size];
+      __at_align32__ int32_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
     }
@@ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i {
     return _mm256_cmpeq_epi32(values, other.values);
   }
   Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi32(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi32(values, other.values));
   }
   Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
     return _mm256_cmpgt_epi32(other.values, values);
   }
   Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi32(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi32(values, other.values));
   }
   Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
     return _mm256_cmpgt_epi32(values, other.values);
   }
   Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi32(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi32(other.values, values));
   }
 };
 
@@ -212,13 +209,17 @@ template <>
 void convert(const int32_t *src, float *dst, int64_t n) {
   int64_t i;
   // int32_t and float have same size
-#pragma unroll
-  for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
     auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
     auto output_vec = _mm256_cvtepi32_ps(input_vec);
     _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
   }
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (; i < n; i++) {
     dst[i] = static_cast<float>(src[i]);
   }
@@ -228,13 +229,17 @@ template <>
 void convert(const int32_t *src, double *dst, int64_t n) {
   int64_t i;
   // int32_t has half the size of double
-#pragma unroll
-  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
     auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
     auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
     _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
   }
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (; i < n; i++) {
     dst[i] = static_cast<double>(src[i]);
   }
@@ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {
 
 template <>
 struct Vec256<int16_t> : public Vec256i {
-  static constexpr int size = 16;
+  static constexpr int size() {
+    return 16;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
@@ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i {
   }
   template <int64_t mask>
   static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
-    __at_align32__ int16_t tmp_values[size];
+    __at_align32__ int16_t tmp_values[size()];
     a.store(tmp_values);
     if (mask & 0x01)
       tmp_values[0] = _mm256_extract_epi16(b.values, 0);
@@ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i {
       base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
   }
   static Vec256<int16_t>
-  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
+  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
-    __at_align32__ int16_t tmp_values[size];
+    __at_align32__ int16_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int16_t tmp_values[size];
+      __at_align32__ int16_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
     }
@@ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i {
     return _mm256_cmpeq_epi16(values, other.values);
   }
   Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi16(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi16(values, other.values));
   }
   Vec256<int16_t> operator<(const Vec256<int16_t>& other) const {
     return _mm256_cmpgt_epi16(other.values, values);
   }
   Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi16(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi16(values, other.values));
   }
   Vec256<int16_t> operator>(const Vec256<int16_t>& other) const {
     return _mm256_cmpgt_epi16(values, other.values);
   }
   Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi16(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi16(other.values, values));
   }
 };
 
@@ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
 
 template <typename T>
 Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
-  T values_a[Vec256<T>::size];
-  T values_b[Vec256<T>::size];
+  T values_a[Vec256<T>::size()];
+  T values_b[Vec256<T>::size()];
   a.store(values_a);
   b.store(values_b);
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     values_a[i] /= values_b[i];
   }
   return Vec256<T>::loadu(values_a);
@@ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
       THCState_getCurrentDeviceProperties(globalContext().getTHCState());
   // NOTE: extra parenthesis around numbers disable clang warnings about
   // dead code
-  return (
-      (CUDNN_VERSION >= (6021)) ||
-      (CUDNN_VERSION >= (6000) && prop->major >= 5));
+  return true;
 #else
   return false;
 #endif
@@ -9,45 +9,6 @@
 #include "ATen/cuda/ATenCUDAGeneral.h"
 #include <cuda.h>
 
-#if CUDNN_VERSION < 7000
-
-#include <curand_kernel.h>
-
-/*
-Note [cuDNN dropout descriptor initialization]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In most cases, setting descriptors in cuDNN is cheap (e.g.,
-cudnnSetTensorNdDescriptor).  However, this is not the case for
-cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an
-expensive precomputation to initialize the random number generator states.  In
-cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor,
-which means that law-abiding clients were expected to generate a dropout
-descriptor once and cache it.  However, our ATen interface is (1) stateless (so
-we can't cache the descriptors) and (2) does not accept arbitrary user types in
-its interface (so we can't pass the descriptor in).  This puts us in a pickle.
-
-In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which
-forgoes the expensive initialization process, and can initialize the
-descriptor with a pre-initialized state CUDA tensor.  This is great, because
-it means we can simply pass in the state tensor and then initialize the
-descriptor internally.  Unfortunately, this function is not available in
-cuDNN 6.
-
-To work around this, we break the cuDNN abstraction barrier, and have
-the struct layout of the underlaying dropout descriptor.  With this struct,
-we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great!
-*/
-
-// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization]
-struct cudnnDropoutStruct {
-  float dropout;
-  int nstates;
-  void * states;
-};
-
-#endif
-
 namespace at { namespace native {
 
 // TODO: Add constructors for all of the descriptors
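With the cuDNN 6 compatibility shim removed, the header relies on the real cuDNN 7 entry point that the deleted note described: cudnnRestoreDropoutDescriptor fills a dropout descriptor from a previously saved RNG state buffer without redoing the expensive initialization that cudnnSetDropoutDescriptor performs. A hedged sketch of just the call shape, with the same parameter list as the shim deleted above; the surrounding DropoutDescriptor plumbing is not reproduced here:

    #include <cudnn.h>

    cudnnStatus_t restore_dropout_desc(cudnnDropoutDescriptor_t desc,
                                       cudnnHandle_t handle,
                                       float dropout,
                                       void* state_buffer,  // sized via cudnnDropoutGetStatesSize
                                       size_t state_bytes,
                                       unsigned long long seed) {
      // Restores the descriptor from an already-initialized state buffer.
      return cudnnRestoreDropoutDescriptor(desc, handle, dropout,
                                           state_buffer, state_bytes, seed);
    }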
@@ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor
     if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
     AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
                                           CUDNN_CROSS_CORRELATION, mathType));
-#if CUDNN_VERSION >= 7000
     AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
     AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
     if(dataType == CUDNN_DATA_HALF)
       AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
-#endif
   }
 };
 
@@ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor
   }
 };
 
-#if CUDNN_VERSION < 7000
-
-// See Note [cuDNN dropout descriptor initialization]
-inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnHandle_t handle,
-    float dropout,
-    void *states,
-    size_t stateSizeInBytes,
-    unsigned long long seed) {
-  // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends.
-  // This is not entirely accurate but is good enough to catch some API
-  // uses which would not be compatible in cuDNN 7.  Feel free to fix
-  // this if you notice something is wrong.
-  if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE;
-  if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE;
-  size_t expectedStateSizeInBytes;
-  // State size will differ depending on size of GPU
-  auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes);
-  if (ret != CUDNN_STATUS_SUCCESS) return ret;
-  if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
-  dropoutDesc->dropout = dropout;
-  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
-  dropoutDesc->states = states;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-#endif // CUDNN_VERSION
-
 struct AT_CUDA_API DropoutDescriptor
   : public Descriptor<cudnnDropoutStruct,
                       &cudnnCreateDropoutDescriptor,
@@ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor
           mode,
           algo,
           datatype));
-#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000
     cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
     if (prop->major >= 7) {
       if (datatype == CUDNN_DATA_HALF) {
@@ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor
   }
 };
 
-#if CUDNN_VERSION >= 7000
-
 struct AT_CUDA_API CTCLossDescriptor
   : public Descriptor<cudnnCTCLossStruct,
                       &cudnnCreateCTCLossDescriptor,
@@ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor
   }
 };
 
-#endif
-
 union Constant
 {
   float f;
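Taken together, these Descriptors.h hunks drop every `#if CUDNN_VERSION >= 7000` guard, so cuDNN 7 features (grouped convolutions, math-type selection, the CTC loss descriptor) are compiled unconditionally. A build that wants to state that floor explicitly could use a guard along these lines; this is a hypothetical sketch, not part of the diff:

    #include <cudnn.h>

    #if CUDNN_VERSION < 7000
    #error "this code assumes cuDNN 7.0 or newer"
    #endif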
@@ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
       prelu_cpu_kernel_multi_weights<scalar_t>(
@@ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
       prelu_cpu_backward_kernel_multi_weights<scalar_t>(
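The two prelu hunks rewrite the AT_CHECK message: AT_CHECK concatenates its trailing arguments into the error message rather than interpreting a printf-style format string, so the old `%d` placeholders would have appeared literally in the error text. A minimal stand-in for that concatenation style, using a hypothetical check_or_throw helper and a C++17 fold expression rather than the ATen macro itself:

    #include <iostream>
    #include <sstream>
    #include <stdexcept>

    template <typename... Args>
    void check_or_throw(bool cond, Args&&... parts) {
      if (cond) return;
      std::ostringstream oss;
      (oss << ... << parts);  // stream every piece, numbers included
      throw std::runtime_error(oss.str());
    }

    int main() {
      int weight_num = 3, channel_size = 4;
      try {
        check_or_throw(channel_size == weight_num,
                       "Mismatch of parameter numbers and input channel size. ",
                       "Found parameter numbers = ", weight_num,
                       " and channel size = ", channel_size, ".");
      } catch (const std::exception& e) {
        std::cout << e.what() << "\n";
      }
      return 0;
    }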
@@ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
 
 // Supports arbitrary batch dimensions for self and A
 std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
-  if (self.dim() <= 2 && A.dim() <= 2) {
+  AT_CHECK(self.dim() >= 2,
+           "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
+  AT_CHECK(A.dim() >= 2,
+           "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
+  if (self.dim() == 2 && A.dim() == 2) {
     // TODO: #7102: It's not necessary to have gesv (single) bindings for both
     // TH and ATen. We should remove the TH gesv bindings, especially
     // since the lapackGesv function is already in ATen.
+    linearSolveCheckInputs(self, A);  // Checks square shape of A, and compatibility of self and A
     return at::_th_gesv_single(self, A);
   }
 
@@ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) {
   }
   squareCheckInputs(self);
 
-  // TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors,
-  // this can be simplified. Currently, we are zero-ing out values in the
-  // batch of matrices by using a mask and the `where` function.
-  // The simplification with batched `triu` and `tril` would be this:
-  // if (upper) {
-  //   return raw_cholesky_output.triu();
-  // } else {
-  //   return raw_cholesky_output.tril();
-  // }
   auto raw_cholesky_output = at::_cholesky_helper(self, upper);
-  int64_t n = self.size(-1);
-  auto indices = at::ones({n, n}, self.options().dtype(at::kByte));
-  indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self);
-  return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output);
+  if (upper) {
+    return raw_cholesky_output.triu_();
+  } else {
+    return raw_cholesky_output.tril_();
+  }
 }
 
 Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
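The rewrite above relies on the batched, in-place triu_()/tril_() introduced later in this diff instead of building a byte mask and routing through at::where. A small sketch of the two forms side by side (illustrative; the helper names zero_lower_old/zero_lower_new are hypothetical):

#include <ATen/ATen.h>

// Old shape of the code: mask the strictly-lower triangle and clear it via at::where.
at::Tensor zero_lower_old(const at::Tensor& raw) {
  int64_t n = raw.size(-1);
  auto indices = at::ones({n, n}, raw.options().dtype(at::kByte));
  indices = indices.tril(-1).expand_as(raw);               // 1s where entries must become 0
  return at::where(indices, at::zeros({}, raw.options()), raw);
}

// New shape of the code: zero the strictly-lower triangle in place with the batched kernel.
at::Tensor zero_lower_new(at::Tensor raw) {
  return raw.triu_();
}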
@@ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
   return result;
 }
 
+template <typename scalar_t, bool inplace, bool upper>
+static void apply_triu_tril_single(
+    scalar_t* result, scalar_t* self,
+    int64_t k, int64_t n, int64_t m,
+    int64_t res_row_stride, int64_t res_col_stride,
+    int64_t self_row_stride, int64_t self_col_stride) {
+
+  constexpr int64_t zero = 0;
+  int64_t i;
+
+  if (upper) {
+    #pragma omp parallel for private(i)
+    for (i = 0; i < n; i++) {
+      for (int64_t j = 0; j < std::min(m, i + k); j++) {
+        result[i * res_row_stride + j * res_col_stride] = 0;
+      }
+      if (!inplace) {  // copy the rest of the self if not inplace
+        for (int64_t j = std::max(zero, i + k); j < m; j++) {
+          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
+        }
+      }
+    }
+  } else {
+    #pragma omp parallel for private(i)
+    for (i = 0; i < n; i++) {
+      for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
+        result[i * res_row_stride + j * res_col_stride] = 0;
+      }
+      if (!inplace) {  // copy the rest of the self if not inplace
+        for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
+          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
+        }
+      }
+    }
+  }
+}
+
+template <typename scalar_t, bool inplace, bool upper>
+void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) {
+  auto n = self.size(-2);
+  auto m = self.size(-1);
+  auto self_data = self.data<scalar_t>();
+  auto self_stride = self.dim() > 2 ? self.stride(-3) : 1;
+  auto batchsize = batchCount(self);
+  auto self_row_stride = self.stride(-2);
+  auto self_column_stride = self.stride(-1);
+
+  auto result_data = result.data<scalar_t>();
+  int64_t result_stride, result_row_stride, result_column_stride;
+  if (result_data != self_data) {
+    result_stride = result.dim() > 2 ? result.stride(-3) : 1;
+    result_row_stride = result.stride(-2);
+    result_column_stride = result.stride(-1);
+  } else {
+    result_stride = self_stride;
+    result_row_stride = self_row_stride;
+    result_column_stride = self_column_stride;
+  }
+
+  int64_t b;
+  #pragma omp parallel for private(b)
+  for (b = 0; b < batchsize; b++) {
+    scalar_t* self_batch = &self_data[b * self_stride];
+    scalar_t* result_batch = &result_data[b * result_stride];
+    apply_triu_tril_single<scalar_t, inplace, upper>(
+        result_batch, self_batch, k, n, m,
+        result_row_stride, result_column_stride, self_row_stride, self_column_stride);
+  }
+}
+
+Tensor tril(const Tensor& self, int64_t k) {
+  Tensor result = at::empty({0}, self.options());
+  at::tril_out(result, self, k);
+  return result;
+}
+
+Tensor& tril_cpu_(Tensor &self, int64_t k) {
+  if (self.numel() == 0) {
+    return self;
+  }
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+    apply_triu_tril<scalar_t, true, false>(self, self, k);
+  });
+  return self;
+}
+
+Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+    apply_triu_tril<scalar_t, false, false>(result, self_c, k);
+  });
+  return result;
+}
+
+Tensor triu(const Tensor& self, int64_t k) {
+  Tensor result = at::empty({0}, self.options());
+  at::triu_out(result, self, k);
+  return result;
+}
+
+Tensor& triu_cpu_(Tensor &self, int64_t k) {
+  if (self.numel() == 0) {
+    return self;
+  }
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+    apply_triu_tril<scalar_t, true, true>(self, self, k);
+  });
+  return self;
+}
+
+Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+    apply_triu_tril<scalar_t, false, true>(result, self_c, k);
+  });
+  return result;
+}
+
 }}  // namespace at::native
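A short usage sketch for the batched CPU triu/tril entry points added above (illustrative; shapes are arbitrary):

#include <ATen/ATen.h>

int main() {
  auto x = at::randn({4, 5, 5});             // a batch of four 5x5 matrices
  auto lower = at::tril(x, /*diagonal=*/0);  // out-of-place, applied to each matrix in the batch
  x.triu_(1);                                // in-place, keeps only the strictly-upper part
  return 0;
}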
@@ -378,8 +378,8 @@ at::Tensor _convolution(
     AT_CHECK(!bias.defined() || (input.type() == bias.type()),
              "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(),
              ") should be the same");
-    output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
+    output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias,
+                                    params.padding, params.stride, params.dilation, params.groups);
 #endif
   } else {
     if (params.groups == 1) {
@@ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) {
 }
 
 Tensor & eq_(Tensor& self, const Tensor & other) {
-  return at::_th_ge_(self, other);
+  return at::_th_eq_(self, other);
 }
 
 Tensor & ne_(Tensor& self, Scalar other) {
@@ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) {
   return at::_th_atan2_(self, other);
 }
 
-Tensor & tril_(Tensor& self, int64_t diagonal) {
-  return at::_th_tril_(self, diagonal);
-}
-
-Tensor & triu_(Tensor& self, int64_t diagonal) {
-  return at::_th_triu_(self, diagonal);
-}
-
 Tensor & digamma_(Tensor& self) {
   return at::_th_digamma_(self);
 }
@@ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) {
   return at::_th_cross(self, other, dim);
 }
 
-Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) {
-  return at::_th_triu_out(result, self, diagonal);
-}
-
-Tensor triu(const Tensor & self, int64_t diagonal) {
-  return at::_th_triu(self, diagonal);
-}
-
-Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) {
-  return at::_th_tril_out(result, self, diagonal);
-}
-
-Tensor tril(const Tensor & self, int64_t diagonal) {
-  return at::_th_tril(self, diagonal);
-}
-
 Tensor trace(const Tensor & self) {
   return at::_th_trace(self);
 }
@@ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) {
   return batched_matrices.size(-1) * batched_matrices.size(-2);
 }
 
+/* Checks a necessary property for the triu and tril implementations, hence the name.
+ * Here batch contiguity is checked for tensors with greater than 4 dimensions.
+ * Contiguous tensors and tensors with less than 3 dimensions pass this check
+ */
+static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) {
+  // Complete contiguity is the most desired property, which is why
+  // we return true if the tensor is contiguous
+  if (tensor.is_contiguous()) return true;
+
+  int64_t dims = tensor.dim();
+
+  // Tensors with dimension less than 4 are handled by default
+  if (dims <= 3) return true;
+
+  int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
+  for (int64_t i = dims - 3; i >= 0; i--) {
+    if (expected_stride != tensor.stride(i)) return false;
+    expected_stride *= tensor.size(i);
+  }
+  return true;
+}
+
 // Returns the epsilon value for floating types except half
 static inline double _get_epsilon(const ScalarType& sc_type) {
   switch (sc_type) {
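A sketch of an input that fails this check and therefore makes the tril/triu kernels above fall back to a .contiguous() copy (illustrative): permuting the batch dimensions breaks the requirement that every batch stride collapse into size(-1) * size(-2) blocks.

#include <ATen/ATen.h>

int main() {
  auto x = at::randn({2, 3, 4, 4});
  auto y = x.transpose(0, 1);  // 4-D and non-contiguous: stride(1) is no longer 4 * 4,
                               // so checkTrilTriuBatchContiguous(y) returns false
  auto z = at::tril(y);        // the CPU path copies y to contiguous memory before the kernel
  return 0;
}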
@@ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
 std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias,
                                                   const Tensor& running_mean, const Tensor& running_var,
                                                   bool train, double momentum, double eps) {
+  checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
+
   return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
       return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps);
     });
@@ -21,7 +21,6 @@ namespace native {
 
 DEFINE_DISPATCH(sum_stub);
 DEFINE_DISPATCH(prod_stub);
-DEFINE_DISPATCH(norm_kernel);
 
 static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
   ScalarType scalarType = self.type().scalarType();
@@ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_
   int64_t dim = maybe_wrap_dim(dim_, self.dim());
   if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
     return result;
-  if (self.is_contiguous() && result.is_contiguous()) {
-    _dimreduce_setup(result, self, dim);
-    norm_kernel(kCPU, result, self, p, dim);
-    if (!keepdim) {
-      result.squeeze_(dim);
-    }
-    return result;
-  } else {
-    return at::_th_norm_out(result, self, p, dim, keepdim);
-  }
+  return at::_th_norm_out(result, self, p, dim, keepdim);
 }
 
 Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
@@ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) {
     AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
              "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
     AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
-    if (self.is_cuda()) {
-      return at::_th_norm(self, p);
-    } else {
-      if (self.is_contiguous()) {
-        Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
-        norm_kernel(kCPU, result, self, p, c10::nullopt);
-        return result;
-      } else {
-        return at::_th_norm(self, p);
-      }
-    }
+    return at::_th_norm(self, p);
   }
 }
 
@@ -34,11 +34,11 @@ Tensor _bincount_cpu_template(
   int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L;
   nbins = std::max(nbins, minlength); // at least minlength # of bins
 
-  const input_t* self_p = self.contiguous().data<input_t>();
+  const input_t* self_p = self.data<input_t>();
   if (has_weights) {
     output = native::zeros({nbins}, weights.options());
     weights_t* output_p = output.data<weights_t>();
-    const weights_t* weights_p = weights.contiguous().data<weights_t>();
+    const weights_t* weights_p = weights.data<weights_t>();
     for (int64_t i = 0; i < self.size(0); i++) {
       output_p[self_p[i]] += weights_p[i];
     }
@@ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
   return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
     const auto scalar = weights.type().scalarType();
     if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
-      return _bincount_cpu_template<scalar_t, float>(self, weights, minlength);
+      return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
     return _bincount_cpu_template<scalar_t, double>(
-        self, weights.toType(CPU(kDouble)), minlength);
+        self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength);
   });
 }
 
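Both bincount hunks move .contiguous() out of the template and into the dispatch site, so the templated loop reads through pointers into tensors that stay alive for the whole call. A minimal sketch of the lifetime pitfall being avoided (illustrative, simplified; risky/safer are hypothetical helpers):

#include <ATen/ATen.h>

void risky(const at::Tensor& t) {
  // If t is not already contiguous, contiguous() materializes a temporary copy;
  // keeping only the raw pointer keeps no reference to that copy's storage.
  const float* p = t.contiguous().data<float>();
  (void)p;
}

void safer(const at::Tensor& t) {
  at::Tensor tc = t.contiguous();   // hold the (possible) copy for as long as p is used
  const float* p = tc.data<float>();
  (void)p;
}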
@@ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const {
 }
 
 void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const {
+  if (range.size() == 0) {
+    return;
+  }
   auto strides = get_strides();
   while (strides.size() < 2 * ntensors()) {
     strides.push_back(0);
@@ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range)
   int64_t ndim = values.size();
   for (int dim = 0; dim < ndim; dim++) {
     int64_t size = shape[dim];
-    values[dim] = linear_offset % size;
-    linear_offset /= size;
+    if (size > 0) {
+      values[dim] = linear_offset % size;
+      linear_offset /= size;
+    }
   }
   AT_ASSERT(linear_offset == 0);
 }
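The guard added to DimCounter skips zero-sized dimensions so the offset decomposition never divides or takes a modulo by zero. The same idea as a standalone sketch (illustrative, simplified types):

#include <cstdint>
#include <vector>

// Decompose a linear offset into per-dimension counters, skipping empty dims.
std::vector<int64_t> decompose(int64_t linear_offset,
                               const std::vector<int64_t>& shape) {
  std::vector<int64_t> values(shape.size(), 0);
  for (size_t dim = 0; dim < shape.size(); dim++) {
    int64_t size = shape[dim];
    if (size > 0) {                 // without this guard, size == 0 would divide by zero
      values[dim] = linear_offset % size;
      linear_offset /= size;
    }
  }
  return values;
}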
@@ -101,14 +101,14 @@ struct PDist {
 
     scalar_t * const res_start = result.data<scalar_t>();
     int64_t combs = result.numel(); // n * (n - 1) / 2
-    const Vec pvec(p);
 
     // We conceptually iterate over tuples of (i, j, k) where i is the first
     // vector from the input, j is the second, and k is the result index. This
     // parallelizes over the range of k and infers what i and j are from the
     // value of k.
-    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) {
-      float n2 = n - .5;
+    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) {
+      const Vec pvec(p);
+      double n2 = n - .5;
       // The -1 accounts for floating point truncation issues
       int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
       int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
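The pdist hunks build the Vec256 constant inside the parallel_for body and drop the by-reference capture, so each worker thread constructs its own copy instead of sharing a stack-local of the launching thread. A simplified sketch of the pattern (illustrative; a plain double stands in for Vec256<scalar_t>):

#include <ATen/Parallel.h>
#include <cstdint>

void per_thread_constant(double p, int64_t combs) {
  at::parallel_for(0, combs, /*grain_size=*/1, [=](int64_t begin, int64_t end) {
    const double pvec = p;          // constructed per worker, only values are captured
    for (int64_t k = begin; k < end; k++) {
      (void)pvec;                   // ... use pvec for this chunk of k ...
    }
  });
}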
@@ -149,7 +149,7 @@ struct PDist {
   }
 
   template <typename F>
-  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) {
+  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
     for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
 
       const Vec self_vec_i = Vec::loadu(self_i, count);
@@ -177,7 +177,6 @@ struct PDist {
     const int64_t n = self.size(0);
     const int64_t m = self.size(1);
     const int64_t gs = grad.stride(0);
-    const Vec pvec(p);
 
     const scalar_t * const grad_start = grad.data<scalar_t>();
     const scalar_t * const dist_start = dist.data<scalar_t>();
@@ -187,17 +186,19 @@ struct PDist {
     // The only way to parallelize and avoid locking requires parallelizing
     // over the columns of the input, i.e. we compute the gradient for the
     // first section of each vector independentaly of the second section, etc.
-    at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) {
-      const scalar_t * self_l = self_start + l * Vec::size;
-      scalar_t * res_l = res_start + l * Vec::size;
-
-      for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) {
+    at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) {
+      const Vec pvec(p);
+
+      const scalar_t * self_l = self_start + l * Vec::size();
+      scalar_t * res_l = res_start + l * Vec::size();
+
+      for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) {
         backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs);
       }
     });
-    const int64_t remainder = m % Vec::size;
+    const int64_t remainder = m % Vec::size();
     if (remainder) {
-      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder);
+      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder);
     }
   }
 
@@ -308,7 +308,9 @@ static inline void
 mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
                  const int_same_size_t<scalar_t> *offsets,
                  const int_same_size_t<scalar_t> *mask, int64_t len) {
-  #pragma unroll
+  #ifndef _MSC_VER
+  # pragma unroll
+  #endif
   for (int64_t i = 0; i < len; i++) {
     if (mask[i] & 0x01) {
       base_addr[offsets[i]] += src[i];
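The #pragma unroll hints in these kernels get wrapped in #ifndef _MSC_VER because MSVC does not understand that non-standard pragma; compilers that do (clang, icc) still see the hint, and it only affects optimization, not semantics. The pattern in isolation (illustrative):

#include <cstdint>

int64_t sum4(const int64_t* v) {
  int64_t acc = 0;
  #ifndef _MSC_VER
  # pragma unroll
  #endif
  for (int i = 0; i < 4; ++i) {
    acc += v[i];
  }
  return acc;
}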
@@ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     auto i_sw_offset = i_nw_offset + iVec(inp_sH);
     auto i_se_offset = i_sw_offset + iVec(inp_sW);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       auto inp_slice_C_ptr = inp_slice[c].data();
 
@@ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     // So we store the necessary vectors to temporary arrays and use the helper
     // mask_scatter_add defined above.
 
-    integer_t i_gInp_nw_offset_arr[iVec::size];
-    integer_t i_gInp_ne_offset_arr[iVec::size];
-    integer_t i_gInp_sw_offset_arr[iVec::size];
-    integer_t i_gInp_se_offset_arr[iVec::size];
+    integer_t i_gInp_nw_offset_arr[iVec::size()];
+    integer_t i_gInp_ne_offset_arr[iVec::size()];
+    integer_t i_gInp_sw_offset_arr[iVec::size()];
+    integer_t i_gInp_se_offset_arr[iVec::size()];
     i_gInp_nw_offset.store(i_gInp_nw_offset_arr);
     i_gInp_ne_offset.store(i_gInp_ne_offset_arr);
     i_gInp_sw_offset.store(i_gInp_sw_offset_arr);
     i_gInp_se_offset.store(i_gInp_se_offset_arr);
 
-    integer_t i_nw_mask_arr[iVec::size];
-    integer_t i_ne_mask_arr[iVec::size];
-    integer_t i_sw_mask_arr[iVec::size];
-    integer_t i_se_mask_arr[iVec::size];
+    integer_t i_nw_mask_arr[iVec::size()];
+    integer_t i_ne_mask_arr[iVec::size()];
+    integer_t i_sw_mask_arr[iVec::size()];
+    integer_t i_se_mask_arr[iVec::size()];
     nw_mask.store(i_nw_mask_arr);
     ne_mask.store(i_ne_mask_arr);
     sw_mask.store(i_sw_mask_arr);
     se_mask.store(i_se_mask_arr);
 
-    scalar_t gInp_corner_arr[Vec::size];
+    scalar_t gInp_corner_arr[Vec::size()];
 
     auto gx = Vec(0), gy = Vec(0);
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       auto inp_slice_C_ptr = inp_slice[c].data();
       auto gInp_slice_C_ptr = gInp_slice[c].data();
@@ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     gx = gx * gx_mult;
     gy = gy * gy_mult;
 
-    constexpr int64_t step = Vec::size;
+    constexpr int64_t step = Vec::size();
     auto interleaved_gGrid = interleave2(gx, gy);
     auto gGrid_ptr = gGrid_slice.data() + offset * 2;
     std::get<0>(interleaved_gGrid).store(gGrid_ptr,
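Throughout these files Vec::size / iVec::size becomes a call, Vec::size(): Vec256<T>::size changes from a static constexpr data member to a constexpr static member function (presumably to avoid the pre-C++17 out-of-class definition requirement for odr-used constexpr members and related MSVC friction). A minimal sketch of the shape of that change (illustrative stand-in types, not the real Vec256):

struct VecOld {
  static constexpr int size = 8;             // data member: odr-use needs an out-of-class definition pre-C++17
};

struct VecNew {
  static constexpr int size() { return 8; }  // member function: still usable in constant expressions
};

static_assert(VecOld::size == VecNew::size(), "same vector width");

int buffer_old[VecOld::size];                // call sites change from Vec::size ...
int buffer_new[VecNew::size()];              // ... to Vec::size()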
@@ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
     auto out_ptr = out_slice.data() + offset;
     auto out_sC = out_slice.stride(0);
     auto inp_slice_ptr = inp_slice.data();
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
       // mask_gather zeros out the mask, so we need to make a copy
       auto mask_copy = mask;
@@ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
 
     auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest;  // gInp is contiguous
 
-    integer_t mask_arr[iVec::size];
+    integer_t mask_arr[iVec::size()];
     i_mask.store(mask_arr);
-    integer_t gInp_offset_arr[iVec::size];
+    integer_t gInp_offset_arr[iVec::size()];
     i_gInp_offset.store(gInp_offset_arr);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
                        gInp_offset_arr, mask_arr, len);
@@ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator(
 
   using Vec = Vec256<scalar_t>;
   using iVec = Vec256<int_same_size_t<scalar_t>>;
-  constexpr int64_t step = Vec::size;
+  constexpr int64_t step = Vec::size();
 
   // Loop over each output pixel in grid.
   // We consider the following three cases (after slicing out the batch
@@ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator(
     auto spatial_offset = 0;
     auto i_offsets_delta = iVec(grid_sW * step);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t h = 0; h < out_H; h++) {
       auto grid_ptr_x = grid_ptr + h * grid_sH;
       auto grid_ptr_y = grid_ptr_x + grid_sCoor;
       auto i_offsets = iVec::arange(0, grid_sW);
-      #pragma unroll
+      #ifndef _MSC_VER
+      # pragma unroll
+      #endif
       for (int64_t w = 0; w < out_W; w += step) {
         auto len = std::min(step, out_W - w);
         if (len < step) {
@@ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t>
 static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) {
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
-    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
-    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
    auto out1 = vop(a1, b1);
     auto out2 = vop(a2, b2);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) };
   binary_loop(data, strides, i, n, op);
@@ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op,
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
   auto a = Vec(*(scalar_t*)in1_ptr);
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
-    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto out1 = vop(a, b1);
     auto out2 = vop(a, b2);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) };
   binary_loop(data, strides, i, n, op);
@@ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op,
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
   auto b = Vec(*(scalar_t*)in2_ptr);
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
-    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto out1 = vop(a1, b);
     auto out2 = vop(a2, b);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 };
   binary_loop(data, strides, i, n, op);
@@ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o
   char* in_ptr = data[1];
   Vec acc[4];
   for  (int j = 0; j < 4; j++) {
-    acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t));
+    acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t));
   }
   for (int64_t i = 1; i < n; i++) {
     const char* ptr = in_ptr + stride * i;
-    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t))));
-    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t))));
-    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t))));
-    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t))));
+    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
+    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
+    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
+    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
   }
   if (reduce) {
-    scalar_t buffer[Vec::size];
+    scalar_t buffer[Vec::size()];
     acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
     acc[0].store(buffer);
-    for (int j = 1; j < Vec::size; j++) {
+    for (int j = 1; j < Vec::size(); j++) {
       buffer[0] = op(buffer[0], buffer[j]);
     }
     auto dst = (scalar_t*)out_ptr;
     *dst = op(*dst, buffer[0]);
   } else {
     for (int j = 0; j < 4; j++) {
-      auto dst = out_ptr + j * Vec::size * sizeof(scalar_t);
+      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
       acc[j] = vop(acc[j], Vec::loadu(dst));
       acc[j].store(dst);
     }
@@ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int
 template <typename func_t, typename vec_func_t>
 static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
   VEC_HEADER(func_t)
-  int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t);
-  int64_t count = n / (4 * Vec::size);
+  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+  int64_t count = n / (4 * Vec::size());
   if (count > 0) {
     reduction128(data, count, vector_stride, op, vop, /*reduce=*/true);
   }
   char* ptrs[3] = { data[0], data[0], data[1] };
   int64_t strides[] = { 0, 0, sizeof(scalar_t) };
-  binary_loop(ptrs, strides, count * 4 * Vec::size, n, op);
+  binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
 }
 
 // computes the reduction out = op(out, in)
@@ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t>
 static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
   VEC_HEADER(func_t)
 
-  // reduce down each column of 4 * Vec::size elements (128 bytes)
+  // reduce down each column of 4 * Vec::size() elements (128 bytes)
   int64_t outer_stride[2] = { 128, 128 };
-  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] {
+  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
     reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false);
   });
 
   // reduce down the remaining columns
   int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
-  int64_t remaining = size1 % (4 * Vec::size);
+  int64_t remaining = size1 % (4 * Vec::size());
   UNARY_OUTER_LOOP(data, step, remaining, [&] {
     char* ptrs[3] = { data[0], data[0], data[1] };
     int64_t strides[] = { 0, 0, inner_stride };
@@ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) {
       /*identity=*/1);
   });
 }
-
-static inline int64_t round_down(int64_t a, int64_t m) {
-  return a - (a % m);
-}
-
-template<typename scalar_t>
-struct NormReduction {
-  // reduction width in number of scalar elements
-  static constexpr int WIDTH = 128 / sizeof(scalar_t);
-  using Vec = Vec256<scalar_t>;
-
-  static void apply(
-      Tensor& res,
-      const Tensor& self,
-      Scalar p,
-      c10::optional<int64_t> dim) {
-    auto out_ = res.data<scalar_t>();
-    auto data_ = self.data<scalar_t>();
-    auto numel = self.numel();
-    float pval = 0.0;
-    if (p.isIntegral()){
-      pval = p.to<int64_t>();
-    } else if (p.isFloatingPoint()) {
-      pval = p.to<float>();
-    }
-    if (!dim.has_value()) {
-      *out_ = reduce_all(data_, numel,  pval);
-      return;
-    }
-    int64_t n = self.size(*dim);
-    int64_t stride = self.stride(*dim);
-    // A contiguous tensor does not need to hold a meaningful stride
-    // if the corresponding size is 1
-    if (n == 1) {
-      stride = 1;
-      for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
-        stride *= self.size(i);
-      }
-    }
-    int64_t batch = numel / n;
-    parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
-      for (int64_t bi = begin; bi < end; bi++) {
-        int64_t b = bi / stride;
-        int64_t i = bi % stride;
-        const scalar_t* data = &data_[b * n * stride + i];
-        out_[bi] = norm_reduce(data, n, stride, pval);
-      }
-    });
-  }
-
-  static scalar_t reduce_all(const scalar_t* data_, int64_t size,  float pval) {
-    scalar_t sum = parallel_reduce(
-      0,
-      size,
-      internal::GRAIN_SIZE,
-      (scalar_t)0,
-      [=](int64_t begin, int64_t end, scalar_t init) {
-        const scalar_t* data = &data_[begin];
-        int64_t n = end - begin;
-        scalar_t result = norm_reduce(data, n, 1, pval);
-        return result;
-      },
-      std::plus<scalar_t>());
-    return sum;
-  }
-
-  static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
-    scalar_t result = 0.0;
-    if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
-      int64_t n_rounded = round_down(n, WIDTH);
-      scalar_t result1 = norm_reduce128(data, n_rounded, pval);
-      scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
-      result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
-    } else {
-      result = norm_reduce_sequential(data, n, stride, pval);
-    }
-    return result;
-  }
-
-  static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
-    scalar_t result = 0.0;
-    if (pval == 0) {
-      for (int64_t k = 0; k < n; k++) {
-        result += (data[k * stride] != 0.0);
-      }
-    } else if (pval == 1) {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::abs(data[k * stride]);
-      }
-    } else if (pval == 2) {
-      for (int64_t k = 0; k < n; k++) {
-        result += data[k * stride] * data[k * stride];
-      }
-      result = std::sqrt(result);
-    } else if (pval == 3) {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
-      }
-      result = std::pow(result, 1.0/3);
-    } else if (pval == INFINITY) {
-      for (int64_t k = 0; k < n; k++) {
-        result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
-      }
-    } else if (pval == -INFINITY) {
-      result = INFINITY;
-      for (int64_t k = 0; k < n; k++) {
-        result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result;
-      }
-    } else {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::pow(std::abs(data[k * stride]), pval);
-      }
-      result = std::pow(result, 1.0/pval);
-    }
-    return result;
-  }
-
-  // Reduce down a column of WIDTH elements (128 bytes) with the given number n
-  // n is already rounded by 128
-  static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
-    scalar_t result = 0.0;
-    Vec acc[4] = {0.0, 0.0, 0.0, 0.0};  // 128 bytes (two cache lines)
-    static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
-    int64_t rows = n / WIDTH;
-    if (pval == 1){
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + val.abs();
-        }
-      }
-    }
-    else if (pval == 2) {
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + val * val;
-        }
-      }
-    }
-    else if (pval == 3) {
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + (val * val * val).abs();
-        }
-      }
-    }
-    scalar_t buf[WIDTH] = {0};
-    for (int j = 0; j != 4; j++) {
-      acc[j].store(&buf[j * Vec::size]);
-    }
-    for (int i = 0; i < WIDTH; i++) {
-      result += buf[i];
-    }
-    result = std::pow(result, 1.0/pval);
-    return result;
-  }
-};
-
-static void norm_kernel_impl(
-    Tensor& result,
-    const Tensor& self,
-    Scalar p,
-    c10::optional<int64_t> dim) {
-  AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
-    NormReduction<scalar_t>::apply(result, self, p, dim);
-  });
-}
-
 }  // anonymous namespace
 
 REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
 REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
-REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
 
 }}  // namespace at::native
@@ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim(
     int64_t outer_size,
     int64_t dim_size) {
   using Vec = vec256::Vec256<scalar_t>;
-  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
+  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
   if (grain_size < CHUNK_SIZE)
     grain_size = CHUNK_SIZE;
@@ -37,9 +37,9 @@ template <>
 int64_t _sigmoid(float* x, float* y, int64_t size) {
   using Vec = Vec256<float>;
   int64_t i = 0;
-  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
+  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
     Vec ret = Vec::loadu(y + i);
-    Vec ret2 = Vec::loadu(y + i + Vec::size);
+    Vec ret2 = Vec::loadu(y + i + Vec::size());
     ret = ret.neg();
     ret2 = ret2.neg();
 #if defined(__AVX2__) && !defined(_MSC_VER)
@@ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) {
     ret = ret.reciprocal();
     ret2 = ret2.reciprocal();
     ret.store(x + i);
-    ret2.store(x + i + Vec::size);
+    ret2.store(x + i + Vec::size());
   }
   return i;
 }
@@ -63,9 +63,9 @@ template <>
 int64_t _sigmoid(double* x, double* y, int64_t size) {
   using Vec = Vec256<double>;
   int64_t i = 0;
-  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
+  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
     Vec ret = Vec::loadu(y + i);
-    Vec ret2 = Vec::loadu(y + i + Vec::size);
+    Vec ret2 = Vec::loadu(y + i + Vec::size());
     ret = ret.neg();
     ret2 = ret2.neg();
     ret = ret.exp();
@@ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) {
     ret = ret.reciprocal();
     ret2 = ret2.reciprocal();
     ret.store(x + i);
-    ret2.store(x + i + Vec::size);
+    ret2.store(x + i + Vec::size());
   }
   return i;
 }
@@ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) {
           if (stridex == 1 && stridey == 1) {
             i = _sigmoid(x, y, size);
           }
-          for (; i < size; i += Vec::size) {
-            scalar_t buffer[Vec::size];
-            int64_t width = Vec::size;
+          for (; i < size; i += Vec::size()) {
+            scalar_t buffer[Vec::size()];
+            int64_t width = Vec::size();
             width = std::min(width, size - i);
             for (int64_t j = 0; j < width; j++) {
               buffer[j] = y[stridey * (i + j)];
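
The hunks above are mechanical: `Vec256<T>::size` becomes a `size()` call at every use site. A minimal standalone sketch of the pattern follows; the `Vec` struct is a simplified stand-in for illustration, not the real ATen `Vec256` header, and shows why the vectorized main loop steps by `2 * Vec::size()`.

#include <cstdint>
#include <iostream>

// Simplified stand-in for a 256-bit vector type. size() is a constexpr static
// member *function*, which is why call sites read Vec::size() with parentheses.
template <typename T>
struct Vec {
  static constexpr int64_t size() { return 32 / sizeof(T); }  // lanes per 256-bit register
  T values[32 / sizeof(T)];
};

int main() {
  using VecF = Vec<float>;
  const int64_t total = 70;
  int64_t i = 0;
  // Main vectorized loop covers the largest multiple of 2 * Vec::size();
  // the scalar tail (here 70 - 64 = 6 elements) is handled separately.
  for (; i < total - (total % (2 * VecF::size())); i += 2 * VecF::size()) {}
  std::cout << "vectorized up to " << i << " of " << total << " elements\n";
  return 0;
}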
@@ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     // config to run cuda kernel
     int64_t input_numel = input.numel();
@@ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     // config to run cuda kernel
     int64_t input_numel = input.numel();
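
Both prelu hunks drop the printf-style `%d` placeholders because `AT_CHECK` builds its message by stringifying and concatenating every trailing argument rather than interpreting a format string. A hedged sketch of that calling convention with a hypothetical `check()` helper (not the real ATen macro):

#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in: every argument after the condition is streamed into
// the message, so values are passed as separate arguments, not via "%d".
template <typename... Args>
void check(bool cond, Args&&... args) {
  if (cond) return;
  std::ostringstream oss;
  (oss << ... << args);  // C++17 fold expression
  throw std::runtime_error(oss.str());
}

int main() {
  long weight_num = 3, channel_size = 4;
  try {
    check(channel_size == weight_num,
          "Mismatch of parameter numbers and input channel size. "
          "Found parameter numbers = ", weight_num,
          " and channel size = ", channel_size, ".");
  } catch (const std::exception&) { /* message already fully formatted */ }
  return 0;
}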
@@ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) {
   }
 }
 
+template <typename scalar_t, bool upper>
+__global__
+void triu_tril_kernel(
+    scalar_t* result, scalar_t* self, int64_t k, int64_t N,
+    int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride,
+    int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) {
+  int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (linear_idx >= N) {
+    return;
+  }
+
+  int64_t self_batch_idx = blockIdx.y;
+  int64_t row = linear_idx / self_ncol;
+  int64_t col = linear_idx % self_ncol;
+
+  bool mask = upper ? (col - row >= k) : (col - row <= k);
+
+  // Now compute the offset for the self and result tensor
+  int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride;
+  int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride;
+  result[res_offset] = mask ? self[self_offset] : scalar_t(0);
+}
+
+template <bool upper>
+Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
+  int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2),
+          res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1,
+          res_row_stride = result.stride(-2), res_col_stride = result.stride(-1),
+          self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1,
+          self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
+  dim3 dim_block = cuda::getApplyBlock();
+  dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
+  AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{
+    triu_tril_kernel<scalar_t, upper>
+      <<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+        result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
+        res_batch_stride, res_row_stride, res_col_stride,
+        self_batch_stride, self_row_stride, self_col_stride, self.size(-1));
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return result;
+}
+
+Tensor& tril_cuda_(Tensor &self, int64_t k) {
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  return tril_cuda_out(self, self, k);
+}
+
+Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  return triu_tril_cuda_template<false>(result, self_c, k, "tril");
+}
+
+Tensor& triu_cuda_(Tensor &self, int64_t k) {
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  return triu_cuda_out(self, self, k);
+}
+
+Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  return triu_tril_cuda_template<true>(result, self_c, k, "triu");
+}
+
 }}  // namespace at::native
 
 #undef ALLOCATE_ARRAY
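
The predicate in the new triu_tril_kernel is the whole algorithm: element (row, col) survives when `col - row >= k` for triu and `col - row <= k` for tril. A small host-side illustration (plain C++, a hypothetical standalone program rather than the CUDA kernel) printing the k = 0 upper-triangular keep-mask of a 4x4 matrix:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 4, k = 0;
  const bool upper = true;  // true -> triu, false -> tril
  for (int64_t row = 0; row < n; ++row) {
    for (int64_t col = 0; col < n; ++col) {
      // Same predicate as triu_tril_kernel: keep, or write scalar_t(0).
      bool keep = upper ? (col - row >= k) : (col - row <= k);
      std::printf("%d ", keep ? 1 : 0);
    }
    std::printf("\n");
  }
  return 0;
}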
@@ -1,4 +1,5 @@
-#include "ATen/ATen.h"
+#include <ATen/ATen.h>
+#include <ATen/cuda/Exceptions.h>
 #include <THC/THCTensorMathReduce.cuh>
 #include <math.h>
 
@@ -78,13 +79,13 @@ struct dists {
 };
 
 template <typename scalar_t, typename F>
-__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) {
+__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p,
+                                              const double n2, const double n2_squared_minus_1) {
   const int k = blockIdx.x;
   const int stride = blockDim.x;
 
-  float n2 = n - .5;
   // The -1 accounts for floating point truncation issues
-  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
+  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
 
   const scalar_t * const start = self + i * m;
@@ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t
 }
 
 template <typename scalar_t, typename F>
-__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) {
+__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p,
+                                                       const double n2, const double n2_squared_minus_1) {
   const int k = blockIdx.y * blockDim.y + threadIdx.y;
   const int init = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride = blockDim.x * gridDim.x;
@@ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const
     return;
   }
 
-  float n2 = n - .5;
   // The -1 accounts for floating point truncation issues
-  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
+  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
   int64_t ib = j - i - 1;
   int64_t jb = n - 2 - i;
@@ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) {
   const dim3 block(forward_threads);
   int64_t n = self.size(0);
   int64_t m = self.size(1);
+  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
+  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
+  const double n2 = n - .5;
+  const double n2_squared_minus_1 = n2 * n2 - 1;
 
   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
     if (p == 0.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (p == 1.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (p == 2.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (std::isinf(p)) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     }
   });
+  AT_CUDA_CHECK(cudaGetLastError());
 }
 
 void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
@@ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
   const int64_t n = result.size(0);
   int64_t m = self.size(1);
   const int block_x = 64;
-  const int block_y = 4;
+  // NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16.
+  // From binary search, block_y of 16 gives us max pdist dim0 of 1449,
+  //                     block_y of  4 gives us max pdist dim0 of  725.
+  const int block_y = 16;
   const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
   const int grid_y = (dist.numel() + block_y - 1) / block_y;
   const dim3 grid(grid_x, grid_y);
   const dim3 block(block_x, block_y);
+  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
+  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
+  const double n2 = n - .5;
+  const double n2_squared_minus_1 = n2 * n2 - 1;
 
   Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
     if (p == 1.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (p < 2.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (p == 2.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (std::isinf(p)) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     }
   });
+  AT_CUDA_CHECK(cudaGetLastError());
 
   at::sum_out(result, buffer, 0);
 }
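
Both pdist kernels recover the row pair (i, j) from the flat condensed index k via i = floor(n2 - sqrt(n2^2 - 1 - 2k)) and j = k - n*i + i*(i+1)/2 + i + 1, with n2 = n - 0.5; the hunks hoist n2 and n2^2 - 1 into fp64 computed once on the host because fp32 loses the boundary cases (issue #15511). A small host-side check (an assumed standalone program, not part of the kernels) that this inversion reproduces the pair order (0,1), (0,2), ..., (n-2,n-1):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 6;
  const double n2 = n - 0.5;
  const double n2_squared_minus_1 = n2 * n2 - 1;
  int64_t k = 0;
  // Enumerate pairs in the same order pdist stores distances and verify
  // that (i, j) recovered from k matches the reference pair.
  for (int64_t i_ref = 0; i_ref < n; ++i_ref) {
    for (int64_t j_ref = i_ref + 1; j_ref < n; ++j_ref, ++k) {
      int64_t i = static_cast<int64_t>(n2 - std::sqrt(n2_squared_minus_1 - 2 * k));
      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
      if (i != i_ref || j != j_ref)
        std::printf("mismatch at k=%lld\n", (long long)k);
    }
  }
  std::printf("checked %lld pairs\n", (long long)k);
  return 0;
}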
@@ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind
 
     default:
       AT_ERROR(
-          "Unknown mode for embedding_bag_backward_cuda %d", mode);
+          "Unknown mode for embedding_bag_backward_cuda ", mode);
   }
 }
 
@@ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
           + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime];
 
         log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb;
-      } else if ((s < 2*max_target_length+1) || (t >= input_length)) {
+      } else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) {
           log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf;
       }
     }
@@ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
       if (targets.type().scalarType() == kLong) {
 	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
       } else {
-	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
+	return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
       }
     });
 }
@@ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_
                                                             const Tensor& running_mean_, const Tensor& running_var_,
                                                             bool train, double momentum, double epsilon) {
 
+  TensorArg input_arg{ input_, "input", 1 },
+            weight_arg{ weight_, "weight", 2 },
+            bias_arg{ bias_, "bias", 3 },
+            run_mean_arg{ running_mean_, "running_mean", 4 },
+            run_var_arg{ running_var_, "running_var", 5 };
+  CheckedFrom c = "batch_norm_cuda";
+  checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg});
+
   using accscalar_t = at::acc_type<scalar_t, true>;
   int64_t n_input = input_.size(1);
   Tensor save_mean_;
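
The added block wraps each argument in a TensorArg (tensor, name, argument position) so checkAllSameGPU can report exactly which argument sits on the wrong device. A simplified stand-in sketch of the same idea, using hypothetical types rather than the ATen ones:

#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-ins: each argument carries a name and position so a
// device mismatch can be reported precisely, as TensorArg enables in ATen.
struct FakeTensor { int device_index; };
struct Arg { FakeTensor t; const char* name; int pos; };

void check_all_same_gpu(const char* caller, const std::vector<Arg>& args) {
  for (const Arg& a : args) {
    if (a.t.device_index != args.front().t.device_index) {
      throw std::runtime_error(std::string(caller) + ": argument '" + a.name +
                               "' (position " + std::to_string(a.pos) +
                               ") is on a different GPU");
    }
  }
}

int main() {
  FakeTensor input{0}, weight{0}, bias{1};
  try {
    check_all_same_gpu("batch_norm_cuda",
                       {{input, "input", 1}, {weight, "weight", 2}, {bias, "bias", 3}});
  } catch (const std::exception&) { /* expected: bias is on GPU 1 */ }
  return 0;
}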
@@ -7,28 +7,13 @@
 #include <tuple>
 #include <thrust/unique.h>
 #include <thrust/sort.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
 
 namespace at {
 namespace native{
 
 namespace {
-template <typename scalar_t>
-__global__ void inverse_indices_kernel(
-    const scalar_t* input_data,
-    const scalar_t* output_data,
-    int64_t* inverse_indices_data,
-    int64_t num_inp,
-    int64_t num_out) {
-    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t stride = blockDim.x * gridDim.x;
-
-    for (int64_t i = idx; i < num_inp * num_out; i += stride) {
-      if (input_data[i / num_out] == output_data[i % num_out]){
-        inverse_indices_data[i / num_out] = i % num_out;
-      }
-    }
-  }
-
 
 template <typename scalar_t>
   std::tuple<Tensor, Tensor> _unique_cuda_template(
@@ -47,25 +32,29 @@ template <typename scalar_t>
     Tensor output = input.clone();
     output = output.view(-1);
     scalar_t* output_data = output.data<scalar_t>();
-    thrust::sort(policy, output_data, output_data + num_inp);
-    scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp);
-    int64_t num_out = output_end - output_data;
-    output.resize_(num_out);
-    Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
-    if (return_inverse) {
-      inverse_indices.resize_(input.sizes());
-      int64_t* inverse_indices_data = inverse_indices.data<int64_t>();
-      int block = 512;
-      int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L);
-      inverse_indices_kernel<<<grid, block, 0, stream>>>(
-        input_data, output_data, inverse_indices_data, num_inp, num_out);
+    Tensor inverse_indices;
+    if (!return_inverse) {
+        inverse_indices = at::empty({0},  self.type().toScalarType(kLong));
+        thrust::sort(policy, output_data, output_data + num_inp);
+    } else {
+        Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong));
+        int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>();
+        thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr);
+        Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong));
+        inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong));
+        int64_t* inv_loc_ptr = inv_loc.data<int64_t>();
+        int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>();
+        thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }});
+        inv_loc[0] = 0;
+        thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
+        thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
+        inverse_indices.resize_(input.sizes());
     }
+    int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data;
+    output.resize_(num_out);
 
     THCudaCheck(cudaGetLastError());
     return std::tuple<Tensor, Tensor>(output, inverse_indices);
-
   }
 
 template <typename scalar_t>
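
The rewritten _unique_cuda_template avoids the old O(num_inp * num_out) inverse_indices_kernel: it sorts the values together with their original positions, marks where adjacent sorted values differ, inclusive-scans the marks to get each element's rank among the unique values, and scatters the ranks back to the original positions. A host-side sketch of the same pipeline using the standard library in place of thrust (an assumed example, not the CUDA code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> values = {3, 1, 3, 2, 1};

  // Sort positions by value (thrust::sort_by_key in the CUDA version).
  std::vector<int64_t> order(values.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&](int64_t a, int64_t b) { return values[a] < values[b]; });
  std::vector<int> sorted(values.size());
  for (size_t i = 0; i < order.size(); ++i) sorted[i] = values[order[i]];

  // Mark boundaries between runs of equal values (adjacent_difference) and
  // inclusive-scan the marks, fused into one loop: rank of each sorted element.
  std::vector<int64_t> rank(values.size(), 0);
  for (size_t i = 1; i < sorted.size(); ++i)
    rank[i] = rank[i - 1] + (sorted[i] != sorted[i - 1] ? 1 : 0);

  // Scatter ranks back to original positions (thrust::scatter).
  std::vector<int64_t> inverse(values.size());
  for (size_t i = 0; i < order.size(); ++i) inverse[order[i]] = rank[i];

  for (int64_t v : inverse) std::printf("%lld ", (long long)v);  // 2 0 2 1 0
  std::printf("\n");
  return 0;
}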
@@ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> {
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
-#if CUDNN_VERSION >= 6000
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
-#endif
     };
     // NOTE: - 1 because ALGO_WINOGRAD is not implemented
     static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
@@ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
   THCCachingAllocator_emptyCache();
 }
 
+
+//hot fix for #16610
+//specializing algorithm_search would be cleaner, as it is specialized already, but that would require also specializing getBestAlgorithm for bwdData,
+//adding "strided" argument, so in the end this looks simpler.
+template<>
+void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) {
+  using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>;
+  auto& cache = search::cache();
+
+  if (cache.find(args.params, algo)) {
+    return;
+  }
+
+  if (args.params.deterministic && !benchmark) {
+    *algo = search::DEFAULT_ALGO;
+    return;
+  }
+
+  int stride_dim = args.input.dim() - 2;
+  bool strided = false;
+  for (int i = 0; i< stride_dim; i++) {
+      if (args.params.stride[i] != 1) {
+         strided = true;
+         break;
+      }
+  }
+
+  if (!benchmark) {
+    search::getAlgorithm(args, algo);
+    if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
+       *algo = search::DEFAULT_ALGO;
+    }
+    return;
+  }
+
+  if (cache.find(args.params, algo)) {
+    // re-check cache since another thread may have benchmarked the algorithm
+    return;
+  }
+
+  auto perfResults = search::findAlgorithm(args);
+  // for deterministic algo, look at all the perf results and return the best
+  // deterministic algo
+  if (perfResults.status == CUDNN_STATUS_SUCCESS &&
+      !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
+      *algo = perfResults.algo;
+  } else {
+      *algo = search::DEFAULT_ALGO;
+  }
+  if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
+     *algo = search::DEFAULT_ALGO;
+  }
+  cache.insert(args.params, *algo);
+
+  // Free the cached blocks in our caching allocator. They are
+  // needed here because the above benchmarking uses a huge amount of memory,
+  // e.g. a few GBs.
+  THCCachingAllocator_emptyCache();
+}
+
+
 template<typename algo_t>
 Workspace chooseAlgorithm(
     const ConvolutionArgs& args,
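
The hot fix above rejects FFT-based backward-data algorithms whenever any spatial stride differs from 1 and falls back to the default algorithm. A tiny sketch of that filter-and-fallback pattern with a hypothetical enum (the real code uses the cuDNN algorithm constants shown in the hunk):

#include <array>
#include <cstdio>

// Hypothetical stand-in: if the convolution is strided, FFT-based algorithms
// are rejected and the default algorithm is used, mirroring
// *algo = search::DEFAULT_ALGO in the specialization above.
enum class Algo { Default, Fft, FftTiling, Winograd };

Algo pick_algo(Algo preferred, const std::array<int, 2>& stride) {
  bool strided = false;
  for (int s : stride) strided = strided || (s != 1);
  if (strided && (preferred == Algo::Fft || preferred == Algo::FftTiling))
    return Algo::Default;
  return preferred;
}

int main() {
  std::printf("%d\n", static_cast<int>(pick_algo(Algo::FftTiling, {2, 2})));  // 0 -> Default
  return 0;
}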
@@ -848,19 +907,9 @@ Tensor cudnn_convolution_forward(
   // See #4500
   Tensor weight_contig = weight->contiguous();
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_forward_out(
-        narrowGroup(*output, output_channels_dim,        i, groups),
-        narrowGroup(*input,  input_channels_dim,         i, groups),
-        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
-        padding, stride, dilation, 1, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_forward_out(
       *output, *input, weight_contig,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return *output;
 }
@@ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input(
   // See #4500
   Tensor weight_contig = weight->contiguous();
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_backward_input_out(
-        narrowGroup(*grad_input, input_channels_dim, i, groups),
-        narrowGroup(*grad_output, output_channels_dim, i, groups),
-        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
-        padding, stride, dilation, 1, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_backward_input_out(
       *grad_input, *grad_output, weight_contig,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return *grad_input;
 }
@@ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight(
   TensorArg grad_weight{ grad_weight_t, "result", 0 };
   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_backward_weight_out(
-        narrowGroup(*grad_weight, weight_output_channels_dim, i, groups),
-        narrowGroup(*grad_output, output_channels_dim, i, groups),
-        narrowGroup(*input, input_channels_dim, i, groups),
-        padding, stride, dilation, groups, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_backward_weight_out(
       *grad_weight, *grad_output, *input,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return grad_weight_t;
 }
@@ -7,7 +7,7 @@
 #endif
 
 
-#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000)
+#if !AT_CUDNN_ENABLED()
 
 namespace at { namespace native {
 
@@ -375,7 +375,7 @@ namespace {
       case CUDNN_RNN_TANH:
         return 2;
       default:
-        AT_ERROR("unknown cuDNN RNN mode %d", mode);
+        AT_ERROR("unknown cuDNN RNN mode ", mode);
     }
   }
 
@@ -2555,9 +2555,15 @@
 
 - func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method
+  dispatch:
+    CPU: tril_cpu_
+    CUDA: tril_cuda_
 
 - func: triu_(Tensor self,  int64_t diagonal=0) -> Tensor
   variants: method
+  dispatch:
+    CPU: triu_cpu_
+    CUDA: triu_cuda_
 
 - func: digamma_(Tensor self) -> Tensor
   variants: method
@@ -2658,11 +2664,17 @@
   variants: method, function
 
 - func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
+  dispatch:
+    CPU: triu_cpu_out
+    CUDA: triu_cuda_out
 
 - func: triu(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method, function
 
 - func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
+  dispatch:
+    CPU: tril_cpu_out
+    CUDA: tril_cuda_out
 
 - func: tril(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method, function
@@ -11,18 +11,4 @@ using namespace at::native;
 TEST(CUDNNTest, CUDNNTestCUDA) {
   if (!at::cuda::is_available()) return;
   manual_seed(123);
-
-#if CUDNN_VERSION < 7000
-  auto handle = getCudnnHandle();
-  DropoutDescriptor desc1, desc2;
-  desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte));
-  desc2.set(handle, 0.5, desc1.state);
-  bool isEQ;
-  isEQ = (desc1.desc()->dropout == desc2.desc()->dropout);
-  ASSERT_TRUE(isEQ);
-  isEQ = (desc1.desc()->nstates == desc2.desc()->nstates);
-  ASSERT_TRUE(isEQ);
-  isEQ = (desc1.desc()->states == desc2.desc()->states);
-  ASSERT_TRUE(isEQ);
-#endif
 }
@@ -3,6 +3,8 @@ find_package(ATen REQUIRED)
 include_directories(${ATEN_INCLUDE_DIR})
 
 # C++11
-set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
+if (not MSVC)
+    set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
 add_executable(main main.cpp)
 target_link_libraries(main ${ATEN_LIBRARIES})
@@ -247,10 +247,13 @@
 
 #ifdef _OPENMP
 
-#ifndef _WIN32
-#define PRAGMA(P) _Pragma(#P)
+#ifdef _WIN32
+// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
+#define PRAGMA_LOOP(P)    // Noop
+#define PRAGMA(P)         __pragma(P)
 #else
-#define PRAGMA(P) __pragma(P)
+#define PRAGMA_LOOP(P)    _Pragma(#P)
+#define PRAGMA(P)         _Pragma(#P)
 #endif
 
 #include <omp.h>
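For context, the two macros differ only in how the pragma text reaches the compiler: `_Pragma(#P)` stringizes its argument (standard C99/C++11), MSVC's `__pragma(P)` takes the tokens directly, and MSVC has no counterpart for loop pragmas such as ivdep or simd, so `PRAGMA_LOOP` expands to nothing there. A minimal sketch of the same split in a hypothetical standalone file (illustrative only, not part of the diff):

    // Minimal sketch of the PRAGMA / PRAGMA_LOOP split (assumed example, not the diff itself).
    #include <cstdio>

    #ifdef _WIN32
    // MSVC: loop pragmas like ivdep/simd are unavailable, so PRAGMA_LOOP expands to nothing.
    #define PRAGMA_LOOP(P)
    #define PRAGMA(P)      __pragma(P)
    #else
    // GCC/Clang/ICC: _Pragma takes a string, so stringize the argument.
    #define PRAGMA_LOOP(P) _Pragma(#P)
    #define PRAGMA(P)      _Pragma(#P)
    #endif

    int main() {
      float a[1024], b[1024];
      for (int i = 0; i < 1024; ++i) { a[i] = 0.f; b[i] = float(i); }

      PRAGMA_LOOP(ivdep)           // vectorization hint where supported, nothing on MSVC
      PRAGMA(omp parallel for)     // OpenMP pragma works through either expansion
      for (int i = 0; i < 1024; ++i)
        a[i] += b[i];

      std::printf("%f\n", a[10]);
      return 0;
    }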
@@ -369,7 +372,7 @@
     TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset();                        \
     ptrdiff_t iter = 0;                                                                        \
     if(tp != (TYPE2*)rp) {                                                                             \
-      PRAGMA(ivdep) \
+      PRAGMA_LOOP(ivdep) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \
       for (iter = 0; iter < SIZE; iter++) {                             \
         TYPE2 *TENSOR2##_data = tp+iter;                                \
@@ -377,7 +380,7 @@
         CODE                                                            \
       }\
     } else {\
-      PRAGMA(simd) \
+      PRAGMA_LOOP(simd) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE2* TENSOR2##_data = tp+iter;\
@@ -449,7 +452,7 @@
     TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset();                               \
     ptrdiff_t iter = 0;\
     if(tp != (TYPE2*)rp) {                                                                             \
-      PRAGMA(ivdep) \
+      PRAGMA_LOOP(ivdep) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE1 *TENSOR1##_data = rp+iter;\
@@ -458,7 +461,7 @@
         CODE                                \
       } \
     } else {\
-      PRAGMA(simd) \
+      PRAGMA_LOOP(simd) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE1 *TENSOR1##_data = rp+iter;\
@@ -13,10 +13,13 @@
 
 #ifdef _OPENMP
 
-#ifndef _WIN32
-#define PRAGMA(P) _Pragma(#P)
+#ifdef _WIN32
+// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
+#define PRAGMA_LOOP(P)    // Noop
+#define PRAGMA(P)         __pragma(P)
 #else
-#define PRAGMA(P) __pragma(P)
+#define PRAGMA_LOOP(P)    _Pragma(#P)
+#define PRAGMA(P)         _Pragma(#P)
 #endif
 
 #define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
@@ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
   int free_b = 0;
   if (a == NULL) a = ra_;
   if (b == NULL) b = rb_;
-  THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
-      a->dim());
-  THArgCheck(!a->is_empty(), 2, "A should not be empty");
-  THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
-      "dimensions, but has %d", b->dim());
-  THArgCheck(!b->is_empty(), 2, "B should not be empty");
-  THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld",
-      a->size(0), a->size(1));
-  THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld "
-      "rows, B has %ld", a->size(0), b->size(0));
-
-  if (b->dim() == 1) {
-    b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
-            b->stride(0), 1, 0);
-    free_b = 1;
-  }
 
   int n, nrhs, lda, ldb, info;
   THIntTensor *ipiv;
@@ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
   THIntTensor_free(ipiv);
-  if (free_b) c10::raw::intrusive_ptr::decref(b);
 }
 
 void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
@@ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n
 
 TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
 TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
-TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k);
 TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
 TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
 TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
@@ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
   REAL_SWAP(ARR(III), ARR(JJJ)); \
   LONG_SWAP(IDX(III), IDX(JJJ))
 
+/* Emulate NumPy behavior of putting NaNs
+ * at the end of an ascending list. */
+#define GT_OR_NAN(x, y) \
+  ((x != x && y == y) || (x > y))
+
 static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
 {
   int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
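The macro reports "greater" either when x > y or when x is NaN and y is not (`x != x` is the usual NaN self-inequality test), so an ascending sort treats NaN as larger than every ordinary value. A small sketch of the same idea with a plain comparison function (hypothetical, illustrative only):

    // Illustrative only: NaN-aware "greater than" used for an ascending sort.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static bool gt_or_nan(double x, double y) {
      // x != x is true exactly when x is NaN; NaNs compare "greater" than real numbers.
      return (x != x && y == y) || (x > y);
    }

    int main() {
      std::vector<double> v = {3.0, NAN, 1.0, 2.0, NAN};
      // "a comes before b" iff b is greater-or-NaN relative to a.
      std::sort(v.begin(), v.end(),
                [](double a, double b) { return gt_or_nan(b, a); });
      for (double x : v) std::printf("%g ", x);   // prints: 1 2 3 nan nan
      std::printf("\n");
      return 0;
    }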
@@ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
       /* Use median of three for pivot choice */
     P=(L+R)>>1;
     BOTH_SWAP(P, L+1);
-    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
-    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
-    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+    if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
+    if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
+    if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
 
     i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
     do {
-      do { i = i+1; } while(ARR(i) < piv);
-      do { j = j-1; } while(ARR(j) > piv);
+      do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
+      do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
       if (j < i)
           break;
       BOTH_SWAP(i, j);
@@ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
-    if (ARR(i) > ARR(i+1)) {
+    if (GT_OR_NAN(ARR(i),ARR(i+1))) {
       piv = ARR(i);
       pid = IDX(i);
       j = i+1;
@@ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
         ARR(j-1) = ARR(j);
         IDX(j-1) = IDX(j);
         j = j+1;
-      } while(j < elements && ARR(j) < piv);
+      } while(j < elements && GT_OR_NAN(piv, ARR(j)));
       ARR(j-1) = piv;
       IDX(j-1) = pid;
      }
@@ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
       /* Use median of three for pivot choice */
     P=(L+R)>>1;
     BOTH_SWAP(P, L+1);
-    if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
-    if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
-    if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
+    if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
+    if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
+    if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
 
     i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
     do {
-      do { i = i+1; } while(ARR(i) > piv);
-      do { j = j-1; } while(ARR(j) < piv);
+      do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
+      do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
       if (j < i)
           break;
       BOTH_SWAP(i, j);
@@ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
-    if (ARR(i) < ARR(i+1)) {
+    if (GT_OR_NAN(ARR(i+1), ARR(i))) {
       piv = ARR(i);
       pid = IDX(i);
       j = i+1;
@@ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
         ARR(j-1) = ARR(j);
         IDX(j-1) = IDX(j);
         j = j+1;
-      } while(j < elements && ARR(j) > piv);
+      } while(j < elements && GT_OR_NAN(ARR(j), piv));
       ARR(j-1) = piv;
       IDX(j-1) = pid;
      }
@@ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i
   THLongTensor_free(tmpIndices);
 }
 
-void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k)
-{
-  int64_t t_size_0, t_size_1;
-  int64_t t_stride_0, t_stride_1;
-  int64_t r__stride_0, r__stride_1;
-  scalar_t *t_data, *r__data;
-  int64_t r, c;
-
-  THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
-
-  THTensor_(resizeAs)(r_, t);
-
-  t_size_0 = THTensor_(size)(t, 0);
-  t_size_1 = THTensor_(size)(t, 1);
-  t_stride_0 = THTensor_(stride)(t, 0);
-  t_stride_1 = THTensor_(stride)(t, 1);
-  r__stride_0 = THTensor_(stride)(r_, 0);
-  r__stride_1 = THTensor_(stride)(r_, 1);
-  r__data = r_->data<scalar_t>();
-  t_data = t->data<scalar_t>();
-
-  for(r = 0; r < t_size_0; r++)
-  {
-    int64_t sz = THMin(r+k+1, t_size_1);
-    for(c = THMax(0, r+k+1); c < t_size_1; c++)
-      r__data[r*r__stride_0+c*r__stride_1] = 0;
-    for(c = 0; c < sz; c++)
-      r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
-  }
-}
-
 void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
 {
   int64_t t_size_0, t_size_1;
@@ -6,17 +6,17 @@
 #include "THCNumerics.cuh"
 
 // Collection of kernel sort routines
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct LTComp {
   __device__ inline bool operator()(const T& a, const T& b) const {
-    return THCNumerics<T>::lt(a, b);
+    return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b);
   }
 };
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct GTComp {
   __device__ inline bool operator()(const T& a, const T& b) const {
-    return THCNumerics<T>::gt(a, b);
+    return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b);
   }
 };
 
@@ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) {
 }
 
 template <typename T>
-__device__ int binarySearchForMultinomial(T* dist,
+__device__ int binarySearchForMultinomial(T* cumdist,
+                                          T* dist,
                                           int size,
                                           T val) {
   int start = 0;
   int end = size;
-  // dist[size - 1] = 0 => all zero prob dist
-  assert(THCNumerics<T>::gt(dist[size - 1], 0));
+  // cumdist[size - 1] = 0 => all zero prob dist
+  assert(THCNumerics<T>::gt(cumdist[size - 1], 0));
 
   while (end - start > 0) {
     int mid = start + (end - start) / 2;
 
-    T midVal = dist[mid];
+    T midVal = cumdist[mid];
     if (THCNumerics<T>::lt(midVal, val)) {
       start = mid + 1;
     } else {
@@ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist,
     start = size - 1;
   }
 
-  T curVal = dist[start];
-  while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
+  T curVal = cumdist[start];
+  while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--;
 
   return start;
 }
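The extra raw-distribution argument lets the search check the bucket it lands on: instead of walking back over equal prefix-sum values, it now walks back while the landed category's own probability is zero, so zero-weight categories are never returned. A host-side sketch of the same pattern, plain C++ rather than the CUDA kernel and with assumed helper names (illustrative only):

    // Illustrative: sample a category by binary search over inclusive prefix sums,
    // then step back over zero-probability bins so they are never returned.
    #include <cstdio>
    #include <random>
    #include <vector>

    static int sample_category(const std::vector<double>& prob,      // normalized weights
                               const std::vector<double>& cumprob,   // inclusive prefix sums
                               double u) {                           // uniform draw in (0, 1]
      int start = 0, end = static_cast<int>(prob.size());
      while (end - start > 0) {                 // first index with cumprob[idx] >= u
        int mid = start + (end - start) / 2;
        if (cumprob[mid] < u) start = mid + 1; else end = mid;
      }
      if (start == static_cast<int>(prob.size())) start = static_cast<int>(prob.size()) - 1;
      while (start >= 1 && prob[start] == 0.0) start--;   // never land on a zero-weight bin
      return start;
    }

    int main() {
      std::vector<double> prob = {0.25, 0.0, 0.25, 0.5};
      std::vector<double> cum  = {0.25, 0.25, 0.5, 1.0};
      std::mt19937 gen(0);
      std::uniform_real_distribution<double> uni(0.0, 1.0);
      int counts[4] = {0, 0, 0, 0};
      for (int i = 0; i < 10000; ++i) counts[sample_category(prob, cum, uni(gen))]++;
      for (int c = 0; c < 4; ++c)
        std::printf("category %d: %d\n", c, counts[c]);   // category 1 has weight 0 and stays at 0
      return 0;
    }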
@@ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
                                  int64_t* dest,
                                  int64_t distributions,
                                  int categories,
-                                 T* normDistPrefixSum) {
+                                 T* normDistPrefixSum,
+                                 T* normDist) {
   // At the moment, each warp computes one sample value in the binary
   // search due to divergence. It seems possible to compute multiple
   // values and limit divergence though later on. However, no matter
@@ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
         // Find the bucket that a uniform sample lies in
         int choice = binarySearchForMultinomial<T>(
           normDistPrefixSum + curDist * categories,
+          normDist + curDist * categories,
           categories,
           r);
 
@@ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
       // Find the bucket that a uniform sample lies in
       int choice = binarySearchForMultinomial<T>(
         normDistPrefixSum + curDist * categories,
+        origDist + curDist * categories,
         categories,
         r);
 
@@ -15,17 +15,17 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct ThrustGTOp {
   __device__ bool operator()(const T& lhs, const T& rhs) const {
-    return THCNumerics<T>::gt(lhs, rhs);
+    return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs);
   }
 };
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct ThrustLTOp {
   __device__ bool operator()(const T& lhs, const T& rhs) const {
-    return THCNumerics<T>::lt(lhs, rhs);
+    return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs);
  }
 };
 
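These comparators take the NaN handling as a compile-time template flag, so the common path pays nothing for it; with handleNaN set, NaNs compare larger than every ordinary value in the ascending ("LT") op and smaller in the descending ("GT") op, mirroring the CPU GT_OR_NAN change above. A host-side analogue of the same pattern (illustrative only, plain std::isnan instead of THCNumerics):

    // Illustrative host-side analogue of the handleNaN template flag.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    template <typename T, bool handleNaN = false>
    struct LTOp {
      bool operator()(const T& lhs, const T& rhs) const {
        // With handleNaN, a non-NaN lhs is "less than" a NaN rhs, so NaNs sink to the end.
        return (handleNaN && std::isnan(rhs) && !std::isnan(lhs)) || (lhs < rhs);
      }
    };

    int main() {
      std::vector<double> v = {2.0, NAN, 1.0, 3.0};
      std::sort(v.begin(), v.end(), LTOp<double, true>());
      for (double x : v) std::printf("%g ", x);   // 1 2 3 nan
      std::printf("\n");
      return 0;
    }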
@@ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
 void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
 {
 #ifdef USE_MAGMA
-  THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
-  THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional");
-  THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square");
-  THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible");
-
   int64_t n = a_->size(0);
   int64_t nrhs = b_->size(1);
 
@@ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
       THArgCheck(false, 2, CUTORCH_DIM_WARNING);
     }
   } else {
-    THCTensor_(resizeAs)(state, self_, src_);
 
     if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) {
       THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state,
           n_sample,
           THCudaLongTensor_data(state, self),
           numDist, numCategories,
-          THCTensor_(data)(state, prefixSum));
+          THCTensor_(data)(state, prefixSum),
+          THCTensor_(data)(state, normDist));
     } else {
       // Sample without replacement
 
@@ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
     dim3 block(blockSize);                                              \
                                                                         \
     if (dir) {                                                          \
-      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \
+      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \
         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
           keyInfo,                                                      \
           keySlices,                                                    \
@@ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
           (TYPE) keyInfo.strides[collapseKeyDim],                       \
           valueInfo,                                                    \
           (TYPE) valueInfo.strides[collapseValueDim],                   \
-          GTComp<scalar_t>());                                              \
+          GTComp<scalar_t, true>());                                    \
     } else {                                                            \
-      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \
+      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \
         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
           keyInfo,                                                      \
           keySlices,                                                    \
@@ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
           (TYPE) keyInfo.strides[collapseKeyDim],                       \
           valueInfo,                                                    \
           (TYPE) valueInfo.strides[collapseValueDim],                   \
-          LTComp<scalar_t>());                                              \
+          LTComp<scalar_t, true>());                                              \
     }                                                                   \
   } while (0)
 
@@ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state,
 #if CUDA_VERSION >= 7000
       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 #endif
-      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>());
+      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>());
   } else {
     thrust::stable_sort_by_key(
 #if CUDA_VERSION >= 7000
       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 #endif
-      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>());
+      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>());
   }
 
   // Then, re-sort according to slice that each index is
 c10/Half.h | 12

@@ -383,6 +383,14 @@ struct Converter<
   }
 };
 
+// In some versions of MSVC, there will be a compiler error when building.
+// C4146: unary minus operator applied to unsigned type, result still unsigned
+// It can be addressed by disabling the following warning.
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable : 4146 )
+#endif
+
 // skip isnan and isinf check for integral types
 template <typename To, typename From>
 typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
@@ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
   }
 }
 
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
 template <typename To, typename From>
 typename std::enable_if<std::is_floating_point<From>::value, bool>::type
 overflows(From f) {
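The push/disable/pop bracket keeps the warning suppression scoped to just these overflow helpers rather than the whole translation unit. A tiny self-contained illustration of the pattern, using an assumed example expression rather than the file's actual code:

    // Illustrative: silencing C4146 only around code that negates an unsigned value on purpose.
    #include <cstdint>
    #include <cstdio>

    #ifdef _MSC_VER
    #pragma warning( push )
    #pragma warning( disable : 4146 )  // unary minus operator applied to unsigned type
    #endif

    // Two's-complement trick: -u wraps to 2^64 - u.
    static uint64_t negate_wrapped(uint64_t u) {
      return -u;   // the kind of expression some MSVC versions flag as C4146 (per the comment above)
    }

    #ifdef _MSC_VER
    #pragma warning( pop )
    #endif

    int main() {
      std::printf("%llu\n", static_cast<unsigned long long>(negate_wrapped(1)));  // 18446744073709551615
      return 0;
    }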
@@ -11,9 +11,11 @@ using c10::intrusive_ptr_target;
 using c10::make_intrusive;
 using c10::weak_intrusive_ptr;
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wunknown-warning-option"
 #pragma GCC diagnostic ignored "-Wself-move"
+#endif
 
 namespace {
 class SomeClass0Parameters : public intrusive_ptr_target {};
@@ -25,7 +25,7 @@ Error::Error(
 // Caffe2-style error message
 Error::Error(
     const char* file,
-    const int line,
+    const uint32_t line,
     const char* condition,
     const std::string& msg,
     const std::string& backtrace,
@@ -49,7 +49,7 @@ class C10_API Error : public std::exception {
   Error(SourceLocation source_location, const std::string& msg);
   Error(
       const char* file,
-      const int line,
+      const uint32_t line,
       const char* condition,
       const std::string& msg,
       const std::string& backtrace,
@@ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e);
 // TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if
 // not met.
 
+// In the debug build with MSVC, __LINE__ might be of long type (a.k.a int32_t),
+// which is different from the definition of `SourceLocation` that requires
+// unsigned int (a.k.a uint32_t) and may cause a compile error with the message:
+// error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion
+// Here the static cast is used to pass the build.
+
 #define AT_ERROR(...) \
-  throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
+  throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
 
 #define AT_WARN(...) \
-  ::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
+  ::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
 
 #define AT_ASSERT(cond)                       \
   if (!(cond)) {                              \
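The narrowing error only appears because SourceLocation is brace-initialized: list-initialization forbids an implicit signed-to-unsigned narrowing even for a small positive line number, and the explicit cast makes the conversion well-formed. A minimal reproduction of the pattern with a hypothetical stand-in struct (illustrative only):

    // Illustrative: brace-init narrowing from a signed line value to uint32_t.
    #include <cstdint>
    #include <cstdio>

    struct SourceLocation {        // stand-in for the real c10::SourceLocation
      const char* function;
      const char* file;
      uint32_t line;
    };

    int main() {
      long line = 42;              // assume the line number arrives as a (signed) long
      // SourceLocation loc{__func__, __FILE__, line};                  // ill-formed: narrowing in a braced list
      SourceLocation loc{__func__, __FILE__, static_cast<uint32_t>(line)};  // explicit cast compiles everywhere
      std::printf("%s:%u\n", loc.file, loc.line);
      return 0;
    }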
@@ -17,9 +17,10 @@
 #include <utility>
 #include <type_traits>
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
+#endif
 #ifdef _MSC_VER
 #define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
 #else
@@ -1457,4 +1458,6 @@ namespace ska
 
 } // end namespace ska
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic pop
+#endif
@@ -72,18 +72,27 @@ class C10_API intrusive_ptr_target {
 // We also have to disable -Wunknown-warning-option and -Wpragmas, because
 // some other compilers don't know about -Wterminate or -Wexceptions and
 // will show a warning about unknown warning options otherwise.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wunknown-warning-option"
-#pragma GCC diagnostic ignored "-Wterminate"
-#pragma GCC diagnostic ignored "-Wexceptions"
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable: 4297) // function assumed not to throw an exception but does
+#else
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wpragmas"
+#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#  pragma GCC diagnostic ignored "-Wterminate"
+#  pragma GCC diagnostic ignored "-Wexceptions"
+#endif
     AT_ASSERTM(
         refcount_.load() == 0,
         "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it");
     AT_ASSERTM(
         weakcount_.load() == 0,
         "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
-#pragma GCC diagnostic pop
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#else
+#  pragma GCC diagnostic pop
+#endif
   }
 
   constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
@@ -430,12 +430,16 @@ class C10_API TypeMeta {
     // variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be
     // disabled for compilers that don't know '-Wundefined-var-template' and
     // would error at our attempt to disable it.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wunknown-warning-option"
-#pragma GCC diagnostic ignored "-Wundefined-var-template"
+#ifndef _MSC_VER
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wpragmas"
+#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#  pragma GCC diagnostic ignored "-Wundefined-var-template"
+#endif
     return TypeMeta(_typeMetaDataInstance<T>());
-#pragma GCC diagnostic pop
+#ifndef _MSC_VER
+#  pragma GCC diagnostic pop
+#endif
   }
 
  private:
@@ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY)
   else()
     target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
   endif()
-
-  #cmake only check for separate OpenMP library on AppleClang 7+
-  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
-  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
-        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
-      target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY})
-    endif()
-  endif()
 endif()
 
 target_link_libraries(caffe2 PUBLIC c10)
 target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
 target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
@@ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
 # Set standard properties on the target
 torch_set_target_props(caffe2)
 
-if (MSVC)
-target_compile_options(caffe2 INTERFACE "-std=c++11")
-else()
-target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
+if (NOT MSVC)
+  target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
 endif()
 
 target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
@@ -93,7 +93,7 @@ using std::vector;
 #define CAFFE2_NORETURN __attribute__((noreturn))
 #endif
 
-#if defined(_MSC_VER)
+#if (defined _MSC_VER && !defined NOMINMAX)
 #define NOMINMAX
 #endif
 
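Guarding the define matters because NOMINMAX may already be set by the build system or another header, and redefining it provokes a macro-redefinition warning; the macro itself exists to keep windows.h from defining min/max as function-like macros that break std::min/std::max. A small sketch of that interaction (illustrative; the Windows-specific behavior is only exercised when building with windows.h):

    // Illustrative: why NOMINMAX is defined (guarded) before including windows.h.
    #include <algorithm>
    #include <cstdio>

    #if (defined _MSC_VER && !defined NOMINMAX)
    #define NOMINMAX            // keep windows.h from defining min()/max() macros
    #endif
    #ifdef _WIN32
    #include <windows.h>        // without NOMINMAX, std::max(a, b) below can fail to parse
    #endif

    int main() {
      int a = 3, b = 7;
      std::printf("%d\n", std::max(a, b));   // 7; (std::max)(a, b) is the usual workaround otherwise
      return 0;
    }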
@@ -1,5 +1,8 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 from caffe2.proto import caffe2_pb2
+import os
+import sys
+import platform
 # TODO: refactor & remove the following alias
 caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU
 caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA
@@ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
 caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
 caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
 caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
+
+if platform.system() == 'Windows':
+    IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ])
+
+    if IS_CONDA:
+        from ctypes import windll, c_wchar_p
+        from ctypes.wintypes import DWORD, HMODULE
+
+        AddDllDirectory = windll.kernel32.AddDllDirectory
+        AddDllDirectory.restype = DWORD
+        AddDllDirectory.argtypes = [c_wchar_p]
+
+    def add_extra_dll_dir(extra_dll_dir):
+        if os.path.isdir(extra_dll_dir):
+            os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH']
+
+            if IS_CONDA:
+                AddDllDirectory(extra_dll_dir)
+
+    # first get nvToolsExt PATH
+    def get_nvToolsExt_path():
+        NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')
+
+        if os.path.exists(NVTOOLEXT_HOME):
+            return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
+        else:
+            return ''
+
+    py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin')
+    th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
+    th_dll_path = os.path.join(th_root, 'lib')
+
+    dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()]
+
+    # then add the path to env
+    for p in dll_paths:
+        add_extra_dll_dir(p)
@ -628,37 +628,12 @@ endif()
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# ---[ OpenMP
 | 
					# ---[ OpenMP
 | 
				
			||||||
if(USE_OPENMP)
 | 
					if(USE_OPENMP)
 | 
				
			||||||
  set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
 | 
					  find_package(OpenMP)
 | 
				
			||||||
  if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
 | 
					 | 
				
			||||||
    exec_program(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
 | 
					 | 
				
			||||||
    string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
 | 
					 | 
				
			||||||
    message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
 | 
					 | 
				
			||||||
    if(DARWIN_VERSION GREATER 9)
 | 
					 | 
				
			||||||
      set(APPLE_OPENMP_SUCKS 1)
 | 
					 | 
				
			||||||
    endif(DARWIN_VERSION GREATER 9)
 | 
					 | 
				
			||||||
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
 | 
					 | 
				
			||||||
      OUTPUT_VARIABLE GCC_VERSION)
 | 
					 | 
				
			||||||
    if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
 | 
					 | 
				
			||||||
      message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
 | 
					 | 
				
			||||||
      message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
 | 
					 | 
				
			||||||
      add_compile_options(-Wno-unknown-pragmas)
 | 
					 | 
				
			||||||
      set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
 | 
					 | 
				
			||||||
    endif()
 | 
					 | 
				
			||||||
  endif()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(WITH_OPENMP AND NOT CHECKED_OPENMP)
 | 
					 | 
				
			||||||
    find_package(OpenMP)
 | 
					 | 
				
			||||||
    set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
 | 
					 | 
				
			||||||
    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
 | 
					 | 
				
			||||||
    set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
 | 
					 | 
				
			||||||
  endif()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(OPENMP_FOUND)
 | 
					  if(OPENMP_FOUND)
 | 
				
			||||||
    message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
 | 
					    message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
 | 
				
			||||||
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 | 
					    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 | 
				
			||||||
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 | 
					    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 | 
				
			||||||
 | 
					    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 | 
				
			||||||
  else()
 | 
					  else()
 | 
				
			||||||
    message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
 | 
					    message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
 | 
				
			||||||
    caffe2_update_option(USE_OPENMP OFF)
 | 
					    caffe2_update_option(USE_OPENMP OFF)
 | 
				
			||||||
@@ -690,7 +665,12 @@ if(USE_CUDA)
       caffe2_update_option(USE_NVRTC OFF)
     endif()
     if(CAFFE2_USE_CUDNN)
-      list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
+      IF(CUDNN_STATIC_LINKAGE)
+        LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
+          caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
+      ELSE()
+        list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
+      ENDIF()
     else()
       caffe2_update_option(USE_CUDNN OFF)
     endif()
@@ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE)
     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
   ENDIF()
 
+  # OpenMP support?
+  SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+  IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+    EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+    STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+    MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+    IF (DARWIN_VERSION GREATER 9)
+      SET(APPLE_OPENMP_SUCKS 1)
+    ENDIF (DARWIN_VERSION GREATER 9)
+    EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+      OUTPUT_VARIABLE GCC_VERSION)
+    IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+      MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+      MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+      add_compile_options(-Wno-unknown-pragmas)
+      SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+    ENDIF()
+  ENDIF()
+
+  IF (WITH_OPENMP AND NOT CHECKED_OPENMP)
+    FIND_PACKAGE(OpenMP)
+    SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
+
+    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
+    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
+    SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
+  ENDIF()
+
+  IF (OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+  ENDIF()
+
+
   SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
 
   FIND_PACKAGE(MAGMA)
@@ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE)
     SET(AT_CUDA_ENABLED 0)
   else()
     SET(AT_CUDA_ENABLED 1)
-    find_package(CUDA 5.5 REQUIRED)
   endif()
 
   IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND)
@@ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE)
   SET(AT_MKLDNN_ENABLED 0)
   SET(CAFFE2_USE_MKLDNN OFF)
   IF (USE_MKLDNN)
-    FIND_PACKAGE(MKLDNN)
     INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake)
     IF(MKLDNN_FOUND)
       SET(AT_MKLDNN_ENABLED 1)
-      INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR})
+      INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR})
       IF(BUILD_CAFFE2_OPS)
         SET(CAFFE2_USE_MKLDNN ON)
         LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn)
@@ -2,7 +2,6 @@
 #
 # The following variables are optionally searched for defaults
 #  MKL_FOUND             : set to true if a library implementing the CBLAS interface is found
-#  USE_MKLDNN
 #
 # The following are set after configuration is done:
 #  MKLDNN_FOUND          : set to true if mkl-dnn is found.
@@ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND)
 SET(MKLDNN_LIBRARIES)
 SET(MKLDNN_INCLUDE_DIR)
 
-IF (NOT USE_MKLDNN)
-  RETURN()
-ENDIF(NOT USE_MKLDNN)
-
 IF(MSVC)
   MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far")
   RETURN()
@@ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR)
 LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR})
 
 IF(MKL_FOUND)
+  # Append to mkldnn dependencies
   LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES})
   LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR})
-  # The OMP-related variables of MKL-DNN have to be overwritten here,
-  # if MKL is used, and the OMP version is defined by MKL.
-  # MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL.
-  # INTEL_MKL_DIR gives the MKL root path.
-  IF (INTEL_MKL_DIR)
-    SET(MKLROOT ${INTEL_MKL_DIR})
-    IF(WIN32)
-      SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
-    ELSE(WIN32)
-      IF (MKL_LIBRARIES_gomp_LIBRARY)
-        SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY})
-      ELSE(MKL_LIBRARIES_gomp_LIBRARY)
-        SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY})
-      ENDIF(MKL_LIBRARIES_gomp_LIBRARY)
-      SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
-    ENDIF(WIN32)
-  ELSE(INTEL_MKL_DIR)
-    MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!")
-  ENDIF(INTEL_MKL_DIR)
-
 ELSE(MKL_FOUND)
   # If we cannot find MKL, we will use the Intel MKL Small library
   # comes with ${MKLDNN_ROOT}/external
@@ -75,60 +51,65 @@ ELSE(MKL_FOUND)
   ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external)
 
   FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h)
-  IF(MKLML_INNER_INCLUDE_DIR)
-    # if user has multiple version under external/ then guess last
-    # one alphabetically is "latest" and warn
-    LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
-    IF(MKLINCLEN GREATER 1)
-      LIST(SORT MKLML_INNER_INCLUDE_DIR)
-      LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
-      LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
-      SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
-    ENDIF(MKLINCLEN GREATER 1)
-    GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
-    LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
-
-    IF(APPLE)
-      SET(__mklml_inner_libs mklml iomp5)
-    ELSE(APPLE)
-      SET(__mklml_inner_libs mklml_intel iomp5)
-    ENDIF(APPLE)
-
-    FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
-      STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
-      FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
-            NAMES ${__mklml_inner_lib}
-            PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
-            DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
-      MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
-      LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
-    ENDFOREACH(__mklml_inner_lib)
-  ENDIF(MKLML_INNER_INCLUDE_DIR)
-
+  IF(NOT MKLML_INNER_INCLUDE_DIR)
+    MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
+    RETURN()
+  ENDIF(NOT MKLML_INNER_INCLUDE_DIR)
+  # if user has multiple version under external/ then guess last
+  # one alphabetically is "latest" and warn
+  LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
+  IF(MKLINCLEN GREATER 1)
+    LIST(SORT MKLML_INNER_INCLUDE_DIR)
+    LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
+    LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
+    SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
+  ENDIF(MKLINCLEN GREATER 1)
+  GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
+  LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
+
+  IF(APPLE)
+    SET(__mklml_inner_libs mklml iomp5)
+  ELSE(APPLE)
+    SET(__mklml_inner_libs mklml_intel iomp5)
+  ENDIF(APPLE)
+  FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
+    STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
+    FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
+          NAMES ${__mklml_inner_lib}
+          PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
+          DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
+    MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
+    IF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
+      MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
+      RETURN()
+    ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
+    LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
+  ENDFOREACH(__mklml_inner_lib)
 ENDIF(MKL_FOUND)
 
-LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES)
-LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR)
-INCLUDE(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for})
-
-IF(MKLDNN_FOUND)
-  IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
-    ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized)
-  ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
-  SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE)
-  SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE)
-  ADD_SUBDIRECTORY(${MKLDNN_ROOT})
-  SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}")
-  IF(WIN32)
-    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}")
-  ELSE(WIN32)
-    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}")
-  ENDIF(WIN32)
-ELSE(MKLDNN_FOUND)
-  MESSAGE(STATUS "MKLDNN source files not found!")
-ENDIF(MKLDNN_FOUND)
-
-UNSET(__mklml_inner_libs)
-UNSET(__mkldnn_looked_for)
+IF(MKL_FOUND)
+  SET(MKL_cmake_included TRUE)
+  SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE)
+ENDIF(MKL_FOUND)
+SET(WITH_TEST FALSE CACHE BOOL "" FORCE)
+SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE)
+SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
+ADD_SUBDIRECTORY(${MKLDNN_ROOT})
+IF(NOT TARGET mkldnn)
+  MESSAGE("Failed to include MKL-DNN target")
+  RETURN()
+ENDIF(NOT TARGET mkldnn)
+IF(MKL_FOUND)
+  TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL)
+ENDIF(MKL_FOUND)
+IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow)
+ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
+LIST(APPEND MKLDNN_LIBRARIES mkldnn)
+
+SET(MKLDNN_FOUND TRUE)
+MESSAGE(STATUS "Found MKL-DNN: TRUE")
 
 ENDIF(NOT MKLDNN_FOUND)
@@ -9,6 +9,12 @@ endif()
 # release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache.
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix)
 
+# we dont want to statically link cudart, because we rely on it's dynamic linkage in
+# python (follow along torch/cuda/__init__.py and usage of cudaGetErrorName).
+# Technically, we can link cudart here statically, and link libtorch_python.so
+# to a dynamic libcudart.so, but that's just wasteful
+SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
+
 # Find CUDA.
 find_package(CUDA 7.0)
 if(NOT CUDA_FOUND)
@@ -89,6 +95,9 @@ endif()
 
 if(DEFINED ENV{CUDNN_LIBRARY})
   set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY})
+  if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a")
+    SET(CUDNN_STATIC_LINKAGE ON)
+  endif()
 else()
   find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
     HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
@@ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN)
         "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
   endif()
   message(STATUS "Found cuDNN: v${CUDNN_VERSION}  (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})")
+  if(CUDNN_VERSION VERSION_LESS "7.0.0")
+    message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
+  endif()
 endif()
 
 # ---[ CUDA libraries wrapper
@@ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA)
     set_property(
         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt)
+        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl)
 else()
     set_property(
         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
docs/source/community/contribution_guide.rst (new file, 917 lines)
@@ -0,0 +1,917 @@
PyTorch Contribution Guide
==========================

PyTorch is a GPU-accelerated Python tensor computation package for
building deep neural networks built on tape-based autograd systems.

The PyTorch Contribution Process
--------------------------------

The PyTorch organization is governed by `PyTorch
Governance </docs/community/governance.html>`__.

The PyTorch development process involves a healthy amount of open
discussions between the core development team and the community.

PyTorch operates similarly to most open source projects on GitHub.
However, if you've never contributed to an open source project before,
here is the basic process.

-  **Figure out what you're going to work on.** The majority of open
   source contributions come from people scratching their own itches.
   However, if you don't know what you want to work on, or are just
   looking to get more acquainted with the project, here are some tips
   for how to find appropriate tasks:

   -  Look through the `issue
      tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if
      there are any issues you know how to fix. Issues that are
      confirmed by other contributors tend to be better to investigate.
      We also maintain some labels for issues which are likely to be
      good for new people, e.g., **bootcamp** and **1hr**, although
      these labels are less well maintained.
   -  Join us on Slack and let us know you're interested in getting to
      know PyTorch. We're very happy to help out researchers and
      partners get up to speed with the codebase.

-  **Figure out the scope of your change and reach out for design
   comments on a GitHub issue if it's large.** The majority of pull
   requests are small; in that case, no need to let us know about what
   you want to do, just get cracking. But if the change is going to be
   large, it's usually a good idea to get some design comments about it
   first.

   -  If you don't know how big a change is going to be, we can help you
      figure it out! Just post about it on issues or Slack.
   -  Some feature additions are very standardized; for example, lots of
      people add new operators or optimizers to PyTorch. Design
      discussion in these cases boils down mostly to, “Do we want this
      operator/optimizer?” Giving evidence for its utility, e.g., usage
      in peer reviewed papers, or existence in other frameworks, helps a
      bit when making this case.
   -  Core changes and refactors can be quite difficult to coordinate,
      as the pace of development on PyTorch master is quite fast.
      Definitely reach out about fundamental or cross-cutting changes;
      we can often give guidance about how to stage such changes into
      more easily reviewable pieces.

-  **Code it out!**

   -  See the technical guide for advice on working with PyTorch in a
      technical form.

-  **Open a pull request.**

   -  If you are not ready for the pull request to be reviewed, tag it
      with [WIP]. We will ignore it when doing review passes. If you are
      working on a complex change, it's good to start things off as WIP,
      because you will need to spend time looking at CI results to see
      if things worked out or not.
   -  Find an appropriate reviewer for your change. We have some folks
      who regularly go through the PR queue and try to review
      everything, but if you happen to know who the maintainer for a
      given subsystem affected by your patch is, feel free to include
      them directly on the pull request. You can learn more about this
      structure at PyTorch Subsystem Ownership.

-  **Iterate on the pull request until it's accepted!**

   -  We'll try our best to minimize the number of review roundtrips and
      block PRs only when there are major issues. For the most common
      issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__.
   -  Once a pull request is accepted and CI is passing, there is
      nothing else you need to do; we will merge the PR for you.

Getting Started
---------------

Proposing new features
~~~~~~~~~~~~~~~~~~~~~~

New feature ideas are best discussed on a specific issue. Please include
as much information as you can, any accompanying data, and your proposed
solution. The PyTorch team and community frequently review new issues
and comment where they think they can help. If you feel confident in
your solution, go ahead and implement it.

Reporting Issues
~~~~~~~~~~~~~~~~

If you've identified an issue, first search through the `list of
existing issues <https://github.com/pytorch/pytorch/issues>`__ on the
repo. If you are unable to find a similar issue, then create a new one.
Supply as much information as you can to reproduce the problematic
behavior. Also, include any additional insights like the behavior you
expect.

Implementing Features or Fixing Bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to fix a specific issue, it's best to comment on the
individual issue with your intent. However, we do not lock or assign
issues except in cases where we have worked with the developer before.
It's best to strike up a conversation on the issue and discuss your
proposed solution. The PyTorch team can provide guidance that saves you
time.

Issues that are labeled first-new-issue, low, or medium priority provide
the best entrance points and are great places to start.

Adding Tutorials
~~~~~~~~~~~~~~~~

A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__
come from the community itself and we welcome additional contributions.
To learn more about how to contribute a new tutorial, see the
`PyTorch.org Tutorial Contribution Guide on
GitHub <https://github.com/pytorch/tutorials/#contributing>`__.

Improving Documentation & Tutorials
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We aim to produce high quality documentation and tutorials. On rare
occasions that content may include typos or bugs. If you find something
you can fix, send us a pull request for consideration.

Take a look at the `Documentation <#on-documentation>`__ section to learn how our system
works.

Participating in online discussions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can find active discussions happening on the PyTorch Discussion
`forum <https://discuss.pytorch.org/>`__.

Submitting pull requests to fix open issues
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can view a list of all open issues
`here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an
issue is a great way to get the attention of the team. From here you can
share your ideas and how you plan to resolve the issue.

For more challenging issues, the team will provide feedback and
direction for how to best solve the issue.

If you're not able to fix the issue itself, commenting and sharing
whether you can reproduce the issue can be useful for helping the team
identify problem areas.

Reviewing open pull requests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We appreciate your help reviewing and commenting on pull requests. Our
team strives to keep the number of open pull requests at a manageable
size; we respond quickly when we need more information, and we merge PRs
that we think are useful. However, due to the high level of interest,
additional eyes on pull requests are appreciated.

Improving code readability
~~~~~~~~~~~~~~~~~~~~~~~~~~

Improving code readability helps everyone. It is often better to submit a
small number of pull requests that touch few files versus a large pull
request that touches many files. Starting a discussion in the PyTorch
forum `here <https://discuss.pytorch.org/>`__ or on an issue related to
your improvement is the best way to get started.

Adding test cases to make the codebase more robust
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Additional test coverage is appreciated.

Promoting PyTorch
~~~~~~~~~~~~~~~~~

Your use of PyTorch in your projects, research papers, write ups, blogs,
or general discussions around the internet helps to raise awareness for
PyTorch and our growing community. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

Triaging issues
~~~~~~~~~~~~~~~

If you feel that an issue could benefit from a particular tag or level
of complexity, comment on the issue and share your opinion. If you feel
an issue isn't categorized properly, comment and let the team know.

About open source development
-----------------------------

If this is your first time contributing to an open source project, some
aspects of the development process may seem unusual to you.

-  **There is no way to “claim” issues.** People often want to “claim”
   an issue when they decide to work on it, to ensure that there isn't
   wasted work when someone else ends up working on it. This doesn't
   really work too well in open source, since someone may decide to work
   on something, and end up not having time to do it. Feel free to give
   information in an advisory fashion, but at the end of the day, we
   will take running code and rough consensus.
-  **There is a high bar for new functionality that is added.** Unlike
   in a corporate environment, where the person who wrote code
   implicitly “owns” it and can be expected to take care of it in the
   beginning of its lifetime, once a pull request is merged into an open
   source project, it immediately becomes the collective responsibility
   of all maintainers on the project. When we merge code, we are saying
   that we, the maintainers, are able to review subsequent changes and
   make a bugfix to the code. This naturally leads to a higher standard
   of contribution.

Common Mistakes To Avoid
------------------------

-  **Did you add tests?** (Or if the change is hard to test, did you
   describe how you tested your change?)

   -  We have a few motivations for why we ask for tests:

      1. to help us tell if we break it later
      2. to help us tell if the patch is correct in the first place
         (yes, we did review it, but as Knuth says, “beware of the
         following code, for I have not run it, merely proven it
         correct”)

   -  When is it OK not to add a test? Sometimes a change can't be
      conveniently tested, or the change is so obviously correct (and
      unlikely to be broken) that it's OK not to test it. On the
      contrary, if a change seems likely (or is known to be likely) to
      be accidentally broken, it's important to put in the time to work
      out a testing strategy.

-  **Is your PR too long?**

   -  It's easier for us to review and merge small PRs. Difficulty of
      reviewing a PR scales nonlinearly with its size.
   -  When is it OK to submit a large PR? It helps a lot if there was a
      corresponding design discussion in an issue, with sign off from
      the people who are going to review your diff. We can also help
      give advice about how to split up a large change into individually
      shippable parts. Similarly, it helps if there is a complete
      description of the contents of the PR: it's easier to review code
      if we know what's inside!

-  **Comments for subtle things?** In cases where behavior of your code
   is nuanced, please include extra comments and documentation to allow
   us to better understand the intention of your code.
-  **Did you add a hack?** Sometimes a hack is the right answer. But
   usually we will have to discuss it.
-  **Do you want to touch a very core component?** In order to prevent
   major regressions, pull requests that touch core components receive
   extra scrutiny. Make sure you've discussed your changes with the team
   before undertaking major changes.
-  **Want to add a new feature?** If you want to add new features,
   comment your intention on the related issue. Our team tries to
   comment on and provide feedback to the community. It's better to have
   an open discussion with the team and the rest of the community prior
   to building new features. This helps us stay aware of what you're
   working on and increases the chance that it'll be merged.
-  **Did you touch code unrelated to the PR?** To aid in code review,
   please only include files in your pull request that are directly
   related to your changes.

Frequently asked questions
--------------------------

-  **How can I contribute as a reviewer?** There is lots of value if
   community developers reproduce issues, try out new functionality, or
   otherwise help us identify or troubleshoot issues. Commenting on
   tasks or pull requests with your environment details is helpful and
   appreciated.
-  **CI tests failed, what does it mean?** Maybe you need to merge with
   master or rebase with the latest changes (see the sketch after this
   list). Pushing your changes should re-trigger CI tests. If the
   failures persist, you'll want to trace through the error messages and
   resolve the related issues.
-  **What are the most high risk changes?** Anything that touches build
   configuration is a risky area. Please avoid changing these unless
   you've had a discussion with the team beforehand.
-  **Hey, a commit showed up on my branch, what's up with that?**
   Sometimes another community member will provide a patch or fix to
   your pull request or branch. This is often needed for getting CI tests
   to pass.
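
A minimal sketch of bringing a branch up to date before re-running CI
(this assumes your clone has the main repository configured as a remote
named ``upstream``; the branch name below is a placeholder):

::

    # fetch the latest master and replay your work on top of it
    git fetch upstream
    git rebase upstream/master

    # push the rebased branch to your fork to re-trigger CI
    git push --force-with-lease origin my-feature-branch
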
On Documentation
----------------

Python Docs
~~~~~~~~~~~

PyTorch documentation is generated from python source using
`Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is
copied to the docs folder in the master branch of
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
and is served via GitHub pages.

-  Site: http://pytorch.org/docs
-  GitHub: http://github.com/pytorch/pytorch/docs
-  Served from:
   `https://github.com/pytorch/pytorch.github.io/tree/master/doc <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
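
As a rough sketch, the Python docs can also be previewed locally from
the ``docs/`` folder of the repository (this assumes the Sphinx
requirements file and ``Makefile`` in that folder are present in your
checkout):

::

    cd docs
    pip install -r requirements.txt
    # build the HTML docs (output typically lands under build/html)
    make html
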
C++ Docs
~~~~~~~~

For C++ code we use Doxygen to generate the content files. The C++ docs
are built on a special server and the resulting files are copied to the
https://github.com/pytorch/cppdocs repo, and are served from GitHub
pages.

-  Site: http://pytorch.org/cppdocs
-  GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp
-  Served from: https://github.com/pytorch/cppdocs

Tutorials
---------

PyTorch tutorials are documents used to help understand using PyTorch to
accomplish specific tasks or to understand more holistic concepts.
Tutorials are built using
`Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__
from executable Python source files, or from reStructuredText (rst)
files.

-  Site: http://pytorch.org/tutorials
-  GitHub: http://github.com/pytorch/tutorials

Tutorials Build Overview
~~~~~~~~~~~~~~~~~~~~~~~~

For tutorials, `pull
requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a
rebuild of the entire site using CircleCI to test the effects of the
change. This build is sharded into 9 worker builds and takes around 40
minutes total. At the same time, we do a Netlify build using *make
html-noplot*, which builds the site without rendering the notebook
output into pages for quick review.
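
For a quick local preview you can run the same *make html-noplot* build
yourself; a sketch, assuming a checkout of the tutorials repository with
its Python dependencies already installed:

::

    git clone https://github.com/pytorch/tutorials
    cd tutorials
    # build the site without executing the notebooks
    make html-noplot
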
After a PR is accepted, the site is rebuilt and deployed from CircleCI.

Contributing a new Tutorial
~~~~~~~~~~~~~~~~~~~~~~~~~~~

`PyTorch.org Tutorial Contribution
Guide <https://github.com/pytorch/tutorials/#contributing>`__

Code Style
~~~~~~~~~~

**Python style**

**C++ style**

Submitting a Pull Request
~~~~~~~~~~~~~~~~~~~~~~~~~

PyTorch development happens publicly on our GitHub repo.

To have your feature or fix added to PyTorch, please submit a Pull
Request.

Running Tests
~~~~~~~~~~~~~

Examples for running all of the tests, or just one individual test, are
sketched below.
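
A minimal sketch, assuming the standard ``test/`` layout of the
repository (the test case and method names below are placeholders):

::

    # run the whole Python test suite
    python test/run_test.py

    # run every test in a single file
    python test/test_nn.py

    # run one individual test method
    python test/test_torch.py TestTorch.test_some_method
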
Technical Process
-----------------

Developing PyTorch
~~~~~~~~~~~~~~~~~~

To develop PyTorch on your machine, here are some tips:

1. Uninstall all existing PyTorch installs:

::

    conda uninstall pytorch
    pip uninstall torch
    pip uninstall torch # run this command twice

2. Clone a copy of PyTorch from source:

::

    git clone https://github.com/pytorch/pytorch
    cd pytorch

3. Install PyTorch in ``build develop`` mode:

A full set of instructions on installing PyTorch from source is here:
https://github.com/pytorch/pytorch#from-source

The change you have to make is to replace

::

    python setup.py install

with

::

    python setup.py build develop

This is especially useful if you are only changing Python files.

This mode will symlink the Python files from the current local source
tree into the Python install.

Hence, if you modify a Python file, you do not need to reinstall PyTorch
again and again.

For example:

-  Install local PyTorch in ``build develop`` mode
-  modify your Python file ``torch/__init__.py`` (for example)
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality

You do not need to repeatedly install after modifying Python files.

In case you want to reinstall, make sure that you uninstall PyTorch
first by running ``pip uninstall torch`` and ``python setup.py clean``.
Then you can install in ``build develop`` mode again.
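
For example, a clean reinstall using the commands above could look like:

::

    pip uninstall torch
    python setup.py clean
    python setup.py build develop
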
Codebase structure
------------------

-  `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core
   library files that work everywhere, both server and mobile. We are
   slowly moving pieces from
   `ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
   here. This library is intended only to contain essential
   functionality, and appropriate to use in settings where binary size
   matters. (But you'll have a lot of missing functionality if you try
   to use it directly.)
-  `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++
   tensor library for PyTorch (no autograd support)

   -  `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__

      -  `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__
         `THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__
         `THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__
         `THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__
         - Legacy library code from the original Torch. Try not to add
         things here; we're slowly porting these to
         `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__.

         -  generic - Contains actual implementations of operators,
            parametrized over ``scalar_t``. Files here get compiled N
            times per supported scalar type in PyTorch.

      -  `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__

         -  `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
            - Core functionality of ATen. This is migrating to top-level
            c10 folder.
         -  `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__
            - Modern implementations of operators. If you want to write
            a new operator, here is where it should go. Most CPU
            operators go in the top level directory, except for
            operators which need to be compiled specially; see cpu
            below.

            -  `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__
               - Not actually CPU implementations of operators, but
               specifically implementations which are compiled with
               processor-specific instructions, like AVX. See the
               `README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__
               for more details.
            -  `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__
               - CUDA implementations of operators.
            -  `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__
               - CPU and CUDA implementations of COO sparse tensor
               operations
            -  `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__
               `mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__
               `miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__
               `cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__

               -  implementations of operators which simply bind to some
                  backend library.

-  `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ -
   The actual PyTorch library. Everything that is not in
   `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
   is a Python module, following the PyTorch Python frontend module
   structure.

   -  `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
      - C++ files composing the PyTorch library. Files in this directory
      tree are a mix of Python binding code, and C++ heavy lifting.
      Consult ``setup.py`` for the canonical list of Python binding
      files; conventionally, they are often prefixed with ``python_``.

      -  `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__
         - Compiler and frontend for TorchScript JIT frontend.
      -  `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__
         - Implementation of reverse-mode automatic differentiation.
      -  `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__
         - The PyTorch C++ frontend.
      -  `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__
         - Distributed training support for PyTorch.

-  `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ -
   Code generation scripts for the PyTorch library. See
   `README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__
   of this directory for more details.
-  `test <https://github.com/pytorch/pytorch/blob/master/tests>`__ -
   Python unit tests for PyTorch Python frontend.

   -  `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__
      - Basic tests for PyTorch functionality.
   -  `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__
      - Tests for non-NN automatic differentiation support.
   -  `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__
      - Tests for NN operators and their automatic differentiation.
   -  `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__
      - Tests for the JIT compiler and TorchScript.
   -  ...
   -  `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__
      - C++ unit tests for PyTorch C++ frontend.
   -  `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__
      - Automatically generated "expect" files which are used to compare
      against expected output.
   -  `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__
      - Tests for ONNX export functionality, using both PyTorch and
      Caffe2.

-  `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ -
   The Caffe2 library.

   -  `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__
      - Core files of Caffe2, e.g., tensor, workspace, blobs, etc.
			||||||
 | 
					   -  `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__
 | 
				
			||||||
 | 
					      - Operators of Caffe2.
 | 
				
			||||||
 | 
					   -  `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__
 | 
				
			||||||
 | 
					      - Python bindings to Caffe2.
 | 
				
			||||||
 | 
					   -  ...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
Unit Testing
------------

PyTorch's testing is located under ``test/``. Run the entire test suite
with

::

    python test/run_test.py

or run individual test files, like ``python test/test_nn.py``, for
individual test suites.

Better local unit tests with pytest
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We don't officially support ``pytest``, but it works well with our
``unittest`` tests and offers a number of useful features for local
development. Install it via ``pip install pytest``.

If you want to just run tests that contain a specific substring, you can
use the ``-k`` flag:

::

    pytest test/test_nn.py -k Loss -v

The above is an example of testing a change to Loss functions: this
command runs tests such as ``TestNN.test_BCELoss`` and
``TestNN.test_MSELoss`` and can be useful to save keystrokes.

Writing documentation
---------------------

PyTorch uses `Google
style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__
for formatting docstrings. Each line inside a docstring block must be
limited to 80 characters so that it fits into Jupyter documentation popups.

For C++ documentation (https://pytorch.org/cppdocs), we use
`Doxygen <http://www.doxygen.nl/>`__ and then convert it to
`Sphinx <http://www.sphinx-doc.org/>`__ via
`Breathe <https://github.com/michaeljones/breathe>`__ and
`Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen
reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__
for more information on the documentation syntax. To build the
documentation locally, ``cd`` into ``docs/cpp`` and then run ``make html``.

We run Doxygen in CI (Travis) to verify that you do not use invalid
Doxygen commands. To run this check locally, run ``./check-doxygen.sh``
from inside ``docs/cpp``.

Managing multiple build trees
-----------------------------

One downside to using ``python setup.py develop`` is that your
development version of PyTorch will be installed globally on your
account (e.g., if you run ``import torch`` anywhere else, the
development version will be used).

If you want to manage multiple builds of PyTorch, you can make use of
`conda environments <https://conda.io/docs/using/envs.html>`__ to
maintain separate Python package environments, each of which can be tied
to a specific build of PyTorch. To set one up:

::

    conda create -n pytorch-myfeature
    source activate pytorch-myfeature
    # if you run python now, torch will NOT be installed
    python setup.py build develop

C++ Development tips
--------------------

If you are working on the C++ code, there are a few important things
that you will want to keep in mind:

1. How to rebuild only the code you are working on.
2. How to make rebuilds in the absence of changes go faster.

Build only what you need.
~~~~~~~~~~~~~~~~~~~~~~~~~

``python setup.py build`` will build everything, but since our build
system is not very optimized for incremental rebuilds, this will
actually be very slow. Far better is to only request rebuilds of the
parts of the project you are working on:

-  Working on the Python bindings? Run ``python setup.py develop`` to
   rebuild (NB: no ``build`` here!)
-  Working on ``torch/csrc`` or ``aten``? Run
   ``python setup.py rebuild_libtorch`` to rebuild and avoid having to
   rebuild the other libraries we depend on.
-  Working on one of the other dependent libraries? The other valid
   targets are listed in ``dep_libs`` in ``setup.py``. Prepend
   ``build_`` to get a target, and run it as, e.g.,
   ``python setup.py build_gloo``.
-  Working on a test binary? Run
   ``(cd build && ninja bin/test_binary_name)`` to rebuild only that
   test binary (without rerunning cmake). (Replace ``ninja`` with
   ``make`` if you don't have ninja installed.)

On the initial build, you can also speed things up with the environment
variables ``DEBUG`` and ``NO_CUDA``.

-  ``DEBUG=1`` will enable debug builds (-g -O0)
-  ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations
   (-g -O3)
-  ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing
   on something not CUDA related), to save compile time.

For example:

::

    NO_CUDA=1 DEBUG=1 python setup.py build develop

Make sure you continue to pass these flags on subsequent builds.

Code completion and IDE support
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using ``python setup.py develop``, PyTorch will generate a
``compile_commands.json`` file that can be used by many editors to
provide command completion and error highlighting for PyTorch's C++
code. You need to ``pip install ninja`` to generate accurate information
for the code in ``torch/csrc``. More information at:

-  https://sarcasm.github.io/notes/dev/compilation-database.html

Make no-op build fast.
~~~~~~~~~~~~~~~~~~~~~~

Use Ninja
~~~~~~~~~

Python ``setuptools`` is pretty dumb, and always rebuilds every C file
in a project. If you install the ninja build system with
``pip install ninja``, then PyTorch will use it to track dependencies
correctly. If PyTorch was already built, you will need to run
``python setup.py clean`` once after installing ninja for builds to
succeed.

Use CCache
~~~~~~~~~~

Even when dependencies are tracked with file modification times, there are
many situations where files get rebuilt even though a previous compilation
was exactly the same.

Using ccache in a situation like this is a real time-saver. However, by
default, ccache does not properly support CUDA, so here are the
instructions for installing a custom ccache fork that has CUDA support:

::

    # install and export ccache
    if ! ls ~/ccache/bin/ccache
    then
        sudo apt-get update
        sudo apt-get install -y automake autoconf
        sudo apt-get install -y asciidoc
        mkdir -p ~/ccache
        pushd /tmp
        rm -rf ccache
        git clone https://github.com/colesbury/ccache -b ccbin
        pushd ccache
        ./autogen.sh
        ./configure
        make install prefix=~/ccache
        popd
        popd

        mkdir -p ~/ccache/lib
        mkdir -p ~/ccache/cuda
        ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
        ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
        ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc

        ~/ccache/bin/ccache -M 25Gi
    fi

    export PATH=~/ccache/lib:$PATH
    export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc

CUDA Development tips
---------------------

If you are working on the CUDA code, here are some useful CUDA debugging
tips:

1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug
   symbols (``-g -G``). This will be particularly helpful in debugging
   device code. However, it will slow down the build process by about
   50% (compared to only ``DEBUG=1``), so use it wisely.
2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging
   friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a
   CUDA tensor (rather than all zeros).

Hope this helps, and thanks for considering contributing.

Windows development tips
------------------------

Occasionally, you will write a patch which works on Linux, but fails CI
on Windows. There are a few aspects in which MSVC (the Windows compiler
toolchain we use) is stricter than Linux, which are worth keeping in
mind when fixing these problems.

1. Symbols are NOT exported by default on Windows; instead, you have to
   explicitly mark a symbol as exported/imported in a header file with
   ``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have
   codified this pattern into a set of macros which follow the
   convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen.
   (Every separate shared library needs a unique macro name, because
   symbol visibility is on a per shared library basis. See
   c10/macros/Macros.h for more details.) The upshot is that if you see an
   "unresolved external" error in your Windows build, this is probably
   because you forgot to mark a function with ``*_API``. However, there
   is one important counterexample to this principle: if you want a
   *templated* function to be instantiated at the call site, do NOT mark
   it with ``*_API`` (if you do mark it, you'll have to explicitly
   instantiate all of the specializations used by the call sites.)
2. If you link against a library, this does not make its dependencies
   transitively visible. You must explicitly specify a link dependency
   against every library whose symbols you use. (This is different from
   Linux where in most environments, transitive dependencies can be used
   to fulfill unresolved symbols.)
3. If you have a Windows box (we have a few on EC2 which you can request
   access to) and you want to run the build, the easiest way is to just
   run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run
   ``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing
   away your Conda environment.)

Even if you don't know anything about MSVC, you can use cmake to build
simple programs on Windows; this can be helpful if you want to learn
more about some peculiar linking behavior by reproducing it on a small
example. Here's a simple example cmake file that defines two dynamic
libraries, one linking with the other:

::

    project(myproject CXX)
    set(CMAKE_CXX_STANDARD 11)
    add_library(foo SHARED foo.cpp)
    add_library(bar SHARED bar.cpp)
    # NB: don't forget to __declspec(dllexport) at least one symbol from foo,
    # otherwise foo.lib will not be created.
    target_link_libraries(bar PUBLIC foo)

You can build it with:

::

    mkdir build
    cd build
    cmake ..
    cmake --build .

Known MSVC (and MSVC with NVCC) bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The PyTorch codebase sometimes likes to use exciting C++ features, and
these exciting features lead to exciting bugs in Windows compilers. To
add insult to injury, the error messages will often not tell you which
line of code actually induced the erroring template instantiation. We've
found the most effective way to debug these problems is to carefully
read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few
well known pitfalls and workarounds:

-  This is not actually a bug per se, but in general, code generated by
   MSVC is more sensitive to memory errors; you may have written some
   code that does a use-after-free or overflows the stack; on Linux the code
   might work, but on Windows your program will crash. ASAN may not
   catch all of these problems: stay vigilant to the possibility that
   your crash is due to a real memory problem.
-  (NVCC) ``c10::optional`` does not work when used from device code.
   Don't use it from kernels. Upstream issue:
   https://github.com/akrzemi1/Optional/issues/58 and our local issue
   #10329.
-  ``constexpr`` generally works less well on MSVC.

   -  The idiom ``static_assert(f() == f())`` to test if ``f`` is
      constexpr does not work; you'll get "error C2131: expression did
      not evaluate to a constant". Don't use these asserts on Windows.
      (Example: ``c10/util/intrusive_ptr.h``)

-  (NVCC) Code you access inside a ``static_assert`` will eagerly be
   evaluated as if it were device code, and so you might get an error
   that the code is "not accessible".

::

    class A {
      static A singleton_;
      static constexpr inline A* singleton() {
        return &singleton_;
      }
    };
    static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm");

-  The compiler will run out of heap space if you attempt to compile
   files that are too large. Splitting such files into separate files
   helps. (Example: ``THTensorMath``, ``THTensorMoreMath``,
   ``THTensorEvenMoreMath``.)
-  MSVC's preprocessor (but not the standard compiler) has a bug where
   it incorrectly tokenizes raw string literals, ending them when it sees a
   ``"``. This causes preprocessor tokens inside the literal, like
   an ``#endif``, to be incorrectly treated as preprocessor directives.
   See https://godbolt.org/z/eVTIJq as an example.

Running Clang-Tidy
~~~~~~~~~~~~~~~~~~

`Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a
C++ linter and static analysis tool based on the clang compiler. We run
clang-tidy in our CI to make sure that new C++ code is safe, sane and
efficient. See our
`.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__
file for the simple commands we use for this. To run clang-tidy locally,
follow these steps:

1. Install clang-tidy. First, check if you already have clang-tidy by
   simply writing ``clang-tidy`` in your terminal. If you don't yet have
   clang-tidy, you should be able to install it easily with your package
   manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu.
   See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on
   how to install the latest version. Note that newer versions of
   clang-tidy will have more checks than older versions. In our CI, we
   run clang-tidy-6.0.
2. Use our driver script to run clang-tidy over any changes relative to
   some git revision (you may want to replace ``HEAD~1`` with ``HEAD``
   to pick up uncommitted changes). Changes are picked up based on a
   ``git diff`` with the given revision:

::

    python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'

Above, it is assumed you are in the PyTorch root folder. The argument to
``-d`` should be the path to where you built PyTorch from source,
e.g. ``build`` in the PyTorch root folder if you used
``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change
the clang-tidy binary this script uses. Make sure you have PyYaml installed,
which is in PyTorch's ``requirements.txt``.

Pre-commit Tidy/Linting Hook
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We use clang-tidy and flake8 to perform additional formatting and
semantic checking of code. We provide a pre-commit git hook for
performing these checks before a commit is created:

::

    ln -s ../../tools/git-pre-commit .git/hooks/pre-commit

Caffe2 notes
------------

In 2018, we merged Caffe2 into the PyTorch source repository. While the
steady state aspiration is that Caffe2 and PyTorch share code freely, in
the meantime there will be some separation. If you submit a PR to only
PyTorch or only Caffe2 code, CI will only run for the project you
edited. The logic for this is implemented in
``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can
look at these to see what path prefixes constitute changes. This also
means that if you ADD a new top-level path, or you start sharing code between
projects, you need to modify these files. There are a few "unusual"
directories which, for historical reasons, are Caffe2/PyTorch specific.
Here they are:

-  ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``,
   ``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code
   in them without extra coordination.
-  ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools``
   are PyTorch-specific. Don't put Caffe2 code in them without extra
   coordination.

154 docs/source/community/governance.rst Normal file
@ -0,0 +1,154 @@
PyTorch Governance
==========================

Governance Philosophy and Guiding Tenets
-----------------------------------------

PyTorch adopts a governance structure with a small set of maintainers
driving the overall project direction, with a strong bias towards
PyTorch's design philosophy, where design and code contributions are
valued. Beyond the core maintainers, there is also a slightly broader
set of core developers that have the ability to directly merge pull
requests and own various parts of the core code base.

Beyond the maintainers and core devs, the community is encouraged to
contribute, file issues, make proposals, review pull requests and be
present in the community. Given contributions and willingness to
invest, anyone can be provided write access or ownership of parts of
the codebase.

Based on this governance structure, the project has the following core
operating tenets by which decisions are made and overall culture is
derived:

1. **Code contributions** matter much more than corporate sponsorship,
   and independent developers are highly valued.
2. **Project influence** is gained through contributions (whether PRs,
   forum answers, code reviews or otherwise).

Key people and their functions
------------------------------

Project Maintainers
~~~~~~~~~~~~~~~~~~~

Project maintainers provide leadership and direction for the PyTorch
project. Specifics include:

-  Articulate a cohesive long-term vision for the project
-  Possess a deep understanding of the PyTorch code base
-  Negotiate and resolve contentious issues in ways acceptable to all
   parties involved

PyTorch Maintainers:

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__)

Core Developers
~~~~~~~~~~~~~~~

The PyTorch project is developed by a team of core developers. You can
find the list of core developers at `PyTorch Governance \| Persons of
Interest </docs/community/persons_of_interest.html>`__.

While membership is determined by presence in the "PyTorch core" team in
the "PyTorch"
`organization <https://github.com/orgs/pytorch/teams/facebook>`__ on
GitHub, contribution takes many forms:

-  committing changes to the repository;
-  reviewing pull requests by others;
-  triaging bug reports on the issue tracker;
-  discussing topics on official PyTorch communication channels.

Moderators
~~~~~~~~~~

There is a group of people, some of whom are not core developers,
responsible for ensuring that discussions on official communication
channels adhere to the Code of Conduct. They take action in view of
violations and help to support a healthy community. You can find the
list of moderators `here <https://discuss.pytorch.org/about>`__.

Decision Making
---------------

Uncontroversial Changes
~~~~~~~~~~~~~~~~~~~~~~~

Primary work happens through bug tracker issues and pull requests on
GitHub. Core developers should avoid pushing their changes directly to
the PyTorch repository, instead relying on pull requests. Approving a
pull request by a core developer allows it to be merged without further
process. Core Developers and Project Maintainers ultimately approve
these changes.

Notifying relevant experts about a bug tracker issue or a pull request
is important. Reviews from experts in the given interest area are
strongly preferred, especially on pull request approvals. Failure to do
so might end up with the change being reverted by the relevant expert.

Controversial decision process
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Substantial changes in a given interest area require a GitHub issue to
be opened for discussion. This includes:

-  Any semantic or syntactic change to the framework.
-  Backwards-incompatible changes to the Python or C++ API.
-  Additions to the core framework, including substantial new
   functionality within an existing library.
-  Removing core features.

Project Maintainers ultimately approve these changes.

FAQ
---

**Q: What if I would like to own (or partly own) a part of the project
such as a domain API (e.g., Torch Vision)?** This is absolutely possible.
The first step is to start contributing to the existing project area and
contributing to its health and success. In addition to this, you can
make a proposal through a GitHub issue for new functionality or changes
to improve the project area.

**Q: What if I am a company looking to use PyTorch internally for
development, can I be granted or purchase a board seat to drive the
project direction?** No, the PyTorch project is strictly driven by the
maintainer-driven project philosophy and does not have a board or
vehicle to take financial contributions relating to gaining influence
over technical direction.

**Q: Does the PyTorch project support grants or ways to support
independent developers using or contributing to the project?** No, not
at this point. We are however looking at ways to better support the
community of independent developers around PyTorch. If you have
suggestions or inputs, please reach out on the PyTorch forums to
discuss.

**Q: How do I contribute code to the project?** If the change is
relatively minor, a pull request on GitHub can be opened up immediately
for review and merge by the project committers. For larger changes,
please open an issue to make a proposal and discuss it beforehand. Please also
see the **`PyTorch Contributor
Guide </docs/community/contribution_guide.html>`__** for contribution
guidelines.

**Q: Can I become a committer on the project?** Unfortunately, the
current commit process to PyTorch involves an interaction with Facebook
infrastructure that can only be triggered by Facebook employees. We are
however looking at ways to expand the committer base to individuals
outside of Facebook and will provide an update when the tooling exists
to allow this.

**Q: What if I would like to deliver a PyTorch tutorial at a conference
or otherwise? Do I need to be 'officially' a committer to do this?** No,
we encourage community members to showcase their work wherever and
whenever they can. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

130 docs/source/community/persons_of_interest.rst Normal file
@ -0,0 +1,130 @@
PyTorch Governance | Persons of Interest
=========================================

General Maintainers
-------------------

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross
   (`colesbury <https://github.com/colesbury>`__)

Module-level maintainers
------------------------

JIT
~~~

-  Zach Devito (`zdevito <https://github.com/zdevito>`__)
-  Michael Suo (`suo <https://github.com/suo>`__)

Distributed
~~~~~~~~~~~

-  Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
-  Shen Li (`mrshenli <https://github.com/mrshenli>`__)
-  (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__)

Autograd Engine
~~~~~~~~~~~~~~~

-  Alban Desmaison (`alband <https://github.com/alband>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Multiprocessing and DataLoaders
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  Simon Wang (`SsnL <https://github.com/SsnL>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  (proposed) Vitaly Fedyunin
   (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)

CUDA
~~~~

-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

C++
~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  (sunsetting) Peter Goldsborough
   (`goldsborough <https://github.com/goldsborough>`__)

Build + CI
~~~~~~~~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  (sunsetting) Orion Reblitz-Richardson
   (`orionr <https://github.com/orionr>`__)

Distributions & RNG
~~~~~~~~~~~~~~~~~~~

-  Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__)
-  Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__)
-  Alican Bozkurt (`alicanb <https://github.com/alicanb>`__)
-  Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)

C10
~~~

-  Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)

ONNX <-> PyTorch
~~~~~~~~~~~~~~~~

-  Lu Fang (`houseroad <https://github.com/houseroad>`__)

torch.nn
~~~~~~~~

-  Thomas Viehmann (`t-vi <https://github.com/t-vi>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)

CPU Performance / SIMD
~~~~~~~~~~~~~~~~~~~~~~

-  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)
-  Richard Zou (`zou3519 <https://github.com/zou3519>`__)

AMD/ROCm/HIP
~~~~~~~~~~~~

-  Junjie Bai (`bddppq <https://github.com/bddppq>`__)
-  Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__)

Windows
~~~~~~~

-  Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)

MKLDNN
~~~~~~

-  Yinghai Lu (`yinghai <https://github.com/yinghai>`__)

XLA
~~~

-  Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
-  Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
-  Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
-  Alex Suhan (`asuhan <https://github.com/asuhan>`__)

PPC
~~~

-  Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)

@ -1,6 +1,101 @@
torch.hub
===================================
PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.

Publishing models
-----------------

PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights)
to a GitHub repository by adding a simple ``hubconf.py`` file.

``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with
the following signature:

::

    def entrypoint_name(pretrained=False, *args, **kwargs):
        ...

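Since a single ``hubconf.py`` can expose several entrypoints, a repository that
publishes two models simply defines two such functions side by side. The sketch
below is purely illustrative; the model names are hypothetical placeholders:

::

    # hubconf.py -- illustrative sketch with two entrypoints
    dependencies = ['torch']

    def my_small_model(pretrained=False, *args, **kwargs):
        ...

    def my_large_model(pretrained=False, *args, **kwargs):
        ...
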
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint
for the ``resnet18`` model. You can see the full script in the
`pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_

::

    dependencies = ['torch', 'math']

    def resnet18(pretrained=False, *args, **kwargs):
        """
        Resnet18 model
        pretrained (bool): a recommended kwarg for all entrypoints
        args & kwargs are arguments for the function
        """
        ######## Call the model in the repo ###############
        from torch.utils import model_zoo  # provides load_url for downloading checkpoints
        from torchvision.models.resnet import resnet18 as _resnet18
        model = _resnet18(*args, **kwargs)
        ######## End of call ##############################
        # The following logic is REQUIRED
        if pretrained:
            # For weights saved in the local repo
            # model.load_state_dict(<path_to_saved_file>)

            # For weights saved elsewhere
            checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
            model.load_state_dict(model_zoo.load_url(checkpoint, progress=False))
        return model

- ``dependencies`` variable is a **list** of package names required to run the model.
- Pretrained weights can either be stored locally in the GitHub repo, or loadable by
  ``model_zoo.load_url()``.
- ``pretrained`` controls whether to load the pre-trained weights provided by repo owners.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- Docstring of the function works as a help message, explaining what the model does
  and what the allowed arguments are.
- Entrypoint function should **ALWAYS** return a model (``nn.Module``).

Important Notice
^^^^^^^^^^^^^^^^

- The published models should be at least in a branch/tag. It can't be a random commit.

Loading models from Hub
-----------------------

Users can load the pre-trained models using the ``torch.hub.load()`` API.


.. automodule:: torch.hub
.. autofunction:: load

Here's an example of loading the ``resnet18`` entrypoint from the ``pytorch/vision`` repo.

::

    hub_model = hub.load(
        'pytorch/vision:master', # repo_owner/repo_name:branch
        'resnet18', # entrypoint
        1234, # args for callable [not applicable to resnet]
        pretrained=True) # kwargs for callable

Where are my downloaded model & weights saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The locations are used in the order of:

- ``hub_dir``: a user-specified path. It can be set in the following ways:

  - Setting the environment variable ``TORCH_HUB_DIR``
  - Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``

- ``~/.torch/hub``

.. autofunction:: set_dir

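For example, redirecting the hub cache before loading anything might look like
the following sketch; the path below is just an arbitrary writable directory
used for illustration:

::

    import torch.hub as hub

    # any writable directory works; this overrides the default ~/.torch/hub
    hub.set_dir('/tmp/torch_hub')
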
Caching logic
^^^^^^^^^^^^^

By default, we don't clean up files after loading them. Hub uses the cache by default if it already exists in ``hub_dir``.

Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
when updates are published to the same branch, so users can keep up with the latest release.

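Putting this together with the ``resnet18`` example above, a forced refresh is
just an extra keyword argument (a minimal sketch):

::

    import torch.hub as hub

    # discard the cached checkout and weights, then download them again
    model = hub.load('pytorch/vision:master', 'resnet18', pretrained=True,
                     force_reload=True)
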
@ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.

   notes/*

.. toctree::
  :glob:
  :maxdepth: 1
  :caption: Community

  community/*

.. toctree::
   :maxdepth: 1

@ -1,4 +1,4 @@
TorchScript
============

.. contents:: :local:
@ -6,17 +6,17 @@ Torch Script
.. automodule:: torch.jit
.. currentmodule:: torch.jit

TorchScript is a way to create serializable and optimizable models from PyTorch code.
Any code written in TorchScript can be saved from your Python
process and loaded in a process where there is no Python dependency.

We provide tools to incrementally transition a model from being a pure Python program
to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program.
This makes it possible to train models in PyTorch using familiar tools and then export
the model to a production environment where it is not a good idea to run models as Python programs,
for performance and multi-threading reasons.

Creating TorchScript Code
--------------------------

@ -117,26 +117,26 @@ Example:
 | 
				
			|||||||
            return self.resnet(input - self.means)
 | 
					            return self.resnet(input - self.means)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Torch Script Language Reference
 | 
					TorchScript Language Reference
 | 
				
			||||||
-------------------------------
 | 
					-------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Torch Script is a subset of Python that can either be written directly (using
 | 
					TorchScript is a subset of Python that can either be written directly (using
 | 
				
			||||||
the @script annotations) or generated automatically from Python code via
 | 
					the @script annotations) or generated automatically from Python code via
 | 
				
			||||||
tracing. When using tracing, code is automatically converted into this subset of
 | 
					tracing. When using tracing, code is automatically converted into this subset of
 | 
				
			||||||
Python by recording only the actual operators on tensors and simply executing and
 | 
					Python by recording only the actual operators on tensors and simply executing and
 | 
				
			||||||
discarding the other surrounding Python code.
 | 
					discarding the other surrounding Python code.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
When writing Torch Script directly using @script annotations, the programmer must
 | 
					When writing TorchScript directly using @script annotations, the programmer must
 | 
				
			||||||
only use the subset of Python supported in Torch Script. This section documents
 | 
					only use the subset of Python supported in TorchScript. This section documents
 | 
				
			||||||
what is supported in Torch Script as if it were a language reference for a stand
 | 
					what is supported in TorchScript as if it were a language reference for a stand
 | 
				
			||||||
alone language. Any features of Python not mentioned in this reference are not
 | 
					alone language. Any features of Python not mentioned in this reference are not
 | 
				
			||||||
part of Torch Script.
 | 
					part of TorchScript.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
As a subset of Python any valid Torch Script function is also a valid Python
 | 
					As a subset of Python any valid TorchScript function is also a valid Python
 | 
				
			||||||
function. This makes it possible to remove the @script annotations and debug the
 | 
					function. This makes it possible to remove the @script annotations and debug the
 | 
				
			||||||
function using standard Python tools like pdb. The reverse is not true: there
 | 
					function using standard Python tools like pdb. The reverse is not true: there
 | 
				
			||||||
are many valid python programs that are not valid Torch Script programs.
 | 
					are many valid python programs that are not valid TorchScript programs.
 | 
				
			||||||
Instead, Torch Script focuses specifically on the features of Python that are
 | 
					Instead, TorchScript focuses specifically on the features of Python that are
 | 
				
			||||||
needed to represent neural network models in Torch.
 | 
					needed to represent neural network models in Torch.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. envvar:: PYTORCH_JIT=1
 | 
					.. envvar:: PYTORCH_JIT=1
 | 
				
			||||||
@@ -150,9 +150,9 @@ needed to represent neural network models in Torch.
 Types
 ~~~~~
 
-The largest difference between Torch Script and the full Python language is that
-Torch Script only support a small set of types that are needed to express neural
-net models. In particular Torch Script supports:
+The largest difference between TorchScript and the full Python language is that
+TorchScript only support a small set of types that are needed to express neural
+net models. In particular TorchScript supports:
 
 ``Tensor``
     A PyTorch tensor of any dtype, dimension, or backend.
@@ -169,8 +169,8 @@ net models. In particular Torch Script supports:
 ``List[T]``
     A list of which all members are type ``T``
 
-Unlike Python, each variable in Torch Script function must have a single static type.
-This makes it easier to optimize Torch Script functions.
+Unlike Python, each variable in TorchScript function must have a single static type.
+This makes it easier to optimize TorchScript functions.
 
 Example::
 
@@ -183,9 +183,9 @@ Example::
         return r # Type mismatch: r is set to type Tensor in the true branch
                  # and type int in the false branch
 
-By default, all parameters to a Torch Script function are assumed to be Tensor
+By default, all parameters to a TorchScript function are assumed to be Tensor
 because this is the most common type used in modules. To specify that an
-argument to a Torch Script function is another type, it is possible to use
+argument to a TorchScript function is another type, it is possible to use
 MyPy-style type annotations using the types listed above:
 
 Example::
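As an aside, a minimal sketch of such a MyPy-style annotation (the function and argument names here are illustrative, not from this diff; the comment form is one way to attach the types)::

    import torch
    from typing import Tuple

    @torch.jit.script
    def scale_and_report(x, factor):
        # type: (torch.Tensor, int) -> Tuple[torch.Tensor, int]
        # `factor` is declared as int; without the annotation it would be assumed to be a Tensor
        return x * factor, factor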
@@ -264,7 +264,7 @@ Subscripts
   ``t[i:j, i]``
 
   .. note::
-    Torch Script currently does not support mutating tensors in place, so any
+    TorchScript currently does not support mutating tensors in place, so any
     tensor indexing can only appear on the right-hand size of an expression.
 
 Function calls
@@ -328,7 +328,7 @@ Accessing Module Parameters
 Statements
 ~~~~~~~~~~
 
-Torch Script supports the following types of statements:
+TorchScript supports the following types of statements:
 
 Simple Assignments
 
@@ -438,7 +438,7 @@ Return
 Variable Resolution
 ~~~~~~~~~~~~~~~~~~~
 
-Torch Script supports a subset of Python's variable resolution (i.e. scoping)
+TorchScript supports a subset of Python's variable resolution (i.e. scoping)
 rules. Local variables behave the same as in Python, except for the restriction
 that a variable must have the same type along all paths through a function.
 If a variable has a different type on different sides of an if statement, it
@@ -456,23 +456,23 @@ Example::
         print(y) # Error: undefined value y
 
 Non-local variables are resolved to Python values at compile time when the
-function is defined. These values are then converted into Torch Script values using
+function is defined. These values are then converted into TorchScript values using
 the rules described in `Use of Python Values`_.
 
 Use of Python Values
 ~~~~~~~~~~~~~~~~~~~~
 
-To make writing Torch Script more convenient, we allow script code to refer
+To make writing TorchScript more convenient, we allow script code to refer
 to Python values in the surrounding scope. For instance, any time there is a
-reference to ``torch``, the Torch Script compiler is actually resolving it to the
+reference to ``torch``, the TorchScript compiler is actually resolving it to the
 ``torch`` Python module when the function is declared.  These Python values are
-not a first class part of Torch Script. Instead they are desugared at compile-time
-into the primitive types that Torch Script supports. This section describes the
-rules that are used when accessing Python values in Torch Script. They depend
+not a first class part of TorchScript. Instead they are desugared at compile-time
+into the primitive types that TorchScript supports. This section describes the
+rules that are used when accessing Python values in TorchScript. They depend
 on the dynamic type of the python valued referenced.
 
 Functions
-  Torch Script can call python functions. This functionality is very useful when
+  TorchScript can call python functions. This functionality is very useful when
   incrementally converting a model into script. The model can be moved function-by-function
   to script, leaving calls to Python functions in place. This way you can incrementally
   check the correctness of the model as you go.
@@ -495,12 +495,12 @@ Functions
 
 
 Attribute Lookup On Python Modules
-    Torch Script can lookup attributes on modules. Builtin functions like ``torch.add``
-    are accessed this way. This allows Torch Script to call functions defined in
+    TorchScript can lookup attributes on modules. Builtin functions like ``torch.add``
+    are accessed this way. This allows TorchScript to call functions defined in
     other modules.
 
 Python-defined Constants
-    Torch Script also provides a way to use constants that are defined in Python.
+    TorchScript also provides a way to use constants that are defined in Python.
     These can be used to hard-code hyper-parameters into the function, or to
     define universal constants. There are two ways of specifying that a Python
     value should be treated as a constant.
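One of those ways, listing members in the ``__constants__`` attribute of a ``ScriptModule``, might look like the following sketch (the module and attribute names are illustrative, not from this diff)::

    import torch

    class Scaler(torch.jit.ScriptModule):
        __constants__ = ['alpha']          # bake self.alpha into the compiled graph as a constant

        def __init__(self):
            super(Scaler, self).__init__()
            self.alpha = 2.0               # an ordinary Python float

        @torch.jit.script_method
        def forward(self, x):
            return x * self.alpha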
@@ -597,36 +597,35 @@ Interpreting Graphs
 
     The example script above produces the graph::
 
-        graph(%len : int) {
-          %13 : float = prim::Constant[value=1]()
-          %10 : int = prim::Constant[value=10]()
-          %2 : int = prim::Constant[value=4]()
-          %1 : int = prim::Constant[value=3]()
-          %3 : int[] = prim::ListConstruct(%1, %2)
-          %4 : int = prim::Constant[value=6]()
-          %5 : int = prim::Constant[value=0]()
-          %6 : int[] = prim::Constant[value=[0, -1]]()
-          %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)
-          %8 : int = prim::Constant[value=1]()
-          %rv : Dynamic = prim::Loop(%len, %8, %rv.1)
-            block0(%i : int, %12 : Dynamic) {
-              %11 : int = aten::lt(%i, %10)
-              %rv.4 : Dynamic = prim::If(%11)
-                block0() {
-                  %14 : int = prim::Constant[value=1]()
-                  %rv.2 : Dynamic = aten::sub(%12, %13, %14)
-                  -> (%rv.2)
-                }
-                block1() {
-                  %16 : int = prim::Constant[value=1]()
-                  %rv.3 : Dynamic = aten::add(%12, %13, %16)
-                  -> (%rv.3)
-                }
-              %19 : int = prim::Constant[value=1]()
-              -> (%19, %rv.4)
-            }
-          return (%rv);
-        }
+        graph(%len : int) {
+          %15 : int = prim::Constant[value=1]()
+          %9 : bool = prim::Constant[value=1]()
+          %7 : Device = prim::Constant[value="cpu"]()
+          %6 : int = prim::Constant[value=0]()
+          %5 : int = prim::Constant[value=6]()
+          %1 : int = prim::Constant[value=3]()
+          %2 : int = prim::Constant[value=4]()
+          %11 : int = prim::Constant[value=10]()
+          %14 : float = prim::Constant[value=1]()
+          %4 : int[] = prim::ListConstruct(%1, %2)
+          %rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
+          %rv : Tensor = prim::Loop(%len, %9, %rv.1)
+            block0(%i : int, %13 : Tensor) {
+              %12 : bool = aten::lt(%i, %11)
+              %rv.4 : Tensor = prim::If(%12)
+                block0() {
+                  %rv.2 : Tensor = aten::sub(%13, %14, %15)
+                  -> (%rv.2)
+                }
+                block1() {
+                  %rv.3 : Tensor = aten::add(%13, %14, %15)
+                  -> (%rv.3)
+                }
+              -> (%9, %rv.4)
+            }
+          return (%rv);
+        }
 
     Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
     example. ``%rv.1 : Dynamic`` means we assign the output to a (unique)
@@ -676,34 +675,39 @@ Automatic Trace Checking
         traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
 
     Gives us the following diagnostic information::
 
-        ERROR: Graphs differed across invocations!
-        Graph diff:
-            graph(%0 : Dynamic) {
-                  %1 : int = prim::Constant[value=0]()
-                  %2 : int = prim::Constant[value=0]()
-                  %3 : Dynamic = aten::select(%0, %1, %2)
-                  %4 : int = prim::Constant[value=0]()
-                  %5 : int = prim::Constant[value=0]()
-                  %6 : Dynamic = aten::select(%0, %4, %5)
-                  %7 : Dynamic = aten::mul(%3, %6)
-                  %8 : int = prim::Constant[value=0]()
-                  %9 : int = prim::Constant[value=1]()
-                  %10 : Dynamic = aten::select(%0, %8, %9)
-                  %11 : Dynamic = aten::mul(%7, %10)
-                  %12 : int = prim::Constant[value=0]()
-                  %13 : int = prim::Constant[value=2]()
-                  %14 : Dynamic = aten::select(%0, %12, %13)
-                  %15 : Dynamic = aten::mul(%11, %14)
-              +   %16 : int = prim::Constant[value=0]()
-              +   %17 : int = prim::Constant[value=3]()
-              +   %18 : Dynamic = aten::select(%0, %16, %17)
-              +   %19 : Dynamic = aten::mul(%15, %18)
-              -   return (%15);
-              ?             ^
-              +   return (%19);
-              ?             ^
-            }
+        ERROR: Graphs differed across invocations!
+        Graph diff::
+
+              graph(%x : Tensor) {
+                %1 : int = prim::Constant[value=0]()
+                %2 : int = prim::Constant[value=0]()
+                %result.1 : Tensor = aten::select(%x, %1, %2)
+                %4 : int = prim::Constant[value=0]()
+                %5 : int = prim::Constant[value=0]()
+                %6 : Tensor = aten::select(%x, %4, %5)
+                %result.2 : Tensor = aten::mul(%result.1, %6)
+                %8 : int = prim::Constant[value=0]()
+                %9 : int = prim::Constant[value=1]()
+                %10 : Tensor = aten::select(%x, %8, %9)
+            -   %result : Tensor = aten::mul(%result.2, %10)
+            +   %result.3 : Tensor = aten::mul(%result.2, %10)
+            ?          ++
+                %12 : int = prim::Constant[value=0]()
+                %13 : int = prim::Constant[value=2]()
+                %14 : Tensor = aten::select(%x, %12, %13)
+            +   %result : Tensor = aten::mul(%result.3, %14)
+            +   %16 : int = prim::Constant[value=0]()
+            +   %17 : int = prim::Constant[value=3]()
+            +   %18 : Tensor = aten::select(%x, %16, %17)
+            -   %15 : Tensor = aten::mul(%result, %14)
+            ?     ^                                 ^
+            +   %19 : Tensor = aten::mul(%result, %18)
+            ?     ^                                 ^
+            -   return (%15);
+            ?             ^
+            +   return (%19);
+            ?             ^
+              }
 
 
     This message indicates to us that the computation differed between when
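For reference, ``check_inputs`` is simply a list of extra argument tuples that the tracer re-runs and compares against the original graph; a minimal sketch of how it is passed (the function body and shapes are illustrative, not from this diff)::

    import torch

    def loop_fn(x):
        result = x[0]
        for i in range(x.size(0)):
            result = result * x[i]
        return result

    inputs = (torch.rand(3, 4, 4),)
    check_inputs = [(torch.rand(4, 5, 5),), (torch.rand(2, 3, 3),)]
    # With a data-dependent loop like this, the checker reports that the traced
    # graphs differ across invocations, as in the diagnostic output above.
    traced = torch.jit.trace(loop_fn, inputs, check_inputs=check_inputs)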
@@ -733,23 +737,19 @@ Automatic Trace Checking
 
     Which produces::
 
-        graph(%x : Dynamic) {
-          %1 : int = prim::Constant[value=0]()
-          %2 : int = prim::Constant[value=0]()
-          %result.1 : Dynamic = aten::select(%x, %2, %1)
-          %4 : int = aten::size(%x, %1)
-          %5 : int = prim::Constant[value=1]()
-          %result : Dynamic = prim::Loop(%4, %5, %result.1)
-            block0(%i : int, %7 : Dynamic) {
-              %9 : int = prim::Constant[value=0]()
-              %10 : Dynamic = aten::select(%x, %9, %i)
-              %result.2 : Dynamic = aten::mul(%7, %10)
-              %12 : int = prim::Constant[value=1]()
-              -> (%12, %result.2)
-            }
-          return (%result);
-        }
+        graph(%x : Tensor) {
+          %5 : bool = prim::Constant[value=1]()
+          %1 : int = prim::Constant[value=0]()
+          %result.1 : Tensor = aten::select(%x, %1, %1)
+          %4 : int = aten::size(%x, %1)
+          %result : Tensor = prim::Loop(%4, %5, %result.1)
+            block0(%i : int, %7 : Tensor) {
+              %10 : Tensor = aten::select(%x, %1, %i)
+              %result.2 : Tensor = aten::mul(%7, %10)
+              -> (%5, %result.2)
+            }
+          return (%result);
+        }
 
 Tracer Warnings
     The tracer produces warnings for several problematic patterns in traced
@@ -789,14 +789,24 @@ Tracer Warnings
 Builtin Functions
 ~~~~~~~~~~~~~~~~~
 
-Torch Script supports a subset of the builtin tensor and neural network functions that
-PyTorch provides. Most methods on Tensor as well as functions in the ``torch``
-namespace are available. Many functions in ``torch.nn.functional`` are also availiable.
+Torch Script supports a subset of the builtin tensor and neural network
+functions that PyTorch provides. Most methods on Tensor as well as functions in
+the ``torch`` namespace, all functions in ``torch.nn.functional`` and all
+modules from ``torch.nn`` are supported in Torch Script, excluding those in the
+table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`.
 
+Unsupported ``torch.nn`` Modules  ::
+
+    torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss
+    torch.nn.modules.normalization.CrossMapLRN2d
+    torch.nn.modules.fold.Fold
+    torch.nn.modules.fold.Unfold
+    torch.nn.modules.rnn.GRU
+    torch.nn.modules.rnn.LSTM
+    torch.nn.modules.rnn.RNN
+    torch.nn.modules.rnn.GRUCell
+    torch.nn.modules.rnn.LSTMCell
+    torch.nn.modules.rnn.RNNCell
+
-We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or
-``Conv`` module. This functionality is something that will be developed in the future.
-For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn``
-modules into ScriptModules on construction.
 
 .. automodule:: torch.jit.supported_ops
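Following the suggestion in the new text, a module from the unsupported list (for example ``torch.nn.LSTM``) can still be captured by tracing; a sketch with illustrative sizes, not taken from this diff::

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=10, hidden_size=20)
    example = torch.randn(5, 3, 10)              # (seq_len, batch, input_size)
    traced_lstm = torch.jit.trace(lstm, (example,))
    output, (h_n, c_n) = traced_lstm(example)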
@@ -338,6 +338,7 @@ view of a storage and defines numeric operations on it.
    .. automethod:: reshape_as
    .. automethod:: resize_
    .. automethod:: resize_as_
+   .. automethod:: roll
    .. automethod:: round
    .. automethod:: round_
    .. automethod:: rsqrt
@@ -269,6 +269,7 @@ Other Operations
 .. autofunction:: histc
 .. autofunction:: meshgrid
 .. autofunction:: renorm
+.. autofunction:: roll
 .. autofunction:: tensordot
 .. autofunction:: trace
 .. autofunction:: tril
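The newly documented ``roll`` shifts tensor elements along a dimension with wrap-around; a quick illustration::

    import torch

    x = torch.arange(6).view(2, 3)    # [[0, 1, 2], [3, 4, 5]]
    torch.roll(x, shifts=1, dims=1)   # [[2, 0, 1], [5, 3, 4]]
    x.roll(1, 0)                      # [[3, 4, 5], [0, 1, 2]]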
@@ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
 
 if (BUILD_CAFFE2_OPS)
-  #cmake only check for separate OpenMP library on AppleClang 7+
-  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
-  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
-        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
-      Set(OpenMP_link ${OpenMP_libomp_LIBRARY})
-    endif()
-  endif()
-
   # Note(ilijar): Since Detectron ops currently have no
   # CPU implementation, we only build GPU ops for now.
   if (USE_CUDA)
@@ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS)
         ${Detectron_CPU_SRCS}
         ${Detectron_GPU_SRCS})
 
-    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu)
     install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
   elseif(NOT IOS_PLATFORM)
     add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
-    target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops caffe2)
     install(TARGETS caffe2_detectron_ops DESTINATION lib)
   endif()
 endif()
setup.py (166 changed lines)

@@ -124,6 +124,7 @@
 #   LD_LIBRARY_PATH
 #     we will search for libraries in these paths
 
+from __future__ import print_function
 from setuptools import setup, Extension, distutils, Command, find_packages
 import setuptools.command.build_ext
 import setuptools.command.install
@@ -144,86 +145,32 @@ import json
 import glob
 import importlib
 
-from tools.setup_helpers.env import check_env_flag, check_negative_env_flag
-
-
-def hotpatch_var(var, prefix='USE_'):
-    if check_env_flag('NO_' + var):
-        os.environ[prefix + var] = '0'
-    elif check_negative_env_flag('NO_' + var):
-        os.environ[prefix + var] = '1'
-    elif check_env_flag('WITH_' + var):
-        os.environ[prefix + var] = '1'
-    elif check_negative_env_flag('WITH_' + var):
-        os.environ[prefix + var] = '0'
-
-# Before we run the setup_helpers, let's look for NO_* and WITH_*
-# variables and hotpatch environment with the USE_* equivalent
-use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED',
-                'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS']
-list(map(hotpatch_var, use_env_vars))
-
-# Also hotpatch a few with BUILD_* equivalent
-build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS']
-[hotpatch_var(v, 'BUILD_') for v in build_env_vars]
-
-from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION
-from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST,
-                                       BUILD_CAFFE2_OPS, USE_LEVELDB,
-                                       USE_LMDB, USE_OPENCV, USE_FFMPEG)
-from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION
-from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY,
-                                       CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
-from tools.setup_helpers.fbgemm import USE_FBGEMM
-from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY,
-                                        MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
-from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
-    NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
-from tools.setup_helpers.nnpack import USE_NNPACK
-from tools.setup_helpers.qnnpack import USE_QNNPACK
-from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
+# If you want to modify flags or environmental variables that is set when
+# building torch, you should do it in tools/setup_helpers/configure.py.
+# Please don't add it here unless it's only used in PyTorch.
+from tools.setup_helpers.configure import *
 from tools.setup_helpers.generate_code import generate_code
 from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext
-from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \
-    USE_GLOO_IBVERBS
 
 ################################################################################
 # Parameters parsed from environment
 ################################################################################
 
-DEBUG = check_env_flag('DEBUG')
-REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO')
-IS_WINDOWS = (platform.system() == 'Windows')
-IS_DARWIN = (platform.system() == 'Darwin')
-IS_LINUX = (platform.system() == 'Linux')
-IS_PPC = (platform.machine() == 'ppc64le')
-IS_ARM = (platform.machine() == 'aarch64')
-
-BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
-# ppc64le and aarch64 do not support MKLDNN
-if IS_PPC or IS_ARM:
-    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF')
+VERBOSE_SCRIPT = True
+# see if the user passed a quiet flag to setup.py arguments and respect
+# that in our parts of the build
+for arg in sys.argv:
+    if arg == "--":
+        break
+    if arg == '-q' or arg == '--quiet':
+        VERBOSE_SCRIPT = False
+
+if VERBOSE_SCRIPT:
+    def report(*args):
+        print(*args)
 else:
-    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON')
-
-USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
-RERUN_CMAKE = True
-
-NUM_JOBS = multiprocessing.cpu_count()
-max_jobs = os.getenv("MAX_JOBS")
-if max_jobs is not None:
-    NUM_JOBS = min(NUM_JOBS, int(max_jobs))
-
-ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE")
-if not ONNX_NAMESPACE:
-    ONNX_NAMESPACE = "onnx_torch"
-
-# Ninja
-try:
-    import ninja
-    USE_NINJA = True
-except ImportError:
-    USE_NINJA = False
+    def report(*args):
+        pass
 
 # Constant known variables used throughout this file
 cwd = os.path.dirname(os.path.abspath(__file__))
@@ -323,8 +270,9 @@ def build_libs(libs):
         build_libs_cmd = ['tools\\build_pytorch_libs.bat']
     else:
         build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')]
-    my_env = os.environ.copy()
-    my_env["PYTORCH_PYTHON"] = sys.executable
+
+    my_env, extra_flags = get_pytorch_env_with_flags()
+    build_libs_cmd.extend(extra_flags)
     my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library
     my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir
     my_env["PYTORCH_BUILD_VERSION"] = version
@@ -334,64 +282,8 @@ def build_libs(libs):
         cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path
     my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path
 
-    my_env["NUM_JOBS"] = str(NUM_JOBS)
-    my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE
-    if not IS_WINDOWS:
-        if USE_NINJA:
-            my_env["CMAKE_GENERATOR"] = '-GNinja'
-            my_env["CMAKE_INSTALL"] = 'ninja install'
-        else:
-            my_env['CMAKE_GENERATOR'] = ''
-            my_env['CMAKE_INSTALL'] = 'make install'
-    if USE_SYSTEM_NCCL:
-        my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
-        my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
-        my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
-    if USE_CUDA:
-        my_env["CUDA_BIN_PATH"] = CUDA_HOME
-        build_libs_cmd += ['--use-cuda']
-        if IS_WINDOWS:
-            my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME
-    if USE_CUDA_STATIC_LINK:
-        build_libs_cmd += ['--cuda-static-link']
-    if USE_FBGEMM:
-        build_libs_cmd += ['--use-fbgemm']
-    if USE_ROCM:
-        build_libs_cmd += ['--use-rocm']
-    if USE_NNPACK:
-        build_libs_cmd += ['--use-nnpack']
-    if USE_NUMPY:
-        my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR
-    if USE_CUDNN:
-        my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
-        my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
-        my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
-    if USE_MIOPEN:
-        my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR
-        my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY
-        my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR
-    if USE_MKLDNN:
-        build_libs_cmd += ['--use-mkldnn']
-    if USE_QNNPACK:
-        build_libs_cmd += ['--use-qnnpack']
-    if USE_GLOO_IBVERBS:
-        build_libs_cmd += ['--use-gloo-ibverbs']
-    if not RERUN_CMAKE:
-        build_libs_cmd += ['--dont-rerun-cmake']
-
-    my_env["BUILD_TORCH"] = "ON"
-    my_env["BUILD_PYTHON"] = "ON"
-    my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF"
-    my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF"
-    my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF"
-    my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF"
-    my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF"
-    my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF"
-    my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
-    my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
-    my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
-    my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"
+    if VERBOSE_SCRIPT:
+        my_env['VERBOSE_SCRIPT'] = '1'
 
     try:
         os.mkdir('build')
     except OSError:
@@ -660,6 +552,16 @@ class build_ext(build_ext_parent):
         return outputs
 
 
+# this is a subclass of build just to get access to self.build_lib
+# as there does not seem to be an utility function getting this
+class create_pyi(distutils.command.build.build):
+    def run(self):
+        print("-- Building .pyi --")
+        if sys.version_info[0] == 3:
+            from tools.pyi.gen_pyi import gen_pyi
+            gen_pyi(self.build_lib)
+
+
 class build(distutils.command.build.build):
     sub_commands = [
         ('build_deps', lambda self: True),
@@ -914,6 +816,7 @@ if USE_CUDA:
 
 cmdclass = {
     'create_version_file': create_version_file,
+    'create_pyi': create_pyi,
     'build': build,
     'build_py': build_py,
     'build_ext': build_ext,
@@ -946,6 +849,7 @@ if __name__ == '__main__':
         entry_points=entry_points,
         package_data={
             'torch': [
+                '__init__.pyi',
                 'lib/*.so*',
                 'lib/*.dylib*',
                 'lib/*.dll',
@@ -458,6 +458,10 @@ method_tests = [
      NO_ARGS, [skipIfNoLapack]),
     ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2",
      NO_ARGS, [skipIfNoLapack]),
+    ('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"),
+    ('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"),
+    ('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"),
+    ('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"),
     ('addcmul', (S, S), ((S, S), (S, S))),
     ('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'),
     ('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'),
@@ -560,8 +564,14 @@ method_tests = [
     ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
     ('tril', (M, M), NO_ARGS),
     ('tril', (M, M), (2,), 'idx'),
+    ('tril', (S, M, M), NO_ARGS, 'batched'),
+    ('tril', (S, M, M), (2,), 'batched_idx'),
+    ('tril', (3, 3, S, S), NO_ARGS, 'more_batched'),
     ('triu', (M, M), NO_ARGS),
     ('triu', (M, M), (2,), 'idx'),
+    ('triu', (S, M, M), NO_ARGS, 'batched'),
+    ('triu', (S, M, M), (2,), 'batched_idx'),
+    ('triu', (3, 3, S, S), NO_ARGS, 'more_batched'),
     ('trace', (M, M), NO_ARGS),
     ('cross', (S, 3), ((S, 3),)),
     ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'),
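The new ``batched`` variants exercise ``tril``/``triu`` on inputs with leading batch dimensions, where the triangle is taken from each trailing matrix independently; a quick illustration (shapes are arbitrary, not from this diff)::

    import torch

    x = torch.randn(4, 3, 3)            # a batch of four 3x3 matrices
    lower = torch.tril(x)               # lower triangle of each matrix
    upper = torch.triu(x, diagonal=1)   # strict upper triangle of each matrix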
@@ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs):
         return torch.stack(all_matrices).reshape(*(batches + (l, l)))
 
 
+def brute_pdist(inp, p=2):
+    """Computes the same as torch.pdist using primitives"""
+    n = inp.shape[-2]
+    k = n * (n - 1) // 2
+    if k == 0:
+        # torch complains about empty indices
+        return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device)
+    square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1)
+    unroll = square.view(square.shape[:-2] + (n * n,))
+    inds = torch.ones(k, dtype=torch.int)
+    inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int)
+    return unroll[..., inds.cumsum(0)]
+
+
 def do_test_dtypes(self, dtypes, layout, device):
     for dtype in dtypes:
         if dtype != torch.float16:
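For reference, ``torch.pdist`` (which ``brute_pdist`` above re-implements from primitives for testing) returns the condensed vector of pairwise distances; a small illustrative call::

    import torch

    pts = torch.randn(5, 3)        # five points in 3-D
    d = torch.pdist(pts, p=2)      # shape (10,): one distance per pair i < j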
@@ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) {
   ASSERT_EQ(batch[1].target, "2");
 }
 
+struct DummyTensorDataset
+    : datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> {
+  Example<torch::Tensor, int> get(size_t index) override {
+    const auto channels = static_cast<int64_t>(index);
+    torch::Tensor tensor =
+        (channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4});
+    return {tensor, static_cast<int>(channels)};
+  }
+
+  torch::optional<size_t> size() const override {
+    return 100;
+  }
+};
+
+TEST(DataTest, NormalizeTransform) {
+  auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1));
+
+  // Works for zero (one implicit) channels
+  std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0);
+  ASSERT_EQ(output.size(), 1);
+  // (1 - 0.5) / 0.1 = 5
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5))
+      << output[0].data;
+
+  // Works for one explicit channel
+  output = dataset.get_batch(1);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 1);
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+
+  // Works for two channels with different moments
+  dataset = DummyTensorDataset().map(
+      transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2}));
+  output = dataset.get_batch(2);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 2);
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * -2.5))
+      << output[0].data;
+
+  // Works for three channels with one moment value
+  dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2));
+  output = dataset.get_batch(3);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 3);
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5))
+      << output[0].data;
+
+  // Works for three channels with different moments
+  dataset = DummyTensorDataset().map(
+      transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2}));
+  output = dataset.get_batch(3);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 3);
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/1, /*end=*/2)
+                  .allclose(torch::ones({1, 4, 4}) * -2.5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/2)
+                  .allclose(torch::ones({1, 4, 4}) * 12.5))
+      << output[0].data;
+}
+
 struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> {
   UnCopyableDataset() = default;
 
@@ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) {
 TEST_F(ModuleTest, ZeroGrad) {
   Linear module(3, 4);
   auto weight = torch::ones({8, 3}, torch::requires_grad());
-  auto loss = module->forward(weight).sum();
+  auto loss = module(weight).sum();
   loss.backward();
   for (auto& parameter : module->parameters()) {
     auto grad = parameter.grad();
@@ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) {
     ASSERT_NO_THROW(module->modules());
   }
 }
+
+struct ModuleWithNonTensorForwardImpl : torch::nn::Module {
+  int64_t forward(torch::Tensor x) {
+    return x.numel();
+  }
+};
+TORCH_MODULE(ModuleWithNonTensorForward);
+
+TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) {
+  ModuleWithNonTensorForward m;
+  ASSERT_EQ(m(torch::ones(123)), 123);
+}
			|||||||
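For context, TORCH_MODULE(ModuleWithNonTensorForward) generates a holder whose call operator perfectly forwards its arguments to the Impl's forward(), which is why the return type is not restricted to Tensor. A rough hand-rolled sketch of that mechanism, simplified and illustrative only (the real macro expands to a class derived from torch::nn::ModuleHolder, not this type):

#include <memory>
#include <utility>

// Simplified stand-in for a generated holder; the name and layout are
// illustrative, not the actual macro expansion.
template <typename Impl>
struct SimpleHolder {
  std::shared_ptr<Impl> impl = std::make_shared<Impl>();

  Impl* operator->() { return impl.get(); }

  // The call operator forwards to Impl::forward and preserves its return
  // type, so forward() may return int64_t, std::string, and so on.
  template <typename... Args>
  auto operator()(Args&&... args)
      -> decltype(std::declval<Impl&>().forward(std::forward<Args>(args)...)) {
    return impl->forward(std::forward<Args>(args)...);
  }
};
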
@@ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {};
 TEST_F(ModulesTest, Conv1d) {
   Conv1d model(Conv1dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) {
 TEST_F(ModulesTest, Conv2dEven) {
   Conv2d model(Conv2dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) {
 TEST_F(ModulesTest, Conv2dUneven) {
   Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2}));
   auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) {
 TEST_F(ModulesTest, Conv3d) {
   Conv3d model(Conv3dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) {
 TEST_F(ModulesTest, Linear) {
   Linear model(5, 2);
   auto x = torch::randn({10, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) {
   auto l3 = model->add(Linear(5, 100), "l3");

   auto x = torch::randn({1000, 10}, torch::requires_grad());
-  x = l1->forward(x).clamp_min(0);
-  x = l2->forward(x).clamp_min(0);
-  x = l3->forward(x).clamp_min(0);
+  x = l1(x).clamp_min(0);
+  x = l2(x).clamp_min(0);
+  x = l3(x).clamp_min(0);

   x.backward();
   ASSERT_EQ(x.ndimension(), 2);

@@ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
   // Cannot get gradients to change indices (input) - only for embedding
   // params
   auto x = torch::full({10}, dict_size - 1, torch::kInt64);
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
 TEST_F(ModulesTest, EmbeddingList) {
   Embedding model(6, 4);
   auto x = torch::full({2, 3}, 5, torch::kInt64);
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) {
 TEST_F(ModulesTest, Dropout) {
   Dropout dropout(0.5);
   torch::Tensor x = torch::ones(100, torch::requires_grad());
-  torch::Tensor y = dropout->forward(x);
+  torch::Tensor y = dropout(x);

   y.backward();
   ASSERT_EQ(y.ndimension(), 1);

@@ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) {
   ASSERT_GT(y.sum().item<float>(), 70); // Probably

   dropout->eval();
-  y = dropout->forward(x);
+  y = dropout(x);
   ASSERT_EQ(y.sum().item<float>(), 100);
 }


@@ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) {
     was_called = true;
     return input;
   });
-  auto output = functional->forward(torch::ones(5, torch::requires_grad()));
+  auto output = functional(torch::ones(5, torch::requires_grad()));
   ASSERT_TRUE(was_called);
   ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad())));


@@ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) {
   ASSERT_FALSE(bn->bias.defined());

   ASSERT_THROWS_WITH(
-      bn->forward(torch::ones({2, 5})),
+      bn(torch::ones({2, 5})),
       "Calling BatchNorm::forward is only permitted "
       "when the 'stateful' option is true (was false). "
       "Use BatchNorm::pure_forward instead.");

@@ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) {
   model->to(torch::kCUDA);
   auto x =
       torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true));
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) {
   model->to(torch::kCUDA);
   model->to(torch::kCPU);
   auto x = torch::randn({10, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -215,7 +215,9 @@ TEST(SerializeTest, Optim) {
 TEST(SerializeTest, XOR_CUDA) {
   torch::manual_seed(0);
   // We better be able to save and load a XOR model!
-  auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) {
+  auto getLoss = [](Sequential model,
+                    uint32_t batch_size,
+                    bool is_cuda = false) {
     auto inputs = torch::empty({batch_size, 2});
     auto labels = torch::empty({batch_size});
     if (is_cuda) {

@@ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) {
   loss = getLoss(model3, 100, true);
   ASSERT_LT(loss.item<float>(), 0.1);
 }
+
+TEST(
+    SerializeTest,
+    CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) {
+  struct C : torch::nn::Module {
+    C() {
+      register_buffer("foo", torch::ones(5, torch::kInt32));
+    }
+  };
+  struct B : torch::nn::Module {};
+  struct A : torch::nn::Module {
+    A() {
+      register_module("b", std::make_shared<B>());
+      register_module("c", std::make_shared<C>());
+    }
+  };
+  struct M : torch::nn::Module {
+    M() {
+      register_module("a", std::make_shared<A>());
+    }
+  };
+
+  auto out = std::make_shared<M>();
+  std::stringstream ss;
+  torch::save(out, ss);
+  auto in = std::make_shared<M>();
+  torch::load(in, ss);
+
+  const int output = in->named_buffers()["a.c.foo"].sum().item<int>();
+  ASSERT_EQ(output, 5);
+}

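The new test round-trips a module through a std::stringstream and then reads a buffer back by its registered name. A minimal usage sketch of the same torch::save / torch::load calls, sticking to the stream form used in the diff; the NetImpl module below is invented for illustration:

#include <torch/torch.h>
#include <iostream>
#include <memory>
#include <sstream>

// Hypothetical module used only for this sketch.
struct NetImpl : torch::nn::Module {
  NetImpl() {
    register_buffer("foo", torch::ones(5, torch::kInt32));
  }
};

int main() {
  auto out = std::make_shared<NetImpl>();

  // Serialize to an in-memory stream, then restore into a fresh instance.
  std::stringstream stream;
  torch::save(out, stream);

  auto in = std::make_shared<NetImpl>();
  torch::load(in, stream);

  // The restored buffer is reachable under its registered name.
  std::cout << in->named_buffers()["foo"].sum().item<int>() << std::endl;  // 5
  return 0;
}
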
@@ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) {
   ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>());
 }

+struct A : torch::nn::Module {
+  int forward() {
+    return 5;
+  }
+};
+
+struct B : torch::nn::Module {
+  std::string forward(torch::Tensor tensor) {
+    return "";
+  }
+};
+
+struct C : torch::nn::Module {
+  float forward(torch::Tensor& tensor) {
+    return 5.0;
+  }
+};
+
+struct D : torch::nn::Module {
+  char forward(torch::Tensor&& tensor) {
+    return 'x';
+  }
+};
+
+struct E : torch::nn::Module {};
+
+// Put in a function because macros don't handle the comma between arguments to
+// is_same well ...
+template <typename Module, typename ExpectedType, typename... Args>
+void assert_has_expected_type() {
+  using ReturnType =
+      typename torch::detail::return_type_of_forward<Module, Args...>::type;
+  constexpr bool is_expected_type =
+      std::is_same<ReturnType, ExpectedType>::value;
+  ASSERT_TRUE(is_expected_type) << Module().name();
+}
+
+TEST(TestStatic, ReturnTypeOfForward) {
+  assert_has_expected_type<A, int>();
+  assert_has_expected_type<B, std::string, torch::Tensor>();
+  assert_has_expected_type<C, float, torch::Tensor&>();
+  assert_has_expected_type<D, char, torch::Tensor&&>();
+  assert_has_expected_type<E, void>();
+}
+
 TEST(TestStatic, Apply) {
   std::vector<int> v;
   torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5);

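return_type_of_forward is the trait these new static tests exercise; the same check can also be phrased as a compile-time assertion in user code. A small sketch assuming only the torch::detail::return_type_of_forward usage shown above; the CounterImpl module is invented for illustration:

#include <torch/torch.h>
#include <type_traits>

// Hypothetical module whose forward() returns a non-Tensor type.
struct CounterImpl : torch::nn::Module {
  int64_t forward(torch::Tensor x) {
    return x.numel();
  }
};

// Compile-time check, using the same trait the test above exercises, that
// forward(Tensor) is deduced to return int64_t.
static_assert(
    std::is_same<
        torch::detail::return_type_of_forward<CounterImpl, torch::Tensor>::type,
        int64_t>::value,
    "CounterImpl::forward(Tensor) should be deduced to return int64_t");
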
@@ -10,12 +10,13 @@ graph(%x.1_data : Tensor
   %x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims)
     block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) {
       %16 : Long() = prim::NumToTensor(%6)
-      %alpha : float = prim::TensorToNum(%16)
+      %alpha : float = prim::Float(%16)
       %data.1 : Tensor = aten::add(%5_data, %y_data, %alpha)
       %mask : Tensor = aten::mul(%5_mask, %y_mask)
       %dims : Tensor = aten::__or__(%5_dims, %y_dims)
       %data : Tensor = aten::where(%mask, %data.1, %5_data)
       -> (%7, %data, %mask, %dims)
     }
-  return (%x, %10, %11);
+  %22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11)
+  return (%22);
 }

@@ -7,33 +7,31 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : Tensor = aten::gt(%a.1_data, %b_data)
   %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
-  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %10 : bool = prim::TensorToBool(%7)
-  %11 : Long() = prim::NumToTensor(%6)
-  %alpha.1 : float = prim::TensorToNum(%11)
+  %9 : Long() = prim::NumToTensor(%6)
+  %alpha.1 : float = prim::Float(%9)
   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %16 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%16)
+  %14 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%14)
   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %21 : bool = prim::Constant[value=1]()
-  %22 : int = prim::Constant[value=1]()
-  %23 : Tensor = aten::type_as(%8, %7)
-  %data.2 : Tensor = aten::mul(%7, %23)
-  %25 : int = aten::dim(%data.2)
-  %26 : bool = aten::eq(%25, %22)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
+  %19 : bool = prim::Constant[value=1]()
+  %20 : int = prim::Constant[value=1]()
+  %21 : Tensor = aten::type_as(%8, %7)
+  %data.2 : Tensor = aten::mul(%7, %21)
+  %23 : int = aten::dim(%data.2)
+  %24 : bool = aten::eq(%23, %20)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%24)
     block0() {
-      %29 : int = aten::dim(%data.1)
-      %30 : int = aten::sub(%29, %22)
-      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
-        block0(%32 : int, %33 : Tensor) {
-          %34 : int = aten::dim(%33)
-          %data.3 : Tensor = aten::unsqueeze(%33, %34)
-          -> (%21, %data.3)
+      %27 : int = aten::dim(%data.1)
+      %28 : int = aten::sub(%27, %20)
+      %data.4 : Tensor = prim::Loop(%28, %19, %data.2)
+        block0(%30 : int, %31 : Tensor) {
+          %32 : int = aten::dim(%31)
+          %data.3 : Tensor = aten::unsqueeze(%31, %32)
+          -> (%19, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)

@@ -45,5 +43,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
   %res_dims : Tensor = aten::__or__(%dims.1, %dims)
-  return (%res_data, %res_mask, %res_dims);
+  %39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%39);
 }

@@ -7,34 +7,33 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : float = prim::Constant[value=0.1]()
   %8 : Float() = prim::NumToTensor(%7)
-  %other : float = prim::TensorToNum(%8)
+  %other : float = prim::Float(%8)
   %10 : Tensor = aten::gt(%a.1_data, %other)
-  %11 : bool = prim::TensorToBool(%10)
-  %12 : Long() = prim::NumToTensor(%6)
-  %alpha.1 : float = prim::TensorToNum(%12)
+  %11 : Long() = prim::NumToTensor(%6)
+  %alpha.1 : float = prim::Float(%11)
   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %17 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%17)
+  %16 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%16)
   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %22 : bool = prim::Constant[value=1]()
-  %23 : int = prim::Constant[value=1]()
-  %24 : Tensor = aten::type_as(%a.1_mask, %10)
-  %data.2 : Tensor = aten::mul(%10, %24)
-  %26 : int = aten::dim(%data.2)
-  %27 : bool = aten::eq(%26, %23)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%27)
+  %21 : bool = prim::Constant[value=1]()
+  %22 : int = prim::Constant[value=1]()
+  %23 : Tensor = aten::type_as(%a.1_mask, %10)
+  %data.2 : Tensor = aten::mul(%10, %23)
+  %25 : int = aten::dim(%data.2)
+  %26 : bool = aten::eq(%25, %22)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
    block0() {
-      %30 : int = aten::dim(%data.1)
-      %31 : int = aten::sub(%30, %23)
-      %data.4 : Tensor = prim::Loop(%31, %22, %data.2)
-        block0(%33 : int, %34 : Tensor) {
-          %35 : int = aten::dim(%34)
-          %data.3 : Tensor = aten::unsqueeze(%34, %35)
-          -> (%22, %data.3)
+      %29 : int = aten::dim(%data.1)
+      %30 : int = aten::sub(%29, %22)
+      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
+        block0(%32 : int, %33 : Tensor) {
+          %34 : int = aten::dim(%33)
+          %data.3 : Tensor = aten::unsqueeze(%33, %34)
+          -> (%21, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)

@@ -46,5 +45,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
   %res_dims : Tensor = aten::__or__(%dims.1, %dims)
-  return (%res_data, %res_mask, %res_dims);
+  %41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%41);
 }

@@ -7,28 +7,26 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : Tensor = aten::gt(%a.1_data, %b_data)
   %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
-  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %10 : bool = prim::TensorToBool(%7)
-  %11 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%11)
+  %9 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%9)
   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %16 : bool = prim::Constant[value=1]()
-  %17 : int = prim::Constant[value=1]()
-  %18 : Tensor = aten::type_as(%8, %7)
-  %data.2 : Tensor = aten::mul(%7, %18)
-  %20 : int = aten::dim(%data.2)
-  %21 : bool = aten::eq(%20, %17)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
+  %14 : bool = prim::Constant[value=1]()
+  %15 : int = prim::Constant[value=1]()
+  %16 : Tensor = aten::type_as(%8, %7)
+  %data.2 : Tensor = aten::mul(%7, %16)
+  %18 : int = aten::dim(%data.2)
+  %19 : bool = aten::eq(%18, %15)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%19)
    block0() {
-      %24 : int = aten::dim(%data)
-      %25 : int = aten::sub(%24, %17)
-      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
-        block0(%27 : int, %28 : Tensor) {
-          %29 : int = aten::dim(%28)
-          %data.3 : Tensor = aten::unsqueeze(%28, %29)
-          -> (%16, %data.3)
+      %22 : int = aten::dim(%data)
+      %23 : int = aten::sub(%22, %15)
+      %data.4 : Tensor = prim::Loop(%23, %14, %data.2)
+        block0(%25 : int, %26 : Tensor) {
+          %27 : int = aten::dim(%26)
+          %data.3 : Tensor = aten::unsqueeze(%26, %27)
+          -> (%14, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -40,5 +38,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
-  return (%res_data, %res_mask, %res_dims);
+  %34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%34);
 }

@@ -7,29 +7,28 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : float = prim::Constant[value=0.1]()
   %8 : Float() = prim::NumToTensor(%7)
-  %other : float = prim::TensorToNum(%8)
+  %other : float = prim::Float(%8)
   %10 : Tensor = aten::gt(%a.1_data, %other)
-  %11 : bool = prim::TensorToBool(%10)
-  %12 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%12)
+  %11 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%11)
   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %17 : bool = prim::Constant[value=1]()
-  %18 : int = prim::Constant[value=1]()
-  %19 : Tensor = aten::type_as(%a.1_mask, %10)
-  %data.2 : Tensor = aten::mul(%10, %19)
-  %21 : int = aten::dim(%data.2)
-  %22 : bool = aten::eq(%21, %18)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%22)
+  %16 : bool = prim::Constant[value=1]()
+  %17 : int = prim::Constant[value=1]()
+  %18 : Tensor = aten::type_as(%a.1_mask, %10)
+  %data.2 : Tensor = aten::mul(%10, %18)
+  %20 : int = aten::dim(%data.2)
+  %21 : bool = aten::eq(%20, %17)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
    block0() {
-      %25 : int = aten::dim(%data)
-      %26 : int = aten::sub(%25, %18)
-      %data.4 : Tensor = prim::Loop(%26, %17, %data.2)
-        block0(%28 : int, %29 : Tensor) {
-          %30 : int = aten::dim(%29)
-          %data.3 : Tensor = aten::unsqueeze(%29, %30)
-          -> (%17, %data.3)
+      %24 : int = aten::dim(%data)
+      %25 : int = aten::sub(%24, %17)
+      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
+        block0(%27 : int, %28 : Tensor) {
+          %29 : int = aten::dim(%28)
+          %data.3 : Tensor = aten::unsqueeze(%28, %29)
+          -> (%16, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -41,5 +40,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
-  return (%res_data, %res_mask, %res_dims);
+  %36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%36);
 }

@@ -9,38 +9,35 @@ graph(%a.1_data : Tensor
   %8 : Tensor = aten::gt(%a.1_data, %b_data)
   %9 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %10 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %11 : bool = prim::TensorToBool(%8)
-  %12 : int = prim::Constant[value=0]()
-  %13 : Tensor = aten::mul(%8, %9)
-  %14 : Tensor = aten::sum(%13)
-  %15 : Tensor = aten::gt(%14, %12)
-  %16 : bool = prim::TensorToBool(%15)
-  %17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims)
-    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
-      %30 : Long() = prim::NumToTensor(%6)
-      %alpha : float = prim::TensorToNum(%30)
+  %11 : int = prim::Constant[value=0]()
+  %12 : Tensor = aten::mul(%8, %9)
+  %13 : Tensor = aten::sum(%12)
+  %14 : Tensor = aten::gt(%13, %11)
+  %15 : bool = prim::Bool(%14)
+  %16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims)
+    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
+      %27 : Long() = prim::NumToTensor(%6)
+      %alpha : float = prim::Float(%27)
       %data : Tensor = aten::sub(%6_data, %b_data, %alpha)
       %mask : Tensor = aten::mul(%6_mask, %b_mask)
       %dims : Tensor = aten::__or__(%6_dims, %b_dims)
-      %35 : Tensor = aten::gt(%data, %b_data)
-      %36 : Tensor = aten::mul(%mask, %b_mask)
-      %37 : Tensor = aten::__or__(%dims, %b_dims)
-      %38 : bool = prim::TensorToBool(%35)
-      %39 : bool = prim::Constant[value=1]()
-      %40 : int = prim::Constant[value=1]()
-      %41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
-      %data.2 : Tensor = aten::mul(%cond_data.2, %41)
-      %43 : int = aten::dim(%data.2)
-      %44 : bool = aten::eq(%43, %40)
-      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%44)
+      %32 : Tensor = aten::gt(%data, %b_data)
+      %33 : Tensor = aten::mul(%mask, %b_mask)
+      %34 : bool = prim::Constant[value=1]()
+      %35 : int = prim::Constant[value=1]()
+      %36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
+      %data.2 : Tensor = aten::mul(%cond_data.2, %36)
+      %38 : int = aten::dim(%data.2)
+      %39 : bool = aten::eq(%38, %35)
+      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%39)
        block0() {
-          %47 : int = aten::dim(%data)
-          %48 : int = aten::sub(%47, %40)
-          %data.4 : Tensor = prim::Loop(%48, %39, %data.2)
-            block0(%50 : int, %51 : Tensor) {
-              %52 : int = aten::dim(%51)
-              %data.3 : Tensor = aten::unsqueeze(%51, %52)
-              -> (%39, %data.3)
+          %42 : int = aten::dim(%data)
+          %43 : int = aten::sub(%42, %35)
+          %data.4 : Tensor = prim::Loop(%43, %34, %data.2)
+            block0(%45 : int, %46 : Tensor) {
+              %47 : int = aten::dim(%46)
+              %data.3 : Tensor = aten::unsqueeze(%46, %47)
+              -> (%34, %data.3)
            }
           %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
           %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -52,12 +49,13 @@ graph(%a.1_data : Tensor
       %res_data : Tensor = aten::where(%cond_data, %data, %6_data)
       %res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask)
       %res_dims : Tensor = aten::__or__(%dims, %6_dims)
-      %59 : int = prim::Constant[value=0]()
-      %60 : Tensor = aten::mul(%35, %36)
-      %61 : Tensor = aten::sum(%60)
-      %62 : Tensor = aten::gt(%61, %59)
-      %63 : bool = prim::TensorToBool(%62)
-      -> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims)
+      %54 : int = prim::Constant[value=0]()
+      %55 : Tensor = aten::mul(%32, %33)
+      %56 : Tensor = aten::sum(%55)
+      %57 : Tensor = aten::gt(%56, %54)
+      %58 : bool = prim::Bool(%57)
+      -> (%58, %32, %33, %res_data, %res_mask, %res_dims)
    }
-  return (%a, %21, %22);
+  %59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20)
+  return (%59);
 }

Some files were not shown because too many files have changed in this diff.