Mirror of https://github.com/pytorch/pytorch.git
Compare commits: v2.2.2-rc1 ... v1.0.1 (101 commits)
@@ -1,14 +1,14 @@
 # IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
 # in this file to the new version number, and **ALSO** update the version number below:
-# PyTorchDockerVersion:262
-# Caffe2DockerVersion:230
+# PyTorchDockerVersion:282
+# Caffe2DockerVersion:238

 docker_config_defaults: &docker_config_defaults
   user: jenkins
   aws_auth:
     # This IAM user only allows read-write access to ECR
-    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
-    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
+    aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
+    aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}

 # NOTE: We only perform the merge in build step and not in test step, because
 # all source files will be shared from build to test
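The comment block above spells out the image-bump procedure in words. As an illustration only (not part of the config, and assuming GNU sed; the config path is an assumption), the ":262" to ":282" retagging in this diff could be scripted roughly like this:

```bash
# Hypothetical helper for the "search and update :{previous_version}" step described above.
# Assumes GNU sed; the file path is an assumption for illustration.
old=262
new=282
sed -i "s/:${old}\"/:${new}\"/g" .circleci/config.yml            # bump every DOCKER_IMAGE tag
sed -i "s/PyTorchDockerVersion:${old}/PyTorchDockerVersion:${new}/" .circleci/config.yml
```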
@@ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client
     sudo apt-get -qq update
     sudo apt-get -qq install openssh-client git

+install_doc_push_script: &install_doc_push_script
+  name: Install the doc push script
+  no_output_timeout: "2m"
+  command: |
+    cat >/home/circleci/project/doc_push_script.sh <<EOL
+    # =================== The following code **should** be executed inside Docker container ===================
+
+    # This is where the local pytorch install in the docker image is located
+    pt_checkout="/var/lib/jenkins/workspace"
+
+    # Since we're cat-ing this file, we need to escape all $'s
+    echo "doc_push_script.sh: Invoked with \$*"
+
+    git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
+    pushd pytorch.github.io
+
+    set -ex
+
+    # Argument 1: Where to copy the built documentation to
+    # (pytorch.github.io/$install_path)
+    install_path="\$1"
+    if [ -z "\$install_path" ]; then
+    echo "error: doc_push_script.sh: install_path (arg1) not specified"
+      exit 1
+    fi
+
+    # Argument 2: What version of the docs we are building.
+    version="\$2"
+    if [ -z "\$version" ]; then
+    echo "error: doc_push_script.sh: version (arg2) not specified"
+      exit 1
+    fi
+
+    is_master_doc=false
+    if [ "\$version" == "master" ]; then
+      is_master_doc=true
+    fi
+
+    # Argument 3: (optional) If present, we will NOT do any pushing. Used for testing.
+    dry_run=false
+    if [ "\$3" != "" ]; then
+      dry_run=true
+    fi
+
+    echo "install_path: \$install_path  version: \$version  dry_run: \$dry_run"
+
+    export LC_ALL=C
+    export PATH=/opt/conda/bin:$PATH
+
+    rm -rf pytorch || true
+
+    # Get all the documentation sources, put them in one place
+    pushd "\$pt_checkout"
+    git clone https://github.com/pytorch/vision
+    pushd vision
+    conda install -q pillow
+    time python setup.py install
+    popd
+    pushd docs
+    rm -rf source/torchvision
+    cp -r ../vision/docs/source source/torchvision
+
+    # Build the docs
+    pip -q install -r requirements.txt || true
+    if [ "\$is_master_doc" = true ]; then
+      make html
+    else
+      make html-stable
+    fi
+
+    # Move them into the docs repo
+    popd
+    popd
+    git rm -rf "\$install_path" || true
+    mv "\$pt_checkout/docs/build/html" "\$install_path"
+
+    # Add the version handler by search and replace.
+    # XXX: Consider moving this to the docs Makefile or site build
+    if [ "\$is_master_doc" = true ]; then
+      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
+    else
+      find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g"
+    fi
+
+    git add "\$install_path" || true
+    git status
+    git config user.email "soumith+bot@pytorch.org"
+    git config user.name "pytorchbot"
+    # If there aren't changes, don't make a commit; push is no-op
+    git commit -m "auto-generating sphinx docs" || true
+    git status
+
+    if [ "\$dry_run" = false ]; then
+      echo "Pushing to pytorch.github.io:site"
+      git push origin site
+    else
+      echo "Skipping push due to dry_run"
+    fi
+
+    popd
+    # =================== The above code **should** be executed inside Docker container ===================
+    EOL
+    chmod +x /home/circleci/project/doc_push_script.sh
+
 setup_ci_environment: &setup_ci_environment
   name: Set Up CI Environment
   no_output_timeout: "1h"
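For reference, the doc_push_script.sh generated above takes three positional arguments: the path to install the built docs into, the docs version being built, and an optional third argument that turns the run into a dry run. The invocations below are the ones used by the pytorch_doc_push job later in this diff:

```bash
# Usage of the generated script inside the Docker container (taken from the doc-push job below):
./doc_push_script.sh docs/master master          # build master docs and push them
./doc_push_script.sh docs/stable 1.0.1           # build stable (v1.0.1) docs and push them
./doc_push_script.sh docs/master master dry_run  # build only; skip the push (used for open PRs)
```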
@@ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment
       echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env

       # This IAM user allows write access to S3 bucket for sccache
-      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
-      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
+      echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
+      echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
     fi

     # This IAM user only allows read-write access to ECR
-    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
-    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
+    export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
+    export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}
     eval $(aws ecr get-login --region us-east-1 --no-include-email)

 pytorch_linux_build_defaults: &pytorch_linux_build_defaults
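The `eval $(aws ecr get-login ...)` line relies on AWS CLI v1 behavior: `get-login` prints a complete `docker login` command, which the surrounding eval then executes. Roughly (token elided, registry URL shown only as an example matching the DOCKER_IMAGE values in this file):

```bash
# Approximate output of `aws ecr get-login --region us-east-1 --no-include-email` (AWS CLI v1);
# the authorization token is elided and the registry URL is illustrative.
docker login -u AWS -p <authorization-token> https://308535385114.dkr.ecr.us-east-1.amazonaws.com
```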
@@ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
       <<: *setup_ci_environment
   - run:
       name: Test
-      no_output_timeout: "90m"
+      no_output_timeout: "1h"
       command: |
         set -e
         export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
@@ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults

           export IN_CIRCLECI=1

-          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-          brew install moreutils --without-parallel
+          # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+          # so we must unlink GNU `parallel` first, and relink it afterwards
+          brew unlink parallel
+          brew install moreutils
+          brew link parallel --overwrite
           brew install cmake
           brew install expect

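The unlink/relink sequence above replaces the `--without-parallel` install option, which newer Homebrew no longer accepts. A purely illustrative sanity check of the result (not part of the CI config) could be:

```bash
# Illustrative check, not part of the CI config: after relinking, the `parallel`
# on PATH should be GNU parallel rather than the moreutils wrapper.
parallel --version | head -n 1     # GNU parallel prints "GNU parallel <release>"
ls -l "$(command -v parallel)"     # shows which Homebrew keg the symlink points to
```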
@@ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
           export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2

           # This IAM user allows write access to S3 bucket for sccache
-          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+          export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+          export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

           export SCCACHE_BIN=${PWD}/sccache_bin
           mkdir -p ${SCCACHE_BIN}
@@ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
             sccache --show-stats
           fi

+##############################################################################
+##############################################################################
+# Job specifications
+##############################################################################
+##############################################################################
+
 version: 2
 jobs:
   pytorch_linux_trusty_py2_7_9_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py2_7_9_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py2_7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py2_7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_5_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_5_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc4_8_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc4_8_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc5_4_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc5_4_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_py3_6_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_py3_6_gcc7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_trusty_pynightly_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_trusty_pynightly_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
     resource_class: large
     <<: *pytorch_linux_test_defaults

   pytorch_linux_xenial_py3_clang5_asan_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
       PYTHON_VERSION: "3.6"
     <<: *pytorch_linux_build_defaults

   pytorch_linux_xenial_py3_clang5_asan_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
       PYTHON_VERSION: "3.6"
     resource_class: large
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_build:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_build:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
+      BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3"
     <<: *pytorch_linux_build_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
       MULTI_GPU: "1"
     resource_class: gpu.large
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
     <<: *pytorch_linux_test_defaults

-  pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
+  pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
     environment:
-      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
@@ -517,7 +631,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py2_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
       PYTHON_VERSION: "2.7"
       CUDA_VERSION: "9"
     <<: *pytorch_linux_build_defaults
@@ -525,7 +639,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py2_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
       PYTHON_VERSION: "2.7"
       CUDA_VERSION: "9"
     resource_class: gpu.medium
@@ -534,7 +648,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py3_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9"
     <<: *pytorch_linux_build_defaults
@@ -542,7 +656,7 @@ jobs:
   pytorch_linux_xenial_cuda9_cudnn7_py3_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9"
     resource_class: gpu.medium
@@ -551,7 +665,7 @@ jobs:
   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9.2"
     <<: *pytorch_linux_build_defaults
@@ -559,7 +673,7 @@ jobs:
   pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "9.2"
     resource_class: gpu.medium
@@ -568,7 +682,7 @@ jobs:
   pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build:
     environment:
       JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "10"
     <<: *pytorch_linux_build_defaults
@@ -576,7 +690,7 @@ jobs:
   pytorch_short_perf_test_gpu:
     environment:
       JOB_BASE_NAME: pytorch-short-perf-test-gpu
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
       PYTHON_VERSION: "3.6"
       CUDA_VERSION: "8"
     resource_class: gpu.medium
@@ -597,8 +711,8 @@ jobs:

           docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env
           # This IAM user allows write access to S3 bucket for perf test numbers
-          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
-          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
+          echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
+          echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
           docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env

           export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
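The COMMAND construction above (also used by the doc-push job below) works by echoing a small script into an interactive `docker exec` session and capturing its output. A stripped-down sketch of the same pattern, with a hypothetical inner script name, is:

```bash
# Stripped-down sketch of the COMMAND pattern used in these jobs. "$id" is the container
# started earlier with `docker run -t -d`; some_ci_step.sh is a made-up name for illustration.
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "cd workspace && ./some_ci_step.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts   # unbuffer comes from expect, ts from moreutils
```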
@@ -607,7 +721,7 @@ jobs:
   pytorch_doc_push:
     environment:
       JOB_BASE_NAME: pytorch-doc-push
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
     resource_class: large
     machine:
       image: default
@@ -615,72 +729,39 @@ jobs:
     - run:
         <<: *setup_ci_environment
     - run:
-        name: Doc Push
+        <<: *install_doc_push_script
+    - run:
+        name: Doc Build and Push
         no_output_timeout: "1h"
         command: |
           set -e
-          if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
-            echo "Skipping doc push..."
-            exit 0
-          fi
           export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
           echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
           docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
           export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})

-          cat >/home/circleci/project/doc_push_script.sh <<EOL
-          # =================== The following code will be executed inside Docker container ===================
-          git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
-          pushd pytorch.github.io
-
-          set -ex
-
-          export LC_ALL=C
-          export PATH=/opt/conda/bin:$PATH
-
-          rm -rf pytorch || true
-
-          # Get all the documentation sources, put them in one place
-          # TODO: These clones can race
-          git clone https://github.com/pytorch/pytorch
-          pushd pytorch
-          git clone https://github.com/pytorch/vision
-          pushd vision
-          conda install -q pillow
-          time python setup.py install
-          popd
-          pushd docs
-          rm -rf source/torchvision
-          cp -r ../vision/docs/source source/torchvision
-
-          # Build the docs
-          pip -q install -r requirements.txt || true
-          make html
-
-          # Move them into the docs repo
-          popd
-          popd
-          git rm -rf docs/master || true
-          mv pytorch/docs/build/html docs/master
-          find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g'
-          git add docs/master || true
-          git status
-          git config user.email "soumith+bot@pytorch.org"
-          git config user.name "pytorchbot"
-          # If there aren't changes, don't make a commit; push is no-op
-          git commit -m "auto-generating sphinx docs" || true
-          git status
-          git push origin site
-
-          popd
-          # =================== The above code will be executed inside Docker container ===================
-          EOL
-          chmod +x /home/circleci/project/doc_push_script.sh
           docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh

-          export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          # master branch docs push
+          if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          # stable release docs push. We keep an eternal PR open for merging
+          # v1.0.1 -> master; everytime v1.0.1 is updated the following is run.
+          elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          # For open PRs: Do a dry_run of the docs build, don't push build
+          else
+            export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          fi
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # Save the docs build so we can debug any problems
+          export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug
+          docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE}
+          docker push ${DEBUG_COMMIT_DOCKER_IMAGE}

   pytorch_macos_10_13_py3_build:
     macos:
       xcode: "9.0"
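Because the job commits and pushes a `-debug` image of the finished container, a failed docs build can be inspected afterwards. A hypothetical follow-up session (the tag below is made up; the real one is `${DOCKER_IMAGE}-${CIRCLE_SHA1}-debug`):

```bash
# Hypothetical debugging session against the image pushed by the doc-push job; the tag is illustrative only.
DEBUG_IMAGE="308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282-<sha1>-debug"
docker pull "${DEBUG_IMAGE}"
docker run -it -w /var/lib/jenkins "${DEBUG_IMAGE}" bash   # inspect the workspace and the docs build output
```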
@@ -696,8 +777,11 @@ jobs:
             set -e

             export IN_CIRCLECI=1
-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             # Install sccache
@@ -706,8 +790,8 @@ jobs:

             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
             # This IAM user allows write access to S3 bucket for sccache
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

             git submodule sync && git submodule update -q --init
             chmod a+x .jenkins/pytorch/macos-build.sh
@@ -740,8 +824,11 @@ jobs:
           command: |
             set -e
             export IN_CIRCLECI=1
-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project
@@ -765,8 +852,11 @@ jobs:

             export IN_CIRCLECI=1

-            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
-            brew install moreutils --without-parallel
+            # moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
+            # so we must unlink GNU `parallel` first, and relink it afterwards
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
             brew install expect

             # Install CUDA 9.2
@@ -790,30 +880,13 @@ jobs:
             sudo chmod +x /usr/local/bin/sccache
             export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
             # This IAM user allows write access to S3 bucket for sccache
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}

             git submodule sync && git submodule update -q --init
             chmod a+x .jenkins/pytorch/macos-build.sh
             unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts

-  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
-    environment:
-      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
-      CUDA_VERSION: "8"
-      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
-    <<: *caffe2_linux_build_defaults
-
-  caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
-    environment:
-      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
-      CUDA_VERSION: "8"
-      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
-    resource_class: gpu.medium
-    <<: *caffe2_linux_test_defaults
-
   caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
@@ -896,11 +969,20 @@ jobs:
   caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
-      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
+      CUDA_VERSION: "8"
       BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
-      BUILD_ONLY: "1"
     <<: *caffe2_linux_build_defaults

+  caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
+    environment:
+      JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
+      CUDA_VERSION: "8"
+      BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
+    resource_class: gpu.medium
+    <<: *caffe2_linux_test_defaults
+
   caffe2_py2_gcc4_9_ubuntu14_04_build:
     environment:
       JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
@@ -1008,25 +1090,25 @@ workflows:
       - pytorch_linux_xenial_py3_clang5_asan_test:
          requires:
            - pytorch_linux_xenial_py3_clang5_asan_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_test:
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
-      - pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
+      - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_short_perf_test_gpu:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_doc_push:
          requires:
-            - pytorch_linux_xenial_cuda8_cudnn6_py3_build
+            - pytorch_linux_xenial_cuda8_cudnn7_py3_build
       - pytorch_linux_xenial_cuda9_cudnn7_py2_build
       - pytorch_linux_xenial_cuda9_cudnn7_py2_test:
          requires:
@@ -1047,10 +1129,6 @@ workflows:
            - pytorch_macos_10_13_py3_build
       - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build

-      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
-      - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
-          requires:
-            - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
       - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
          requires:
@@ -1072,6 +1150,9 @@ workflows:
          requires:
            - caffe2_onnx_py2_gcc5_ubuntu16_04_build
       - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
+      - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
+          requires:
+            - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
       - caffe2_py2_clang3_8_ubuntu16_04_build
       - caffe2_py2_clang3_9_ubuntu16_04_build
       - caffe2_py2_clang7_ubuntu16_04_build
@@ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")

 if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
   CMAKE_ARGS+=("-DBLAS=MKL")
+  CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
 fi
 if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
   CMAKE_ARGS+=("-DUSE_CUDA=ON")
@@ -14,18 +14,8 @@ clang --version
 # symbolize=1: Gives us much better errors when things go wrong
 export ASAN_OPTIONS=detect_leaks=0:symbolize=1

-# FIXME: Remove the hardcoded "-pthread" option.
-# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
-# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
-# have the full pthread implementation. Other advanced pthread functions doesn't
-# exist in libasan.so[2]. If we need some pthread advanced functions, we still
-# need to link the pthread library.
-# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
-# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
-#
 # TODO: Make the ASAN flags a more unified env var
 CC="clang" CXX="clang++" LDSHARED="clang --shared" \
-  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
-  CXX_FLAGS="-pthread" \
+  CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
   NO_CUDA=1 USE_MKLDNN=0 \
   python setup.py install
@@ -129,7 +129,7 @@ fi
 git add -f build/bin

 # Test documentation build
-if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
   pushd docs
   # TODO: Don't run this here
   pip install -q -r requirements.txt || true
@@ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
 fi

 # Test standalone c10 build
-if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
   mkdir -p c10/build
   pushd c10/build
   cmake ..
@@ -122,7 +122,7 @@ fi
 # Use conda cmake in some CI build. Conda cmake will be newer than our supported
 # min version 3.5, so we only do it in two builds that we know should use conda.
 if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \
+  if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \
      [[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then
     if ! which conda; then
       echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
@@ -5,9 +5,9 @@
 # in this file will report a failure (so you don't forget to
 # reenable the tests on merge ;)

-pytorch-linux-xenial-cuda8-cudnn6-py3-build
-pytorch-linux-xenial-cuda8-cudnn6-py3-test
-pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
+pytorch-linux-xenial-cuda8-cudnn7-py3-build
+pytorch-linux-xenial-cuda8-cudnn7-py3-test
+pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
 pytorch-linux-xenial-cuda9-cudnn7-py2-build
 pytorch-linux-xenial-cuda9-cudnn7-py2-test
 pytorch-linux-xenial-cuda9-cudnn7-py3-build
@@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
     sccache --show-stats
     sccache --zero-stats
     rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
+    for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
+      IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
+        del /S /Q %%i
+      )
+    )
     copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
   )

@@ -34,10 +34,4 @@ matrix:
         script: cd docs/cpp/source && ./check-doxygen.sh
       - env: CLANG_TIDY
         python: "3.6"
-        addons:
-          apt:
-            sources:
-              - ubuntu-toolchain-r-test
-              - llvm-toolchain-trusty
-            packages: clang-tidy
         script: tools/run-clang-tidy-in-ci.sh
@@ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
 	--generate-code arch=compute_50,code=sm_50
 	--generate-code arch=compute_60,code=sm_60
 	--generate-code arch=compute_70,code=sm_70)
+    elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
+      SET(CUFFT_FAKELINK_OPTIONS
+	--generate-code arch=compute_35,code=sm_35
+	--generate-code arch=compute_50,code=sm_50
+	--generate-code arch=compute_60,code=sm_60
+	--generate-code arch=compute_70,code=sm_70)
     else()
       MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
     endif()
@@ -2122,55 +2122,6 @@
     - arg: THTensor* self
     - arg: THTensor* tensor
 ]]
-[[
-  name: _th_tril
-  cname: tril
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THTensor* result
-      output: True
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_tril_
-  cname: tril
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_triu
-  cname: triu
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THTensor* result
-      output: True
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
-[[
-  name: _th_triu_
-  cname: triu
-  variants:
-    - function
-  return: self
-  arguments:
-    - THTensor* self
-    - THTensor* self
-    - arg: long diagonal
-      default: 0
-]]
 [[
   name: _th_cross
   cname: cross
@@ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) {
     reduce_dims.push_back(i);
   }
   for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
-    if (shape[i - leading_dims] == 1 && sizes[i] > 1) {
+    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
       reduce_dims.push_back(i);
     }
   }
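
The hunk above widens the reduction test from sizes[i] > 1 to sizes[i] != 1, so a zero-sized dimension being broadcast against a target size of 1 is now also summed away. A self-contained sketch of that dimension-selection logic (illustrative names only, not the ATen implementation):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative: pick the dims to reduce when summing `sizes` down to the
// broadcast target `shape`, mirroring the loop in the hunk above.
std::vector<int64_t> reduce_dims_for_sum_to(const std::vector<int64_t>& sizes,
                                            const std::vector<int64_t>& shape) {
  std::vector<int64_t> reduce_dims;
  const int64_t leading_dims = static_cast<int64_t>(sizes.size() - shape.size());
  for (int64_t i = 0; i < leading_dims; ++i) {
    reduce_dims.push_back(i);  // extra leading dims are always reduced
  }
  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
    // `!= 1` (rather than `> 1`) also catches size-0 dims broadcast against 1.
    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
      reduce_dims.push_back(i);
    }
  }
  return reduce_dims;
}

int main() {
  for (int64_t d : reduce_dims_for_sum_to({4, 0, 3}, {1, 3})) {
    std::cout << d << ' ';  // prints "0 1": the leading dim and the size-0 dim
  }
  std::cout << '\n';
}
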
@@ -81,6 +81,39 @@ inline void parallel_for(
 #endif
 }

+/*
+parallel_reduce
+
+begin: index at which to start applying reduction
+
+end: index at which to stop applying reduction
+
+grain_size: number of elements per chunk. impacts number of elements in
+intermediate results tensor and degree of parallelization.
+
+ident: identity for binary combination function sf. sf(ident, x) needs to return
+x.
+
+f: function for reduction over a chunk. f needs to be of signature scalar_t
+f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy)
+
+sf: function to combine two partial results. sf needs to be of signature
+scalar_t sf(scalar_t x, scalar_t y)
+
+For example, you might have a tensor of 10000 entires and want to sum together
+all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
+an intermediate result tensor with 4 elements. Then it will execute the function
+"f" you provide and pass the beginning and end index of these chunks, so
+0-24999, 2500-4999, etc. and the combination identity. It will then write out
+the result from each of these chunks into the intermediate result tensor. After
+that it'll reduce the partial results from each chunk into a single number using
+the combination function sf and the identity ident. For a total summation this
+would be "+" and 0 respectively. This is similar to tbb's approach [1], where
+you need to provide a function to accumulate a subrange, a function to combine
+two partial results and an identity.
+
+[1] https://software.intel.com/en-us/node/506154
+*/
 template <class scalar_t, class F, class SF>
 inline scalar_t parallel_reduce(
     const int64_t begin,
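
To make the documented contract concrete, here is a minimal, self-contained sketch of the same chunk-then-combine scheme. It is serial and purely illustrative (not ATen's implementation; all names are assumptions), but it follows the roles of f, sf, ident and grain_size described in the new comment:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative serial analogue: `f` reduces one chunk starting from `ident`,
// `sf` combines partial results, `ident` is the identity of `sf`.
template <class scalar_t, class F, class SF>
scalar_t chunked_reduce(int64_t begin, int64_t end, int64_t grain_size,
                        scalar_t ident, const F& f, const SF& sf) {
  std::vector<scalar_t> partials;
  for (int64_t b = begin; b < end; b += grain_size) {
    int64_t e = std::min(b + grain_size, end);
    partials.push_back(f(b, e, ident));     // one partial result per chunk
  }
  scalar_t result = ident;
  for (scalar_t p : partials) {
    result = sf(result, p);                 // combine the partials
  }
  return result;
}

int main() {
  std::vector<double> data(10000, 1.0);
  // Sum 10000 elements with grain_size 2500 -> 4 chunks; "+" and 0 as sf/ident.
  double total = chunked_reduce<double>(
      0, static_cast<int64_t>(data.size()), 2500, 0.0,
      [&](int64_t b, int64_t e, double id) {
        double acc = id;
        for (int64_t i = b; i < e; ++i) acc += data[i];
        return acc;
      },
      [](double x, double y) { return x + y; });
  std::cout << total << '\n';  // 10000
}
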
@@ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) {

 void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) {
   AT_CHECK(
-    t.type().backend() == backend,
+    !t.defined() || t.type().backend() == backend,
     "Expected tensor to have ", toString(backend),
     " Backend, but got tensor with ", toString(t.type().backend()), " Backend ",
     "(while checking arguments for ", c, ")");
@@ -52,14 +52,11 @@ namespace c10 {
   _(prim, TupleSlice)              \
   _(prim, ListConstruct)           \
   _(prim, ListUnpack)              \
-  _(prim, BoolToTensor)            \
   _(prim, NumToTensor)             \
-  _(prim, TensorToNum)             \
   _(prim, ImplicitTensorToNum)     \
-  _(prim, TensorToBool)            \
-  _(prim, IntToFloat)              \
-  _(prim, FloatToInt)              \
-  _(prim, StringToFloat)           \
+  _(prim, Bool)                    \
+  _(prim, Int)                     \
+  _(prim, Float)                   \
   _(prim, device)                  \
   _(prim, dtype)                   \
   _(prim, shape)                   \
@@ -139,7 +136,8 @@ namespace c10 {
   _(attr, name)                    \
   _(attr, a)                       \
   _(attr, b)                       \
-  _(attr, beg)
+  _(attr, beg)                     \
+  _(attr, idx)
 #else
 #define FORALL_NS_SYMBOLS(_) \
   _(namespaces, prim)              \
@@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
     ss << "Future[" << getElementType()->python_str() << "]";
     return ss.str();
   }
+  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
+    return create(contained_types.at(0));
+  }
 private:
   FutureType(TypePtr elem) : SingleElementType(elem) {}
 };
@@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
 }

 inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
-  AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
   if (typ->isSubtypeOf(IntType::get())) {
     return CompleteTensorType::create(at::kLong, at::kCPU, {});
   } else if (typ->isSubtypeOf(FloatType::get())) {
@@ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
 template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
 template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }

-CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
+CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);

 using TypeEnv = std::unordered_map<std::string, TypePtr>;
 struct MatchTypeReturn {
@@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
   return value;
 }

-TypePtr inferTypeFrom(const IValue& value) {
+// why incomplete? You cannot completely recover a type from
+// an IValue, List[List[int]] and List[List[Tensor]] will both
+// become ivalue.isGenericList() and cannot be recovered.
+// The only appropriate place to use this is where you know that
+// you are only dealing with a subset of objects where you can recover
+// the type, like in the tracer.
+TypePtr incompleteInferTypeFrom(const IValue& value) {
   if (value.isTensor()) {
     return CompleteTensorType::create(value.toTensor());
   } else if (value.isDouble()) {
@@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
   } else if (value.isDoubleList()) {
     return ListType::ofFloats();
   } else if (value.isTuple()) {
-    return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
+    return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
   } else if (value.isDevice()) {
     return DeviceObjType::get();
   }
-  AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
+  AT_ERROR("Type cannot be accurately recovered from this IValue.");
 }

 c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {
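
The new comment explains why the renamed helper is only a partial inverse: a runtime value does not carry enough information to pin down nested container element types. A small, self-contained sketch of that limitation, with a toy value model (illustrative types, not the JIT's IValue/Type classes):

#include <iostream>
#include <string>
#include <vector>

// Illustrative value model: a generic list does not remember its element type.
struct Value {
  enum class Kind { Int, Double, GenericList } kind;
  std::vector<Value> elements;  // only meaningful for GenericList
};

// "Incomplete" inference: for an empty generic list there is no way to say
// whether it was built as List[int], List[Tensor], or anything else.
std::string infer_type(const Value& v) {
  switch (v.kind) {
    case Value::Kind::Int:    return "int";
    case Value::Kind::Double: return "float";
    case Value::Kind::GenericList:
      if (v.elements.empty()) {
        return "List[?]";  // element type unrecoverable from the value alone
      }
      return "List[" + infer_type(v.elements.front()) + "]";
  }
  return "?";
}

int main() {
  Value empty_list{Value::Kind::GenericList, {}};
  std::cout << infer_type(empty_list) << '\n';  // List[?]
}
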
@@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
     vec256::Vec256<scalar_t> acc_vec,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  scalar_t acc_arr[Vec::size];
+  scalar_t acc_arr[Vec::size()];
   acc_vec.store(acc_arr);
   for (int64_t i = 1; i < size; i++) {
-    scalar_t acc_arr_next[Vec::size];
+    scalar_t acc_arr_next[Vec::size()];
     acc_arr_next[0] = acc_arr[i];
     Vec acc_vec_next = Vec::loadu(acc_arr_next);
     acc_vec = vec_fun(acc_vec, acc_vec_next);
@@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
 template <typename scalar_t, typename Op>
 inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size)
+  if (size < Vec::size())
     return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = Vec::loadu(data);
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     acc_vec = vec_fun(acc_vec, data_vec);
   }
@@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
     Vec data_vec = Vec::loadu(data + d, size - d);
     acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(vec_fun, acc_vec, Vec::size);
+  return vec_reduce_all(vec_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
     scalar_t* data,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size)
+  if (size < Vec::size())
     return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = map_fun(Vec::loadu(data));
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     data_vec = map_fun(data_vec);
     acc_vec = red_fun(acc_vec, data_vec);
@@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
     data_vec = map_fun(data_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size);
+  return vec_reduce_all(red_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
     const scalar_t* data2,
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
-  if (size < Vec::size) {
+  if (size < Vec::size()) {
     Vec data_vec = Vec::loadu(data, size);
     Vec data2_vec = Vec::loadu(data2, size);
     data_vec = map_fun(data_vec, data2_vec);
     return vec_reduce_all(red_fun, data_vec, size);
   }
-  int64_t d = Vec::size;
+  int64_t d = Vec::size();
   Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(data + d);
     Vec data2_vec = Vec::loadu(data2 + d);
     data_vec = map_fun(data_vec, data2_vec);
@@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
     data_vec = map_fun(data_vec, data2_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size);
+  return vec_reduce_all(red_fun, acc_vec, Vec::size());
 }

 template <typename scalar_t, typename Op>
@@ -103,7 +103,7 @@ inline void map(
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
   int64_t d = 0;
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec output_vec = vec_fun(Vec::loadu(input_data + d));
     output_vec.store(output_data + d);
   }
@@ -122,7 +122,7 @@ inline void map2(
     int64_t size) {
   using Vec = vec256::Vec256<scalar_t>;
   int64_t d = 0;
-  for (; d < size - (size % Vec::size); d += Vec::size) {
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
     Vec data_vec = Vec::loadu(input_data + d);
     Vec data_vec2 = Vec::loadu(input_data2 + d);
     Vec output_vec = vec_fun(data_vec, data_vec2);
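
All of the loops above share one shape, unchanged by this diff apart from size becoming size(): process full Vec-width lanes in the main loop, then handle the remainder with a partial load and masked set, and finish with a horizontal reduction. A scalar sketch of that main-loop-plus-tail structure (plain arrays, W standing in for Vec256<T>::size(); not the Vec256 API):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative scalar analogue of reduce_all's loop structure.
double reduce_sum(const double* data, int64_t size) {
  constexpr int64_t W = 4;  // pretend lane width
  if (size < W) {
    double acc = 0;
    for (int64_t i = 0; i < size; ++i) acc += data[i];
    return acc;
  }
  double lanes[W] = {0, 0, 0, 0};
  int64_t d = 0;
  for (; d < size - (size % W); d += W) {        // full-width main loop
    for (int64_t l = 0; l < W; ++l) lanes[l] += data[d + l];
  }
  for (; d < size; ++d) lanes[0] += data[d];     // tail, like the partial loadu/set
  double acc = 0;
  for (int64_t l = 0; l < W; ++l) acc += lanes[l];  // horizontal reduce
  return acc;
}

int main() {
  std::vector<double> v(10, 1.0);
  std::cout << reduce_sum(v.data(), static_cast<int64_t>(v.size())) << '\n';  // 10
}
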
@@ -15,14 +15,24 @@

 namespace at {
 namespace vec256 {

+// Note [Acceptable use of anonymous namespace in header]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Yes you saw right, this is an anonymous namespace in a header.  This header,
+// and all of its subheaders, REQUIRE their code to be entirely inlined into
+// the compilation unit that uses them.  It's important that these functions have
+// internal linkage so that kernels for different architectures don't get
+// combined during linking. It's sufficient to label functions "static", but
+// class methods must be an unnamed namespace to have internal linkage (since
+// static means something different in the context of classes).
 namespace {

 template <typename T>
 std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
-  T buf[Vec256<T>::size];
+  T buf[Vec256<T>::size()];
   vec.store(buf);
   stream << "vec[";
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     if (i != 0) {
       stream << ", ";
     }
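
As the new note explains, the unnamed namespace gives every function and class in these headers internal linkage, so each translation unit that includes them keeps its own copy and kernels compiled for different instruction sets cannot be merged at link time. A toy, single-file sketch of the mechanism (hypothetical names; in a real build the function would sit in a shared header included by per-ISA .cpp files):

#include <iostream>

namespace {  // internal linkage: each including TU would get its own sum4,
             // so an AVX2-compiled copy and a default copy never collide.
inline float sum4(const float* p) {
  return p[0] + p[1] + p[2] + p[3];
}
}  // namespace

int main() {
  const float x[4] = {1, 2, 3, 4};
  std::cout << sum4(x) << '\n';  // 10
}
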
@@ -20,6 +20,7 @@

 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {

 template<size_t n> struct int_of_size;
@@ -45,15 +46,49 @@ struct Vec256 {
 private:
   T values[32 / sizeof(T)] = {0};
 public:
-  static constexpr int size = 32 / sizeof(T);
+  // Note [constexpr static function to avoid odr-usage compiler bug]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Why, you might ask, is size defined to be a static constexpr function,
+  // rather than a more ordinary 'static constexpr int size;' variable?
+  // The problem lies within ODR rules for static constexpr members versus
+  // static constexpr functions.  First, recall that this class (along with all
+  // of its derivations) live in an anonymous namespace: they are intended to be
+  // *completely* inlined at their use-sites, because we need to compile it
+  // multiple times for different instruction sets.
+  //
+  // Because of this constraint, we CANNOT provide a single definition for
+  // any static members in this class; since we want to compile the class
+  // multiple times, there wouldn't actually be any good place to put the
+  // definition.  Now here is the problem: if we ODR-use a static constexpr
+  // member, we are *obligated* to provide a definition.  Without the
+  // definition, you get a compile error like:
+  //
+  //    relocation R_X86_64_PC32 against undefined symbol
+  //    `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
+  //    a shared object; recompile with -fPIC
+  //
+  // If this were C++17, we could replace a static constexpr variable with
+  // an inline variable which doesn't require one definition. But we are not
+  // C++17.  So the next best thing is to replace the member with a static
+  // constexpr (and therefore inline) function, which does not require ODR
+  // either.
+  //
+  // Also, technically according to the C++ standard, we don't have to define
+  // a constexpr variable if we never odr-use it.  But it seems that some
+  // versions GCC/Clang have buggy determinations on whether or not an
+  // identifier is odr-used or not, and in any case it's hard to tell if
+  // a variable is odr-used or not.  So best to just cut the problem at the root.
+  static constexpr int size() {
+    return 32 / sizeof(T);
+  }
   Vec256() {}
   Vec256(T val) {
-    for (int i = 0; i != size; i++) {
+    for (int i = 0; i != size(); i++) {
       values[i] = val;
     }
   }
   template<typename... Args,
-           typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
+           typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
   Vec256(Args... vals) {
     values = { vals... };
   }
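
A minimal illustration of the distinction the note describes: under C++14, odr-using a static constexpr data member (for example, passing it by const reference) requires an out-of-class definition, while a static constexpr member function does not. Hypothetical class names, not the real Vec256:

#include <algorithm>
#include <iostream>

struct WithFunc {
  // constexpr function: implicitly inline, no out-of-class definition needed
  // even when the returned value is bound to a reference parameter.
  static constexpr int size() { return 8; }
};

struct WithVar {
  // constexpr data member: fine for pure compile-time use, but odr-using it
  // (std::min takes its arguments by const reference) would require
  //   constexpr int WithVar::size;
  // in exactly one translation unit under C++14 -- impossible for a class that
  // must be recompiled per instruction set inside an anonymous namespace.
  static constexpr int size = 8;
};

int main() {
  // Safe: size() yields a prvalue, nothing is odr-used.
  std::cout << std::min(4, WithFunc::size()) << '\n';  // 4
  // Under C++14 the line below would odr-use WithVar::size and need the
  // out-of-class definition mentioned above in order to link:
  // std::cout << std::min(4, WithVar::size) << '\n';
  std::cout << WithVar::size << '\n';                   // 8 (not odr-used here)
}
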
@ -61,7 +96,7 @@ public:
 | 
				
			|||||||
  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
 | 
					  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
 | 
				
			||||||
    int64_t mask = mask_;
 | 
					    int64_t mask = mask_;
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (mask & 0x01) {
 | 
					      if (mask & 0x01) {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
@ -74,9 +109,9 @@ public:
 | 
				
			|||||||
  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
 | 
					  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
 | 
				
			||||||
                          const Vec256<T>& mask) {
 | 
					                          const Vec256<T>& mask) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    int_same_size_t<T> buffer[size];
 | 
					    int_same_size_t<T> buffer[size()];
 | 
				
			||||||
    mask.store(buffer);
 | 
					    mask.store(buffer);
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (buffer[i] & 0x01)
 | 
					      if (buffer[i] & 0x01)
 | 
				
			||||||
       {
 | 
					       {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
@ -88,14 +123,14 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
 | 
					  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      vec.values[i] = base + i * step;
 | 
					      vec.values[i] = base + i * step;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return vec;
 | 
					    return vec;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
 | 
					  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
 | 
				
			||||||
    Vec256 vec;
 | 
					    Vec256 vec;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      if (i < count) {
 | 
					      if (i < count) {
 | 
				
			||||||
        vec[i] = b[i];
 | 
					        vec[i] = b[i];
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
@ -114,7 +149,7 @@ public:
 | 
				
			|||||||
    std::memcpy(vec.values, ptr, count * sizeof(T));
 | 
					    std::memcpy(vec.values, ptr, count * sizeof(T));
 | 
				
			||||||
    return vec;
 | 
					    return vec;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  void store(void* ptr, int count = size) const {
 | 
					  void store(void* ptr, int count = size()) const {
 | 
				
			||||||
    std::memcpy(ptr, values, count * sizeof(T));
 | 
					    std::memcpy(ptr, values, count * sizeof(T));
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  const T& operator[](int idx) const {
 | 
					  const T& operator[](int idx) const {
 | 
				
			||||||
@ -125,14 +160,14 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> map(T (*f)(T)) const {
 | 
					  Vec256<T> map(T (*f)(T)) const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i != size; i++) {
 | 
					    for (int64_t i = 0; i != size(); i++) {
 | 
				
			||||||
      ret[i] = f(values[i]);
 | 
					      ret[i] = f(values[i]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> abs() const {
 | 
					  Vec256<T> abs() const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      ret[i] = values[i] < 0 ? -values[i] : values[i];
 | 
					      ret[i] = values[i] < 0 ? -values[i] : values[i];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
@ -214,7 +249,7 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  Vec256<T> pow(const Vec256<T> &exp) const {
 | 
					  Vec256<T> pow(const Vec256<T> &exp) const {
 | 
				
			||||||
    Vec256<T> ret;
 | 
					    Vec256<T> ret;
 | 
				
			||||||
    for (int64_t i = 0; i < size; i++) {
 | 
					    for (int64_t i = 0; i < size(); i++) {
 | 
				
			||||||
      ret[i] = std::pow(values[i], exp[i]);
 | 
					      ret[i] = std::pow(values[i], exp[i]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
@ -222,7 +257,7 @@ public:
 | 
				
			|||||||
#define DEFINE_COMP(binary_pred)                                              \
 | 
					#define DEFINE_COMP(binary_pred)                                              \
 | 
				
			||||||
  Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \
 | 
					  Vec256<T> operator binary_pred(const Vec256<T> &other) const {              \
 | 
				
			||||||
    Vec256<T> vec;                                                            \
 | 
					    Vec256<T> vec;                                                            \
 | 
				
			||||||
    for (int64_t i = 0; i != size; i++) {                                     \
 | 
					    for (int64_t i = 0; i != size(); i++) {                                     \
 | 
				
			||||||
      if (values[i] binary_pred other.values[i]) {                            \
 | 
					      if (values[i] binary_pred other.values[i]) {                            \
 | 
				
			||||||
        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \
 | 
					        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));     \
 | 
				
			||||||
      } else {                                                                \
 | 
					      } else {                                                                \
 | 
				
			||||||
@ -242,7 +277,7 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] + b[i];
 | 
					    c[i] = a[i] + b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] - b[i];
 | 
					    c[i] = a[i] - b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
 | 
					template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] * b[i];
 | 
					    c[i] = a[i] * b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
 | 
					template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
 | 
				
			||||||
  Vec256<T> c = Vec256<T>();
 | 
					  Vec256<T> c = Vec256<T>();
 | 
				
			||||||
  for (int i = 0; i != Vec256<T>::size; i++) {
 | 
					  for (int i = 0; i != Vec256<T>::size(); i++) {
 | 
				
			||||||
    c[i] = a[i] / b[i];
 | 
					    c[i] = a[i] / b[i];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return c;
 | 
					  return c;
 | 
				
			||||||
@@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
 // either input is a NaN.
 template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
   Vec256<T> c = Vec256<T>();
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     c[i] = (a[i] > b[i]) ? a[i] : b[i];
     if (std::is_floating_point<T>::value && std::isnan(a[i])) {
       // If either input is NaN, propagate a NaN.
@@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
 // either input is a NaN.
 template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
   Vec256<T> c = Vec256<T>();
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     c[i] = (a[i] < b[i]) ? a[i] : b[i];
     if (std::is_floating_point<T>::value && std::isnan(a[i])) {
       // If either input is NaN, propagate a NaN.
@@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
 template <class T>                                                          \
 Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) {      \
   using iT = int_same_size_t<T>;                                            \
-  iT buffer[Vec256<T>::size];                                               \
-  for (int64_t i = 0; i != Vec256<T>::size; i++) {                          \
+  iT buffer[Vec256<T>::size()];                                               \
+  for (int64_t i = 0; i != Vec256<T>::size(); i++) {                          \
     auto a_val = a[i];                                                      \
     auto b_val = b[i];                                                      \
     iT *i_a_ptr = reinterpret_cast<iT*>(&a_val);                            \
@@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
 template <int64_t scale = 1, typename T = void>
 c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
 inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   int_same_size_t<T> index_arr[size];
   vindex.store(static_cast<void*>(index_arr));
   T buffer[size];
@@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
 c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
 inline mask_gather(const Vec256<T>& src, T const* base_addr,
                    const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   T src_arr[size];
   int_same_size_t<T> mask_arr[size];  // use int type so we can logical and
   int_same_size_t<T> index_arr[size];
@@ -392,7 +427,7 @@ namespace {
   template<typename dst_t, typename src_t>
   struct CastImpl {
     static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
-      src_t src_arr[Vec256<src_t>::size];
+      src_t src_arr[Vec256<src_t>::size()];
       src.store(static_cast<void*>(src_arr));
       return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
     }
@@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {
 
 template <typename T>
 inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   T src_arr[size];
   src.store(static_cast<void*>(src_arr));
   int_same_size_t<T> buffer[size];
@@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
 //       returns:            Vec256<float>   = {a0, a1, a2, a3, a4, a5, a6, a7}
 //                           Vec256<float>   = {b0, b1, b2, b3, b4, b5, b6, b7}
 template <typename T>
-inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
+inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
 deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   static constexpr int half_size = size / 2;
   T a_arr[size];
   T b_arr[size];
@@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
 //       returns:            Vec256<float>   = {a0, b0, a1, b1, a2, b2, a3, b3}
 //                           Vec256<float>   = {a4, b4, a5, b5, a6, b6, a7, b7}
 template <typename T>
-inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
+inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
 interleave2(const Vec256<T>& a, const Vec256<T>& b) {
-  static constexpr int size = Vec256<T>::size;
+  static constexpr int size = Vec256<T>::size();
   static constexpr int half_size = size / 2;
   T a_arr[size];
   T b_arr[size];
@@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
 
 template <typename src_T, typename dst_T>
 void convert(const src_T *src, dst_T *dst, int64_t n) {
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (int64_t i = 0; i < n; i++) {
     *dst = static_cast<dst_T>(
         static_cast<at::native::inter_copy_type_t<dst_T>>(*src));
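The convert() hunk above replaces a bare `#pragma unroll` with a guard, since MSVC does not recognize that pragma and warns on it. A small free-standing sketch of the same pattern (hypothetical function, not the ATen source):

    #include <cstdint>

    void scale(const float* src, float* dst, int64_t n, float a) {
    #ifndef _MSC_VER
    # pragma unroll   // loop-unrolling hint for compilers that accept it
    #endif
      for (int64_t i = 0; i < n; i++) {
        dst[i] = a * src[i];
      }
    }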
@@ -8,6 +8,7 @@
 
 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {
 
 #if defined(__AVX__) && !defined(_MSC_VER)
@@ -16,7 +17,9 @@ template <> class Vec256<double> {
 private:
   __m256d values;
 public:
-  static constexpr int size = 4;
+  static constexpr int size() {
+    return 4;
+  }
   Vec256() {}
   Vec256(__m256d v) : values(v) {}
   Vec256(double val) {
@@ -40,7 +43,7 @@ public:
     return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
   }
   static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
-                            int64_t count = size) {
+                            int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -53,22 +56,22 @@ public:
     }
     return b;
   }
-  static Vec256<double> loadu(const void* ptr, int64_t count = size) {
-    if (count == size)
+  static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
       return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
 
-    __at_align32__ double tmp_values[size];
+    __at_align32__ double tmp_values[size()];
     std::memcpy(
         tmp_values,
         reinterpret_cast<const double*>(ptr),
         count * sizeof(double));
     return _mm256_load_pd(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
     } else if (count > 0) {
-      double tmp_values[size];
+      double tmp_values[size()];
       _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(double));
     }
@@ -252,7 +255,7 @@ template <>
 void convert(const double* src, double* dst, int64_t n) {
   int64_t i;
 #pragma unroll
-  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
     _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
   }
 #pragma unroll
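For orientation, a hedged usage sketch of the Vec256<double> interface as it looks after this hunk: size() is now a call, and loadu/store take an element count so a ragged tail can be handled without over-reading. The include path and the presence of AVX are assumptions of the sketch, not facts stated by the diff.

    #include <ATen/cpu/vec256/vec256.h>  // assumed header location
    #include <cstdint>

    // dst[i] = src[i] + 1.0 over n doubles, full vectors first, then the tail.
    void add_one(const double* src, double* dst, int64_t n) {
      using Vec = at::vec256::Vec256<double>;
      int64_t i = 0;
      for (; i + Vec::size() <= n; i += Vec::size()) {
        (Vec::loadu(src + i) + Vec(1.0)).store(dst + i);
      }
      if (i < n) {
        // Partial load/store of the last (n - i) lanes.
        auto v = Vec::loadu(src + i, n - i) + Vec(1.0);
        v.store(dst + i, n - i);
      }
    }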
@@ -8,6 +8,7 @@
 
 namespace at {
 namespace vec256 {
+// See Note [Acceptable use of anonymous namespace in header]
 namespace {
 
 #if defined(__AVX__) && !defined(_MSC_VER)
@@ -16,7 +17,9 @@ template <> class Vec256<float> {
 private:
   __m256 values;
 public:
-  static constexpr int size = 8;
+  static constexpr int size() {
+    return 8;
+  }
   Vec256() {}
   Vec256(__m256 v) : values(v) {}
   Vec256(float val) {
@@ -43,7 +46,7 @@ public:
       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
   }
   static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
-                           int64_t count = size) {
+                           int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -64,19 +67,19 @@ public:
     }
     return b;
   }
-  static Vec256<float> loadu(const void* ptr, int64_t count = size) {
-    if (count == size)
+  static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
       return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
-    __at_align32__ float tmp_values[size];
+    __at_align32__ float tmp_values[size()];
     std::memcpy(
         tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
     return _mm256_loadu_ps(tmp_values);
   }
-  void store(void* ptr, int64_t count = size) const {
-    if (count == size) {
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
       _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
     } else if (count > 0) {
-      float tmp_values[size];
+      float tmp_values[size()];
       _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(float));
     }
@@ -260,7 +263,7 @@ template <>
 void convert(const float* src, float* dst, int64_t n) {
   int64_t i;
 #pragma unroll
-  for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
+  for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
     _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
   }
 #pragma unroll
@@ -12,6 +12,11 @@ namespace {
 struct Vec256i {
 protected:
   __m256i values;
+
+  static inline __m256i invert(const __m256i& v) {
+    const auto ones = _mm256_set1_epi64x(-1);
+    return _mm256_xor_si256(ones, v);
+  }
 public:
   Vec256i() {}
   Vec256i(__m256i v) : values(v) {}
@@ -22,7 +27,9 @@ public:
 
 template <>
 struct Vec256<int64_t> : public Vec256i {
-  static constexpr int size = 4;
+  static constexpr int size() {
+    return 4;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
@@ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i {
   }
   template <int64_t mask>
   static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
-    __at_align32__ int64_t tmp_values[size];
+    __at_align32__ int64_t tmp_values[size()];
     a.store(tmp_values);
     if (mask & 0x01)
       tmp_values[0] = _mm256_extract_epi64(b.values, 0);
@@ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i {
     return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
   }
   static Vec256<int64_t>
-  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
+  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
-    __at_align32__ int64_t tmp_values[size];
+    __at_align32__ int64_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int64_t tmp_values[size];
+      __at_align32__ int64_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
     }
@@ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i {
     return _mm256_cmpeq_epi64(values, other.values);
   }
   Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi64(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi64(values, other.values));
   }
   Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
     return _mm256_cmpgt_epi64(other.values, values);
   }
   Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi64(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi64(values, other.values));
   }
   Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
     return _mm256_cmpgt_epi64(values, other.values);
   }
   Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi64(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi64(other.values, values));
   }
 };
 
 template <>
 struct Vec256<int32_t> : public Vec256i {
-  static constexpr int size = 8;
+  static constexpr int size() {
+    return 8;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
@@ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i {
       base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
   }
   static Vec256<int32_t>
-  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
+  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
-    __at_align32__ int32_t tmp_values[size];
+    __at_align32__ int32_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int32_t tmp_values[size];
+      __at_align32__ int32_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
     }
@@ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i {
     return _mm256_cmpeq_epi32(values, other.values);
   }
   Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi32(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi32(values, other.values));
   }
   Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
     return _mm256_cmpgt_epi32(other.values, values);
   }
   Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi32(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi32(values, other.values));
   }
   Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
     return _mm256_cmpgt_epi32(values, other.values);
   }
   Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi32(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi32(other.values, values));
   }
 };
 
@@ -212,13 +209,17 @@ template <>
 void convert(const int32_t *src, float *dst, int64_t n) {
   int64_t i;
   // int32_t and float have same size
-#pragma unroll
-  for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
     auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
     auto output_vec = _mm256_cvtepi32_ps(input_vec);
     _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
   }
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (; i < n; i++) {
     dst[i] = static_cast<float>(src[i]);
   }
@@ -228,13 +229,17 @@ template <>
 void convert(const int32_t *src, double *dst, int64_t n) {
   int64_t i;
   // int32_t has half the size of double
-#pragma unroll
-  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
     auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
     auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
     _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
   }
-#pragma unroll
+#ifndef _MSC_VER
+# pragma unroll
+#endif
   for (; i < n; i++) {
     dst[i] = static_cast<double>(src[i]);
   }
@@ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {
 
 template <>
 struct Vec256<int16_t> : public Vec256i {
-  static constexpr int size = 16;
+  static constexpr int size() {
+    return 16;
+  }
   using Vec256i::Vec256i;
   Vec256() {}
   Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
@@ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i {
   }
   template <int64_t mask>
   static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
-    __at_align32__ int16_t tmp_values[size];
+    __at_align32__ int16_t tmp_values[size()];
     a.store(tmp_values);
     if (mask & 0x01)
       tmp_values[0] = _mm256_extract_epi16(b.values, 0);
@@ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i {
       base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
   }
   static Vec256<int16_t>
-  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
+  set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
   static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
-    __at_align32__ int16_t tmp_values[size];
+    __at_align32__ int16_t tmp_values[size()];
     std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
     return loadu(tmp_values);
   }
-  void store(void* ptr, int count = size) const {
-    if (count == size) {
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align32__ int16_t tmp_values[size];
+      __at_align32__ int16_t tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
       std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
     }
@@ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i {
     return _mm256_cmpeq_epi16(values, other.values);
   }
   Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto eq = _mm256_cmpeq_epi16(values, other.values);
-    return _mm256_xor_si256(zero, eq);  // invert
+    return invert(_mm256_cmpeq_epi16(values, other.values));
   }
   Vec256<int16_t> operator<(const Vec256<int16_t>& other) const {
     return _mm256_cmpgt_epi16(other.values, values);
   }
   Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto gt = _mm256_cmpgt_epi16(values, other.values);
-    return _mm256_xor_si256(zero, gt);  // invert
+    return invert(_mm256_cmpgt_epi16(values, other.values));
   }
   Vec256<int16_t> operator>(const Vec256<int16_t>& other) const {
     return _mm256_cmpgt_epi16(values, other.values);
   }
   Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const {
-    auto zero = _mm256_set1_epi64x(0);
-    auto lt = _mm256_cmpgt_epi16(other.values, values);
-    return _mm256_xor_si256(zero, lt);  // invert
+    return invert(_mm256_cmpgt_epi16(other.values, values));
   }
 };
 
@@ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
 
 template <typename T>
 Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
-  T values_a[Vec256<T>::size];
-  T values_b[Vec256<T>::size];
+  T values_a[Vec256<T>::size()];
+  T values_b[Vec256<T>::size()];
   a.store(values_a);
   b.store(values_b);
-  for (int i = 0; i != Vec256<T>::size; i++) {
+  for (int i = 0; i != Vec256<T>::size(); i++) {
     values_a[i] /= values_b[i];
   }
   return Vec256<T>::loadu(values_a);
@@ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
       THCState_getCurrentDeviceProperties(globalContext().getTHCState());
   // NOTE: extra parenthesis around numbers disable clang warnings about
   // dead code
-  return (
-      (CUDNN_VERSION >= (6021)) ||
-      (CUDNN_VERSION >= (6000) && prop->major >= 5));
+  return true;
 #else
   return false;
 #endif
@@ -9,45 +9,6 @@
 #include "ATen/cuda/ATenCUDAGeneral.h"
 #include <cuda.h>
 
-#if CUDNN_VERSION < 7000
-
-#include <curand_kernel.h>
-
-/*
-Note [cuDNN dropout descriptor initialization]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In most cases, setting descriptors in cuDNN is cheap (e.g.,
-cudnnSetTensorNdDescriptor).  However, this is not the case for
-cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an
-expensive precomputation to initialize the random number generator states.  In
-cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor,
-which means that law-abiding clients were expected to generate a dropout
-descriptor once and cache it.  However, our ATen interface is (1) stateless (so
-we can't cache the descriptors) and (2) does not accept arbitrary user types in
-its interface (so we can't pass the descriptor in).  This puts us in a pickle.
-
-In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which
-forgoes the expensive initialization process, and can initialize the
-descriptor with a pre-initialized state CUDA tensor.  This is great, because
-it means we can simply pass in the state tensor and then initialize the
-descriptor internally.  Unfortunately, this function is not available in
-cuDNN 6.
-
-To work around this, we break the cuDNN abstraction barrier, and have
-the struct layout of the underlaying dropout descriptor.  With this struct,
-we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great!
-*/
-
-// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization]
-struct cudnnDropoutStruct {
-  float dropout;
-  int nstates;
-  void * states;
-};
-
-#endif
-
 namespace at { namespace native {
 
 // TODO: Add constructors for all of the descriptors
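With the cuDNN 6 compatibility shim removed, the header relies on the real cuDNN 7 entry point that the deleted note described: cudnnRestoreDropoutDescriptor fills a dropout descriptor from a previously saved RNG state buffer without redoing the expensive initialization that cudnnSetDropoutDescriptor performs. A hedged sketch of just the call shape, with the same parameter list as the shim deleted above; the surrounding DropoutDescriptor plumbing is not reproduced here:

    #include <cudnn.h>

    cudnnStatus_t restore_dropout_desc(cudnnDropoutDescriptor_t desc,
                                       cudnnHandle_t handle,
                                       float dropout,
                                       void* state_buffer,  // sized via cudnnDropoutGetStatesSize
                                       size_t state_bytes,
                                       unsigned long long seed) {
      // Restores the descriptor from an already-initialized state buffer.
      return cudnnRestoreDropoutDescriptor(desc, handle, dropout,
                                           state_buffer, state_bytes, seed);
    }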
@@ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor
     if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
     AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
                                           CUDNN_CROSS_CORRELATION, mathType));
-#if CUDNN_VERSION >= 7000
     AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
     AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
     if(dataType == CUDNN_DATA_HALF)
       AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
-#endif
   }
 };
 
@@ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor
   }
 };
 
-#if CUDNN_VERSION < 7000
-
-// See Note [cuDNN dropout descriptor initialization]
-inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnHandle_t handle,
-    float dropout,
-    void *states,
-    size_t stateSizeInBytes,
-    unsigned long long seed) {
-  // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends.
-  // This is not entirely accurate but is good enough to catch some API
-  // uses which would not be compatible in cuDNN 7.  Feel free to fix
-  // this if you notice something is wrong.
-  if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE;
-  if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE;
-  size_t expectedStateSizeInBytes;
-  // State size will differ depending on size of GPU
-  auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes);
-  if (ret != CUDNN_STATUS_SUCCESS) return ret;
-  if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
-  dropoutDesc->dropout = dropout;
-  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
-  dropoutDesc->states = states;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-#endif // CUDNN_VERSION
-
 struct AT_CUDA_API DropoutDescriptor
   : public Descriptor<cudnnDropoutStruct,
                       &cudnnCreateDropoutDescriptor,
@@ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor
           mode,
           algo,
           datatype));
-#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000
     cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
     if (prop->major >= 7) {
       if (datatype == CUDNN_DATA_HALF) {
@@ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor
   }
 };
 
-#if CUDNN_VERSION >= 7000
-
 struct AT_CUDA_API CTCLossDescriptor
   : public Descriptor<cudnnCTCLossStruct,
                       &cudnnCreateCTCLossDescriptor,
@@ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor
   }
 };
 
-#endif
-
 union Constant
 {
   float f;
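Taken together, these Descriptors.h hunks drop every `#if CUDNN_VERSION >= 7000` guard, so cuDNN 7 features (grouped convolutions, math-type selection, the CTC loss descriptor) are compiled unconditionally. A build that wants to state that floor explicitly could use a guard along these lines; this is a hypothetical sketch, not part of the diff:

    #include <cudnn.h>

    #if CUDNN_VERSION < 7000
    #error "this code assumes cuDNN 7.0 or newer"
    #endif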
@@ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
       prelu_cpu_kernel_multi_weights<scalar_t>(
@@ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
       prelu_cpu_backward_kernel_multi_weights<scalar_t>(
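The two prelu hunks rewrite the AT_CHECK message: AT_CHECK concatenates its trailing arguments into the error message rather than interpreting a printf-style format string, so the old `%d` placeholders would have appeared literally in the error text. A minimal stand-in for that concatenation style, using a hypothetical check_or_throw helper and a C++17 fold expression rather than the ATen macro itself:

    #include <iostream>
    #include <sstream>
    #include <stdexcept>

    template <typename... Args>
    void check_or_throw(bool cond, Args&&... parts) {
      if (cond) return;
      std::ostringstream oss;
      (oss << ... << parts);  // stream every piece, numbers included
      throw std::runtime_error(oss.str());
    }

    int main() {
      int weight_num = 3, channel_size = 4;
      try {
        check_or_throw(channel_size == weight_num,
                       "Mismatch of parameter numbers and input channel size. ",
                       "Found parameter numbers = ", weight_num,
                       " and channel size = ", channel_size, ".");
      } catch (const std::exception& e) {
        std::cout << e.what() << "\n";
      }
      return 0;
    }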
@@ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
 
 // Supports arbitrary batch dimensions for self and A
 std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
-  if (self.dim() <= 2 && A.dim() <= 2) {
+  AT_CHECK(self.dim() >= 2,
+           "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
+  AT_CHECK(A.dim() >= 2,
+           "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
+  if (self.dim() == 2 && A.dim() == 2) {
     // TODO: #7102: It's not necessary to have gesv (single) bindings for both
     // TH and ATen. We should remove the TH gesv bindings, especially
     // since the lapackGesv function is already in ATen.
+    linearSolveCheckInputs(self, A);  // Checks square shape of A, and compatibility of self and A
     return at::_th_gesv_single(self, A);
   }
 
@@ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) {
   }
   squareCheckInputs(self);
 
-  // TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors,
-  // this can be simplified. Currently, we are zero-ing out values in the
-  // batch of matrices by using a mask and the `where` function.
-  // The simplification with batched `triu` and `tril` would be this:
-  // if (upper) {
-  //   return raw_cholesky_output.triu();
-  // } else {
-  //   return raw_cholesky_output.tril();
-  // }
   auto raw_cholesky_output = at::_cholesky_helper(self, upper);
-  int64_t n = self.size(-1);
-  auto indices = at::ones({n, n}, self.options().dtype(at::kByte));
-  indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self);
-  return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output);
+  if (upper) {
+    return raw_cholesky_output.triu_();
+  } else {
+    return raw_cholesky_output.tril_();
+  }
 }
 
 Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
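The rewrite above relies on the batched, in-place triu_()/tril_() introduced later in this diff instead of building a byte mask and routing through at::where. A small sketch of the two forms side by side (illustrative; the helper names zero_lower_old/zero_lower_new are hypothetical):

#include <ATen/ATen.h>

// Old shape of the code: mask the strictly-lower triangle and clear it via at::where.
at::Tensor zero_lower_old(const at::Tensor& raw) {
  int64_t n = raw.size(-1);
  auto indices = at::ones({n, n}, raw.options().dtype(at::kByte));
  indices = indices.tril(-1).expand_as(raw);               // 1s where entries must become 0
  return at::where(indices, at::zeros({}, raw.options()), raw);
}

// New shape of the code: zero the strictly-lower triangle in place with the batched kernel.
at::Tensor zero_lower_new(at::Tensor raw) {
  return raw.triu_();
}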
@@ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
   return result;
 }
 
+template <typename scalar_t, bool inplace, bool upper>
+static void apply_triu_tril_single(
+    scalar_t* result, scalar_t* self,
+    int64_t k, int64_t n, int64_t m,
+    int64_t res_row_stride, int64_t res_col_stride,
+    int64_t self_row_stride, int64_t self_col_stride) {
+
+  constexpr int64_t zero = 0;
+  int64_t i;
+
+  if (upper) {
+    #pragma omp parallel for private(i)
+    for (i = 0; i < n; i++) {
+      for (int64_t j = 0; j < std::min(m, i + k); j++) {
+        result[i * res_row_stride + j * res_col_stride] = 0;
+      }
+      if (!inplace) {  // copy the rest of the self if not inplace
+        for (int64_t j = std::max(zero, i + k); j < m; j++) {
+          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
+        }
+      }
+    }
+  } else {
+    #pragma omp parallel for private(i)
+    for (i = 0; i < n; i++) {
+      for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
+        result[i * res_row_stride + j * res_col_stride] = 0;
+      }
+      if (!inplace) {  // copy the rest of the self if not inplace
+        for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
+          result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
+        }
+      }
+    }
+  }
+}
+
+template <typename scalar_t, bool inplace, bool upper>
+void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) {
+  auto n = self.size(-2);
+  auto m = self.size(-1);
+  auto self_data = self.data<scalar_t>();
+  auto self_stride = self.dim() > 2 ? self.stride(-3) : 1;
+  auto batchsize = batchCount(self);
+  auto self_row_stride = self.stride(-2);
+  auto self_column_stride = self.stride(-1);
+
+  auto result_data = result.data<scalar_t>();
+  int64_t result_stride, result_row_stride, result_column_stride;
+  if (result_data != self_data) {
+    result_stride = result.dim() > 2 ? result.stride(-3) : 1;
+    result_row_stride = result.stride(-2);
+    result_column_stride = result.stride(-1);
+  } else {
+    result_stride = self_stride;
+    result_row_stride = self_row_stride;
+    result_column_stride = self_column_stride;
+  }
+
+  int64_t b;
+  #pragma omp parallel for private(b)
+  for (b = 0; b < batchsize; b++) {
+    scalar_t* self_batch = &self_data[b * self_stride];
+    scalar_t* result_batch = &result_data[b * result_stride];
+    apply_triu_tril_single<scalar_t, inplace, upper>(
+        result_batch, self_batch, k, n, m,
+        result_row_stride, result_column_stride, self_row_stride, self_column_stride);
+  }
+}
+
+Tensor tril(const Tensor& self, int64_t k) {
+  Tensor result = at::empty({0}, self.options());
+  at::tril_out(result, self, k);
+  return result;
+}
+
+Tensor& tril_cpu_(Tensor &self, int64_t k) {
+  if (self.numel() == 0) {
+    return self;
+  }
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+    apply_triu_tril<scalar_t, true, false>(self, self, k);
+  });
+  return self;
+}
+
+Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+    apply_triu_tril<scalar_t, false, false>(result, self_c, k);
+  });
+  return result;
+}
+
+Tensor triu(const Tensor& self, int64_t k) {
+  Tensor result = at::empty({0}, self.options());
+  at::triu_out(result, self, k);
+  return result;
+}
+
+Tensor& triu_cpu_(Tensor &self, int64_t k) {
+  if (self.numel() == 0) {
+    return self;
+  }
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+    apply_triu_tril<scalar_t, true, true>(self, self, k);
+  });
+  return self;
+}
+
+Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+    apply_triu_tril<scalar_t, false, true>(result, self_c, k);
+  });
+  return result;
+}
+
 }}  // namespace at::native
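A short usage sketch for the batched CPU triu/tril entry points added above (illustrative; shapes are arbitrary):

#include <ATen/ATen.h>

int main() {
  auto x = at::randn({4, 5, 5});             // a batch of four 5x5 matrices
  auto lower = at::tril(x, /*diagonal=*/0);  // out-of-place, applied to each matrix in the batch
  x.triu_(1);                                // in-place, keeps only the strictly-upper part
  return 0;
}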
@@ -378,8 +378,8 @@ at::Tensor _convolution(
     AT_CHECK(!bias.defined() || (input.type() == bias.type()),
              "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(),
              ") should be the same");
-    output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
+    output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias,
+                                    params.padding, params.stride, params.dilation, params.groups);
 #endif
   } else {
     if (params.groups == 1) {
@@ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) {
 }
 
 Tensor & eq_(Tensor& self, const Tensor & other) {
-  return at::_th_ge_(self, other);
+  return at::_th_eq_(self, other);
 }
 
 Tensor & ne_(Tensor& self, Scalar other) {
@@ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) {
   return at::_th_atan2_(self, other);
 }
 
-Tensor & tril_(Tensor& self, int64_t diagonal) {
-  return at::_th_tril_(self, diagonal);
-}
-
-Tensor & triu_(Tensor& self, int64_t diagonal) {
-  return at::_th_triu_(self, diagonal);
-}
-
 Tensor & digamma_(Tensor& self) {
   return at::_th_digamma_(self);
 }
@@ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) {
   return at::_th_cross(self, other, dim);
 }
 
-Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) {
-  return at::_th_triu_out(result, self, diagonal);
-}
-
-Tensor triu(const Tensor & self, int64_t diagonal) {
-  return at::_th_triu(self, diagonal);
-}
-
-Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) {
-  return at::_th_tril_out(result, self, diagonal);
-}
-
-Tensor tril(const Tensor & self, int64_t diagonal) {
-  return at::_th_tril(self, diagonal);
-}
-
 Tensor trace(const Tensor & self) {
   return at::_th_trace(self);
 }
@@ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) {
   return batched_matrices.size(-1) * batched_matrices.size(-2);
 }
 
+/* Checks a necessary property for the triu and tril implementations, hence the name.
+ * Here batch contiguity is checked for tensors with greater than 4 dimensions.
+ * Contiguous tensors and tensors with less than 3 dimensions pass this check
+ */
+static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) {
+  // Complete contiguity is the most desired property, which is why
+  // we return true if the tensor is contiguous
+  if (tensor.is_contiguous()) return true;
+
+  int64_t dims = tensor.dim();
+
+  // Tensors with dimension less than 4 are handled by default
+  if (dims <= 3) return true;
+
+  int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
+  for (int64_t i = dims - 3; i >= 0; i--) {
+    if (expected_stride != tensor.stride(i)) return false;
+    expected_stride *= tensor.size(i);
+  }
+  return true;
+}
+
 // Returns the epsilon value for floating types except half
 static inline double _get_epsilon(const ScalarType& sc_type) {
   switch (sc_type) {
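A sketch of an input that fails this check and therefore makes the tril/triu kernels above fall back to a .contiguous() copy (illustrative): permuting the batch dimensions breaks the requirement that every batch stride collapse into size(-1) * size(-2) blocks.

#include <ATen/ATen.h>

int main() {
  auto x = at::randn({2, 3, 4, 4});
  auto y = x.transpose(0, 1);  // 4-D and non-contiguous: stride(1) is no longer 4 * 4,
                               // so checkTrilTriuBatchContiguous(y) returns false
  auto z = at::tril(y);        // the CPU path copies y to contiguous memory before the kernel
  return 0;
}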
@@ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
 std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias,
                                                   const Tensor& running_mean, const Tensor& running_var,
                                                   bool train, double momentum, double eps) {
+  checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
+
   return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
       return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps);
     });
@@ -21,7 +21,6 @@ namespace native {
 
 DEFINE_DISPATCH(sum_stub);
 DEFINE_DISPATCH(prod_stub);
-DEFINE_DISPATCH(norm_kernel);
 
 static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
   ScalarType scalarType = self.type().scalarType();
@@ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_
   int64_t dim = maybe_wrap_dim(dim_, self.dim());
   if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
     return result;
-  if (self.is_contiguous() && result.is_contiguous()) {
-    _dimreduce_setup(result, self, dim);
-    norm_kernel(kCPU, result, self, p, dim);
-    if (!keepdim) {
-      result.squeeze_(dim);
-    }
-    return result;
-  } else {
-    return at::_th_norm_out(result, self, p, dim, keepdim);
-  }
+  return at::_th_norm_out(result, self, p, dim, keepdim);
 }
 
 Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
@@ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) {
     AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
              "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
     AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
-    if (self.is_cuda()) {
-      return at::_th_norm(self, p);
-    } else {
-      if (self.is_contiguous()) {
-        Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
-        norm_kernel(kCPU, result, self, p, c10::nullopt);
-        return result;
-      } else {
-        return at::_th_norm(self, p);
-      }
-    }
+    return at::_th_norm(self, p);
   }
 }
 
@@ -34,11 +34,11 @@ Tensor _bincount_cpu_template(
   int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L;
   nbins = std::max(nbins, minlength); // at least minlength # of bins
 
-  const input_t* self_p = self.contiguous().data<input_t>();
+  const input_t* self_p = self.data<input_t>();
   if (has_weights) {
     output = native::zeros({nbins}, weights.options());
     weights_t* output_p = output.data<weights_t>();
-    const weights_t* weights_p = weights.contiguous().data<weights_t>();
+    const weights_t* weights_p = weights.data<weights_t>();
     for (int64_t i = 0; i < self.size(0); i++) {
       output_p[self_p[i]] += weights_p[i];
     }
@@ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
   return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
     const auto scalar = weights.type().scalarType();
     if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
-      return _bincount_cpu_template<scalar_t, float>(self, weights, minlength);
+      return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
     return _bincount_cpu_template<scalar_t, double>(
-        self, weights.toType(CPU(kDouble)), minlength);
+        self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength);
   });
 }
 
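Both bincount hunks move .contiguous() out of the template and into the dispatch site, so the templated loop reads through pointers into tensors that stay alive for the whole call. A minimal sketch of the lifetime pitfall being avoided (illustrative, simplified; risky/safer are hypothetical helpers):

#include <ATen/ATen.h>

void risky(const at::Tensor& t) {
  // If t is not already contiguous, contiguous() materializes a temporary copy;
  // keeping only the raw pointer keeps no reference to that copy's storage.
  const float* p = t.contiguous().data<float>();
  (void)p;
}

void safer(const at::Tensor& t) {
  at::Tensor tc = t.contiguous();   // hold the (possible) copy for as long as p is used
  const float* p = tc.data<float>();
  (void)p;
}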
@@ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const {
 }
 
 void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const {
+  if (range.size() == 0) {
+    return;
+  }
   auto strides = get_strides();
   while (strides.size() < 2 * ntensors()) {
     strides.push_back(0);
@@ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range)
   int64_t ndim = values.size();
   for (int dim = 0; dim < ndim; dim++) {
     int64_t size = shape[dim];
-    values[dim] = linear_offset % size;
-    linear_offset /= size;
+    if (size > 0) {
+      values[dim] = linear_offset % size;
+      linear_offset /= size;
+    }
   }
   AT_ASSERT(linear_offset == 0);
 }
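The guard added to DimCounter skips zero-sized dimensions so the offset decomposition never divides or takes a modulo by zero. The same idea as a standalone sketch (illustrative, simplified types):

#include <cstdint>
#include <vector>

// Decompose a linear offset into per-dimension counters, skipping empty dims.
std::vector<int64_t> decompose(int64_t linear_offset,
                               const std::vector<int64_t>& shape) {
  std::vector<int64_t> values(shape.size(), 0);
  for (size_t dim = 0; dim < shape.size(); dim++) {
    int64_t size = shape[dim];
    if (size > 0) {                 // without this guard, size == 0 would divide by zero
      values[dim] = linear_offset % size;
      linear_offset /= size;
    }
  }
  return values;
}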
@@ -101,14 +101,14 @@ struct PDist {
 
     scalar_t * const res_start = result.data<scalar_t>();
     int64_t combs = result.numel(); // n * (n - 1) / 2
-    const Vec pvec(p);
 
     // We conceptually iterate over tuples of (i, j, k) where i is the first
     // vector from the input, j is the second, and k is the result index. This
     // parallelizes over the range of k and infers what i and j are from the
     // value of k.
-    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) {
-      float n2 = n - .5;
+    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) {
+      const Vec pvec(p);
+      double n2 = n - .5;
       // The -1 accounts for floating point truncation issues
       int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
       int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
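The pdist hunks build the Vec256 constant inside the parallel_for body and drop the by-reference capture, so each worker thread constructs its own copy instead of sharing a stack-local of the launching thread. A simplified sketch of the pattern (illustrative; a plain double stands in for Vec256<scalar_t>):

#include <ATen/Parallel.h>
#include <cstdint>

void per_thread_constant(double p, int64_t combs) {
  at::parallel_for(0, combs, /*grain_size=*/1, [=](int64_t begin, int64_t end) {
    const double pvec = p;          // constructed per worker, only values are captured
    for (int64_t k = begin; k < end; k++) {
      (void)pvec;                   // ... use pvec for this chunk of k ...
    }
  });
}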
@@ -149,7 +149,7 @@ struct PDist {
   }
 
   template <typename F>
-  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) {
+  inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
     for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
 
       const Vec self_vec_i = Vec::loadu(self_i, count);
@@ -177,7 +177,6 @@ struct PDist {
     const int64_t n = self.size(0);
     const int64_t m = self.size(1);
     const int64_t gs = grad.stride(0);
-    const Vec pvec(p);
 
     const scalar_t * const grad_start = grad.data<scalar_t>();
     const scalar_t * const dist_start = dist.data<scalar_t>();
@@ -187,17 +186,19 @@ struct PDist {
     // The only way to parallelize and avoid locking requires parallelizing
     // over the columns of the input, i.e. we compute the gradient for the
     // first section of each vector independentaly of the second section, etc.
-    at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) {
-      const scalar_t * self_l = self_start + l * Vec::size;
-      scalar_t * res_l = res_start + l * Vec::size;
-
-      for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) {
+    at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) {
+      const Vec pvec(p);
+
+      const scalar_t * self_l = self_start + l * Vec::size();
+      scalar_t * res_l = res_start + l * Vec::size();
+
+      for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) {
         backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs);
       }
     });
-    const int64_t remainder = m % Vec::size;
+    const int64_t remainder = m % Vec::size();
     if (remainder) {
-      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder);
+      backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder);
     }
   }
 
@@ -308,7 +308,9 @@ static inline void
 mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
                  const int_same_size_t<scalar_t> *offsets,
                  const int_same_size_t<scalar_t> *mask, int64_t len) {
-  #pragma unroll
+  #ifndef _MSC_VER
+  # pragma unroll
+  #endif
   for (int64_t i = 0; i < len; i++) {
     if (mask[i] & 0x01) {
       base_addr[offsets[i]] += src[i];
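The #pragma unroll hints in these kernels get wrapped in #ifndef _MSC_VER because MSVC does not understand that non-standard pragma; compilers that do (clang, icc) still see the hint, and it only affects optimization, not semantics. The pattern in isolation (illustrative):

#include <cstdint>

int64_t sum4(const int64_t* v) {
  int64_t acc = 0;
  #ifndef _MSC_VER
  # pragma unroll
  #endif
  for (int i = 0; i < 4; ++i) {
    acc += v[i];
  }
  return acc;
}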
@@ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     auto i_sw_offset = i_nw_offset + iVec(inp_sH);
     auto i_se_offset = i_sw_offset + iVec(inp_sW);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       auto inp_slice_C_ptr = inp_slice[c].data();
 
@@ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     // So we store the necessary vectors to temporary arrays and use the helper
     // mask_scatter_add defined above.
 
-    integer_t i_gInp_nw_offset_arr[iVec::size];
-    integer_t i_gInp_ne_offset_arr[iVec::size];
-    integer_t i_gInp_sw_offset_arr[iVec::size];
-    integer_t i_gInp_se_offset_arr[iVec::size];
+    integer_t i_gInp_nw_offset_arr[iVec::size()];
+    integer_t i_gInp_ne_offset_arr[iVec::size()];
+    integer_t i_gInp_sw_offset_arr[iVec::size()];
+    integer_t i_gInp_se_offset_arr[iVec::size()];
     i_gInp_nw_offset.store(i_gInp_nw_offset_arr);
     i_gInp_ne_offset.store(i_gInp_ne_offset_arr);
     i_gInp_sw_offset.store(i_gInp_sw_offset_arr);
     i_gInp_se_offset.store(i_gInp_se_offset_arr);
 
-    integer_t i_nw_mask_arr[iVec::size];
-    integer_t i_ne_mask_arr[iVec::size];
-    integer_t i_sw_mask_arr[iVec::size];
-    integer_t i_se_mask_arr[iVec::size];
+    integer_t i_nw_mask_arr[iVec::size()];
+    integer_t i_ne_mask_arr[iVec::size()];
+    integer_t i_sw_mask_arr[iVec::size()];
+    integer_t i_se_mask_arr[iVec::size()];
     nw_mask.store(i_nw_mask_arr);
     ne_mask.store(i_ne_mask_arr);
     sw_mask.store(i_sw_mask_arr);
     se_mask.store(i_se_mask_arr);
 
-    scalar_t gInp_corner_arr[Vec::size];
+    scalar_t gInp_corner_arr[Vec::size()];
 
     auto gx = Vec(0), gy = Vec(0);
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       auto inp_slice_C_ptr = inp_slice[c].data();
       auto gInp_slice_C_ptr = gInp_slice[c].data();
@@ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
     gx = gx * gx_mult;
     gy = gy * gy_mult;
 
-    constexpr int64_t step = Vec::size;
+    constexpr int64_t step = Vec::size();
     auto interleaved_gGrid = interleave2(gx, gy);
     auto gGrid_ptr = gGrid_slice.data() + offset * 2;
     std::get<0>(interleaved_gGrid).store(gGrid_ptr,
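Throughout these files Vec::size / iVec::size becomes a call, Vec::size(): Vec256<T>::size changes from a static constexpr data member to a constexpr static member function (presumably to avoid the pre-C++17 out-of-class definition requirement for odr-used constexpr members and related MSVC friction). A minimal sketch of the shape of that change (illustrative stand-in types, not the real Vec256):

struct VecOld {
  static constexpr int size = 8;             // data member: odr-use needs an out-of-class definition pre-C++17
};

struct VecNew {
  static constexpr int size() { return 8; }  // member function: still usable in constant expressions
};

static_assert(VecOld::size == VecNew::size(), "same vector width");

int buffer_old[VecOld::size];                // call sites change from Vec::size ...
int buffer_new[VecNew::size()];              // ... to Vec::size()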
@@ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
     auto out_ptr = out_slice.data() + offset;
     auto out_sC = out_slice.stride(0);
     auto inp_slice_ptr = inp_slice.data();
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
       // mask_gather zeros out the mask, so we need to make a copy
       auto mask_copy = mask;
@@ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
 
     auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest;  // gInp is contiguous
 
-    integer_t mask_arr[iVec::size];
+    integer_t mask_arr[iVec::size()];
     i_mask.store(mask_arr);
-    integer_t gInp_offset_arr[iVec::size];
+    integer_t gInp_offset_arr[iVec::size()];
     i_gInp_offset.store(gInp_offset_arr);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t c = 0; c < C; ++c) {
       mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
                        gInp_offset_arr, mask_arr, len);
@@ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator(
 
   using Vec = Vec256<scalar_t>;
   using iVec = Vec256<int_same_size_t<scalar_t>>;
-  constexpr int64_t step = Vec::size;
+  constexpr int64_t step = Vec::size();
 
   // Loop over each output pixel in grid.
   // We consider the following three cases (after slicing out the batch
@@ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator(
     auto spatial_offset = 0;
     auto i_offsets_delta = iVec(grid_sW * step);
 
-    #pragma unroll
+    #ifndef _MSC_VER
+    # pragma unroll
+    #endif
     for (int64_t h = 0; h < out_H; h++) {
       auto grid_ptr_x = grid_ptr + h * grid_sH;
       auto grid_ptr_y = grid_ptr_x + grid_sCoor;
       auto i_offsets = iVec::arange(0, grid_sW);
-      #pragma unroll
+      #ifndef _MSC_VER
+      # pragma unroll
+      #endif
       for (int64_t w = 0; w < out_W; w += step) {
         auto len = std::min(step, out_W - w);
         if (len < step) {
@@ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t>
 static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) {
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
-    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
-    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
    auto out1 = vop(a1, b1);
     auto out2 = vop(a2, b2);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) };
   binary_loop(data, strides, i, n, op);
@@ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op,
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
   auto a = Vec(*(scalar_t*)in1_ptr);
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
-    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto out1 = vop(a, b1);
     auto out2 = vop(a, b2);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) };
   binary_loop(data, strides, i, n, op);
@@ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op,
   VEC_LOOP_HEADER(func_t, data)
   int64_t i = 0;
   auto b = Vec(*(scalar_t*)in2_ptr);
-  for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
     auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
-    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
+    auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
     auto out1 = vop(a1, b);
     auto out2 = vop(a2, b);
     out1.store(out_ptr + i * sizeof(scalar_t));
-    out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
+    out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
   }
   int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 };
   binary_loop(data, strides, i, n, op);
@@ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o
   char* in_ptr = data[1];
   Vec acc[4];
   for  (int j = 0; j < 4; j++) {
-    acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t));
+    acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t));
   }
   for (int64_t i = 1; i < n; i++) {
     const char* ptr = in_ptr + stride * i;
-    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t))));
-    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t))));
-    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t))));
-    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t))));
+    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
+    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
+    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
+    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
   }
   if (reduce) {
-    scalar_t buffer[Vec::size];
+    scalar_t buffer[Vec::size()];
     acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
     acc[0].store(buffer);
-    for (int j = 1; j < Vec::size; j++) {
+    for (int j = 1; j < Vec::size(); j++) {
       buffer[0] = op(buffer[0], buffer[j]);
     }
     auto dst = (scalar_t*)out_ptr;
     *dst = op(*dst, buffer[0]);
   } else {
     for (int j = 0; j < 4; j++) {
-      auto dst = out_ptr + j * Vec::size * sizeof(scalar_t);
+      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
       acc[j] = vop(acc[j], Vec::loadu(dst));
       acc[j].store(dst);
     }
@@ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int
 template <typename func_t, typename vec_func_t>
 static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
   VEC_HEADER(func_t)
-  int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t);
-  int64_t count = n / (4 * Vec::size);
+  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+  int64_t count = n / (4 * Vec::size());
   if (count > 0) {
     reduction128(data, count, vector_stride, op, vop, /*reduce=*/true);
   }
   char* ptrs[3] = { data[0], data[0], data[1] };
   int64_t strides[] = { 0, 0, sizeof(scalar_t) };
-  binary_loop(ptrs, strides, count * 4 * Vec::size, n, op);
+  binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
 }
 
 // computes the reduction out = op(out, in)
@@ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t>
 static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
   VEC_HEADER(func_t)
 
-  // reduce down each column of 4 * Vec::size elements (128 bytes)
+  // reduce down each column of 4 * Vec::size() elements (128 bytes)
   int64_t outer_stride[2] = { 128, 128 };
-  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] {
+  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
     reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false);
   });
 
   // reduce down the remaining columns
   int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
-  int64_t remaining = size1 % (4 * Vec::size);
+  int64_t remaining = size1 % (4 * Vec::size());
   UNARY_OUTER_LOOP(data, step, remaining, [&] {
     char* ptrs[3] = { data[0], data[0], data[1] };
     int64_t strides[] = { 0, 0, inner_stride };
@@ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) {
       /*identity=*/1);
   });
 }
-
-static inline int64_t round_down(int64_t a, int64_t m) {
-  return a - (a % m);
-}
-
-template<typename scalar_t>
-struct NormReduction {
-  // reduction width in number of scalar elements
-  static constexpr int WIDTH = 128 / sizeof(scalar_t);
-  using Vec = Vec256<scalar_t>;
-
-  static void apply(
-      Tensor& res,
-      const Tensor& self,
-      Scalar p,
-      c10::optional<int64_t> dim) {
-    auto out_ = res.data<scalar_t>();
-    auto data_ = self.data<scalar_t>();
-    auto numel = self.numel();
-    float pval = 0.0;
-    if (p.isIntegral()){
-      pval = p.to<int64_t>();
-    } else if (p.isFloatingPoint()) {
-      pval = p.to<float>();
-    }
-    if (!dim.has_value()) {
-      *out_ = reduce_all(data_, numel,  pval);
-      return;
-    }
-    int64_t n = self.size(*dim);
-    int64_t stride = self.stride(*dim);
-    // A contiguous tensor does not need to hold a meaningful stride
-    // if the corresponding size is 1
-    if (n == 1) {
-      stride = 1;
-      for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
-        stride *= self.size(i);
-      }
-    }
-    int64_t batch = numel / n;
-    parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
-      for (int64_t bi = begin; bi < end; bi++) {
-        int64_t b = bi / stride;
-        int64_t i = bi % stride;
-        const scalar_t* data = &data_[b * n * stride + i];
-        out_[bi] = norm_reduce(data, n, stride, pval);
-      }
-    });
-  }
-
-  static scalar_t reduce_all(const scalar_t* data_, int64_t size,  float pval) {
-    scalar_t sum = parallel_reduce(
-      0,
-      size,
-      internal::GRAIN_SIZE,
-      (scalar_t)0,
-      [=](int64_t begin, int64_t end, scalar_t init) {
-        const scalar_t* data = &data_[begin];
-        int64_t n = end - begin;
-        scalar_t result = norm_reduce(data, n, 1, pval);
-        return result;
-      },
-      std::plus<scalar_t>());
-    return sum;
-  }
-
-  static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
-    scalar_t result = 0.0;
-    if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
-      int64_t n_rounded = round_down(n, WIDTH);
-      scalar_t result1 = norm_reduce128(data, n_rounded, pval);
-      scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
-      result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
-    } else {
-      result = norm_reduce_sequential(data, n, stride, pval);
-    }
-    return result;
-  }
-
-  static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
-    scalar_t result = 0.0;
-    if (pval == 0) {
-      for (int64_t k = 0; k < n; k++) {
-        result += (data[k * stride] != 0.0);
-      }
-    } else if (pval == 1) {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::abs(data[k * stride]);
-      }
-    } else if (pval == 2) {
-      for (int64_t k = 0; k < n; k++) {
-        result += data[k * stride] * data[k * stride];
-      }
-      result = std::sqrt(result);
-    } else if (pval == 3) {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
-      }
-      result = std::pow(result, 1.0/3);
-    } else if (pval == INFINITY) {
-      for (int64_t k = 0; k < n; k++) {
-        result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
-      }
-    } else if (pval == -INFINITY) {
-      result = INFINITY;
-      for (int64_t k = 0; k < n; k++) {
-        result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result;
-      }
-    } else {
-      for (int64_t k = 0; k < n; k++) {
-        result += std::pow(std::abs(data[k * stride]), pval);
-      }
-      result = std::pow(result, 1.0/pval);
-    }
-    return result;
-  }
-
-  // Reduce down a column of WIDTH elements (128 bytes) with the given number n
-  // n is already rounded by 128
-  static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
-    scalar_t result = 0.0;
-    Vec acc[4] = {0.0, 0.0, 0.0, 0.0};  // 128 bytes (two cache lines)
-    static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
-    int64_t rows = n / WIDTH;
-    if (pval == 1){
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + val.abs();
-        }
-      }
-    }
-    else if (pval == 2) {
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + val * val;
-        }
-      }
-    }
-    else if (pval == 3) {
-      for (int row = 0; row < rows; row ++) {
-        for (int j = 0; j != 4; j++) {
-          auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
-          acc[j] = acc[j] + (val * val * val).abs();
-        }
-      }
-    }
-    scalar_t buf[WIDTH] = {0};
-    for (int j = 0; j != 4; j++) {
-      acc[j].store(&buf[j * Vec::size]);
-    }
-    for (int i = 0; i < WIDTH; i++) {
-      result += buf[i];
-    }
-    result = std::pow(result, 1.0/pval);
-    return result;
-  }
-};
-
-static void norm_kernel_impl(
-    Tensor& result,
-    const Tensor& self,
-    Scalar p,
-    c10::optional<int64_t> dim) {
-  AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
-    NormReduction<scalar_t>::apply(result, self, p, dim);
-  });
-}
-
 }  // anonymous namespace
 
 REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
 REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
-REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
 
 }}  // namespace at::native
@@ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim(
     int64_t outer_size,
     int64_t dim_size) {
   using Vec = vec256::Vec256<scalar_t>;
-  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
+  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
   if (grain_size < CHUNK_SIZE)
     grain_size = CHUNK_SIZE;
@@ -37,9 +37,9 @@ template <>
 int64_t _sigmoid(float* x, float* y, int64_t size) {
   using Vec = Vec256<float>;
   int64_t i = 0;
-  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
+  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
     Vec ret = Vec::loadu(y + i);
-    Vec ret2 = Vec::loadu(y + i + Vec::size);
+    Vec ret2 = Vec::loadu(y + i + Vec::size());
     ret = ret.neg();
     ret2 = ret2.neg();
 #if defined(__AVX2__) && !defined(_MSC_VER)
@@ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) {
     ret = ret.reciprocal();
     ret2 = ret2.reciprocal();
     ret.store(x + i);
-    ret2.store(x + i + Vec::size);
+    ret2.store(x + i + Vec::size());
   }
   return i;
 }
@@ -63,9 +63,9 @@ template <>
 int64_t _sigmoid(double* x, double* y, int64_t size) {
   using Vec = Vec256<double>;
   int64_t i = 0;
-  for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
+  for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
     Vec ret = Vec::loadu(y + i);
-    Vec ret2 = Vec::loadu(y + i + Vec::size);
+    Vec ret2 = Vec::loadu(y + i + Vec::size());
     ret = ret.neg();
     ret2 = ret2.neg();
     ret = ret.exp();
@@ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) {
     ret = ret.reciprocal();
     ret2 = ret2.reciprocal();
     ret.store(x + i);
-    ret2.store(x + i + Vec::size);
+    ret2.store(x + i + Vec::size());
   }
   return i;
 }
@@ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) {
           if (stridex == 1 && stridey == 1) {
             i = _sigmoid(x, y, size);
           }
-          for (; i < size; i += Vec::size) {
-            scalar_t buffer[Vec::size];
-            int64_t width = Vec::size;
+          for (; i < size; i += Vec::size()) {
+            scalar_t buffer[Vec::size()];
+            int64_t width = Vec::size();
             width = std::min(width, size - i);
             for (int64_t j = 0; j < width; j++) {
               buffer[j] = y[stridey * (i + j)];
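
The hunks above are mechanical: `Vec256<T>::size` becomes a `size()` call at every use site. A minimal standalone sketch of the pattern follows; the `Vec` struct is a simplified stand-in for illustration, not the real ATen `Vec256` header, and shows why the vectorized main loop steps by `2 * Vec::size()`.

#include <cstdint>
#include <iostream>

// Simplified stand-in for a 256-bit vector type. size() is a constexpr static
// member *function*, which is why call sites read Vec::size() with parentheses.
template <typename T>
struct Vec {
  static constexpr int64_t size() { return 32 / sizeof(T); }  // lanes per 256-bit register
  T values[32 / sizeof(T)];
};

int main() {
  using VecF = Vec<float>;
  const int64_t total = 70;
  int64_t i = 0;
  // Main vectorized loop covers the largest multiple of 2 * Vec::size();
  // the scalar tail (here 70 - 64 = 6 elements) is handled separately.
  for (; i < total - (total % (2 * VecF::size())); i += 2 * VecF::size()) {}
  std::cout << "vectorized up to " << i << " of " << total << " elements\n";
  return 0;
}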
@@ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     // config to run cuda kernel
     int64_t input_numel = input.numel();
@@ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
       input_stride1 = strides[1];
     }
     AT_CHECK(channel_size == weight_num,
-      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
-      weight_num, channel_size);
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
+      " and channel size = ", channel_size, ".");
 
     // config to run cuda kernel
     int64_t input_numel = input.numel();
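
Both prelu hunks drop the printf-style `%d` placeholders because `AT_CHECK` builds its message by stringifying and concatenating every trailing argument rather than interpreting a format string. A hedged sketch of that calling convention with a hypothetical `check()` helper (not the real ATen macro):

#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in: every argument after the condition is streamed into
// the message, so values are passed as separate arguments, not via "%d".
template <typename... Args>
void check(bool cond, Args&&... args) {
  if (cond) return;
  std::ostringstream oss;
  (oss << ... << args);  // C++17 fold expression
  throw std::runtime_error(oss.str());
}

int main() {
  long weight_num = 3, channel_size = 4;
  try {
    check(channel_size == weight_num,
          "Mismatch of parameter numbers and input channel size. "
          "Found parameter numbers = ", weight_num,
          " and channel size = ", channel_size, ".");
  } catch (const std::exception&) { /* message already fully formatted */ }
  return 0;
}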
@@ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) {
   }
 }
 
+template <typename scalar_t, bool upper>
+__global__
+void triu_tril_kernel(
+    scalar_t* result, scalar_t* self, int64_t k, int64_t N,
+    int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride,
+    int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) {
+  int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (linear_idx >= N) {
+    return;
+  }
+
+  int64_t self_batch_idx = blockIdx.y;
+  int64_t row = linear_idx / self_ncol;
+  int64_t col = linear_idx % self_ncol;
+
+  bool mask = upper ? (col - row >= k) : (col - row <= k);
+
+  // Now compute the offset for the self and result tensor
+  int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride;
+  int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride;
+  result[res_offset] = mask ? self[self_offset] : scalar_t(0);
+}
+
+template <bool upper>
+Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
+  int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2),
+          res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1,
+          res_row_stride = result.stride(-2), res_col_stride = result.stride(-1),
+          self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1,
+          self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
+  dim3 dim_block = cuda::getApplyBlock();
+  dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
+  AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{
+    triu_tril_kernel<scalar_t, upper>
+      <<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+        result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
+        res_batch_stride, res_row_stride, res_col_stride,
+        self_batch_stride, self_row_stride, self_col_stride, self.size(-1));
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return result;
+}
+
+Tensor& tril_cuda_(Tensor &self, int64_t k) {
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  return tril_cuda_out(self, self, k);
+}
+
+Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  return triu_tril_cuda_template<false>(result, self_c, k, "tril");
+}
+
+Tensor& triu_cuda_(Tensor &self, int64_t k) {
+  if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
+  return triu_cuda_out(self, self, k);
+}
+
+Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
+  if (result.sizes() != self.sizes()) {
+    result.resize_as_(self);
+  }
+  if (self.numel() == 0) {
+    return result;
+  }
+  Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
+  return triu_tril_cuda_template<true>(result, self_c, k, "triu");
+}
+
 }}  // namespace at::native
 
 #undef ALLOCATE_ARRAY
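
The predicate in the new triu_tril_kernel is the whole algorithm: element (row, col) survives when `col - row >= k` for triu and `col - row <= k` for tril. A small host-side illustration (plain C++, a hypothetical standalone program rather than the CUDA kernel) printing the k = 0 upper-triangular keep-mask of a 4x4 matrix:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 4, k = 0;
  const bool upper = true;  // true -> triu, false -> tril
  for (int64_t row = 0; row < n; ++row) {
    for (int64_t col = 0; col < n; ++col) {
      // Same predicate as triu_tril_kernel: keep, or write scalar_t(0).
      bool keep = upper ? (col - row >= k) : (col - row <= k);
      std::printf("%d ", keep ? 1 : 0);
    }
    std::printf("\n");
  }
  return 0;
}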
@@ -1,4 +1,5 @@
-#include "ATen/ATen.h"
+#include <ATen/ATen.h>
+#include <ATen/cuda/Exceptions.h>
 #include <THC/THCTensorMathReduce.cuh>
 #include <math.h>
 
@@ -78,13 +79,13 @@ struct dists {
 };
 
 template <typename scalar_t, typename F>
-__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) {
+__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p,
+                                              const double n2, const double n2_squared_minus_1) {
   const int k = blockIdx.x;
   const int stride = blockDim.x;
 
-  float n2 = n - .5;
   // The -1 accounts for floating point truncation issues
-  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
+  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
 
   const scalar_t * const start = self + i * m;
@@ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t
 }
 
 template <typename scalar_t, typename F>
-__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) {
+__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p,
+                                                       const double n2, const double n2_squared_minus_1) {
   const int k = blockIdx.y * blockDim.y + threadIdx.y;
   const int init = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride = blockDim.x * gridDim.x;
@@ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const
     return;
   }
 
-  float n2 = n - .5;
   // The -1 accounts for floating point truncation issues
-  int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
+  int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
   int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
   int64_t ib = j - i - 1;
   int64_t jb = n - 2 - i;
@@ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) {
   const dim3 block(forward_threads);
   int64_t n = self.size(0);
   int64_t m = self.size(1);
+  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
+  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
+  const double n2 = n - .5;
+  const double n2_squared_minus_1 = n2 * n2 - 1;
 
   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
     if (p == 0.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (p == 1.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (p == 2.0) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else if (std::isinf(p)) {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     } else {
-      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
+      pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
     }
   });
+  AT_CUDA_CHECK(cudaGetLastError());
 }
 
 void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
@@ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
   const int64_t n = result.size(0);
   int64_t m = self.size(1);
   const int block_x = 64;
-  const int block_y = 4;
+  // NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16.
+  // From binary search, block_y of 16 gives us max pdist dim0 of 1449,
+  //                     block_y of  4 gives us max pdist dim0 of  725.
+  const int block_y = 16;
   const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
   const int grid_y = (dist.numel() + block_y - 1) / block_y;
   const dim3 grid(grid_x, grid_y);
   const dim3 block(block_x, block_y);
+  // https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
+  // some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
+  const double n2 = n - .5;
+  const double n2_squared_minus_1 = n2 * n2 - 1;
 
   Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
   AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
     if (p == 1.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (p < 2.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (p == 2.0) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else if (std::isinf(p)) {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     } else {
-      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
+      pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
     }
   });
+  AT_CUDA_CHECK(cudaGetLastError());
 
   at::sum_out(result, buffer, 0);
 }
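
Both pdist kernels recover the row pair (i, j) from the flat condensed index k via i = floor(n2 - sqrt(n2^2 - 1 - 2k)) and j = k - n*i + i*(i+1)/2 + i + 1, with n2 = n - 0.5; the hunks hoist n2 and n2^2 - 1 into fp64 computed once on the host because fp32 loses the boundary cases (issue #15511). A small host-side check (an assumed standalone program, not part of the kernels) that this inversion reproduces the pair order (0,1), (0,2), ..., (n-2,n-1):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 6;
  const double n2 = n - 0.5;
  const double n2_squared_minus_1 = n2 * n2 - 1;
  int64_t k = 0;
  // Enumerate pairs in the same order pdist stores distances and verify
  // that (i, j) recovered from k matches the reference pair.
  for (int64_t i_ref = 0; i_ref < n; ++i_ref) {
    for (int64_t j_ref = i_ref + 1; j_ref < n; ++j_ref, ++k) {
      int64_t i = static_cast<int64_t>(n2 - std::sqrt(n2_squared_minus_1 - 2 * k));
      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
      if (i != i_ref || j != j_ref)
        std::printf("mismatch at k=%lld\n", (long long)k);
    }
  }
  std::printf("checked %lld pairs\n", (long long)k);
  return 0;
}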
@@ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind
 
     default:
       AT_ERROR(
-          "Unknown mode for embedding_bag_backward_cuda %d", mode);
+          "Unknown mode for embedding_bag_backward_cuda ", mode);
   }
 }
 
@@ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
           + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime];
 
         log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb;
-      } else if ((s < 2*max_target_length+1) || (t >= input_length)) {
+      } else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) {
           log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf;
       }
     }
@@ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
       if (targets.type().scalarType() == kLong) {
 	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
       } else {
-	return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
+	return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
       }
     });
 }
@@ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_
                                                             const Tensor& running_mean_, const Tensor& running_var_,
                                                             bool train, double momentum, double epsilon) {
 
+  TensorArg input_arg{ input_, "input", 1 },
+            weight_arg{ weight_, "weight", 2 },
+            bias_arg{ bias_, "bias", 3 },
+            run_mean_arg{ running_mean_, "running_mean", 4 },
+            run_var_arg{ running_var_, "running_var", 5 };
+  CheckedFrom c = "batch_norm_cuda";
+  checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg});
+
   using accscalar_t = at::acc_type<scalar_t, true>;
   int64_t n_input = input_.size(1);
   Tensor save_mean_;
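
The added block wraps each argument in a TensorArg (tensor, name, argument position) so checkAllSameGPU can report exactly which argument sits on the wrong device. A simplified stand-in sketch of the same idea, using hypothetical types rather than the ATen ones:

#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-ins: each argument carries a name and position so a
// device mismatch can be reported precisely, as TensorArg enables in ATen.
struct FakeTensor { int device_index; };
struct Arg { FakeTensor t; const char* name; int pos; };

void check_all_same_gpu(const char* caller, const std::vector<Arg>& args) {
  for (const Arg& a : args) {
    if (a.t.device_index != args.front().t.device_index) {
      throw std::runtime_error(std::string(caller) + ": argument '" + a.name +
                               "' (position " + std::to_string(a.pos) +
                               ") is on a different GPU");
    }
  }
}

int main() {
  FakeTensor input{0}, weight{0}, bias{1};
  try {
    check_all_same_gpu("batch_norm_cuda",
                       {{input, "input", 1}, {weight, "weight", 2}, {bias, "bias", 3}});
  } catch (const std::exception&) { /* expected: bias is on GPU 1 */ }
  return 0;
}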
@@ -7,28 +7,13 @@
 #include <tuple>
 #include <thrust/unique.h>
 #include <thrust/sort.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
 
 namespace at {
 namespace native{
 
 namespace {
-template <typename scalar_t>
-__global__ void inverse_indices_kernel(
-    const scalar_t* input_data,
-    const scalar_t* output_data,
-    int64_t* inverse_indices_data,
-    int64_t num_inp,
-    int64_t num_out) {
-    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t stride = blockDim.x * gridDim.x;
-
-    for (int64_t i = idx; i < num_inp * num_out; i += stride) {
-      if (input_data[i / num_out] == output_data[i % num_out]){
-        inverse_indices_data[i / num_out] = i % num_out;
-      }
-    }
-  }
-
 
 template <typename scalar_t>
   std::tuple<Tensor, Tensor> _unique_cuda_template(
@@ -47,25 +32,29 @@ template <typename scalar_t>
     Tensor output = input.clone();
     output = output.view(-1);
     scalar_t* output_data = output.data<scalar_t>();
-    thrust::sort(policy, output_data, output_data + num_inp);
-    scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp);
-    int64_t num_out = output_end - output_data;
-    output.resize_(num_out);
-    Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
-    if (return_inverse) {
-      inverse_indices.resize_(input.sizes());
-      int64_t* inverse_indices_data = inverse_indices.data<int64_t>();
-      int block = 512;
-      int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L);
-      inverse_indices_kernel<<<grid, block, 0, stream>>>(
-        input_data, output_data, inverse_indices_data, num_inp, num_out);
+    Tensor inverse_indices;
+    if (!return_inverse) {
+        inverse_indices = at::empty({0},  self.type().toScalarType(kLong));
+        thrust::sort(policy, output_data, output_data + num_inp);
+    } else {
+        Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong));
+        int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>();
+        thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr);
+        Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong));
+        inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong));
+        int64_t* inv_loc_ptr = inv_loc.data<int64_t>();
+        int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>();
+        thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }});
+        inv_loc[0] = 0;
+        thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
+        thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
+        inverse_indices.resize_(input.sizes());
     }
+    int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data;
+    output.resize_(num_out);
 
     THCudaCheck(cudaGetLastError());
     return std::tuple<Tensor, Tensor>(output, inverse_indices);
-
   }
 
 template <typename scalar_t>
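
The rewritten _unique_cuda_template avoids the old O(num_inp * num_out) inverse_indices_kernel: it sorts the values together with their original positions, marks where adjacent sorted values differ, inclusive-scans the marks to get each element's rank among the unique values, and scatters the ranks back to the original positions. A host-side sketch of the same pipeline using the standard library in place of thrust (an assumed example, not the CUDA code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> values = {3, 1, 3, 2, 1};

  // Sort positions by value (thrust::sort_by_key in the CUDA version).
  std::vector<int64_t> order(values.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&](int64_t a, int64_t b) { return values[a] < values[b]; });
  std::vector<int> sorted(values.size());
  for (size_t i = 0; i < order.size(); ++i) sorted[i] = values[order[i]];

  // Mark boundaries between runs of equal values (adjacent_difference) and
  // inclusive-scan the marks, fused into one loop: rank of each sorted element.
  std::vector<int64_t> rank(values.size(), 0);
  for (size_t i = 1; i < sorted.size(); ++i)
    rank[i] = rank[i - 1] + (sorted[i] != sorted[i - 1] ? 1 : 0);

  // Scatter ranks back to original positions (thrust::scatter).
  std::vector<int64_t> inverse(values.size());
  for (size_t i = 0; i < order.size(); ++i) inverse[order[i]] = rank[i];

  for (int64_t v : inverse) std::printf("%lld ", (long long)v);  // 2 0 2 1 0
  std::printf("\n");
  return 0;
}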
@@ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> {
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
-#if CUDNN_VERSION >= 6000
         CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
-#endif
     };
     // NOTE: - 1 because ALGO_WINOGRAD is not implemented
     static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
@@ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
   THCCachingAllocator_emptyCache();
 }
 
+
+//hot fix for #16610
+//specializing algorithm_search would be cleaner, as it is specialized already, but that would require also specializing getBestAlgorithm for bwdData,
+//adding "strided" argument, so in the end this looks simpler.
+template<>
+void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) {
+  using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>;
+  auto& cache = search::cache();
+
+  if (cache.find(args.params, algo)) {
+    return;
+  }
+
+  if (args.params.deterministic && !benchmark) {
+    *algo = search::DEFAULT_ALGO;
+    return;
+  }
+
+  int stride_dim = args.input.dim() - 2;
+  bool strided = false;
+  for (int i = 0; i< stride_dim; i++) {
+      if (args.params.stride[i] != 1) {
+         strided = true;
+         break;
+      }
+  }
+
+  if (!benchmark) {
+    search::getAlgorithm(args, algo);
+    if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
+       *algo = search::DEFAULT_ALGO;
+    }
+    return;
+  }
+
+  if (cache.find(args.params, algo)) {
+    // re-check cache since another thread may have benchmarked the algorithm
+    return;
+  }
+
+  auto perfResults = search::findAlgorithm(args);
+  // for deterministic algo, look at all the perf results and return the best
+  // deterministic algo
+  if (perfResults.status == CUDNN_STATUS_SUCCESS &&
+      !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
+      *algo = perfResults.algo;
+  } else {
+      *algo = search::DEFAULT_ALGO;
+  }
+  if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
+     *algo = search::DEFAULT_ALGO;
+  }
+  cache.insert(args.params, *algo);
+
+  // Free the cached blocks in our caching allocator. They are
+  // needed here because the above benchmarking uses a huge amount of memory,
+  // e.g. a few GBs.
+  THCCachingAllocator_emptyCache();
+}
+
+
 template<typename algo_t>
 Workspace chooseAlgorithm(
     const ConvolutionArgs& args,
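
The hot fix above rejects FFT-based backward-data algorithms whenever any spatial stride differs from 1 and falls back to the default algorithm. A tiny sketch of that filter-and-fallback pattern with a hypothetical enum (the real code uses the cuDNN algorithm constants shown in the hunk):

#include <array>
#include <cstdio>

// Hypothetical stand-in: if the convolution is strided, FFT-based algorithms
// are rejected and the default algorithm is used, mirroring
// *algo = search::DEFAULT_ALGO in the specialization above.
enum class Algo { Default, Fft, FftTiling, Winograd };

Algo pick_algo(Algo preferred, const std::array<int, 2>& stride) {
  bool strided = false;
  for (int s : stride) strided = strided || (s != 1);
  if (strided && (preferred == Algo::Fft || preferred == Algo::FftTiling))
    return Algo::Default;
  return preferred;
}

int main() {
  std::printf("%d\n", static_cast<int>(pick_algo(Algo::FftTiling, {2, 2})));  // 0 -> Default
  return 0;
}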
@@ -848,19 +907,9 @@ Tensor cudnn_convolution_forward(
   // See #4500
   Tensor weight_contig = weight->contiguous();
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_forward_out(
-        narrowGroup(*output, output_channels_dim,        i, groups),
-        narrowGroup(*input,  input_channels_dim,         i, groups),
-        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
-        padding, stride, dilation, 1, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_forward_out(
       *output, *input, weight_contig,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return *output;
 }
@@ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input(
   // See #4500
   Tensor weight_contig = weight->contiguous();
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_backward_input_out(
-        narrowGroup(*grad_input, input_channels_dim, i, groups),
-        narrowGroup(*grad_output, output_channels_dim, i, groups),
-        narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
-        padding, stride, dilation, 1, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_backward_input_out(
       *grad_input, *grad_output, weight_contig,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return *grad_input;
 }
@@ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight(
   TensorArg grad_weight{ grad_weight_t, "result", 0 };
   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
 
-#if CUDNN_VERSION < 7000
-  for (int i = 0; i < groups; i++) {
-    raw_cudnn_convolution_backward_weight_out(
-        narrowGroup(*grad_weight, weight_output_channels_dim, i, groups),
-        narrowGroup(*grad_output, output_channels_dim, i, groups),
-        narrowGroup(*input, input_channels_dim, i, groups),
-        padding, stride, dilation, groups, benchmark, deterministic);
-  }
-#else
   raw_cudnn_convolution_backward_weight_out(
       *grad_weight, *grad_output, *input,
       padding, stride, dilation, groups, benchmark, deterministic);
-#endif
 
   return grad_weight_t;
 }
@@ -7,7 +7,7 @@
 #endif
 
 
-#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000)
+#if !AT_CUDNN_ENABLED()
 
 namespace at { namespace native {
 
@@ -375,7 +375,7 @@ namespace {
       case CUDNN_RNN_TANH:
         return 2;
       default:
-        AT_ERROR("unknown cuDNN RNN mode %d", mode);
+        AT_ERROR("unknown cuDNN RNN mode ", mode);
     }
   }
 
@@ -2555,9 +2555,15 @@
 
 - func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method
+  dispatch:
+    CPU: tril_cpu_
+    CUDA: tril_cuda_
 
 - func: triu_(Tensor self,  int64_t diagonal=0) -> Tensor
   variants: method
+  dispatch:
+    CPU: triu_cpu_
+    CUDA: triu_cuda_
 
 - func: digamma_(Tensor self) -> Tensor
   variants: method
@@ -2658,11 +2664,17 @@
   variants: method, function
 
 - func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
+  dispatch:
+    CPU: triu_cpu_out
+    CUDA: triu_cuda_out
 
 - func: triu(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method, function
 
 - func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
+  dispatch:
+    CPU: tril_cpu_out
+    CUDA: tril_cuda_out
 
 - func: tril(Tensor self, int64_t diagonal=0) -> Tensor
   variants: method, function
@@ -11,18 +11,4 @@ using namespace at::native;
 TEST(CUDNNTest, CUDNNTestCUDA) {
   if (!at::cuda::is_available()) return;
   manual_seed(123);
-
-#if CUDNN_VERSION < 7000
-  auto handle = getCudnnHandle();
-  DropoutDescriptor desc1, desc2;
-  desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte));
-  desc2.set(handle, 0.5, desc1.state);
-  bool isEQ;
-  isEQ = (desc1.desc()->dropout == desc2.desc()->dropout);
-  ASSERT_TRUE(isEQ);
-  isEQ = (desc1.desc()->nstates == desc2.desc()->nstates);
-  ASSERT_TRUE(isEQ);
-  isEQ = (desc1.desc()->states == desc2.desc()->states);
-  ASSERT_TRUE(isEQ);
-#endif
 }
@@ -3,6 +3,8 @@ find_package(ATen REQUIRED)
 include_directories(${ATEN_INCLUDE_DIR})
 
 # C++11
-set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
+if (not MSVC)
+    set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
 add_executable(main main.cpp)
 target_link_libraries(main ${ATEN_LIBRARIES})
@@ -247,10 +247,13 @@
 
 #ifdef _OPENMP
 
-#ifndef _WIN32
-#define PRAGMA(P) _Pragma(#P)
+#ifdef _WIN32
+// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
+#define PRAGMA_LOOP(P)    // Noop
+#define PRAGMA(P)         __pragma(P)
 #else
-#define PRAGMA(P) __pragma(P)
+#define PRAGMA_LOOP(P)    _Pragma(#P)
+#define PRAGMA(P)         _Pragma(#P)
 #endif
 
 #include <omp.h>
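For context, the two macros differ only in how the pragma text reaches the compiler: `_Pragma(#P)` stringizes its argument (standard C99/C++11), MSVC's `__pragma(P)` takes the tokens directly, and MSVC has no counterpart for loop pragmas such as ivdep or simd, so `PRAGMA_LOOP` expands to nothing there. A minimal sketch of the same split in a hypothetical standalone file (illustrative only, not part of the diff):

    // Minimal sketch of the PRAGMA / PRAGMA_LOOP split (assumed example, not the diff itself).
    #include <cstdio>

    #ifdef _WIN32
    // MSVC: loop pragmas like ivdep/simd are unavailable, so PRAGMA_LOOP expands to nothing.
    #define PRAGMA_LOOP(P)
    #define PRAGMA(P)      __pragma(P)
    #else
    // GCC/Clang/ICC: _Pragma takes a string, so stringize the argument.
    #define PRAGMA_LOOP(P) _Pragma(#P)
    #define PRAGMA(P)      _Pragma(#P)
    #endif

    int main() {
      float a[1024], b[1024];
      for (int i = 0; i < 1024; ++i) { a[i] = 0.f; b[i] = float(i); }

      PRAGMA_LOOP(ivdep)           // vectorization hint where supported, nothing on MSVC
      PRAGMA(omp parallel for)     // OpenMP pragma works through either expansion
      for (int i = 0; i < 1024; ++i)
        a[i] += b[i];

      std::printf("%f\n", a[10]);
      return 0;
    }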
@@ -369,7 +372,7 @@
     TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset();                        \
     ptrdiff_t iter = 0;                                                                        \
     if(tp != (TYPE2*)rp) {                                                                             \
-      PRAGMA(ivdep) \
+      PRAGMA_LOOP(ivdep) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \
       for (iter = 0; iter < SIZE; iter++) {                             \
         TYPE2 *TENSOR2##_data = tp+iter;                                \
@@ -377,7 +380,7 @@
         CODE                                                            \
       }\
     } else {\
-      PRAGMA(simd) \
+      PRAGMA_LOOP(simd) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE2* TENSOR2##_data = tp+iter;\
@@ -449,7 +452,7 @@
     TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset();                               \
     ptrdiff_t iter = 0;\
     if(tp != (TYPE2*)rp) {                                                                             \
-      PRAGMA(ivdep) \
+      PRAGMA_LOOP(ivdep) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE1 *TENSOR1##_data = rp+iter;\
@@ -458,7 +461,7 @@
         CODE                                \
       } \
     } else {\
-      PRAGMA(simd) \
+      PRAGMA_LOOP(simd) \
       PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) )  \
       for (iter = 0; iter < SIZE; iter++) {\
         TYPE1 *TENSOR1##_data = rp+iter;\
@@ -13,10 +13,13 @@
 
 #ifdef _OPENMP
 
-#ifndef _WIN32
-#define PRAGMA(P) _Pragma(#P)
+#ifdef _WIN32
+// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
+#define PRAGMA_LOOP(P)    // Noop
+#define PRAGMA(P)         __pragma(P)
 #else
-#define PRAGMA(P) __pragma(P)
+#define PRAGMA_LOOP(P)    _Pragma(#P)
+#define PRAGMA(P)         _Pragma(#P)
 #endif
 
 #define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
@@ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
   int free_b = 0;
   if (a == NULL) a = ra_;
   if (b == NULL) b = rb_;
-  THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
-      a->dim());
-  THArgCheck(!a->is_empty(), 2, "A should not be empty");
-  THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
-      "dimensions, but has %d", b->dim());
-  THArgCheck(!b->is_empty(), 2, "B should not be empty");
-  THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld",
-      a->size(0), a->size(1));
-  THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld "
-      "rows, B has %ld", a->size(0), b->size(0));
-
-  if (b->dim() == 1) {
-    b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
-            b->stride(0), 1, 0);
-    free_b = 1;
-  }
 
   int n, nrhs, lda, ldb, info;
   THIntTensor *ipiv;
@@ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
   THIntTensor_free(ipiv);
-  if (free_b) c10::raw::intrusive_ptr::decref(b);
 }
 
 void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
@@ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n
 
 TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
 TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
-TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k);
 TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
 TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
 TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
@@ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
   REAL_SWAP(ARR(III), ARR(JJJ)); \
   LONG_SWAP(IDX(III), IDX(JJJ))
 
+/* Emulate NumPy behavior of putting NaNs
+ * at the end of an ascending list. */
+#define GT_OR_NAN(x, y) \
+  ((x != x && y == y) || (x > y))
+
 static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
 {
   int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
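The macro reports "greater" either when x > y or when x is NaN and y is not (`x != x` is the usual NaN self-inequality test), so an ascending sort treats NaN as larger than every ordinary value. A small sketch of the same idea with a plain comparison function (hypothetical, illustrative only):

    // Illustrative only: NaN-aware "greater than" used for an ascending sort.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static bool gt_or_nan(double x, double y) {
      // x != x is true exactly when x is NaN; NaNs compare "greater" than real numbers.
      return (x != x && y == y) || (x > y);
    }

    int main() {
      std::vector<double> v = {3.0, NAN, 1.0, 2.0, NAN};
      // "a comes before b" iff b is greater-or-NaN relative to a.
      std::sort(v.begin(), v.end(),
                [](double a, double b) { return gt_or_nan(b, a); });
      for (double x : v) std::printf("%g ", x);   // prints: 1 2 3 nan nan
      std::printf("\n");
      return 0;
    }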
@@ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
       /* Use median of three for pivot choice */
     P=(L+R)>>1;
     BOTH_SWAP(P, L+1);
-    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
-    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
-    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+    if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
+    if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
+    if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
 
     i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
     do {
-      do { i = i+1; } while(ARR(i) < piv);
-      do { j = j-1; } while(ARR(j) > piv);
+      do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
+      do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
       if (j < i)
           break;
       BOTH_SWAP(i, j);
@@ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
-    if (ARR(i) > ARR(i+1)) {
+    if (GT_OR_NAN(ARR(i),ARR(i+1))) {
       piv = ARR(i);
       pid = IDX(i);
       j = i+1;
@@ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
         ARR(j-1) = ARR(j);
         IDX(j-1) = IDX(j);
         j = j+1;
-      } while(j < elements && ARR(j) < piv);
+      } while(j < elements && GT_OR_NAN(piv, ARR(j)));
       ARR(j-1) = piv;
       IDX(j-1) = pid;
      }
@@ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
       /* Use median of three for pivot choice */
     P=(L+R)>>1;
     BOTH_SWAP(P, L+1);
-    if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
-    if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
-    if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
+    if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
+    if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
+    if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
 
     i = L+1; j = R; piv = ARR(L); pid = IDX(L);
 
     do {
-      do { i = i+1; } while(ARR(i) > piv);
-      do { j = j-1; } while(ARR(j) < piv);
+      do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
+      do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
       if (j < i)
           break;
       BOTH_SWAP(i, j);
@@ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
   } /* while not done */
   /* Now insertion sort on the concatenation of subfiles */
   for(i=elements-2; i>=0; i--) {
-    if (ARR(i) < ARR(i+1)) {
+    if (GT_OR_NAN(ARR(i+1), ARR(i))) {
       piv = ARR(i);
       pid = IDX(i);
       j = i+1;
@@ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
         ARR(j-1) = ARR(j);
         IDX(j-1) = IDX(j);
         j = j+1;
-      } while(j < elements && ARR(j) > piv);
+      } while(j < elements && GT_OR_NAN(ARR(j), piv));
       ARR(j-1) = piv;
       IDX(j-1) = pid;
      }
@@ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i
   THLongTensor_free(tmpIndices);
 }
 
-void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k)
-{
-  int64_t t_size_0, t_size_1;
-  int64_t t_stride_0, t_stride_1;
-  int64_t r__stride_0, r__stride_1;
-  scalar_t *t_data, *r__data;
-  int64_t r, c;
-
-  THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
-
-  THTensor_(resizeAs)(r_, t);
-
-  t_size_0 = THTensor_(size)(t, 0);
-  t_size_1 = THTensor_(size)(t, 1);
-  t_stride_0 = THTensor_(stride)(t, 0);
-  t_stride_1 = THTensor_(stride)(t, 1);
-  r__stride_0 = THTensor_(stride)(r_, 0);
-  r__stride_1 = THTensor_(stride)(r_, 1);
-  r__data = r_->data<scalar_t>();
-  t_data = t->data<scalar_t>();
-
-  for(r = 0; r < t_size_0; r++)
-  {
-    int64_t sz = THMin(r+k+1, t_size_1);
-    for(c = THMax(0, r+k+1); c < t_size_1; c++)
-      r__data[r*r__stride_0+c*r__stride_1] = 0;
-    for(c = 0; c < sz; c++)
-      r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
-  }
-}
-
 void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
 {
   int64_t t_size_0, t_size_1;
@@ -6,17 +6,17 @@
 #include "THCNumerics.cuh"
 
 // Collection of kernel sort routines
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct LTComp {
   __device__ inline bool operator()(const T& a, const T& b) const {
-    return THCNumerics<T>::lt(a, b);
+    return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b);
   }
 };
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct GTComp {
   __device__ inline bool operator()(const T& a, const T& b) const {
-    return THCNumerics<T>::gt(a, b);
+    return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b);
   }
 };
 
@@ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) {
 }
 
 template <typename T>
-__device__ int binarySearchForMultinomial(T* dist,
+__device__ int binarySearchForMultinomial(T* cumdist,
+                                          T* dist,
                                           int size,
                                           T val) {
   int start = 0;
   int end = size;
-  // dist[size - 1] = 0 => all zero prob dist
-  assert(THCNumerics<T>::gt(dist[size - 1], 0));
+  // cumdist[size - 1] = 0 => all zero prob dist
+  assert(THCNumerics<T>::gt(cumdist[size - 1], 0));
 
   while (end - start > 0) {
     int mid = start + (end - start) / 2;
 
-    T midVal = dist[mid];
+    T midVal = cumdist[mid];
     if (THCNumerics<T>::lt(midVal, val)) {
       start = mid + 1;
     } else {
@@ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist,
     start = size - 1;
   }
 
-  T curVal = dist[start];
-  while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
+  T curVal = cumdist[start];
+  while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--;
 
   return start;
 }
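The extra raw-distribution argument lets the search check the bucket it lands on: instead of walking back over equal prefix-sum values, it now walks back while the landed category's own probability is zero, so zero-weight categories are never returned. A host-side sketch of the same pattern, plain C++ rather than the CUDA kernel and with assumed helper names (illustrative only):

    // Illustrative: sample a category by binary search over inclusive prefix sums,
    // then step back over zero-probability bins so they are never returned.
    #include <cstdio>
    #include <random>
    #include <vector>

    static int sample_category(const std::vector<double>& prob,      // normalized weights
                               const std::vector<double>& cumprob,   // inclusive prefix sums
                               double u) {                           // uniform draw in (0, 1]
      int start = 0, end = static_cast<int>(prob.size());
      while (end - start > 0) {                 // first index with cumprob[idx] >= u
        int mid = start + (end - start) / 2;
        if (cumprob[mid] < u) start = mid + 1; else end = mid;
      }
      if (start == static_cast<int>(prob.size())) start = static_cast<int>(prob.size()) - 1;
      while (start >= 1 && prob[start] == 0.0) start--;   // never land on a zero-weight bin
      return start;
    }

    int main() {
      std::vector<double> prob = {0.25, 0.0, 0.25, 0.5};
      std::vector<double> cum  = {0.25, 0.25, 0.5, 1.0};
      std::mt19937 gen(0);
      std::uniform_real_distribution<double> uni(0.0, 1.0);
      int counts[4] = {0, 0, 0, 0};
      for (int i = 0; i < 10000; ++i) counts[sample_category(prob, cum, uni(gen))]++;
      for (int c = 0; c < 4; ++c)
        std::printf("category %d: %d\n", c, counts[c]);   // category 1 has weight 0 and stays at 0
      return 0;
    }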
@@ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
                                  int64_t* dest,
                                  int64_t distributions,
                                  int categories,
-                                 T* normDistPrefixSum) {
+                                 T* normDistPrefixSum,
+                                 T* normDist) {
   // At the moment, each warp computes one sample value in the binary
   // search due to divergence. It seems possible to compute multiple
   // values and limit divergence though later on. However, no matter
@@ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
         // Find the bucket that a uniform sample lies in
         int choice = binarySearchForMultinomial<T>(
           normDistPrefixSum + curDist * categories,
+          normDist + curDist * categories,
           categories,
           r);
 
@@ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
       // Find the bucket that a uniform sample lies in
       int choice = binarySearchForMultinomial<T>(
         normDistPrefixSum + curDist * categories,
+        origDist + curDist * categories,
         categories,
         r);
 
@@ -15,17 +15,17 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct ThrustGTOp {
   __device__ bool operator()(const T& lhs, const T& rhs) const {
-    return THCNumerics<T>::gt(lhs, rhs);
+    return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs);
   }
 };
 
-template <typename T>
+template <typename T, bool handleNaN = false>
 struct ThrustLTOp {
   __device__ bool operator()(const T& lhs, const T& rhs) const {
-    return THCNumerics<T>::lt(lhs, rhs);
+    return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs);
  }
 };
 
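These comparators take the NaN handling as a compile-time template flag, so the common path pays nothing for it; with handleNaN set, NaNs compare larger than every ordinary value in the ascending ("LT") op and smaller in the descending ("GT") op, mirroring the CPU GT_OR_NAN change above. A host-side analogue of the same pattern (illustrative only, plain std::isnan instead of THCNumerics):

    // Illustrative host-side analogue of the handleNaN template flag.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    template <typename T, bool handleNaN = false>
    struct LTOp {
      bool operator()(const T& lhs, const T& rhs) const {
        // With handleNaN, a non-NaN lhs is "less than" a NaN rhs, so NaNs sink to the end.
        return (handleNaN && std::isnan(rhs) && !std::isnan(lhs)) || (lhs < rhs);
      }
    };

    int main() {
      std::vector<double> v = {2.0, NAN, 1.0, 3.0};
      std::sort(v.begin(), v.end(), LTOp<double, true>());
      for (double x : v) std::printf("%g ", x);   // 1 2 3 nan
      std::printf("\n");
      return 0;
    }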
@@ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
 void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
 {
 #ifdef USE_MAGMA
-  THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
-  THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional");
-  THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square");
-  THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible");
-
   int64_t n = a_->size(0);
   int64_t nrhs = b_->size(1);
 
@@ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
       THArgCheck(false, 2, CUTORCH_DIM_WARNING);
     }
   } else {
-    THCTensor_(resizeAs)(state, self_, src_);
 
     if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) {
       THArgCheck(false, 2, CUTORCH_DIM_WARNING);
@@ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state,
           n_sample,
           THCudaLongTensor_data(state, self),
           numDist, numCategories,
-          THCTensor_(data)(state, prefixSum));
+          THCTensor_(data)(state, prefixSum),
+          THCTensor_(data)(state, normDist));
     } else {
       // Sample without replacement
 
@@ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
     dim3 block(blockSize);                                              \
                                                                         \
     if (dir) {                                                          \
-      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \
+      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \
         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
           keyInfo,                                                      \
           keySlices,                                                    \
@@ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
           (TYPE) keyInfo.strides[collapseKeyDim],                       \
           valueInfo,                                                    \
           (TYPE) valueInfo.strides[collapseValueDim],                   \
-          GTComp<scalar_t>());                                              \
+          GTComp<scalar_t, true>());                                    \
     } else {                                                            \
-      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \
+      bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \
         <<<grid, block, 0, THCState_getCurrentStream(state)>>>(         \
           keyInfo,                                                      \
           keySlices,                                                    \
@@ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
           (TYPE) keyInfo.strides[collapseKeyDim],                       \
           valueInfo,                                                    \
           (TYPE) valueInfo.strides[collapseValueDim],                   \
-          LTComp<scalar_t>());                                              \
+          LTComp<scalar_t, true>());                                              \
     }                                                                   \
   } while (0)
 
@@ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state,
 #if CUDA_VERSION >= 7000
       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 #endif
-      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>());
+      keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>());
   } else {
     thrust::stable_sort_by_key(
 #if CUDA_VERSION >= 7000
       thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
 #endif
-      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>());
+      keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>());
   }
 
   // Then, re-sort according to slice that each index is
 c10/Half.h | 12

@@ -383,6 +383,14 @@ struct Converter<
   }
 };
 
+// In some versions of MSVC, there will be a compiler error when building.
+// C4146: unary minus operator applied to unsigned type, result still unsigned
+// It can be addressed by disabling the following warning.
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable : 4146 )
+#endif
+
 // skip isnan and isinf check for integral types
 template <typename To, typename From>
 typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
@@ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
   }
 }
 
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
 template <typename To, typename From>
 typename std::enable_if<std::is_floating_point<From>::value, bool>::type
 overflows(From f) {
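The push/disable/pop bracket keeps the warning suppression scoped to just these overflow helpers rather than the whole translation unit. A tiny self-contained illustration of the pattern, using an assumed example expression rather than the file's actual code:

    // Illustrative: silencing C4146 only around code that negates an unsigned value on purpose.
    #include <cstdint>
    #include <cstdio>

    #ifdef _MSC_VER
    #pragma warning( push )
    #pragma warning( disable : 4146 )  // unary minus operator applied to unsigned type
    #endif

    // Two's-complement trick: -u wraps to 2^64 - u.
    static uint64_t negate_wrapped(uint64_t u) {
      return -u;   // the kind of expression some MSVC versions flag as C4146 (per the comment above)
    }

    #ifdef _MSC_VER
    #pragma warning( pop )
    #endif

    int main() {
      std::printf("%llu\n", static_cast<unsigned long long>(negate_wrapped(1)));  // 18446744073709551615
      return 0;
    }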
@@ -11,9 +11,11 @@ using c10::intrusive_ptr_target;
 using c10::make_intrusive;
 using c10::weak_intrusive_ptr;
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wunknown-warning-option"
 #pragma GCC diagnostic ignored "-Wself-move"
+#endif
 
 namespace {
 class SomeClass0Parameters : public intrusive_ptr_target {};
@@ -25,7 +25,7 @@ Error::Error(
 // Caffe2-style error message
 Error::Error(
     const char* file,
-    const int line,
+    const uint32_t line,
     const char* condition,
     const std::string& msg,
     const std::string& backtrace,
@@ -49,7 +49,7 @@ class C10_API Error : public std::exception {
   Error(SourceLocation source_location, const std::string& msg);
   Error(
       const char* file,
-      const int line,
+      const uint32_t line,
       const char* condition,
       const std::string& msg,
       const std::string& backtrace,
@@ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e);
 // TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if
 // not met.
 
+// In the debug build with MSVC, __LINE__ might be of long type (a.k.a int32_t),
+// which is different from the definition of `SourceLocation` that requires
+// unsigned int (a.k.a uint32_t) and may cause a compile error with the message:
+// error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion
+// Here the static cast is used to pass the build.
+
 #define AT_ERROR(...) \
-  throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
+  throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
 
 #define AT_WARN(...) \
-  ::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
+  ::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
 
 #define AT_ASSERT(cond)                       \
   if (!(cond)) {                              \
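The narrowing error only appears because SourceLocation is brace-initialized: list-initialization forbids an implicit signed-to-unsigned narrowing even for a small positive line number, and the explicit cast makes the conversion well-formed. A minimal reproduction of the pattern with a hypothetical stand-in struct (illustrative only):

    // Illustrative: brace-init narrowing from a signed line value to uint32_t.
    #include <cstdint>
    #include <cstdio>

    struct SourceLocation {        // stand-in for the real c10::SourceLocation
      const char* function;
      const char* file;
      uint32_t line;
    };

    int main() {
      long line = 42;              // assume the line number arrives as a (signed) long
      // SourceLocation loc{__func__, __FILE__, line};                  // ill-formed: narrowing in a braced list
      SourceLocation loc{__func__, __FILE__, static_cast<uint32_t>(line)};  // explicit cast compiles everywhere
      std::printf("%s:%u\n", loc.file, loc.line);
      return 0;
    }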
@@ -17,9 +17,10 @@
 #include <utility>
 #include <type_traits>
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
+#endif
 #ifdef _MSC_VER
 #define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
 #else
@@ -1457,4 +1458,6 @@ namespace ska
 
 } // end namespace ska
 
+#ifndef _MSC_VER
 #pragma GCC diagnostic pop
+#endif
@@ -72,18 +72,27 @@ class C10_API intrusive_ptr_target {
 // We also have to disable -Wunknown-warning-option and -Wpragmas, because
 // some other compilers don't know about -Wterminate or -Wexceptions and
 // will show a warning about unknown warning options otherwise.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wunknown-warning-option"
-#pragma GCC diagnostic ignored "-Wterminate"
-#pragma GCC diagnostic ignored "-Wexceptions"
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable: 4297) // function assumed not to throw an exception but does
+#else
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wpragmas"
+#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#  pragma GCC diagnostic ignored "-Wterminate"
+#  pragma GCC diagnostic ignored "-Wexceptions"
+#endif
     AT_ASSERTM(
         refcount_.load() == 0,
         "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it");
     AT_ASSERTM(
         weakcount_.load() == 0,
         "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
-#pragma GCC diagnostic pop
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#else
+#  pragma GCC diagnostic pop
+#endif
   }
 
   constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
@@ -430,12 +430,16 @@ class C10_API TypeMeta {
     // variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be
     // disabled for compilers that don't know '-Wundefined-var-template' and
     // would error at our attempt to disable it.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wunknown-warning-option"
-#pragma GCC diagnostic ignored "-Wundefined-var-template"
+#ifndef _MSC_VER
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wpragmas"
+#  pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#  pragma GCC diagnostic ignored "-Wundefined-var-template"
+#endif
     return TypeMeta(_typeMetaDataInstance<T>());
-#pragma GCC diagnostic pop
+#ifndef _MSC_VER
+#  pragma GCC diagnostic pop
+#endif
   }
 
  private:
@@ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY)
   else()
     target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
   endif()
-
-  #cmake only check for separate OpenMP library on AppleClang 7+
-  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
-  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
-        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
-      target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY})
-    endif()
-  endif()
 endif()
 
 target_link_libraries(caffe2 PUBLIC c10)
 target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
 target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
@@ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
 # Set standard properties on the target
 torch_set_target_props(caffe2)
 
-if (MSVC)
-target_compile_options(caffe2 INTERFACE "-std=c++11")
-else()
-target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
+if (NOT MSVC)
+  target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
 endif()
 
 target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
@@ -93,7 +93,7 @@ using std::vector;
 #define CAFFE2_NORETURN __attribute__((noreturn))
 #endif
 
-#if defined(_MSC_VER)
+#if (defined _MSC_VER && !defined NOMINMAX)
 #define NOMINMAX
 #endif
 
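Guarding the define matters because NOMINMAX may already be set by the build system or another header, and redefining it provokes a macro-redefinition warning; the macro itself exists to keep windows.h from defining min/max as function-like macros that break std::min/std::max. A small sketch of that interaction (illustrative; the Windows-specific behavior is only exercised when building with windows.h):

    // Illustrative: why NOMINMAX is defined (guarded) before including windows.h.
    #include <algorithm>
    #include <cstdio>

    #if (defined _MSC_VER && !defined NOMINMAX)
    #define NOMINMAX            // keep windows.h from defining min()/max() macros
    #endif
    #ifdef _WIN32
    #include <windows.h>        // without NOMINMAX, std::max(a, b) below can fail to parse
    #endif

    int main() {
      int a = 3, b = 7;
      std::printf("%d\n", std::max(a, b));   // 7; (std::max)(a, b) is the usual workaround otherwise
      return 0;
    }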
@@ -1,5 +1,8 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 from caffe2.proto import caffe2_pb2
+import os
+import sys
+import platform
 # TODO: refactor & remove the following alias
 caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU
 caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA
@@ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
 caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
 caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
 caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
+
+if platform.system() == 'Windows':
+    IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ])
+
+    if IS_CONDA:
+        from ctypes import windll, c_wchar_p
+        from ctypes.wintypes import DWORD, HMODULE
+
+        AddDllDirectory = windll.kernel32.AddDllDirectory
+        AddDllDirectory.restype = DWORD
+        AddDllDirectory.argtypes = [c_wchar_p]
+
+    def add_extra_dll_dir(extra_dll_dir):
+        if os.path.isdir(extra_dll_dir):
+            os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH']
+
+            if IS_CONDA:
+                AddDllDirectory(extra_dll_dir)
+
+    # first get nvToolsExt PATH
+    def get_nvToolsExt_path():
+        NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')
+
+        if os.path.exists(NVTOOLEXT_HOME):
+            return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
+        else:
+            return ''
+
+    py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin')
+    th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
+    th_dll_path = os.path.join(th_root, 'lib')
+
+    dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()]
+
+    # then add the path to env
+    for p in dll_paths:
+        add_extra_dll_dir(p)
@ -628,37 +628,12 @@ endif()
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# ---[ OpenMP
 | 
					# ---[ OpenMP
 | 
				
			||||||
if(USE_OPENMP)
 | 
					if(USE_OPENMP)
 | 
				
			||||||
  set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
 | 
					  find_package(OpenMP)
 | 
				
			||||||
  if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
 | 
					 | 
				
			||||||
    exec_program(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
 | 
					 | 
				
			||||||
    string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
 | 
					 | 
				
			||||||
    message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
 | 
					 | 
				
			||||||
    if(DARWIN_VERSION GREATER 9)
 | 
					 | 
				
			||||||
      set(APPLE_OPENMP_SUCKS 1)
 | 
					 | 
				
			||||||
    endif(DARWIN_VERSION GREATER 9)
 | 
					 | 
				
			||||||
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
 | 
					 | 
				
			||||||
      OUTPUT_VARIABLE GCC_VERSION)
 | 
					 | 
				
			||||||
    if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
 | 
					 | 
				
			||||||
      message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
 | 
					 | 
				
			||||||
      message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
 | 
					 | 
				
			||||||
      add_compile_options(-Wno-unknown-pragmas)
 | 
					 | 
				
			||||||
      set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
 | 
					 | 
				
			||||||
    endif()
 | 
					 | 
				
			||||||
  endif()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(WITH_OPENMP AND NOT CHECKED_OPENMP)
 | 
					 | 
				
			||||||
    find_package(OpenMP)
 | 
					 | 
				
			||||||
    set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
 | 
					 | 
				
			||||||
    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
 | 
					 | 
				
			||||||
    set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
 | 
					 | 
				
			||||||
  endif()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(OPENMP_FOUND)
 | 
					  if(OPENMP_FOUND)
 | 
				
			||||||
    message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
 | 
					    message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
 | 
				
			||||||
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 | 
					    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 | 
				
			||||||
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 | 
					    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 | 
				
			||||||
 | 
					    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 | 
				
			||||||
  else()
 | 
					  else()
 | 
				
			||||||
    message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
 | 
					    message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
 | 
				
			||||||
    caffe2_update_option(USE_OPENMP OFF)
 | 
					    caffe2_update_option(USE_OPENMP OFF)
 | 
				
			||||||
@@ -690,7 +665,12 @@ if(USE_CUDA)
       caffe2_update_option(USE_NVRTC OFF)
     endif()
     if(CAFFE2_USE_CUDNN)
-      list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
+      IF(CUDNN_STATIC_LINKAGE)
+        LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
+          caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
+      ELSE()
+        list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
+      ENDIF()
     else()
       caffe2_update_option(USE_CUDNN OFF)
     endif()
@@ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE)
     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
     STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
   ENDIF()
 
+  # OpenMP support?
+  SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+  IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+    EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+    STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+    MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+    IF (DARWIN_VERSION GREATER 9)
+      SET(APPLE_OPENMP_SUCKS 1)
+    ENDIF (DARWIN_VERSION GREATER 9)
+    EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+      OUTPUT_VARIABLE GCC_VERSION)
+    IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+      MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+      MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+      add_compile_options(-Wno-unknown-pragmas)
+      SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+    ENDIF()
+  ENDIF()
+
+  IF (WITH_OPENMP AND NOT CHECKED_OPENMP)
+    FIND_PACKAGE(OpenMP)
+    SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
+
+    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
+    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
+    SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
+  ENDIF()
+
+  IF (OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+  ENDIF()
+
+
   SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
 
   FIND_PACKAGE(MAGMA)
@@ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE)
     SET(AT_CUDA_ENABLED 0)
   else()
     SET(AT_CUDA_ENABLED 1)
-    find_package(CUDA 5.5 REQUIRED)
   endif()
 
   IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND)
@@ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE)
   SET(AT_MKLDNN_ENABLED 0)
   SET(CAFFE2_USE_MKLDNN OFF)
   IF (USE_MKLDNN)
-    FIND_PACKAGE(MKLDNN)
     INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake)
     IF(MKLDNN_FOUND)
       SET(AT_MKLDNN_ENABLED 1)
-      INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR})
+      INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR})
       IF(BUILD_CAFFE2_OPS)
         SET(CAFFE2_USE_MKLDNN ON)
         LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn)
@@ -2,7 +2,6 @@
 #
 # The following variables are optionally searched for defaults
 #  MKL_FOUND             : set to true if a library implementing the CBLAS interface is found
-#  USE_MKLDNN
 #
 # The following are set after configuration is done:
 #  MKLDNN_FOUND          : set to true if mkl-dnn is found.
@@ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND)
 SET(MKLDNN_LIBRARIES)
 SET(MKLDNN_INCLUDE_DIR)
 
-IF (NOT USE_MKLDNN)
-  RETURN()
-ENDIF(NOT USE_MKLDNN)
-
 IF(MSVC)
   MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far")
   RETURN()
@@ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR)
 LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR})
 
 IF(MKL_FOUND)
+  # Append to mkldnn dependencies
   LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES})
   LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR})
-  # The OMP-related variables of MKL-DNN have to be overwritten here,
-  # if MKL is used, and the OMP version is defined by MKL.
-  # MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL.
-  # INTEL_MKL_DIR gives the MKL root path.
-  IF (INTEL_MKL_DIR)
-    SET(MKLROOT ${INTEL_MKL_DIR})
-    IF(WIN32)
-      SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
-    ELSE(WIN32)
-      IF (MKL_LIBRARIES_gomp_LIBRARY)
-        SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY})
-      ELSE(MKL_LIBRARIES_gomp_LIBRARY)
-        SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY})
-      ENDIF(MKL_LIBRARIES_gomp_LIBRARY)
-      SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
-    ENDIF(WIN32)
-  ELSE(INTEL_MKL_DIR)
-    MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!")
-  ENDIF(INTEL_MKL_DIR)
-
 ELSE(MKL_FOUND)
   # If we cannot find MKL, we will use the Intel MKL Small library
   # comes with ${MKLDNN_ROOT}/external
@@ -75,60 +51,65 @@ ELSE(MKL_FOUND)
   ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external)
 
   FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h)
-  IF(MKLML_INNER_INCLUDE_DIR)
-    # if user has multiple version under external/ then guess last
-    # one alphabetically is "latest" and warn
-    LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
-    IF(MKLINCLEN GREATER 1)
-      LIST(SORT MKLML_INNER_INCLUDE_DIR)
-      LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
-      LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
-      SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
-    ENDIF(MKLINCLEN GREATER 1)
-    GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
-    LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
-
-    IF(APPLE)
-      SET(__mklml_inner_libs mklml iomp5)
-    ELSE(APPLE)
-      SET(__mklml_inner_libs mklml_intel iomp5)
-    ENDIF(APPLE)
-
-    FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
-      STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
-      FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
-            NAMES ${__mklml_inner_lib}
-            PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
-            DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
-      MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
-      LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
-    ENDFOREACH(__mklml_inner_lib)
-  ENDIF(MKLML_INNER_INCLUDE_DIR)
-
+  IF(NOT MKLML_INNER_INCLUDE_DIR)
+    MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
+    RETURN()
+  ENDIF(NOT MKLML_INNER_INCLUDE_DIR)
+  # if user has multiple version under external/ then guess last
+  # one alphabetically is "latest" and warn
+  LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
+  IF(MKLINCLEN GREATER 1)
+    LIST(SORT MKLML_INNER_INCLUDE_DIR)
+    LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
+    LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
+    SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
+  ENDIF(MKLINCLEN GREATER 1)
+  GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
+  LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
+
+  IF(APPLE)
+    SET(__mklml_inner_libs mklml iomp5)
+  ELSE(APPLE)
+    SET(__mklml_inner_libs mklml_intel iomp5)
+  ENDIF(APPLE)
+  FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
+    STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
+    FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
+          NAMES ${__mklml_inner_lib}
+          PATHS  "${MKLML_INNER_INCLUDE_DIR}/../lib"
+          DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
+    MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
+    IF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
+      MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
+      RETURN()
+    ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
+    LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
+  ENDFOREACH(__mklml_inner_lib)
 ENDIF(MKL_FOUND)
 
-LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES)
-LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR)
-INCLUDE(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for})
-
-IF(MKLDNN_FOUND)
-  IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
-    ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized)
-  ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
-  SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE)
-  SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE)
-  ADD_SUBDIRECTORY(${MKLDNN_ROOT})
-  SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}")
-  IF(WIN32)
-    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}")
-  ELSE(WIN32)
-    LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}")
-  ENDIF(WIN32)
-ELSE(MKLDNN_FOUND)
-  MESSAGE(STATUS "MKLDNN source files not found!")
-ENDIF(MKLDNN_FOUND)
-
-UNSET(__mklml_inner_libs)
-UNSET(__mkldnn_looked_for)
+IF(MKL_FOUND)
+  SET(MKL_cmake_included TRUE)
+  SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE)
+ENDIF(MKL_FOUND)
+SET(WITH_TEST FALSE CACHE BOOL "" FORCE)
+SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE)
+SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
+ADD_SUBDIRECTORY(${MKLDNN_ROOT})
+IF(NOT TARGET mkldnn)
+  MESSAGE("Failed to include MKL-DNN target")
+  RETURN()
+ENDIF(NOT TARGET mkldnn)
+IF(MKL_FOUND)
+  TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL)
+ENDIF(MKL_FOUND)
+IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow)
+  TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow)
+ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
+LIST(APPEND MKLDNN_LIBRARIES mkldnn)
+
+SET(MKLDNN_FOUND TRUE)
+MESSAGE(STATUS "Found MKL-DNN: TRUE")
 
 ENDIF(NOT MKLDNN_FOUND)
@@ -9,6 +9,12 @@ endif()
 # release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache.
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix)
 
+# we dont want to statically link cudart, because we rely on it's dynamic linkage in
+# python (follow along torch/cuda/__init__.py and usage of cudaGetErrorName).
+# Technically, we can link cudart here statically, and link libtorch_python.so
+# to a dynamic libcudart.so, but that's just wasteful
+SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
+
 # Find CUDA.
 find_package(CUDA 7.0)
 if(NOT CUDA_FOUND)
@@ -89,6 +95,9 @@ endif()
 
 if(DEFINED ENV{CUDNN_LIBRARY})
   set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY})
+  if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a")
+    SET(CUDNN_STATIC_LINKAGE ON)
+  endif()
 else()
   find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
     HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
@@ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN)
         "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
   endif()
   message(STATUS "Found cuDNN: v${CUDNN_VERSION}  (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})")
+  if(CUDNN_VERSION VERSION_LESS "7.0.0")
+    message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
+  endif()
 endif()
 
 # ---[ CUDA libraries wrapper
@@ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA)
     set_property(
         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt)
+        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl)
 else()
     set_property(
         TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
docs/source/community/contribution_guide.rst (new file, 917 lines)
@@ -0,0 +1,917 @@
PyTorch Contribution Guide
==========================

PyTorch is a GPU-accelerated Python tensor computation package for
building deep neural networks built on tape-based autograd systems.

The PyTorch Contribution Process
--------------------------------

The PyTorch organization is governed by `PyTorch
Governance </docs/community/governance.html>`__.

The PyTorch development process involves a healthy amount of open
discussions between the core development team and the community.

PyTorch operates similarly to most open source projects on GitHub.
However, if you've never contributed to an open source project before,
here is the basic process.

-  **Figure out what you're going to work on.** The majority of open
   source contributions come from people scratching their own itches.
   However, if you don't know what you want to work on, or are just
   looking to get more acquainted with the project, here are some tips
   for how to find appropriate tasks:

   -  Look through the `issue
      tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if
      there are any issues you know how to fix. Issues that are
      confirmed by other contributors tend to be better to investigate.
      We also maintain some labels for issues which are likely to be
      good for new people, e.g., **bootcamp** and **1hr**, although
      these labels are less well maintained.
   -  Join us on Slack and let us know you're interested in getting to
      know PyTorch. We're very happy to help out researchers and
      partners get up to speed with the codebase.

-  **Figure out the scope of your change and reach out for design
   comments on a GitHub issue if it's large.** The majority of pull
   requests are small; in that case, no need to let us know about what
   you want to do, just get cracking. But if the change is going to be
   large, it's usually a good idea to get some design comments about it
   first.

   -  If you don't know how big a change is going to be, we can help you
      figure it out! Just post about it on issues or Slack.
   -  Some feature additions are very standardized; for example, lots of
      people add new operators or optimizers to PyTorch. Design
      discussion in these cases boils down mostly to, “Do we want this
      operator/optimizer?” Giving evidence for its utility, e.g., usage
      in peer reviewed papers, or existence in other frameworks, helps a
      bit when making this case.
   -  Core changes and refactors can be quite difficult to coordinate,
      as the pace of development on PyTorch master is quite fast.
      Definitely reach out about fundamental or cross-cutting changes;
      we can often give guidance about how to stage such changes into
      more easily reviewable pieces.

-  **Code it out!**

   -  See the technical guide for advice on working with PyTorch in a
      technical form.

-  **Open a pull request.**

   -  If you are not ready for the pull request to be reviewed, tag it
      with [WIP]. We will ignore it when doing review passes. If you are
      working on a complex change, it's good to start things off as WIP,
      because you will need to spend time looking at CI results to see
      if things worked out or not.
   -  Find an appropriate reviewer for your change. We have some folks
      who regularly go through the PR queue and try to review
      everything, but if you happen to know who the maintainer for a
      given subsystem affected by your patch is, feel free to include
      them directly on the pull request. You can learn more about this
      structure at PyTorch Subsystem Ownership.

-  **Iterate on the pull request until it's accepted!**

   -  We'll try our best to minimize the number of review roundtrips and
      block PRs only when there are major issues. For the most common
      issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__.
   -  Once a pull request is accepted and CI is passing, there is
      nothing else you need to do; we will merge the PR for you.

Getting Started
---------------

Proposing new features
~~~~~~~~~~~~~~~~~~~~~~

New feature ideas are best discussed on a specific issue. Please include
as much information as you can, any accompanying data, and your proposed
solution. The PyTorch team and community frequently review new issues
and comment where they think they can help. If you feel confident in
your solution, go ahead and implement it.

Reporting Issues
~~~~~~~~~~~~~~~~

If you've identified an issue, first search through the `list of
existing issues <https://github.com/pytorch/pytorch/issues>`__ on the
repo. If you are unable to find a similar issue, then create a new one.
Supply as much information as you can to reproduce the problematic
behavior. Also, include any additional insights like the behavior you
expect.

Implementing Features or Fixing Bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to fix a specific issue, it's best to comment on the
individual issue with your intent. However, we do not lock or assign
issues except in cases where we have worked with the developer before.
It's best to strike up a conversation on the issue and discuss your
proposed solution. The PyTorch team can provide guidance that saves you
time.

Issues that are labeled first-new-issue, low, or medium priority provide
the best entrance points and are great places to start.

Adding Tutorials
~~~~~~~~~~~~~~~~

A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__
come from the community itself and we welcome additional contributions.
To learn more about how to contribute a new tutorial, see the
`PyTorch.org Tutorial Contribution Guide on
GitHub <https://github.com/pytorch/tutorials/#contributing>`__.

Improving Documentation & Tutorials
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We aim to produce high quality documentation and tutorials. On rare
occasions that content may include typos or bugs. If you find something
you can fix, send us a pull request for consideration.

Take a look at the `Documentation <#on-documentation>`__ section to learn how our system
works.

Participating in online discussions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can find active discussions happening on the PyTorch Discussion
`forum <https://discuss.pytorch.org/>`__.

Submitting pull requests to fix open issues
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can view a list of all open issues
`here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an
issue is a great way to get the attention of the team. From here you can
share your ideas and how you plan to resolve the issue.

For more challenging issues, the team will provide feedback and
direction for how to best solve the issue.

If you're not able to fix the issue itself, commenting and sharing
whether you can reproduce the issue can be useful for helping the team
identify problem areas.

Reviewing open pull requests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We appreciate your help reviewing and commenting on pull requests. Our
team strives to keep the number of open pull requests at a manageable
size; we respond quickly when we need more information, and we merge PRs
that we think are useful. However, due to the high level of interest,
additional eyes on pull requests are appreciated.

Improving code readability
~~~~~~~~~~~~~~~~~~~~~~~~~~

Improving code readability helps everyone. It is often better to submit a
small number of pull requests that touch few files versus a large pull
request that touches many files. Starting a discussion in the PyTorch
forum `here <https://discuss.pytorch.org/>`__ or on an issue related to
your improvement is the best way to get started.

Adding test cases to make the codebase more robust
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Additional test coverage is appreciated.

Promoting PyTorch
~~~~~~~~~~~~~~~~~

Your use of PyTorch in your projects, research papers, write ups, blogs,
or general discussions around the internet helps to raise awareness for
PyTorch and our growing community. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

Triaging issues
~~~~~~~~~~~~~~~

If you feel that an issue could benefit from a particular tag or level
of complexity, comment on the issue and share your opinion. If you feel
an issue isn't categorized properly, comment and let the team know.

About open source development
-----------------------------

If this is your first time contributing to an open source project, some
aspects of the development process may seem unusual to you.

-  **There is no way to “claim” issues.** People often want to “claim”
   an issue when they decide to work on it, to ensure that there isn't
   wasted work when someone else ends up working on it. This doesn't
   really work too well in open source, since someone may decide to work
   on something, and end up not having time to do it. Feel free to give
   information in an advisory fashion, but at the end of the day, we
   will take running code and rough consensus.
-  **There is a high bar for new functionality that is added.** Unlike
   in a corporate environment, where the person who wrote code
   implicitly “owns” it and can be expected to take care of it in the
   beginning of its lifetime, once a pull request is merged into an open
   source project, it immediately becomes the collective responsibility
   of all maintainers on the project. When we merge code, we are saying
   that we, the maintainers, are able to review subsequent changes and
   make a bugfix to the code. This naturally leads to a higher standard
   of contribution.

Common Mistakes To Avoid
------------------------

-  **Did you add tests?** (Or if the change is hard to test, did you
   describe how you tested your change?)

   -  We have a few motivations for why we ask for tests:

      1. to help us tell if we break it later
      2. to help us tell if the patch is correct in the first place
         (yes, we did review it, but as Knuth says, “beware of the
         following code, for I have not run it, merely proven it
         correct”)

   -  When is it OK not to add a test? Sometimes a change can't be
      conveniently tested, or the change is so obviously correct (and
      unlikely to be broken) that it's OK not to test it. On the
      contrary, if a change seems likely (or is known to be likely) to
      be accidentally broken, it's important to put in the time to work
      out a testing strategy.

-  **Is your PR too long?**

   -  It's easier for us to review and merge small PRs. Difficulty of
      reviewing a PR scales nonlinearly with its size.
   -  When is it OK to submit a large PR? It helps a lot if there was a
      corresponding design discussion in an issue, with sign off from
      the people who are going to review your diff. We can also help
      give advice about how to split up a large change into individually
      shippable parts. Similarly, it helps if there is a complete
      description of the contents of the PR: it's easier to review code
      if we know what's inside!

-  **Comments for subtle things?** In cases where behavior of your code
   is nuanced, please include extra comments and documentation to allow
   us to better understand the intention of your code.
-  **Did you add a hack?** Sometimes a hack is the right answer. But
   usually we will have to discuss it.
-  **Do you want to touch a very core component?** In order to prevent
   major regressions, pull requests that touch core components receive
   extra scrutiny. Make sure you've discussed your changes with the team
   before undertaking major changes.
-  **Want to add a new feature?** If you want to add new features,
   comment your intention on the related issue. Our team tries to
   comment on and provide feedback to the community. It's better to have
   an open discussion with the team and the rest of the community prior
   to building new features. This helps us stay aware of what you're
   working on and increases the chance that it'll be merged.
-  **Did you touch code unrelated to the PR?** To aid in code review,
   please only include files in your pull request that are directly
   related to your changes.

Frequently asked questions
--------------------------

-  **How can I contribute as a reviewer?** There is lots of value if
   community developers reproduce issues, try out new functionality, or
   otherwise help us identify or troubleshoot issues. Commenting on
   tasks or pull requests with your environment details is helpful and
   appreciated.
-  **CI tests failed, what does it mean?** Maybe you need to merge with
   master or rebase with the latest changes (see the sketch after this
   list). Pushing your changes should re-trigger CI tests. If the
   failures persist, you'll want to trace through the error messages and
   resolve the related issues.
-  **What are the most high risk changes?** Anything that touches build
   configuration is a risky area. Please avoid changing these unless
   you've had a discussion with the team beforehand.
-  **Hey, a commit showed up on my branch, what's up with that?**
   Sometimes another community member will provide a patch or fix to
   your pull request or branch. This is often needed for getting CI tests
   to pass.
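
A minimal sketch of bringing a branch up to date before re-running CI
(this assumes your clone has the main repository configured as a remote
named ``upstream``; the branch name below is a placeholder):

::

    # fetch the latest master and replay your work on top of it
    git fetch upstream
    git rebase upstream/master

    # push the rebased branch to your fork to re-trigger CI
    git push --force-with-lease origin my-feature-branch
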
On Documentation
----------------

Python Docs
~~~~~~~~~~~

PyTorch documentation is generated from python source using
`Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is
copied to the docs folder in the master branch of
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
and is served via GitHub pages.

-  Site: http://pytorch.org/docs
-  GitHub: http://github.com/pytorch/pytorch/docs
-  Served from:
   `https://github.com/pytorch/pytorch.github.io/tree/master/doc <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
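
As a rough sketch, the Python docs can also be previewed locally from
the ``docs/`` folder of the repository (this assumes the Sphinx
requirements file and ``Makefile`` in that folder are present in your
checkout):

::

    cd docs
    pip install -r requirements.txt
    # build the HTML docs (output typically lands under build/html)
    make html
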
C++ Docs
~~~~~~~~

For C++ code we use Doxygen to generate the content files. The C++ docs
are built on a special server and the resulting files are copied to the
https://github.com/pytorch/cppdocs repo, and are served from GitHub
pages.

-  Site: http://pytorch.org/cppdocs
-  GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp
-  Served from: https://github.com/pytorch/cppdocs

Tutorials
---------

PyTorch tutorials are documents used to help understand using PyTorch to
accomplish specific tasks or to understand more holistic concepts.
Tutorials are built using
`Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__
from executable Python source files, or from reStructuredText (rst)
files.

-  Site: http://pytorch.org/tutorials
-  GitHub: http://github.com/pytorch/tutorials

Tutorials Build Overview
~~~~~~~~~~~~~~~~~~~~~~~~

For tutorials, `pull
requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a
rebuild of the entire site using CircleCI to test the effects of the
change. This build is sharded into 9 worker builds and takes around 40
minutes total. At the same time, we do a Netlify build using *make
html-noplot*, which builds the site without rendering the notebook
output into pages for quick review.
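
For a quick local preview you can run the same *make html-noplot* build
yourself; a sketch, assuming a checkout of the tutorials repository with
its Python dependencies already installed:

::

    git clone https://github.com/pytorch/tutorials
    cd tutorials
    # build the site without executing the notebooks
    make html-noplot
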
After a PR is accepted, the site is rebuilt and deployed from CircleCI.

Contributing a new Tutorial
~~~~~~~~~~~~~~~~~~~~~~~~~~~

`PyTorch.org Tutorial Contribution
Guide <https://github.com/pytorch/tutorials/#contributing>`__

Code Style
~~~~~~~~~~

**Python style**

**C++ style**

Submitting a Pull Request
~~~~~~~~~~~~~~~~~~~~~~~~~

PyTorch development happens publicly on our GitHub repo.

To have your feature or fix added to PyTorch, please submit a Pull
Request.

Running Tests
~~~~~~~~~~~~~

Examples for running all of the tests, or just one individual test, are
sketched below.
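
A minimal sketch, assuming the standard ``test/`` layout of the
repository (the test case and method names below are placeholders):

::

    # run the whole Python test suite
    python test/run_test.py

    # run every test in a single file
    python test/test_nn.py

    # run one individual test method
    python test/test_torch.py TestTorch.test_some_method
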
Technical Process
-----------------

Developing PyTorch
~~~~~~~~~~~~~~~~~~

To develop PyTorch on your machine, here are some tips:

1. Uninstall all existing PyTorch installs:

::

    conda uninstall pytorch
    pip uninstall torch
    pip uninstall torch # run this command twice

2. Clone a copy of PyTorch from source:

::

    git clone https://github.com/pytorch/pytorch
    cd pytorch

3. Install PyTorch in ``build develop`` mode:

A full set of instructions on installing PyTorch from source is here:
https://github.com/pytorch/pytorch#from-source

The change you have to make is to replace

::

    python setup.py install

with

::

    python setup.py build develop

This is especially useful if you are only changing Python files.

This mode will symlink the Python files from the current local source
tree into the Python install.

Hence, if you modify a Python file, you do not need to reinstall PyTorch
again and again.

For example:

-  Install local PyTorch in ``build develop`` mode
-  modify your Python file ``torch/__init__.py`` (for example)
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality
-  modify your Python file ``torch/__init__.py``
-  test functionality

You do not need to repeatedly install after modifying Python files.

In case you want to reinstall, make sure that you uninstall PyTorch
first by running ``pip uninstall torch`` and ``python setup.py clean``.
Then you can install in ``build develop`` mode again.
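
For example, a clean reinstall using the commands above could look like:

::

    pip uninstall torch
    python setup.py clean
    python setup.py build develop
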
Codebase structure
------------------

-  `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core
   library files that work everywhere, both server and mobile. We are
   slowly moving pieces from
   `ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
   here. This library is intended only to contain essential
   functionality, and appropriate to use in settings where binary size
   matters. (But you'll have a lot of missing functionality if you try
   to use it directly.)
-  `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++
   tensor library for PyTorch (no autograd support)

   -  `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__

      -  `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__
         `THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__
         `THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__
         `THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__
         - Legacy library code from the original Torch. Try not to add
         things here; we're slowly porting these to
         `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__.

         -  generic - Contains actual implementations of operators,
            parametrized over ``scalar_t``. Files here get compiled N
            times per supported scalar type in PyTorch.

      -  `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__

         -  `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
            - Core functionality of ATen. This is migrating to top-level
            c10 folder.
         -  `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__
            - Modern implementations of operators. If you want to write
            a new operator, here is where it should go. Most CPU
            operators go in the top level directory, except for
            operators which need to be compiled specially; see cpu
            below.

            -  `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__
               - Not actually CPU implementations of operators, but
               specifically implementations which are compiled with
               processor-specific instructions, like AVX. See the
               `README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__
               for more details.
            -  `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__
               - CUDA implementations of operators.
            -  `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__
               - CPU and CUDA implementations of COO sparse tensor
               operations
            -  `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__
               `mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__
               `miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__
               `cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__

               -  implementations of operators which simply bind to some
                  backend library.

-  `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ -
   The actual PyTorch library. Everything that is not in
   `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
   is a Python module, following the PyTorch Python frontend module
   structure.

   -  `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
      - C++ files composing the PyTorch library. Files in this directory
      tree are a mix of Python binding code, and C++ heavy lifting.
      Consult ``setup.py`` for the canonical list of Python binding
      files; conventionally, they are often prefixed with ``python_``.

      -  `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__
         - Compiler and frontend for TorchScript JIT frontend.
      -  `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__
         - Implementation of reverse-mode automatic differentiation.
      -  `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__
         - The PyTorch C++ frontend.
      -  `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__
         - Distributed training support for PyTorch.

-  `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ -
   Code generation scripts for the PyTorch library. See
   `README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__
   of this directory for more details.
-  `test <https://github.com/pytorch/pytorch/blob/master/tests>`__ -
   Python unit tests for PyTorch Python frontend.

   -  `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__
      - Basic tests for PyTorch functionality.
   -  `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__
      - Tests for non-NN automatic differentiation support.
   -  `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__
      - Tests for NN operators and their automatic differentiation.
   -  `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__
      - Tests for the JIT compiler and TorchScript.
   -  ...
   -  `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__
      - C++ unit tests for PyTorch C++ frontend.
   -  `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__
      - Automatically generated "expect" files which are used to compare
      against expected output.
   -  `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__
      - Tests for ONNX export functionality, using both PyTorch and
      Caffe2.

-  `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ -
   The Caffe2 library.

   -  `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__
      - Core files of Caffe2, e.g., tensor, workspace, blobs, etc.
			||||||
 | 
					   -  `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__
 | 
				
			||||||
 | 
					      - Operators of Caffe2.
 | 
				
			||||||
 | 
					   -  `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__
 | 
				
			||||||
 | 
					      - Python bindings to Caffe2.
 | 
				
			||||||
 | 
					   -  ...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
Unit Testing
------------

PyTorch's testing is located under ``test/``. Run the entire test suite
with

::

    python test/run_test.py

or run individual test files, like ``python test/test_nn.py``, for
individual test suites.

Better local unit tests with pytest
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We don't officially support ``pytest``, but it works well with our
``unittest`` tests and offers a number of useful features for local
development. Install it via ``pip install pytest``.

If you want to just run tests that contain a specific substring, you can
use the ``-k`` flag:

::

    pytest test/test_nn.py -k Loss -v

The above is an example of testing a change to Loss functions: this
command runs tests such as ``TestNN.test_BCELoss`` and
``TestNN.test_MSELoss`` and can be useful to save keystrokes.

Writing documentation
---------------------

PyTorch uses `Google
style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__
for formatting docstrings. Each line inside a docstring block must be
limited to 80 characters so that it fits into Jupyter documentation popups.

For C++ documentation (https://pytorch.org/cppdocs), we use
`Doxygen <http://www.doxygen.nl/>`__ and then convert it to
`Sphinx <http://www.sphinx-doc.org/>`__ via
`Breathe <https://github.com/michaeljones/breathe>`__ and
`Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen
reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__
for more information on the documentation syntax. To build the
documentation locally, ``cd`` into ``docs/cpp`` and then run ``make html``.

We run Doxygen in CI (Travis) to verify that you do not use invalid
Doxygen commands. To run this check locally, run ``./check-doxygen.sh``
from inside ``docs/cpp``.

Managing multiple build trees
-----------------------------

One downside to using ``python setup.py develop`` is that your
development version of PyTorch will be installed globally on your
account (e.g., if you run ``import torch`` anywhere else, the
development version will be used).

If you want to manage multiple builds of PyTorch, you can make use of
`conda environments <https://conda.io/docs/using/envs.html>`__ to
maintain separate Python package environments, each of which can be tied
to a specific build of PyTorch. To set one up:

::

    conda create -n pytorch-myfeature
    source activate pytorch-myfeature
    # if you run python now, torch will NOT be installed
    python setup.py build develop

C++ Development tips
--------------------

If you are working on the C++ code, there are a few important things
that you will want to keep in mind:

1. How to rebuild only the code you are working on.
2. How to make rebuilds in the absence of changes go faster.

Build only what you need.
~~~~~~~~~~~~~~~~~~~~~~~~~

``python setup.py build`` will build everything, but since our build
system is not very optimized for incremental rebuilds, this will
actually be very slow. Far better is to only request rebuilds of the
parts of the project you are working on:

-  Working on the Python bindings? Run ``python setup.py develop`` to
   rebuild (NB: no ``build`` here!)
-  Working on ``torch/csrc`` or ``aten``? Run
   ``python setup.py rebuild_libtorch`` to rebuild and avoid having to
   rebuild the other libraries we depend on.
-  Working on one of the other dependent libraries? The other valid
   targets are listed in ``dep_libs`` in ``setup.py``. Prepend
   ``build_`` to get a target, and run it as, e.g.,
   ``python setup.py build_gloo``.
-  Working on a test binary? Run
   ``(cd build && ninja bin/test_binary_name)`` to rebuild only that
   test binary (without rerunning cmake). (Replace ``ninja`` with
   ``make`` if you don't have ninja installed.)

On the initial build, you can also speed things up with the environment
variables ``DEBUG`` and ``NO_CUDA``.

-  ``DEBUG=1`` will enable debug builds (-g -O0)
-  ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations
   (-g -O3)
-  ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing
   on something not CUDA related), to save compile time.

For example:

::

    NO_CUDA=1 DEBUG=1 python setup.py build develop

Make sure you continue to pass these flags on subsequent builds.

Code completion and IDE support
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using ``python setup.py develop``, PyTorch will generate a
``compile_commands.json`` file that can be used by many editors to
provide command completion and error highlighting for PyTorch's C++
code. You need to ``pip install ninja`` to generate accurate information
for the code in ``torch/csrc``. More information at:

-  https://sarcasm.github.io/notes/dev/compilation-database.html

Make no-op build fast.
~~~~~~~~~~~~~~~~~~~~~~

Use Ninja
~~~~~~~~~

Python ``setuptools`` is pretty dumb, and always rebuilds every C file
in a project. If you install the ninja build system with
``pip install ninja``, then PyTorch will use it to track dependencies
correctly. If PyTorch was already built, you will need to run
``python setup.py clean`` once after installing ninja for builds to
succeed.

Use CCache
~~~~~~~~~~

Even when dependencies are tracked with file modification times, there are
many situations where files get rebuilt even though a previous compilation
was exactly the same.

Using ccache in a situation like this is a real time-saver. However, by
default, ccache does not properly support CUDA, so here are the
instructions for installing a custom ccache fork that has CUDA support:

::

    # install and export ccache
    if ! ls ~/ccache/bin/ccache
    then
        sudo apt-get update
        sudo apt-get install -y automake autoconf
        sudo apt-get install -y asciidoc
        mkdir -p ~/ccache
        pushd /tmp
        rm -rf ccache
        git clone https://github.com/colesbury/ccache -b ccbin
        pushd ccache
        ./autogen.sh
        ./configure
        make install prefix=~/ccache
        popd
        popd

        mkdir -p ~/ccache/lib
        mkdir -p ~/ccache/cuda
        ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
        ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
        ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
        ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc

        ~/ccache/bin/ccache -M 25Gi
    fi

    export PATH=~/ccache/lib:$PATH
    export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc

CUDA Development tips
---------------------

If you are working on the CUDA code, here are some useful CUDA debugging
tips:

1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug
   symbols (``-g -G``). This will be particularly helpful in debugging
   device code. However, it will slow down the build process by about
   50% (compared to only ``DEBUG=1``), so use it wisely.
2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging
   friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a
   CUDA tensor (rather than all zeros).

Hope this helps, and thanks for considering contributing.

Windows development tips
------------------------

Occasionally, you will write a patch which works on Linux, but fails CI
on Windows. There are a few aspects in which MSVC (the Windows compiler
toolchain we use) is stricter than Linux, which are worth keeping in
mind when fixing these problems.

1. Symbols are NOT exported by default on Windows; instead, you have to
   explicitly mark a symbol as exported/imported in a header file with
   ``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have
   codified this pattern into a set of macros which follow the
   convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen.
   (Every separate shared library needs a unique macro name, because
   symbol visibility is on a per shared library basis. See
   c10/macros/Macros.h for more details.) The upshot is that if you see an
   "unresolved external" error in your Windows build, this is probably
   because you forgot to mark a function with ``*_API``. However, there
   is one important counterexample to this principle: if you want a
   *templated* function to be instantiated at the call site, do NOT mark
   it with ``*_API`` (if you do mark it, you'll have to explicitly
   instantiate all of the specializations used by the call sites.)
2. If you link against a library, this does not make its dependencies
   transitively visible. You must explicitly specify a link dependency
   against every library whose symbols you use. (This is different from
   Linux where in most environments, transitive dependencies can be used
   to fulfill unresolved symbols.)
3. If you have a Windows box (we have a few on EC2 which you can request
   access to) and you want to run the build, the easiest way is to just
   run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run
   ``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing
   away your Conda environment.)

Even if you don't know anything about MSVC, you can use cmake to build
simple programs on Windows; this can be helpful if you want to learn
more about some peculiar linking behavior by reproducing it on a small
example. Here's a simple example cmake file that defines two dynamic
libraries, one linking with the other:

::

    project(myproject CXX)
    set(CMAKE_CXX_STANDARD 11)
    add_library(foo SHARED foo.cpp)
    add_library(bar SHARED bar.cpp)
    # NB: don't forget to __declspec(dllexport) at least one symbol from foo,
    # otherwise foo.lib will not be created.
    target_link_libraries(bar PUBLIC foo)

You can build it with:

::

    mkdir build
    cd build
    cmake ..
    cmake --build .

Known MSVC (and MSVC with NVCC) bugs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The PyTorch codebase sometimes likes to use exciting C++ features, and
these exciting features lead to exciting bugs in Windows compilers. To
add insult to injury, the error messages will often not tell you which
line of code actually induced the erroring template instantiation. We've
found the most effective way to debug these problems is to carefully
read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few
well known pitfalls and workarounds:

-  This is not actually a bug per se, but in general, code generated by
   MSVC is more sensitive to memory errors; you may have written some
   code that does a use-after-free or overflows the stack; on Linux the code
   might work, but on Windows your program will crash. ASAN may not
   catch all of these problems: stay vigilant to the possibility that
   your crash is due to a real memory problem.
-  (NVCC) ``c10::optional`` does not work when used from device code.
   Don't use it from kernels. Upstream issue:
   https://github.com/akrzemi1/Optional/issues/58 and our local issue
   #10329.
-  ``constexpr`` generally works less well on MSVC.

   -  The idiom ``static_assert(f() == f())`` to test if ``f`` is
      constexpr does not work; you'll get "error C2131: expression did
      not evaluate to a constant". Don't use these asserts on Windows.
      (Example: ``c10/util/intrusive_ptr.h``)

-  (NVCC) Code you access inside a ``static_assert`` will eagerly be
   evaluated as if it were device code, and so you might get an error
   that the code is "not accessible".

::

    class A {
      static A singleton_;
      static constexpr inline A* singleton() {
        return &singleton_;
      }
    };
    static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm");

-  The compiler will run out of heap space if you attempt to compile
   files that are too large. Splitting such files into separate files
   helps. (Example: ``THTensorMath``, ``THTensorMoreMath``,
   ``THTensorEvenMoreMath``.)
-  MSVC's preprocessor (but not the standard compiler) has a bug where
   it incorrectly tokenizes raw string literals, ending them when it sees a
   ``"``. This causes preprocessor tokens inside the literal, like
   an ``#endif``, to be incorrectly treated as preprocessor directives.
   See https://godbolt.org/z/eVTIJq as an example.

Running Clang-Tidy
~~~~~~~~~~~~~~~~~~

`Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a
C++ linter and static analysis tool based on the clang compiler. We run
clang-tidy in our CI to make sure that new C++ code is safe, sane and
efficient. See our
`.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__
file for the simple commands we use for this. To run clang-tidy locally,
follow these steps:

1. Install clang-tidy. First, check if you already have clang-tidy by
   simply writing ``clang-tidy`` in your terminal. If you don't yet have
   clang-tidy, you should be able to install it easily with your package
   manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu.
   See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on
   how to install the latest version. Note that newer versions of
   clang-tidy will have more checks than older versions. In our CI, we
   run clang-tidy-6.0.
2. Use our driver script to run clang-tidy over any changes relative to
   some git revision (you may want to replace ``HEAD~1`` with ``HEAD``
   to pick up uncommitted changes). Changes are picked up based on a
   ``git diff`` with the given revision:

::

    python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'

Above, it is assumed you are in the PyTorch root folder. The argument to
``-d`` should be the path to where you built PyTorch from source,
e.g. ``build`` in the PyTorch root folder if you used
``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change
the clang-tidy binary this script uses. Make sure you have PyYaml installed,
which is in PyTorch's ``requirements.txt``.

Pre-commit Tidy/Linting Hook
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We use clang-tidy and flake8 to perform additional formatting and
semantic checking of code. We provide a pre-commit git hook for
performing these checks before a commit is created:

::

    ln -s ../../tools/git-pre-commit .git/hooks/pre-commit

Caffe2 notes
------------

In 2018, we merged Caffe2 into the PyTorch source repository. While the
steady state aspiration is that Caffe2 and PyTorch share code freely, in
the meantime there will be some separation. If you submit a PR to only
PyTorch or only Caffe2 code, CI will only run for the project you
edited. The logic for this is implemented in
``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can
look at these to see what path prefixes constitute changes. This also
means that if you ADD a new top-level path, or you start sharing code between
projects, you need to modify these files. There are a few "unusual"
directories which, for historical reasons, are Caffe2/PyTorch specific.
Here they are:

-  ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``,
   ``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code
   in them without extra coordination.
-  ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools``
   are PyTorch-specific. Don't put Caffe2 code in them without extra
   coordination.

154 docs/source/community/governance.rst Normal file
@ -0,0 +1,154 @@
PyTorch Governance
==========================

Governance Philosophy and Guiding Tenets
-----------------------------------------

PyTorch adopts a governance structure with a small set of maintainers
driving the overall project direction, with a strong bias towards
PyTorch's design philosophy, where design and code contributions are
valued. Beyond the core maintainers, there is also a slightly broader
set of core developers that have the ability to directly merge pull
requests and own various parts of the core code base.

Beyond the maintainers and core devs, the community is encouraged to
contribute, file issues, make proposals, review pull requests and be
present in the community. Given contributions and willingness to
invest, anyone can be provided write access or ownership of parts of
the codebase.

Based on this governance structure, the project has the following core
operating tenets by which decisions are made and overall culture is
derived:

1. **Code contributions** matter much more than corporate sponsorship,
   and independent developers are highly valued.
2. **Project influence** is gained through contributions (whether PRs,
   forum answers, code reviews or otherwise).

Key people and their functions
------------------------------

Project Maintainers
~~~~~~~~~~~~~~~~~~~

Project maintainers provide leadership and direction for the PyTorch
project. Specifics include:

-  Articulate a cohesive long-term vision for the project
-  Possess a deep understanding of the PyTorch code base
-  Negotiate and resolve contentious issues in ways acceptable to all
   parties involved

PyTorch Maintainers:

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__)

Core Developers
~~~~~~~~~~~~~~~

The PyTorch project is developed by a team of core developers. You can
find the list of core developers at `PyTorch Governance \| Persons of
Interest </docs/community/persons_of_interest.html>`__.

While membership is determined by presence in the "PyTorch core" team in
the "PyTorch"
`organization <https://github.com/orgs/pytorch/teams/facebook>`__ on
GitHub, contribution takes many forms:

-  committing changes to the repository;
-  reviewing pull requests by others;
-  triaging bug reports on the issue tracker;
-  discussing topics on official PyTorch communication channels.

Moderators
~~~~~~~~~~

There is a group of people, some of whom are not core developers,
responsible for ensuring that discussions on official communication
channels adhere to the Code of Conduct. They take action in view of
violations and help to support a healthy community. You can find the
list of moderators `here <https://discuss.pytorch.org/about>`__.

Decision Making
---------------

Uncontroversial Changes
~~~~~~~~~~~~~~~~~~~~~~~

Primary work happens through bug tracker issues and pull requests on
GitHub. Core developers should avoid pushing their changes directly to
the PyTorch repository, instead relying on pull requests. Approving a
pull request by a core developer allows it to be merged without further
process. Core Developers and Project Maintainers ultimately approve
these changes.

Notifying relevant experts about a bug tracker issue or a pull request
is important. Reviews from experts in the given interest area are
strongly preferred, especially on pull request approvals. Failure to do
so might end up with the change being reverted by the relevant expert.

Controversial decision process
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Substantial changes in a given interest area require a GitHub issue to
be opened for discussion. This includes:

-  Any semantic or syntactic change to the framework.
-  Backwards-incompatible changes to the Python or C++ API.
-  Additions to the core framework, including substantial new
   functionality within an existing library.
-  Removing core features.

Project Maintainers ultimately approve these changes.

FAQ
---

**Q: What if I would like to own (or partly own) a part of the project
such as a domain API (e.g., Torch Vision)?** This is absolutely possible.
The first step is to start contributing to the existing project area and
contributing to its health and success. In addition to this, you can
make a proposal through a GitHub issue for new functionality or changes
to improve the project area.

**Q: What if I am a company looking to use PyTorch internally for
development, can I be granted or purchase a board seat to drive the
project direction?** No, the PyTorch project is strictly driven by the
maintainer-driven project philosophy and does not have a board or
vehicle to take financial contributions relating to gaining influence
over technical direction.

**Q: Does the PyTorch project support grants or ways to support
independent developers using or contributing to the project?** No, not
at this point. We are however looking at ways to better support the
community of independent developers around PyTorch. If you have
suggestions or inputs, please reach out on the PyTorch forums to
discuss.

**Q: How do I contribute code to the project?** If the change is
relatively minor, a pull request on GitHub can be opened up immediately
for review and merge by the project committers. For larger changes,
please open an issue to make a proposal and discuss it beforehand. Please also
see the **`PyTorch Contributor
Guide </docs/community/contribution_guide.html>`__** for contribution
guidelines.

**Q: Can I become a committer on the project?** Unfortunately, the
current commit process to PyTorch involves an interaction with Facebook
infrastructure that can only be triggered by Facebook employees. We are
however looking at ways to expand the committer base to individuals
outside of Facebook and will provide an update when the tooling exists
to allow this.

**Q: What if I would like to deliver a PyTorch tutorial at a conference
or otherwise? Do I need to be 'officially' a committer to do this?** No,
we encourage community members to showcase their work wherever and
whenever they can. Please reach out to
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
for marketing support.

130 docs/source/community/persons_of_interest.rst Normal file
@ -0,0 +1,130 @@
PyTorch Governance | Persons of Interest
=========================================

General Maintainers
-------------------

-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
-  (sunsetting) Sam Gross
   (`colesbury <https://github.com/colesbury>`__)

Module-level maintainers
------------------------

JIT
~~~

-  Zach Devito (`zdevito <https://github.com/zdevito>`__)
-  Michael Suo (`suo <https://github.com/suo>`__)

Distributed
~~~~~~~~~~~

-  Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
-  Shen Li (`mrshenli <https://github.com/mrshenli>`__)
-  (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__)

Autograd Engine
~~~~~~~~~~~~~~~

-  Alban Desmaison (`alband <https://github.com/alband>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Multiprocessing and DataLoaders
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  Simon Wang (`SsnL <https://github.com/SsnL>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  (proposed) Vitaly Fedyunin
   (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)

CUDA
~~~~

-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

C++
~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  (sunsetting) Peter Goldsborough
   (`goldsborough <https://github.com/goldsborough>`__)

Build + CI
~~~~~~~~~~

-  Will Feng (`yf225 <https://github.com/yf225>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
-  Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  (sunsetting) Orion Reblitz-Richardson
   (`orionr <https://github.com/orionr>`__)

Distributions & RNG
~~~~~~~~~~~~~~~~~~~

-  Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__)
-  Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__)
-  Alican Bozkurt (`alicanb <https://github.com/alicanb>`__)
-  Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)

C10
~~~

-  Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
-  Edward Yang (`ezyang <https://github.com/ezyang>`__)

ONNX <-> PyTorch
~~~~~~~~~~~~~~~~

-  Lu Fang (`houseroad <https://github.com/houseroad>`__)

torch.nn
~~~~~~~~

-  Thomas Viehmann (`t-vi <https://github.com/t-vi>`__)
-  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
-  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
-  Soumith Chintala (`soumith <https://github.com/soumith>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)

CPU Performance / SIMD
~~~~~~~~~~~~~~~~~~~~~~

-  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
-  Sam Gross (`colesbury <https://github.com/colesbury>`__)
-  Richard Zou (`zou3519 <https://github.com/zou3519>`__)

AMD/ROCm/HIP
~~~~~~~~~~~~

-  Junjie Bai (`bddppq <https://github.com/bddppq>`__)
-  Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__)

Windows
~~~~~~~

-  Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)

MKLDNN
~~~~~~

-  Yinghai Lu (`yinghai <https://github.com/yinghai>`__)

XLA
~~~

-  Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
-  Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
-  Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
-  Alex Suhan (`asuhan <https://github.com/asuhan>`__)

PPC
~~~

-  Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)

@ -1,6 +1,101 @@
torch.hub
===================================
PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.

Publishing models
-----------------

PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights)
to a GitHub repository by adding a simple ``hubconf.py`` file.

``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with
the following signature:

::

    def entrypoint_name(pretrained=False, *args, **kwargs):
        ...

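Since a single ``hubconf.py`` can expose several entrypoints, a repository that
publishes two models simply defines two such functions side by side. The sketch
below is purely illustrative; the model names are hypothetical placeholders:

::

    # hubconf.py -- illustrative sketch with two entrypoints
    dependencies = ['torch']

    def my_small_model(pretrained=False, *args, **kwargs):
        ...

    def my_large_model(pretrained=False, *args, **kwargs):
        ...
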
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint
for the ``resnet18`` model. You can see the full script in the
`pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_

::

    dependencies = ['torch', 'math']

    def resnet18(pretrained=False, *args, **kwargs):
        """
        Resnet18 model
        pretrained (bool): a recommended kwarg for all entrypoints
        args & kwargs are arguments for the function
        """
        ######## Call the model in the repo ###############
        from torch.utils import model_zoo  # provides load_url for downloading checkpoints
        from torchvision.models.resnet import resnet18 as _resnet18
        model = _resnet18(*args, **kwargs)
        ######## End of call ##############################
        # The following logic is REQUIRED
        if pretrained:
            # For weights saved in the local repo
            # model.load_state_dict(<path_to_saved_file>)

            # For weights saved elsewhere
            checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
            model.load_state_dict(model_zoo.load_url(checkpoint, progress=False))
        return model

- ``dependencies`` variable is a **list** of package names required to run the model.
- Pretrained weights can either be stored locally in the GitHub repo, or loadable by
  ``model_zoo.load_url()``.
- ``pretrained`` controls whether to load the pre-trained weights provided by repo owners.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- Docstring of the function works as a help message, explaining what the model does
  and what the allowed arguments are.
- Entrypoint function should **ALWAYS** return a model (``nn.Module``).

Important Notice
^^^^^^^^^^^^^^^^

- The published models should be at least in a branch/tag. It can't be a random commit.

Loading models from Hub
-----------------------

Users can load the pre-trained models using the ``torch.hub.load()`` API.


.. automodule:: torch.hub
.. autofunction:: load

Here's an example of loading the ``resnet18`` entrypoint from the ``pytorch/vision`` repo.

::

    hub_model = hub.load(
        'pytorch/vision:master', # repo_owner/repo_name:branch
        'resnet18', # entrypoint
        1234, # args for callable [not applicable to resnet]
        pretrained=True) # kwargs for callable

Where are my downloaded model & weights saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The locations are used in the order of:

- ``hub_dir``: a user-specified path. It can be set in the following ways:

  - Setting the environment variable ``TORCH_HUB_DIR``
  - Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``

- ``~/.torch/hub``

.. autofunction:: set_dir

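For example, redirecting the hub cache before loading anything might look like
the following sketch; the path below is just an arbitrary writable directory
used for illustration:

::

    import torch.hub as hub

    # any writable directory works; this overrides the default ~/.torch/hub
    hub.set_dir('/tmp/torch_hub')
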
Caching logic
^^^^^^^^^^^^^

By default, we don't clean up files after loading them. Hub uses the cache by default if it already exists in ``hub_dir``.

Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
when updates are published to the same branch, so users can keep up with the latest release.

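Putting this together with the ``resnet18`` example above, a forced refresh is
just an extra keyword argument (a minimal sketch):

::

    import torch.hub as hub

    # discard the cached checkout and weights, then download them again
    model = hub.load('pytorch/vision:master', 'resnet18', pretrained=True,
                     force_reload=True)
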
@ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.

   notes/*

.. toctree::
  :glob:
  :maxdepth: 1
  :caption: Community

  community/*

.. toctree::
   :maxdepth: 1

@ -1,4 +1,4 @@
TorchScript
============

.. contents:: :local:
@ -6,17 +6,17 @@ Torch Script
.. automodule:: torch.jit
.. currentmodule:: torch.jit

TorchScript is a way to create serializable and optimizable models from PyTorch code.
Any code written in TorchScript can be saved from your Python
process and loaded in a process where there is no Python dependency.

We provide tools to incrementally transition a model from being a pure Python program
to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program.
This makes it possible to train models in PyTorch using familiar tools and then export
the model to a production environment where it is not a good idea to run models as Python programs,
for performance and multi-threading reasons.

Creating TorchScript Code
--------------------------

@ -117,26 +117,26 @@ Example:
 | 
				
			|||||||
            return self.resnet(input - self.means)
 | 
					            return self.resnet(input - self.means)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Torch Script Language Reference
 | 
					TorchScript Language Reference
 | 
				
			||||||
-------------------------------
 | 
					-------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Torch Script is a subset of Python that can either be written directly (using
 | 
					TorchScript is a subset of Python that can either be written directly (using
 | 
				
			||||||
the @script annotations) or generated automatically from Python code via
 | 
					the @script annotations) or generated automatically from Python code via
 | 
				
			||||||
tracing. When using tracing, code is automatically converted into this subset of
 | 
					tracing. When using tracing, code is automatically converted into this subset of
 | 
				
			||||||
Python by recording only the actual operators on tensors and simply executing and
 | 
					Python by recording only the actual operators on tensors and simply executing and
 | 
				
			||||||
discarding the other surrounding Python code.
 | 
					discarding the other surrounding Python code.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
When writing Torch Script directly using @script annotations, the programmer must
 | 
					When writing TorchScript directly using @script annotations, the programmer must
 | 
				
			||||||
only use the subset of Python supported in Torch Script. This section documents
 | 
					only use the subset of Python supported in TorchScript. This section documents
 | 
				
			||||||
what is supported in Torch Script as if it were a language reference for a stand
 | 
					what is supported in TorchScript as if it were a language reference for a stand
 | 
				
			||||||
alone language. Any features of Python not mentioned in this reference are not
 | 
					alone language. Any features of Python not mentioned in this reference are not
 | 
				
			||||||
part of Torch Script.
 | 
					part of TorchScript.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
As a subset of Python any valid Torch Script function is also a valid Python
 | 
					As a subset of Python any valid TorchScript function is also a valid Python
 | 
				
			||||||
function. This makes it possible to remove the @script annotations and debug the
 | 
					function. This makes it possible to remove the @script annotations and debug the
 | 
				
			||||||
function using standard Python tools like pdb. The reverse is not true: there
 | 
					function using standard Python tools like pdb. The reverse is not true: there
 | 
				
			||||||
are many valid python programs that are not valid Torch Script programs.
 | 
					are many valid python programs that are not valid TorchScript programs.
 | 
				
			||||||
Instead, Torch Script focuses specifically on the features of Python that are
 | 
					Instead, TorchScript focuses specifically on the features of Python that are
 | 
				
			||||||
needed to represent neural network models in Torch.
 | 
					needed to represent neural network models in Torch.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. envvar:: PYTORCH_JIT=1
 | 
					.. envvar:: PYTORCH_JIT=1
 | 
				
			||||||
@@ -150,9 +150,9 @@ needed to represent neural network models in Torch.
 Types
 ~~~~~
 
-The largest difference between Torch Script and the full Python language is that
-Torch Script only support a small set of types that are needed to express neural
-net models. In particular Torch Script supports:
+The largest difference between TorchScript and the full Python language is that
+TorchScript only support a small set of types that are needed to express neural
+net models. In particular TorchScript supports:
 
 ``Tensor``
     A PyTorch tensor of any dtype, dimension, or backend.
@@ -169,8 +169,8 @@ net models. In particular Torch Script supports:
 ``List[T]``
     A list of which all members are type ``T``
 
-Unlike Python, each variable in Torch Script function must have a single static type.
-This makes it easier to optimize Torch Script functions.
+Unlike Python, each variable in TorchScript function must have a single static type.
+This makes it easier to optimize TorchScript functions.
 
 Example::
 
@@ -183,9 +183,9 @@ Example::
         return r # Type mismatch: r is set to type Tensor in the true branch
                  # and type int in the false branch
 
-By default, all parameters to a Torch Script function are assumed to be Tensor
+By default, all parameters to a TorchScript function are assumed to be Tensor
 because this is the most common type used in modules. To specify that an
-argument to a Torch Script function is another type, it is possible to use
+argument to a TorchScript function is another type, it is possible to use
 MyPy-style type annotations using the types listed above:
 
 Example::
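As an aside, a minimal sketch of such a MyPy-style annotation (the function and argument names here are illustrative, not from this diff; the comment form is one way to attach the types)::

    import torch
    from typing import Tuple

    @torch.jit.script
    def scale_and_report(x, factor):
        # type: (torch.Tensor, int) -> Tuple[torch.Tensor, int]
        # `factor` is declared as int; without the annotation it would be assumed to be a Tensor
        return x * factor, factor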
@@ -264,7 +264,7 @@ Subscripts
   ``t[i:j, i]``
 
   .. note::
-    Torch Script currently does not support mutating tensors in place, so any
+    TorchScript currently does not support mutating tensors in place, so any
     tensor indexing can only appear on the right-hand size of an expression.
 
 Function calls
@@ -328,7 +328,7 @@ Accessing Module Parameters
 Statements
 ~~~~~~~~~~
 
-Torch Script supports the following types of statements:
+TorchScript supports the following types of statements:
 
 Simple Assignments
 
@@ -438,7 +438,7 @@ Return
 Variable Resolution
 ~~~~~~~~~~~~~~~~~~~
 
-Torch Script supports a subset of Python's variable resolution (i.e. scoping)
+TorchScript supports a subset of Python's variable resolution (i.e. scoping)
 rules. Local variables behave the same as in Python, except for the restriction
 that a variable must have the same type along all paths through a function.
 If a variable has a different type on different sides of an if statement, it
@@ -456,23 +456,23 @@ Example::
         print(y) # Error: undefined value y
 
 Non-local variables are resolved to Python values at compile time when the
-function is defined. These values are then converted into Torch Script values using
+function is defined. These values are then converted into TorchScript values using
 the rules described in `Use of Python Values`_.
 
 Use of Python Values
 ~~~~~~~~~~~~~~~~~~~~
 
-To make writing Torch Script more convenient, we allow script code to refer
+To make writing TorchScript more convenient, we allow script code to refer
 to Python values in the surrounding scope. For instance, any time there is a
-reference to ``torch``, the Torch Script compiler is actually resolving it to the
+reference to ``torch``, the TorchScript compiler is actually resolving it to the
 ``torch`` Python module when the function is declared.  These Python values are
-not a first class part of Torch Script. Instead they are desugared at compile-time
-into the primitive types that Torch Script supports. This section describes the
-rules that are used when accessing Python values in Torch Script. They depend
+not a first class part of TorchScript. Instead they are desugared at compile-time
+into the primitive types that TorchScript supports. This section describes the
+rules that are used when accessing Python values in TorchScript. They depend
 on the dynamic type of the python valued referenced.
 
 Functions
-  Torch Script can call python functions. This functionality is very useful when
+  TorchScript can call python functions. This functionality is very useful when
   incrementally converting a model into script. The model can be moved function-by-function
   to script, leaving calls to Python functions in place. This way you can incrementally
   check the correctness of the model as you go.
@@ -495,12 +495,12 @@ Functions
 
 
 Attribute Lookup On Python Modules
-    Torch Script can lookup attributes on modules. Builtin functions like ``torch.add``
-    are accessed this way. This allows Torch Script to call functions defined in
+    TorchScript can lookup attributes on modules. Builtin functions like ``torch.add``
+    are accessed this way. This allows TorchScript to call functions defined in
     other modules.
 
 Python-defined Constants
-    Torch Script also provides a way to use constants that are defined in Python.
+    TorchScript also provides a way to use constants that are defined in Python.
     These can be used to hard-code hyper-parameters into the function, or to
     define universal constants. There are two ways of specifying that a Python
     value should be treated as a constant.
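One of those ways, listing members in the ``__constants__`` attribute of a ``ScriptModule``, might look like the following sketch (the module and attribute names are illustrative, not from this diff)::

    import torch

    class Scaler(torch.jit.ScriptModule):
        __constants__ = ['alpha']          # bake self.alpha into the compiled graph as a constant

        def __init__(self):
            super(Scaler, self).__init__()
            self.alpha = 2.0               # an ordinary Python float

        @torch.jit.script_method
        def forward(self, x):
            return x * self.alpha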
@@ -597,36 +597,35 @@ Interpreting Graphs
 
     The example script above produces the graph::
 
-        graph(%len : int) {
-          %13 : float = prim::Constant[value=1]()
-          %10 : int = prim::Constant[value=10]()
-          %2 : int = prim::Constant[value=4]()
-          %1 : int = prim::Constant[value=3]()
-          %3 : int[] = prim::ListConstruct(%1, %2)
-          %4 : int = prim::Constant[value=6]()
-          %5 : int = prim::Constant[value=0]()
-          %6 : int[] = prim::Constant[value=[0, -1]]()
-          %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)
-          %8 : int = prim::Constant[value=1]()
-          %rv : Dynamic = prim::Loop(%len, %8, %rv.1)
-            block0(%i : int, %12 : Dynamic) {
-              %11 : int = aten::lt(%i, %10)
-              %rv.4 : Dynamic = prim::If(%11)
-                block0() {
-                  %14 : int = prim::Constant[value=1]()
-                  %rv.2 : Dynamic = aten::sub(%12, %13, %14)
-                  -> (%rv.2)
-                }
-                block1() {
-                  %16 : int = prim::Constant[value=1]()
-                  %rv.3 : Dynamic = aten::add(%12, %13, %16)
-                  -> (%rv.3)
-                }
-              %19 : int = prim::Constant[value=1]()
-              -> (%19, %rv.4)
-            }
-          return (%rv);
-        }
+        graph(%len : int) {
+          %15 : int = prim::Constant[value=1]()
+          %9 : bool = prim::Constant[value=1]()
+          %7 : Device = prim::Constant[value="cpu"]()
+          %6 : int = prim::Constant[value=0]()
+          %5 : int = prim::Constant[value=6]()
+          %1 : int = prim::Constant[value=3]()
+          %2 : int = prim::Constant[value=4]()
+          %11 : int = prim::Constant[value=10]()
+          %14 : float = prim::Constant[value=1]()
+          %4 : int[] = prim::ListConstruct(%1, %2)
+          %rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
+          %rv : Tensor = prim::Loop(%len, %9, %rv.1)
+            block0(%i : int, %13 : Tensor) {
+              %12 : bool = aten::lt(%i, %11)
+              %rv.4 : Tensor = prim::If(%12)
+                block0() {
+                  %rv.2 : Tensor = aten::sub(%13, %14, %15)
+                  -> (%rv.2)
+                }
+                block1() {
+                  %rv.3 : Tensor = aten::add(%13, %14, %15)
+                  -> (%rv.3)
+                }
+              -> (%9, %rv.4)
+            }
+          return (%rv);
+        }
 
     Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
     example. ``%rv.1 : Dynamic`` means we assign the output to a (unique)
@@ -676,34 +675,39 @@ Automatic Trace Checking
         traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
 
     Gives us the following diagnostic information::
 
-        ERROR: Graphs differed across invocations!
-        Graph diff:
-            graph(%0 : Dynamic) {
-                  %1 : int = prim::Constant[value=0]()
-                  %2 : int = prim::Constant[value=0]()
-                  %3 : Dynamic = aten::select(%0, %1, %2)
-                  %4 : int = prim::Constant[value=0]()
-                  %5 : int = prim::Constant[value=0]()
-                  %6 : Dynamic = aten::select(%0, %4, %5)
-                  %7 : Dynamic = aten::mul(%3, %6)
-                  %8 : int = prim::Constant[value=0]()
-                  %9 : int = prim::Constant[value=1]()
-                  %10 : Dynamic = aten::select(%0, %8, %9)
-                  %11 : Dynamic = aten::mul(%7, %10)
-                  %12 : int = prim::Constant[value=0]()
-                  %13 : int = prim::Constant[value=2]()
-                  %14 : Dynamic = aten::select(%0, %12, %13)
-                  %15 : Dynamic = aten::mul(%11, %14)
-              +   %16 : int = prim::Constant[value=0]()
-              +   %17 : int = prim::Constant[value=3]()
-              +   %18 : Dynamic = aten::select(%0, %16, %17)
-              +   %19 : Dynamic = aten::mul(%15, %18)
-              -   return (%15);
-              ?             ^
-              +   return (%19);
-              ?             ^
-            }
+        ERROR: Graphs differed across invocations!
+        Graph diff::
+
+              graph(%x : Tensor) {
+                %1 : int = prim::Constant[value=0]()
+                %2 : int = prim::Constant[value=0]()
+                %result.1 : Tensor = aten::select(%x, %1, %2)
+                %4 : int = prim::Constant[value=0]()
+                %5 : int = prim::Constant[value=0]()
+                %6 : Tensor = aten::select(%x, %4, %5)
+                %result.2 : Tensor = aten::mul(%result.1, %6)
+                %8 : int = prim::Constant[value=0]()
+                %9 : int = prim::Constant[value=1]()
+                %10 : Tensor = aten::select(%x, %8, %9)
+            -   %result : Tensor = aten::mul(%result.2, %10)
+            +   %result.3 : Tensor = aten::mul(%result.2, %10)
+            ?          ++
+                %12 : int = prim::Constant[value=0]()
+                %13 : int = prim::Constant[value=2]()
+                %14 : Tensor = aten::select(%x, %12, %13)
+            +   %result : Tensor = aten::mul(%result.3, %14)
+            +   %16 : int = prim::Constant[value=0]()
+            +   %17 : int = prim::Constant[value=3]()
+            +   %18 : Tensor = aten::select(%x, %16, %17)
+            -   %15 : Tensor = aten::mul(%result, %14)
+            ?     ^                                 ^
+            +   %19 : Tensor = aten::mul(%result, %18)
+            ?     ^                                 ^
+            -   return (%15);
+            ?             ^
+            +   return (%19);
+            ?             ^
+              }
 
 
     This message indicates to us that the computation differed between when
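For reference, ``check_inputs`` is simply a list of extra argument tuples that the tracer re-runs and compares against the original graph; a minimal sketch of how it is passed (the function body and shapes are illustrative, not from this diff)::

    import torch

    def loop_fn(x):
        result = x[0]
        for i in range(x.size(0)):
            result = result * x[i]
        return result

    inputs = (torch.rand(3, 4, 4),)
    check_inputs = [(torch.rand(4, 5, 5),), (torch.rand(2, 3, 3),)]
    # With a data-dependent loop like this, the checker reports that the traced
    # graphs differ across invocations, as in the diagnostic output above.
    traced = torch.jit.trace(loop_fn, inputs, check_inputs=check_inputs)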
@@ -733,23 +737,19 @@ Automatic Trace Checking
 
     Which produces::
 
-        graph(%x : Dynamic) {
-          %1 : int = prim::Constant[value=0]()
-          %2 : int = prim::Constant[value=0]()
-          %result.1 : Dynamic = aten::select(%x, %2, %1)
-          %4 : int = aten::size(%x, %1)
-          %5 : int = prim::Constant[value=1]()
-          %result : Dynamic = prim::Loop(%4, %5, %result.1)
-            block0(%i : int, %7 : Dynamic) {
-              %9 : int = prim::Constant[value=0]()
-              %10 : Dynamic = aten::select(%x, %9, %i)
-              %result.2 : Dynamic = aten::mul(%7, %10)
-              %12 : int = prim::Constant[value=1]()
-              -> (%12, %result.2)
-            }
-          return (%result);
-        }
+        graph(%x : Tensor) {
+          %5 : bool = prim::Constant[value=1]()
+          %1 : int = prim::Constant[value=0]()
+          %result.1 : Tensor = aten::select(%x, %1, %1)
+          %4 : int = aten::size(%x, %1)
+          %result : Tensor = prim::Loop(%4, %5, %result.1)
+            block0(%i : int, %7 : Tensor) {
+              %10 : Tensor = aten::select(%x, %1, %i)
+              %result.2 : Tensor = aten::mul(%7, %10)
+              -> (%5, %result.2)
+            }
+          return (%result);
+        }
 
 Tracer Warnings
     The tracer produces warnings for several problematic patterns in traced
@@ -789,14 +789,24 @@ Tracer Warnings
 Builtin Functions
 ~~~~~~~~~~~~~~~~~
 
-Torch Script supports a subset of the builtin tensor and neural network functions that
-PyTorch provides. Most methods on Tensor as well as functions in the ``torch``
-namespace are available. Many functions in ``torch.nn.functional`` are also availiable.
+Torch Script supports a subset of the builtin tensor and neural network
+functions that PyTorch provides. Most methods on Tensor as well as functions in
+the ``torch`` namespace, all functions in ``torch.nn.functional`` and all
+modules from ``torch.nn`` are supported in Torch Script, excluding those in the
+table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`.
 
+Unsupported ``torch.nn`` Modules  ::
+
+    torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss
+    torch.nn.modules.normalization.CrossMapLRN2d
+    torch.nn.modules.fold.Fold
+    torch.nn.modules.fold.Unfold
+    torch.nn.modules.rnn.GRU
+    torch.nn.modules.rnn.LSTM
+    torch.nn.modules.rnn.RNN
+    torch.nn.modules.rnn.GRUCell
+    torch.nn.modules.rnn.LSTMCell
+    torch.nn.modules.rnn.RNNCell
+
-We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or
-``Conv`` module. This functionality is something that will be developed in the future.
-For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn``
-modules into ScriptModules on construction.
 
 .. automodule:: torch.jit.supported_ops
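Following the suggestion in the new text, a module from the unsupported list (for example ``torch.nn.LSTM``) can still be captured by tracing; a sketch with illustrative sizes, not taken from this diff::

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=10, hidden_size=20)
    example = torch.randn(5, 3, 10)              # (seq_len, batch, input_size)
    traced_lstm = torch.jit.trace(lstm, (example,))
    output, (h_n, c_n) = traced_lstm(example)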
@@ -338,6 +338,7 @@ view of a storage and defines numeric operations on it.
    .. automethod:: reshape_as
    .. automethod:: resize_
    .. automethod:: resize_as_
+   .. automethod:: roll
    .. automethod:: round
    .. automethod:: round_
    .. automethod:: rsqrt
@@ -269,6 +269,7 @@ Other Operations
 .. autofunction:: histc
 .. autofunction:: meshgrid
 .. autofunction:: renorm
+.. autofunction:: roll
 .. autofunction:: tensordot
 .. autofunction:: trace
 .. autofunction:: tril
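The newly documented ``roll`` shifts tensor elements along a dimension with wrap-around; a quick illustration::

    import torch

    x = torch.arange(6).view(2, 3)    # [[0, 1, 2], [3, 4, 5]]
    torch.roll(x, shifts=1, dims=1)   # [[2, 0, 1], [5, 3, 4]]
    x.roll(1, 0)                      # [[3, 4, 5], [0, 1, 2]]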
@@ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
 
 if (BUILD_CAFFE2_OPS)
-  #cmake only check for separate OpenMP library on AppleClang 7+
-  #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
-  if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
-        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
-      Set(OpenMP_link ${OpenMP_libomp_LIBRARY})
-    endif()
-  endif()
-
   # Note(ilijar): Since Detectron ops currently have no
   # CPU implementation, we only build GPU ops for now.
   if (USE_CUDA)
@@ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS)
         ${Detectron_CPU_SRCS}
         ${Detectron_GPU_SRCS})
 
-    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu)
     install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
   elseif(NOT IOS_PLATFORM)
     add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
-    target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops caffe2)
     install(TARGETS caffe2_detectron_ops DESTINATION lib)
   endif()
 endif()
setup.py (166 changed lines)

@@ -124,6 +124,7 @@
 #   LD_LIBRARY_PATH
 #     we will search for libraries in these paths
 
+from __future__ import print_function
 from setuptools import setup, Extension, distutils, Command, find_packages
 import setuptools.command.build_ext
 import setuptools.command.install
@@ -144,86 +145,32 @@ import json
 import glob
 import importlib
 
-from tools.setup_helpers.env import check_env_flag, check_negative_env_flag
-
-
-def hotpatch_var(var, prefix='USE_'):
-    if check_env_flag('NO_' + var):
-        os.environ[prefix + var] = '0'
-    elif check_negative_env_flag('NO_' + var):
-        os.environ[prefix + var] = '1'
-    elif check_env_flag('WITH_' + var):
-        os.environ[prefix + var] = '1'
-    elif check_negative_env_flag('WITH_' + var):
-        os.environ[prefix + var] = '0'
-
-# Before we run the setup_helpers, let's look for NO_* and WITH_*
-# variables and hotpatch environment with the USE_* equivalent
-use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED',
-                'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS']
-list(map(hotpatch_var, use_env_vars))
-
-# Also hotpatch a few with BUILD_* equivalent
-build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS']
-[hotpatch_var(v, 'BUILD_') for v in build_env_vars]
-
-from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION
-from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST,
-                                       BUILD_CAFFE2_OPS, USE_LEVELDB,
-                                       USE_LMDB, USE_OPENCV, USE_FFMPEG)
-from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION
-from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY,
-                                       CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
-from tools.setup_helpers.fbgemm import USE_FBGEMM
-from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY,
-                                        MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
-from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
-    NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
-from tools.setup_helpers.nnpack import USE_NNPACK
-from tools.setup_helpers.qnnpack import USE_QNNPACK
-from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
+# If you want to modify flags or environmental variables that is set when
+# building torch, you should do it in tools/setup_helpers/configure.py.
+# Please don't add it here unless it's only used in PyTorch.
+from tools.setup_helpers.configure import *
 from tools.setup_helpers.generate_code import generate_code
 from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext
-from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \
-    USE_GLOO_IBVERBS
 
 ################################################################################
 # Parameters parsed from environment
 ################################################################################
 
-DEBUG = check_env_flag('DEBUG')
-REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO')
-IS_WINDOWS = (platform.system() == 'Windows')
-IS_DARWIN = (platform.system() == 'Darwin')
-IS_LINUX = (platform.system() == 'Linux')
-IS_PPC = (platform.machine() == 'ppc64le')
-IS_ARM = (platform.machine() == 'aarch64')
-
-BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
-# ppc64le and aarch64 do not support MKLDNN
-if IS_PPC or IS_ARM:
-    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF')
+VERBOSE_SCRIPT = True
+# see if the user passed a quiet flag to setup.py arguments and respect
+# that in our parts of the build
+for arg in sys.argv:
+    if arg == "--":
+        break
+    if arg == '-q' or arg == '--quiet':
+        VERBOSE_SCRIPT = False
+
+if VERBOSE_SCRIPT:
+    def report(*args):
+        print(*args)
 else:
-    USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON')
-
-USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
-RERUN_CMAKE = True
-
-NUM_JOBS = multiprocessing.cpu_count()
-max_jobs = os.getenv("MAX_JOBS")
-if max_jobs is not None:
-    NUM_JOBS = min(NUM_JOBS, int(max_jobs))
-
-ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE")
-if not ONNX_NAMESPACE:
-    ONNX_NAMESPACE = "onnx_torch"
-
-# Ninja
-try:
-    import ninja
-    USE_NINJA = True
-except ImportError:
-    USE_NINJA = False
+    def report(*args):
+        pass
 
 # Constant known variables used throughout this file
 cwd = os.path.dirname(os.path.abspath(__file__))
@@ -323,8 +270,9 @@ def build_libs(libs):
         build_libs_cmd = ['tools\\build_pytorch_libs.bat']
     else:
         build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')]
-    my_env = os.environ.copy()
-    my_env["PYTORCH_PYTHON"] = sys.executable
+
+    my_env, extra_flags = get_pytorch_env_with_flags()
+    build_libs_cmd.extend(extra_flags)
     my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library
     my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir
     my_env["PYTORCH_BUILD_VERSION"] = version
@@ -334,64 +282,8 @@ def build_libs(libs):
         cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path
     my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path
 
-    my_env["NUM_JOBS"] = str(NUM_JOBS)
-    my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE
-    if not IS_WINDOWS:
-        if USE_NINJA:
-            my_env["CMAKE_GENERATOR"] = '-GNinja'
-            my_env["CMAKE_INSTALL"] = 'ninja install'
-        else:
-            my_env['CMAKE_GENERATOR'] = ''
-            my_env['CMAKE_INSTALL'] = 'make install'
-    if USE_SYSTEM_NCCL:
-        my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
-        my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
-        my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
-    if USE_CUDA:
-        my_env["CUDA_BIN_PATH"] = CUDA_HOME
-        build_libs_cmd += ['--use-cuda']
-        if IS_WINDOWS:
-            my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME
-    if USE_CUDA_STATIC_LINK:
-        build_libs_cmd += ['--cuda-static-link']
-    if USE_FBGEMM:
-        build_libs_cmd += ['--use-fbgemm']
-    if USE_ROCM:
-        build_libs_cmd += ['--use-rocm']
-    if USE_NNPACK:
-        build_libs_cmd += ['--use-nnpack']
-    if USE_NUMPY:
-        my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR
-    if USE_CUDNN:
-        my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
-        my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
-        my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
-    if USE_MIOPEN:
-        my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR
-        my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY
-        my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR
-    if USE_MKLDNN:
-        build_libs_cmd += ['--use-mkldnn']
-    if USE_QNNPACK:
-        build_libs_cmd += ['--use-qnnpack']
-    if USE_GLOO_IBVERBS:
-        build_libs_cmd += ['--use-gloo-ibverbs']
-    if not RERUN_CMAKE:
-        build_libs_cmd += ['--dont-rerun-cmake']
-
-    my_env["BUILD_TORCH"] = "ON"
-    my_env["BUILD_PYTHON"] = "ON"
-    my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF"
-    my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF"
-    my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF"
-    my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF"
-    my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF"
-    my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF"
-    my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
-    my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
-    my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
-    my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"
+    if VERBOSE_SCRIPT:
+        my_env['VERBOSE_SCRIPT'] = '1'
 
     try:
         os.mkdir('build')
     except OSError:
@@ -660,6 +552,16 @@ class build_ext(build_ext_parent):
         return outputs
 
 
+# this is a subclass of build just to get access to self.build_lib
+# as there does not seem to be an utility function getting this
+class create_pyi(distutils.command.build.build):
+    def run(self):
+        print("-- Building .pyi --")
+        if sys.version_info[0] == 3:
+            from tools.pyi.gen_pyi import gen_pyi
+            gen_pyi(self.build_lib)
+
+
 class build(distutils.command.build.build):
     sub_commands = [
         ('build_deps', lambda self: True),
@@ -914,6 +816,7 @@ if USE_CUDA:
 
 cmdclass = {
     'create_version_file': create_version_file,
+    'create_pyi': create_pyi,
     'build': build,
     'build_py': build_py,
     'build_ext': build_ext,
@@ -946,6 +849,7 @@ if __name__ == '__main__':
         entry_points=entry_points,
         package_data={
             'torch': [
+                '__init__.pyi',
                 'lib/*.so*',
                 'lib/*.dylib*',
                 'lib/*.dll',
@@ -458,6 +458,10 @@ method_tests = [
      NO_ARGS, [skipIfNoLapack]),
     ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2",
      NO_ARGS, [skipIfNoLapack]),
+    ('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"),
+    ('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"),
+    ('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"),
+    ('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"),
     ('addcmul', (S, S), ((S, S), (S, S))),
     ('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'),
     ('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'),
@@ -560,8 +564,14 @@ method_tests = [
     ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
     ('tril', (M, M), NO_ARGS),
     ('tril', (M, M), (2,), 'idx'),
+    ('tril', (S, M, M), NO_ARGS, 'batched'),
+    ('tril', (S, M, M), (2,), 'batched_idx'),
+    ('tril', (3, 3, S, S), NO_ARGS, 'more_batched'),
     ('triu', (M, M), NO_ARGS),
     ('triu', (M, M), (2,), 'idx'),
+    ('triu', (S, M, M), NO_ARGS, 'batched'),
+    ('triu', (S, M, M), (2,), 'batched_idx'),
+    ('triu', (3, 3, S, S), NO_ARGS, 'more_batched'),
     ('trace', (M, M), NO_ARGS),
     ('cross', (S, 3), ((S, 3),)),
     ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'),
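The new ``batched`` variants exercise ``tril``/``triu`` on inputs with leading batch dimensions, where the triangle is taken from each trailing matrix independently; a quick illustration (shapes are arbitrary, not from this diff)::

    import torch

    x = torch.randn(4, 3, 3)            # a batch of four 3x3 matrices
    lower = torch.tril(x)               # lower triangle of each matrix
    upper = torch.triu(x, diagonal=1)   # strict upper triangle of each matrix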
@@ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs):
         return torch.stack(all_matrices).reshape(*(batches + (l, l)))
 
 
+def brute_pdist(inp, p=2):
+    """Computes the same as torch.pdist using primitives"""
+    n = inp.shape[-2]
+    k = n * (n - 1) // 2
+    if k == 0:
+        # torch complains about empty indices
+        return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device)
+    square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1)
+    unroll = square.view(square.shape[:-2] + (n * n,))
+    inds = torch.ones(k, dtype=torch.int)
+    inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int)
+    return unroll[..., inds.cumsum(0)]
+
+
 def do_test_dtypes(self, dtypes, layout, device):
     for dtype in dtypes:
         if dtype != torch.float16:
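For reference, ``torch.pdist`` (which ``brute_pdist`` above re-implements from primitives for testing) returns the condensed vector of pairwise distances; a small illustrative call::

    import torch

    pts = torch.randn(5, 3)        # five points in 3-D
    d = torch.pdist(pts, p=2)      # shape (10,): one distance per pair i < j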
@@ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) {
   ASSERT_EQ(batch[1].target, "2");
 }
 
+struct DummyTensorDataset
+    : datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> {
+  Example<torch::Tensor, int> get(size_t index) override {
+    const auto channels = static_cast<int64_t>(index);
+    torch::Tensor tensor =
+        (channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4});
+    return {tensor, static_cast<int>(channels)};
+  }
+
+  torch::optional<size_t> size() const override {
+    return 100;
+  }
+};
+
+TEST(DataTest, NormalizeTransform) {
+  auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1));
+
+  // Works for zero (one implicit) channels
+  std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0);
+  ASSERT_EQ(output.size(), 1);
+  // (1 - 0.5) / 0.1 = 5
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5))
+      << output[0].data;
+
+  // Works for one explicit channel
+  output = dataset.get_batch(1);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 1);
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+
+  // Works for two channels with different moments
+  dataset = DummyTensorDataset().map(
+      transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2}));
+  output = dataset.get_batch(2);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 2);
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * -2.5))
+      << output[0].data;
+
+  // Works for three channels with one moment value
+  dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2));
+  output = dataset.get_batch(3);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 3);
+  ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5))
+      << output[0].data;
+
+  // Works for three channels with different moments
+  dataset = DummyTensorDataset().map(
+      transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2}));
+  output = dataset.get_batch(3);
+  ASSERT_EQ(output.size(), 1);
+  ASSERT_EQ(output[0].data.size(0), 3);
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
+                  .allclose(torch::ones({1, 4, 4}) * 5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/1, /*end=*/2)
+                  .allclose(torch::ones({1, 4, 4}) * -2.5))
+      << output[0].data;
+  ASSERT_TRUE(output[0]
+                  .data.slice(/*dim=*/0, /*start=*/2)
+                  .allclose(torch::ones({1, 4, 4}) * 12.5))
+      << output[0].data;
+}
+
 struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> {
   UnCopyableDataset() = default;
 
@@ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) {
 TEST_F(ModuleTest, ZeroGrad) {
   Linear module(3, 4);
   auto weight = torch::ones({8, 3}, torch::requires_grad());
-  auto loss = module->forward(weight).sum();
+  auto loss = module(weight).sum();
   loss.backward();
   for (auto& parameter : module->parameters()) {
     auto grad = parameter.grad();
@@ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) {
     ASSERT_NO_THROW(module->modules());
   }
 }
+
+struct ModuleWithNonTensorForwardImpl : torch::nn::Module {
+  int64_t forward(torch::Tensor x) {
+    return x.numel();
+  }
+};
+TORCH_MODULE(ModuleWithNonTensorForward);
+
+TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) {
+  ModuleWithNonTensorForward m;
+  ASSERT_EQ(m(torch::ones(123)), 123);
+}
			|||||||
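For context, TORCH_MODULE(ModuleWithNonTensorForward) generates a holder whose call operator perfectly forwards its arguments to the Impl's forward(), which is why the return type is not restricted to Tensor. A rough hand-rolled sketch of that mechanism, simplified and illustrative only (the real macro expands to a class derived from torch::nn::ModuleHolder, not this type):

#include <memory>
#include <utility>

// Simplified stand-in for a generated holder; the name and layout are
// illustrative, not the actual macro expansion.
template <typename Impl>
struct SimpleHolder {
  std::shared_ptr<Impl> impl = std::make_shared<Impl>();

  Impl* operator->() { return impl.get(); }

  // The call operator forwards to Impl::forward and preserves its return
  // type, so forward() may return int64_t, std::string, and so on.
  template <typename... Args>
  auto operator()(Args&&... args)
      -> decltype(std::declval<Impl&>().forward(std::forward<Args>(args)...)) {
    return impl->forward(std::forward<Args>(args)...);
  }
};
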
@@ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {};
 TEST_F(ModulesTest, Conv1d) {
   Conv1d model(Conv1dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) {
 TEST_F(ModulesTest, Conv2dEven) {
   Conv2d model(Conv2dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) {
 TEST_F(ModulesTest, Conv2dUneven) {
   Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2}));
   auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) {
 TEST_F(ModulesTest, Conv3d) {
   Conv3d model(Conv3dOptions(3, 2, 3).stride(2));
   auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) {
 TEST_F(ModulesTest, Linear) {
   Linear model(5, 2);
   auto x = torch::randn({10, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) {
   auto l3 = model->add(Linear(5, 100), "l3");

   auto x = torch::randn({1000, 10}, torch::requires_grad());
-  x = l1->forward(x).clamp_min(0);
-  x = l2->forward(x).clamp_min(0);
-  x = l3->forward(x).clamp_min(0);
+  x = l1(x).clamp_min(0);
+  x = l2(x).clamp_min(0);
+  x = l3(x).clamp_min(0);

   x.backward();
   ASSERT_EQ(x.ndimension(), 2);

@@ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
   // Cannot get gradients to change indices (input) - only for embedding
   // params
   auto x = torch::full({10}, dict_size - 1, torch::kInt64);
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
 TEST_F(ModulesTest, EmbeddingList) {
   Embedding model(6, 4);
   auto x = torch::full({2, 3}, 5, torch::kInt64);
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) {
 TEST_F(ModulesTest, Dropout) {
   Dropout dropout(0.5);
   torch::Tensor x = torch::ones(100, torch::requires_grad());
-  torch::Tensor y = dropout->forward(x);
+  torch::Tensor y = dropout(x);

   y.backward();
   ASSERT_EQ(y.ndimension(), 1);

@@ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) {
   ASSERT_GT(y.sum().item<float>(), 70); // Probably

   dropout->eval();
-  y = dropout->forward(x);
+  y = dropout(x);
   ASSERT_EQ(y.sum().item<float>(), 100);
 }


@@ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) {
     was_called = true;
     return input;
   });
-  auto output = functional->forward(torch::ones(5, torch::requires_grad()));
+  auto output = functional(torch::ones(5, torch::requires_grad()));
   ASSERT_TRUE(was_called);
   ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad())));


@@ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) {
   ASSERT_FALSE(bn->bias.defined());

   ASSERT_THROWS_WITH(
-      bn->forward(torch::ones({2, 5})),
+      bn(torch::ones({2, 5})),
       "Calling BatchNorm::forward is only permitted "
       "when the 'stateful' option is true (was false). "
       "Use BatchNorm::pure_forward instead.");

@@ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) {
   model->to(torch::kCUDA);
   auto x =
       torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true));
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) {
   model->to(torch::kCUDA);
   model->to(torch::kCPU);
   auto x = torch::randn({10, 5}, torch::requires_grad());
-  auto y = model->forward(x);
+  auto y = model(x);
   torch::Tensor s = y.sum();

   s.backward();

@@ -215,7 +215,9 @@ TEST(SerializeTest, Optim) {
 TEST(SerializeTest, XOR_CUDA) {
   torch::manual_seed(0);
   // We better be able to save and load a XOR model!
-  auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) {
+  auto getLoss = [](Sequential model,
+                    uint32_t batch_size,
+                    bool is_cuda = false) {
     auto inputs = torch::empty({batch_size, 2});
     auto labels = torch::empty({batch_size});
     if (is_cuda) {

@@ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) {
   loss = getLoss(model3, 100, true);
   ASSERT_LT(loss.item<float>(), 0.1);
 }
+
+TEST(
+    SerializeTest,
+    CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) {
+  struct C : torch::nn::Module {
+    C() {
+      register_buffer("foo", torch::ones(5, torch::kInt32));
+    }
+  };
+  struct B : torch::nn::Module {};
+  struct A : torch::nn::Module {
+    A() {
+      register_module("b", std::make_shared<B>());
+      register_module("c", std::make_shared<C>());
+    }
+  };
+  struct M : torch::nn::Module {
+    M() {
+      register_module("a", std::make_shared<A>());
+    }
+  };
+
+  auto out = std::make_shared<M>();
+  std::stringstream ss;
+  torch::save(out, ss);
+  auto in = std::make_shared<M>();
+  torch::load(in, ss);
+
+  const int output = in->named_buffers()["a.c.foo"].sum().item<int>();
+  ASSERT_EQ(output, 5);
+}

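The new test round-trips a module through a std::stringstream and then reads a buffer back by its registered name. A minimal usage sketch of the same torch::save / torch::load calls, sticking to the stream form used in the diff; the NetImpl module below is invented for illustration:

#include <torch/torch.h>
#include <iostream>
#include <memory>
#include <sstream>

// Hypothetical module used only for this sketch.
struct NetImpl : torch::nn::Module {
  NetImpl() {
    register_buffer("foo", torch::ones(5, torch::kInt32));
  }
};

int main() {
  auto out = std::make_shared<NetImpl>();

  // Serialize to an in-memory stream, then restore into a fresh instance.
  std::stringstream stream;
  torch::save(out, stream);

  auto in = std::make_shared<NetImpl>();
  torch::load(in, stream);

  // The restored buffer is reachable under its registered name.
  std::cout << in->named_buffers()["foo"].sum().item<int>() << std::endl;  // 5
  return 0;
}
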
@@ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) {
   ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>());
 }

+struct A : torch::nn::Module {
+  int forward() {
+    return 5;
+  }
+};
+
+struct B : torch::nn::Module {
+  std::string forward(torch::Tensor tensor) {
+    return "";
+  }
+};
+
+struct C : torch::nn::Module {
+  float forward(torch::Tensor& tensor) {
+    return 5.0;
+  }
+};
+
+struct D : torch::nn::Module {
+  char forward(torch::Tensor&& tensor) {
+    return 'x';
+  }
+};
+
+struct E : torch::nn::Module {};
+
+// Put in a function because macros don't handle the comma between arguments to
+// is_same well ...
+template <typename Module, typename ExpectedType, typename... Args>
+void assert_has_expected_type() {
+  using ReturnType =
+      typename torch::detail::return_type_of_forward<Module, Args...>::type;
+  constexpr bool is_expected_type =
+      std::is_same<ReturnType, ExpectedType>::value;
+  ASSERT_TRUE(is_expected_type) << Module().name();
+}
+
+TEST(TestStatic, ReturnTypeOfForward) {
+  assert_has_expected_type<A, int>();
+  assert_has_expected_type<B, std::string, torch::Tensor>();
+  assert_has_expected_type<C, float, torch::Tensor&>();
+  assert_has_expected_type<D, char, torch::Tensor&&>();
+  assert_has_expected_type<E, void>();
+}
+
 TEST(TestStatic, Apply) {
   std::vector<int> v;
   torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5);

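return_type_of_forward is the trait these new static tests exercise; the same check can also be phrased as a compile-time assertion in user code. A small sketch assuming only the torch::detail::return_type_of_forward usage shown above; the CounterImpl module is invented for illustration:

#include <torch/torch.h>
#include <type_traits>

// Hypothetical module whose forward() returns a non-Tensor type.
struct CounterImpl : torch::nn::Module {
  int64_t forward(torch::Tensor x) {
    return x.numel();
  }
};

// Compile-time check, using the same trait the test above exercises, that
// forward(Tensor) is deduced to return int64_t.
static_assert(
    std::is_same<
        torch::detail::return_type_of_forward<CounterImpl, torch::Tensor>::type,
        int64_t>::value,
    "CounterImpl::forward(Tensor) should be deduced to return int64_t");
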
@@ -10,12 +10,13 @@ graph(%x.1_data : Tensor
   %x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims)
     block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) {
       %16 : Long() = prim::NumToTensor(%6)
-      %alpha : float = prim::TensorToNum(%16)
+      %alpha : float = prim::Float(%16)
       %data.1 : Tensor = aten::add(%5_data, %y_data, %alpha)
       %mask : Tensor = aten::mul(%5_mask, %y_mask)
       %dims : Tensor = aten::__or__(%5_dims, %y_dims)
       %data : Tensor = aten::where(%mask, %data.1, %5_data)
       -> (%7, %data, %mask, %dims)
     }
-  return (%x, %10, %11);
+  %22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11)
+  return (%22);
 }

@@ -7,33 +7,31 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : Tensor = aten::gt(%a.1_data, %b_data)
   %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
-  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %10 : bool = prim::TensorToBool(%7)
-  %11 : Long() = prim::NumToTensor(%6)
-  %alpha.1 : float = prim::TensorToNum(%11)
+  %9 : Long() = prim::NumToTensor(%6)
+  %alpha.1 : float = prim::Float(%9)
   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %16 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%16)
+  %14 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%14)
   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %21 : bool = prim::Constant[value=1]()
-  %22 : int = prim::Constant[value=1]()
-  %23 : Tensor = aten::type_as(%8, %7)
-  %data.2 : Tensor = aten::mul(%7, %23)
-  %25 : int = aten::dim(%data.2)
-  %26 : bool = aten::eq(%25, %22)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
+  %19 : bool = prim::Constant[value=1]()
+  %20 : int = prim::Constant[value=1]()
+  %21 : Tensor = aten::type_as(%8, %7)
+  %data.2 : Tensor = aten::mul(%7, %21)
+  %23 : int = aten::dim(%data.2)
+  %24 : bool = aten::eq(%23, %20)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%24)
     block0() {
-      %29 : int = aten::dim(%data.1)
-      %30 : int = aten::sub(%29, %22)
-      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
-        block0(%32 : int, %33 : Tensor) {
-          %34 : int = aten::dim(%33)
-          %data.3 : Tensor = aten::unsqueeze(%33, %34)
-          -> (%21, %data.3)
+      %27 : int = aten::dim(%data.1)
+      %28 : int = aten::sub(%27, %20)
+      %data.4 : Tensor = prim::Loop(%28, %19, %data.2)
+        block0(%30 : int, %31 : Tensor) {
+          %32 : int = aten::dim(%31)
+          %data.3 : Tensor = aten::unsqueeze(%31, %32)
+          -> (%19, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)

@@ -45,5 +43,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
   %res_dims : Tensor = aten::__or__(%dims.1, %dims)
-  return (%res_data, %res_mask, %res_dims);
+  %39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%39);
 }

@@ -7,34 +7,33 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : float = prim::Constant[value=0.1]()
   %8 : Float() = prim::NumToTensor(%7)
-  %other : float = prim::TensorToNum(%8)
+  %other : float = prim::Float(%8)
   %10 : Tensor = aten::gt(%a.1_data, %other)
-  %11 : bool = prim::TensorToBool(%10)
-  %12 : Long() = prim::NumToTensor(%6)
-  %alpha.1 : float = prim::TensorToNum(%12)
+  %11 : Long() = prim::NumToTensor(%6)
+  %alpha.1 : float = prim::Float(%11)
   %data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
   %mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %17 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%17)
+  %16 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%16)
   %data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %22 : bool = prim::Constant[value=1]()
-  %23 : int = prim::Constant[value=1]()
-  %24 : Tensor = aten::type_as(%a.1_mask, %10)
-  %data.2 : Tensor = aten::mul(%10, %24)
-  %26 : int = aten::dim(%data.2)
-  %27 : bool = aten::eq(%26, %23)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%27)
+  %21 : bool = prim::Constant[value=1]()
+  %22 : int = prim::Constant[value=1]()
+  %23 : Tensor = aten::type_as(%a.1_mask, %10)
+  %data.2 : Tensor = aten::mul(%10, %23)
+  %25 : int = aten::dim(%data.2)
+  %26 : bool = aten::eq(%25, %22)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
    block0() {
-      %30 : int = aten::dim(%data.1)
-      %31 : int = aten::sub(%30, %23)
-      %data.4 : Tensor = prim::Loop(%31, %22, %data.2)
-        block0(%33 : int, %34 : Tensor) {
-          %35 : int = aten::dim(%34)
-          %data.3 : Tensor = aten::unsqueeze(%34, %35)
-          -> (%22, %data.3)
+      %29 : int = aten::dim(%data.1)
+      %30 : int = aten::sub(%29, %22)
+      %data.4 : Tensor = prim::Loop(%30, %21, %data.2)
+        block0(%32 : int, %33 : Tensor) {
+          %34 : int = aten::dim(%33)
+          %data.3 : Tensor = aten::unsqueeze(%33, %34)
+          -> (%21, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)

@@ -46,5 +45,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data.1, %data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
   %res_dims : Tensor = aten::__or__(%dims.1, %dims)
-  return (%res_data, %res_mask, %res_dims);
+  %41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%41);
 }

@@ -7,28 +7,26 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : Tensor = aten::gt(%a.1_data, %b_data)
   %8 : Tensor = aten::mul(%a.1_mask, %b_mask)
-  %9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %10 : bool = prim::TensorToBool(%7)
-  %11 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%11)
+  %9 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%9)
   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %16 : bool = prim::Constant[value=1]()
-  %17 : int = prim::Constant[value=1]()
-  %18 : Tensor = aten::type_as(%8, %7)
-  %data.2 : Tensor = aten::mul(%7, %18)
-  %20 : int = aten::dim(%data.2)
-  %21 : bool = aten::eq(%20, %17)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
+  %14 : bool = prim::Constant[value=1]()
+  %15 : int = prim::Constant[value=1]()
+  %16 : Tensor = aten::type_as(%8, %7)
+  %data.2 : Tensor = aten::mul(%7, %16)
+  %18 : int = aten::dim(%data.2)
+  %19 : bool = aten::eq(%18, %15)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%19)
    block0() {
-      %24 : int = aten::dim(%data)
-      %25 : int = aten::sub(%24, %17)
-      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
-        block0(%27 : int, %28 : Tensor) {
-          %29 : int = aten::dim(%28)
-          %data.3 : Tensor = aten::unsqueeze(%28, %29)
-          -> (%16, %data.3)
+      %22 : int = aten::dim(%data)
+      %23 : int = aten::sub(%22, %15)
+      %data.4 : Tensor = prim::Loop(%23, %14, %data.2)
+        block0(%25 : int, %26 : Tensor) {
+          %27 : int = aten::dim(%26)
+          %data.3 : Tensor = aten::unsqueeze(%26, %27)
+          -> (%14, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -40,5 +38,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
-  return (%res_data, %res_mask, %res_dims);
+  %34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%34);
 }

@@ -7,29 +7,28 @@ graph(%a.1_data : Tensor
   %6 : int = prim::Constant[value=1]()
   %7 : float = prim::Constant[value=0.1]()
   %8 : Float() = prim::NumToTensor(%7)
-  %other : float = prim::TensorToNum(%8)
+  %other : float = prim::Float(%8)
   %10 : Tensor = aten::gt(%a.1_data, %other)
-  %11 : bool = prim::TensorToBool(%10)
-  %12 : Long() = prim::NumToTensor(%6)
-  %alpha : float = prim::TensorToNum(%12)
+  %11 : Long() = prim::NumToTensor(%6)
+  %alpha : float = prim::Float(%11)
   %data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
   %mask : Tensor = aten::mul(%a.1_mask, %b_mask)
   %dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %17 : bool = prim::Constant[value=1]()
-  %18 : int = prim::Constant[value=1]()
-  %19 : Tensor = aten::type_as(%a.1_mask, %10)
-  %data.2 : Tensor = aten::mul(%10, %19)
-  %21 : int = aten::dim(%data.2)
-  %22 : bool = aten::eq(%21, %18)
-  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%22)
+  %16 : bool = prim::Constant[value=1]()
+  %17 : int = prim::Constant[value=1]()
+  %18 : Tensor = aten::type_as(%a.1_mask, %10)
+  %data.2 : Tensor = aten::mul(%10, %18)
+  %20 : int = aten::dim(%data.2)
+  %21 : bool = aten::eq(%20, %17)
+  %cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
    block0() {
-      %25 : int = aten::dim(%data)
-      %26 : int = aten::sub(%25, %18)
-      %data.4 : Tensor = prim::Loop(%26, %17, %data.2)
-        block0(%28 : int, %29 : Tensor) {
-          %30 : int = aten::dim(%29)
-          %data.3 : Tensor = aten::unsqueeze(%29, %30)
-          -> (%17, %data.3)
+      %24 : int = aten::dim(%data)
+      %25 : int = aten::sub(%24, %17)
+      %data.4 : Tensor = prim::Loop(%25, %16, %data.2)
+        block0(%27 : int, %28 : Tensor) {
+          %29 : int = aten::dim(%28)
+          %data.3 : Tensor = aten::unsqueeze(%28, %29)
+          -> (%16, %data.3)
         }
       %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
       %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -41,5 +40,6 @@ graph(%a.1_data : Tensor
   %res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
   %res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
   %res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
-  return (%res_data, %res_mask, %res_dims);
+  %36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
+  return (%36);
 }

@@ -9,38 +9,35 @@ graph(%a.1_data : Tensor
   %8 : Tensor = aten::gt(%a.1_data, %b_data)
   %9 : Tensor = aten::mul(%a.1_mask, %b_mask)
   %10 : Tensor = aten::__or__(%a.1_dims, %b_dims)
-  %11 : bool = prim::TensorToBool(%8)
-  %12 : int = prim::Constant[value=0]()
-  %13 : Tensor = aten::mul(%8, %9)
-  %14 : Tensor = aten::sum(%13)
-  %15 : Tensor = aten::gt(%14, %12)
-  %16 : bool = prim::TensorToBool(%15)
-  %17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims)
-    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
-      %30 : Long() = prim::NumToTensor(%6)
-      %alpha : float = prim::TensorToNum(%30)
+  %11 : int = prim::Constant[value=0]()
+  %12 : Tensor = aten::mul(%8, %9)
+  %13 : Tensor = aten::sum(%12)
+  %14 : Tensor = aten::gt(%13, %11)
+  %15 : bool = prim::Bool(%14)
+  %16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims)
+    block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
+      %27 : Long() = prim::NumToTensor(%6)
+      %alpha : float = prim::Float(%27)
       %data : Tensor = aten::sub(%6_data, %b_data, %alpha)
       %mask : Tensor = aten::mul(%6_mask, %b_mask)
       %dims : Tensor = aten::__or__(%6_dims, %b_dims)
-      %35 : Tensor = aten::gt(%data, %b_data)
-      %36 : Tensor = aten::mul(%mask, %b_mask)
-      %37 : Tensor = aten::__or__(%dims, %b_dims)
-      %38 : bool = prim::TensorToBool(%35)
-      %39 : bool = prim::Constant[value=1]()
-      %40 : int = prim::Constant[value=1]()
-      %41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
-      %data.2 : Tensor = aten::mul(%cond_data.2, %41)
-      %43 : int = aten::dim(%data.2)
-      %44 : bool = aten::eq(%43, %40)
-      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%44)
+      %32 : Tensor = aten::gt(%data, %b_data)
+      %33 : Tensor = aten::mul(%mask, %b_mask)
+      %34 : bool = prim::Constant[value=1]()
+      %35 : int = prim::Constant[value=1]()
+      %36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
+      %data.2 : Tensor = aten::mul(%cond_data.2, %36)
+      %38 : int = aten::dim(%data.2)
+      %39 : bool = aten::eq(%38, %35)
+      %cond_data : Tensor, %cond_mask : Tensor = prim::If(%39)
        block0() {
-          %47 : int = aten::dim(%data)
-          %48 : int = aten::sub(%47, %40)
-          %data.4 : Tensor = prim::Loop(%48, %39, %data.2)
-            block0(%50 : int, %51 : Tensor) {
-              %52 : int = aten::dim(%51)
-              %data.3 : Tensor = aten::unsqueeze(%51, %52)
-              -> (%39, %data.3)
+          %42 : int = aten::dim(%data)
+          %43 : int = aten::sub(%42, %35)
+          %data.4 : Tensor = prim::Loop(%43, %34, %data.2)
+            block0(%45 : int, %46 : Tensor) {
+              %47 : int = aten::dim(%46)
+              %data.3 : Tensor = aten::unsqueeze(%46, %47)
+              -> (%34, %data.3)
            }
           %cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
           %cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)

@@ -52,12 +49,13 @@ graph(%a.1_data : Tensor
       %res_data : Tensor = aten::where(%cond_data, %data, %6_data)
       %res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask)
       %res_dims : Tensor = aten::__or__(%dims, %6_dims)
-      %59 : int = prim::Constant[value=0]()
-      %60 : Tensor = aten::mul(%35, %36)
-      %61 : Tensor = aten::sum(%60)
-      %62 : Tensor = aten::gt(%61, %59)
-      %63 : bool = prim::TensorToBool(%62)
-      -> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims)
+      %54 : int = prim::Constant[value=0]()
+      %55 : Tensor = aten::mul(%32, %33)
+      %56 : Tensor = aten::sum(%55)
+      %57 : Tensor = aten::gt(%56, %54)
+      %58 : bool = prim::Bool(%57)
+      -> (%58, %32, %33, %res_data, %res_mask, %res_dims)
    }
-  return (%a, %21, %22);
+  %59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20)
+  return (%59);
 }

Some files were not shown because too many files have changed in this diff.