Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 21:49:24 +08:00)

Compare commits: v2.2.1-rc3 ... v1.0.1 (101 commits)
SHA1
---
bb15580e88
743fdbdb19
cdb9fd44dc
83221655a8
48fcfdaccb
18eef1d8d9
770462a5ff
0f87ff6e38
eb531da9a8
37c8a33b54
0e9bdcab80
1347a184ca
1cb565fa34
dab52a4a16
0a440da88b
cf11411d42
fd8212cebd
ce37ec38f3
047231e1e1
f748654e0e
8fdcdc3c3f
40fa56a5d1
2f9642010e
3c10845036
d85372f330
5fc2c8b115
fc0c2252d2
304b932879
e274158c72
8d1fc20c8b
af03dbb93b
b24edae29e
c99c8d8aa3
eac4c5d901
231f1a4991
b65b55a652
c926cb4408
a6f4538f31
7d3e2fa190
98bc784694
3c83026249
202909d601
d4eea46dcd
cf0965736c
274e755237
c19b16cc99
228f73e7a9
1e61241227
9a9eae14d0
fb92c3c7b5
a9cf3f69ab
6460628b25
74433436e8
57c685520f
ca1f9349dd
6290587244
9c863c1952
84cf1660d2
e8361887b1
9a7737146c
e27b4ba594
0384a0282b
f80dba92ae
1b7113eaae
7fec47f40d
d711595a07
eef3be686e
ba4d1e8ca6
ab1cd6241b
1ff075b7df
b879d006f1
167f8e8314
dfdf2376bb
95fd0afed5
4e5b994ca7
5dbcbbf715
9067e9411d
4c964dac7f
7b40d9c7ff
e7767c1af3
982a8722cc
3c1cbb8da8
5f51de77c9
a4e2d27ddb
4909529584
7b98af16ee
fe098a3605
3486cebd87
a5a34fb5b1
b2c4c55734
b104068d24
e0834ded98
30aed0237d
033ae1598f
8ca4fc3fd2
20296297ca
72d27e3802
563d67087c
7dc06810c2
07e4a5e069
db5d3131d1
@ -1,14 +1,14 @@
# IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
# in this file to the new version number, and **ALSO** update the version number below:
# PyTorchDockerVersion:262
# Caffe2DockerVersion:230
# PyTorchDockerVersion:282
# Caffe2DockerVersion:238

docker_config_defaults: &docker_config_defaults
user: jenkins
aws_auth:
# This IAM user only allows read-write access to ECR
aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
aws_access_key_id: ${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}

# NOTE: We only perform the merge in build step and not in test step, because
# all source files will be shared from build to test
@ -20,6 +20,110 @@ install_official_git_client: &install_official_git_client
sudo apt-get -qq update
sudo apt-get -qq install openssh-client git

install_doc_push_script: &install_doc_push_script
name: Install the doc push script
no_output_timeout: "2m"
command: |
cat >/home/circleci/project/doc_push_script.sh <<EOL
# =================== The following code **should** be executed inside Docker container ===================

# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"

# Since we're cat-ing this file, we need to escape all $'s
echo "doc_push_script.sh: Invoked with \$*"

git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
pushd pytorch.github.io

set -ex

# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="\$1"
if [ -z "\$install_path" ]; then
echo "error: doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi

# Argument 2: What version of the docs we are building.
version="\$2"
if [ -z "\$version" ]; then
echo "error: doc_push_script.sh: version (arg2) not specified"
exit 1
fi

is_master_doc=false
if [ "\$version" == "master" ]; then
is_master_doc=true
fi

# Argument 3: (optional) If present, we will NOT do any pushing. Used for testing.
dry_run=false
if [ "\$3" != "" ]; then
dry_run=true
fi

echo "install_path: \$install_path version: \$version dry_run: \$dry_run"

export LC_ALL=C
export PATH=/opt/conda/bin:$PATH

rm -rf pytorch || true

# Get all the documentation sources, put them in one place
pushd "\$pt_checkout"
git clone https://github.com/pytorch/vision
pushd vision
conda install -q pillow
time python setup.py install
popd
pushd docs
rm -rf source/torchvision
cp -r ../vision/docs/source source/torchvision

# Build the docs
pip -q install -r requirements.txt || true
if [ "\$is_master_doc" = true ]; then
make html
else
make html-stable
fi

# Move them into the docs repo
popd
popd
git rm -rf "\$install_path" || true
mv "\$pt_checkout/docs/build/html" "\$install_path"

# Add the version handler by search and replace.
# XXX: Consider moving this to the docs Makefile or site build
if [ "\$is_master_doc" = true ]; then
find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
else
find "\$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\$version \▼</a>@g"
fi

git add "\$install_path" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "auto-generating sphinx docs" || true
git status

if [ "\$dry_run" = false ]; then
echo "Pushing to pytorch.github.io:site"
git push origin site
else
echo "Skipping push due to dry_run"
fi

popd
# =================== The above code **should** be executed inside Docker container ===================
EOL
chmod +x /home/circleci/project/doc_push_script.sh

setup_ci_environment: &setup_ci_environment
name: Set Up CI Environment
no_output_timeout: "1h"
@ -66,13 +170,13 @@ setup_ci_environment: &setup_ci_environment
|
||||
echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env
|
||||
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
fi
|
||||
|
||||
# This IAM user only allows read-write access to ECR
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V3}
|
||||
eval $(aws ecr get-login --region us-east-1 --no-include-email)
|
||||
|
||||
pytorch_linux_build_defaults: &pytorch_linux_build_defaults
|
||||
@ -117,7 +221,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
|
||||
<<: *setup_ci_environment
|
||||
- run:
|
||||
name: Test
|
||||
no_output_timeout: "90m"
|
||||
no_output_timeout: "1h"
|
||||
command: |
|
||||
set -e
|
||||
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
|
||||
@ -297,8 +401,11 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install cmake
|
||||
brew install expect
|
||||
|
||||
@ -331,8 +438,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
export SCCACHE_BIN=${PWD}/sccache_bin
|
||||
mkdir -p ${SCCACHE_BIN}
|
||||
@ -361,154 +468,161 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults
|
||||
sccache --show-stats
|
||||
fi
|
||||
|
||||
##############################################################################
|
||||
##############################################################################
|
||||
# Job specifications
|
||||
##############################################################################
|
||||
##############################################################################
|
||||
|
||||
version: 2
|
||||
jobs:
|
||||
pytorch_linux_trusty_py2_7_9_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_9_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7.9-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7.9:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py2_7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py2.7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py2.7:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_5_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.5-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_5_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.5-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.5:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc4_8_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc4_8_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc4.8-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc4.8:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc5_4_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc5_4_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc5.4-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc5.4:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_py3_6_gcc7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-py3.6-gcc7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-py3.6-gcc7:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_trusty_pynightly_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-pynightly-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_trusty_pynightly_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-trusty-pynightly-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-trusty-pynightly:282"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_py3_clang5_asan_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_xenial_py3_clang5_asan_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-asan-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
resource_class: large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_build:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "pytorch-linux-xenial-cuda8-cudnn7-py3"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
MULTI_GPU: "1"
|
||||
resource_class: gpu.large
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
<<: *pytorch_linux_test_defaults
|
||||
|
||||
pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
|
||||
pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn6-py3-NO_AVX-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda8-cudnn7-py3-NO_AVX-NO_AVX2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
@ -517,7 +631,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py2_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
|
||||
PYTHON_VERSION: "2.7"
|
||||
CUDA_VERSION: "9"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -525,7 +639,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py2_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py2-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py2:282"
|
||||
PYTHON_VERSION: "2.7"
|
||||
CUDA_VERSION: "9"
|
||||
resource_class: gpu.medium
|
||||
@ -534,7 +648,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py3_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -542,7 +656,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_cudnn7_py3_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9-cudnn7-py3-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9"
|
||||
resource_class: gpu.medium
|
||||
@ -551,7 +665,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9.2"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -559,7 +673,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "9.2"
|
||||
resource_class: gpu.medium
|
||||
@ -568,7 +682,7 @@ jobs:
|
||||
pytorch_linux_xenial_cuda10_cudnn7_py3_gcc7_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "10"
|
||||
<<: *pytorch_linux_build_defaults
|
||||
@ -576,7 +690,7 @@ jobs:
|
||||
pytorch_short_perf_test_gpu:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-short-perf-test-gpu
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
PYTHON_VERSION: "3.6"
|
||||
CUDA_VERSION: "8"
|
||||
resource_class: gpu.medium
|
||||
@ -597,8 +711,8 @@ jobs:
|
||||
|
||||
docker cp $id:/var/lib/jenkins/workspace/env /home/circleci/project/env
|
||||
# This IAM user allows write access to S3 bucket for perf test numbers
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V2}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PERF_TEST_S3_BUCKET_V3}" >> /home/circleci/project/env
|
||||
docker cp /home/circleci/project/env $id:/var/lib/jenkins/workspace/env
|
||||
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/short-perf-test-gpu.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
@ -607,7 +721,7 @@ jobs:
|
||||
pytorch_doc_push:
|
||||
environment:
|
||||
JOB_BASE_NAME: pytorch-doc-push
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn6-py3:262"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:282"
|
||||
resource_class: large
|
||||
machine:
|
||||
image: default
|
||||
@ -615,72 +729,39 @@ jobs:
|
||||
- run:
|
||||
<<: *setup_ci_environment
|
||||
- run:
|
||||
name: Doc Push
|
||||
<<: *install_doc_push_script
|
||||
- run:
|
||||
name: Doc Build and Push
|
||||
no_output_timeout: "1h"
|
||||
command: |
|
||||
set -e
|
||||
if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
|
||||
echo "Skipping doc push..."
|
||||
exit 0
|
||||
fi
|
||||
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
|
||||
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
|
||||
docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
|
||||
export id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
|
||||
|
||||
cat >/home/circleci/project/doc_push_script.sh <<EOL
|
||||
# =================== The following code will be executed inside Docker container ===================
|
||||
git clone https://yf225:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/pytorch.github.io -b site
|
||||
pushd pytorch.github.io
|
||||
|
||||
set -ex
|
||||
|
||||
export LC_ALL=C
|
||||
export PATH=/opt/conda/bin:$PATH
|
||||
|
||||
rm -rf pytorch || true
|
||||
|
||||
# Get all the documentation sources, put them in one place
|
||||
# TODO: These clones can race
|
||||
git clone https://github.com/pytorch/pytorch
|
||||
pushd pytorch
|
||||
git clone https://github.com/pytorch/vision
|
||||
pushd vision
|
||||
conda install -q pillow
|
||||
time python setup.py install
|
||||
popd
|
||||
pushd docs
|
||||
rm -rf source/torchvision
|
||||
cp -r ../vision/docs/source source/torchvision
|
||||
|
||||
# Build the docs
|
||||
pip -q install -r requirements.txt || true
|
||||
make html
|
||||
|
||||
# Move them into the docs repo
|
||||
popd
|
||||
popd
|
||||
git rm -rf docs/master || true
|
||||
mv pytorch/docs/build/html docs/master
|
||||
find docs/master -name "*.html" -print0 | xargs -0 sed -i -E 's/master[[:blank:]]\\([[:digit:]]\\.[[:digit:]]\\.[[:xdigit:]]+\\+[[:xdigit:]]+[[:blank:]]\\)/<a href="http:\\/\\/pytorch.org\\/docs\\/versions.html">& \\▼<\\/a>/g'
|
||||
git add docs/master || true
|
||||
git status
|
||||
git config user.email "soumith+bot@pytorch.org"
|
||||
git config user.name "pytorchbot"
|
||||
# If there aren't changes, don't make a commit; push is no-op
|
||||
git commit -m "auto-generating sphinx docs" || true
|
||||
git status
|
||||
git push origin site
|
||||
|
||||
popd
|
||||
# =================== The above code will be executed inside Docker container ===================
|
||||
EOL
|
||||
chmod +x /home/circleci/project/doc_push_script.sh
|
||||
docker cp /home/circleci/project/doc_push_script.sh $id:/var/lib/jenkins/workspace/doc_push_script.sh
|
||||
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
# master branch docs push
|
||||
if [[ "${CIRCLE_BRANCH}" == "master" ]]; then
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
|
||||
# stable release docs push. We keep an eternal PR open for merging
|
||||
# v1.0.1 -> master; every time v1.0.1 is updated the following is run.
|
||||
elif [[ "${CIRCLE_BRANCH}" == "v1.0.1" ]]; then
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/stable 1.0.1") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
|
||||
# For open PRs: Do a dry_run of the docs build, don't push build
|
||||
else
|
||||
export COMMAND='((echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./doc_push_script.sh docs/master master dry_run") | docker exec -u jenkins -i "$id" bash) 2>&1'
|
||||
fi
|
||||
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
|
||||
|
||||
# Save the docs build so we can debug any problems
|
||||
export DEBUG_COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}-debug
|
||||
docker commit "$id" ${DEBUG_COMMIT_DOCKER_IMAGE}
|
||||
docker push ${DEBUG_COMMIT_DOCKER_IMAGE}
|
||||
|
||||
pytorch_macos_10_13_py3_build:
|
||||
macos:
|
||||
xcode: "9.0"
|
||||
@ -696,8 +777,11 @@ jobs:
|
||||
set -e
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
# Install sccache
|
||||
@ -706,8 +790,8 @@ jobs:
|
||||
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
git submodule sync && git submodule update -q --init
|
||||
chmod a+x .jenkins/pytorch/macos-build.sh
|
||||
@ -740,8 +824,11 @@ jobs:
|
||||
command: |
|
||||
set -e
|
||||
export IN_CIRCLECI=1
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
cp -r /Users/distiller/pytorch-ci-env/workspace/. /Users/distiller/project
|
||||
@ -765,8 +852,11 @@ jobs:
|
||||
|
||||
export IN_CIRCLECI=1
|
||||
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the `parallel` formulae
|
||||
brew install moreutils --without-parallel
|
||||
# moreutils installs a `parallel` executable by default, which conflicts with the executable from the GNU `parallel`
|
||||
# so we must unlink GNU `parallel` first, and relink it afterwards
|
||||
brew unlink parallel
|
||||
brew install moreutils
|
||||
brew link parallel --overwrite
|
||||
brew install expect
|
||||
|
||||
# Install CUDA 9.2
|
||||
@ -790,30 +880,13 @@ jobs:
|
||||
sudo chmod +x /usr/local/bin/sccache
|
||||
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
|
||||
# This IAM user allows write access to S3 bucket for sccache
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V2}
|
||||
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V3}
|
||||
|
||||
git submodule sync && git submodule update -q --init
|
||||
chmod a+x .jenkins/pytorch/macos-build.sh
|
||||
unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
|
||||
<<: *caffe2_linux_build_defaults
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
|
||||
resource_class: gpu.medium
|
||||
<<: *caffe2_linux_test_defaults
|
||||
|
||||
caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
|
||||
@ -896,11 +969,20 @@ jobs:
|
||||
caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
|
||||
BUILD_ONLY: "1"
|
||||
<<: *caffe2_linux_build_defaults
|
||||
|
||||
caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test
|
||||
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
|
||||
CUDA_VERSION: "8"
|
||||
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
|
||||
resource_class: gpu.medium
|
||||
<<: *caffe2_linux_test_defaults
|
||||
|
||||
caffe2_py2_gcc4_9_ubuntu14_04_build:
|
||||
environment:
|
||||
JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
|
||||
@ -1008,25 +1090,25 @@ workflows:
|
||||
- pytorch_linux_xenial_py3_clang5_asan_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_py3_clang5_asan_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX2_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_NO_AVX_NO_AVX2_test:
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_short_perf_test_gpu:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_doc_push:
|
||||
requires:
|
||||
- pytorch_linux_xenial_cuda8_cudnn6_py3_build
|
||||
- pytorch_linux_xenial_cuda8_cudnn7_py3_build
|
||||
- pytorch_linux_xenial_cuda9_cudnn7_py2_build
|
||||
- pytorch_linux_xenial_cuda9_cudnn7_py2_test:
|
||||
requires:
|
||||
@ -1047,10 +1129,6 @@ workflows:
|
||||
- pytorch_macos_10_13_py3_build
|
||||
- pytorch_macos_10_13_cuda9_2_cudnn7_py3_build
|
||||
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
|
||||
requires:
|
||||
- caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build
|
||||
- caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
|
||||
requires:
|
||||
@ -1072,6 +1150,9 @@ workflows:
|
||||
requires:
|
||||
- caffe2_onnx_py2_gcc5_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test:
|
||||
requires:
|
||||
- caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build
|
||||
- caffe2_py2_clang3_8_ubuntu16_04_build
|
||||
- caffe2_py2_clang3_9_ubuntu16_04_build
|
||||
- caffe2_py2_clang7_ubuntu16_04_build
|
||||
|
@ -124,6 +124,7 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
|
||||
|
||||
if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
|
||||
CMAKE_ARGS+=("-DBLAS=MKL")
|
||||
CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
|
||||
fi
|
||||
if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
|
||||
CMAKE_ARGS+=("-DUSE_CUDA=ON")
|
||||
|
@ -14,18 +14,8 @@ clang --version
|
||||
# symbolize=1: Gives us much better errors when things go wrong
|
||||
export ASAN_OPTIONS=detect_leaks=0:symbolize=1
|
||||
|
||||
# FIXME: Remove the hardcoded "-pthread" option.
|
||||
# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
|
||||
# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
|
||||
# have the full pthread implementation. Other advanced pthread functions don't
|
||||
# exist in libasan.so[2]. If we need some pthread advanced functions, we still
|
||||
# need to link the pthread library.
|
||||
# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
|
||||
# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
|
||||
#
|
||||
# TODO: Make the ASAN flags a more unified env var
|
||||
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
|
||||
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
|
||||
CXX_FLAGS="-pthread" \
|
||||
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
|
||||
NO_CUDA=1 USE_MKLDNN=0 \
|
||||
python setup.py install
|
||||
|
@ -129,7 +129,7 @@ fi
|
||||
git add -f build/bin
|
||||
|
||||
# Test documentation build
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
|
||||
pushd docs
|
||||
# TODO: Don't run this here
|
||||
pip install -q -r requirements.txt || true
|
||||
@ -138,7 +138,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
fi
|
||||
|
||||
# Test standalone c10 build
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn7-py3* ]]; then
|
||||
mkdir -p c10/build
|
||||
pushd c10/build
|
||||
cmake ..
|
||||
|
@ -122,7 +122,7 @@ fi
|
||||
# Use conda cmake in some CI build. Conda cmake will be newer than our supported
|
||||
# min version 3.5, so we only do it in two builds that we know should use conda.
|
||||
if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn6-py2* ]] || \
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda8-cudnn7-py2* ]] || \
|
||||
[[ "$BUILD_ENVIRONMENT" == *cuda9-cudnn7-py3* ]]; then
|
||||
if ! which conda; then
|
||||
echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
|
||||
|
@ -5,9 +5,9 @@
|
||||
# in this file will report a failure (so you don't forget to
|
||||
# reenable the tests on merge ;)
|
||||
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-build
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-test
|
||||
pytorch-linux-xenial-cuda8-cudnn6-py3-multigpu-test
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-build
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-test
|
||||
pytorch-linux-xenial-cuda8-cudnn7-py3-multigpu-test
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py2-build
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py2-test
|
||||
pytorch-linux-xenial-cuda9-cudnn7-py3-build
|
||||
|
@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
|
||||
sccache --show-stats
|
||||
sccache --zero-stats
|
||||
rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
|
||||
for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
|
||||
IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
|
||||
del /S /Q %%i
|
||||
)
|
||||
)
|
||||
copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
|
||||
)
|
||||
|
||||
|
@ -34,10 +34,4 @@ matrix:
|
||||
script: cd docs/cpp/source && ./check-doxygen.sh
|
||||
- env: CLANG_TIDY
|
||||
python: "3.6"
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- llvm-toolchain-trusty
|
||||
packages: clang-tidy
|
||||
script: tools/run-clang-tidy-in-ci.sh
|
||||
|
@ -206,6 +206,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
|
||||
--generate-code arch=compute_50,code=sm_50
|
||||
--generate-code arch=compute_60,code=sm_60
|
||||
--generate-code arch=compute_70,code=sm_70)
|
||||
elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
|
||||
SET(CUFFT_FAKELINK_OPTIONS
|
||||
--generate-code arch=compute_35,code=sm_35
|
||||
--generate-code arch=compute_50,code=sm_50
|
||||
--generate-code arch=compute_60,code=sm_60
|
||||
--generate-code arch=compute_70,code=sm_70)
|
||||
else()
|
||||
MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
|
||||
endif()
|
||||
|
@ -2122,55 +2122,6 @@
|
||||
- arg: THTensor* self
|
||||
- arg: THTensor* tensor
|
||||
]]
|
||||
[[
|
||||
name: _th_tril
|
||||
cname: tril
|
||||
variants:
|
||||
- function
|
||||
return: argument 0
|
||||
arguments:
|
||||
- arg: THTensor* result
|
||||
output: True
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_tril_
|
||||
cname: tril
|
||||
variants: function
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_triu
|
||||
cname: triu
|
||||
variants:
|
||||
- function
|
||||
return: argument 0
|
||||
arguments:
|
||||
- arg: THTensor* result
|
||||
output: True
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_triu_
|
||||
cname: triu
|
||||
variants:
|
||||
- function
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
- arg: long diagonal
|
||||
default: 0
|
||||
]]
|
||||
[[
|
||||
name: _th_cross
|
||||
cname: cross
|
||||
|
@ -147,7 +147,7 @@ static inline Tensor sum_to(Tensor tensor, const IntList shape) {
|
||||
reduce_dims.push_back(i);
|
||||
}
|
||||
for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
|
||||
if (shape[i - leading_dims] == 1 && sizes[i] > 1) {
|
||||
if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
|
||||
reduce_dims.push_back(i);
|
||||
}
|
||||
}
|
||||
|
@ -81,6 +81,39 @@ inline void parallel_for(
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
parallel_reduce
|
||||
|
||||
begin: index at which to start applying reduction
|
||||
|
||||
end: index at which to stop applying reduction
|
||||
|
||||
grain_size: number of elements per chunk. impacts number of elements in
|
||||
intermediate results tensor and degree of parallelization.
|
||||
|
||||
ident: identity for binary combination function sf. sf(ident, x) needs to return
|
||||
x.
|
||||
|
||||
f: function for reduction over a chunk. f needs to be of signature scalar_t
|
||||
f(int64_t partial_begin, int64_t partial_end, scalar_t identity)
|
||||
|
||||
sf: function to combine two partial results. sf needs to be of signature
|
||||
scalar_t sf(scalar_t x, scalar_t y)
|
||||
|
||||
For example, you might have a tensor of 10000 entries and want to sum together
|
||||
all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
|
||||
an intermediate result tensor with 4 elements. Then it will execute the function
|
||||
"f" you provide and pass the beginning and end index of these chunks, so
|
||||
0-2499, 2500-4999, etc., and the combination identity. It will then write out
|
||||
the result from each of these chunks into the intermediate result tensor. After
|
||||
that it'll reduce the partial results from each chunk into a single number using
|
||||
the combination function sf and the identity ident. For a total summation this
|
||||
would be "+" and 0 respectively. This is similar to tbb's approach [1], where
|
||||
you need to provide a function to accumulate a subrange, a function to combine
|
||||
two partial results and an identity.
|
||||
|
||||
[1] https://software.intel.com/en-us/node/506154
|
||||
*/
|
||||
template <class scalar_t, class F, class SF>
|
||||
inline scalar_t parallel_reduce(
|
||||
const int64_t begin,
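The comment block above spells out the contract of the new `parallel_reduce` helper: split `[begin, end)` into `grain_size` chunks, run the per-chunk reducer `f` on each, then fold the partial results with the combiner `sf` starting from the identity `ident`. The sketch below is a self-contained stand-in that mirrors that contract so the parameters are easy to see; it is not ATen's implementation (the real helper runs the chunks in parallel), and the names `parallel_reduce_sketch` and `data` are made up for illustration.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Sequential sketch of the documented contract; the real helper runs the
// per-chunk calls to `f` in parallel and then combines them with `sf`.
template <class scalar_t, class F, class SF>
scalar_t parallel_reduce_sketch(int64_t begin, int64_t end, int64_t grain_size,
                                scalar_t ident, const F& f, const SF& sf) {
  // "Intermediate results tensor": one partial result per grain_size chunk.
  std::vector<scalar_t> partials;
  for (int64_t b = begin; b < end; b += grain_size) {
    const int64_t e = std::min(b + grain_size, end);
    partials.push_back(f(b, e, ident));
  }
  // Fold the partial results with the combiner sf, starting from ident.
  scalar_t acc = ident;
  for (const scalar_t& p : partials) acc = sf(acc, p);
  return acc;
}

int main() {
  std::vector<float> data(10000, 1.0f);
  // Total summation as in the note above: f sums a chunk, sf is "+", ident is 0.
  // grain_size 2500 over 10000 elements -> 4 partial results, covering
  // 0-2499, 2500-4999, 5000-7499, 7500-9999.
  const float total = parallel_reduce_sketch<float>(
      0, static_cast<int64_t>(data.size()), 2500, 0.0f,
      [&](int64_t b, int64_t e, float id) {
        float s = id;
        for (int64_t i = b; i < e; ++i) s += data[i];
        return s;
      },
      [](float x, float y) { return x + y; });
  std::cout << total << "\n";  // prints 10000
}
```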
|
||||
|
@ -196,7 +196,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef<TensorArg> ts) {
|
||||
|
||||
void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) {
|
||||
AT_CHECK(
|
||||
t.type().backend() == backend,
|
||||
!t.defined() || t.type().backend() == backend,
|
||||
"Expected tensor to have ", toString(backend),
|
||||
" Backend, but got tensor with ", toString(t.type().backend()), " Backend ",
|
||||
"(while checking arguments for ", c, ")");
|
||||
|
@ -52,14 +52,11 @@ namespace c10 {
|
||||
_(prim, TupleSlice) \
|
||||
_(prim, ListConstruct) \
|
||||
_(prim, ListUnpack) \
|
||||
_(prim, BoolToTensor) \
|
||||
_(prim, NumToTensor) \
|
||||
_(prim, TensorToNum) \
|
||||
_(prim, ImplicitTensorToNum) \
|
||||
_(prim, TensorToBool) \
|
||||
_(prim, IntToFloat) \
|
||||
_(prim, FloatToInt) \
|
||||
_(prim, StringToFloat) \
|
||||
_(prim, Bool) \
|
||||
_(prim, Int) \
|
||||
_(prim, Float) \
|
||||
_(prim, device) \
|
||||
_(prim, dtype) \
|
||||
_(prim, shape) \
|
||||
@ -139,7 +136,8 @@ namespace c10 {
|
||||
_(attr, name) \
|
||||
_(attr, a) \
|
||||
_(attr, b) \
|
||||
_(attr, beg)
|
||||
_(attr, beg) \
|
||||
_(attr, idx)
|
||||
#else
|
||||
#define FORALL_NS_SYMBOLS(_) \
|
||||
_(namespaces, prim) \
|
||||
|
@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
|
||||
ss << "Future[" << getElementType()->python_str() << "]";
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
|
||||
return create(contained_types.at(0));
|
||||
}
|
||||
private:
|
||||
FutureType(TypePtr elem) : SingleElementType(elem) {}
|
||||
};
|
||||
@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
|
||||
}
|
||||
|
||||
inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
|
||||
AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
|
||||
if (typ->isSubtypeOf(IntType::get())) {
|
||||
return CompleteTensorType::create(at::kLong, at::kCPU, {});
|
||||
} else if (typ->isSubtypeOf(FloatType::get())) {
|
||||
@ -915,7 +917,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
|
||||
template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
|
||||
template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }
|
||||
|
||||
CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
|
||||
CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);
|
||||
|
||||
using TypeEnv = std::unordered_map<std::string, TypePtr>;
|
||||
struct MatchTypeReturn {
|
||||
|
@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
|
||||
return value;
|
||||
}
|
||||
|
||||
TypePtr inferTypeFrom(const IValue& value) {
|
||||
// why incomplete? You cannot completely recover a type from
|
||||
// an IValue: List[List[int]] and List[List[Tensor]] will both
|
||||
// appear as ivalue.isGenericList(), so the element type cannot be recovered.
|
||||
// The only appropriate place to use this is where you know that
|
||||
// you are only dealing with a subset of objects where you can recover
|
||||
// the type, like in the tracer.
|
||||
TypePtr incompleteInferTypeFrom(const IValue& value) {
|
||||
if (value.isTensor()) {
|
||||
return CompleteTensorType::create(value.toTensor());
|
||||
} else if (value.isDouble()) {
|
||||
@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
|
||||
} else if (value.isDoubleList()) {
|
||||
return ListType::ofFloats();
|
||||
} else if (value.isTuple()) {
|
||||
return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
|
||||
return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
|
||||
} else if (value.isDevice()) {
|
||||
return DeviceObjType::get();
|
||||
}
|
||||
AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
|
||||
AT_ERROR("Type cannot be accurately recovered from this IValue.");
|
||||
}
|
||||
|
||||
c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {
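The comment on the renamed `incompleteInferTypeFrom` above says the inference is "incomplete" because a generic list value erases its element type. A rough, self-contained analogue using `std::any` (not the real IValue API) shows the same effect: once nested lists are boxed into a type-erased container, two lists with different static types look identical at runtime, so the full type cannot be recovered from the value alone.

```cpp
#include <any>
#include <iostream>
#include <vector>

int main() {
  // Analogue of IValue's generic list: every element is boxed into a
  // type-erased holder.
  using GenericList = std::vector<std::any>;

  // Conceptually List[List[int]] and List[List[float]]:
  GenericList int_lists{GenericList{std::any(1), std::any(2)}};
  GenericList float_lists{GenericList{std::any(1.0f)}};

  std::any a = int_lists;
  std::any b = float_lists;

  // Both boxed values report exactly the same runtime type, so the element
  // type of the inner lists is lost -- which is why the inference above is
  // only an *incomplete* recovery of the static type.
  std::cout << std::boolalpha << (a.type() == b.type()) << "\n";  // true
}
```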
|
||||
|
@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
|
||||
vec256::Vec256<scalar_t> acc_vec,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
scalar_t acc_arr[Vec::size];
|
||||
scalar_t acc_arr[Vec::size()];
|
||||
acc_vec.store(acc_arr);
|
||||
for (int64_t i = 1; i < size; i++) {
|
||||
scalar_t acc_arr_next[Vec::size];
|
||||
scalar_t acc_arr_next[Vec::size()];
|
||||
acc_arr_next[0] = acc_arr[i];
|
||||
Vec acc_vec_next = Vec::loadu(acc_arr_next);
|
||||
acc_vec = vec_fun(acc_vec, acc_vec_next);
|
||||
@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
|
||||
template <typename scalar_t, typename Op>
|
||||
inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size)
|
||||
if (size < Vec::size())
|
||||
return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = Vec::loadu(data);
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
acc_vec = vec_fun(acc_vec, data_vec);
|
||||
}
|
||||
@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
|
||||
Vec data_vec = Vec::loadu(data + d, size - d);
|
||||
acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(vec_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(vec_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename MapOp, typename ReduceOp>
|
||||
@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
|
||||
scalar_t* data,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size)
|
||||
if (size < Vec::size())
|
||||
return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = map_fun(Vec::loadu(data));
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
data_vec = map_fun(data_vec);
|
||||
acc_vec = red_fun(acc_vec, data_vec);
|
||||
@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
|
||||
data_vec = map_fun(data_vec);
|
||||
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename MapOp, typename ReduceOp>
|
||||
@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
|
||||
const scalar_t* data2,
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
if (size < Vec::size) {
|
||||
if (size < Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data, size);
|
||||
Vec data2_vec = Vec::loadu(data2, size);
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
return vec_reduce_all(red_fun, data_vec, size);
|
||||
}
|
||||
int64_t d = Vec::size;
|
||||
int64_t d = Vec::size();
|
||||
Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(data + d);
|
||||
Vec data2_vec = Vec::loadu(data2 + d);
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
|
||||
data_vec = map_fun(data_vec, data2_vec);
|
||||
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
|
||||
}
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size);
|
||||
return vec_reduce_all(red_fun, acc_vec, Vec::size());
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename Op>
|
||||
@ -103,7 +103,7 @@ inline void map(
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
int64_t d = 0;
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec output_vec = vec_fun(Vec::loadu(input_data + d));
|
||||
output_vec.store(output_data + d);
|
||||
}
|
||||
@ -122,7 +122,7 @@ inline void map2(
|
||||
int64_t size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
int64_t d = 0;
|
||||
for (; d < size - (size % Vec::size); d += Vec::size) {
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(input_data + d);
|
||||
Vec data_vec2 = Vec::loadu(input_data2 + d);
|
||||
Vec output_vec = vec_fun(data_vec, data_vec2);
|
||||
|
@ -15,14 +15,24 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
|
||||
// Note [Acceptable use of anonymous namespace in header]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Yes you saw right, this is an anonymous namespace in a header. This header,
|
||||
// and all of its subheaders, REQUIRE their code to be entirely inlined into
|
||||
// the compilation unit that uses them. It's important that these functions have
|
||||
// internal linkage so that kernels for different architectures don't get
|
||||
// combined during linking. It's sufficient to label functions "static", but
|
||||
// class methods must be an unnamed namespace to have internal linkage (since
|
||||
// static means something different in the context of classes).
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
|
||||
T buf[Vec256<T>::size];
|
||||
T buf[Vec256<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
template<size_t n> struct int_of_size;
|
||||
@ -45,15 +46,49 @@ struct Vec256 {
|
||||
private:
|
||||
T values[32 / sizeof(T)] = {0};
|
||||
public:
|
||||
static constexpr int size = 32 / sizeof(T);
|
||||
// Note [constexpr static function to avoid odr-usage compiler bug]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Why, you might ask, is size defined to be a static constexpr function,
|
||||
// rather than a more ordinary 'static constexpr int size;' variable?
|
||||
// The problem lies within ODR rules for static constexpr members versus
|
||||
// static constexpr functions. First, recall that this class (along with all
|
||||
// of its derivations) lives in an anonymous namespace: they are intended to be
|
||||
// *completely* inlined at their use-sites, because we need to compile it
|
||||
// multiple times for different instruction sets.
|
||||
//
|
||||
// Because of this constraint, we CANNOT provide a single definition for
|
||||
// any static members in this class; since we want to compile the class
|
||||
// multiple times, there wouldn't actually be any good place to put the
|
||||
// definition. Now here is the problem: if we ODR-use a static constexpr
|
||||
// member, we are *obligated* to provide a definition. Without the
|
||||
// definition, you get a compile error like:
|
||||
//
|
||||
// relocation R_X86_64_PC32 against undefined symbol
|
||||
// `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
|
||||
// a shared object; recompile with -fPIC
|
||||
//
|
||||
// If this were C++17, we could replace a static constexpr variable with
|
||||
// an inline variable which doesn't require one definition. But we are not
|
||||
// C++17. So the next best thing is to replace the member with a static
|
||||
// constexpr (and therefore inline) function, which does not require ODR
|
||||
// either.
|
||||
//
|
||||
// Also, technically according to the C++ standard, we don't have to define
|
||||
// a constexpr variable if we never odr-use it. But it seems that some
|
||||
// versions of GCC/Clang have buggy determinations on whether or not an
// identifier is odr-used or not, and in any case it's hard to tell if
// a variable is odr-used or not. So best to just cut the problem at the root.
|
||||
static constexpr int size() {
|
||||
return 32 / sizeof(T);
|
||||
}
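A small sketch (assuming C++11/14; names are illustrative, not part of this diff) of the odr-use problem described above and why the function form avoids it:

#include <algorithm>

struct WithVariable {
  static constexpr int size = 4;               // in-class declaration only (pre-C++17)
};
// std::max takes its arguments by const reference, so the call below odr-uses ::size
// and links only if "constexpr int WithVariable::size;" is defined in some .cpp file:
//   int a = std::max(WithVariable::size, 3);

struct WithFunction {
  static constexpr int size() { return 4; }    // implicitly inline, no separate definition
};
int b = std::max(WithFunction::size(), 3);     // fine: only the returned value is used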
|
||||
Vec256() {}
|
||||
Vec256(T val) {
|
||||
for (int i = 0; i != size; i++) {
|
||||
for (int i = 0; i != size(); i++) {
|
||||
values[i] = val;
|
||||
}
|
||||
}
|
||||
template<typename... Args,
|
||||
typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
|
||||
typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
|
||||
Vec256(Args... vals) {
|
||||
values = { vals... };
|
||||
}
|
||||
@ -61,7 +96,7 @@ public:
|
||||
static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
int64_t mask = mask_;
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (mask & 0x01) {
|
||||
vec[i] = b[i];
|
||||
} else {
|
||||
@ -74,9 +109,9 @@ public:
|
||||
static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
|
||||
const Vec256<T>& mask) {
|
||||
Vec256 vec;
|
||||
int_same_size_t<T> buffer[size];
|
||||
int_same_size_t<T> buffer[size()];
|
||||
mask.store(buffer);
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (buffer[i] & 0x01)
|
||||
{
|
||||
vec[i] = b[i];
|
||||
@ -88,14 +123,14 @@ public:
|
||||
}
|
||||
static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
vec.values[i] = base + i * step;
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
|
||||
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
|
||||
Vec256 vec;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (i < count) {
|
||||
vec[i] = b[i];
|
||||
} else {
|
||||
@ -114,7 +149,7 @@ public:
|
||||
std::memcpy(vec.values, ptr, count * sizeof(T));
|
||||
return vec;
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
std::memcpy(ptr, values, count * sizeof(T));
|
||||
}
|
||||
const T& operator[](int idx) const {
|
||||
@ -125,14 +160,14 @@ public:
|
||||
}
|
||||
Vec256<T> map(T (*f)(T)) const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i != size; i++) {
|
||||
for (int64_t i = 0; i != size(); i++) {
|
||||
ret[i] = f(values[i]);
|
||||
}
|
||||
return ret;
|
||||
}
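A hedged usage sketch of map (the include path and calling this internal header from ordinary code are assumptions for illustration; a capture-free lambda converts to the required function pointer):

#include <ATen/cpu/vec256/vec256.h>

void vec256_map_example() {
  using at::vec256::Vec256;
  Vec256<float> v(2.0f);                                           // broadcast 2.0f to all lanes
  Vec256<float> doubled = v.map([](float x) { return x * 2.0f; });
  float out[Vec256<float>::size()];                                // size() is constexpr, so this is a valid bound
  doubled.store(out);                                              // every out[i] == 4.0f
}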
|
||||
Vec256<T> abs() const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = values[i] < 0 ? -values[i] : values[i];
|
||||
}
|
||||
return ret;
|
||||
@ -214,7 +249,7 @@ public:
|
||||
}
|
||||
Vec256<T> pow(const Vec256<T> &exp) const {
|
||||
Vec256<T> ret;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::pow(values[i], exp[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -222,7 +257,7 @@ public:
|
||||
#define DEFINE_COMP(binary_pred) \
|
||||
Vec256<T> operator binary_pred(const Vec256<T> &other) const { \
|
||||
Vec256<T> vec; \
|
||||
for (int64_t i = 0; i != size; i++) { \
|
||||
for (int64_t i = 0; i != size(); i++) { \
|
||||
if (values[i] binary_pred other.values[i]) { \
|
||||
std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T)); \
|
||||
} else { \
|
||||
@ -242,7 +277,7 @@ public:
|
||||
|
||||
template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] + b[i];
|
||||
}
|
||||
return c;
|
||||
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] - b[i];
|
||||
}
|
||||
return c;
|
||||
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
||||
return c;
|
||||
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T
|
||||
|
||||
template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = a[i] / b[i];
|
||||
}
|
||||
return c;
|
||||
@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
|
||||
// either input is a NaN.
|
||||
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = (a[i] > b[i]) ? a[i] : b[i];
|
||||
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
|
||||
// If either input is NaN, propagate a NaN.
|
||||
@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
|
||||
// either input is a NaN.
|
||||
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
c[i] = (a[i] < b[i]) ? a[i] : b[i];
|
||||
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
|
||||
// If either input is NaN, propagate a NaN.
|
||||
@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
|
||||
template <class T> \
|
||||
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \
|
||||
using iT = int_same_size_t<T>; \
|
||||
iT buffer[Vec256<T>::size]; \
|
||||
for (int64_t i = 0; i != Vec256<T>::size; i++) { \
|
||||
iT buffer[Vec256<T>::size()]; \
|
||||
for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
|
||||
auto a_val = a[i]; \
|
||||
auto b_val = b[i]; \
|
||||
iT *i_a_ptr = reinterpret_cast<iT*>(&a_val); \
|
||||
@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
|
||||
template <int64_t scale = 1, typename T = void>
|
||||
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
|
||||
inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
int_same_size_t<T> index_arr[size];
|
||||
vindex.store(static_cast<void*>(index_arr));
|
||||
T buffer[size];
|
||||
@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
|
||||
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
|
||||
inline mask_gather(const Vec256<T>& src, T const* base_addr,
|
||||
const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
T src_arr[size];
|
||||
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
|
||||
int_same_size_t<T> index_arr[size];
|
||||
@ -392,7 +427,7 @@ namespace {
|
||||
template<typename dst_t, typename src_t>
|
||||
struct CastImpl {
|
||||
static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
|
||||
src_t src_arr[Vec256<src_t>::size];
|
||||
src_t src_arr[Vec256<src_t>::size()];
|
||||
src.store(static_cast<void*>(src_arr));
|
||||
return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
|
||||
}
|
||||
@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {
|
||||
|
||||
template <typename T>
|
||||
inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
T src_arr[size];
|
||||
src.store(static_cast<void*>(src_arr));
|
||||
int_same_size_t<T> buffer[size];
|
||||
@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
|
||||
// returns: Vec256<float> = {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// Vec256<float> = {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
template <typename T>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
static constexpr int half_size = size / 2;
|
||||
T a_arr[size];
|
||||
T b_arr[size];
|
||||
@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
// returns: Vec256<float> = {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// Vec256<float> = {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
template <typename T>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
|
||||
interleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
static constexpr int size = Vec256<T>::size;
|
||||
static constexpr int size = Vec256<T>::size();
|
||||
static constexpr int half_size = size / 2;
|
||||
T a_arr[size];
|
||||
T b_arr[size];
|
||||
@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
|
||||
template <typename src_T, typename dst_T>
|
||||
void convert(const src_T *src, dst_T *dst, int64_t n) {
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
*dst = static_cast<dst_T>(
|
||||
static_cast<at::native::inter_copy_type_t<dst_T>>(*src));
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
#if defined(__AVX__) && !defined(_MSC_VER)
|
||||
@ -16,7 +17,9 @@ template <> class Vec256<double> {
|
||||
private:
|
||||
__m256d values;
|
||||
public:
|
||||
static constexpr int size = 4;
|
||||
static constexpr int size() {
|
||||
return 4;
|
||||
}
|
||||
Vec256() {}
|
||||
Vec256(__m256d v) : values(v) {}
|
||||
Vec256(double val) {
|
||||
@ -40,7 +43,7 @@ public:
|
||||
return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
|
||||
}
|
||||
static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
|
||||
int64_t count = size) {
|
||||
int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -53,22 +56,22 @@ public:
|
||||
}
|
||||
return b;
|
||||
}
|
||||
static Vec256<double> loadu(const void* ptr, int64_t count = size) {
|
||||
if (count == size)
|
||||
static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
|
||||
|
||||
__at_align32__ double tmp_values[size];
|
||||
__at_align32__ double tmp_values[size()];
|
||||
std::memcpy(
|
||||
tmp_values,
|
||||
reinterpret_cast<const double*>(ptr),
|
||||
count * sizeof(double));
|
||||
return _mm256_load_pd(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
double tmp_values[size];
|
||||
double tmp_values[size()];
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(double));
|
||||
}
|
||||
@ -252,7 +255,7 @@ template <>
|
||||
void convert(const double* src, double* dst, int64_t n) {
|
||||
int64_t i;
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
|
||||
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
|
||||
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
// See Note [Acceptable use of anonymous namespace in header]
|
||||
namespace {
|
||||
|
||||
#if defined(__AVX__) && !defined(_MSC_VER)
|
||||
@ -16,7 +17,9 @@ template <> class Vec256<float> {
|
||||
private:
|
||||
__m256 values;
|
||||
public:
|
||||
static constexpr int size = 8;
|
||||
static constexpr int size() {
|
||||
return 8;
|
||||
}
|
||||
Vec256() {}
|
||||
Vec256(__m256 v) : values(v) {}
|
||||
Vec256(float val) {
|
||||
@ -43,7 +46,7 @@ public:
|
||||
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
|
||||
}
|
||||
static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
|
||||
int64_t count = size) {
|
||||
int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -64,19 +67,19 @@ public:
|
||||
}
|
||||
return b;
|
||||
}
|
||||
static Vec256<float> loadu(const void* ptr, int64_t count = size) {
|
||||
if (count == size)
|
||||
static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
|
||||
__at_align32__ float tmp_values[size];
|
||||
__at_align32__ float tmp_values[size()];
|
||||
std::memcpy(
|
||||
tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
|
||||
return _mm256_loadu_ps(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int64_t count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int64_t count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
float tmp_values[size];
|
||||
float tmp_values[size()];
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(float));
|
||||
}
|
||||
@ -260,7 +263,7 @@ template <>
|
||||
void convert(const float* src, float* dst, int64_t n) {
|
||||
int64_t i;
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
|
||||
for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
|
||||
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
|
@ -12,6 +12,11 @@ namespace {
|
||||
struct Vec256i {
|
||||
protected:
|
||||
__m256i values;
|
||||
|
||||
static inline __m256i invert(const __m256i& v) {
|
||||
const auto ones = _mm256_set1_epi64x(-1);
|
||||
return _mm256_xor_si256(ones, v);
|
||||
}
|
||||
public:
|
||||
Vec256i() {}
|
||||
Vec256i(__m256i v) : values(v) {}
|
||||
@ -22,7 +27,9 @@ public:
|
||||
|
||||
template <>
|
||||
struct Vec256<int64_t> : public Vec256i {
|
||||
static constexpr int size = 4;
|
||||
static constexpr int size() {
|
||||
return 4;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
|
||||
@ -31,7 +38,7 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
}
|
||||
template <int64_t mask>
|
||||
static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
a.store(tmp_values);
|
||||
if (mask & 0x01)
|
||||
tmp_values[0] = _mm256_extract_epi64(b.values, 0);
|
||||
@ -51,7 +58,7 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
|
||||
}
|
||||
static Vec256<int64_t>
|
||||
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
|
||||
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -68,15 +75,15 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int64_t tmp_values[size];
|
||||
__at_align32__ int64_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
|
||||
}
|
||||
@ -93,31 +100,27 @@ struct Vec256<int64_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi64(values, other.values);
|
||||
}
|
||||
Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi64(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi64(values, other.values));
|
||||
}
|
||||
Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
|
||||
return _mm256_cmpgt_epi64(other.values, values);
|
||||
}
|
||||
Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi64(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi64(values, other.values));
|
||||
}
|
||||
Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
|
||||
return _mm256_cmpgt_epi64(values, other.values);
|
||||
}
|
||||
Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi64(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi64(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Vec256<int32_t> : public Vec256i {
|
||||
static constexpr int size = 8;
|
||||
static constexpr int size() {
|
||||
return 8;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
|
||||
@ -139,7 +142,7 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
|
||||
}
|
||||
static Vec256<int32_t>
|
||||
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
|
||||
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -164,15 +167,15 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
|
||||
__at_align32__ int32_t tmp_values[size];
|
||||
__at_align32__ int32_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int32_t tmp_values[size];
|
||||
__at_align32__ int32_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
|
||||
}
|
||||
@ -186,25 +189,19 @@ struct Vec256<int32_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi32(values, other.values);
|
||||
}
|
||||
Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi32(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi32(values, other.values));
|
||||
}
|
||||
Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
|
||||
return _mm256_cmpgt_epi32(other.values, values);
|
||||
}
|
||||
Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi32(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi32(values, other.values));
|
||||
}
|
||||
Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
|
||||
return _mm256_cmpgt_epi32(values, other.values);
|
||||
}
|
||||
Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi32(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi32(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
@ -212,13 +209,17 @@ template <>
|
||||
void convert(const int32_t *src, float *dst, int64_t n) {
|
||||
int64_t i;
|
||||
// int32_t and float have same size
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
|
||||
auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
|
||||
auto output_vec = _mm256_cvtepi32_ps(input_vec);
|
||||
_mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
|
||||
}
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = static_cast<float>(src[i]);
|
||||
}
|
||||
@ -228,13 +229,17 @@ template <>
|
||||
void convert(const int32_t *src, double *dst, int64_t n) {
|
||||
int64_t i;
|
||||
// int32_t has half the size of double
|
||||
#pragma unroll
|
||||
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
|
||||
auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
|
||||
auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
|
||||
_mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
|
||||
}
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = static_cast<double>(src[i]);
|
||||
}
|
||||
@ -242,7 +247,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {
|
||||
|
||||
template <>
|
||||
struct Vec256<int16_t> : public Vec256i {
|
||||
static constexpr int size = 16;
|
||||
static constexpr int size() {
|
||||
return 16;
|
||||
}
|
||||
using Vec256i::Vec256i;
|
||||
Vec256() {}
|
||||
Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
|
||||
@ -255,7 +262,7 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
}
|
||||
template <int64_t mask>
|
||||
static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
a.store(tmp_values);
|
||||
if (mask & 0x01)
|
||||
tmp_values[0] = _mm256_extract_epi16(b.values, 0);
|
||||
@ -303,7 +310,7 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
|
||||
}
|
||||
static Vec256<int16_t>
|
||||
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
|
||||
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
return a;
|
||||
@ -344,15 +351,15 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
void store(void* ptr, int count = size) const {
|
||||
if (count == size) {
|
||||
void store(void* ptr, int count = size()) const {
|
||||
if (count == size()) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
} else if (count > 0) {
|
||||
__at_align32__ int16_t tmp_values[size];
|
||||
__at_align32__ int16_t tmp_values[size()];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
|
||||
}
|
||||
@ -366,25 +373,19 @@ struct Vec256<int16_t> : public Vec256i {
|
||||
return _mm256_cmpeq_epi16(values, other.values);
|
||||
}
|
||||
Vec256<int16_t> operator!=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto eq = _mm256_cmpeq_epi16(values, other.values);
|
||||
return _mm256_xor_si256(zero, eq); // invert
|
||||
return invert(_mm256_cmpeq_epi16(values, other.values));
|
||||
}
|
||||
Vec256<int16_t> operator<(const Vec256<int16_t>& other) const {
|
||||
return _mm256_cmpgt_epi16(other.values, values);
|
||||
}
|
||||
Vec256<int16_t> operator<=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto gt = _mm256_cmpgt_epi16(values, other.values);
|
||||
return _mm256_xor_si256(zero, gt); // invert
|
||||
return invert(_mm256_cmpgt_epi16(values, other.values));
|
||||
}
|
||||
Vec256<int16_t> operator>(const Vec256<int16_t>& other) const {
|
||||
return _mm256_cmpgt_epi16(values, other.values);
|
||||
}
|
||||
Vec256<int16_t> operator>=(const Vec256<int16_t>& other) const {
|
||||
auto zero = _mm256_set1_epi64x(0);
|
||||
auto lt = _mm256_cmpgt_epi16(other.values, values);
|
||||
return _mm256_xor_si256(zero, lt); // invert
|
||||
return invert(_mm256_cmpgt_epi16(other.values, values));
|
||||
}
|
||||
};
|
||||
|
||||
@ -454,11 +455,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
|
||||
|
||||
template <typename T>
|
||||
Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
T values_a[Vec256<T>::size];
|
||||
T values_b[Vec256<T>::size];
|
||||
T values_a[Vec256<T>::size()];
|
||||
T values_b[Vec256<T>::size()];
|
||||
a.store(values_a);
|
||||
b.store(values_b);
|
||||
for (int i = 0; i != Vec256<T>::size; i++) {
|
||||
for (int i = 0; i != Vec256<T>::size(); i++) {
|
||||
values_a[i] /= values_b[i];
|
||||
}
|
||||
return Vec256<T>::loadu(values_a);
|
||||
|
@ -97,9 +97,7 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
|
||||
THCState_getCurrentDeviceProperties(globalContext().getTHCState());
|
||||
// NOTE: extra parentheses around numbers disable clang warnings about
|
||||
// dead code
|
||||
return (
|
||||
(CUDNN_VERSION >= (6021)) ||
|
||||
(CUDNN_VERSION >= (6000) && prop->major >= 5));
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
@ -9,45 +9,6 @@
|
||||
#include "ATen/cuda/ATenCUDAGeneral.h"
|
||||
#include <cuda.h>
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
|
||||
#include <curand_kernel.h>
|
||||
|
||||
/*
|
||||
Note [cuDNN dropout descriptor initialization]
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In most cases, setting descriptors in cuDNN is cheap (e.g.,
|
||||
cudnnSetTensorNdDescriptor). However, this is not the case for
|
||||
cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an
|
||||
expensive precomputation to initialize the random number generator states. In
|
||||
cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor,
|
||||
which means that law-abiding clients were expected to generate a dropout
|
||||
descriptor once and cache it. However, our ATen interface is (1) stateless (so
|
||||
we can't cache the descriptors) and (2) does not accept arbitrary user types in
|
||||
its interface (so we can't pass the descriptor in). This puts us in a pickle.
|
||||
|
||||
In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which
|
||||
forgoes the expensive initialization process, and can initialize the
|
||||
descriptor with a pre-initialized state CUDA tensor. This is great, because
|
||||
it means we can simply pass in the state tensor and then initialize the
|
||||
descriptor internally. Unfortunately, this function is not available in
|
||||
cuDNN 6.
|
||||
|
||||
To work around this, we break the cuDNN abstraction barrier, and hard-code
the struct layout of the underlying dropout descriptor. With this struct,
|
||||
we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great!
|
||||
*/
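A hedged sketch (not part of this diff) of the two initialization paths the note contrasts; the handle and a GPU state buffer sized via cudnnDropoutGetStatesSize are assumed to exist:

#include <cudnn.h>

void init_dropout_descriptor(cudnnHandle_t handle, void* states, size_t state_bytes) {
  cudnnDropoutDescriptor_t desc;
  cudnnCreateDropoutDescriptor(&desc);
  // cuDNN 6 path: expensive, (re)seeds the RNG states from scratch on every call.
  cudnnSetDropoutDescriptor(desc, handle, /*dropout=*/0.5f, states, state_bytes, /*seed=*/0);
  // cuDNN 7 path: cheap, attaches RNG states that were initialized earlier.
  cudnnRestoreDropoutDescriptor(desc, handle, /*dropout=*/0.5f, states, state_bytes, /*seed=*/0);
  cudnnDestroyDropoutDescriptor(desc);
}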
|
||||
|
||||
// Reverse engineered from cuDNN 6, see Note [cuDNN dropout descriptor initialization]
|
||||
struct cudnnDropoutStruct {
|
||||
float dropout;
|
||||
int nstates;
|
||||
void * states;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
// TODO: Add constructors for all of the descriptors
|
||||
@ -193,12 +154,10 @@ struct AT_CUDA_API ConvolutionDescriptor
|
||||
if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
|
||||
CUDNN_CROSS_CORRELATION, mathType));
|
||||
#if CUDNN_VERSION >= 7000
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
|
||||
if(dataType == CUDNN_DATA_HALF)
|
||||
AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@ -212,35 +171,6 @@ struct AT_CUDA_API SpatialTransformerDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
|
||||
// See Note [cuDNN dropout descriptor initialization]
|
||||
inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
|
||||
cudnnDropoutDescriptor_t dropoutDesc,
|
||||
cudnnHandle_t handle,
|
||||
float dropout,
|
||||
void *states,
|
||||
size_t stateSizeInBytes,
|
||||
unsigned long long seed) {
|
||||
// Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends.
|
||||
// This is not entirely accurate but is good enough to catch some API
|
||||
// uses which would not be compatible in cuDNN 7. Feel free to fix
|
||||
// this if you notice something is wrong.
|
||||
if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE;
|
||||
if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE;
|
||||
size_t expectedStateSizeInBytes;
|
||||
// State size will differ depending on size of GPU
|
||||
auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes);
|
||||
if (ret != CUDNN_STATUS_SUCCESS) return ret;
|
||||
if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
|
||||
dropoutDesc->dropout = dropout;
|
||||
dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
|
||||
dropoutDesc->states = states;
|
||||
return CUDNN_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif // CUDNN_VERSION
|
||||
|
||||
struct AT_CUDA_API DropoutDescriptor
|
||||
: public Descriptor<cudnnDropoutStruct,
|
||||
&cudnnCreateDropoutDescriptor,
|
||||
@ -304,7 +234,7 @@ struct AT_CUDA_API RNNDescriptor
|
||||
mode,
|
||||
algo,
|
||||
datatype));
|
||||
#if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000
|
||||
#if CUDA_VERSION >= 9000
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
if (prop->major >= 7) {
|
||||
if (datatype == CUDNN_DATA_HALF) {
|
||||
@ -319,8 +249,6 @@ struct AT_CUDA_API RNNDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#if CUDNN_VERSION >= 7000
|
||||
|
||||
struct AT_CUDA_API CTCLossDescriptor
|
||||
: public Descriptor<cudnnCTCLossStruct,
|
||||
&cudnnCreateCTCLossDescriptor,
|
||||
@ -331,8 +259,6 @@ struct AT_CUDA_API CTCLossDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
union Constant
|
||||
{
|
||||
float f;
|
||||
|
@ -168,8 +168,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
|
||||
prelu_cpu_kernel_multi_weights<scalar_t>(
|
||||
@ -295,8 +295,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
|
||||
prelu_cpu_backward_kernel_multi_weights<scalar_t>(
|
||||
|
@ -152,10 +152,15 @@ std::tuple<Tensor, Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
|
||||
|
||||
// Supports arbitrary batch dimensions for self and A
|
||||
std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
|
||||
if (self.dim() <= 2 && A.dim() <= 2) {
|
||||
AT_CHECK(self.dim() >= 2,
|
||||
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
|
||||
AT_CHECK(A.dim() >= 2,
|
||||
"A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
|
||||
if (self.dim() == 2 && A.dim() == 2) {
|
||||
// TODO: #7102: It's not necessary to have gesv (single) bindings for both
|
||||
// TH and ATen. We should remove the TH gesv bindings, especially
|
||||
// since the lapackGesv function is already in ATen.
|
||||
linearSolveCheckInputs(self, A); // Checks square shape of A, and compatibility of self and A
|
||||
return at::_th_gesv_single(self, A);
|
||||
}
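A hedged usage sketch of the batched behaviour the comment above describes (shapes are illustrative, not part of this diff):

#include <ATen/ATen.h>
#include <tuple>

void gesv_batched_example() {
  auto A = at::rand({3, 5, 5});        // batch of three 5x5 systems
  auto b = at::rand({3, 5, 2});        // matching batch of right-hand sides
  at::Tensor x, lu;
  std::tie(x, lu) = at::gesv(b, A);    // x: (3, 5, 2), solving A[i] * x[i] = b[i]
}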
|
||||
|
||||
@ -350,20 +355,12 @@ Tensor cholesky(const Tensor &self, bool upper) {
|
||||
}
|
||||
squareCheckInputs(self);
|
||||
|
||||
// TODO: (#14071) Once `triu`, `tril` is implemented for batched tensors,
|
||||
// this can be simplified. Currently, we are zero-ing out values in the
|
||||
// batch of matrices by using a mask and the `where` function.
|
||||
// The simplification with batched `triu` and `tril` would be this:
|
||||
// if (upper) {
|
||||
// return raw_cholesky_output.triu();
|
||||
// } else {
|
||||
// return raw_cholesky_output.tril();
|
||||
// }
|
||||
auto raw_cholesky_output = at::_cholesky_helper(self, upper);
|
||||
int64_t n = self.size(-1);
|
||||
auto indices = at::ones({n, n}, self.options().dtype(at::kByte));
|
||||
indices = upper ? indices.tril(-1).expand_as(self) : indices.triu(1).expand_as(self);
|
||||
return at::where(indices, at::zeros({}, self.options()), raw_cholesky_output);
|
||||
if (upper) {
|
||||
return raw_cholesky_output.triu_();
|
||||
} else {
|
||||
return raw_cholesky_output.tril_();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
|
||||
@ -374,4 +371,136 @@ Tensor& cholesky_out(Tensor &result, const Tensor &self, bool upper) {
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool inplace, bool upper>
|
||||
static void apply_triu_tril_single(
|
||||
scalar_t* result, scalar_t* self,
|
||||
int64_t k, int64_t n, int64_t m,
|
||||
int64_t res_row_stride, int64_t res_col_stride,
|
||||
int64_t self_row_stride, int64_t self_col_stride) {
|
||||
|
||||
constexpr int64_t zero = 0;
|
||||
int64_t i;
|
||||
|
||||
if (upper) {
|
||||
#pragma omp parallel for private(i)
|
||||
for (i = 0; i < n; i++) {
|
||||
for (int64_t j = 0; j < std::min(m, i + k); j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = 0;
|
||||
}
|
||||
if (!inplace) { // copy the rest of the self if not inplace
|
||||
for (int64_t j = std::max(zero, i + k); j < m; j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for private(i)
|
||||
for (i = 0; i < n; i++) {
|
||||
for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = 0;
|
||||
}
|
||||
if (!inplace) { // copy the rest of the self if not inplace
|
||||
for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
|
||||
result[i * res_row_stride + j * res_col_stride] = self[i * self_row_stride + j * self_col_stride];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool inplace, bool upper>
|
||||
void apply_triu_tril(Tensor& result, const Tensor& self, int64_t k) {
|
||||
auto n = self.size(-2);
|
||||
auto m = self.size(-1);
|
||||
auto self_data = self.data<scalar_t>();
|
||||
auto self_stride = self.dim() > 2 ? self.stride(-3) : 1;
|
||||
auto batchsize = batchCount(self);
|
||||
auto self_row_stride = self.stride(-2);
|
||||
auto self_column_stride = self.stride(-1);
|
||||
|
||||
auto result_data = result.data<scalar_t>();
|
||||
int64_t result_stride, result_row_stride, result_column_stride;
|
||||
if (result_data != self_data) {
|
||||
result_stride = result.dim() > 2 ? result.stride(-3) : 1;
|
||||
result_row_stride = result.stride(-2);
|
||||
result_column_stride = result.stride(-1);
|
||||
} else {
|
||||
result_stride = self_stride;
|
||||
result_row_stride = self_row_stride;
|
||||
result_column_stride = self_column_stride;
|
||||
}
|
||||
|
||||
int64_t b;
|
||||
#pragma omp parallel for private(b)
|
||||
for (b = 0; b < batchsize; b++) {
|
||||
scalar_t* self_batch = &self_data[b * self_stride];
|
||||
scalar_t* result_batch = &result_data[b * result_stride];
|
||||
apply_triu_tril_single<scalar_t, inplace, upper>(
|
||||
result_batch, self_batch, k, n, m,
|
||||
result_row_stride, result_column_stride, self_row_stride, self_column_stride);
|
||||
}
|
||||
}
|
||||
|
||||
Tensor tril(const Tensor& self, int64_t k) {
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
at::tril_out(result, self, k);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& tril_cpu_(Tensor &self, int64_t k) {
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
|
||||
apply_triu_tril<scalar_t, true, false>(self, self, k);
|
||||
});
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor& tril_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
|
||||
apply_triu_tril<scalar_t, false, false>(result, self_c, k);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor triu(const Tensor& self, int64_t k) {
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
at::triu_out(result, self, k);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& triu_cpu_(Tensor &self, int64_t k) {
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
|
||||
apply_triu_tril<scalar_t, true, true>(self, self, k);
|
||||
});
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor& triu_cpu_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
|
||||
apply_triu_tril<scalar_t, false, true>(result, self_c, k);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
}} // namespace at::native
|
||||
|
@ -378,8 +378,8 @@ at::Tensor _convolution(
|
||||
AT_CHECK(!bias.defined() || (input.type() == bias.type()),
|
||||
"Input type (", input.type().toString(), ") and bias type (", bias.type().toString(),
|
||||
") should be the same");
|
||||
|
||||
output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
|
||||
output = at::mkldnn_convolution(input, weight.contiguous(), bias.defined() ? bias.contiguous() : bias,
|
||||
params.padding, params.stride, params.dilation, params.groups);
|
||||
#endif
|
||||
} else {
|
||||
if (params.groups == 1) {
|
||||
|
@ -110,7 +110,7 @@ Tensor & eq_(Tensor& self, Scalar other) {
|
||||
}
|
||||
|
||||
Tensor & eq_(Tensor& self, const Tensor & other) {
|
||||
return at::_th_ge_(self, other);
|
||||
return at::_th_eq_(self, other);
|
||||
}
|
||||
|
||||
Tensor & ne_(Tensor& self, Scalar other) {
|
||||
@ -129,14 +129,6 @@ Tensor & atan2_(Tensor& self, const Tensor & other) {
|
||||
return at::_th_atan2_(self, other);
|
||||
}
|
||||
|
||||
Tensor & tril_(Tensor& self, int64_t diagonal) {
|
||||
return at::_th_tril_(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & triu_(Tensor& self, int64_t diagonal) {
|
||||
return at::_th_triu_(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & digamma_(Tensor& self) {
|
||||
return at::_th_digamma_(self);
|
||||
}
|
||||
@ -271,22 +263,6 @@ Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) {
|
||||
return at::_th_cross(self, other, dim);
|
||||
}
|
||||
|
||||
Tensor & triu_out(Tensor & result, const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_triu_out(result, self, diagonal);
|
||||
}
|
||||
|
||||
Tensor triu(const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_triu(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor & tril_out(Tensor & result, const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_tril_out(result, self, diagonal);
|
||||
}
|
||||
|
||||
Tensor tril(const Tensor & self, int64_t diagonal) {
|
||||
return at::_th_tril(self, diagonal);
|
||||
}
|
||||
|
||||
Tensor trace(const Tensor & self) {
|
||||
return at::_th_trace(self);
|
||||
}
|
||||
|
@ -41,6 +41,28 @@ static inline int64_t matrixStride(const Tensor& batched_matrices) {
|
||||
return batched_matrices.size(-1) * batched_matrices.size(-2);
|
||||
}
|
||||
|
||||
/* Checks a necessary property for the triu and tril implementations, hence the name.
|
||||
 * Here batch contiguity is checked for tensors with more than 3 dimensions.
 * Contiguous tensors and tensors with at most 3 dimensions pass this check.
|
||||
*/
|
||||
static inline bool checkTrilTriuBatchContiguous(const Tensor& tensor) {
|
||||
// Complete contiguity is the most desired property, which is why
|
||||
// we return true if the tensor is contiguous
|
||||
if (tensor.is_contiguous()) return true;
|
||||
|
||||
int64_t dims = tensor.dim();
|
||||
|
||||
// Tensors with dimension less than 4 are handled by default
|
||||
if (dims <= 3) return true;
|
||||
|
||||
int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
|
||||
for (int64_t i = dims - 3; i >= 0; i--) {
|
||||
if (expected_stride != tensor.stride(i)) return false;
|
||||
expected_stride *= tensor.size(i);
|
||||
}
|
||||
return true;
|
||||
}
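A short worked example of the property being checked (assuming the ATen tensor API; shapes are illustrative, not part of this diff):

#include <ATen/ATen.h>

void batch_contiguity_example() {
  auto a = at::rand({2, 3, 4, 5});   // contiguous, strides (60, 20, 5, 1): passes trivially
  auto b = a.transpose(0, 1);        // shape (3, 2, 4, 5), strides (20, 60, 5, 1)
  // The walk starts from expected_stride = 4 * 5 = 20, but b.stride(1) == 60,
  // so b fails the check and the tril/triu callers fall back to b.contiguous().
}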
|
||||
|
||||
// Returns the epsilon value for floating types except half
|
||||
static inline double _get_epsilon(const ScalarType& sc_type) {
|
||||
switch (sc_type) {
|
||||
|
@ -422,6 +422,8 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
|
||||
std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const Tensor& weight, const Tensor& bias,
|
||||
const Tensor& running_mean, const Tensor& running_var,
|
||||
bool train, double momentum, double eps) {
|
||||
checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
|
||||
|
||||
return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
|
||||
return batch_norm_cpu_template<scalar_t>(self, weight, bias, running_mean, running_var, train, momentum, eps);
|
||||
});
|
||||
|
@ -21,7 +21,6 @@ namespace native {
|
||||
|
||||
DEFINE_DISPATCH(sum_stub);
|
||||
DEFINE_DISPATCH(prod_stub);
|
||||
DEFINE_DISPATCH(norm_kernel);
|
||||
|
||||
static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
|
||||
ScalarType scalarType = self.type().scalarType();
|
||||
@ -410,16 +409,7 @@ Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim))
|
||||
return result;
|
||||
if (self.is_contiguous() && result.is_contiguous()) {
|
||||
_dimreduce_setup(result, self, dim);
|
||||
norm_kernel(kCPU, result, self, p, dim);
|
||||
if (!keepdim) {
|
||||
result.squeeze_(dim);
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return at::_th_norm_out(result, self, p, dim, keepdim);
|
||||
}
|
||||
return at::_th_norm_out(result, self, p, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) {
|
||||
@ -445,17 +435,7 @@ Tensor _norm(const Tensor &self, Scalar p) {
|
||||
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
|
||||
"norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
|
||||
AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes");
|
||||
if (self.is_cuda()) {
|
||||
return at::_th_norm(self, p);
|
||||
} else {
|
||||
if (self.is_contiguous()) {
|
||||
Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type());
|
||||
norm_kernel(kCPU, result, self, p, c10::nullopt);
|
||||
return result;
|
||||
} else {
|
||||
return at::_th_norm(self, p);
|
||||
}
|
||||
}
|
||||
return at::_th_norm(self, p);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,11 +34,11 @@ Tensor _bincount_cpu_template(
|
||||
int64_t nbins = static_cast<int64_t>(*self.max().data<input_t>()) + 1L;
|
||||
nbins = std::max(nbins, minlength); // at least minlength # of bins
|
||||
|
||||
const input_t* self_p = self.contiguous().data<input_t>();
|
||||
const input_t* self_p = self.data<input_t>();
|
||||
if (has_weights) {
|
||||
output = native::zeros({nbins}, weights.options());
|
||||
weights_t* output_p = output.data<weights_t>();
|
||||
const weights_t* weights_p = weights.contiguous().data<weights_t>();
|
||||
const weights_t* weights_p = weights.data<weights_t>();
|
||||
for (int64_t i = 0; i < self.size(0); i++) {
|
||||
output_p[self_p[i]] += weights_p[i];
|
||||
}
|
||||
@ -58,9 +58,9 @@ _bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
|
||||
return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
|
||||
const auto scalar = weights.type().scalarType();
|
||||
if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
|
||||
return _bincount_cpu_template<scalar_t, float>(self, weights, minlength);
|
||||
return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
|
||||
return _bincount_cpu_template<scalar_t, double>(
|
||||
self, weights.toType(CPU(kDouble)), minlength);
|
||||
self.contiguous(), weights.contiguous().toType(CPU(kDouble)), minlength);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -385,6 +385,9 @@ void TensorIterator::serial_for_each(const loop_t& loop, Range range) const {
|
||||
}
|
||||
|
||||
void TensorIterator::serial_for_each(const loop2d_t& loop, Range range) const {
|
||||
if (range.size() == 0) {
|
||||
return;
|
||||
}
|
||||
auto strides = get_strides();
|
||||
while (strides.size() < 2 * ntensors()) {
|
||||
strides.push_back(0);
|
||||
@ -677,8 +680,10 @@ DimCounter::DimCounter(IntList shape, Range range)
|
||||
int64_t ndim = values.size();
|
||||
for (int dim = 0; dim < ndim; dim++) {
|
||||
int64_t size = shape[dim];
|
||||
values[dim] = linear_offset % size;
|
||||
linear_offset /= size;
|
||||
if (size > 0) {
|
||||
values[dim] = linear_offset % size;
|
||||
linear_offset /= size;
|
||||
}
|
||||
}
|
||||
AT_ASSERT(linear_offset == 0);
|
||||
}
|
||||
|
@ -101,14 +101,14 @@ struct PDist {
|
||||
|
||||
scalar_t * const res_start = result.data<scalar_t>();
|
||||
int64_t combs = result.numel(); // n * (n - 1) / 2
|
||||
const Vec pvec(p);
|
||||
|
||||
// We conceptually iterate over tuples of (i, j, k) where i is the first
|
||||
// vector from the input, j is the second, and k is the result index. This
|
||||
// parallelizes over the range of k and infers what i and j are from the
|
||||
// value of k.
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) {
|
||||
float n2 = n - .5;
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=](int64_t k, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
double n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
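A hedged sketch (not part of this diff) checking the index algebra used here: the condensed index of the pair (i, j) with i < j is k = n*i - i*(i+1)/2 + (j - i - 1), and the kernel inverts it with the square-root expression above:

#include <cassert>
#include <cmath>
#include <cstdint>

void pdist_index_roundtrip(int64_t n) {
  int64_t k = 0;
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = i + 1; j < n; ++j, ++k) {
      double n2 = n - 0.5;
      int64_t i_rec = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
      int64_t j_rec = k - n * i_rec + i_rec * (i_rec + 1) / 2 + i_rec + 1;
      assert(i_rec == i && j_rec == j);   // same inversion the kernel relies on
    }
  }
}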
|
||||
@ -149,7 +149,7 @@ struct PDist {
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) {
|
||||
inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
|
||||
for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
|
||||
|
||||
const Vec self_vec_i = Vec::loadu(self_i, count);
|
||||
@ -177,7 +177,6 @@ struct PDist {
|
||||
const int64_t n = self.size(0);
|
||||
const int64_t m = self.size(1);
|
||||
const int64_t gs = grad.stride(0);
|
||||
const Vec pvec(p);
|
||||
|
||||
const scalar_t * const grad_start = grad.data<scalar_t>();
|
||||
const scalar_t * const dist_start = dist.data<scalar_t>();
|
||||
@ -187,17 +186,19 @@ struct PDist {
|
||||
// The only way to parallelize and avoid locking requires parallelizing
|
||||
// over the columns of the input, i.e. we compute the gradient for the
|
||||
// first section of each vector independently of the second section, etc.
|
||||
at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) {
|
||||
const scalar_t * self_l = self_start + l * Vec::size;
|
||||
scalar_t * res_l = res_start + l * Vec::size;
|
||||
at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [=](int64_t l, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
|
||||
for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) {
|
||||
const scalar_t * self_l = self_start + l * Vec::size();
|
||||
scalar_t * res_l = res_start + l * Vec::size();
|
||||
|
||||
for (const scalar_t * const res_end = res_start + end * Vec::size(); res_l != res_end; self_l += Vec::size(), res_l += Vec::size()) {
|
||||
backward_down_column<F>(self_l, res_l, grad_start, dist_start, pvec, n, m, gs);
|
||||
}
|
||||
});
|
||||
const int64_t remainder = m % Vec::size;
|
||||
const int64_t remainder = m % Vec::size();
|
||||
if (remainder) {
|
||||
backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder);
|
||||
backward_down_column<F>(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, Vec(p), n, m, gs, remainder);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -308,7 +308,9 @@ static inline void
|
||||
mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
|
||||
const int_same_size_t<scalar_t> *offsets,
|
||||
const int_same_size_t<scalar_t> *mask, int64_t len) {
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
if (mask[i] & 0x01) {
|
||||
base_addr[offsets[i]] += src[i];
|
||||
@ -429,7 +431,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
|
||||
auto i_sw_offset = i_nw_offset + iVec(inp_sH);
|
||||
auto i_se_offset = i_sw_offset + iVec(inp_sW);
|
||||
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t c = 0; c < C; ++c) {
|
||||
auto inp_slice_C_ptr = inp_slice[c].data();
|
||||
|
||||
@ -480,28 +484,30 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
|
||||
// So we store the necessary vectors to temporary arrays and use the helper
|
||||
// mask_scatter_add defined above.
|
||||
|
||||
integer_t i_gInp_nw_offset_arr[iVec::size];
|
||||
integer_t i_gInp_ne_offset_arr[iVec::size];
|
||||
integer_t i_gInp_sw_offset_arr[iVec::size];
|
||||
integer_t i_gInp_se_offset_arr[iVec::size];
|
||||
integer_t i_gInp_nw_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_ne_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_sw_offset_arr[iVec::size()];
|
||||
integer_t i_gInp_se_offset_arr[iVec::size()];
|
||||
i_gInp_nw_offset.store(i_gInp_nw_offset_arr);
|
||||
i_gInp_ne_offset.store(i_gInp_ne_offset_arr);
|
||||
i_gInp_sw_offset.store(i_gInp_sw_offset_arr);
|
||||
i_gInp_se_offset.store(i_gInp_se_offset_arr);
|
||||
|
||||
integer_t i_nw_mask_arr[iVec::size];
|
||||
integer_t i_ne_mask_arr[iVec::size];
|
||||
integer_t i_sw_mask_arr[iVec::size];
|
||||
integer_t i_se_mask_arr[iVec::size];
|
||||
integer_t i_nw_mask_arr[iVec::size()];
|
||||
integer_t i_ne_mask_arr[iVec::size()];
|
||||
integer_t i_sw_mask_arr[iVec::size()];
|
||||
integer_t i_se_mask_arr[iVec::size()];
|
||||
nw_mask.store(i_nw_mask_arr);
|
||||
ne_mask.store(i_ne_mask_arr);
|
||||
sw_mask.store(i_sw_mask_arr);
|
||||
se_mask.store(i_se_mask_arr);
|
||||
|
||||
scalar_t gInp_corner_arr[Vec::size];
|
||||
scalar_t gInp_corner_arr[Vec::size()];
|
||||
|
||||
auto gx = Vec(0), gy = Vec(0);
|
||||
#pragma unroll
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (int64_t c = 0; c < C; ++c) {
|
||||
auto inp_slice_C_ptr = inp_slice[c].data();
|
||||
auto gInp_slice_C_ptr = gInp_slice[c].data();
|
||||
@ -533,7 +539,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear, padding>
gx = gx * gx_mult;
gy = gy * gy_mult;

constexpr int64_t step = Vec::size;
constexpr int64_t step = Vec::size();
auto interleaved_gGrid = interleave2(gx, gy);
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
@ -592,7 +598,9 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>
auto out_ptr = out_slice.data() + offset;
auto out_sC = out_slice.stride(0);
auto inp_slice_ptr = inp_slice.data();
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
// mask_gather zeros out the mask, so we need to make a copy
auto mask_copy = mask;
@ -622,12 +630,14 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Nearest, padding>

auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous

integer_t mask_arr[iVec::size];
integer_t mask_arr[iVec::size()];
i_mask.store(mask_arr);
integer_t gInp_offset_arr[iVec::size];
integer_t gInp_offset_arr[iVec::size()];
i_gInp_offset.store(gInp_offset_arr);

#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t c = 0; c < C; ++c) {
mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
gInp_offset_arr, mask_arr, len);
@ -656,7 +666,7 @@ static inline void grid_sample_2d_grid_slice_iterator(

using Vec = Vec256<scalar_t>;
using iVec = Vec256<int_same_size_t<scalar_t>>;
constexpr int64_t step = Vec::size;
constexpr int64_t step = Vec::size();

// Loop over each output pixel in grid.
// We consider the following three cases (after slicing out the batch
@ -733,12 +743,16 @@ static inline void grid_sample_2d_grid_slice_iterator(
auto spatial_offset = 0;
auto i_offsets_delta = iVec(grid_sW * step);

#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t h = 0; h < out_H; h++) {
auto grid_ptr_x = grid_ptr + h * grid_sH;
auto grid_ptr_y = grid_ptr_x + grid_sCoor;
auto i_offsets = iVec::arange(0, grid_sW);
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t w = 0; w < out_W; w += step) {
auto len = std::min(step, out_W - w);
if (len < step) {
@ -80,15 +80,15 @@ template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_binary_loop(char** data, int64_t n, func_t op, vec_func_t vop) {
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a1, b1);
|
||||
auto out2 = vop(a2, b2);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), sizeof(scalar_t) };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -100,13 +100,13 @@ static inline void vectorized_binary_loop_s1(char** data, int64_t n, func_t op,
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
auto a = Vec(*(scalar_t*)in1_ptr);
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto b1 = Vec::loadu(in2_ptr + i * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto b2 = Vec::loadu(in2_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a, b1);
|
||||
auto out2 = vop(a, b2);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), 0, sizeof(scalar_t) };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -118,13 +118,13 @@ static inline void vectorized_binary_loop_s2(char** data, int64_t n, func_t op,
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
int64_t i = 0;
|
||||
auto b = Vec(*(scalar_t*)in2_ptr);
|
||||
for (; i <= n - 2 * Vec::size; i += 2 * Vec::size) {
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto a1 = Vec::loadu(in1_ptr + i * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
auto a2 = Vec::loadu(in1_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
auto out1 = vop(a1, b);
|
||||
auto out2 = vop(a2, b);
|
||||
out1.store(out_ptr + i * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size) * sizeof(scalar_t));
|
||||
out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
int64_t strides[] = { sizeof(scalar_t), sizeof(scalar_t), 0 };
|
||||
binary_loop(data, strides, i, n, op);
|
||||
@ -137,27 +137,27 @@ static inline void reduction128(char** data, int64_t n, int64_t stride, func_t o
|
||||
char* in_ptr = data[1];
|
||||
Vec acc[4];
|
||||
for (int j = 0; j < 4; j++) {
|
||||
acc[j] = Vec::loadu(in_ptr + j * Vec::size * sizeof(scalar_t));
|
||||
acc[j] = Vec::loadu(in_ptr + j * Vec::size() * sizeof(scalar_t));
|
||||
}
|
||||
for (int64_t i = 1; i < n; i++) {
|
||||
const char* ptr = in_ptr + stride * i;
|
||||
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size * sizeof(scalar_t))));
|
||||
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size * sizeof(scalar_t))));
|
||||
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size * sizeof(scalar_t))));
|
||||
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size * sizeof(scalar_t))));
|
||||
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
|
||||
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
|
||||
}
|
||||
if (reduce) {
|
||||
scalar_t buffer[Vec::size];
|
||||
scalar_t buffer[Vec::size()];
|
||||
acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
|
||||
acc[0].store(buffer);
|
||||
for (int j = 1; j < Vec::size; j++) {
|
||||
for (int j = 1; j < Vec::size(); j++) {
|
||||
buffer[0] = op(buffer[0], buffer[j]);
|
||||
}
|
||||
auto dst = (scalar_t*)out_ptr;
|
||||
*dst = op(*dst, buffer[0]);
|
||||
} else {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
auto dst = out_ptr + j * Vec::size * sizeof(scalar_t);
|
||||
auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
|
||||
acc[j] = vop(acc[j], Vec::loadu(dst));
|
||||
acc[j].store(dst);
|
||||
}
|
||||
@ -177,14 +177,14 @@ static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int
|
||||
template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
|
||||
VEC_HEADER(func_t)
|
||||
int64_t vector_stride = 4 * Vec::size * sizeof(scalar_t);
|
||||
int64_t count = n / (4 * Vec::size);
|
||||
int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
int64_t count = n / (4 * Vec::size());
|
||||
if (count > 0) {
|
||||
reduction128(data, count, vector_stride, op, vop, /*reduce=*/true);
|
||||
}
|
||||
char* ptrs[3] = { data[0], data[0], data[1] };
|
||||
int64_t strides[] = { 0, 0, sizeof(scalar_t) };
|
||||
binary_loop(ptrs, strides, count * 4 * Vec::size, n, op);
|
||||
binary_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
|
||||
}
|
||||
|
||||
// computes the reduction out = op(out, in)
|
||||
@ -192,15 +192,15 @@ template <typename func_t, typename vec_func_t>
|
||||
static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
|
||||
VEC_HEADER(func_t)
|
||||
|
||||
// reduce down each column of 4 * Vec::size elements (128 bytes)
|
||||
// reduce down each column of 4 * Vec::size() elements (128 bytes)
|
||||
int64_t outer_stride[2] = { 128, 128 };
|
||||
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size), [&] {
|
||||
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
|
||||
reduction128(data, size0, inner_stride, op, vop, /*reduce=*/false);
|
||||
});
|
||||
|
||||
// reduce down the remaining columns
|
||||
int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
|
||||
int64_t remaining = size1 % (4 * Vec::size);
|
||||
int64_t remaining = size1 % (4 * Vec::size());
|
||||
UNARY_OUTER_LOOP(data, step, remaining, [&] {
|
||||
char* ptrs[3] = { data[0], data[0], data[1] };
|
||||
int64_t strides[] = { 0, 0, inner_stride };
|
||||
|
@ -31,180 +31,9 @@ static void prod_kernel_impl(TensorIterator& iter) {
|
||||
/*identity=*/1);
|
||||
});
|
||||
}
|
||||
|
||||
static inline int64_t round_down(int64_t a, int64_t m) {
|
||||
return a - (a % m);
|
||||
}
|
||||
|
||||
template<typename scalar_t>
|
||||
struct NormReduction {
|
||||
// reduction width in number of scalar elements
|
||||
static constexpr int WIDTH = 128 / sizeof(scalar_t);
|
||||
using Vec = Vec256<scalar_t>;
|
||||
|
||||
static void apply(
|
||||
Tensor& res,
|
||||
const Tensor& self,
|
||||
Scalar p,
|
||||
c10::optional<int64_t> dim) {
|
||||
auto out_ = res.data<scalar_t>();
|
||||
auto data_ = self.data<scalar_t>();
|
||||
auto numel = self.numel();
|
||||
float pval = 0.0;
|
||||
if (p.isIntegral()){
|
||||
pval = p.to<int64_t>();
|
||||
} else if (p.isFloatingPoint()) {
|
||||
pval = p.to<float>();
|
||||
}
|
||||
if (!dim.has_value()) {
|
||||
*out_ = reduce_all(data_, numel, pval);
|
||||
return;
|
||||
}
|
||||
int64_t n = self.size(*dim);
|
||||
int64_t stride = self.stride(*dim);
|
||||
// A contiguous tensor does not need to hold a meaningful stride
|
||||
// if the corresponding size is 1
|
||||
if (n == 1) {
|
||||
stride = 1;
|
||||
for (int64_t i = self.ndimension() - 1; i > *dim; i--) {
|
||||
stride *= self.size(i);
|
||||
}
|
||||
}
|
||||
int64_t batch = numel / n;
|
||||
parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) {
|
||||
for (int64_t bi = begin; bi < end; bi++) {
|
||||
int64_t b = bi / stride;
|
||||
int64_t i = bi % stride;
|
||||
const scalar_t* data = &data_[b * n * stride + i];
|
||||
out_[bi] = norm_reduce(data, n, stride, pval);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) {
|
||||
scalar_t sum = parallel_reduce(
|
||||
0,
|
||||
size,
|
||||
internal::GRAIN_SIZE,
|
||||
(scalar_t)0,
|
||||
[=](int64_t begin, int64_t end, scalar_t init) {
|
||||
const scalar_t* data = &data_[begin];
|
||||
int64_t n = end - begin;
|
||||
scalar_t result = norm_reduce(data, n, 1, pval);
|
||||
return result;
|
||||
},
|
||||
std::plus<scalar_t>());
|
||||
return sum;
|
||||
}
|
||||
|
||||
static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) {
scalar_t result = 0.0;
if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) {
int64_t n_rounded = round_down(n, WIDTH);
scalar_t result1 = norm_reduce128(data, n_rounded, pval);
scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval);
result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval);
} else {
result = norm_reduce_sequential(data, n, stride, pval);
}
return result;
}
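norm_reduce splits the input into a WIDTH-aligned prefix handled by norm_reduce128 and a sequential tail, then recombines the two partial norms as (r1^p + r2^p)^(1/p). For p > 0 this recombination is exact; the standalone sketch below (illustrative only, not part of the diff) checks it numerically against a direct computation.

#include <cmath>
#include <cstdio>
#include <vector>

// direct L-p norm of a vector, for comparison
static double lp_norm(const std::vector<double>& v, double p) {
  double s = 0;
  for (double x : v) s += std::pow(std::abs(x), p);
  return std::pow(s, 1.0 / p);
}

int main() {
  std::vector<double> x = {1, -2, 3, 4, -5, 6, 7, -8};
  const double p = 3.0;
  // split into a "vectorized" head and a sequential tail, as norm_reduce does
  std::vector<double> head(x.begin(), x.begin() + 6), tail(x.begin() + 6, x.end());
  double combined = std::pow(std::pow(lp_norm(head, p), p) + std::pow(lp_norm(tail, p), p), 1.0 / p);
  std::printf("%.12f %.12f\n", lp_norm(x, p), combined);  // identical up to rounding
  return 0;
}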
|
||||
|
||||
static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) {
|
||||
scalar_t result = 0.0;
|
||||
if (pval == 0) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += (data[k * stride] != 0.0);
|
||||
}
|
||||
} else if (pval == 1) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::abs(data[k * stride]);
|
||||
}
|
||||
} else if (pval == 2) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += data[k * stride] * data[k * stride];
|
||||
}
|
||||
result = std::sqrt(result);
|
||||
} else if (pval == 3) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]);
|
||||
}
|
||||
result = std::pow(result, 1.0/3);
|
||||
} else if (pval == INFINITY) {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result;
|
||||
}
|
||||
} else if (pval == -INFINITY) {
|
||||
result = INFINITY;
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result = std::abs(data[k * stride]) < result ? std::abs(data[k * stride]) : result;
|
||||
}
|
||||
} else {
|
||||
for (int64_t k = 0; k < n; k++) {
|
||||
result += std::pow(std::abs(data[k * stride]), pval);
|
||||
}
|
||||
result = std::pow(result, 1.0/pval);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Reduce down a column of WIDTH elements (128 bytes) with the given number n
|
||||
// n is already rounded by 128
|
||||
static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) {
|
||||
scalar_t result = 0.0;
|
||||
Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines)
|
||||
static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
|
||||
int64_t rows = n / WIDTH;
|
||||
if (pval == 1){
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + val.abs();
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pval == 2) {
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + val * val;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (pval == 3) {
|
||||
for (int row = 0; row < rows; row ++) {
|
||||
for (int j = 0; j != 4; j++) {
|
||||
auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]);
|
||||
acc[j] = acc[j] + (val * val * val).abs();
|
||||
}
|
||||
}
|
||||
}
|
||||
scalar_t buf[WIDTH] = {0};
|
||||
for (int j = 0; j != 4; j++) {
|
||||
acc[j].store(&buf[j * Vec::size]);
|
||||
}
|
||||
for (int i = 0; i < WIDTH; i++) {
|
||||
result += buf[i];
|
||||
}
|
||||
result = std::pow(result, 1.0/pval);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
static void norm_kernel_impl(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
Scalar p,
|
||||
c10::optional<int64_t> dim) {
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] {
|
||||
NormReduction<scalar_t>::apply(result, self, p, dim);
|
||||
});
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
REGISTER_DISPATCH(sum_stub, &sum_kernel_impl);
|
||||
REGISTER_DISPATCH(prod_stub, &prod_kernel_impl);
|
||||
REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
|
||||
|
||||
}} // namespace at::native
|
||||
|
@ -29,7 +29,7 @@ inline void _vec_log_softmax_lastdim(
|
||||
int64_t outer_size,
|
||||
int64_t dim_size) {
|
||||
using Vec = vec256::Vec256<scalar_t>;
|
||||
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
|
||||
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
|
||||
int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
|
||||
if (grain_size < CHUNK_SIZE)
|
||||
grain_size = CHUNK_SIZE;
|
||||
|
@ -37,9 +37,9 @@ template <>
|
||||
int64_t _sigmoid(float* x, float* y, int64_t size) {
|
||||
using Vec = Vec256<float>;
|
||||
int64_t i = 0;
|
||||
for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
|
||||
for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
|
||||
Vec ret = Vec::loadu(y + i);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size());
|
||||
ret = ret.neg();
|
||||
ret2 = ret2.neg();
|
||||
#if defined(__AVX2__) && !defined(_MSC_VER)
|
||||
@ -54,7 +54,7 @@ int64_t _sigmoid(float* x, float* y, int64_t size) {
|
||||
ret = ret.reciprocal();
|
||||
ret2 = ret2.reciprocal();
|
||||
ret.store(x + i);
|
||||
ret2.store(x + i + Vec::size);
|
||||
ret2.store(x + i + Vec::size());
|
||||
}
|
||||
return i;
|
||||
}
|
||||
@ -63,9 +63,9 @@ template <>
|
||||
int64_t _sigmoid(double* x, double* y, int64_t size) {
|
||||
using Vec = Vec256<double>;
|
||||
int64_t i = 0;
|
||||
for (; i < size - (size % (2 * Vec::size)); i += 2 * Vec::size) {
|
||||
for (; i < size - (size % (2 * Vec::size())); i += 2 * Vec::size()) {
|
||||
Vec ret = Vec::loadu(y + i);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size);
|
||||
Vec ret2 = Vec::loadu(y + i + Vec::size());
|
||||
ret = ret.neg();
|
||||
ret2 = ret2.neg();
|
||||
ret = ret.exp();
|
||||
@ -75,7 +75,7 @@ int64_t _sigmoid(double* x, double* y, int64_t size) {
|
||||
ret = ret.reciprocal();
|
||||
ret2 = ret2.reciprocal();
|
||||
ret.store(x + i);
|
||||
ret2.store(x + i + Vec::size);
|
||||
ret2.store(x + i + Vec::size());
|
||||
}
|
||||
return i;
|
||||
}
|
||||
@ -95,9 +95,9 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) {
|
||||
if (stridex == 1 && stridey == 1) {
|
||||
i = _sigmoid(x, y, size);
|
||||
}
|
||||
for (; i < size; i += Vec::size) {
|
||||
scalar_t buffer[Vec::size];
|
||||
int64_t width = Vec::size;
|
||||
for (; i < size; i += Vec::size()) {
|
||||
scalar_t buffer[Vec::size()];
|
||||
int64_t width = Vec::size();
|
||||
width = std::min(width, size - i);
|
||||
for (int64_t j = 0; j < width; j++) {
|
||||
buffer[j] = y[stridey * (i + j)];
|
||||
|
@ -82,8 +82,8 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
// config to run cuda kernel
|
||||
int64_t input_numel = input.numel();
|
||||
@ -198,8 +198,8 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
|
||||
input_stride1 = strides[1];
|
||||
}
|
||||
AT_CHECK(channel_size == weight_num,
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
|
||||
weight_num, channel_size);
|
||||
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
|
||||
" and channel size = ", channel_size, ".");
|
||||
|
||||
// config to run cuda kernel
|
||||
int64_t input_numel = input.numel();
|
||||
|
@ -376,6 +376,81 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool upper>
|
||||
__global__
|
||||
void triu_tril_kernel(
|
||||
scalar_t* result, scalar_t* self, int64_t k, int64_t N,
|
||||
int64_t res_batch_stride, int64_t res_row_stride, int64_t res_col_stride,
|
||||
int64_t self_batch_stride, int64_t self_row_stride, int64_t self_col_stride, int64_t self_ncol) {
|
||||
int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (linear_idx >= N) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t self_batch_idx = blockIdx.y;
|
||||
int64_t row = linear_idx / self_ncol;
|
||||
int64_t col = linear_idx % self_ncol;
|
||||
|
||||
bool mask = upper ? (col - row >= k) : (col - row <= k);
|
||||
|
||||
// Now compute the offset for the self and result tensor
|
||||
int64_t res_offset = self_batch_idx * res_batch_stride + row * res_row_stride + col * res_col_stride;
|
||||
int64_t self_offset = self_batch_idx * self_batch_stride + row * self_row_stride + col * self_col_stride;
|
||||
result[res_offset] = mask ? self[self_offset] : scalar_t(0);
|
||||
}
|
||||
|
||||
template <bool upper>
|
||||
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
|
||||
int64_t n_batches = batchCount(self), mat_size = self.size(-1) * self.size(-2),
|
||||
res_batch_stride = result.dim() > 2 ? result.stride(-3) : 1,
|
||||
res_row_stride = result.stride(-2), res_col_stride = result.stride(-1),
|
||||
self_batch_stride = self.dim() > 2 ? self.stride(-3) : 1,
|
||||
self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
|
||||
dim3 dim_block = cuda::getApplyBlock();
|
||||
dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
|
||||
AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), name, [&]{
|
||||
triu_tril_kernel<scalar_t, upper>
|
||||
<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
|
||||
result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
|
||||
res_batch_stride, res_row_stride, res_col_stride,
|
||||
self_batch_stride, self_row_stride, self_col_stride, self.size(-1));
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor& tril_cuda_(Tensor &self, int64_t k) {
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
return tril_cuda_out(self, self, k);
|
||||
}
|
||||
|
||||
Tensor& tril_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
return triu_tril_cuda_template<false>(result, self_c, k, "tril");
|
||||
}
|
||||
|
||||
Tensor& triu_cuda_(Tensor &self, int64_t k) {
|
||||
if (!checkTrilTriuBatchContiguous(self)) self = self.contiguous();
|
||||
return triu_cuda_out(self, self, k);
|
||||
}
|
||||
|
||||
Tensor& triu_cuda_out(Tensor &result, const Tensor& self, int64_t k) {
|
||||
if (result.sizes() != self.sizes()) {
|
||||
result.resize_as_(self);
|
||||
}
|
||||
if (self.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
|
||||
return triu_tril_cuda_template<true>(result, self_c, k, "triu");
|
||||
}
|
||||
|
||||
}} // namespace at::native
|
||||
|
||||
#undef ALLOCATE_ARRAY
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "ATen/ATen.h"
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <THC/THCTensorMathReduce.cuh>
|
||||
#include <math.h>
|
||||
|
||||
@ -78,13 +79,13 @@ struct dists {
|
||||
};
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) {
|
||||
__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p,
|
||||
const double n2, const double n2_squared_minus_1) {
|
||||
const int k = blockIdx.x;
|
||||
const int stride = blockDim.x;
|
||||
|
||||
float n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
|
||||
const scalar_t * const start = self + i * m;
|
||||
@ -124,7 +125,8 @@ __global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) {
|
||||
__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p,
|
||||
const double n2, const double n2_squared_minus_1) {
|
||||
const int k = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
@ -133,9 +135,8 @@ __global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const
|
||||
return;
|
||||
}
|
||||
|
||||
float n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<scalar_t>(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - device_sqrt<double>(n2_squared_minus_1 - 2 * k)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
int64_t ib = j - i - 1;
|
||||
int64_t jb = n - 2 - i;
|
||||
@ -161,20 +162,25 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) {
|
||||
const dim3 block(forward_threads);
|
||||
int64_t n = self.size(0);
|
||||
int64_t m = self.size(1);
|
||||
// https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
|
||||
// some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
|
||||
const double n2 = n - .5;
|
||||
const double n2_squared_minus_1 = n2 * n2 - 1;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
|
||||
if (p == 0.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (p == 1.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (p == 2.0) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else if (std::isinf(p)) {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
} else {
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p);
|
||||
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
|
||||
}
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
|
||||
@ -186,26 +192,34 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor
|
||||
const int64_t n = result.size(0);
|
||||
int64_t m = self.size(1);
|
||||
const int block_x = 64;
|
||||
const int block_y = 4;
|
||||
// NB: be careful with changing block_y; as it's currently written, grid_y is limited to be 2^16.
|
||||
// From binary search, block_y of 16 gives us max pdist dim0 of 1449,
|
||||
// block_y of 4 gives us max pdist dim0 of 725.
|
||||
const int block_y = 16;
|
||||
const int grid_x = (m + block_x * 8 - 1) / (block_x * 8);
|
||||
const int grid_y = (dist.numel() + block_y - 1) / block_y;
|
||||
const dim3 grid(grid_x, grid_y);
|
||||
const dim3 block(block_x, block_y);
|
||||
// https://github.com/pytorch/pytorch/issues/15511 demonstrated we need to do
|
||||
// some math in fp64 -- this is just minimizing the amount of fp64 math we do on the device.
|
||||
const double n2 = n - .5;
|
||||
const double n2_squared_minus_1 = n2 * n2 - 1;
|
||||
|
||||
Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
|
||||
if (p == 1.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (p < 2.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::lt_two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (p == 2.0) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::two><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else if (std::isinf(p)) {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::inf><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
} else {
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p);
|
||||
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::p><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
|
||||
}
|
||||
});
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
at::sum_out(result, buffer, 0);
|
||||
}
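Both pdist kernels above recover the row pair (i, j) from the flat index k of the condensed distance matrix, doing that arithmetic in fp64 per the comment referencing issue #15511. A minimal standalone sketch (plain C++, illustrative names only, not part of the diff) that round-trips the mapping used by pdist_kernel_cuda_impl and pdist_backward_kernel_cuda_impl:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t n = 100;                           // rows of the pdist input
  const double n2 = n - 0.5;
  const double n2_squared_minus_1 = n2 * n2 - 1;   // the -1 guards against fp truncation
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = i + 1; j < n; ++j) {
      // condensed index of the pair (i, j), 0 <= i < j < n
      int64_t k = i * (2 * n - i - 1) / 2 + (j - i - 1);
      // inversion as in the kernels above
      int64_t ri = static_cast<int64_t>(n2 - std::sqrt(n2_squared_minus_1 - 2 * k));
      int64_t rj = k - n * ri + ri * (ri + 1) / 2 + ri + 1;
      assert(ri == i && rj == j);
    }
  }
  return 0;
}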
|
||||
|
@ -396,7 +396,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind
|
||||
|
||||
default:
|
||||
AT_ERROR(
|
||||
"Unknown mode for embedding_bag_backward_cuda %d", mode);
|
||||
"Unknown mode for embedding_bag_backward_cuda ", mode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -336,7 +336,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
|
||||
+ log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime];
|
||||
|
||||
log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb;
|
||||
} else if ((s < 2*max_target_length+1) || (t >= input_length)) {
|
||||
} else if ((s < 2*max_target_length+1) && ((target_length == 0) || (s > 2*target_length+1) || (t >= input_length))) {
|
||||
log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf;
|
||||
}
|
||||
}
|
||||
@ -626,7 +626,7 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
|
||||
if (targets.type().scalarType() == kLong) {
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
} else {
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -402,6 +402,14 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda_template(const Tensor& input_
|
||||
const Tensor& running_mean_, const Tensor& running_var_,
|
||||
bool train, double momentum, double epsilon) {
|
||||
|
||||
TensorArg input_arg{ input_, "input", 1 },
|
||||
weight_arg{ weight_, "weight", 2 },
|
||||
bias_arg{ bias_, "bias", 3 },
|
||||
run_mean_arg{ running_mean_, "running_mean", 4 },
|
||||
run_var_arg{ running_var_, "running_var", 5 };
|
||||
CheckedFrom c = "batch_norm_cuda";
|
||||
checkAllSameGPU(c, {input_arg, weight_arg, bias_arg, run_mean_arg, run_var_arg});
|
||||
|
||||
using accscalar_t = at::acc_type<scalar_t, true>;
|
||||
int64_t n_input = input_.size(1);
|
||||
Tensor save_mean_;
|
||||
|
@ -7,28 +7,13 @@
|
||||
#include <tuple>
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/scan.h>
|
||||
#include <thrust/scatter.h>
|
||||
|
||||
namespace at {
|
||||
namespace native{
|
||||
|
||||
namespace {
|
||||
template <typename scalar_t>
|
||||
__global__ void inverse_indices_kernel(
|
||||
const scalar_t* input_data,
|
||||
const scalar_t* output_data,
|
||||
int64_t* inverse_indices_data,
|
||||
int64_t num_inp,
|
||||
int64_t num_out) {
|
||||
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int64_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (int64_t i = idx; i < num_inp * num_out; i += stride) {
|
||||
if (input_data[i / num_out] == output_data[i % num_out]){
|
||||
inverse_indices_data[i / num_out] = i % num_out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename scalar_t>
|
||||
std::tuple<Tensor, Tensor> _unique_cuda_template(
|
||||
@ -47,25 +32,29 @@ template <typename scalar_t>
|
||||
Tensor output = input.clone();
|
||||
output = output.view(-1);
|
||||
scalar_t* output_data = output.data<scalar_t>();
|
||||
thrust::sort(policy, output_data, output_data + num_inp);
|
||||
scalar_t* output_end = thrust::unique(policy, output_data, output_data + num_inp);
|
||||
int64_t num_out = output_end - output_data;
|
||||
output.resize_(num_out);
|
||||
|
||||
Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
|
||||
|
||||
if (return_inverse) {
|
||||
inverse_indices.resize_(input.sizes());
|
||||
int64_t* inverse_indices_data = inverse_indices.data<int64_t>();
|
||||
int block = 512;
|
||||
int grid = std::min<int64_t>((num_inp * num_out + block - 1) / block, 2048L);
|
||||
inverse_indices_kernel<<<grid, block, 0, stream>>>(
|
||||
input_data, output_data, inverse_indices_data, num_inp, num_out);
|
||||
Tensor inverse_indices;
|
||||
if (!return_inverse) {
|
||||
inverse_indices = at::empty({0}, self.type().toScalarType(kLong));
|
||||
thrust::sort(policy, output_data, output_data + num_inp);
|
||||
} else {
|
||||
Tensor sorted_indices = at::arange(0, num_inp, self.type().toScalarType(kLong));
|
||||
int64_t* sorted_indices_ptr = sorted_indices.data<int64_t>();
|
||||
thrust::sort_by_key(policy, output_data, output_data + num_inp, sorted_indices_ptr);
|
||||
Tensor inv_loc = at::empty({num_inp}, self.type().toScalarType(kLong));
|
||||
inverse_indices = at::empty({num_inp}, self.type().toScalarType(kLong));
|
||||
int64_t* inv_loc_ptr = inv_loc.data<int64_t>();
|
||||
int64_t* inverse_indices_ptr = inverse_indices.data<int64_t>();
|
||||
thrust::adjacent_difference(policy, output_data, output_data + num_inp, inv_loc_ptr, [=] __device__ (scalar_t a, scalar_t b) -> int64_t { if (a != b) {return 1;} else { return 0; }});
|
||||
inv_loc[0] = 0;
|
||||
thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
|
||||
thrust::scatter(policy,inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
|
||||
inverse_indices.resize_(input.sizes());
|
||||
}
|
||||
int64_t num_out = thrust::unique(policy, output_data, output_data + num_inp) - output_data;
|
||||
output.resize_(num_out);
|
||||
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return std::tuple<Tensor, Tensor>(output, inverse_indices);
|
||||
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
|
@ -603,9 +603,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgo_t> {
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
|
||||
#if CUDNN_VERSION >= 6000
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
|
||||
#endif
|
||||
};
|
||||
// NOTE: - 1 because ALGO_WINOGRAD is not implemented
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
|
||||
@ -697,6 +695,67 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
|
||||
THCCachingAllocator_emptyCache();
|
||||
}
|
||||
|
||||
|
||||
//hot fix for #16610
|
||||
//specializing algorithm_search would be cleaner, as it is specialized already, but that would require also specializing getBestAlgorithm for bwdData,
|
||||
//adding "strided" argument, so in the end this looks simpler.
|
||||
template<>
|
||||
void findAlgorithm(const ConvolutionArgs& args, bool benchmark, cudnnConvolutionBwdDataAlgo_t * algo) {
|
||||
using search = algorithm_search<cudnnConvolutionBwdDataAlgo_t>;
|
||||
auto& cache = search::cache();
|
||||
|
||||
if (cache.find(args.params, algo)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (args.params.deterministic && !benchmark) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
return;
|
||||
}
|
||||
|
||||
int stride_dim = args.input.dim() - 2;
|
||||
bool strided = false;
|
||||
for (int i = 0; i< stride_dim; i++) {
|
||||
if (args.params.stride[i] != 1) {
|
||||
strided = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!benchmark) {
|
||||
search::getAlgorithm(args, algo);
|
||||
if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (cache.find(args.params, algo)) {
|
||||
// re-check cache since another thread may have benchmarked the algorithm
|
||||
return;
|
||||
}
|
||||
|
||||
auto perfResults = search::findAlgorithm(args);
|
||||
// for deterministic algo, look at all the perf results and return the best
|
||||
// deterministic algo
|
||||
if (perfResults.status == CUDNN_STATUS_SUCCESS &&
|
||||
!(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
|
||||
*algo = perfResults.algo;
|
||||
} else {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
if (strided && (*algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || *algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
|
||||
*algo = search::DEFAULT_ALGO;
|
||||
}
|
||||
cache.insert(args.params, *algo);
|
||||
|
||||
// Free the cached blocks in our caching allocator. They are
|
||||
// needed here because the above benchmarking uses a huge amount of memory,
|
||||
// e.g. a few GBs.
|
||||
THCCachingAllocator_emptyCache();
|
||||
}
|
||||
|
||||
|
||||
template<typename algo_t>
|
||||
Workspace chooseAlgorithm(
|
||||
const ConvolutionArgs& args,
|
||||
@ -848,19 +907,9 @@ Tensor cudnn_convolution_forward(
|
||||
// See #4500
|
||||
Tensor weight_contig = weight->contiguous();
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_forward_out(
|
||||
narrowGroup(*output, output_channels_dim, i, groups),
|
||||
narrowGroup(*input, input_channels_dim, i, groups),
|
||||
narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
|
||||
padding, stride, dilation, 1, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_forward_out(
|
||||
*output, *input, weight_contig,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return *output;
|
||||
}
|
||||
@ -986,19 +1035,9 @@ Tensor cudnn_convolution_backward_input(
|
||||
// See #4500
|
||||
Tensor weight_contig = weight->contiguous();
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_backward_input_out(
|
||||
narrowGroup(*grad_input, input_channels_dim, i, groups),
|
||||
narrowGroup(*grad_output, output_channels_dim, i, groups),
|
||||
narrowGroup(weight_contig, weight_output_channels_dim, i, groups),
|
||||
padding, stride, dilation, 1, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_backward_input_out(
|
||||
*grad_input, *grad_output, weight_contig,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return *grad_input;
|
||||
}
|
||||
@ -1119,19 +1158,9 @@ Tensor cudnn_convolution_backward_weight(
|
||||
TensorArg grad_weight{ grad_weight_t, "result", 0 };
|
||||
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
for (int i = 0; i < groups; i++) {
|
||||
raw_cudnn_convolution_backward_weight_out(
|
||||
narrowGroup(*grad_weight, weight_output_channels_dim, i, groups),
|
||||
narrowGroup(*grad_output, output_channels_dim, i, groups),
|
||||
narrowGroup(*input, input_channels_dim, i, groups),
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
}
|
||||
#else
|
||||
raw_cudnn_convolution_backward_weight_out(
|
||||
*grad_weight, *grad_output, *input,
|
||||
padding, stride, dilation, groups, benchmark, deterministic);
|
||||
#endif
|
||||
|
||||
return grad_weight_t;
|
||||
}
|
||||
|
@ -7,7 +7,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000)
|
||||
#if !AT_CUDNN_ENABLED()
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
|
@ -375,7 +375,7 @@ namespace {
|
||||
case CUDNN_RNN_TANH:
|
||||
return 2;
|
||||
default:
|
||||
AT_ERROR("unknown cuDNN RNN mode %d", mode);
|
||||
AT_ERROR("unknown cuDNN RNN mode ", mode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2555,9 +2555,15 @@
|
||||
|
||||
- func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: tril_cpu_
|
||||
CUDA: tril_cuda_
|
||||
|
||||
- func: triu_(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: triu_cpu_
|
||||
CUDA: triu_cuda_
|
||||
|
||||
- func: digamma_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
@ -2658,11 +2664,17 @@
|
||||
variants: method, function
|
||||
|
||||
- func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
|
||||
dispatch:
|
||||
CPU: triu_cpu_out
|
||||
CUDA: triu_cuda_out
|
||||
|
||||
- func: triu(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method, function
|
||||
|
||||
- func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
|
||||
dispatch:
|
||||
CPU: tril_cpu_out
|
||||
CUDA: tril_cuda_out
|
||||
|
||||
- func: tril(Tensor self, int64_t diagonal=0) -> Tensor
|
||||
variants: method, function
|
||||
|
@ -11,18 +11,4 @@ using namespace at::native;
|
||||
TEST(CUDNNTest, CUDNNTestCUDA) {
|
||||
if (!at::cuda::is_available()) return;
|
||||
manual_seed(123);
|
||||
|
||||
#if CUDNN_VERSION < 7000
|
||||
auto handle = getCudnnHandle();
|
||||
DropoutDescriptor desc1, desc2;
|
||||
desc1.initialize_rng(handle, 0.5, 42, TensorOptions().device(DeviceType::CUDA).dtype(kByte));
|
||||
desc2.set(handle, 0.5, desc1.state);
|
||||
bool isEQ;
|
||||
isEQ = (desc1.desc()->dropout == desc2.desc()->dropout);
|
||||
ASSERT_TRUE(isEQ);
|
||||
isEQ = (desc1.desc()->nstates == desc2.desc()->nstates);
|
||||
ASSERT_TRUE(isEQ);
|
||||
isEQ = (desc1.desc()->states == desc2.desc()->states);
|
||||
ASSERT_TRUE(isEQ);
|
||||
#endif
|
||||
}
|
||||
|
@ -3,6 +3,8 @@ find_package(ATen REQUIRED)
|
||||
include_directories(${ATEN_INCLUDE_DIR})
|
||||
|
||||
# C++11
|
||||
set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
|
||||
if (not MSVC)
|
||||
set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
|
||||
endif()
|
||||
add_executable(main main.cpp)
|
||||
target_link_libraries(main ${ATEN_LIBRARIES})
|
||||
|
@ -247,10 +247,13 @@
|
||||
|
||||
#ifdef _OPENMP
|
||||
|
||||
#ifndef _WIN32
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#ifdef _WIN32
|
||||
// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
|
||||
#define PRAGMA_LOOP(P) // Noop
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#else
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#define PRAGMA_LOOP(P) _Pragma(#P)
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#endif
|
||||
|
||||
#include <omp.h>
|
||||
@ -369,7 +372,7 @@
|
||||
TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
|
||||
ptrdiff_t iter = 0; \
|
||||
if(tp != (TYPE2*)rp) { \
|
||||
PRAGMA(ivdep) \
|
||||
PRAGMA_LOOP(ivdep) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \
|
||||
for (iter = 0; iter < SIZE; iter++) { \
|
||||
TYPE2 *TENSOR2##_data = tp+iter; \
|
||||
@ -377,7 +380,7 @@
|
||||
CODE \
|
||||
}\
|
||||
} else {\
|
||||
PRAGMA(simd) \
|
||||
PRAGMA_LOOP(simd) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE2* TENSOR2##_data = tp+iter;\
|
||||
@ -449,7 +452,7 @@
|
||||
TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset(); \
|
||||
ptrdiff_t iter = 0;\
|
||||
if(tp != (TYPE2*)rp) { \
|
||||
PRAGMA(ivdep) \
|
||||
PRAGMA_LOOP(ivdep) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE1 *TENSOR1##_data = rp+iter;\
|
||||
@ -458,7 +461,7 @@
|
||||
CODE \
|
||||
} \
|
||||
} else {\
|
||||
PRAGMA(simd) \
|
||||
PRAGMA_LOOP(simd) \
|
||||
PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \
|
||||
for (iter = 0; iter < SIZE; iter++) {\
|
||||
TYPE1 *TENSOR1##_data = rp+iter;\
|
||||
|
@ -13,10 +13,13 @@
|
||||
|
||||
#ifdef _OPENMP
|
||||
|
||||
#ifndef _WIN32
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#ifdef _WIN32
|
||||
// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences.
|
||||
#define PRAGMA_LOOP(P) // Noop
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#else
|
||||
#define PRAGMA(P) __pragma(P)
|
||||
#define PRAGMA_LOOP(P) _Pragma(#P)
|
||||
#define PRAGMA(P) _Pragma(#P)
|
||||
#endif
|
||||
|
||||
#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
|
||||
|
@ -111,22 +111,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
|
||||
int free_b = 0;
|
||||
if (a == NULL) a = ra_;
|
||||
if (b == NULL) b = rb_;
|
||||
THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
|
||||
a->dim());
|
||||
THArgCheck(!a->is_empty(), 2, "A should not be empty");
|
||||
THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
|
||||
"dimensions, but has %d", b->dim());
|
||||
THArgCheck(!b->is_empty(), 2, "B should not be empty");
|
||||
THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld",
|
||||
a->size(0), a->size(1));
|
||||
THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld "
|
||||
"rows, B has %ld", a->size(0), b->size(0));
|
||||
|
||||
if (b->dim() == 1) {
|
||||
b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
|
||||
b->stride(0), 1, 0);
|
||||
free_b = 1;
|
||||
}
|
||||
|
||||
int n, nrhs, lda, ldb, info;
|
||||
THIntTensor *ipiv;
|
||||
@ -157,7 +141,6 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
|
||||
THTensor_(freeCopyTo)(ra__, ra_);
|
||||
THTensor_(freeCopyTo)(rb__, rb_);
|
||||
THIntTensor_free(ipiv);
|
||||
if (free_b) c10::raw::intrusive_ptr::decref(b);
|
||||
}
|
||||
|
||||
void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
|
||||
|
@ -104,7 +104,6 @@ TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n
|
||||
|
||||
TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
|
||||
TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted);
|
||||
TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k);
|
||||
TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k);
|
||||
TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
|
||||
TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
|
||||
|
@ -716,6 +716,11 @@ void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, int64_t n)
|
||||
REAL_SWAP(ARR(III), ARR(JJJ)); \
|
||||
LONG_SWAP(IDX(III), IDX(JJJ))
|
||||
|
||||
/* Emulate NumPy behavior of putting NaNs
|
||||
* at the end of an ascending list. */
|
||||
#define GT_OR_NAN(x, y) \
|
||||
((x != x && y == y) || (x > y))
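GT_OR_NAN relies on NaN comparing false against everything: a NaN left operand is treated as greater than any non-NaN value, so ascending sorts push NaNs to the end, matching NumPy. A small standalone illustration (not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

#define GT_OR_NAN(x, y) \
  ((x != x && y == y) || (x > y))

int main() {
  std::vector<double> v = {3.0, NAN, 1.0, 2.0};
  // "a comes before b" when b is greater-or-NaN relative to a
  std::sort(v.begin(), v.end(), [](double a, double b) { return GT_OR_NAN(b, a); });
  for (double x : v) std::printf("%g ", x);  // prints: 1 2 3 nan
  std::printf("\n");
  return 0;
}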
|
||||
|
||||
static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
|
||||
{
|
||||
int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
|
||||
@ -731,15 +736,15 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
/* Use median of three for pivot choice */
|
||||
P=(L+R)>>1;
|
||||
BOTH_SWAP(P, L+1);
|
||||
if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
|
||||
if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
|
||||
if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
|
||||
if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
|
||||
if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
|
||||
if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
|
||||
|
||||
i = L+1; j = R; piv = ARR(L); pid = IDX(L);
|
||||
|
||||
do {
|
||||
do { i = i+1; } while(ARR(i) < piv);
|
||||
do { j = j-1; } while(ARR(j) > piv);
|
||||
do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
|
||||
do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
|
||||
if (j < i)
|
||||
break;
|
||||
BOTH_SWAP(i, j);
|
||||
@ -790,7 +795,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
} /* while not done */
|
||||
/* Now insertion sort on the concatenation of subfiles */
|
||||
for(i=elements-2; i>=0; i--) {
|
||||
if (ARR(i) > ARR(i+1)) {
|
||||
if (GT_OR_NAN(ARR(i),ARR(i+1))) {
|
||||
piv = ARR(i);
|
||||
pid = IDX(i);
|
||||
j = i+1;
|
||||
@ -798,7 +803,7 @@ static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elem
|
||||
ARR(j-1) = ARR(j);
|
||||
IDX(j-1) = IDX(j);
|
||||
j = j+1;
|
||||
} while(j < elements && ARR(j) < piv);
|
||||
} while(j < elements && GT_OR_NAN(piv, ARR(j)));
|
||||
ARR(j-1) = piv;
|
||||
IDX(j-1) = pid;
|
||||
}
|
||||
@ -820,15 +825,15 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
/* Use median of three for pivot choice */
|
||||
P=(L+R)>>1;
|
||||
BOTH_SWAP(P, L+1);
|
||||
if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
|
||||
if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
|
||||
if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
|
||||
if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
|
||||
if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
|
||||
if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
|
||||
|
||||
i = L+1; j = R; piv = ARR(L); pid = IDX(L);
|
||||
|
||||
do {
|
||||
do { i = i+1; } while(ARR(i) > piv);
|
||||
do { j = j-1; } while(ARR(j) < piv);
|
||||
do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
|
||||
do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
|
||||
if (j < i)
|
||||
break;
|
||||
BOTH_SWAP(i, j);
|
||||
@ -879,7 +884,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
} /* while not done */
|
||||
/* Now insertion sort on the concatenation of subfiles */
|
||||
for(i=elements-2; i>=0; i--) {
|
||||
if (ARR(i) < ARR(i+1)) {
|
||||
if (GT_OR_NAN(ARR(i+1), ARR(i))) {
|
||||
piv = ARR(i);
|
||||
pid = IDX(i);
|
||||
j = i+1;
|
||||
@ -887,7 +892,7 @@ static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t ele
|
||||
ARR(j-1) = ARR(j);
|
||||
IDX(j-1) = IDX(j);
|
||||
j = j+1;
|
||||
} while(j < elements && ARR(j) > piv);
|
||||
} while(j < elements && GT_OR_NAN(ARR(j), piv));
|
||||
ARR(j-1) = piv;
|
||||
IDX(j-1) = pid;
|
||||
}
|
||||
@ -1244,37 +1249,6 @@ void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, i
|
||||
THLongTensor_free(tmpIndices);
|
||||
}
|
||||
|
||||
void THTensor_(tril)(THTensor *r_, THTensor *t, int64_t k)
|
||||
{
|
||||
int64_t t_size_0, t_size_1;
|
||||
int64_t t_stride_0, t_stride_1;
|
||||
int64_t r__stride_0, r__stride_1;
|
||||
scalar_t *t_data, *r__data;
|
||||
int64_t r, c;
|
||||
|
||||
THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
|
||||
|
||||
THTensor_(resizeAs)(r_, t);
|
||||
|
||||
t_size_0 = THTensor_(size)(t, 0);
|
||||
t_size_1 = THTensor_(size)(t, 1);
|
||||
t_stride_0 = THTensor_(stride)(t, 0);
|
||||
t_stride_1 = THTensor_(stride)(t, 1);
|
||||
r__stride_0 = THTensor_(stride)(r_, 0);
|
||||
r__stride_1 = THTensor_(stride)(r_, 1);
|
||||
r__data = r_->data<scalar_t>();
|
||||
t_data = t->data<scalar_t>();
|
||||
|
||||
for(r = 0; r < t_size_0; r++)
|
||||
{
|
||||
int64_t sz = THMin(r+k+1, t_size_1);
|
||||
for(c = THMax(0, r+k+1); c < t_size_1; c++)
|
||||
r__data[r*r__stride_0+c*r__stride_1] = 0;
|
||||
for(c = 0; c < sz; c++)
|
||||
r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
|
||||
}
|
||||
}
|
||||
|
||||
void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
|
||||
{
|
||||
int64_t t_size_0, t_size_1;
|
||||
|
@ -6,17 +6,17 @@
|
||||
#include "THCNumerics.cuh"
|
||||
|
||||
// Collection of kernel sort routines
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct LTComp {
|
||||
__device__ inline bool operator()(const T& a, const T& b) const {
|
||||
return THCNumerics<T>::lt(a, b);
|
||||
return (handleNaN && THCNumerics<T>::isnan(b) && !THCNumerics<T>::isnan(a)) || THCNumerics<T>::lt(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct GTComp {
|
||||
__device__ inline bool operator()(const T& a, const T& b) const {
|
||||
return THCNumerics<T>::gt(a, b);
|
||||
return (handleNaN && THCNumerics<T>::isnan(a) && !THCNumerics<T>::isnan(b)) || THCNumerics<T>::gt(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -121,18 +121,19 @@ __global__ void renormRowsL1(T* dist, long rows, long cols) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ int binarySearchForMultinomial(T* dist,
|
||||
__device__ int binarySearchForMultinomial(T* cumdist,
|
||||
T* dist,
|
||||
int size,
|
||||
T val) {
|
||||
int start = 0;
|
||||
int end = size;
|
||||
// dist[size - 1] = 0 => all zero prob dist
|
||||
assert(THCNumerics<T>::gt(dist[size - 1], 0));
|
||||
// cumdist[size - 1] = 0 => all zero prob dist
|
||||
assert(THCNumerics<T>::gt(cumdist[size - 1], 0));
|
||||
|
||||
while (end - start > 0) {
|
||||
int mid = start + (end - start) / 2;
|
||||
|
||||
T midVal = dist[mid];
|
||||
T midVal = cumdist[mid];
|
||||
if (THCNumerics<T>::lt(midVal, val)) {
|
||||
start = mid + 1;
|
||||
} else {
|
||||
@ -149,8 +150,8 @@ __device__ int binarySearchForMultinomial(T* dist,
|
||||
start = size - 1;
|
||||
}
|
||||
|
||||
T curVal = dist[start];
|
||||
while(start >= 1 && THCNumerics<T>::eq(dist[start - 1], curVal)) start--;
|
||||
T curVal = cumdist[start];
|
||||
while(start >= 1 && THCNumerics<T>::eq(dist[start], 0)) start--;
|
||||
|
||||
return start;
|
||||
}
|
||||
@ -299,7 +300,8 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
|
||||
int64_t* dest,
|
||||
int64_t distributions,
|
||||
int categories,
|
||||
T* normDistPrefixSum) {
|
||||
T* normDistPrefixSum,
|
||||
T* normDist) {
|
||||
// At the moment, each warp computes one sample value in the binary
|
||||
// search due to divergence. It seems possible to compute multiple
|
||||
// values and limit divergence though later on. However, no matter
|
||||
@ -322,6 +324,7 @@ sampleMultinomialWithReplacement(curandStateMtgp32* state,
|
||||
// Find the bucket that a uniform sample lies in
|
||||
int choice = binarySearchForMultinomial<T>(
|
||||
normDistPrefixSum + curDist * categories,
|
||||
normDist + curDist * categories,
|
||||
categories,
|
||||
r);
|
||||
|
||||
@ -363,6 +366,7 @@ sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
|
||||
// Find the bucket that a uniform sample lies in
|
||||
int choice = binarySearchForMultinomial<T>(
|
||||
normDistPrefixSum + curDist * categories,
|
||||
origDist + curDist * categories,
|
||||
categories,
|
||||
r);
|
||||
|
||||
|
@ -15,17 +15,17 @@
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct ThrustGTOp {
|
||||
__device__ bool operator()(const T& lhs, const T& rhs) const {
|
||||
return THCNumerics<T>::gt(lhs, rhs);
|
||||
return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) || THCNumerics<T>::gt(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool handleNaN = false>
|
||||
struct ThrustLTOp {
|
||||
__device__ bool operator()(const T& lhs, const T& rhs) const {
|
||||
return THCNumerics<T>::lt(lhs, rhs);
|
||||
return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) || THCNumerics<T>::lt(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -63,11 +63,6 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
|
||||
void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
|
||||
{
|
||||
#ifdef USE_MAGMA
|
||||
THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
|
||||
THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional");
|
||||
THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square");
|
||||
THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible");
|
||||
|
||||
int64_t n = a_->size(0);
|
||||
int64_t nrhs = b_->size(1);
|
||||
|
||||
|
@ -187,7 +187,6 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
|
||||
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
|
||||
}
|
||||
} else {
|
||||
THCTensor_(resizeAs)(state, self_, src_);
|
||||
|
||||
if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, op)) {
|
||||
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
|
||||
|
@ -246,7 +246,8 @@ void THCTensor_(multinomial)(struct THCState *state,
|
||||
n_sample,
|
||||
THCudaLongTensor_data(state, self),
|
||||
numDist, numCategories,
|
||||
THCTensor_(data)(state, prefixSum));
|
||||
THCTensor_(data)(state, prefixSum),
|
||||
THCTensor_(data)(state, normDist));
|
||||
} else {
|
||||
// Sample without replacement
|
||||
|
||||
|
@ -53,7 +53,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
dim3 block(blockSize); \
|
||||
\
|
||||
if (dir) { \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t>, TYPE, SIZE> \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, GTComp<scalar_t, true>, TYPE, SIZE> \
|
||||
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
|
||||
keyInfo, \
|
||||
keySlices, \
|
||||
@ -61,9 +61,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
(TYPE) keyInfo.strides[collapseKeyDim], \
|
||||
valueInfo, \
|
||||
(TYPE) valueInfo.strides[collapseValueDim], \
|
||||
GTComp<scalar_t>()); \
|
||||
GTComp<scalar_t, true>()); \
|
||||
} else { \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t>, TYPE, SIZE> \
|
||||
bitonicSortKVInPlace<scalar_t, int64_t, A, -1, LTComp<scalar_t, true>, TYPE, SIZE> \
|
||||
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
|
||||
keyInfo, \
|
||||
keySlices, \
|
||||
@ -71,7 +71,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
|
||||
(TYPE) keyInfo.strides[collapseKeyDim], \
|
||||
valueInfo, \
|
||||
(TYPE) valueInfo.strides[collapseValueDim], \
|
||||
LTComp<scalar_t>()); \
|
||||
LTComp<scalar_t, true>()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@ -234,13 +234,13 @@ void THCTensor_(sortViaThrust)(THCState* state,
|
||||
#if CUDA_VERSION >= 7000
|
||||
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
|
||||
#endif
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t>());
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustGTOp<scalar_t, true>());
|
||||
} else {
|
||||
thrust::stable_sort_by_key(
|
||||
#if CUDA_VERSION >= 7000
|
||||
thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
|
||||
#endif
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t>());
|
||||
keyIter, keyIter + totalElements, indexIter, ThrustLTOp<scalar_t, true>());
|
||||
}
|
||||
|
||||
// Then, re-sort according to slice that each index is
|
||||
|
12
c10/Half.h
12
c10/Half.h
@ -383,6 +383,14 @@ struct Converter<
|
||||
}
|
||||
};
|
||||
|
||||
// In some versions of MSVC, there will be a compiler error when building.
|
||||
// C4146: unary minus operator applied to unsigned type, result still unsigned
|
||||
// It can be addressed by disabling the following warning.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4146 )
|
||||
#endif
|
||||
|
||||
// skip isnan and isinf check for integral types
|
||||
template <typename To, typename From>
|
||||
typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
|
||||
@ -399,6 +407,10 @@ typename std::enable_if<std::is_integral<From>::value, bool>::type overflows(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning( pop )
|
||||
#endif
|
||||
|
||||
template <typename To, typename From>
|
||||
typename std::enable_if<std::is_floating_point<From>::value, bool>::type
|
||||
overflows(From f) {
|
||||
|
@ -11,9 +11,11 @@ using c10::intrusive_ptr_target;
|
||||
using c10::make_intrusive;
|
||||
using c10::weak_intrusive_ptr;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wself-move"
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
class SomeClass0Parameters : public intrusive_ptr_target {};
|
||||
|
@ -25,7 +25,7 @@ Error::Error(
|
||||
// Caffe2-style error message
|
||||
Error::Error(
|
||||
const char* file,
|
||||
const int line,
|
||||
const uint32_t line,
|
||||
const char* condition,
|
||||
const std::string& msg,
|
||||
const std::string& backtrace,
|
||||
|
@ -49,7 +49,7 @@ class C10_API Error : public std::exception {
|
||||
Error(SourceLocation source_location, const std::string& msg);
|
||||
Error(
|
||||
const char* file,
|
||||
const int line,
|
||||
const uint32_t line,
|
||||
const char* condition,
|
||||
const std::string& msg,
|
||||
const std::string& backtrace,
|
||||
@ -117,11 +117,17 @@ C10_API std::string GetExceptionString(const std::exception& e);
|
||||
// TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if
|
||||
// not met.
|
||||
|
||||
// In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t),
|
||||
// which is different from the definition of `SourceLocation` that requires
|
||||
// unsigned int (a.k.a uint32_t) and may cause a compile error with the message:
|
||||
// error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion
|
||||
// Here the static cast is used to pass the build.
|
||||
|
||||
#define AT_ERROR(...) \
|
||||
throw ::c10::Error({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
|
||||
throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
|
||||
|
||||
#define AT_WARN(...) \
|
||||
::c10::Warning::warn({__func__, __FILE__, __LINE__}, ::c10::str(__VA_ARGS__))
|
||||
::c10::Warning::warn({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, ::c10::str(__VA_ARGS__))
|
||||
|
||||
#define AT_ASSERT(cond) \
|
||||
if (!(cond)) { \
|
||||
|
@ -17,9 +17,10 @@
|
||||
#include <utility>
|
||||
#include <type_traits>
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wshadow"
|
||||
|
||||
#endif
|
||||
#ifdef _MSC_VER
|
||||
#define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
|
||||
#else
|
||||
@ -1457,4 +1458,6 @@ namespace ska
|
||||
|
||||
} // end namespace ska
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
@ -72,18 +72,27 @@ class C10_API intrusive_ptr_target {
|
||||
// We also have to disable -Wunknown-warning-option and -Wpragmas, because
|
||||
// some other compilers don't know about -Wterminate or -Wexceptions and
|
||||
// will show a warning about unknown warning options otherwise.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wterminate"
|
||||
#pragma GCC diagnostic ignored "-Wexceptions"
|
||||
#ifdef _MSC_VER
|
||||
# pragma warning(push)
|
||||
# pragma warning(disable: 4297) // function assumed not to throw an exception but does
|
||||
#else
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wpragmas"
|
||||
# pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
# pragma GCC diagnostic ignored "-Wterminate"
|
||||
# pragma GCC diagnostic ignored "-Wexceptions"
|
||||
#endif
|
||||
AT_ASSERTM(
|
||||
refcount_.load() == 0,
|
||||
"Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it");
|
||||
AT_ASSERTM(
|
||||
weakcount_.load() == 0,
|
||||
"Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
|
||||
#pragma GCC diagnostic pop
|
||||
#ifdef _MSC_VER
|
||||
# pragma warning(pop)
|
||||
#else
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
|
||||
|
@ -430,12 +430,16 @@ class C10_API TypeMeta {
|
||||
// variable template. '-Wpragmas' and '-Wunknown-warning-option' has to be
|
||||
// disabled for compilers that don't know '-Wundefined-var-template' and
|
||||
// would error at our attempt to disable it.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
#pragma GCC diagnostic ignored "-Wundefined-var-template"
|
||||
#ifndef _MSC_VER
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wpragmas"
|
||||
# pragma GCC diagnostic ignored "-Wunknown-warning-option"
|
||||
# pragma GCC diagnostic ignored "-Wundefined-var-template"
|
||||
#endif
|
||||
return TypeMeta(_typeMetaDataInstance<T>());
|
||||
#pragma GCC diagnostic pop
|
||||
#ifndef _MSC_VER
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -219,16 +219,8 @@ if(NOT BUILD_ATEN_ONLY)
|
||||
else()
|
||||
target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
|
||||
endif()
|
||||
|
||||
#cmake only check for separate OpenMP library on AppleClang 7+
|
||||
#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
|
||||
if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
|
||||
target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
target_link_libraries(caffe2 PUBLIC c10)
|
||||
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
|
||||
@ -239,10 +231,8 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
|
||||
# Set standard properties on the target
|
||||
torch_set_target_props(caffe2)
|
||||
|
||||
if (MSVC)
|
||||
target_compile_options(caffe2 INTERFACE "-std=c++11")
|
||||
else()
|
||||
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
|
||||
if (NOT MSVC)
|
||||
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
|
||||
endif()
|
||||
|
||||
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
|
||||
|
@ -93,7 +93,7 @@ using std::vector;
|
||||
#define CAFFE2_NORETURN __attribute__((noreturn))
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#if (defined _MSC_VER && !defined NOMINMAX)
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
|
||||
|
@ -1,5 +1,8 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
from caffe2.proto import caffe2_pb2
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
# TODO: refactor & remove the following alias
|
||||
caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU
|
||||
caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA
|
||||
@ -10,3 +13,40 @@ caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
|
||||
caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
|
||||
caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
|
||||
caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
|
||||
|
||||
if platform.system() == 'Windows':
|
||||
IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ])
|
||||
|
||||
if IS_CONDA:
|
||||
from ctypes import windll, c_wchar_p
|
||||
from ctypes.wintypes import DWORD, HMODULE
|
||||
|
||||
AddDllDirectory = windll.kernel32.AddDllDirectory
|
||||
AddDllDirectory.restype = DWORD
|
||||
AddDllDirectory.argtypes = [c_wchar_p]
|
||||
|
||||
def add_extra_dll_dir(extra_dll_dir):
|
||||
if os.path.isdir(extra_dll_dir):
|
||||
os.environ['PATH'] = extra_dll_dir + os.pathsep + os.environ['PATH']
|
||||
|
||||
if IS_CONDA:
|
||||
AddDllDirectory(extra_dll_dir)
|
||||
|
||||
# first get nvToolsExt PATH
|
||||
def get_nvToolsExt_path():
|
||||
NVTOOLEXT_HOME = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')
|
||||
|
||||
if os.path.exists(NVTOOLEXT_HOME):
|
||||
return os.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
|
||||
else:
|
||||
return ''
|
||||
|
||||
py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin')
|
||||
th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
|
||||
th_dll_path = os.path.join(th_root, 'lib')
|
||||
|
||||
dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path()]
|
||||
|
||||
# then add the path to env
|
||||
for p in dll_paths:
|
||||
add_extra_dll_dir(p)
|
||||
|
@ -628,37 +628,12 @@ endif()
|
||||
|
||||
# ---[ OpenMP
|
||||
if(USE_OPENMP)
|
||||
set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
|
||||
if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
|
||||
string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
|
||||
message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
|
||||
if(DARWIN_VERSION GREATER 9)
|
||||
set(APPLE_OPENMP_SUCKS 1)
|
||||
endif(DARWIN_VERSION GREATER 9)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
|
||||
OUTPUT_VARIABLE GCC_VERSION)
|
||||
if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
|
||||
message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
|
||||
message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
|
||||
add_compile_options(-Wno-unknown-pragmas)
|
||||
set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_OPENMP AND NOT CHECKED_OPENMP)
|
||||
find_package(OpenMP)
|
||||
set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
|
||||
|
||||
# OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
|
||||
# see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
|
||||
set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
|
||||
endif()
|
||||
|
||||
find_package(OpenMP)
|
||||
if(OPENMP_FOUND)
|
||||
message(STATUS "Adding " ${OpenMP_CXX_FLAGS})
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
|
||||
else()
|
||||
message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
|
||||
caffe2_update_option(USE_OPENMP OFF)
|
||||
@ -690,7 +665,12 @@ if(USE_CUDA)
|
||||
caffe2_update_option(USE_NVRTC OFF)
|
||||
endif()
|
||||
if(CAFFE2_USE_CUDNN)
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
IF(CUDNN_STATIC_LINKAGE)
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
ELSE()
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
ENDIF()
|
||||
else()
|
||||
caffe2_update_option(USE_CUDNN OFF)
|
||||
endif()
|
||||
@ -1111,6 +1091,42 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
|
||||
STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
|
||||
ENDIF()
|
||||
|
||||
# OpenMP support?
|
||||
SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
|
||||
IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
|
||||
STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
|
||||
MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
|
||||
IF (DARWIN_VERSION GREATER 9)
|
||||
SET(APPLE_OPENMP_SUCKS 1)
|
||||
ENDIF (DARWIN_VERSION GREATER 9)
|
||||
EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
|
||||
OUTPUT_VARIABLE GCC_VERSION)
|
||||
IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
|
||||
MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
|
||||
MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
|
||||
add_compile_options(-Wno-unknown-pragmas)
|
||||
SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF (WITH_OPENMP AND NOT CHECKED_OPENMP)
|
||||
FIND_PACKAGE(OpenMP)
|
||||
SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
|
||||
|
||||
# OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
|
||||
# see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
|
||||
SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
|
||||
ENDIF()
|
||||
|
||||
IF (OPENMP_FOUND)
|
||||
MESSAGE(STATUS "Compiling with OpenMP support")
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
ENDIF()
|
||||
|
||||
|
||||
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
||||
|
||||
FIND_PACKAGE(MAGMA)
|
||||
@ -1282,7 +1298,6 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
SET(AT_CUDA_ENABLED 0)
|
||||
else()
|
||||
SET(AT_CUDA_ENABLED 1)
|
||||
find_package(CUDA 5.5 REQUIRED)
|
||||
endif()
|
||||
|
||||
IF (NOT AT_CUDA_ENABLED OR NOT CUDNN_FOUND)
|
||||
@ -1305,11 +1320,10 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
SET(AT_MKLDNN_ENABLED 0)
|
||||
SET(CAFFE2_USE_MKLDNN OFF)
|
||||
IF (USE_MKLDNN)
|
||||
FIND_PACKAGE(MKLDNN)
|
||||
INCLUDE(${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake)
|
||||
IF(MKLDNN_FOUND)
|
||||
SET(AT_MKLDNN_ENABLED 1)
|
||||
INCLUDE_DIRECTORIES(SYSTEM ${MKLDNN_INCLUDE_DIR})
|
||||
INCLUDE_DIRECTORIES(BEFORE SYSTEM ${MKLDNN_INCLUDE_DIR})
|
||||
IF(BUILD_CAFFE2_OPS)
|
||||
SET(CAFFE2_USE_MKLDNN ON)
|
||||
LIST(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::mkldnn)
|
||||
|
@ -2,7 +2,6 @@
|
||||
#
|
||||
# The following variables are optionally searched for defaults
|
||||
# MKL_FOUND : set to true if a library implementing the CBLAS interface is found
|
||||
# USE_MKLDNN
|
||||
#
|
||||
# The following are set after configuration is done:
|
||||
# MKLDNN_FOUND : set to true if mkl-dnn is found.
|
||||
@ -14,10 +13,6 @@ IF (NOT MKLDNN_FOUND)
|
||||
SET(MKLDNN_LIBRARIES)
|
||||
SET(MKLDNN_INCLUDE_DIR)
|
||||
|
||||
IF (NOT USE_MKLDNN)
|
||||
RETURN()
|
||||
ENDIF(NOT USE_MKLDNN)
|
||||
|
||||
IF(MSVC)
|
||||
MESSAGE(STATUS "MKL-DNN needs omp 3+ which is not supported in MSVC so far")
|
||||
RETURN()
|
||||
@ -41,28 +36,9 @@ ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR})
|
||||
|
||||
IF(MKL_FOUND)
|
||||
# Append to mkldnn dependencies
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES})
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR})
|
||||
# The OMP-related variables of MKL-DNN have to be overwritten here,
|
||||
# if MKL is used, and the OMP version is defined by MKL.
|
||||
# MKL_LIBRARIES_xxxx_LIBRARY is defined by MKL.
|
||||
# INTEL_MKL_DIR gives the MKL root path.
|
||||
IF (INTEL_MKL_DIR)
|
||||
SET(MKLROOT ${INTEL_MKL_DIR})
|
||||
IF(WIN32)
|
||||
SET(MKLIOMP5DLL ${MKL_LIBRARIES_libiomp5md_LIBRARY} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
|
||||
ELSE(WIN32)
|
||||
IF (MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLOMPLIB ${MKL_LIBRARIES_gomp_LIBRARY})
|
||||
ELSE(MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLOMPLIB ${MKL_LIBRARIES_iomp5_LIBRARY})
|
||||
ENDIF(MKL_LIBRARIES_gomp_LIBRARY)
|
||||
SET(MKLIOMP5LIB ${MKLOMPLIB} CACHE STRING "Overwrite MKL-DNN omp dependency" FORCE)
|
||||
ENDIF(WIN32)
|
||||
ELSE(INTEL_MKL_DIR)
|
||||
MESSAGE(STATUS "Warning: MKL is found, but INTEL_MKL_DIR is not set!")
|
||||
ENDIF(INTEL_MKL_DIR)
|
||||
|
||||
ELSE(MKL_FOUND)
|
||||
# If we cannot find MKL, we will use the Intel MKL Small library
|
||||
# comes with ${MKLDNN_ROOT}/external
|
||||
@ -75,60 +51,65 @@ ELSE(MKL_FOUND)
|
||||
ENDIF(NOT IS_DIRECTORY ${MKLDNN_ROOT}/external)
|
||||
|
||||
FILE(GLOB_RECURSE MKLML_INNER_INCLUDE_DIR ${MKLDNN_ROOT}/external/*/mkl.h)
|
||||
IF(MKLML_INNER_INCLUDE_DIR)
|
||||
# if user has multiple version under external/ then guess last
|
||||
# one alphabetically is "latest" and warn
|
||||
LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
|
||||
IF(MKLINCLEN GREATER 1)
|
||||
LIST(SORT MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
|
||||
SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
|
||||
ENDIF(MKLINCLEN GREATER 1)
|
||||
GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
|
||||
IF(NOT MKLML_INNER_INCLUDE_DIR)
|
||||
MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
|
||||
RETURN()
|
||||
ENDIF(NOT MKLML_INNER_INCLUDE_DIR)
|
||||
# if user has multiple version under external/ then guess last
|
||||
# one alphabetically is "latest" and warn
|
||||
LIST(LENGTH MKLML_INNER_INCLUDE_DIR MKLINCLEN)
|
||||
IF(MKLINCLEN GREATER 1)
|
||||
LIST(SORT MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(REVERSE MKLML_INNER_INCLUDE_DIR)
|
||||
LIST(GET MKLML_INNER_INCLUDE_DIR 0 MKLINCLST)
|
||||
SET(MKLML_INNER_INCLUDE_DIR "${MKLINCLST}")
|
||||
ENDIF(MKLINCLEN GREATER 1)
|
||||
GET_FILENAME_COMPONENT(MKLML_INNER_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR} DIRECTORY)
|
||||
LIST(APPEND MKLDNN_INCLUDE_DIR ${MKLML_INNER_INCLUDE_DIR})
|
||||
|
||||
IF(APPLE)
|
||||
SET(__mklml_inner_libs mklml iomp5)
|
||||
ELSE(APPLE)
|
||||
SET(__mklml_inner_libs mklml_intel iomp5)
|
||||
ENDIF(APPLE)
|
||||
|
||||
FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
|
||||
STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
|
||||
FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
|
||||
NAMES ${__mklml_inner_lib}
|
||||
PATHS "${MKLML_INNER_INCLUDE_DIR}/../lib"
|
||||
DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
|
||||
MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
|
||||
ENDFOREACH(__mklml_inner_lib)
|
||||
ENDIF(MKLML_INNER_INCLUDE_DIR)
|
||||
IF(APPLE)
|
||||
SET(__mklml_inner_libs mklml iomp5)
|
||||
ELSE(APPLE)
|
||||
SET(__mklml_inner_libs mklml_intel iomp5)
|
||||
ENDIF(APPLE)
|
||||
FOREACH(__mklml_inner_lib ${__mklml_inner_libs})
|
||||
STRING(TOUPPER ${__mklml_inner_lib} __mklml_inner_lib_upper)
|
||||
FIND_LIBRARY(${__mklml_inner_lib_upper}_LIBRARY
|
||||
NAMES ${__mklml_inner_lib}
|
||||
PATHS "${MKLML_INNER_INCLUDE_DIR}/../lib"
|
||||
DOC "The path to Intel(R) MKLML ${__mklml_inner_lib} library")
|
||||
MARK_AS_ADVANCED(${__mklml_inner_lib_upper}_LIBRARY)
|
||||
IF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
|
||||
MESSAGE(STATUS "MKL-DNN not found. Compiling without MKL-DNN support")
|
||||
RETURN()
|
||||
ENDIF(NOT ${__mklml_inner_lib_upper}_LIBRARY)
|
||||
LIST(APPEND MKLDNN_LIBRARIES ${${__mklml_inner_lib_upper}_LIBRARY})
|
||||
ENDFOREACH(__mklml_inner_lib)
|
||||
ENDIF(MKL_FOUND)
|
||||
|
||||
LIST(APPEND __mkldnn_looked_for MKLDNN_LIBRARIES)
|
||||
LIST(APPEND __mkldnn_looked_for MKLDNN_INCLUDE_DIR)
|
||||
INCLUDE(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(MKLDNN DEFAULT_MSG ${__mkldnn_looked_for})
|
||||
IF(MKL_FOUND)
|
||||
SET(MKL_cmake_included TRUE)
|
||||
SET(MKLDNN_THREADING "OMP:COMP" CACHE STRING "" FORCE)
|
||||
ENDIF(MKL_FOUND)
|
||||
SET(WITH_TEST FALSE CACHE BOOL "" FORCE)
|
||||
SET(WITH_EXAMPLE FALSE CACHE BOOL "" FORCE)
|
||||
SET(MKLDNN_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
|
||||
ADD_SUBDIRECTORY(${MKLDNN_ROOT})
|
||||
IF(NOT TARGET mkldnn)
|
||||
MESSAGE("Failed to include MKL-DNN target")
|
||||
RETURN()
|
||||
ENDIF(NOT TARGET mkldnn)
|
||||
IF(MKL_FOUND)
|
||||
TARGET_COMPILE_DEFINITIONS(mkldnn PRIVATE -DUSE_MKL)
|
||||
ENDIF(MKL_FOUND)
|
||||
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-maybe-uninitialized)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-strict-overflow)
|
||||
TARGET_COMPILE_OPTIONS(mkldnn PRIVATE -Wno-error=strict-overflow)
|
||||
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
LIST(APPEND MKLDNN_LIBRARIES mkldnn)
|
||||
|
||||
IF(MKLDNN_FOUND)
|
||||
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
ADD_COMPILE_OPTIONS(-Wno-maybe-uninitialized)
|
||||
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
|
||||
SET(WITH_TEST FALSE CACHE BOOL "build with mkl-dnn test" FORCE)
|
||||
SET(WITH_EXAMPLE FALSE CACHE BOOL "build with mkl-dnn examples" FORCE)
|
||||
ADD_SUBDIRECTORY(${MKLDNN_ROOT})
|
||||
SET(MKLDNN_LIB "${CMAKE_SHARED_LIBRARY_PREFIX}mkldnn${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
||||
IF(WIN32)
|
||||
LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/bin/${MKLDNN_LIB}")
|
||||
ELSE(WIN32)
|
||||
LIST(APPEND MKLDNN_LIBRARIES "${PROJECT_BINARY_DIR}/lib/${MKLDNN_LIB}")
|
||||
ENDIF(WIN32)
|
||||
ELSE(MKLDNN_FOUND)
|
||||
MESSAGE(STATUS "MKLDNN source files not found!")
|
||||
ENDIF(MKLDNN_FOUND)
|
||||
|
||||
UNSET(__mklml_inner_libs)
|
||||
UNSET(__mkldnn_looked_for)
|
||||
SET(MKLDNN_FOUND TRUE)
|
||||
MESSAGE(STATUS "Found MKL-DNN: TRUE")
|
||||
|
||||
ENDIF(NOT MKLDNN_FOUND)
|
||||
|
@ -9,6 +9,12 @@ endif()
|
||||
# release (3.11.3) yet. Hence we need our own Modules_CUDA_fix to enable sccache.
|
||||
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/../Modules_CUDA_fix)
|
||||
|
||||
# we dont want to statically link cudart, because we rely on it's dynamic linkage in
|
||||
# python (follow along torch/cuda/__init__.py and usage of cudaGetErrorName).
|
||||
# Technically, we can link cudart here statically, and link libtorch_python.so
|
||||
# to a dynamic libcudart.so, but that's just wasteful
|
||||
SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
|
||||
|
||||
# Find CUDA.
|
||||
find_package(CUDA 7.0)
|
||||
if(NOT CUDA_FOUND)
|
||||
@ -89,6 +95,9 @@ endif()
|
||||
|
||||
if(DEFINED ENV{CUDNN_LIBRARY})
|
||||
set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY})
|
||||
if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a")
|
||||
SET(CUDNN_STATIC_LINKAGE ON)
|
||||
endif()
|
||||
else()
|
||||
find_library(CUDNN_LIBRARY ${CUDNN_LIBNAME}
|
||||
HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
|
||||
@ -146,6 +155,9 @@ if(CAFFE2_USE_CUDNN)
|
||||
"${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
|
||||
endif()
|
||||
message(STATUS "Found cuDNN: v${CUDNN_VERSION} (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})")
|
||||
if(CUDNN_VERSION VERSION_LESS "7.0.0")
|
||||
message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ CUDA libraries wrapper
|
||||
@ -183,7 +195,7 @@ add_library(caffe2::cudart INTERFACE IMPORTED)
|
||||
if(CAFFE2_STATIC_LINK_CUDA)
|
||||
set_property(
|
||||
TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt)
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a" rt dl)
|
||||
else()
|
||||
set_property(
|
||||
TARGET caffe2::cudart PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
|
917
docs/source/community/contribution_guide.rst
Normal file
917
docs/source/community/contribution_guide.rst
Normal file
@ -0,0 +1,917 @@
|
||||
PyTorch Contribution Guide
|
||||
==========================
|
||||
|
||||
PyTorch is a GPU-accelerated Python tensor computation package for
|
||||
building deep neural networks built on tape-based autograd systems.
|
||||
|
||||
The PyTorch Contribution Process
|
||||
--------------------------------
|
||||
|
||||
The PyTorch organization is governed by `PyTorch
|
||||
Governance </docs/community/governance.html>`__.
|
||||
|
||||
The PyTorch development process involves a healthy amount of open
|
||||
discussions between the core development team and the community.
|
||||
|
||||
PyTorch operates similar to most open source projects on GitHub.
|
||||
However, if you've never contributed to an open source project before,
|
||||
here is the basic process.
|
||||
|
||||
- **Figure out what you're going to work on.** The majority of open
|
||||
source contributions come from people scratching their own itches.
|
||||
However, if you don't know what you want to work on, or are just
|
||||
looking to get more acquainted with the project, here are some tips
|
||||
for how to find appropriate tasks:
|
||||
|
||||
- Look through the `issue
|
||||
tracker <https://github.com/pytorch/pytorch/issues/>`__ and see if
|
||||
there are any issues you know how to fix. Issues that are
|
||||
confirmed by other contributors tend to be better to investigate.
|
||||
We also maintain some labels for issues which are likely to be
|
||||
good for new people, e.g., **bootcamp** and **1hr**, although
|
||||
these labels are less well maintained.
|
||||
- Join us on Slack and let us know you're interested in getting to
|
||||
know PyTorch. We're very happy to help out researchers and
|
||||
partners get up to speed with the codebase.
|
||||
|
||||
- **Figure out the scope of your change and reach out for design
|
||||
comments on a GitHub issue if it's large.** The majority of pull
|
||||
requests are small; in that case, no need to let us know about what
|
||||
you want to do, just get cracking. But if the change is going to be
|
||||
large, it's usually a good idea to get some design comments about it
|
||||
first.
|
||||
|
||||
- If you don't know how big a change is going to be, we can help you
|
||||
figure it out! Just post about it on issues or Slack.
|
||||
- Some feature additions are very standardized; for example, lots of
|
||||
people add new operators or optimizers to PyTorch. Design
|
||||
discussion in these cases boils down mostly to, “Do we want this
|
||||
operator/optimizer?” Giving evidence for its utility, e.g., usage
|
||||
in peer reviewed papers, or existence in other frameworks, helps a
|
||||
bit when making this case.
|
||||
- Core changes and refactors can be quite difficult to coordinate,
|
||||
as the pace of development on PyTorch master is quite fast.
|
||||
Definitely reach out about fundamental or cross-cutting changes;
|
||||
we can often give guidance about how to stage such changes into
|
||||
more easily reviewable pieces.
|
||||
|
||||
- **Code it out!**
|
||||
|
||||
- See the technical guide for advice for working with PyTorch in a
|
||||
technical form.
|
||||
|
||||
- **Open a pull request.**
|
||||
|
||||
- If you are not ready for the pull request to be reviewed, tag it
|
||||
with [WIP]. We will ignore it when doing review passes. If you are
|
||||
working on a complex change, it's good to start things off as WIP,
|
||||
because you will need to spend time looking at CI results to see
|
||||
if things worked out or not.
|
||||
- Find an appropriate reviewer for your change. We have some folks
|
||||
who regularly go through the PR queue and try to review
|
||||
everything, but if you happen to know who the maintainer for a
|
||||
given subsystem affected by your patch is, feel free to include
|
||||
them directly on the pull request. You can learn more about this
|
||||
structure at PyTorch Subsystem Ownership.
|
||||
|
||||
- **Iterate on the pull request until it's accepted!**
|
||||
|
||||
- We'll try our best to minimize the number of review roundtrips and
|
||||
block PRs only when there are major issues. For the most common
|
||||
issues in pull requests, take a look at `Common Mistakes </docs/community/contribution_guide.html#common-mistakes-to-avoid>`__.
|
||||
- Once a pull request is accepted and CI is passing, there is
|
||||
nothing else you need to do; we will merge the PR for you.
|
||||
|
||||
Getting Started
|
||||
---------------
|
||||
|
||||
Proposing new features
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
New feature ideas are best discussed on a specific issue. Please include
|
||||
as much information as you can, any accompanying data, and your proposed
|
||||
solution. The PyTorch team and community frequently reviews new issues
|
||||
and comments where they think they can help. If you feel confident in
|
||||
your solution, go ahead and implement it.
|
||||
|
||||
Reporting Issues
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
If you've identified an issue, first search through the `list of
|
||||
existing issues <https://github.com/pytorch/pytorch/issues>`__ on the
|
||||
repo. If you are unable to find a similar issue, then create a new one.
|
||||
Supply as much information you can to reproduce the problematic
|
||||
behavior. Also, include any additional insights like the behavior you
|
||||
expect.
|
||||
|
||||
Implementing Features or Fixing Bugs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you want to fix a specific issue, it's best to comment on the
|
||||
individual issue with your intent. However, we do not lock or assign
|
||||
issues except in cases where we have worked with the developer before.
|
||||
It's best to strike up a conversation on the issue and discuss your
|
||||
proposed solution. The PyTorch team can provide guidance that saves you
|
||||
time.
|
||||
|
||||
Issues that are labeled first-new-issue, low, or medium priority provide
|
||||
the best entrance point are great places to start.
|
||||
|
||||
Adding Tutorials
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
A great deal of the tutorials on `pytorch.org <http://pytorch.org/>`__
|
||||
come from the community itself and we welcome additional contributions.
|
||||
To learn more about how to contribute a new tutorial you can learn more
|
||||
here: `PyTorch.org Tutorial Contribution Guide on
|
||||
Github <https://github.com/pytorch/tutorials/#contributing>`__
|
||||
|
||||
Improving Documentation & Tutorials
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We aim to produce high quality documentation and tutorials. On rare
|
||||
occasions that content includes typos or bugs. If you find something you
|
||||
can fix, send us a pull request for consideration.
|
||||
|
||||
Take a look at the `Documentation <#on-documentation>`__ section to learn how our system
|
||||
works.
|
||||
|
||||
Participating in online discussions
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can find active discussions happening on the PyTorch Discussion
|
||||
`forum <https://discuss.pytorch.org/>`__.
|
||||
|
||||
Submitting pull requests to fix open issues
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can view a list of all open issues
|
||||
`here <https://github.com/pytorch/pytorch/issues>`__. Commenting on an
|
||||
issue is a great way to get the attention of the team. From here you can
|
||||
share your ideas and how you plan to resolve the issue.
|
||||
|
||||
For more challenging issues, the team will provide feedback and
|
||||
direction for how to best solve the issue.
|
||||
|
||||
If you're not able to fix the issue itself, commenting and sharing
|
||||
whether you can reproduce the issue can be useful for helping the team
|
||||
identify problem areas.
|
||||
|
||||
Reviewing open pull requests
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We appreciate your help reviewing and commenting on pull requests. Our
|
||||
team strives to keep the number of open pull requests at a manageable
|
||||
size, we respond quickly for more information if we need it, and we
|
||||
merge PRs that we think are useful. However, due to the high level of
|
||||
interest, additional eyes on pull requests is appreciated.
|
||||
|
||||
Improving code readability
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Improve code readability helps everyone. It is often better to submit a
|
||||
small number of pull requests that touch few files versus a large pull
|
||||
request that touches many files. Starting a discussion in the PyTorch
|
||||
forum `here <https://discuss.pytorch.org/>`__ or on an issue related to
|
||||
your improvement is the best way to get started.
|
||||
|
||||
Adding test cases to make the codebase more robust
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Additional test coverage is appreciated.
|
||||
|
||||
Promoting PyTorch
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Your use of PyTorch in your projects, research papers, write ups, blogs,
|
||||
or general discussions around the internet helps to raise awareness for
|
||||
PyTorch and our growing community. Please reach out to
|
||||
`pytorch-marketing@fb.com <http://mailto:pytorch-marketing@fb.com/>`__
|
||||
for marketing support.
|
||||
|
||||
Triaging issues
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
If you feel that an issue could benefit from a particular tag or level
|
||||
of complexity comment on the issue and share your opinion. If an you
|
||||
feel an issue isn't categorized properly comment and let the team know.
|
||||
|
||||
About open source development
|
||||
-----------------------------
|
||||
|
||||
If this is your first time contributing to an open source project, some
|
||||
aspects of the development process may seem unusual to you.
|
||||
|
||||
- **There is no way to “claim” issues.** People often want to “claim”
|
||||
an issue when they decide to work on it, to ensure that there isn't
|
||||
wasted work when someone else ends up working on it. This doesn't
|
||||
really work too well in open source, since someone may decide to work
|
||||
on something, and end up not having time to do it. Feel free to give
|
||||
information in an advisory fashion, but at the end of the day, we
|
||||
will take running code and rough consensus.
|
||||
- **There is a high bar for new functionality that is added.** Unlike
|
||||
in a corporate environment, where the person who wrote code
|
||||
implicitly “owns” it and can be expected to take care of it in the
|
||||
beginning of its lifetime, once a pull request is merged into an open
|
||||
source project, it immediately becomes the collective responsibility
|
||||
of all maintainers on the project. When we merge code, we are saying
|
||||
that we, the maintainers, are able to review subsequent changes and
|
||||
make a bugfix to the code. This naturally leads to a higher standard
|
||||
of contribution.
|
||||
|
||||
Common Mistakes To Avoid
|
||||
------------------------
|
||||
|
||||
- **Did you add tests?** (Or if the change is hard to test, did you
|
||||
describe how you tested your change?)
|
||||
|
||||
- We have a few motivations for why we ask for tests:
|
||||
|
||||
1. to help us tell if we break it later
|
||||
2. to help us tell if the patch is correct in the first place
|
||||
(yes, we did review it, but as Knuth says, “beware of the
|
||||
following code, for I have not run it, merely proven it
|
||||
correct”)
|
||||
|
||||
- When is it OK not to add a test? Sometimes a change can't be
|
||||
conveniently tested, or the change is so obviously correct (and
|
||||
unlikely to be broken) that it's OK not to test it. On the
|
||||
contrary, if a change is seems likely (or is known to be likely)
|
||||
to be accidentally broken, it's important to put in the time to
|
||||
work out a testing strategy.
|
||||
|
||||
- **Is your PR too long?**
|
||||
|
||||
- It's easier for us to review and merge small PRs. Difficulty of
|
||||
reviewing a PR scales nonlinearly with its size.
|
||||
- When is it OK to submit a large PR? It helps a lot if there was a
|
||||
corresponding design discussion in an issue, with sign off from
|
||||
the people who are going to review your diff. We can also help
|
||||
give advice about how to split up a large change into individually
|
||||
shippable parts. Similarly, it helps if there is a complete
|
||||
description of the contents of the PR: it's easier to review code
|
||||
if we know what's inside!
|
||||
|
||||
- **Comments for subtle things?** In cases where behavior of your code
|
||||
is nuanced, please include extra comments and documentation to allow
|
||||
us to better understand the intention of your code.
|
||||
- **Did you add a hack?** Sometimes a hack is the right answer. But
|
||||
usually we will have to discuss it.
|
||||
- **Do you want to touch a very core component?** In order to prevent
|
||||
major regressions, pull requests that touch core components receive
|
||||
extra scrutiny. Make sure you've discussed your changes with the team
|
||||
before undertaking major changes.
|
||||
- **Want to add a new feature?** If you want to add new features,
|
||||
comment your intention on the related issue. Our team tries to
|
||||
comment on and provide feedback to the community. It's better to have
|
||||
an open discussion with the team and the rest of the community prior
|
||||
to building new features. This helps us stay aware of what you're
|
||||
working on and increases the chance that it'll be merged.
|
||||
- **Did you touch unrelated code to the PR?** To aid in code review,
|
||||
please only include files in your pull request that are directly
|
||||
related to your changes.
|
||||
|
||||
Frequently asked questions
|
||||
|
||||
- **How can I contribute as a reviewer?** There is lots of value if
|
||||
community developer reproduce issues, try out new functionality, or
|
||||
otherwise help us identify or troubleshoot issues. Commenting on
|
||||
tasks or pull requests with your enviroment details is helpful and
|
||||
appreciated.
|
||||
- **CI tests failed, what does it mean?** Maybe you need to merge with
|
||||
master or rebase with latest changes. Pushing your changes should
|
||||
re-trigger CI tests. If the tests persist, you'll want to trace
|
||||
through the error messages and resolve the related issues.
|
||||
- **What are the most high risk changes?** Anything that tourhces build
|
||||
configuration is an risky area. Please avoid changing these unless
|
||||
you've had a discussion with the team beforehand.
|
||||
- **Hey, a commit showed up on my branch, what's up with that?**
|
||||
Sometimes another community member will provide a patch or fix to
|
||||
your pull request or branch. This is often needed for getting CI tests
|
||||
to pass.
|
||||
|
||||
On Documentation
|
||||
----------------
|
||||
|
||||
Python Docs
|
||||
~~~~~~~~~~~
|
||||
|
||||
PyTorch documentation is generated from python source using
|
||||
`Sphinx <http://www.sphinx-doc.org/en/master/>`__. Generated HTML is
|
||||
copied to the docs folder in the master branch of
|
||||
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
|
||||
and is served via GitHub pages.
|
||||
|
||||
- Site: http://pytorch.org/docs
|
||||
- GitHub: http://github.com/pytorch/pytorch/docs
|
||||
- Served from:
|
||||
`https://github.com/pytorch/pytorch.github.io/tree/master/doc <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
|
||||
|
||||
C++ Docs
|
||||
~~~~~~~~
|
||||
|
||||
For C++ code we use Doxygen to generate the content files. The C++ docs
|
||||
are built on a special server and the resulting files are copied to the
|
||||
https://github.com/pytorch/cppdocs repo, and are served from GitHub
|
||||
pages.
|
||||
|
||||
- Site: http://pytorch.org/cppdocs
|
||||
- GitHub: https://github.com/pytorch/pytorch/tree/master/docs/cpp
|
||||
- Served from: https://github.com/pytorch/cppdocs
|
||||
|
||||
Tutorials
|
||||
---------
|
||||
|
||||
PyTorch tutorials are documents used to help understand using PyTorch to
|
||||
accomplish specific tasks or to understand more holistic concepts.
|
||||
Tutorials are built using
|
||||
`Sphinx-Gallery <https://sphinx-gallery.readthedocs.io/en/latest/index.html>`__
|
||||
from executable python sources files, or from restructured-text (rst)
|
||||
files.
|
||||
|
||||
- Site: http://pytorch.org/tutorials
|
||||
- GitHub: http://github.com/pytorch/tutorials
|
||||
|
||||
Tutorials Build Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For tutorials, `pull
|
||||
requests <https://github.com/pytorch/tutorials/pulls>`__ trigger a
|
||||
rebuild the entire site using CircleCI to test the effects of the
|
||||
change. This build is sharded into 9 worker builds and takes around 40
|
||||
minutes total. At the same time, we do a Netlify build using *make
|
||||
html-noplot*, which builds the site without rendering the notebook
|
||||
output into pages for quick review.
|
||||
|
||||
After a PR is accepted, the site is rebuilt and deployed from CircleCI.
|
||||
|
||||
Contributing a new Tutorial
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`PyTorch.org Tutorial Contribution
|
||||
Guide <https://github.com/pytorch/tutorials/#contributing>`__
|
||||
|
||||
Code Style
|
||||
~~~~~~~~~~
|
||||
|
||||
**Python style**
|
||||
|
||||
**C++ style**
|
||||
|
||||
Submitting a Pull Request
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
PyTorch development happens publicly on our Github repo.
|
||||
|
||||
To have your feature or fix added to PyTorch, please submit a Pull
|
||||
Request.
|
||||
|
||||
Running Tests
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
Show examples for running all tests, just one individual...
|
||||
|
||||
Technical Process
|
||||
-----------------
|
||||
|
||||
Developing PyTorch
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To develop PyTorch on your machine, here are some tips:
|
||||
|
||||
1. Uninstall all existing PyTorch installs:
|
||||
|
||||
::
|
||||
|
||||
conda uninstall pytorch
|
||||
pip uninstall torch
|
||||
pip uninstall torch # run this command twice
|
||||
|
||||
2. Clone a copy of PyTorch from source:
|
||||
|
||||
::
|
||||
|
||||
git clone https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
|
||||
3. Install PyTorch in ``build develop`` mode:
|
||||
|
||||
A full set of instructions on installing PyTorch from source is here:
|
||||
https://github.com/pytorch/pytorch#from-source
|
||||
|
||||
The change you have to make is to replace
|
||||
|
||||
::
|
||||
|
||||
python setup.py install
|
||||
|
||||
with
|
||||
|
||||
::
|
||||
|
||||
python setup.py build develop
|
||||
|
||||
This is especially useful if you are only changing Python files.
|
||||
|
||||
This mode will symlink the Python files from the current local source
|
||||
tree into the Python install.
|
||||
|
||||
Hence, if you modify a Python file, you do not need to reinstall PyTorch
|
||||
again and again.
|
||||
|
||||
For example:
|
||||
|
||||
- Install local PyTorch in ``build develop`` mode
|
||||
- modify your Python file ``torch/__init__.py`` (for example)
|
||||
- test functionality
|
||||
- modify your Python file ``torch/__init__.py``
|
||||
- test functionality
|
||||
- modify your Python file ``torch/__init__.py``
|
||||
- test functionality
|
||||
|
||||
You do not need to repeatedly install after modifying Python files.
|
||||
|
||||
In case you want to reinstall, make sure that you uninstall PyTorch
|
||||
first by running ``pip uninstall torch`` and ``python setup.py clean``.
|
||||
Then you can install in ``build develop`` mode again.
|
||||
|
||||
Codebase structure
|
||||
------------------
|
||||
|
||||
- `c10 <https://github.com/pytorch/pytorch/blob/master/c10>`__ - Core
|
||||
library files that work everywhere, both server and mobile. We are
|
||||
slowly moving pieces from
|
||||
`ATen/core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
|
||||
here. This library is intended only to contain essential
|
||||
functionality, and appropriate to use in settings where binary size
|
||||
matters. (But you'll have a lot of missing functionality if you try
|
||||
to use it directly.)
|
||||
- `aten <https://github.com/pytorch/pytorch/blob/master/aten>`__ - C++
|
||||
tensor library for PyTorch (no autograd support)
|
||||
|
||||
- `src <https://github.com/pytorch/pytorch/blob/master/aten/src>`__
|
||||
|
||||
- `TH <https://github.com/pytorch/pytorch/blob/master/aten/src/TH>`__
|
||||
`THC <https://github.com/pytorch/pytorch/blob/master/aten/src/THC>`__
|
||||
`THNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THNN>`__
|
||||
`THCUNN <https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN>`__
|
||||
- Legacy library code from the original Torch. Try not to add
|
||||
things here; we're slowly porting these to
|
||||
`native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__.
|
||||
|
||||
- generic - Contains actual implementations of operators,
|
||||
parametrized over ``scalar_t``. Files here get compiled N
|
||||
times per supported scalar type in PyTorch.
|
||||
|
||||
- `ATen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen>`__
|
||||
|
||||
- `core <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/core>`__
|
||||
- Core functionality of ATen. This is migrating to top-level
|
||||
c10 folder.
|
||||
- `native <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native>`__
|
||||
- Modern implementations of operators. If you want to write
|
||||
a new operator, here is where it should go. Most CPU
|
||||
operators go in the top level directory, except for
|
||||
operators which need to be compiled specially; see cpu
|
||||
below.
|
||||
|
||||
- `cpu <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu>`__
|
||||
- Not actually CPU implementations of operators, but
|
||||
specifically implementations which are compiled with
|
||||
processor-specific instructions, like AVX. See the
|
||||
`README <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/README.md>`__
|
||||
for more details.
|
||||
- `cuda <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda>`__
|
||||
- CUDA implementations of operators.
|
||||
- `sparse <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/sparse>`__
|
||||
- CPU and CUDA implementations of COO sparse tensor
|
||||
operations
|
||||
- `mkl <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkl>`__
|
||||
`mkldnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/mkldnn>`__
|
||||
`miopen <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/miopen>`__
|
||||
`cudnn <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn>`__
|
||||
|
||||
- implementations of operators which simply bind to some
|
||||
backend library.
|
||||
|
||||
- `torch <https://github.com/pytorch/pytorch/blob/master/torch>`__ -
|
||||
The actual PyTorch library. Everything that is not in
|
||||
`csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
|
||||
is a Python module, following the PyTorch Python frontend module
|
||||
structure.
|
||||
|
||||
- `csrc <https://github.com/pytorch/pytorch/blob/master/torch/csrc>`__
|
||||
- C++ files composing the PyTorch library. Files in this directory
|
||||
tree are a mix of Python binding code, and C++ heavy lifting.
|
||||
Consult ``setup.py`` for the canonical list of Python binding
|
||||
files; conventionally, they are often prefixed with ``python_``.
|
||||
|
||||
- `jit <https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit>`__
|
||||
- Compiler and frontend for TorchScript JIT frontend.
|
||||
- `autograd <https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd>`__
|
||||
- Implementation of reverse-mode automatic differentiation.
|
||||
- `api <https://github.com/pytorch/pytorch/blob/master/torch/csrc/api>`__
|
||||
- The PyTorch C++ frontend.
|
||||
- `distributed <https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed>`__
|
||||
- Distributed training support for PyTorch.
|
||||
|
||||
- `tools <https://github.com/pytorch/pytorch/blob/master/tools>`__ -
|
||||
Code generation scripts for the PyTorch library. See
|
||||
`README <https://github.com/pytorch/pytorch/blob/master/tools/README.md>`__
|
||||
of this directory for more details.
|
||||
- `test <https://github.com/pytorch/pytorch/blob/master/tests>`__ -
|
||||
Python unit tests for PyTorch Python frontend.
|
||||
|
||||
- `test\_torch.py <https://github.com/pytorch/pytorch/blob/master/test/test_torch.py>`__
|
||||
- Basic tests for PyTorch functionality.
|
||||
- `test\_autograd.py <https://github.com/pytorch/pytorch/blob/master/test/test_autograd.py>`__
|
||||
- Tests for non-NN automatic differentiation support.
|
||||
- `test\_nn.py <https://github.com/pytorch/pytorch/blob/master/test/test_nn.py>`__
|
||||
- Tests for NN operators and their automatic differentiation.
|
||||
- `test\_jit.py <https://github.com/pytorch/pytorch/blob/master/test/test_jit.py>`__
|
||||
- Tests for the JIT compiler and TorchScript.
|
||||
- ...
|
||||
- `cpp <https://github.com/pytorch/pytorch/blob/master/test/cpp>`__
|
||||
- C++ unit tests for PyTorch C++ frontend.
|
||||
- `expect <https://github.com/pytorch/pytorch/blob/master/test/expect>`__
|
||||
- Automatically generated "expect" files which are used to compare
|
||||
against expected output.
|
||||
- `onnx <https://github.com/pytorch/pytorch/blob/master/test/onnx>`__
|
||||
- Tests for ONNX export functionality, using both PyTorch and
|
||||
Caffe2.
|
||||
|
||||
- `caffe2 <https://github.com/pytorch/pytorch/blob/master/caffe2>`__ -
|
||||
The Caffe2 library.
|
||||
|
||||
- `core <https://github.com/pytorch/pytorch/blob/master/caffe2/core>`__
|
||||
- Core files of Caffe2, e.g., tensor, workspace, blobs, etc.
|
||||
- `operators <https://github.com/pytorch/pytorch/blob/master/caffe2/operators>`__
|
||||
- Operators of Caffe2.
|
||||
- `python <https://github.com/pytorch/pytorch/blob/master/caffe2/python>`__
|
||||
- Python bindings to Caffe2.
|
||||
- ...
|
||||
|
||||
Unit Testing
|
||||
------------
|
||||
|
||||
PyTorch's testing is located under ``test/``. Run the entire test suite
|
||||
with
|
||||
|
||||
::
|
||||
|
||||
python test/run_test.py
|
||||
|
||||
or run individual test files, like ``python test/test_nn.py``, for
|
||||
individual test suites.
|
||||
|
||||
Better local unit tests with pytest
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We don't officially support ``pytest``, but it works well with our
|
||||
``unittest`` tests and offers a number of useful features for local
|
||||
developing. Install it via ``pip install pytest``.
|
||||
|
||||
If you want to just run tests that contain a specific substring, you can
|
||||
use the ``-k`` flag:
|
||||
|
||||
::
|
||||
|
||||
pytest test/test_nn.py -k Loss -v
|
||||
|
||||
The above is an example of testing a change to Loss functions: this
|
||||
command runs tests such as ``TestNN.test_BCELoss``\ and
|
||||
``TestNN.test_MSELoss`` and can be useful to save keystrokes.
|
||||
|
||||
Writing documentation
|
||||
---------------------
|
||||
|
||||
PyTorch uses `Google
|
||||
style <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__
|
||||
for formatting docstrings. Lines inside docstring blocks must be
|
||||
limited to 80 characters so they fit into Jupyter documentation popups.
|
||||
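As a rough sketch of what a Google-style docstring looks like in practice (the
function and argument names here are made up purely for illustration)::

    import torch

    def grow(tensor, amount=1):
        r"""Adds ``amount`` to every element of ``tensor``.

        Arguments:
            tensor (Tensor): the input tensor.
            amount (int, optional): value added to each element. Default: 1.

        Returns:
            Tensor: a new tensor with ``amount`` added to every element.

        Example::

            >>> grow(torch.zeros(2), amount=3)
            tensor([3., 3.])
        """
        return tensor + amount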
|
||||
For C++ documentation (https://pytorch.org/cppdocs), we use
|
||||
`Doxygen <http://www.doxygen.nl/>`__ and then convert it to
|
||||
`Sphinx <http://www.sphinx-doc.org/>`__ via
|
||||
`Breathe <https://github.com/michaeljones/breathe>`__
|
||||
and `Exhale <https://github.com/svenevs/exhale>`__. Check the `Doxygen
|
||||
reference <http://www.stack.nl/~dimitri/doxygen/manual/index.html>`__
|
||||
for more information on the documentation syntax. To build the
|
||||
documentation locally, ``cd`` into ``docs/cpp`` and then ``make html``.
|
||||
|
||||
We run Doxygen in CI (Travis) to verify that you do not use invalid
|
||||
Doxygen commands. To run this check locally, run ``./check-doxygen.sh``
|
||||
from inside ``docs/cpp``.
|
||||
|
||||
Managing multiple build trees
|
||||
-----------------------------
|
||||
|
||||
One downside to using ``python setup.py develop`` is that your
|
||||
development version of PyTorch will be installed globally on your
|
||||
account (e.g., if you run ``import torch`` anywhere else, the
|
||||
development version will be used).
|
||||
|
||||
If you want to manage multiple builds of PyTorch, you can make use of
|
||||
`conda environments <https://conda.io/docs/using/envs.html>`__ to
|
||||
maintain separate Python package environments, each of which can be tied
|
||||
to a specific build of PyTorch. To set one up:
|
||||
|
||||
::
|
||||
|
||||
conda create -n pytorch-myfeature
source activate pytorch-myfeature
# if you run python now, torch will NOT be installed
|
||||
python setup.py build develop
|
||||
|
||||
C++ Development tips
|
||||
--------------------
|
||||
|
||||
If you are working on the C++ code, there are a few important things
|
||||
that you will want to keep in mind:
|
||||
|
||||
1. How to rebuild only the code you are working on.
|
||||
2. How to make rebuilds in the absence of changes go faster.
|
||||
|
||||
Build only what you need.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``python setup.py build`` will build everything, but since our build
|
||||
system is not very optimized for incremental rebuilds, this will
|
||||
actually be very slow. Far better is to only request rebuilds of the
|
||||
parts of the project you are working on:
|
||||
|
||||
- Working on the Python bindings? Run ``python setup.py develop`` to
|
||||
rebuild (NB: no ``build`` here!)
|
||||
- Working on ``torch/csrc`` or ``aten``? Run
|
||||
``python setup.py rebuild_libtorch`` to rebuild and avoid having to
|
||||
rebuild the other libraries we depend on.
|
||||
- Working on one of the other dependent libraries? The other valid
|
||||
targets are listed in ``dep_libs`` in ``setup.py``. Prepend
|
||||
``build_`` to get a target, and run as e.g.
|
||||
``python setup.py build_gloo``.
|
||||
- Working on a test binary? Run
|
||||
``(cd build && ninja bin/test_binary_name)`` to rebuild only that
|
||||
test binary (without rerunning cmake). (Replace ``ninja`` with
|
||||
``make`` if you don't have ninja installed).
|
||||
|
||||
On the initial build, you can also speed things up with the environment
|
||||
variables ``DEBUG``, ``REL_WITH_DEB_INFO``, and ``NO_CUDA``.
|
||||
|
||||
- ``DEBUG=1`` will enable debug builds (-g -O0)
|
||||
- ``REL_WITH_DEB_INFO=1`` will enable debug symbols with optimizations
|
||||
(-g -O3)
|
||||
- ``NO_CUDA=1`` will disable compiling CUDA (in case you are developing
|
||||
on something not CUDA related), to save compile time.
|
||||
|
||||
For example:
|
||||
|
||||
::
|
||||
|
||||
NO_CUDA=1 DEBUG=1 python setup.py build develop
|
||||
|
||||
Make sure you continue to pass these flags on subsequent builds.
|
||||
|
||||
Code completion and IDE support
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When using ``python setup.py develop``, PyTorch will generate a
|
||||
``compile_commands.json`` file that can be used by many editors to
|
||||
provide command completion and error highlighting for PyTorch's C++
|
||||
code. You need to ``pip install ninja`` to generate accurate information
|
||||
for the code in ``torch/csrc``. More information at:
|
||||
|
||||
- https://sarcasm.github.io/notes/dev/compilation-database.html
|
||||
|
||||
Make no-op build fast.
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use Ninja
|
||||
~~~~~~~~~
|
||||
|
||||
Python ``setuptools`` is pretty dumb, and always rebuilds every C file
|
||||
in a project. If you install the ninja build system with
|
||||
``pip install ninja``, then PyTorch will use it to track dependencies
|
||||
correctly. If PyTorch was already built, you will need to run
|
||||
``python setup.py clean`` once after installing ninja for builds to
|
||||
succeed.
|
||||
|
||||
Use CCache
|
||||
~~~~~~~~~~
|
||||
|
||||
Even when dependencies are tracked with file modification times, there are
|
||||
many situations where files get rebuilt even though a previous compilation was
|
||||
exactly the same.
|
||||
|
||||
Using ccache in a situation like this is a real time-saver. However, by
|
||||
default, ccache does not properly support CUDA stuff, so here are the
|
||||
instructions for installing a custom ccache fork that has CUDA support:
|
||||
|
||||
::
|
||||
|
||||
# install and export ccache
if ! ls ~/ccache/bin/ccache
then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y automake autoconf
|
||||
sudo apt-get install -y asciidoc
|
||||
mkdir -p ~/ccache
|
||||
pushd /tmp
|
||||
rm -rf ccache
|
||||
git clone https://github.com/colesbury/ccache -b ccbin
|
||||
pushd ccache
|
||||
./autogen.sh
|
||||
./configure
|
||||
make install prefix=~/ccache
|
||||
popd
popd
|
||||
|
||||
mkdir -p ~/ccache/lib
|
||||
mkdir -p ~/ccache/cuda
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
|
||||
ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc
|
||||
|
||||
~/ccache/bin/ccache -M 25Gi
fi

export PATH=~/ccache/lib:$PATH
export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
|
||||
|
||||
CUDA Development tips
|
||||
---------------------
|
||||
|
||||
If you are working on the CUDA code, here are some useful CUDA debugging
|
||||
tips:
|
||||
|
||||
1. ``CUDA_DEVICE_DEBUG=1`` will enable CUDA device function debug
|
||||
symbols (``-g -G``). This will be particularly helpful in debugging
|
||||
device code. However, it will slow down the build process by about
|
||||
50% (compared to only ``DEBUG=1``), so use wisely.
|
||||
2. ``cuda-gdb`` and ``cuda-memcheck`` are your best CUDA debugging
|
||||
friends. Unlike ``gdb``, ``cuda-gdb`` can display actual values in a
|
||||
CUDA tensor (rather than all zeros).
|
||||
|
||||
Hope this helps, and thanks for considering contributing.
|
||||
|
||||
Windows development tips
|
||||
------------------------
|
||||
|
||||
Occasionally, you will write a patch which works on Linux, but fails CI
|
||||
on Windows. There are a few aspects in which MSVC (the Windows compiler
|
||||
toolchain we use) is stricter than Linux, which are worth keeping in
|
||||
mind when fixing these problems.
|
||||
|
||||
1. Symbols are NOT exported by default on Windows; instead, you have to
|
||||
explicitly mark a symbol as exported/imported in a header file with
|
||||
``__declspec(dllexport)`` / ``__declspec(dllimport)``. We have
|
||||
codified this pattern into a set of macros which follow the
|
||||
convention ``*_API``, e.g., ``CAFFE2_API`` inside Caffe2 and ATen.
|
||||
(Every separate shared library needs a unique macro name, because
|
||||
symbol visibility is on a per shared library basis. See
|
||||
c10/macros/Macros.h for more details.) The upshot is if you see an
|
||||
"unresolved external" error in your Windows build, this is probably
|
||||
because you forgot to mark a function with ``*_API``. However, there
|
||||
is one important counterexample to this principle: if you want a
|
||||
*templated* function to be instantiated at the call site, do NOT mark
|
||||
it with ``*_API`` (if you do mark it, you'll have to explicitly
|
||||
instantiate all of the specializations used by the call sites.)
|
||||
2. If you link against a library, this does not make its dependencies
|
||||
transitively visible. You must explicitly specify a link dependency
|
||||
against every library whose symbols you use. (This is different from
|
||||
Linux where in most environments, transitive dependencies can be used
|
||||
to fulfill unresolved symbols.)
|
||||
3. If you have a Windows box (we have a few on EC2 which you can request
|
||||
access to) and you want to run the build, the easiest way is to just
|
||||
run ``.jenkins/pytorch/win-build.sh``. If you need to rebuild, run
|
||||
``REBUILD=1 .jenkins/pytorch/win-build.sh`` (this will avoid blowing
|
||||
away your Conda environment.)
|
||||
|
||||
Even if you don't know anything about MSVC, you can use cmake to build
|
||||
simple programs on Windows; this can be helpful if you want to learn
|
||||
more about some peculiar linking behavior by reproducing it on a small
|
||||
example. Here's a simple example cmake file that defines two dynamic
|
||||
libraries, one linking with the other:
|
||||
|
||||
::
|
||||
|
||||
project(myproject CXX)
set(CMAKE_CXX_STANDARD 11)
add_library(foo SHARED foo.cpp)
add_library(bar SHARED bar.cpp)
# NB: don't forget to __declspec(dllexport) at least one symbol from foo,
# otherwise foo.lib will not be created.
target_link_libraries(bar PUBLIC foo)
|
||||
|
||||
You can build it with:
|
||||
|
||||
::
|
||||
|
||||
mkdir build
cd build
|
||||
cmake ..
|
||||
cmake --build .
|
||||
|
||||
Known MSVC (and MSVC with NVCC) bugs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The PyTorch codebase sometimes likes to use exciting C++ features, and
|
||||
these exciting features lead to exciting bugs in Windows compilers. To
|
||||
add insult to injury, the error messages will often not tell you which
|
||||
line of code actually induced the erroring template instantiation. We've
|
||||
found the most effective way to debug these problems is to carefully
|
||||
read over diffs, keeping in mind known bugs in MSVC/NVCC. Here are a few
|
||||
well known pitfalls and workarounds:
|
||||
|
||||
- This is not actually a bug per se, but in general, code generated by
|
||||
MSVC is more sensitive to memory errors; you may have written some
|
||||
code that does a use-after-free or stack overflows; on Linux the code
|
||||
might work, but on Windows your program will crash. ASAN may not
|
||||
catch all of these problems: stay vigilant to the possibility that
|
||||
your crash is due to a real memory problem.
|
||||
- (NVCC) ``c10::optional`` does not work when used from device code.
|
||||
Don't use it from kernels. Upstream issue:
|
||||
https://github.com/akrzemi1/Optional/issues/58 and our local issue
|
||||
#10329.
|
||||
- ``constexpr`` generally works less well on MSVC.
|
||||
|
||||
- The idiom ``static_assert(f() == f())`` to test if ``f`` is
|
||||
constexpr does not work; you'll get "error C2131: expression did
|
||||
not evaluate to a constant". Don't use these asserts on Windows.
|
||||
(Example: ``c10/util/intrusive_ptr.h``)
|
||||
|
||||
- (NVCC) Code you access inside a ``static_assert`` will eagerly be
|
||||
evaluated as if it were device code, and so you might get an error
|
||||
that the code is "not accessible".
|
||||
|
||||
::
|
||||
|
||||
class A {
|
||||
static A singleton_;
|
||||
static constexpr inline A* singleton() {
|
||||
return &singleton_;
|
||||
}
|
||||
};
static_assert(std::is_same<A*, decltype(A::singleton())>::value, "hmm");
|
||||
|
||||
- The compiler will run out of heap space if you attempt to compile
|
||||
files that are too large. Splitting such files into separate files
|
||||
helps. (Example: ``THTensorMath``, ``THTensorMoreMath``,
|
||||
``THTensorEvenMoreMath``.)
|
||||
- MSVC's preprocessor (but not the standard compiler) has a bug where
|
||||
it incorrectly tokenizes raw string literals, ending when it sees a
|
||||
``"``. This causes preprocessor tokens inside the literal like
|
||||
an ``#endif`` to be incorrectly treated as preprocessor directives.
|
||||
See https://godbolt.org/z/eVTIJq as an example.
|
||||
|
||||
Running Clang-Tidy
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`Clang-Tidy <https://clang.llvm.org/extra/clang-tidy/index.html>`__ is a
|
||||
C++ linter and static analysis tool based on the clang compiler. We run
|
||||
clang-tidy in our CI to make sure that new C++ code is safe, sane and
|
||||
efficient. See our
|
||||
`.travis.yml <https://github.com/pytorch/pytorch/blob/master/.travis.yml>`__
|
||||
file for the simple commands we use for this. To run clang-tidy locally,
|
||||
follow these steps:
|
||||
|
||||
1. Install clang-tidy. First, check if you already have clang-tidy by
|
||||
simply writing ``clang-tidy`` in your terminal. If you don't yet have
|
||||
clang-tidy, you should be able to install it easily with your package
|
||||
manager, e.g. by writing ``apt-get install clang-tidy`` on Ubuntu.
|
||||
See `https://apt.llvm.org <https://apt.llvm.org/>`__ for details on
|
||||
how to install the latest version. Note that newer versions of
|
||||
clang-tidy will have more checks than older versions. In our CI, we
|
||||
run clang-tidy-6.0.
|
||||
2. Use our driver script to run clang-tidy over any changes relative to
|
||||
some git revision (you may want to replace ``HEAD~1`` with ``HEAD``
|
||||
to pick up uncommitted changes). Changes are picked up based on a
|
||||
``git diff`` with the given revision:
|
||||
|
||||
::
|
||||
|
||||
python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'
|
||||
|
||||
Above, it is assumed you are in the PyTorch root folder.
|
||||
``path/to/build`` should be the path to where you built PyTorch from
|
||||
source, e.g. ``build`` in the PyTorch root folder if you used
|
||||
``setup.py build``. You can use ``-c <clang-tidy-binary>`` to change
|
||||
the clang-tidy binary this script uses. Make sure you have PyYAML installed,
|
||||
which is in PyTorch's ``requirements.txt``.
|
||||
|
||||
Pre-commit Tidy/Linting Hook
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We use clang-tidy and flake8 to perform additional formatting and
|
||||
semantic checking of code. We provide a pre-commit git hook for
|
||||
performing these checks, before a commit is created:
|
||||
|
||||
::
|
||||
|
||||
ln -s ../../tools/git-pre-commit .git/hooks/pre-commit
|
||||
|
||||
Caffe2 notes
|
||||
------------
|
||||
|
||||
In 2018, we merged Caffe2 into the PyTorch source repository. While the
|
||||
steady state aspiration is that Caffe2 and PyTorch share code freely, in
|
||||
the meantime there will be some separation. If you submit a PR to only
|
||||
PyTorch or only Caffe2 code, CI will only run for the project you
|
||||
edited. The logic for this is implemented in
|
||||
``.jenkins/pytorch/dirty.sh`` and ``.jenkins/caffe2/dirty.sh``; you can
|
||||
look at this to see what path prefixes constitute changes. This also
|
||||
means if you ADD a new top-level path, or you start sharing code between
|
||||
projects, you need to modify these files. There are a few "unusual"
|
||||
directories which, for historical reasons, are Caffe2/PyTorch specific.
|
||||
Here they are:
|
||||
|
||||
- ``CMakeLists.txt``, ``Makefile``, ``binaries``, ``cmake``, ``conda``,
|
||||
``modules``, ``scripts`` are Caffe2-specific. Don't put PyTorch code
|
||||
in them without extra coordination.
|
||||
- ``mypy*``, ``requirements.txt``, ``setup.py``, ``test``, ``tools``
|
||||
are PyTorch-specific. Don't put Caffe2 code in them without extra
|
||||
coordination.
|
154
docs/source/community/governance.rst
Normal file
@ -0,0 +1,154 @@
|
||||
PyTorch Governance
|
||||
==========================
|
||||
|
||||
Governance Philosophy and Guiding Tenets
|
||||
-----------------------------------------
|
||||
|
||||
PyTorch adopts a governance structure with a small set of maintainers
|
||||
driving the overall project direction with a strong bias towards
|
||||
PyTorch's design philosophy where design and code contributions are
|
||||
valued. Beyond the core maintainers, there is also a slightly broader
|
||||
set of core developers that have the ability to directly merge pull
|
||||
requests and own various parts of the core code base.
|
||||
|
||||
Beyond the maintainers and core devs, the community is encouraged to
|
||||
contribute, file issues, make proposals, review pull requests and be
|
||||
present in the community. Given contributions and willingness to
|
||||
invest, anyone can be provided write access or ownership of parts of
|
||||
the codebase.
|
||||
|
||||
Based on this governance structure, the project has the following core
|
||||
operating tenets by which decisions are made and overall culture is
|
||||
derived:
|
||||
|
||||
1. **Code contributions** matter much more than corporate sponsorship
|
||||
and independent developers are highly valued.
|
||||
2. **Project influence** is gained through contributions (whether PRs,
|
||||
forum answers, code reviews or otherwise)
|
||||
|
||||
Key people and their functions
|
||||
------------------------------
|
||||
|
||||
Project Maintainers
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Project maintainers provide leadership and direction for the PyTorch
|
||||
project. Specifics include:
|
||||
|
||||
- Articulate a cohesive long-term vision for the project
|
||||
- Possess a deep understanding of the PyTorch code base
|
||||
- Negotiate and resolve contentious issues in ways acceptable to all
|
||||
parties involved
|
||||
|
||||
PyTorch Maintainers:
|
||||
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
|
||||
- (sunsetting) Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
Core Developers
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
The PyTorch project is developed by a team of core developers. You can
|
||||
find the list of core developers at `PyTorch Governance \| Persons of
|
||||
Interest </docs/community/persons_of_interest.html>`__.
|
||||
|
||||
While membership is determined by presence in the "PyTorch core" team in
|
||||
the "PyTorch"
|
||||
`organization <https://github.com/orgs/pytorch/teams/facebook>`__ on
|
||||
GitHub, contribution takes many forms:
|
||||
|
||||
- committing changes to the repository;
|
||||
- reviewing pull requests by others;
|
||||
- triaging bug reports on the issue tracker;
|
||||
- discussing topics on official PyTorch communication channels.
|
||||
|
||||
Moderators
|
||||
~~~~~~~~~~
|
||||
|
||||
There is a group of people, some of whom are not core developers,
|
||||
responsible for ensuring that discussions on official communication
|
||||
channels adhere to the Code of Conduct. They take action in view of
|
||||
violations and help to support a healthy community. You can find the
|
||||
list of moderators `here <https://discuss.pytorch.org/about>`__.
|
||||
|
||||
Decision Making
|
||||
---------------
|
||||
|
||||
Uncontroversial Changes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Primary work happens through bug tracker issues and pull requests on
|
||||
GitHub. Core developers should avoid pushing their changes directly to
|
||||
the PyTorch repository, instead relying on pull requests. Approving a
|
||||
pull request by a core developer allows it to be merged without further
|
||||
process. Core Developers and Project Maintainers ultimately approve
|
||||
these changes.
|
||||
|
||||
Notifying relevant experts about a bug tracker issue or a pull request
|
||||
is important. Reviews from experts in the given interest area are
|
||||
strongly preferred, especially on pull request approvals. Failure to do
|
||||
so might end up with the change being reverted by the relevant expert.
|
||||
|
||||
Controversial decision process
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Substantial changes in a given interest area require a GitHub issue to
|
||||
be opened for discussion. This includes:
|
||||
|
||||
- Any semantic or syntactic change to the framework.
|
||||
- Backwards-incompatible changes to the Python or C++ API.
|
||||
- Additions to the core framework, including substantial new
|
||||
functionality within an existing library.
|
||||
- Removing core features
|
||||
|
||||
Project Maintainers ultimately approve these changes.
|
||||
|
||||
FAQ
|
||||
---
|
||||
|
||||
**Q: What if I would like to own (or partly own) a part of the project
|
||||
such as a domain API (e.g. Torch Vision)?** This is absolutely possible.
|
||||
The first step is to start contributing to the existing project area and
|
||||
contributing to its health and success. In addition to this, you can
|
||||
make a proposal through a GitHub issue for new functionality or changes
|
||||
to improve the project area.
|
||||
|
||||
**Q: What if I am a company looking to use PyTorch internally for
|
||||
development, can I be granted or purchase a board seat to drive the
|
||||
project direction?** No, the PyTorch project is strictly driven by the
|
||||
maintainer-driven project philosophy and does not have a board or
|
||||
vehicle to take financial contributions relating to gaining influence
|
||||
over technical direction.
|
||||
|
||||
**Q: Does the PyTorch project support grants or ways to support
|
||||
independent developers using or contributing to the project?** No, not
|
||||
at this point. We are however looking at ways to better support the
|
||||
community of independent developers around PyTorch. If you have
|
||||
suggestions or inputs, please reach out on the PyTorch forums to
|
||||
discuss.
|
||||
|
||||
**Q: How do I contribute code to the project?** If the change is
|
||||
relatively minor, a pull request on GitHub can be opened up immediately
|
||||
for review and merge by the project committers. For larger changes,
|
||||
please open an issue to make a proposal to discuss prior. Please also
|
||||
see the **`PyTorch Contributor
|
||||
Guide </docs/community/contribution_guide.html>`__** for contribution
|
||||
guidelines.
|
||||
|
||||
**Q: Can I become a committer on the project?** Unfortunately, the
|
||||
current commit process to PyTorch involves an interaction with Facebook
|
||||
infrastructure that can only be triggered by Facebook employees. We are
|
||||
however looking at ways to expand the committer base to individuals
|
||||
outside of Facebook and will provide an update when the tooling exists
|
||||
to allow this.
|
||||
|
||||
**Q: What if I would like to deliver a PyTorch tutorial at a conference
|
||||
or otherwise? Do I need to be 'officially' a committer to do this?** No,
|
||||
we encourage community members to showcase their work wherever and
|
||||
whenever they can. Please reach out to
|
||||
`pytorch-marketing@fb.com <mailto:pytorch-marketing@fb.com>`__
|
||||
for marketing support.
|
130
docs/source/community/persons_of_interest.rst
Normal file
@ -0,0 +1,130 @@
|
||||
PyTorch Governance | Persons of Interest
|
||||
=========================================
|
||||
|
||||
General Maintainers
|
||||
-------------------
|
||||
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
|
||||
- (sunsetting) Sam Gross
|
||||
(`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
Module-level maintainers
|
||||
------------------------
|
||||
|
||||
JIT
|
||||
~~~
|
||||
|
||||
- Zach Devito (`zdevito <https://github.com/zdevito>`__)
|
||||
- Michael Suo (`suo <https://github.com/suo>`__)
|
||||
|
||||
Distributed
|
||||
~~~~~~~~~~~
|
||||
|
||||
- Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
|
||||
- Shen Li (`mrshenli <https://github.com/mrshenli>`__)
|
||||
- (sunsetting) Teng Li (`teng-li <https://github.com/teng-li>`__)
|
||||
|
||||
Autograd Engine
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
- Alban Desmaison (`alband <https://github.com/alband>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
|
||||
Multiprocessing and DataLoaders
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Simon Wang (`SsnL <https://github.com/SsnL>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- (proposed) Vitaly Fedyunin
|
||||
(`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
|
||||
|
||||
CUDA
|
||||
~~~~
|
||||
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
|
||||
|
||||
C++
|
||||
~~~
|
||||
|
||||
- Will Feng (`yf225 <https://github.com/yf225>`__)
|
||||
- (sunsetting) Peter Goldsborough
|
||||
(`goldsborough <https://github.com/goldsborough>`__)
|
||||
|
||||
Build + CI
|
||||
~~~~~~~~~~
|
||||
|
||||
- Will Feng (`yf225 <https://github.com/yf225>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
- Jesse Hellemn (`pjh5 <https://github.com/pjh5>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- (sunsetting) Orion Reblitz-Richardson
|
||||
(`orionr <https://github.com/orionr>`__)
|
||||
|
||||
Distributions & RNG
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Fritz Obermeyer (`fritzo <https://github.com/fritzo>`__)
|
||||
- Neeraj Pradhan (`neerajprad <https://github.com/neerajprad>`__)
|
||||
- Alican Bozkurt (`alicanb <https://github.com/alicanb>`__)
|
||||
- Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)
|
||||
|
||||
C10
|
||||
~~~
|
||||
|
||||
- Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
|
||||
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
|
||||
|
||||
ONNX <-> PyTorch
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
- Lu Fang (`houseroad <https://github.com/houseroad>`__)
|
||||
|
||||
torch.nn
|
||||
~~~~~~~~
|
||||
|
||||
- Thomas Viehmann (`t-vi <https://github.com/t-vi>`__)
|
||||
- Adam Paszke (`apaszke <https://github.com/apaszke>`__)
|
||||
- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
|
||||
- Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
|
||||
CPU Performance / SIMD
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
|
||||
- Sam Gross (`colesbury <https://github.com/colesbury>`__)
|
||||
- Richard Zou (`zou3519 <https://github.com/zou3519>`__)
|
||||
|
||||
AMD/ROCm/HIP
|
||||
~~~~~~~~~~~~
|
||||
|
||||
- Junjie Bai (`bddppq <https://github.com/bddppq>`__)
|
||||
- Johannes M. Dietrich (`iotamudelta <https://github.com/iotamudelta>`__)
|
||||
|
||||
Windows
|
||||
~~~~~~~
|
||||
|
||||
- Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)
|
||||
|
||||
MKLDNN
|
||||
~~~~~~
|
||||
|
||||
- Yinghai Lu (`yinghai <https://github.com/yinghai>`__)
|
||||
|
||||
XLA
|
||||
~~~
|
||||
|
||||
- Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
|
||||
- Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
|
||||
- Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
|
||||
- Alex Suhan (`asuhan <https://github.com/asuhan>`__)
|
||||
|
||||
PPC
|
||||
~~~
|
||||
|
||||
- Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)
|
@ -1,6 +1,101 @@
|
||||
torch.hub
|
||||
===================================
|
||||
PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.
|
||||
|
||||
Publishing models
|
||||
-----------------
|
||||
|
||||
PyTorch Hub supports publishing pre-trained models (model definitions and pre-trained weights)
|
||||
to a GitHub repository by adding a simple ``hubconf.py`` file.
|
||||
|
||||
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function with
|
||||
the following signature.
|
||||
|
||||
::
|
||||
|
||||
def entrypoint_name(pretrained=False, *args, **kwargs):
|
||||
...
|
||||
|
||||
How to implement an entrypoint?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Here is a code snippet from the pytorch/vision repository, which specifies an entrypoint
|
||||
for the ``resnet18`` model. You can see the full script in the
|
||||
`pytorch/vision repo <https://github.com/pytorch/vision/blob/master/hubconf.py>`_
|
||||
|
||||
::
|
||||
|
||||
dependencies = ['torch', 'math']
|
||||
|
||||
def resnet18(pretrained=False, *args, **kwargs):
|
||||
"""
|
||||
Resnet18 model
|
||||
pretrained (bool): a recommended kwargs for all entrypoints
|
||||
args & kwargs are arguments for the function
|
||||
"""
|
||||
######## Call the model in the repo ###############
|
||||
from torchvision.models.resnet import resnet18 as _resnet18
|
||||
model = _resnet18(*args, **kwargs)
|
||||
######## End of call ##############################
|
||||
# The following logic is REQUIRED
|
||||
if pretrained:
|
||||
# For weights saved in local repo
|
||||
# model.load_state_dict(<path_to_saved_file>)
|
||||
|
||||
# For weights saved elsewhere
|
||||
checkpoint = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
|
||||
model.load_state_dict(model_zoo.load_url(checkpoint, progress=False))
|
||||
return model
|
||||
|
||||
- The ``dependencies`` variable is a **list** of package names required to run the model.
|
||||
- Pretrained weights can either be stored locally in the GitHub repo, or be loadable by
|
||||
``model_zoo.load_url()``.
|
||||
- ``pretrained`` controls whether to load the pre-trained weights provided by repo owners.
|
||||
- ``args`` and ``kwargs`` are passed along to the real callable function.
|
||||
- The docstring of the function works as a help message, explaining what the model does and what
|
||||
arguments are allowed.
|
||||
- The entrypoint function should **ALWAYS** return a model (``nn.Module``).
|
||||
|
||||
Important Notice
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
- Published models must be on a branch or tag; they can't reference an arbitrary commit.
|
||||
|
||||
Loading models from Hub
|
||||
-----------------------
|
||||
|
||||
Users can load the pre-trained models using ``torch.hub.load()`` API.
|
||||
|
||||
|
||||
.. automodule:: torch.hub
|
||||
.. autofunction:: load
|
||||
|
||||
Here's an example loading ``resnet18`` entrypoint from ``pytorch/vision`` repo.
|
||||
|
||||
::
|
||||
|
||||
hub_model = hub.load(
|
||||
'pytorch/vision:master', # repo_owner/repo_name:branch
|
||||
'resnet18', # entrypoint
|
||||
1234, # args for callable [not applicable to resnet]
|
||||
pretrained=True) # kwargs for callable
|
||||
|
||||
Where are my downloaded model & weights saved?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The locations are searched in the following order:
|
||||
|
||||
- ``hub_dir``: a user-specified path. It can be set in the following ways:
|
||||
- Setting the environment variable ``TORCH_HUB_DIR``
|
||||
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
|
||||
- ``~/.torch/hub``
|
||||
|
||||
.. autofunction:: set_dir
|
||||
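For example, a minimal sketch of pointing the hub cache at a custom directory
(the path below is purely illustrative)::

    import torch.hub

    # Any writable directory works; repos and weights will be cached here.
    torch.hub.set_dir('/tmp/my_torch_hub_cache')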
|
||||
Caching logic
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
By default, we don't clean up files after loading them. Hub uses the cached copy by default if it already exists in ``hub_dir``.
|
||||
|
||||
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
|
||||
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
|
||||
when updates are published to the same branch, so users can keep up with the latest release.
|
||||
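A usage sketch, reusing the ``resnet18`` entrypoint from the example above::

    import torch.hub

    # force_reload=True discards the cached copy of the repo and the weights
    # and downloads everything again from scratch.
    model = torch.hub.load('pytorch/vision:master', 'resnet18',
                           pretrained=True, force_reload=True)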
|
@ -17,6 +17,12 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
|
||||
|
||||
notes/*
|
||||
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 1
|
||||
:caption: Community
|
||||
|
||||
community/*
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
@ -1,4 +1,4 @@
|
||||
Torch Script
|
||||
TorchScript
|
||||
============
|
||||
|
||||
.. contents:: :local:
|
||||
@ -6,17 +6,17 @@ Torch Script
|
||||
.. automodule:: torch.jit
|
||||
.. currentmodule:: torch.jit
|
||||
|
||||
Torch Script is a way to create serializable and optimizable models from PyTorch code.
|
||||
Any code written in Torch Script can be saved from your Python
|
||||
TorchScript is a way to create serializable and optimizable models from PyTorch code.
|
||||
Any code written in TorchScript can be saved from your Python
|
||||
process and loaded in a process where there is no Python dependency.
|
||||
|
||||
We provide tools to incrementally transition a model from being a pure Python program
|
||||
to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program.
|
||||
to a TorchScript program that can be run independently from Python, for instance, in a standalone C++ program.
|
||||
This makes it possible to train models in PyTorch using familiar tools and then export
|
||||
the model to a production environment where it is not a good idea to run models as Python programs
|
||||
for performance and multi-threading reasons.
|
||||
|
||||
Creating Torch Script Code
|
||||
Creating TorchScript Code
|
||||
--------------------------
|
||||
|
||||
|
||||
@ -117,26 +117,26 @@ Example:
|
||||
return self.resnet(input - self.means)
|
||||
|
||||
|
||||
Torch Script Language Reference
|
||||
TorchScript Language Reference
|
||||
-------------------------------
|
||||
|
||||
Torch Script is a subset of Python that can either be written directly (using
|
||||
TorchScript is a subset of Python that can either be written directly (using
|
||||
the @script annotations) or generated automatically from Python code via
|
||||
tracing. When using tracing, code is automatically converted into this subset of
|
||||
Python by recording only the actual operators on tensors and simply executing and
|
||||
discarding the other surrounding Python code.
|
||||
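As a minimal sketch of the tracing path (the traced function here is made up;
only the standard ``torch.jit.trace`` call is assumed)::

    import torch

    def scale_and_add(x, y):
        return x * 2 + y

    # Tracing runs the function on the example inputs and records only the
    # tensor operations that were actually executed.
    traced = torch.jit.trace(scale_and_add, (torch.rand(3), torch.rand(3)))
    print(traced.graph)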
|
||||
When writing Torch Script directly using @script annotations, the programmer must
|
||||
only use the subset of Python supported in Torch Script. This section documents
|
||||
what is supported in Torch Script as if it were a language reference for a stand
|
||||
When writing TorchScript directly using @script annotations, the programmer must
|
||||
only use the subset of Python supported in TorchScript. This section documents
|
||||
what is supported in TorchScript as if it were a language reference for a stand
|
||||
alone language. Any features of Python not mentioned in this reference are not
|
||||
part of Torch Script.
|
||||
part of TorchScript.
|
||||
|
||||
As a subset of Python any valid Torch Script function is also a valid Python
|
||||
As a subset of Python any valid TorchScript function is also a valid Python
|
||||
function. This makes it possible to remove the @script annotations and debug the
|
||||
function using standard Python tools like pdb. The reverse is not true: there
|
||||
are many valid python programs that are not valid Torch Script programs.
|
||||
Instead, Torch Script focuses specifically on the features of Python that are
|
||||
are many valid python programs that are not valid TorchScript programs.
|
||||
Instead, TorchScript focuses specifically on the features of Python that are
|
||||
needed to represent neural network models in Torch.
|
||||
|
||||
.. envvar:: PYTORCH_JIT=1
|
||||
@ -150,9 +150,9 @@ needed to represent neural network models in Torch.
|
||||
Types
|
||||
~~~~~
|
||||
|
||||
The largest difference between Torch Script and the full Python language is that
|
||||
Torch Script only support a small set of types that are needed to express neural
|
||||
net models. In particular Torch Script supports:
|
||||
The largest difference between TorchScript and the full Python language is that
|
||||
TorchScript only support a small set of types that are needed to express neural
|
||||
net models. In particular TorchScript supports:
|
||||
|
||||
``Tensor``
|
||||
A PyTorch tensor of any dtype, dimension, or backend.
|
||||
@ -169,8 +169,8 @@ net models. In particular Torch Script supports:
|
||||
``List[T]``
|
||||
A list of which all members are type ``T``
|
||||
|
||||
Unlike Python, each variable in Torch Script function must have a single static type.
|
||||
This makes it easier to optimize Torch Script functions.
|
||||
Unlike Python, each variable in TorchScript function must have a single static type.
|
||||
This makes it easier to optimize TorchScript functions.
|
||||
|
||||
Example::
|
||||
|
||||
@ -183,9 +183,9 @@ Example::
|
||||
return r # Type mismatch: r is set to type Tensor in the true branch
|
||||
# and type int in the false branch
|
||||
|
||||
By default, all parameters to a Torch Script function are assumed to be Tensor
|
||||
By default, all parameters to a TorchScript function are assumed to be Tensor
|
||||
because this is the most common type used in modules. To specify that an
|
||||
argument to a Torch Script function is another type, it is possible to use
|
||||
argument to a TorchScript function is another type, it is possible to use
|
||||
MyPy-style type annotations using the types listed above:
|
||||
|
||||
Example::
|
||||
@ -264,7 +264,7 @@ Subscripts
|
||||
``t[i:j, i]``
|
||||
|
||||
.. note::
|
||||
Torch Script currently does not support mutating tensors in place, so any
|
||||
TorchScript currently does not support mutating tensors in place, so any
|
||||
tensor indexing can only appear on the right-hand size of an expression.
|
||||
|
||||
Function calls
|
||||
@ -328,7 +328,7 @@ Accessing Module Parameters
|
||||
Statements
|
||||
~~~~~~~~~~
|
||||
|
||||
Torch Script supports the following types of statements:
|
||||
TorchScript supports the following types of statements:
|
||||
|
||||
Simple Assignments
|
||||
|
||||
@ -438,7 +438,7 @@ Return
|
||||
Variable Resolution
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Torch Script supports a subset of Python's variable resolution (i.e. scoping)
|
||||
TorchScript supports a subset of Python's variable resolution (i.e. scoping)
|
||||
rules. Local variables behave the same as in Python, except for the restriction
|
||||
that a variable must have the same type along all paths through a function.
|
||||
If a variable has a different type on different sides of an if statement, it
|
||||
@ -456,23 +456,23 @@ Example::
|
||||
print(y) # Error: undefined value y
|
||||
|
||||
Non-local variables are resolved to Python values at compile time when the
|
||||
function is defined. These values are then converted into Torch Script values using
|
||||
function is defined. These values are then converted into TorchScript values using
|
||||
the rules described in `Use of Python Values`_.
|
||||
|
||||
Use of Python Values
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To make writing Torch Script more convenient, we allow script code to refer
|
||||
To make writing TorchScript more convenient, we allow script code to refer
|
||||
to Python values in the surrounding scope. For instance, any time there is a
|
||||
reference to ``torch``, the Torch Script compiler is actually resolving it to the
|
||||
reference to ``torch``, the TorchScript compiler is actually resolving it to the
|
||||
``torch`` Python module when the function is declared. These Python values are
|
||||
not a first class part of Torch Script. Instead they are desugared at compile-time
|
||||
into the primitive types that Torch Script supports. This section describes the
|
||||
rules that are used when accessing Python values in Torch Script. They depend
|
||||
not a first class part of TorchScript. Instead they are desugared at compile-time
|
||||
into the primitive types that TorchScript supports. This section describes the
|
||||
rules that are used when accessing Python values in TorchScript. They depend
|
||||
on the dynamic type of the python valued referenced.
|
||||
|
||||
Functions
|
||||
Torch Script can call python functions. This functionality is very useful when
|
||||
TorchScript can call python functions. This functionality is very useful when
|
||||
incrementally converting a model into script. The model can be moved function-by-function
|
||||
to script, leaving calls to Python functions in place. This way you can incrementally
|
||||
check the correctness of the model as you go.
|
||||
@ -495,12 +495,12 @@ Functions
|
||||
|
||||
|
||||
Attribute Lookup On Python Modules
|
||||
Torch Script can lookup attributes on modules. Builtin functions like ``torch.add``
|
||||
are accessed this way. This allows Torch Script to call functions defined in
|
||||
TorchScript can lookup attributes on modules. Builtin functions like ``torch.add``
|
||||
are accessed this way. This allows TorchScript to call functions defined in
|
||||
other modules.
|
||||
|
||||
Python-defined Constants
|
||||
Torch Script also provides a way to use constants that are defined in Python.
|
||||
TorchScript also provides a way to use constants that are defined in Python.
|
||||
These can be used to hard-code hyper-parameters into the function, or to
|
||||
define universal constants. There are two ways of specifying that a Python
|
||||
value should be treated as a constant.
|
||||
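One common mechanism for this, sketched here under the assumption that the
module lists its constant attributes in ``__constants__`` on a
``ScriptModule`` (the module itself is made up)::

    import torch

    class Scaler(torch.jit.ScriptModule):
        # Names listed in __constants__ are baked into the compiled code as
        # constants instead of being looked up at run time.
        __constants__ = ['factor']

        def __init__(self):
            super(Scaler, self).__init__()
            self.factor = 4

        @torch.jit.script_method
        def forward(self, x):
            return x * self.factor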
@ -597,36 +597,35 @@ Interpreting Graphs
|
||||
|
||||
The example script above produces the graph::
|
||||
|
||||
graph(%len : int) {
|
||||
%13 : float = prim::Constant[value=1]()
|
||||
%10 : int = prim::Constant[value=10]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%3 : int[] = prim::ListConstruct(%1, %2)
|
||||
%4 : int = prim::Constant[value=6]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : int[] = prim::Constant[value=[0, -1]]()
|
||||
%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)
|
||||
%8 : int = prim::Constant[value=1]()
|
||||
%rv : Dynamic = prim::Loop(%len, %8, %rv.1)
|
||||
block0(%i : int, %12 : Dynamic) {
|
||||
%11 : int = aten::lt(%i, %10)
|
||||
%rv.4 : Dynamic = prim::If(%11)
|
||||
block0() {
|
||||
%14 : int = prim::Constant[value=1]()
|
||||
%rv.2 : Dynamic = aten::sub(%12, %13, %14)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%16 : int = prim::Constant[value=1]()
|
||||
%rv.3 : Dynamic = aten::add(%12, %13, %16)
|
||||
-> (%rv.3)
|
||||
}
|
||||
%19 : int = prim::Constant[value=1]()
|
||||
-> (%19, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
|
||||
|
||||
Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
|
||||
example. ``%rv.1 : Dynamic`` means we assign the output to a (unique)
|
||||
@ -676,34 +675,39 @@ Automatic Trace Checking
|
||||
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
|
||||
|
||||
Gives us the following diagnostic information::
|
||||
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff:
|
||||
graph(%0 : Dynamic) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%3 : Dynamic = aten::select(%0, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Dynamic = aten::select(%0, %4, %5)
|
||||
%7 : Dynamic = aten::mul(%3, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Dynamic = aten::select(%0, %8, %9)
|
||||
%11 : Dynamic = aten::mul(%7, %10)
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Dynamic = aten::select(%0, %12, %13)
|
||||
%15 : Dynamic = aten::mul(%11, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Dynamic = aten::select(%0, %16, %17)
|
||||
+ %19 : Dynamic = aten::mul(%15, %18)
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
|
||||
|
||||
This message indicates to us that the computation differed between when
|
||||
@ -733,23 +737,19 @@ Automatic Trace Checking
|
||||
|
||||
Which produces::
|
||||
|
||||
graph(%x : Dynamic) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Dynamic = aten::select(%x, %2, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%5 : int = prim::Constant[value=1]()
|
||||
%result : Dynamic = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Dynamic) {
|
||||
%9 : int = prim::Constant[value=0]()
|
||||
%10 : Dynamic = aten::select(%x, %9, %i)
|
||||
%result.2 : Dynamic = aten::mul(%7, %10)
|
||||
%12 : int = prim::Constant[value=1]()
|
||||
-> (%12, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
Tracer Warnings
|
||||
The tracer produces warnings for several problematic patterns in traced
|
||||
@ -789,14 +789,24 @@ Tracer Warnings
|
||||
Builtin Functions
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Torch Script supports a subset of the builtin tensor and neural network functions that
|
||||
PyTorch provides. Most methods on Tensor as well as functions in the ``torch``
|
||||
namespace are available. Many functions in ``torch.nn.functional`` are also availiable.
|
||||
Torch Script supports a subset of the builtin tensor and neural network
|
||||
functions that PyTorch provides. Most methods on Tensor as well as functions in
|
||||
the ``torch`` namespace, all functions in ``torch.nn.functional`` and all
|
||||
modules from ``torch.nn`` are supported in Torch Script, excluding those in the
|
||||
table below. For unsupported modules, we suggest using :meth:`torch.jit.trace`.
|
||||
|
||||
Unsupported ``torch.nn`` Modules ::
|
||||
|
||||
torch.nn.modules.adaptive.AdaptiveLogSoftmaxWithLoss
|
||||
torch.nn.modules.normalization.CrossMapLRN2d
|
||||
torch.nn.modules.fold.Fold
|
||||
torch.nn.modules.fold.Unfold
|
||||
torch.nn.modules.rnn.GRU
|
||||
torch.nn.modules.rnn.LSTM
|
||||
torch.nn.modules.rnn.RNN
|
||||
torch.nn.modules.rnn.GRUCell
|
||||
torch.nn.modules.rnn.LSTMCell
|
||||
torch.nn.modules.rnn.RNNCell
|
||||
|
||||
We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or
|
||||
``Conv`` module. This functionality is something that will be developed in the future.
|
||||
For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn``
|
||||
modules into ScriptModules on construction.
|
||||
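A minimal sketch of that suggestion (the sizes and shapes are arbitrary)::

    import torch
    import torch.nn as nn

    # Tracing a standard nn.Linear with an example input yields a traced
    # module that can be called from script code.
    traced_linear = torch.jit.trace(nn.Linear(4, 2), torch.rand(1, 4))
    print(traced_linear(torch.rand(1, 4)))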
|
||||
.. automodule:: torch.jit.supported_ops
|
||||
|
@ -338,6 +338,7 @@ view of a storage and defines numeric operations on it.
|
||||
.. automethod:: reshape_as
|
||||
.. automethod:: resize_
|
||||
.. automethod:: resize_as_
|
||||
.. automethod:: roll
|
||||
.. automethod:: round
|
||||
.. automethod:: round_
|
||||
.. automethod:: rsqrt
|
||||
|
@ -269,6 +269,7 @@ Other Operations
|
||||
.. autofunction:: histc
|
||||
.. autofunction:: meshgrid
|
||||
.. autofunction:: renorm
|
||||
.. autofunction:: roll
|
||||
.. autofunction:: tensordot
|
||||
.. autofunction:: trace
|
||||
.. autofunction:: tril
|
||||
|
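For reference, a quick sketch of the behavior of the newly documented
``torch.roll``::

    import torch

    x = torch.arange(6).view(2, 3)
    # Shift every element one position along dim 1, wrapping around the end.
    print(torch.roll(x, shifts=1, dims=1))
    # tensor([[2, 0, 1],
    #         [5, 3, 4]])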
@ -2,15 +2,6 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
|
||||
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
|
||||
|
||||
if (BUILD_CAFFE2_OPS)
|
||||
#cmake only check for separate OpenMP library on AppleClang 7+
|
||||
#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252
|
||||
if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0")
|
||||
Set(OpenMP_link ${OpenMP_libomp_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Note(ilijar): Since Detectron ops currently have no
|
||||
# CPU implementation, we only build GPU ops for now.
|
||||
if (USE_CUDA)
|
||||
@ -19,11 +10,11 @@ if (BUILD_CAFFE2_OPS)
|
||||
${Detectron_CPU_SRCS}
|
||||
${Detectron_GPU_SRCS})
|
||||
|
||||
target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu ${OpenMP_link})
|
||||
target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu)
|
||||
install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
|
||||
elseif(NOT IOS_PLATFORM)
|
||||
add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
|
||||
target_link_libraries(caffe2_detectron_ops caffe2 ${OpenMP_link})
|
||||
target_link_libraries(caffe2_detectron_ops caffe2)
|
||||
install(TARGETS caffe2_detectron_ops DESTINATION lib)
|
||||
endif()
|
||||
endif()
|
||||
|
166
setup.py
@ -124,6 +124,7 @@
|
||||
# LD_LIBRARY_PATH
|
||||
# we will search for libraries in these paths
|
||||
|
||||
from __future__ import print_function
|
||||
from setuptools import setup, Extension, distutils, Command, find_packages
|
||||
import setuptools.command.build_ext
|
||||
import setuptools.command.install
|
||||
@ -144,86 +145,32 @@ import json
|
||||
import glob
|
||||
import importlib
|
||||
|
||||
from tools.setup_helpers.env import check_env_flag, check_negative_env_flag
|
||||
|
||||
|
||||
def hotpatch_var(var, prefix='USE_'):
|
||||
if check_env_flag('NO_' + var):
|
||||
os.environ[prefix + var] = '0'
|
||||
elif check_negative_env_flag('NO_' + var):
|
||||
os.environ[prefix + var] = '1'
|
||||
elif check_env_flag('WITH_' + var):
|
||||
os.environ[prefix + var] = '1'
|
||||
elif check_negative_env_flag('WITH_' + var):
|
||||
os.environ[prefix + var] = '0'
|
||||
|
||||
# Before we run the setup_helpers, let's look for NO_* and WITH_*
|
||||
# variables and hotpatch environment with the USE_* equivalent
|
||||
use_env_vars = ['CUDA', 'CUDNN', 'FBGEMM', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED',
|
||||
'OPENCV', 'QNNPACK', 'FFMPEG', 'SYSTEM_NCCL', 'GLOO_IBVERBS']
|
||||
list(map(hotpatch_var, use_env_vars))
|
||||
|
||||
# Also hotpatch a few with BUILD_* equivalent
|
||||
build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS']
|
||||
[hotpatch_var(v, 'BUILD_') for v in build_env_vars]
|
||||
|
||||
from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION
|
||||
from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST,
|
||||
BUILD_CAFFE2_OPS, USE_LEVELDB,
|
||||
USE_LMDB, USE_OPENCV, USE_FFMPEG)
|
||||
from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION
|
||||
from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY,
|
||||
CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
|
||||
from tools.setup_helpers.fbgemm import USE_FBGEMM
|
||||
from tools.setup_helpers.miopen import (USE_MIOPEN, MIOPEN_LIBRARY,
|
||||
MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
|
||||
from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
|
||||
NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
|
||||
from tools.setup_helpers.nnpack import USE_NNPACK
|
||||
from tools.setup_helpers.qnnpack import USE_QNNPACK
|
||||
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
|
||||
# If you want to modify flags or environmental variables that is set when
|
||||
# building torch, you should do it in tools/setup_helpers/configure.py.
|
||||
# Please don't add it here unless it's only used in PyTorch.
|
||||
from tools.setup_helpers.configure import *
|
||||
from tools.setup_helpers.generate_code import generate_code
|
||||
from tools.setup_helpers.ninja_builder import NinjaBuilder, ninja_build_ext
|
||||
from tools.setup_helpers.dist_check import USE_DISTRIBUTED, \
|
||||
USE_GLOO_IBVERBS
|
||||
|
||||
################################################################################
|
||||
# Parameters parsed from environment
|
||||
################################################################################
|
||||
|
||||
DEBUG = check_env_flag('DEBUG')
|
||||
REL_WITH_DEB_INFO = check_env_flag('REL_WITH_DEB_INFO')
|
||||
IS_WINDOWS = (platform.system() == 'Windows')
|
||||
IS_DARWIN = (platform.system() == 'Darwin')
|
||||
IS_LINUX = (platform.system() == 'Linux')
|
||||
IS_PPC = (platform.machine() == 'ppc64le')
|
||||
IS_ARM = (platform.machine() == 'aarch64')
|
||||
VERBOSE_SCRIPT = True
|
||||
# see if the user passed a quiet flag to setup.py arguments and respect
|
||||
# that in our parts of the build
|
||||
for arg in sys.argv:
|
||||
if arg == "--":
|
||||
break
|
||||
if arg == '-q' or arg == '--quiet':
|
||||
VERBOSE_SCRIPT = False
|
||||
|
||||
BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
|
||||
# ppc64le and aarch64 do not support MKLDNN
|
||||
if IS_PPC or IS_ARM:
|
||||
USE_MKLDNN = check_env_flag('USE_MKLDNN', 'OFF')
|
||||
if VERBOSE_SCRIPT:
|
||||
def report(*args):
|
||||
print(*args)
|
||||
else:
|
||||
USE_MKLDNN = check_env_flag('USE_MKLDNN', 'ON')
|
||||
|
||||
USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
|
||||
RERUN_CMAKE = True
|
||||
|
||||
NUM_JOBS = multiprocessing.cpu_count()
|
||||
max_jobs = os.getenv("MAX_JOBS")
|
||||
if max_jobs is not None:
|
||||
NUM_JOBS = min(NUM_JOBS, int(max_jobs))
|
||||
|
||||
ONNX_NAMESPACE = os.getenv("ONNX_NAMESPACE")
|
||||
if not ONNX_NAMESPACE:
|
||||
ONNX_NAMESPACE = "onnx_torch"
|
||||
|
||||
# Ninja
|
||||
try:
|
||||
import ninja
|
||||
USE_NINJA = True
|
||||
except ImportError:
|
||||
USE_NINJA = False
|
||||
def report(*args):
|
||||
pass
|
||||
|
||||
# Constant known variables used throughout this file
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -323,8 +270,9 @@ def build_libs(libs):
build_libs_cmd = ['tools\\build_pytorch_libs.bat']
else:
build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')]
my_env = os.environ.copy()
my_env["PYTORCH_PYTHON"] = sys.executable

my_env, extra_flags = get_pytorch_env_with_flags()
build_libs_cmd.extend(extra_flags)
my_env["PYTORCH_PYTHON_LIBRARY"] = cmake_python_library
my_env["PYTORCH_PYTHON_INCLUDE_DIR"] = cmake_python_include_dir
my_env["PYTORCH_BUILD_VERSION"] = version
@@ -334,64 +282,8 @@ def build_libs(libs):
cmake_prefix_path = my_env["CMAKE_PREFIX_PATH"] + ";" + cmake_prefix_path
my_env["CMAKE_PREFIX_PATH"] = cmake_prefix_path

my_env["NUM_JOBS"] = str(NUM_JOBS)
my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE
if not IS_WINDOWS:
if USE_NINJA:
my_env["CMAKE_GENERATOR"] = '-GNinja'
my_env["CMAKE_INSTALL"] = 'ninja install'
else:
my_env['CMAKE_GENERATOR'] = ''
my_env['CMAKE_INSTALL'] = 'make install'
if USE_SYSTEM_NCCL:
my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
if USE_CUDA:
my_env["CUDA_BIN_PATH"] = CUDA_HOME
build_libs_cmd += ['--use-cuda']
if IS_WINDOWS:
my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME
if USE_CUDA_STATIC_LINK:
build_libs_cmd += ['--cuda-static-link']
if USE_FBGEMM:
build_libs_cmd += ['--use-fbgemm']
if USE_ROCM:
build_libs_cmd += ['--use-rocm']
if USE_NNPACK:
build_libs_cmd += ['--use-nnpack']
if USE_NUMPY:
my_env["NUMPY_INCLUDE_DIR"] = NUMPY_INCLUDE_DIR
if USE_CUDNN:
my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
if USE_MIOPEN:
my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR
my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY
my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR
if USE_MKLDNN:
build_libs_cmd += ['--use-mkldnn']
if USE_QNNPACK:
build_libs_cmd += ['--use-qnnpack']
if USE_GLOO_IBVERBS:
build_libs_cmd += ['--use-gloo-ibverbs']
if not RERUN_CMAKE:
build_libs_cmd += ['--dont-rerun-cmake']

my_env["BUILD_TORCH"] = "ON"
my_env["BUILD_PYTHON"] = "ON"
my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF"
my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF"
my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF"
my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF"
my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF"
my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF"
my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"

if VERBOSE_SCRIPT:
my_env['VERBOSE_SCRIPT'] = '1'
try:
os.mkdir('build')
except OSError:
@@ -660,6 +552,16 @@ class build_ext(build_ext_parent):
return outputs


# this is a subclass of build just to get access to self.build_lib
# as there does not seem to be an utility function getting this
class create_pyi(distutils.command.build.build):
def run(self):
print("-- Building .pyi --")
if sys.version_info[0] == 3:
from tools.pyi.gen_pyi import gen_pyi
gen_pyi(self.build_lib)


class build(distutils.command.build.build):
sub_commands = [
('build_deps', lambda self: True),
@@ -914,6 +816,7 @@ if USE_CUDA:

cmdclass = {
'create_version_file': create_version_file,
'create_pyi': create_pyi,
'build': build,
'build_py': build_py,
'build_ext': build_ext,
@@ -946,6 +849,7 @@ if __name__ == '__main__':
entry_points=entry_points,
package_data={
'torch': [
'__init__.pyi',
'lib/*.so*',
'lib/*.dylib*',
'lib/*.dll',
@@ -458,6 +458,10 @@ method_tests = [
NO_ARGS, [skipIfNoLapack]),
('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S, S), [-2], "n=-2",
NO_ARGS, [skipIfNoLapack]),
('mvlgamma', torch.empty(S,).uniform_(0.5, 1), [1], "p=1"),
('mvlgamma', torch.empty(S,).uniform_(1, 2), [2], "p=2"),
('mvlgamma', torch.empty(S, S).uniform_(1.5, 3), [3], "p=3"),
('mvlgamma', torch.empty(S, S).uniform_(2.5, 5), [5], "p=5"),
('addcmul', (S, S), ((S, S), (S, S))),
('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'),
('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'),
@@ -560,8 +564,14 @@ method_tests = [
('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
('tril', (M, M), NO_ARGS),
('tril', (M, M), (2,), 'idx'),
('tril', (S, M, M), NO_ARGS, 'batched'),
('tril', (S, M, M), (2,), 'batched_idx'),
('tril', (3, 3, S, S), NO_ARGS, 'more_batched'),
('triu', (M, M), NO_ARGS),
('triu', (M, M), (2,), 'idx'),
('triu', (S, M, M), NO_ARGS, 'batched'),
('triu', (S, M, M), (2,), 'batched_idx'),
('triu', (3, 3, S, S), NO_ARGS, 'more_batched'),
('trace', (M, M), NO_ARGS),
('cross', (S, 3), ((S, 3),)),
('cross', (S, 3, S), ((S, 3, S), 1), 'dim'),
@@ -725,6 +725,20 @@ def random_fullrank_matrix_distinct_singular_value(l, *batches, **kwargs):
return torch.stack(all_matrices).reshape(*(batches + (l, l)))


def brute_pdist(inp, p=2):
    """Computes the same as torch.pdist using primitives"""
    n = inp.shape[-2]
    k = n * (n - 1) // 2
    if k == 0:
        # torch complains about empty indices
        return torch.empty(inp.shape[:-2] + (0,), dtype=inp.dtype, device=inp.device)
    square = torch.norm(inp[..., None, :] - inp[..., None, :, :], p=p, dim=-1)
    unroll = square.view(square.shape[:-2] + (n * n,))
    inds = torch.ones(k, dtype=torch.int)
    inds[torch.arange(n - 1, 1, -1, dtype=torch.int).cumsum(0)] += torch.arange(2, n, dtype=torch.int)
    return unroll[..., inds.cumsum(0)]


def do_test_dtypes(self, dtypes, layout, device):
    for dtype in dtypes:
        if dtype != torch.float16:
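As a usage note (editor's sketch, not from the diff): brute_pdist above is intended to agree with torch.pdist, which returns the flattened upper triangle of the pairwise-distance matrix. Assuming the helper is in scope, a minimal cross-check looks like this:

```python
import torch

# Hypothetical sanity check for the brute_pdist helper shown in the hunk above.
x = torch.randn(5, 3)
expected = torch.pdist(x, p=2)   # flattened upper-triangular pairwise distances
actual = brute_pdist(x, p=2)     # assumes brute_pdist is importable
assert torch.allclose(actual, expected)
```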
@@ -450,6 +450,80 @@ TEST(DataTest, TensorLambdaWorksforAnyTargetType) {
ASSERT_EQ(batch[1].target, "2");
}

struct DummyTensorDataset
: datasets::Dataset<DummyTensorDataset, Example<torch::Tensor, int>> {
Example<torch::Tensor, int> get(size_t index) override {
const auto channels = static_cast<int64_t>(index);
torch::Tensor tensor =
(channels > 0) ? torch::ones({channels, 4, 4}) : torch::ones({4, 4});
return {tensor, static_cast<int>(channels)};
}

torch::optional<size_t> size() const override {
return 100;
}
};

TEST(DataTest, NormalizeTransform) {
auto dataset = DummyTensorDataset().map(transforms::Normalize<int>(0.5, 0.1));

// Works for zero (one implicit) channels
std::vector<Example<torch::Tensor, int>> output = dataset.get_batch(0);
ASSERT_EQ(output.size(), 1);
// (1 - 0.5) / 0.1 = 5
ASSERT_TRUE(output[0].data.allclose(torch::ones({4, 4}) * 5))
<< output[0].data;

// Works for one explicit channel
output = dataset.get_batch(1);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 1);
ASSERT_TRUE(output[0].data.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;

// Works for two channels with different moments
dataset = DummyTensorDataset().map(
transforms::Normalize<int>({0.5, 1.5}, {0.1, 0.2}));
output = dataset.get_batch(2);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 2);
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/1)
.allclose(torch::ones({1, 4, 4}) * -2.5))
<< output[0].data;

// Works for three channels with one moment value
dataset = DummyTensorDataset().map(transforms::Normalize<int>(1.5, 0.2));
output = dataset.get_batch(3);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 3);
ASSERT_TRUE(output[0].data.allclose(torch::ones({3, 4, 4}) * -2.5))
<< output[0].data;

// Works for three channels with different moments
dataset = DummyTensorDataset().map(
transforms::Normalize<int>({0.5, 1.5, -1.5}, {0.1, 0.2, 0.2}));
output = dataset.get_batch(3);
ASSERT_EQ(output.size(), 1);
ASSERT_EQ(output[0].data.size(0), 3);
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/0, /*end=*/1)
.allclose(torch::ones({1, 4, 4}) * 5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/1, /*end=*/2)
.allclose(torch::ones({1, 4, 4}) * -2.5))
<< output[0].data;
ASSERT_TRUE(output[0]
.data.slice(/*dim=*/0, /*start=*/2)
.allclose(torch::ones({1, 4, 4}) * 12.5))
<< output[0].data;
}
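For readers more used to the Python API, the arithmetic this NormalizeTransform test checks is plain per-channel standardization. An illustrative Python equivalent (an editor's sketch, not part of the C++ diff):

```python
import torch

# Per-channel (x - mean) / std with broadcasting over the trailing 4x4 plane,
# matching the expectations asserted in the C++ test above.
x = torch.ones(2, 4, 4)
mean = torch.tensor([0.5, 1.5]).view(2, 1, 1)
std = torch.tensor([0.1, 0.2]).view(2, 1, 1)
out = (x - mean) / std
assert torch.allclose(out[0], torch.full((4, 4), 5.0))   # (1 - 0.5) / 0.1
assert torch.allclose(out[1], torch.full((4, 4), -2.5))  # (1 - 1.5) / 0.2
```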
struct UnCopyableDataset : public datasets::Dataset<UnCopyableDataset> {
UnCopyableDataset() = default;

@@ -37,7 +37,7 @@ TEST_F(ModuleTest, CanEnableAndDisableTrainingMode) {
TEST_F(ModuleTest, ZeroGrad) {
Linear module(3, 4);
auto weight = torch::ones({8, 3}, torch::requires_grad());
auto loss = module->forward(weight).sum();
auto loss = module(weight).sum();
loss.backward();
for (auto& parameter : module->parameters()) {
auto grad = parameter.grad();
@@ -831,3 +831,15 @@ TEST_F(ModuleTest, ThrowsWhenAttemptingtoGetTopLevelModuleAsSharedPtr) {
ASSERT_NO_THROW(module->modules());
}
}

struct ModuleWithNonTensorForwardImpl : torch::nn::Module {
int64_t forward(torch::Tensor x) {
return x.numel();
}
};
TORCH_MODULE(ModuleWithNonTensorForward);

TEST_F(ModuleTest, CanCallForwardOnNonTensorForwardThroughPimpl) {
ModuleWithNonTensorForward m;
ASSERT_EQ(m(torch::ones(123)), 123);
}
@@ -42,7 +42,7 @@ struct ModulesTest : torch::test::SeedingFixture {};
TEST_F(ModulesTest, Conv1d) {
Conv1d model(Conv1dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -58,7 +58,7 @@ TEST_F(ModulesTest, Conv1d) {
TEST_F(ModulesTest, Conv2dEven) {
Conv2d model(Conv2dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -74,7 +74,7 @@ TEST_F(ModulesTest, Conv2dEven) {
TEST_F(ModulesTest, Conv2dUneven) {
Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2}));
auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -90,7 +90,7 @@ TEST_F(ModulesTest, Conv2dUneven) {
TEST_F(ModulesTest, Conv3d) {
Conv3d model(Conv3dOptions(3, 2, 3).stride(2));
auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -106,7 +106,7 @@ TEST_F(ModulesTest, Conv3d) {
TEST_F(ModulesTest, Linear) {
Linear model(5, 2);
auto x = torch::randn({10, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -125,9 +125,9 @@ TEST_F(ModulesTest, SimpleContainer) {
auto l3 = model->add(Linear(5, 100), "l3");

auto x = torch::randn({1000, 10}, torch::requires_grad());
x = l1->forward(x).clamp_min(0);
x = l2->forward(x).clamp_min(0);
x = l3->forward(x).clamp_min(0);
x = l1(x).clamp_min(0);
x = l2(x).clamp_min(0);
x = l3(x).clamp_min(0);

x.backward();
ASSERT_EQ(x.ndimension(), 2);
@@ -147,7 +147,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
// Cannot get gradients to change indices (input) - only for embedding
// params
auto x = torch::full({10}, dict_size - 1, torch::kInt64);
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -162,7 +162,7 @@ TEST_F(ModulesTest, EmbeddingBasic) {
TEST_F(ModulesTest, EmbeddingList) {
Embedding model(6, 4);
auto x = torch::full({2, 3}, 5, torch::kInt64);
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -175,7 +175,7 @@ TEST_F(ModulesTest, EmbeddingList) {
TEST_F(ModulesTest, Dropout) {
Dropout dropout(0.5);
torch::Tensor x = torch::ones(100, torch::requires_grad());
torch::Tensor y = dropout->forward(x);
torch::Tensor y = dropout(x);

y.backward();
ASSERT_EQ(y.ndimension(), 1);
@@ -184,7 +184,7 @@ TEST_F(ModulesTest, Dropout) {
ASSERT_GT(y.sum().item<float>(), 70); // Probably

dropout->eval();
y = dropout->forward(x);
y = dropout(x);
ASSERT_EQ(y.sum().item<float>(), 100);
}

@@ -214,7 +214,7 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) {
was_called = true;
return input;
});
auto output = functional->forward(torch::ones(5, torch::requires_grad()));
auto output = functional(torch::ones(5, torch::requires_grad()));
ASSERT_TRUE(was_called);
ASSERT_TRUE(output.equal(torch::ones(5, torch::requires_grad())));

@@ -272,7 +272,7 @@ TEST_F(ModulesTest, BatchNormStateless) {
ASSERT_FALSE(bn->bias.defined());

ASSERT_THROWS_WITH(
bn->forward(torch::ones({2, 5})),
bn(torch::ones({2, 5})),
"Calling BatchNorm::forward is only permitted "
"when the 'stateful' option is true (was false). "
"Use BatchNorm::pure_forward instead.");
@@ -297,7 +297,7 @@ TEST_F(ModulesTest, Linear_CUDA) {
model->to(torch::kCUDA);
auto x =
torch::randn({10, 5}, torch::device(torch::kCUDA).requires_grad(true));
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -314,7 +314,7 @@ TEST_F(ModulesTest, Linear2_CUDA) {
model->to(torch::kCUDA);
model->to(torch::kCPU);
auto x = torch::randn({10, 5}, torch::requires_grad());
auto y = model->forward(x);
auto y = model(x);
torch::Tensor s = y.sum();

s.backward();
@@ -215,7 +215,9 @@ TEST(SerializeTest, Optim) {
TEST(SerializeTest, XOR_CUDA) {
torch::manual_seed(0);
// We better be able to save and load a XOR model!
auto getLoss = [](Sequential model, uint32_t batch_size, bool is_cuda=false) {
auto getLoss = [](Sequential model,
uint32_t batch_size,
bool is_cuda = false) {
auto inputs = torch::empty({batch_size, 2});
auto labels = torch::empty({batch_size});
if (is_cuda) {
@@ -269,3 +271,34 @@ TEST(SerializeTest, XOR_CUDA) {
loss = getLoss(model3, 100, true);
ASSERT_LT(loss.item<float>(), 0.1);
}

TEST(
SerializeTest,
CanSerializeModulesWithIntermediateModulesWithoutParametersOrBuffers) {
struct C : torch::nn::Module {
C() {
register_buffer("foo", torch::ones(5, torch::kInt32));
}
};
struct B : torch::nn::Module {};
struct A : torch::nn::Module {
A() {
register_module("b", std::make_shared<B>());
register_module("c", std::make_shared<C>());
}
};
struct M : torch::nn::Module {
M() {
register_module("a", std::make_shared<A>());
}
};

auto out = std::make_shared<M>();
std::stringstream ss;
torch::save(out, ss);
auto in = std::make_shared<M>();
torch::load(in, ss);

const int output = in->named_buffers()["a.c.foo"].sum().item<int>();
ASSERT_EQ(output, 5);
}
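The C++ serialization test above has a close Python analogue; as an illustrative sketch (not part of the diff), a buffer registered on a nested module survives a save/load round trip:

```python
import io
import torch

class Inner(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Non-parameter state is stored as a buffer, like register_buffer("foo", ...) above.
        self.register_buffer("foo", torch.ones(5, dtype=torch.int32))

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

buffer = io.BytesIO()
torch.save(Outer().state_dict(), buffer)
buffer.seek(0)
state = torch.load(buffer)
assert int(state["inner.foo"].sum()) == 5
```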
@@ -49,6 +49,51 @@ TEST(TestStatic, EnableIfModule) {
ASSERT_FALSE(torch::detail::check_not_lvalue_references<std::string&>());
}

struct A : torch::nn::Module {
int forward() {
return 5;
}
};

struct B : torch::nn::Module {
std::string forward(torch::Tensor tensor) {
return "";
}
};

struct C : torch::nn::Module {
float forward(torch::Tensor& tensor) {
return 5.0;
}
};

struct D : torch::nn::Module {
char forward(torch::Tensor&& tensor) {
return 'x';
}
};

struct E : torch::nn::Module {};

// Put in a function because macros don't handle the comma between arguments to
// is_same well ...
template <typename Module, typename ExpectedType, typename... Args>
void assert_has_expected_type() {
using ReturnType =
typename torch::detail::return_type_of_forward<Module, Args...>::type;
constexpr bool is_expected_type =
std::is_same<ReturnType, ExpectedType>::value;
ASSERT_TRUE(is_expected_type) << Module().name();
}

TEST(TestStatic, ReturnTypeOfForward) {
assert_has_expected_type<A, int>();
assert_has_expected_type<B, std::string, torch::Tensor>();
assert_has_expected_type<C, float, torch::Tensor&>();
assert_has_expected_type<D, char, torch::Tensor&&>();
assert_has_expected_type<E, void>();
}

TEST(TestStatic, Apply) {
std::vector<int> v;
torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5);
@@ -10,12 +10,13 @@ graph(%x.1_data : Tensor
%x : Tensor, %10 : Tensor, %11 : Tensor = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims)
block0(%loop_num : int, %5_data : Tensor, %5_mask : Tensor, %5_dims : Tensor) {
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%16)
%alpha : float = prim::Float(%16)
%data.1 : Tensor = aten::add(%5_data, %y_data, %alpha)
%mask : Tensor = aten::mul(%5_mask, %y_mask)
%dims : Tensor = aten::__or__(%5_dims, %y_dims)
%data : Tensor = aten::where(%mask, %data.1, %5_data)
-> (%7, %data, %mask, %dims)
}
return (%x, %10, %11);
%22 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%x, %10, %11)
return (%22);
}

@@ -7,33 +7,31 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::gt(%a.1_data, %b_data)
%8 : Tensor = aten::mul(%a.1_mask, %b_mask)
%9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%10 : bool = prim::TensorToBool(%7)
%11 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::TensorToNum(%11)
%9 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::Float(%9)
%data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
%mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%16)
%14 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%14)
%data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%21 : bool = prim::Constant[value=1]()
%22 : int = prim::Constant[value=1]()
%23 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %23)
%25 : int = aten::dim(%data.2)
%26 : bool = aten::eq(%25, %22)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
%19 : bool = prim::Constant[value=1]()
%20 : int = prim::Constant[value=1]()
%21 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %21)
%23 : int = aten::dim(%data.2)
%24 : bool = aten::eq(%23, %20)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%24)
block0() {
%29 : int = aten::dim(%data.1)
%30 : int = aten::sub(%29, %22)
%data.4 : Tensor = prim::Loop(%30, %21, %data.2)
block0(%32 : int, %33 : Tensor) {
%34 : int = aten::dim(%33)
%data.3 : Tensor = aten::unsqueeze(%33, %34)
-> (%21, %data.3)
%27 : int = aten::dim(%data.1)
%28 : int = aten::sub(%27, %20)
%data.4 : Tensor = prim::Loop(%28, %19, %data.2)
block0(%30 : int, %31 : Tensor) {
%32 : int = aten::dim(%31)
%data.3 : Tensor = aten::unsqueeze(%31, %32)
-> (%19, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
@@ -45,5 +43,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data.1, %data)
%res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
%res_dims : Tensor = aten::__or__(%dims.1, %dims)
return (%res_data, %res_mask, %res_dims);
%39 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%39);
}

@@ -7,34 +7,33 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : float = prim::Constant[value=0.1]()
%8 : Float() = prim::NumToTensor(%7)
%other : float = prim::TensorToNum(%8)
%other : float = prim::Float(%8)
%10 : Tensor = aten::gt(%a.1_data, %other)
%11 : bool = prim::TensorToBool(%10)
%12 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::TensorToNum(%12)
%11 : Long() = prim::NumToTensor(%6)
%alpha.1 : float = prim::Float(%11)
%data.1 : Tensor = aten::add(%a.1_data, %b_data, %alpha.1)
%mask.1 : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims.1 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%17 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%17)
%16 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%16)
%data : Tensor = aten::sub(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%22 : bool = prim::Constant[value=1]()
%23 : int = prim::Constant[value=1]()
%24 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %24)
%26 : int = aten::dim(%data.2)
%27 : bool = aten::eq(%26, %23)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%27)
%21 : bool = prim::Constant[value=1]()
%22 : int = prim::Constant[value=1]()
%23 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %23)
%25 : int = aten::dim(%data.2)
%26 : bool = aten::eq(%25, %22)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%26)
block0() {
%30 : int = aten::dim(%data.1)
%31 : int = aten::sub(%30, %23)
%data.4 : Tensor = prim::Loop(%31, %22, %data.2)
block0(%33 : int, %34 : Tensor) {
%35 : int = aten::dim(%34)
%data.3 : Tensor = aten::unsqueeze(%34, %35)
-> (%22, %data.3)
%29 : int = aten::dim(%data.1)
%30 : int = aten::sub(%29, %22)
%data.4 : Tensor = prim::Loop(%30, %21, %data.2)
block0(%32 : int, %33 : Tensor) {
%34 : int = aten::dim(%33)
%data.3 : Tensor = aten::unsqueeze(%33, %34)
-> (%21, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data.1)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask.1)
@@ -46,5 +45,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data.1, %data)
%res_mask : Tensor = aten::where(%cond_mask, %mask.1, %mask)
%res_dims : Tensor = aten::__or__(%dims.1, %dims)
return (%res_data, %res_mask, %res_dims);
%41 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%41);
}

@@ -7,28 +7,26 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::gt(%a.1_data, %b_data)
%8 : Tensor = aten::mul(%a.1_mask, %b_mask)
%9 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%10 : bool = prim::TensorToBool(%7)
%11 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%11)
%9 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%9)
%data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%16 : bool = prim::Constant[value=1]()
%17 : int = prim::Constant[value=1]()
%18 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %18)
%20 : int = aten::dim(%data.2)
%21 : bool = aten::eq(%20, %17)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
%14 : bool = prim::Constant[value=1]()
%15 : int = prim::Constant[value=1]()
%16 : Tensor = aten::type_as(%8, %7)
%data.2 : Tensor = aten::mul(%7, %16)
%18 : int = aten::dim(%data.2)
%19 : bool = aten::eq(%18, %15)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%19)
block0() {
%24 : int = aten::dim(%data)
%25 : int = aten::sub(%24, %17)
%data.4 : Tensor = prim::Loop(%25, %16, %data.2)
block0(%27 : int, %28 : Tensor) {
%29 : int = aten::dim(%28)
%data.3 : Tensor = aten::unsqueeze(%28, %29)
-> (%16, %data.3)
%22 : int = aten::dim(%data)
%23 : int = aten::sub(%22, %15)
%data.4 : Tensor = prim::Loop(%23, %14, %data.2)
block0(%25 : int, %26 : Tensor) {
%27 : int = aten::dim(%26)
%data.3 : Tensor = aten::unsqueeze(%26, %27)
-> (%14, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -40,5 +38,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
%res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
return (%res_data, %res_mask, %res_dims);
%34 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%34);
}

@@ -7,29 +7,28 @@ graph(%a.1_data : Tensor
%6 : int = prim::Constant[value=1]()
%7 : float = prim::Constant[value=0.1]()
%8 : Float() = prim::NumToTensor(%7)
%other : float = prim::TensorToNum(%8)
%other : float = prim::Float(%8)
%10 : Tensor = aten::gt(%a.1_data, %other)
%11 : bool = prim::TensorToBool(%10)
%12 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%12)
%11 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%11)
%data : Tensor = aten::add(%a.1_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%a.1_mask, %b_mask)
%dims : Tensor = aten::__or__(%a.1_dims, %b_dims)
%17 : bool = prim::Constant[value=1]()
%18 : int = prim::Constant[value=1]()
%19 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %19)
%21 : int = aten::dim(%data.2)
%22 : bool = aten::eq(%21, %18)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%22)
%16 : bool = prim::Constant[value=1]()
%17 : int = prim::Constant[value=1]()
%18 : Tensor = aten::type_as(%a.1_mask, %10)
%data.2 : Tensor = aten::mul(%10, %18)
%20 : int = aten::dim(%data.2)
%21 : bool = aten::eq(%20, %17)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%21)
block0() {
%25 : int = aten::dim(%data)
%26 : int = aten::sub(%25, %18)
%data.4 : Tensor = prim::Loop(%26, %17, %data.2)
block0(%28 : int, %29 : Tensor) {
%30 : int = aten::dim(%29)
%data.3 : Tensor = aten::unsqueeze(%29, %30)
-> (%17, %data.3)
%24 : int = aten::dim(%data)
%25 : int = aten::sub(%24, %17)
%data.4 : Tensor = prim::Loop(%25, %16, %data.2)
block0(%27 : int, %28 : Tensor) {
%29 : int = aten::dim(%28)
%data.3 : Tensor = aten::unsqueeze(%28, %29)
-> (%16, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -41,5 +40,6 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %a.1_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %a.1_mask)
%res_dims : Tensor = aten::__or__(%dims, %a.1_dims)
return (%res_data, %res_mask, %res_dims);
%36 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%res_data, %res_mask, %res_dims)
return (%36);
}

@@ -9,38 +9,35 @@ graph(%a.1_data : Tensor
%8 : Tensor = aten::gt(%a.1_data, %b_data)
%9 : Tensor = aten::mul(%a.1_mask, %b_mask)
%10 : Tensor = aten::__or__(%a.1_dims, %b_dims)
%11 : bool = prim::TensorToBool(%8)
%12 : int = prim::Constant[value=0]()
%13 : Tensor = aten::mul(%8, %9)
%14 : Tensor = aten::sum(%13)
%15 : Tensor = aten::gt(%14, %12)
%16 : bool = prim::TensorToBool(%15)
%17 : Tensor, %18 : Tensor, %19 : Tensor, %a : Tensor, %21 : Tensor, %22 : Tensor = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims)
block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %cond_dims : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
%30 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::TensorToNum(%30)
%11 : int = prim::Constant[value=0]()
%12 : Tensor = aten::mul(%8, %9)
%13 : Tensor = aten::sum(%12)
%14 : Tensor = aten::gt(%13, %11)
%15 : bool = prim::Bool(%14)
%16 : Tensor, %17 : Tensor, %a : Tensor, %19 : Tensor, %20 : Tensor = prim::Loop(%7, %15, %8, %9, %a.1_data, %a.1_mask, %a.1_dims)
block0(%loop_num : int, %cond_data.2 : Tensor, %cond_mask.2 : Tensor, %6_data : Tensor, %6_mask : Tensor, %6_dims : Tensor) {
%27 : Long() = prim::NumToTensor(%6)
%alpha : float = prim::Float(%27)
%data : Tensor = aten::sub(%6_data, %b_data, %alpha)
%mask : Tensor = aten::mul(%6_mask, %b_mask)
%dims : Tensor = aten::__or__(%6_dims, %b_dims)
%35 : Tensor = aten::gt(%data, %b_data)
%36 : Tensor = aten::mul(%mask, %b_mask)
%37 : Tensor = aten::__or__(%dims, %b_dims)
%38 : bool = prim::TensorToBool(%35)
%39 : bool = prim::Constant[value=1]()
%40 : int = prim::Constant[value=1]()
%41 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
%data.2 : Tensor = aten::mul(%cond_data.2, %41)
%43 : int = aten::dim(%data.2)
%44 : bool = aten::eq(%43, %40)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%44)
%32 : Tensor = aten::gt(%data, %b_data)
%33 : Tensor = aten::mul(%mask, %b_mask)
%34 : bool = prim::Constant[value=1]()
%35 : int = prim::Constant[value=1]()
%36 : Tensor = aten::type_as(%cond_mask.2, %cond_data.2)
%data.2 : Tensor = aten::mul(%cond_data.2, %36)
%38 : int = aten::dim(%data.2)
%39 : bool = aten::eq(%38, %35)
%cond_data : Tensor, %cond_mask : Tensor = prim::If(%39)
block0() {
%47 : int = aten::dim(%data)
%48 : int = aten::sub(%47, %40)
%data.4 : Tensor = prim::Loop(%48, %39, %data.2)
block0(%50 : int, %51 : Tensor) {
%52 : int = aten::dim(%51)
%data.3 : Tensor = aten::unsqueeze(%51, %52)
-> (%39, %data.3)
%42 : int = aten::dim(%data)
%43 : int = aten::sub(%42, %35)
%data.4 : Tensor = prim::Loop(%43, %34, %data.2)
block0(%45 : int, %46 : Tensor) {
%47 : int = aten::dim(%46)
%data.3 : Tensor = aten::unsqueeze(%46, %47)
-> (%34, %data.3)
}
%cond_data.1 : Tensor = aten::expand_as(%data.4, %data)
%cond_mask.1 : Tensor = aten::expand_as(%data.4, %mask)
@@ -52,12 +49,13 @@ graph(%a.1_data : Tensor
%res_data : Tensor = aten::where(%cond_data, %data, %6_data)
%res_mask : Tensor = aten::where(%cond_mask, %mask, %6_mask)
%res_dims : Tensor = aten::__or__(%dims, %6_dims)
%59 : int = prim::Constant[value=0]()
%60 : Tensor = aten::mul(%35, %36)
%61 : Tensor = aten::sum(%60)
%62 : Tensor = aten::gt(%61, %59)
%63 : bool = prim::TensorToBool(%62)
-> (%63, %35, %36, %37, %res_data, %res_mask, %res_dims)
%54 : int = prim::Constant[value=0]()
%55 : Tensor = aten::mul(%32, %33)
%56 : Tensor = aten::sum(%55)
%57 : Tensor = aten::gt(%56, %54)
%58 : bool = prim::Bool(%57)
-> (%58, %32, %33, %res_data, %res_mask, %res_dims)
}
return (%a, %21, %22);
%59 : (Tensor, Tensor, Tensor) = prim::TupleConstruct(%a, %19, %20)
return (%59);
}
Some files were not shown because too many files have changed in this diff.