Compare commits

...

360 Commits

Author SHA1 Message Date
4c5b1cc026 version bump to 1.1 (#15554)
Summary:
version bump to 1.1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15554

Differential Revision: D13550818

Pulled By: soumith

fbshipit-source-id: 8a28582c98b42c081e103581551a01fd96c9f42d
2018-12-26 15:44:25 -08:00
8c6ff91d57 In README.md CMAKE_PREFIX_PATH should be CONDA_PREFIX when using a conda virtual environment (#15548)
Summary:
In the current README.md, `CMAKE_PREFIX_PATH` is set to the conda root even when you have activated a virtual environment. When a conda virtualenv is activated, packages are installed in `CONDA_PREFIX`, not the conda root. I think `CMAKE_PREFIX_PATH` should also be set to `CONDA_PREFIX` in this case. I think some build issues can be solved with the new instruction. Maybe something like #14954.

soumith,
When I made PR #15335 I was confused and made a wrong point. I think this PR could be the real solution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15548

Differential Revision: D13549681

Pulled By: soumith

fbshipit-source-id: 42d855b6e49ee58d735d2f4715d3e5752a748693
2018-12-26 12:57:07 -08:00
cdb8edce75 add from_pretrained method to EmbeddingBag (#15273)
Summary:
The `EmbeddingBag` module does not include a `from_pretrained` method like the `Embedding` module.  I added it for consistency between the two modules.
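A minimal usage sketch of the added method, mirroring `Embedding.from_pretrained` (values illustrative):
```
import torch
import torch.nn as nn

weight = torch.tensor([[1.0, 2.0, 3.0],
                       [4.0, 5.0, 6.0]])
bag = nn.EmbeddingBag.from_pretrained(weight)  # weights are frozen by default
input = torch.tensor([0, 1])
offsets = torch.tensor([0])
print(bag(input, offsets))  # tensor([[2.5, 3.5, 4.5]]) with the default 'mean' mode
```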
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15273

Differential Revision: D13547842

Pulled By: soumith

fbshipit-source-id: 8ffde51ff0c1e8fc8310263b6f375da88089ff7d
2018-12-26 08:35:39 -08:00
5ac95758e2 Make argument size checking consistent across CPU and CUDA for torch.gesv (#15430)
Summary:
There is an inconsistency between CPU and CUDA in the argument size checks for gesv, which is fixed in this PR.

Changelog:
- Replicate check in CPU as done for CUDA
- Fix argument ordering (minor) in CUDA checking

Fixes #15328

Differential Revision: D13531167

Pulled By: soumith

fbshipit-source-id: c4b4e4fc12880208d08e88d1e47e730ac98c2ad3
2018-12-26 08:32:28 -08:00
f636dc9276 clang format world (#15524)
Summary:
The PR clang-formats everything in `torch/csrc/jit/` and adds it to the pre-commit hook.

Here is a list of non-mechanical changes:
- I went over each file and fixed up whenever I could tell that clang-format was clobbering comment formatting.
- Made the macros in register_prim_ops a little more clang-format friendly by omitting trailing commas
- Refactored autodiff.cpp to use a helper class with explicit state rather than a bunch of capturing lambdas
- Small improvements to the precommit hook clang-format
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15524

Differential Revision: D13547989

Pulled By: suo

fbshipit-source-id: 3ff1541bb06433ccfe6de6e33f29227a2b5bb493
2018-12-26 06:55:01 -08:00
d4712ee218 Added correct isinf handling for Integral tensors (#15489)
Summary:
Currently, torch.isinf on an integral tensor raises `RuntimeError: value cannot be converted to type int16_t without overflow: inf`.
This PR suppresses the error and returns false (0) for all integral tensors. The behavior is then also consistent with np.isinf.
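A quick illustration of the fixed behavior (a sketch, not from the PR's test suite):
```
import torch

t = torch.tensor([1, 2, 3], dtype=torch.int16)
print(torch.isinf(t))  # previously raised a RuntimeError; now all-False, like np.isinf
```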
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15489

Reviewed By: zou3519

Differential Revision: D13540786

Pulled By: flashhack

fbshipit-source-id: e730dea849da6a59f3752d347bcfbadfd12c6483
2018-12-26 06:36:09 -08:00
d602ddcda3 Trivial comment update in autograd/function.h (#15529)
Summary:
I removed the explanation of the `num_inputs` parameter. This parameter was removed in #8168

colesbury
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15529

Differential Revision: D13547854

Pulled By: soumith

fbshipit-source-id: 8a9ac58f2c93a2533b82ec63089477166ed0bcb9
2018-12-26 02:25:54 -08:00
6e4be0af2e Fix failed type cast in Windows Debug Build (#15333)
Summary:
Fixes #15330
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15333

Differential Revision: D13531317

Pulled By: soumith

fbshipit-source-id: b956f27bd7fa33cbdf405338fcbcbc7df2fd629f
2018-12-26 00:48:58 -08:00
12e0ed55b4 Upgrade MKL-DNN to version 0.17 and static build MKL-DNN (#15504)
Summary:
Upgrade MKL-DNN to 0.17 and statically build MKL-DNN to fix potential build errors due to an old mkldnn version on the host system.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15504

Differential Revision: D13547885

Pulled By: soumith

fbshipit-source-id: 46f790a3d9289c1e153e51c62be17c5206ea8f9a
2018-12-25 22:56:51 -08:00
2fe5c29d81 remove legacy from docs (#15112)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/15062
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15112

Differential Revision: D13547845

Pulled By: soumith

fbshipit-source-id: 61e3e6c6b0f6b6b3d571bee02db2938ea9698c99
2018-12-25 21:57:54 -08:00
60b13d1f71 Use at::zeros instead of torch::zeros in non-differentiable example (#15527)
Summary:
There was a typo in C++ docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15527

Differential Revision: D13547858

Pulled By: soumith

fbshipit-source-id: 1f5250206ca6e13b1b1443869b1e1c837a756cb5
2018-12-25 21:50:17 -08:00
2ed95c5871 Fix the compare logic in function overflows for MSVC (#15499)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/15497.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15499

Differential Revision: D13547835

Pulled By: soumith

fbshipit-source-id: a674da93bf905a0b81f0cc60449ccb97c2746926
2018-12-25 21:50:15 -08:00
521894c490 Allow converting char tensor to numpy; add [fi]info.min (#15046)
Summary:
https://github.com/pytorch/pytorch/pull/14710 with test fixed.

Also added `finfo.min` and `iinfo.min` to get castable tensors.
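A short sketch of the additions described above (values illustrative):
```
import torch

print(torch.iinfo(torch.int8).min)     # -128
print(torch.finfo(torch.float32).min)  # most negative finite float32
arr = torch.tensor([1, 2], dtype=torch.int8).numpy()  # char tensor -> numpy, now allowed
```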

cc soumith
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15046

Reviewed By: soumith

Differential Revision: D13429388

Pulled By: SsnL

fbshipit-source-id: 9a08004419c83bc5ef51d03b6df3961a9f5dbf47
2018-12-24 09:11:24 -08:00
b7bc49ad70 Port replication_pad1d to ATen (#15507)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15507

Pull Request resolved: https://github.com/pytorch/pytorch/pull/15485

port replication_pad1d

Reviewed By: ezyang

Differential Revision: D13531920

fbshipit-source-id: dcd64ebd2c24b7431996231b8d5addfb600b1072
2018-12-24 06:34:02 -08:00
ad6799537e Support stateful dataset (#15096)
Summary:
Currently re-implements the dataloader for stateful datasets. Outstanding work:
- Refactor DataLoader and DataLoader2 to have common base classes and only differ in specific pieces of logic,
- Figure out how to not duplicate the `MapDataset` logic for stateful vs. non-stateful
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15096

Differential Revision: D13522043

Pulled By: goldsborough

fbshipit-source-id: 08e461ca51783047f11facc4d27dfa2e4f1e4c2a
2018-12-24 06:26:40 -08:00
8cd917812b put interactive prompt in bash (#15521)
Summary:
This makes compatibility with different versions of python a little bit simpler, and fixes a problem where stdin wasn't being read from the terminal properly in the prompt.

zdevito This should fix your EOF exception.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15521

Differential Revision: D13546358

Pulled By: suo

fbshipit-source-id: fb7551a86c888196831c046d9d9848e7ff05b925
2018-12-24 05:37:46 -08:00
f8a56bf476 Fix the iterator category for torch::data::Iterator (#15500)
Summary:
Tries to fix https://github.com/pytorch/pytorch/issues/14410.
Additional info: per this [page](https://stackoverflow.com/questions/14062297/canonical-way-to-define-forward-output-iterator), changing the category to `input_iterator_tag` doesn't mean the `output_iterator_tag` capability is lost.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15500

Differential Revision: D13545773

Pulled By: soumith

fbshipit-source-id: 327bfb7be83d53e42925e0e391b2a4277e3a1b36
2018-12-23 19:49:44 -08:00
c07647814b Precommit hook: just warn if no clang-tidy (#15514)
Summary:
The precommit hook shouldn't hard-fail if there's no `clang-tidy`; it should just warn and omit the check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15514

Differential Revision: D13545776

Pulled By: suo

fbshipit-source-id: 9bf3f8ee18703c6d1a39eb7776092fb5e120d2a1
2018-12-23 14:38:13 -08:00
4a716250cc Add torch.rot90 to torch.rst
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15512

Differential Revision: D13545775

Pulled By: soumith

fbshipit-source-id: 2a8896571745630cff4aaf3d5469ef646bdcddb4
2018-12-23 14:31:11 -08:00
51f1c4fea5 fix parallelization detection for CPU foreach_reduced_elt (#15483)
Summary:
This does two things:

(1): revert #15114 , which is incorrect and actually just completely disables parallelization in this function (because `at::get_num_threads` returns `-1` unless it has been set explicitly)

(2): Fix our (FB-internal) failing tests that #15114 was intended to fix, by still working correctly in a setup where `#ifdef _OPENMP` is set and `omp_get_max_threads() > 1` , but `#pragma omp parallel` only launches one thread. I believe such an unusual situation only exists in certain unit tests within FB infra but we still need it to work.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15483

Differential Revision: D13538940

Pulled By: umanwizard

fbshipit-source-id: a3362c7ac7327ced350d127bb426f82c59e42732
2018-12-23 12:51:40 -08:00
4e4ef0cffb add rowwise adagrad lp test (#15082)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15082

We didn't have a unit test for low-precision rowwise adagrad.

Reviewed By: chocjy

Differential Revision: D13300732

fbshipit-source-id: 46e7bdfc82c5a6855eeb6f653c0a96b0b3a20546
2018-12-22 10:25:39 -08:00
e012b183dd handle empty inputs to SparseLengthsMean correctly (#15389)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15389

SparseLengthsMean was generating uninitialized data for empty inputs (lengths == 0). We should return zeros.
The unit tests were also not covering this special case which is fixed by this diff.

Reviewed By: salexspb

Differential Revision: D13515970

fbshipit-source-id: 3c35265638f64f13f0262cee930c94f8628005da
2018-12-21 22:20:14 -08:00
58a7f2aed1 Add pthreadpool_create and pthreadpool_destroy (#15492)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15492

Add pthreadpool_create and pthreadpool_destroy, which are used by NNPACK tests.

Reviewed By: Maratyszcza

Differential Revision: D13540997

fbshipit-source-id: 628c599df87b552ca1a3703854ec170243f04d2e
2018-12-21 20:28:18 -08:00
90aa21e795 Metadata for input/output formats in model file proto. (#15252)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15252

We would like to extend the model file format to include strongly typed, semantic information
about the model inputs and outputs.

The goal is for a user to be able to consider a model file like a function with
a well defined API describing what the inputs and outputs would be.

Reviewed By: dzhulgakov

Differential Revision: D13009915

fbshipit-source-id: 5df124a876ad03c05fbdaacae0eab659637734c1
2018-12-21 17:42:38 -08:00
f3a588fede add len to nativeResolver (#15488)
Summary:
(otherwise len is not resolvable using torch::jit::compile)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15488

Differential Revision: D13539991

Pulled By: zdevito

fbshipit-source-id: 3ba85fa7b1adb163f9229c568f7997d22321903d
2018-12-21 16:47:15 -08:00
934fc28656 Remove NoneGenerator
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15335

Differential Revision: D13540357

Pulled By: driazati

fbshipit-source-id: a289e5944b65872103f68faac74e18f10e7c6fff
2018-12-21 16:33:37 -08:00
1dcf2ea096 Add self to Python printer reserved words (#15318)
Summary:
This adds `self` to the list of reserved words and also sorts the lines and prevents the tracer from naming values 'self' (which happens in torch/tensor.py)

Fixes #15240
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15318

Differential Revision: D13540192

Pulled By: driazati

fbshipit-source-id: 46ae02e51b1b31d5c62110fa83ba258ea6bada27
2018-12-21 16:02:07 -08:00
70aafad08a AD support for adaptive_avg_pool2d (#15459)
Summary:
This adds AD support for adaptive_avg_pool2d, which is necessary for resnet50 in pytorch/vision:master. cc: soumith asuhan dlibenzi

apaszke I saw the autodiff bug you fixed in #15403; it doesn't prevent this PR from passing, so I'll leave it for your PR to fix. :)
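A minimal eager-mode sketch of the op whose gradient formula this PR adds (the PR itself targets the JIT's autodiff, so this is only illustrative):
```
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8, requires_grad=True)
y = F.adaptive_avg_pool2d(x, (2, 2))
y.sum().backward()
print(x.grad.shape)  # torch.Size([1, 3, 8, 8])
```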
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15459

Differential Revision: D13534732

Pulled By: ailzhang

fbshipit-source-id: 4e48b93e35d5ecfe7bd64b6a132a55b07843f206
2018-12-21 15:38:24 -08:00
01be9b7292 Handling nullptr case
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15467

Reviewed By: Maratyszcza

Differential Revision: D13536504

fbshipit-source-id: ab46ff6bb4b6ce881c3e29d7e6a095ea62289db4
2018-12-21 15:08:00 -08:00
235d47760b Relax check on outputs (#15458)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15458

Many nets in the wild seem to have outputs that are never produced by the net.

Reviewed By: ZolotukhinM

Differential Revision: D13534185

fbshipit-source-id: 2b23b39c28404c53f68868f3bf6df53c5fea9eab
2018-12-21 14:19:37 -08:00
6bf05bfde6 allow non-final returns (#15463)
Summary:
This PR allows a subclass of programs that have return statements that are not final in the graph.

`final_returns.h` contains a comment describing how this is accomplished.
To minimize complexity in `compiler.cpp`, this pass is done as an AST-to-AST rewrite before the compiler runs.
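A sketch of the newly allowed pattern (assuming every path still returns a value):
```
import torch

@torch.jit.script
def absolute(x):
    if bool(x.sum() > 0):
        return x  # a return that is not final in the graph, now accepted
    return -x
```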
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15463

Differential Revision: D13538962

Pulled By: zdevito

fbshipit-source-id: 67105ca873351825b4a364092ab1873779f3e462
2018-12-21 14:01:33 -08:00
3da4a04733 Fixed trivial typos in Dropout2D and Dropout3D classes (#15200)
Summary:
Fixed trivial typos in Dropout2D and Dropout3D classes

weiyangfb
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15200

Differential Revision: D13537888

Pulled By: ezyang

fbshipit-source-id: 8fb06027ca663a2e4bfa016af400698ae3c88ad1
2018-12-21 11:58:10 -08:00
ff8fbc4f23 Updating submodules
Reviewed By: cdelahousse

fbshipit-source-id: 59d7a5b82fb78bc2d2285d0896e35c262512ffb9
2018-12-21 11:47:05 -08:00
7e2ec24886 eq_fixes (#15475)
Summary:
Fixes #15464.
cc: ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15475

Differential Revision: D13537812

Pulled By: ezyang

fbshipit-source-id: 127adf612ac8b3d3a64baa3d12a53daba7d3e4b8
2018-12-21 11:43:06 -08:00
d9cad71b36 Enable running collect_env.py without building PyTorch (#15468)
Summary: Closes #15346

Differential Revision: D13537873

Pulled By: ezyang

fbshipit-source-id: 7765ce4108dae9479d8900c0815cc2f174596a83
2018-12-21 11:37:43 -08:00
ac506f5820 Back out "[nomnigraph][executor] computeChains with nomnigraph" (#15451)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15451

Original commit changeset: ccd050bfead6

Reviewed By: ilia-cher

Differential Revision: D13533161

fbshipit-source-id: 1d0dcd54c2e3875aab015f3e996693e67a449b87
2018-12-21 11:09:27 -08:00
acbd9c49b0 Direct FBGEMM integraton into ATen (#13777)
Summary:
This PR implements infrastructure for post-processing a model to apply int8 quantization to its `nn.Linear` modules. Highlights of the implementation:

1) Inputs and outputs are `float` (quantized and packed internally), but the weight is quantized and packed ahead of time for efficiency. This implementation performs well in small-batch size GEMM calls. It should not be considered a general-purpose quantized GEMM kernel.
2) Weight packing is dependent on machine architecture (e.g. vector register width), so it is done just-in-time. Concretely, it is done on model load for the weights and it is done during operator execution for the input value.
3) Biases are unquantized
4) We fail loudly if we are attempting to run this on a machine that does not support FBGEMM. This is because we do not want a model's numerics to differ based on which machine it is run on. A model containing these FBGEMM ops *must* be run with FBGEMM

The API can be seen in the added test case. Highlights are:
1) `torch.jit.quantized.quantize_linear_modules` walks the module hierarchy of the passed-in Module and replaces all `nn.Linear` modules with a new `QuantizedLinear` module, which encapsulates the behavior described above.
2) `_pack()` and `_unpack()` script methods are present on `QuantizedLinear` modules. These methods should be called before serialization and after deserialization, respectively. This ensures that the weight matrix is properly packed for the running machine's architecture. Note that in the long term, we would like to move toward a more Pickle-style serialization technique, rather than having these explicit methods that mutate member values. This is blocked on being able to assign attributes in a ScriptMethod, among other things.
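A hedged usage sketch based on the API described above (assumes an FBGEMM-capable machine, per point 4):
```
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 4))
qmodel = torch.jit.quantized.quantize_linear_modules(model)  # nn.Linear -> QuantizedLinear
out = qmodel(torch.randn(2, 16))  # float in, float out; int8 GEMM internally
```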
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13777

Differential Revision: D13383276

Pulled By: jamesr66a

fbshipit-source-id: 00f29c9f34544add2b90107e3cf55a287802c344
2018-12-21 10:35:51 -08:00
614121c1ef Replace getargspec with getfullargspec (#15396)
Summary:
Replace `getargspec` with `getfullargspec` to resolve test warnings. Fixes #15344 .
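For reference, the replacement handles keyword-only arguments that `getargspec` cannot:
```
import inspect

def f(a, b=1, *args, c=None, **kwargs):
    pass

spec = inspect.getfullargspec(f)
print(spec.args, spec.kwonlyargs)  # ['a', 'b'] ['c']
```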
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15396

Differential Revision: D13529548

Pulled By: zou3519

fbshipit-source-id: 50d3be92423a9ce89bc4895b67569663e1abbaa6
2018-12-21 09:40:33 -08:00
2b23ba8ef0 The benchmark binary support multiple batches in one run (#15443)
Summary:
It is sometimes beneficial to run multiple batches in one benchmark and check the aggregated results.

This PR enables this functionality.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15443

Reviewed By: llyfacebook

Differential Revision: D13531129

Pulled By: sf-wind

fbshipit-source-id: 553a762a5cbadf5a3d9fd6af767ae34899bc1aa2
2018-12-21 08:45:41 -08:00
433db13b48 Move torch.logspace to ATen and parallelize on CPU.
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15438

Reviewed By: ezyang

Differential Revision: D13529626

Pulled By: gchanan

fbshipit-source-id: 896e8afee3d6b5a706c4f5815b91ba6bd8af6672
2018-12-21 08:24:33 -08:00
61cc701dd7 Fix cudnn dropout (#15473)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15473

Revert accidental changes introduced in D13335176

IntList is a range, and copying it just copies pointers. Thus the pointers would point either to deallocated memory or to the same memory, causing the equality check to always pass.

Reviewed By: ezyang

Differential Revision: D13537131

fbshipit-source-id: c97b3533be689bb4cdadd9e612f1284ac50e4bda
2018-12-21 08:15:44 -08:00
f52f68bcf9 format specialized_segment_ops_test.py to prepare D13515970 (#15408)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15408

Applied formatting to specialized_segment_ops_test.py to prepare D13515970

Reviewed By: salexspb

Differential Revision: D13520300

fbshipit-source-id: c3250b6abe8087c607f65ae60d1da61bd46c342b
2018-12-20 23:44:47 -08:00
cb79e1b3a5 Clean up onnxifi transformation code (#15453)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15453

Just move things around to facilitate further development. No logic change.

Reviewed By: rdzhabarov

Differential Revision: D13533959

fbshipit-source-id: eebab1306939e802aacffb24a711d372fd67916c
2018-12-20 22:06:47 -08:00
26b04523b1 Record Caffe2's current stream ID in c10_cuda. (#15174)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15174

Previously, Caffe2 maintained a separate per-thread per-device
current logical CUDA stream ID.  In this PR, we switch Caffe2 over
to using c10::Stream to manage the current stream, and also
manage the allocation of cudaStream_t objects.

This results in a slight behavior change: previously, Caffe2
would have been willing to allocate an arbitrary number of
CUDA streams, depending on how high the logical stream IDs
went.  The c10::Stream pool has a fixed number of streams, once
you exceed it, it wraps around.

Reviewed By: dzhulgakov

Differential Revision: D13451550

fbshipit-source-id: da6cf33ee026932a2d873835f6e090f7b8a7d8dc
2018-12-20 21:54:05 -08:00
3353064060 Add option to automatically handle unsorted variable-length sequences in RNNs (#15225)
Summary:
Fixes #3584.

Motivation: manually sorting sequences, packing them, and then unsorting them
is something a lot of users have complained about doing, especially when we can
offer library support for them.

Overview: we internally sort sequences before packing them and store a list of
`unsorted_indices` that represent how to unsort the sequences inside
PackedSequence. The packing helper functions return PackedSequence with the
`permutation` field and the unpacking helper functions use it to unsort.

To implement this, the following changes were made:
- PackedSequence now keeps `sorted_indices` and `unsorted_indices`.
  These two can be thought of as permutations and are inverses of each other.
  `sorted_indices` is how the sequences were sorted; `unsorted_indices` is how
  to unsort the sequences.
- Added an `enforce_sorted` argument to pack_sequence and pack_padded_sequence
  that maintains the legacy behavior of error-ing out on unsorted-sequences.
  When `enforce_sorted=True`, these functions maintain their ONNX exportability.
- pack_sequence(sequences, enforce_sorted) takes in unsorted sequences.
- pack_padded_sequence can take in a padded tensor that represents padded,
  unsorted sequences.
- pad_packed_sequence unsorts the PackedSequence such that it is still the
  inverse operation of packed_padded_sequence.
- RNNs apply `sort_indices` to their input hidden state and apply
  `unsort_indices` to their output hidden state. This is to ensure that the
  hidden state batches correspond to the user's ordering of input sequences.

NOT BC-Breaking
- The default for pack_sequence and pack_padded_sequence is
  `enforce_sorted=True` to avoid breaking ONNX export. To use the new
  functionality, pass in `enforce_sorted=False` (see the sketch below).
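A minimal sketch of the new API (shapes illustrative):
```
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

padded = torch.randn(3, 5, 8)      # (batch, max_len, features)
lengths = torch.tensor([2, 5, 3])  # not sorted by decreasing length
packed = pack_padded_sequence(padded, lengths, batch_first=True,
                              enforce_sorted=False)
unpacked, out_lengths = pad_packed_sequence(packed, batch_first=True)
assert torch.equal(out_lengths, lengths)  # unsorting restores the original order
```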

Testing Plan
- Modified TestNN.test_pack_sequence, TestNN.test_packed_padded_sequence,
  and TestNN.test_variable_sequence (RNN test) to check the behavior
  of unsorted sequences, sorted sequences, and sorted sequences with
  enforce_sorted=True
- test/test_jit.py has a test to see if RNNs are exportable with
  enforce_sorted=True

cc colesbury
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15225

Reviewed By: soumith

Differential Revision: D13507138

Pulled By: zou3519

fbshipit-source-id: b871dccd6abefffca81bc4e3efef1873faa242ef
2018-12-20 17:37:18 -08:00
52699f0754 Change default value of unique to 'sorted=True'
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15379
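A quick illustration of the new default (sketch only):
```
import torch

print(torch.unique(torch.tensor([3, 1, 2, 1, 3])))
# tensor([1, 2, 3]) -- results now come back sorted by default
```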

Differential Revision: D13531287

Pulled By: ezyang

fbshipit-source-id: 1512da7d660dc413688d99264e6434897c3ac78c
2018-12-20 17:09:08 -08:00
4ee1c2c632 add denormal options (ftz and daz)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15423

Reviewed By: yinghai

Differential Revision: D13526340

fbshipit-source-id: de2ecc717b4f778f33a8bf940ed144dbb230c7a8
2018-12-20 17:04:39 -08:00
3a6d473b49 collect_env fix (#15447)
Summary:
fixes #15214
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15447

Differential Revision: D13531523

Pulled By: ezyang

fbshipit-source-id: 8f24f5ae9f3e78f6c5c9ee702ba14faca7aa297a
2018-12-20 16:56:34 -08:00
a178f0a316 Remove unused field in jit script module deserializer (#15439)
Summary:
A little bit of cleanup.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15439

Reviewed By: zrphercule

Differential Revision: D13532015

Pulled By: houseroad

fbshipit-source-id: 2fb1e01fc28549c7e78af6c65ee68339950bc7da
2018-12-20 16:18:40 -08:00
8883ac4b58 Revert D13494873: [pytorch][PR] Fixing ONNX export of logical ops to have correct output datatype
Differential Revision:
D13494873

Original commit changeset: 069d2f956a5a

fbshipit-source-id: 80ef10b2eb623a63da51dc2e4874f2ee446f426d
2018-12-20 15:56:31 -08:00
95a0e2c421 Fix ASAN div by zero error in rotated GenerateProposals op (#15415)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15415

Was introduced in D13429770

Reviewed By: SuperIRabbit

Differential Revision: D13524114

fbshipit-source-id: a890eb3b97c24952c361155d1432a801499f4ddd
2018-12-20 15:44:15 -08:00
ed5b584f65 Tensor construction codemod(ResizeLike) - 7/7 (#15087)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15087

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: ezyang

Differential Revision: D13419765

fbshipit-source-id: 34d695309a66723281429610a12544598c507d74
2018-12-20 15:33:07 -08:00
d6cbcb43c5 allow numpy-like boolean-list indexing in pytorch (#14932)
Summary:
Suggested fix for issue #6773; the fix allows numpy-like boolean-list indexing in PyTorch.
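A small sketch of the added indexing form:
```
import torch

t = torch.arange(4)
print(t[[True, False, True, False]])  # tensor([0, 2]), as in NumPy
```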
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14932

Differential Revision: D13398795

Pulled By: ezyang

fbshipit-source-id: 67f8daf9829db2550ff76d2bde673be6dd2708cd
2018-12-20 15:33:06 -08:00
f56217af3b Doc improvement on DDP (#15440)
Summary:
I noticed that some users don't even know we have this support. Adding it to the docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15440

Differential Revision: D13531045

Pulled By: teng-li

fbshipit-source-id: 9757c400c0010608758c754df04e603b36035a10
2018-12-20 14:51:57 -08:00
cde26c659e Fix type annotation error. (#15448)
Summary:
According to mypy, the trailing -> None is mandatory.
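An illustrative (hypothetical) example of the annotation mypy expects:
```
class Counter:
    def __init__(self) -> None:  # the trailing '-> None' is mandatory for mypy
        self.count = 0
```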

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15448

Differential Revision: D13532179

Pulled By: ezyang

fbshipit-source-id: e8972f8c9ada4657c518cd7bcd46e489ab8ddf5f
2018-12-20 14:47:57 -08:00
c24a124fa0 Add launch bounds needed for ROCm 2.0 (#15400)
Summary:
ROCm 2.0's compiler requires launch_bounds annotations if flat work group sizes are larger than the default of 256.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15400

Differential Revision: D13531239

Pulled By: ezyang

fbshipit-source-id: c0b40600a8c332823da6c7113c644d8dba424a9c
2018-12-20 14:39:13 -08:00
1a2ec10bd4 Support enough of closures to write autograd functions (#15411)
Summary:
This PR adds enough of the infra for supporting closures (inner script functions) to allow us to express symbolic gradients using them. We do not actually ever run graphs that contain these closures. The symbolic_script infrastructure just extracts them out of the original forward graph and turns them into discrete forward/backward pairs. This cuts down on the type annotations necessary to write forward/backward pairs and aligns closely with the "differentiator" function approach to expressing reverse-mode AD.

Example:

This code:
```
import torch

r = torch.jit.CompilationUnit(
'''
def mul_forward(self, other):
    def backward(grad_output):
        grad_self = (grad_output * other).sum_to_size(self.size())
        grad_other = (grad_output * self).sum_to_size(other.size())
        return grad_self, grad_other
    return self * other, backward
''')

print(r.module.code)
```

Will produce this graph (pretty printed for clarity):

```
def mul_forward(self,
    self: Tensor,
    other: Tensor) -> Tuple[Tensor, Tuple[None, Tuple[Tensor, Tensor]]]:
  backward = (self.__lambda, (other, self))
  return (torch.mul(self, other), backward)

def __lambda(self,
    context: Tuple[Tensor, Tensor],
    grad_output: Tensor) -> Tuple[Tensor, Tensor]:
  other, self, = context
  grad_self = torch.sum_to_size(torch.mul(grad_output, other), torch.size(self))
  grad_other = torch.sum_to_size(torch.mul(grad_output, self), torch.size(other))
  return (grad_self, grad_other)
```

symbolic_script will then do some modifications to remove the unsupported prim::Function node, yielding:

```
def mul_forward(self,
    self: Tensor,
    other: Tensor) -> Tuple[Tensor, Tuple[None, Tuple[Tensor, Tensor]]]:
  return (torch.mul(self, other), (other, self))

def backward(self,
    context: Tuple[Tensor, Tensor],
    grad_output: Tensor) -> Tuple[Tensor, Tensor]:
  other, self, = context
  grad_self = torch.sum_to_size(torch.mul(grad_output, other), torch.size(self))
  grad_other = torch.sum_to_size(torch.mul(grad_output, self), torch.size(other))
  return (grad_self, grad_other)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15411

Differential Revision: D13523340

Pulled By: zdevito

fbshipit-source-id: 4d4a269460e595b16802c00ec55ae00e3e682d49
2018-12-20 14:39:11 -08:00
3fdf567752 Adding CUDA version for C2 operators generate proposals and nms (#13694)
Summary:
Related to issue #13684
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13694

Reviewed By: wat3rBro

Differential Revision: D13017791

Pulled By: newstzpz

fbshipit-source-id: 4bdc58e474d8e1f6cd73a02bf51f91542a2b9d0b
2018-12-20 14:39:09 -08:00
a47749cb28 Add at::one_hot (#15208)
Summary: Closes: https://github.com/pytorch/pytorch/issues/15060

Differential Revision: D13528014

Pulled By: ezyang

fbshipit-source-id: 5a18689a4c5638d92f9390c91517f741e5396293
2018-12-20 14:24:58 -08:00
2a64a78e7b Extract arguments to its own file and pass arguments to ios apps (#15413)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15413

In order to pass arguments to the iOS app, we need to extract the arguments
into their own file. Also, in the iOS app, do not use benchmark.json, which
parses the arguments.

This is an incompatible change and needs a hotfix for the tests.

Reviewed By: llyfacebook

Differential Revision: D13523240

fbshipit-source-id: b559cc7f52d8f50ee206a7ff8d7b59292d855197
2018-12-20 13:31:48 -08:00
f0f9277c3c Fixing ONNX export of logical ops to have correct output datatype (#15185)
Summary:
Currently the PyTorch ONNX exporter exports the logical ops (`lt`, `gt`, `le`, `ge`, `eq`) with the output type of the corresponding ONNX ops set to `tensor(uint8)`. But the ONNX spec allows only `tensor(bool)`, which is why models that have these ops fail to load properly.

This issue is captured in https://github.com/pytorch/pytorch/issues/11339. Part of this issue, relating to the allowed input types, has been fixed in ONNX spec by houseroad. This PR fixes the other part pertaining to output type.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15185

Differential Revision: D13494873

Pulled By: houseroad

fbshipit-source-id: 069d2f956a5ae9bf0ac2540a32594a31b01adef8
2018-12-20 12:37:27 -08:00
cb0b096f2b Miscellaneous small doc fixes (#15373)
Summary:
This PR makes some small changes for better consistency in our README and
CONTRIBUTING docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15373

Differential Revision: D13512753

Pulled By: driazati

fbshipit-source-id: 44398ad1894eef521d5f5acb1d06acaad67728cf
2018-12-20 12:33:40 -08:00
cac02034f6 Extend README for ATen/native/cpu (#15437)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15437

Differential Revision: D13529436

Pulled By: ezyang

fbshipit-source-id: 2e2193d54ea7f7626fe7392e4d0c130c2f87a76f
2018-12-20 11:17:00 -08:00
06a7cb5901 Implementing cuda kernel for tril_indices and triu_indices (#15203)
Summary:
Followup PR of #14904, and the stretch goal of #12653.

Directly calculate coordinates in the original tensor using column index in the result tensor. Every GPU thread takes care of a column (two numbers) in the output tensor.

The implementation detects and handles precision loss during calculating the square root of a `int64_t` variable, and supports tensors with up to `row * column = 2 ^ 59` numbers.

Algorithm details are described in [comments of TensorFactories.cu](23ddb6f58a/aten/src/ATen/native/cuda/TensorFactories.cu (L109-L255)).
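A minimal usage sketch (assumes a CUDA device is available):
```
import torch

idx = torch.tril_indices(4, 3, device='cuda')  # computed by the new kernel
print(idx.shape)  # torch.Size([2, 9]) -- one column per lower-triangle element
```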

zou3519
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15203

Reviewed By: zou3519

Differential Revision: D13517695

Pulled By: mrshenli

fbshipit-source-id: 86b305d22cac08c8962a3b0cf8e9e620b7ec33ea
2018-12-20 10:23:38 -08:00
5c66662e58 Revert D13498974: [pytorch][PR] [jit] Add self to Python printer reserved words
Differential Revision:
D13498974

Original commit changeset: 488efb661476

fbshipit-source-id: 3b991bccf4cf2ffdafe70f145aff0ae2837e31f8
2018-12-20 10:02:37 -08:00
8db44eda01 Add support for batched pdist (#12302)
Summary:
This updates pdist to work for batched inputs, and updates the
documentation to reflect issues raised.

closes #9406
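For reference, the non-batched form (the PR additionally allows a leading batch dimension, per the summary):
```
import torch

x = torch.randn(10, 3)
d = torch.pdist(x)  # condensed distance vector of length 10*9/2 == 45
print(d.shape)      # torch.Size([45])
```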
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12302

Reviewed By: ezyang

Differential Revision: D13528485

Pulled By: erikbrinkman

fbshipit-source-id: 63d93a6e1cc95b483fb58e9ff021758b341cd4de
2018-12-20 09:41:08 -08:00
7a764fe270 multi-dim standard deviation for CUDA. (#14990)
Summary:
This is the CUDA version of #14535 .
It refactors Reduce.cuh to allow more general classes of reductions to be performed -- we no longer assume that the temporary data returned during reduction is just one scalar, and instead allow an arbitrary accumulate type.
We also allow 64-bit indexing when necessary, since in general we will no longer be able to accumulate directly in the output. (In the cases when we can, we continue to split the tensors until they can be addressed with 32-bits, as before).
As an initial use-case, we implement `std` in multiple dimensions.
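A short sketch of the new use case (assumes a CUDA device):
```
import torch

x = torch.randn(4, 5, 6, device='cuda')
print(x.std(dim=(1, 2)).shape)  # torch.Size([4]) -- std over multiple dims at once
```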
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14990

Differential Revision: D13405097

Pulled By: umanwizard

fbshipit-source-id: a56c24dc2fd5326d417632089bd3f5c4f9f0d2cb
2018-12-20 08:56:32 -08:00
5e624948b6 Add self to Python printer reserved words (#15318)
Summary:
This adds `self` to the list of reserved words and also sorts the lines and prevents the tracer from naming values 'self' (which happens in torch/tensor.py)

Fixes #15240
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15318

Differential Revision: D13498974

Pulled By: driazati

fbshipit-source-id: 488efb661476cdcdb8ecb9cb48942f02e3c1e611
2018-12-20 02:29:09 -08:00
eb5d28ecef Pretty printing of C++ modules (#15326)
Summary:
A long outstanding nicety: pretty printing of C++ modules. E.g.
```
  Sequential sequential(
      Linear(10, 3),
      Conv2d(1, 2, 3),
      Dropout(0.5),
      BatchNorm(5),
      Embedding(4, 10),
      LSTM(4, 5));
std::cout << sequential;
```
prints
```
torch::nn::Sequential(
  (0): torch::nn::Linear(in=10, out=3, with_bias=true)
  (1): torch::nn::Conv2d(input_channels=1, output_channels=2, kernel_size=[3, 3], stride=[1, 1])
  (2): torch::nn::Dropout(rate=0.5)
  (3): torch::nn::BatchNorm(features=5, eps=1e-05, momentum=0.1, affine=true, stateful=true)
  (4): torch::nn::Embedding(count=4, dimension=10)
  (5): torch::nn::LSTM(input_size=4, hidden_size=5, layers=1, dropout=0)
)
```

apaszke ebetica ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15326

Differential Revision: D13518986

Pulled By: goldsborough

fbshipit-source-id: 63bf753672f0e348951de3645208f263581de5fb
2018-12-19 21:55:49 -08:00
2ef0f1222a Restructuring prof dag counters (#13321)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13321

This diff simply refactors the `ProfDAGCounters` into two:
* `ProfDAGCounters` that gathers stats at runtime.
* `ProfDAGReport` which holds the report from the gathered stats once stats collection is done.

This refactoring allows us to implement `+=` for `ProfDAGReport`, which can be used for aggregating same-net reports on each host.

Reviewed By: donglimm

Differential Revision: D12837988

fbshipit-source-id: 0470c5fd6437f12711cab25a15a12965d79b2a91
2018-12-19 21:48:30 -08:00
b89b46abfb Remove python_default_init from ATen and use Optional (#15234)
Summary:
Optional cleanup. This PR removes python_default_init from the yaml files and the code-gen, and uses optional types to do the work.

This also fixes the bug in #13149 to correctly adopt as_strided backward.

Fixes #9941
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15234

Differential Revision: D13502044

Pulled By: wanchaol

fbshipit-source-id: 774b61fc4414482cf11d56e22bd0275aefb352a4
2018-12-19 21:38:50 -08:00
3fc889e976 Tensor construction codemod(ResizeLike) - 1/7 (#15073)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15073

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: dzhulgakov

Differential Revision: D13419563

fbshipit-source-id: 8c284405fa3a867303216df876ee6b20d8a46551
2018-12-19 21:38:48 -08:00
2db742fc95 Do not use fork to invoke test scripts in pytorch rocm CI
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14600

Differential Revision: D13523937

Pulled By: bddppq

fbshipit-source-id: 1493fdd051283650081d7944bb2bd7f0c4c44990
2018-12-19 21:35:16 -08:00
1071e92335 Replace Vec256<T>::size with constexpr method (#15406)
Summary:

See Note [constexpr static function to avoid odr-usage compiler bug]
for detailed justification.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15406

Differential Revision: D13523774

Pulled By: ezyang

fbshipit-source-id: c0ab44298bb2ef3d68a66d026fc6bc156a909a6b
2018-12-19 20:33:45 -08:00
9abd755a76 Make cpuinfo logging less verbose (#15405)
Summary:
Log only errors in cpuinfo.

Fixes #15401 and #15398.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15405

Differential Revision: D13526251

Pulled By: Maratyszcza

fbshipit-source-id: 4d9eba0912f7b45093bed2e343cd77a151ffa8c4
2018-12-19 20:23:36 -08:00
88bf683cbc Support error handling in forked threads (#14523)
Summary:
Save error info in the future for the parent thread to pick up. Throw the error
when the thread is the root thread.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14523

Differential Revision: D13251756

Pulled By: highker

fbshipit-source-id: b40f9a45665e1a934743f131ec5e8bad5622ce67
2018-12-19 18:54:46 -08:00
5dd5ef3214 default options for OutputTensorCopyFrom (#15248)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15248

OutputTensorCopyFrom takes four arguments: an index, a source Tensor, TensorOptions, and whether we want to perform an async call.
We want to provide defaults for the TensorOptions: (1) default the device to context_.device(); (2) default the dtype to input.dtype(). Users can also explicitly provide these options to override the default values.

The next diff will change the order of the TensorOptions parameter so that users don't need to write down tensor options unless they want to override them.

Reviewed By: dzhulgakov

Differential Revision: D13453824

fbshipit-source-id: 87401f81c7c3f9fd3d8936c710e6c2e04a59b689
2018-12-19 18:14:47 -08:00
a00cfd1e9b Fix Module::copy_into
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15393

Differential Revision: D13519477

Pulled By: highker

fbshipit-source-id: d62928597ec0700b550e7cf481c8febae57b200d
2018-12-19 17:09:59 -08:00
0b219538cf add unpack_outputs to inlineCallTo
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15382

Differential Revision: D13518844

Pulled By: zdevito

fbshipit-source-id: 981936988080af80629b70bf5f6dfa52ceb09c2f
2018-12-19 15:11:59 -08:00
07d20b1e7c Fix documentation (#15372)
Summary:
Current documentation example doesn't compile. This fixes the doc so the example works.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15372

Differential Revision: D13522167

Pulled By: goldsborough

fbshipit-source-id: 5171a5f8e165eafabd9d1a28d23020bf2655f38b
2018-12-19 15:04:24 -08:00
055de167d5 computeChains with nomnigraph (#15366)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15366

Swap the old implementation for one that is slightly easier to understand.

I ran the tests and compared the number of chains against the old algorithm. This one outperforms on every test, but we have yet to see whether that impacts performance at all.

old chain 34 nomnigraph chain 25
old chain 46 nomnigraph chain 34
old chain 228 nomnigraph chain 188
old chain 397 nomnigraph chain 338

Reviewed By: ilia-cher

Differential Revision: D13057451

fbshipit-source-id: ccd050bfead6eb94ab9c7b0a70b09a22c2b9e499
2018-12-19 15:04:23 -08:00
9217bde807 Refactor dataloader.py (#15331)
Summary:
Same as #14668, and was approved there.

ailzhang , please apply this patch to Horizon's `data_streamer.py`: https://gist.github.com/SsnL/020fdb3d6b7016d81b6ba1d04cc41459 Thank you!

Below is the original description at #14668:

As I am working on tasks in https://github.com/pytorch/pytorch/issues/13023, I realized how unreadable the code is because all functions to be run in multiprocessing must be at the top (module-global) level. Adding more functionalities to `dataloader.py` will only make things worse.

So in this PR, I refactor `dataloader.py` and move much of it into `data._utils`. E.g., the `_worker_loop` and related methods are now in `data._utils.worker`, signal handling code in `data._utils.signal_handling`, collating code in `data._utils.collate`, etc. This split, IMHO, makes code much clearer. I will base my future changes to DataLoader on top of this.

No functionality is changed, except that  I added `torch._six.queue`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15331

Reviewed By: yf225

Differential Revision: D13503120

Pulled By: ailzhang

fbshipit-source-id: 94df16b4d80ad1102c437cde0d5a2e62cffe1f8e
2018-12-19 12:36:03 -08:00
41e7e1bc40 Rename potrs to cholesky_solve (#15334)
Summary:
Changelog:
- Renames `potrs` to `cholesky_solve` to remain consistent with TensorFlow and SciPy (not really, they call their function chol_solve)
- The default argument for upper in cholesky_solve is False. This allows a seamless interface between `cholesky` and `cholesky_solve`, since the `upper` argument in both functions is the same (see the sketch after this list).
- Rename all tests
- Create a tentative alias for `cholesky_solve` under the name `potrs`, and add a deprecation warning to discourage usage.
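A usage sketch of the renamed function (values illustrative):
```
import torch

A = torch.randn(3, 3)
A = A @ A.t() + 3 * torch.eye(3)  # make symmetric positive definite
u = torch.cholesky(A)             # lower factor, since upper=False is the default
b = torch.randn(3, 2)
x = torch.cholesky_solve(b, u)    # replaces the old `potrs` name
```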
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15334

Differential Revision: D13507724

Pulled By: soumith

fbshipit-source-id: b826996541e49d2e2bcd061b72a38c39450c76d0
2018-12-19 12:31:24 -08:00
33018e4e09 centralize side effects ops as node method (#15188)
Summary:
A number of different passes rely on whether a node has side effects. This centralizes the list of side effectful ops in one place.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15188

Differential Revision: D13508438

Pulled By: eellison

fbshipit-source-id: 2143e782b787731ce007b6dcd50cbde30e1b8dd0
2018-12-19 10:52:54 -08:00
560530aeec Optional ScalarType support for native functions & JIT (#15154)
Summary:
For #6593 and #9515

This completes the support for optional<ScalarType> in native, JIT and autograd.

Note: Mostly following the existing implementation for optional<Scalar> that was added in https://github.com/pytorch/pytorch/pull/12582.

This PR introduces a way to make functions accept an optional dtype and it will unblock #9515 by allowing the `dtype` param for type promotion interface:
```
func: name(inputs, *, ScalarType? dtype=None, Casting casting=same_kind)
```

An alternative approach could have been using `ScalarType::Undefined` for the same purpose but without optional, though it would have been a bit hacky.
```
func: name(inputs, *, ScalarType dtype=Undefined, Casting casting=same_kind)
```

Here's an example use of this in action: 971f69eac6

There are already a bunch of native functions that were getting optional `dtype` through function overloading. https://github.com/pytorch/pytorch/pull/15133 is the attempt to migrate all of those. I will send those changes separately after this since some functions (e.g. sum) need quite a bit of change in the codebase. See the commits over there.
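A concrete sketch of an op that already takes an optional dtype this way (sum is named in the summary above):
```
import torch

t = torch.ones(3, dtype=torch.int32)
print(t.sum(dtype=torch.float64))  # explicit dtype flows through as ScalarType?
print(t.sum())                     # None -> default accumulation dtype
```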
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15154

Differential Revision: D13457760

Pulled By: tugrulates

fbshipit-source-id: 706134f0bd578683edd416b96329b49a1ba8ab48
2018-12-19 10:45:35 -08:00
54d4fe3f49 Implement 'to' on ScriptModules (#15340)
Summary:
Following #6008
Fixes "Implement 'to' on ScriptModules #7354"

cc zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15340

Differential Revision: D13506646

Pulled By: zdevito

fbshipit-source-id: 318fea2e8e51a37ce9844efa4c8db67d45a66317
2018-12-19 10:41:23 -08:00
1d94a2bee3 Update cpuinfo submodule (#15385)
Summary:
Pull cpuinfo changes that should make it work on AWS Lambda servers (which don't have `/sys/devices/system/cpu/{possible,present}` files, and probably don't mount sysfs at all).

I'm not 100% sure it will fix the issue, but getting this update in would make it easier for users to test using a nightly build.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15385

Reviewed By: soumith

Differential Revision: D13517467

Pulled By: Maratyszcza

fbshipit-source-id: e8e544cd1f9dad304172ebb7b6ba7a8ad7d34e66
2018-12-19 07:31:45 -08:00
cbde820bc3 Updating submodules
Reviewed By: cdelahousse

fbshipit-source-id: dfbdae40e505c46cd64751c6ec107c84f9434131
2018-12-18 23:37:34 -08:00
cd8dd49fba race condition fix of using mutable_data inside OPENMP region for batched matmul (#15371)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15371

Similar to D13387692:

Never call mutable_data from an OpenMP region!!!

Reviewed By: jspark1105

Differential Revision: D13511259

fbshipit-source-id: 100812d2a547c0a1d5018749d5fdc88162375673
2018-12-18 23:22:56 -08:00
6ca1d93473 add whitelisted clang-format checks (#15254)
Summary:
This PR adds clang-format automation:
- It only checks on whitelisted files, so we can enable incrementally without noise
- There is a pre-commit hook provided that will do the same check, plus prompt users to apply the clang-format changes (no change is made without the user agreeing).

My plan is to migrate over whole files at a time, clang-formatting them and then adding them to the whitelist. Doing it this way should avoid too many merge pains (the most you'll have to do is run clang-format on the affected file before rebasing).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15254

Differential Revision: D13515888

Pulled By: suo

fbshipit-source-id: d098eabcc97aa228c4dfce8fc096c3b5a45b591f
2018-12-18 22:34:20 -08:00
122b4ef41d build fix
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15384

Differential Revision: D13515708

Pulled By: zdevito

fbshipit-source-id: ea077cfec30edf41b85dc83c0a969d1146434145
2018-12-18 22:11:44 -08:00
0368054a6d Split up compiler.cpp (#15355)
Summary:
This separates the different parts of compiler.cpp to make their relationship more clear. In particular it adds:

* sugared_value.{h,cpp} - all the public SugaredValues that the compiler defines and a few that were inside compiler.cpp
* type_parser.{h, cpp} - Turns TreeRef's defining types into TypePtr
* schema_matching.{h, cpp} - infrastructure for matching arguments against overloaded schema and emitting builtin operators with a particular schema.
Retains:
* compiler.{h, cpp} - now responsible simply for the `defineMethodsInModule` infrastructure.

Some utility functions like inlineCallTo have moved to ir.h.

Only thing that is not a move is some changes in module.h/cpp that remove multiple returns from `Method::emit_call_to`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15355

Reviewed By: suo, wanchaol

Differential Revision: D13507524

Pulled By: zdevito

fbshipit-source-id: 69ec936a9ff1a383c12a883616346b219c72e393
2018-12-18 19:43:35 -08:00
6ab2e7442d Autograd using torchscript (#14604)
Summary:
This PR enables autodiff to use the forward/backward graph compiled from Python code, instead of using symbolic gradients (modifying the original graph directly).

We put the map in a separate .h file for now to wait for the native_functions.yaml and derivatives.yaml merge. This should ideally go into native_functions.yaml eventually.

This PR should be enough to unblock us for now, we can start writing gradients for aten functions in python.

Differential Revision: D13494635

Pulled By: ailzhang

fbshipit-source-id: f8d51a15243ac46afd09d930c573ccdfcd9fdaaf
2018-12-18 19:10:57 -08:00
4928c76415 Minor clean up for test_jit (#15368)
Summary:
* remove None args in functional tests
* remove some expect files that are not necessary
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15368

Differential Revision: D13512349

Pulled By: wanchaol

fbshipit-source-id: 304cffff966487d15c373057ae8ad114ef8aa7f9
2018-12-18 18:26:37 -08:00
f3bff2d500 Add RNNCell modules to Script standard library (#14695)
Summary:
Adds RNNCell modules to script standard lib

cc apaszke for argument_spec changes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14695

Differential Revision: D13467680

Pulled By: driazati

fbshipit-source-id: 13a14da87714325cc4c3d49e5fde8a850d5d757b
2018-12-18 17:28:28 -08:00
f3cc9b2218 Remove fully qualified weak script names (#15364)
Summary:
Cleanup to make references to `weak_script` consistent across codebase
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15364

Differential Revision: D13509676

Pulled By: driazati

fbshipit-source-id: 93dbbbe57e9b9b6587895f3cc6fac678babd21de
2018-12-18 16:48:52 -08:00
096ee8467c Redefine scheduler to set learning rate using recursive formula (#14010)
Summary:
Modified step_lr for StepLR, MultiStepLR, ExponentialLR and CosineAnnealingLR. In this way, multiple schedulers can be used simultaneously to modify the learning rates.

Related issue: https://github.com/pytorch/pytorch/issues/13022

Added unit tests combining multiple schedulers.
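A minimal sketch of composing two schedulers on one optimizer (hypothetical training loop):
```
import torch

model = torch.nn.Linear(2, 2)
opt = torch.optim.SGD(model.parameters(), lr=1.0)
step = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
exp = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.9)
for epoch in range(4):
    # ... train for one epoch, then step both schedulers ...
    opt.step()
    step.step()
    exp.step()
    print(opt.param_groups[0]['lr'])  # both schedulers now compose
```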
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14010

Reviewed By: ezyang

Differential Revision: D13494941

Pulled By: chandlerzuo

fbshipit-source-id: 7561270245639ba1f2c00748f8e4a5f7dec7160c
2018-12-18 16:44:31 -08:00
5e97720100 Replace resize_dim() with set_sizes_and_strides() in (#15348)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15348

We have a function resize_dim() on TensorImpl in c10/core/TensorImpl.h which lets you change the dimensionality of a tensor, resizing both sizes and strides. Unfortunately, this API is fairly easy to misuse, because it fills in the new entries with garbage when you size it larger. We want to refactor the call sites to use set_sizes_and_strides() instead, so that there is never an intermediate tensor state where the sizes/strides don't make sense. In this diff, resize_dim() is
replaced with set_sizes_and_strides() in aten/src/TH/THTensor.hpp.

Reviewed By: ezyang

Differential Revision: D13505512

fbshipit-source-id: 193bab89f0018c13ca07488be336d8e967746b76
2018-12-18 16:38:36 -08:00
5667af3880 Minor cleanup for TestFuser tests (#15134)
Summary:
Changelog:
- change some expect tests that didn't have to be expect tests,
  instead use self.assertAllFused
- Some of the fuser tests weren't using self.assertAllFused.
- Minor test renames

cc apaszke
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15134

Differential Revision: D13507481

Pulled By: zou3519

fbshipit-source-id: dd0788530a60bb5ed2f42b961fae3db2b4404b64
2018-12-18 16:33:59 -08:00
3681bf7cff add dense vector to id_list operator (#15090)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15090

as title
step 2 of the linked task

Reviewed By: ellie-wen

Differential Revision: D13425977

fbshipit-source-id: f3538ed68f42470ba39c5b779af764d4a5591a9d
2018-12-18 16:27:38 -08:00
f5da198236 fix clang-tidy script for python 3
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15360

Differential Revision: D13509668

Pulled By: suo

fbshipit-source-id: a3448a115eaac8dd4c3f179901a23bdbc5098408
2018-12-18 15:06:14 -08:00
2469f7e02e Port torch.linspace to ATen and parallelize it on CPU.
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15320

Reviewed By: ezyang

Differential Revision: D13498995

Pulled By: gchanan

fbshipit-source-id: fba655d51d978fffaa53a5e4cae4a99ebfb0eddc
2018-12-18 15:01:49 -08:00
3118124cd6 Add (Un)Fold modules to standard library (#14759)
Summary:
Depends on #14597 for the corresponding aten ops.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14759

Differential Revision: D13325356

Pulled By: driazati

fbshipit-source-id: 99e39449c1ccfa293de05672c31a11e580bdd11f
2018-12-18 12:03:08 -08:00
f4c504593c Fix the (reduce)min and (reduce)max ONNX exporting (#15241)
Summary:
max and reducemax are smashed together; we need to support the one-input case.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15241

Reviewed By: yinghai

Differential Revision: D13473312

Pulled By: houseroad

fbshipit-source-id: 9b8c847286a2631b006ca900271bc0d26574101a
2018-12-18 11:48:06 -08:00
056cfaf3ff Method returns a single argument (#15289)
Summary:
This PR changes Method (just Method, not all graphs) to always have a single
return argument.

This is part 1 in a set of changes that will enable us to have better handling of early return statements.
The simplification that this change provides greatly reduces the work for the next step.

This change makes it so that Method and Python handle multiple returns in the same way:
* 0 - None
* 1 - <single value>
* many - Tuple[...]

The result is that a lot of special-case handling in compiler.cpp and its
bindings can be removed. It also fixes several bugs in return handling,
including one where return values were not always checked against their
attributed values.

Notes:
* inferTypeFrom is renamed to be more accurate and discourage use.
* This has uncovered some bugs in other components, which are noted in
  the diff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15289

Differential Revision: D13481649

Pulled By: zdevito

fbshipit-source-id: 0e2242a40bb28cca2d0e8be48bede96195e4858c
2018-12-18 10:44:09 -08:00
12cf5178aa caffe2 mobile opengl (#15322)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15322

The caffe2 mobile OpenGL code is not used; deleting it reduces complications when we perform other changes.

Reviewed By: Maratyszcza

Differential Revision: D13499943

fbshipit-source-id: 6479f6b9f50f08b5ae28f8f0bc4a1c4fc3f3c3c2
2018-12-18 08:20:52 -08:00
54d8ce94ee Revert D13383102: [pytorch][PR] Upgrade MKL-DNN to version 0.17
Differential Revision:
D13383102

Original commit changeset: c434f0e0ddff

fbshipit-source-id: 690f46ca0710954fa591a5ea77535e9759db4de5
2018-12-18 07:39:20 -08:00
bb9b7de831 Updating submodules
Reviewed By: cdelahousse

fbshipit-source-id: 4bf66581d07d839f459869bc9c6428011063cc5b
2018-12-17 21:25:36 -08:00
3a98462f2c improve script/no script save error (#15321)
Summary:
Improves the error message for #15116
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15321

Differential Revision: D13499379

Pulled By: zdevito

fbshipit-source-id: b8dc0a83efabff74199f4aab2ee98aa41c42608b
2018-12-17 21:13:58 -08:00
e37a22128e Allow tracing with fork/wait (#15184)
Summary:
There is still a limitation on this: if a script module is somewhere
in the trace, the inputs/outputs can only be tensors or tuples of
tensors.

resolves #15052
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15184

Differential Revision: D13457691

Pulled By: highker

fbshipit-source-id: 8fe46afc41357a0eb8eadd83f687b31d074deb0e
2018-12-17 20:34:26 -08:00
Jie
bd958cde68 [TensorIterator fixing mean to output correct result for half precisi… (#14878)
Summary:
…on (#12115)

mean is calculated in two steps, sum()/numel(). For half precision, data gets
cast back to half after sum().
We fused the division into the reduction kernel by adding pre_op/post_op.

This allows torch.ones(65536).cuda().half().mean() to return the correct
result.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14878

Differential Revision: D13491159

Pulled By: soumith

fbshipit-source-id: e83802e1628b6d2615c45e18d7acf991d143a09e
2018-12-17 20:13:30 -08:00
71ee882157 Reenable OpenMP by reverting the following two commits. (#15315)
Summary:
Revert "Put back linker flag for OpenMP to prevent build break on ppc64le (#14569)"

This reverts commit a84e873bb156080ea76ab182171b1f3b4d5395f6.

Revert "Update OpenMP cmake setting for xcode 9 compiler(AppleClang 9.0) (#14473)"

This reverts commit 8901935ad42fe9bf093d1106ea43606008a4024d.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15315

Differential Revision: D13495852

Pulled By: ezyang

fbshipit-source-id: bcd3f60088b14831c53d3c171f10cd1ab6b35dee
2018-12-17 19:54:41 -08:00
aec9fdf0a4 Fix _apply in nn.Module (#15305)
Summary:
Fixes an issue that arose from https://github.com/pytorch/pytorch/pull/13481 where `.shared_memory()` couldn't be called. Effectively undoes all changes to `nn.Module` from that PR and solves the relevant problem in a different way (the goal was to be able to call `._apply()` on the Python wrapper for a C++ module).

soumith
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15305

Differential Revision: D13493937

Pulled By: goldsborough

fbshipit-source-id: 4cb8687f90fc8709a536c5e7eacd0dc8edf6f750
2018-12-17 16:22:21 -08:00
2f38ffbcb3 Add a correctness check for C++ types to custom operators (#15247)
Summary:
The JIT uses `int64_t` for its integer type and `double` for its floating point type, but users quite often want to write `int` or `float` and that currently fails in not-so-nice ways for custom ops. This PR adds a simple `static_assert` to catch these common failure cases.

zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15247

Differential Revision: D13493941

Pulled By: goldsborough

fbshipit-source-id: c1cd0d10ab5838c75f167c0bdb57e45a0bc1344e
2018-12-17 16:17:27 -08:00
e650a84872 caffe2/python/task: added __repr__ methods to all task definitions (#15250)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15250

This adds `__repr__` methods to all of the classes under task.py. This makes the objects much easier to interact with when using them in an interactive manner, such as in a Jupyter notebook.

The default `__repr__` method just returns the object ID, which is very unhelpful.
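
A minimal sketch of the pattern (hypothetical field names, not the actual task.py code):

```python
class Task(object):
    def __init__(self, name, node):
        self.name = name
        self.node = node

    def __repr__(self):
        # Surface the interesting fields instead of the default object ID.
        return "Task(name={!r}, node={!r})".format(self.name, self.node)

print(Task("reader", "worker:0"))  # Task(name='reader', node='worker:0')
```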

Reviewed By: hanli0612

Differential Revision: D13475758

fbshipit-source-id: 6e1b166ec35163b9776c797b6a2e0d002560cd29
2018-12-17 16:02:16 -08:00
e0b261a35b Port nn fold and unfold to c++
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14597

Reviewed By: ezyang

Differential Revision: D13272227

fbshipit-source-id: 6eccab5ff5830a977398a96393b778095120edc6
2018-12-17 15:46:37 -08:00
c66adfc16b Allow future type parsing
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14887

Differential Revision: D13490984

Pulled By: highker

fbshipit-source-id: 165fe995867be273793f983154aa6cbce13e4396
2018-12-17 15:39:52 -08:00
efb37e86eb Removing BUILD_C10_EXPERIMENTAL_OPS option and unglobbing experimental/c10d ops
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15064

Reviewed By: orionr

Differential Revision: D13474801

Pulled By: pjh5

fbshipit-source-id: 9d3664c3a3a1b6c2d9f083f8476fe3b037296b98
2018-12-17 15:35:41 -08:00
59d71b9664 Bicubic interpolation for nn.functional.interpolate (#9849)
Summary:
Addresses #918, interpolation results should be similar to tf

* Adds bicubic interpolation operator to `nn.functional.interpolate`
* Corresponding test in `test_nn.py`

The operator is added in legacy `TH` to be aligned with the other upsampling operators; they can be refactored/moved to ATen all at once when #10482 is resolved
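
Example usage of the new mode:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)   # bicubic operates on 4-D (N, C, H, W) input
y = F.interpolate(x, scale_factor=2, mode='bicubic', align_corners=False)
print(y.shape)                # torch.Size([1, 3, 16, 16])
```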
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9849

Differential Revision: D9007525

Pulled By: driazati

fbshipit-source-id: 93ef49a34ce4e5ffd4bda94cd9a6ddc939f0a4cc
2018-12-17 15:31:48 -08:00
c5dd91c4ae add isinstance static type checking for jit (#15076)
Summary:
This PR adds isinstance to do static type checking in the JIT.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15076

Differential Revision: D13471067

Pulled By: wanchaol

fbshipit-source-id: d39b7ed5db9fcca4b503659d02cf7795950ea8ea
2018-12-17 15:21:49 -08:00
216ab259fb Fix the missing caffe2 proto files for Windows (#15157)
Summary:
Fixes #15156
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15157

Differential Revision: D13490420

Pulled By: orionr

fbshipit-source-id: 4387d707f634a5975238af915b1befb2277f8ec7
2018-12-17 15:21:47 -08:00
f4c59c5fdf Replace SwitchToDevice(0) with SwitchToDevice() (#15126)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15126

I want to make people stop manufacturing StreamId from thin air,
and a first step is to make people use the default stream.

Reviewed By: dzhulgakov

Differential Revision: D13432922

fbshipit-source-id: 9f0d8d70646c50d979bde5ba3c3addeebac48a3d
2018-12-17 15:15:00 -08:00
df4c9471ec Don't enforce docstrings on bool dispatch (#15306)
Summary:
Allows 2 functions that are boolean dispatched to have no docstrings (the only case that will fail now is if both functions have docstrings)

Fixes #15281
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15306

Differential Revision: D13494884

Pulled By: driazati

fbshipit-source-id: 65fec39ae03a7d6a68ad617c9b270faeb1617930
2018-12-17 14:41:05 -08:00
95d3fed68f Fix for issue 14829 (#14908)
Summary:
* Modify the testcase as outlined in the issue
   * Issue url: https://github.com/pytorch/pytorch/issues/14829
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14908

Differential Revision: D13490360

Pulled By: ezyang

fbshipit-source-id: ff11a72e19b49223652182e82c2b4e65fe444ca7
2018-12-17 14:28:50 -08:00
e07fc114a0 Minor fixes in .jenkins/caffe2/bench.sh
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15304

Differential Revision: D13493876

Pulled By: bddppq

fbshipit-source-id: 7146eb2587e526af65b4b0290c25bd55653a3088
2018-12-17 13:53:55 -08:00
700271d0e9 Adding ONNX export for torch.expand and torch.ne (#15050)
Summary:
`torch.expand` and `torch.ne` are used often in models and this PR adds ONNX export support for them. ArmenAg has created issue https://github.com/pytorch/pytorch/issues/10882 for this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15050

Differential Revision: D13453036

Pulled By: houseroad

fbshipit-source-id: 4724b4ffcebda6cd6b2acac51d6733cb27318daf
2018-12-17 13:48:14 -08:00
3df79f403e Tighten up invariants regarding StreamId. (#15125)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15125

I realized that it is really bad juju if you fake a StreamId
out of thin air, because in general this isn't going to work.
So, make the constructor a lot scarier.

Most "faking StreamId out of thin air" happens because someone
just wants to put something on the default stream.

Reviewed By: dzhulgakov

Differential Revision: D13432800

fbshipit-source-id: a86991d6fc1d8aa4e54e8175e5f06f90856238e6
2018-12-17 13:30:54 -08:00
1dbc7cff3e Fix tensor printing bug in Python 2 (#12732)
Summary:
`rsplit` doesn't accept keyword arguments in Python 2, so this line raises an error
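
For reference, the incompatibility looks like this:

```python
s = "a\nb\nc"
s.rsplit("\n", 1)                # positional: fine on Python 2 and 3
s.rsplit(sep="\n", maxsplit=1)   # TypeError on Python 2: rsplit takes no keyword arguments
```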

Fixes #15135
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12732

Differential Revision: D10458630

Pulled By: driazati

fbshipit-source-id: a63e42fbc0e39e4291480775b516c98122ec05a1
2018-12-17 13:17:51 -08:00
d71fac20eb Refactor hotpatch_vars and apply it to libtorch (#14976)
Summary:
Fixes #14801.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14976

Differential Revision: D13485381

Pulled By: soumith

fbshipit-source-id: 0af3c2e1b90988d56f6f85632328d1e4b788ffd2
2018-12-16 21:53:31 -08:00
656b565a0f Trivial comment correction in dataloader (#15276)
Summary:
Trivial comment correction in dataloader
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15276

Differential Revision: D13477324

Pulled By: soumith

fbshipit-source-id: 2a74a014999655d129311d611f2a09411339cb13
2018-12-15 10:59:00 -08:00
c51c825efe Delete ffi documentation (#15220)
Summary: Deleting FFI documentation since it's deprecated.

Differential Revision: D13477329

Pulled By: soumith

fbshipit-source-id: 0b3d485eb7cef1f05b6b397dff50f21a49d6409e
2018-12-15 09:49:02 -08:00
60badccd10 Fix a typo in the assert
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15265

Reviewed By: llyfacebook

Differential Revision: D13477029

Pulled By: sf-wind

fbshipit-source-id: 9c5571a583c01f9701625541ebec0c836cb923f2
2018-12-15 09:09:09 -08:00
4bcb425490 fix cholesky call in potrs example (#15215)
Summary:
Cholesky by default returns the lower triangular matrix, see [docs](https://pytorch.org/docs/stable/torch.html#torch.cholesky).

However, `torch.potrs` by default requires the upper triangular matrix. The naming of the variable `u` suggests that the example expects the upper factor to be returned, so I've added the flag to make that happen in the example.
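
The corrected call sequence looks like this (a sketch using a random positive-definite matrix):

```python
import torch

A = torch.randn(3, 3)
A = A @ A.t() + 3 * torch.eye(3)    # make A positive definite
b = torch.randn(3, 2)

u = torch.cholesky(A, upper=True)   # upper factor, matching potrs' default
x = torch.potrs(b, u)               # solves A x = b
```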
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15215

Differential Revision: D13476468

Pulled By: soumith

fbshipit-source-id: 7b68035f435a2b1be4d363b3f63e407394af949d
2018-12-15 04:43:34 -08:00
2b57bd4107 value-based mark and sweep DCE (#14910)
Summary:
This makes DCE more granular by tracking live values/aliases through the graph (rather than just nodes). So we can be more aggressive in DCE around control flow blocks. For example, in:
```
%a0 = aten::foo()
%b = aten::foo()
%a2, %b2 = prim::If(%cond) {
  block0() {
    %a1 = aten::foo(%a0)
    %b1 = aten::foo(%b)
  } -> (%a1, %b1)
}
return (%a2)
```
we will now dce all the `%b` stuff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14910

Differential Revision: D13476445

Pulled By: suo

fbshipit-source-id: 2bf5db19711c07dde946697a4f4b270bd8baf791
2018-12-15 01:16:44 -08:00
df614371c7 Mention Jacobian-vector product in the doc of torch.autograd (#15197)
Summary:
A friend of me is learning deep learning and pytorch, and he is confused by the following piece of code from the tutorial https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#gradients :

```python
x = torch.randn(3, requires_grad=True)

y = x * 2
while y.data.norm() < 1000:
    y = y * 2

print(y)

gradients = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(gradients)

print(x.grad)
```

He doesn't know where the following line comes from:
```python
gradients = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
```

What are we computing? Why don't we compute "the gradient of `y` w.r.t `x`"?

In the tutorial, it only says
> You can do many crazy things with autograd!

This does not explain anything. It can be hard for beginners of deep learning to understand why we would ever call backward with an external gradient fed in and what doing so means. So I modified the tutorial in https://github.com/pytorch/tutorials/pull/385
and the docstring correspondingly in this PR, explaining the Jacobian-vector product. Please review this PR and https://github.com/pytorch/tutorials/pull/385 together.
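
A small sketch of the point being documented: `backward(v)` computes the vector-Jacobian product v^T J, not the full Jacobian:

```python
import torch

x = torch.randn(3, requires_grad=True)
y = x * 2                              # Jacobian of y w.r.t. x is 2 * I
v = torch.tensor([0.1, 1.0, 0.0001])
y.backward(v)                          # accumulates v^T J into x.grad
print(x.grad)                          # equals 2 * v
```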
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15197

Differential Revision: D13476513

Pulled By: soumith

fbshipit-source-id: bee62282e9ab72403247384e4063bcdf59d40c3c
2018-12-15 00:10:30 -08:00
5b542a755f Tensor method rename dims()->sizes() (#15246)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15246

Codemod generated with clangr shard mode, 25 files per diff,

Reviewed By: igorsugak

Differential Revision: D13470369

fbshipit-source-id: ce995beab7c64bebe8b234fb5e6d015940ec2952
2018-12-14 21:11:02 -08:00
f118568662 Create parser.cpp (#15238)
Summary:
Moves the implementation into a .cpp file. The parser was getting included in several compilation units.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15238

Differential Revision: D13474635

Pulled By: zdevito

fbshipit-source-id: 7dc824eea8f506d6c8ae1aa67aeec0c34d5285fc
2018-12-14 19:31:36 -08:00
e1808be37d Add several features to converting images to blobs (#15204)
Summary:
Several enhancements are implemented:

* Resize the images to be within a boundary between min-size and max-size (which can be height or width); a sketch follows this list. It tries to resize the minimum side to match the min-size while keeping the aspect ratio. However, if in that case the maximum side exceeds the max-size, it instead resizes the maximum side to be equal to the max-size (and the minimum side ends up less than min-size). The min/max sizes are specified in the scale argument, in comma-separated form. If one of the sizes is -1, then that size is not a restriction.

* Change the OpenCV resize function arguments from using cv::Size() to the x, y scales. Theoretically they should be the same, but in reality the two ways of specifying them may result in different resized outputs.

* Once the image is read in, change the data to floats. That means, after resize and other preprocessing steps, the float values are preserved (not truncated to int).

* It is possible to convert data in text format to the blob format.
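
A minimal sketch of the resize rule from the first bullet (hypothetical helper, not the operator's actual code; sizes are (width, height)):

```python
def bounded_scale(w, h, min_size, max_size):
    lo, hi = min(w, h), max(w, h)
    scale = min_size / float(lo)        # match the minimum side to min_size
    if max_size > 0 and hi * scale > max_size:
        scale = max_size / float(hi)    # but never exceed max_size on the other side
    return w * scale, h * scale

print(bounded_scale(640, 480, min_size=400, max_size=500))  # (500.0, 375.0)
```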
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15204

Reviewed By: llyfacebook

Differential Revision: D13467225

Pulled By: sf-wind

fbshipit-source-id: 7da34a72d43a9603cd7ab953f5821c1222d0178f
2018-12-14 17:37:21 -08:00
717496e6c1 Supply static shape info to Reshape when doing onnxGetCompatibility (#15242)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15242

Newer versions of ONNX Reshape get shape info from a tensor. Hence, for the static backend, we need to provide this info when doing `onnxGetCompatibility` too.

Reviewed By: jackm321

Differential Revision: D13471959

fbshipit-source-id: 8a58e28edd900b6ad54a1dbd63ff2579fbe0e820
2018-12-14 16:37:39 -08:00
763b9954f3 FP16MomentumSGDUpdate Op fix and enable for ROCm (#15150)
Summary:
1. Fix a bug in FP16MomentumSGDUpdate operator
2. Enable operator for ROCm
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15150

Differential Revision: D13473145

Pulled By: bddppq

fbshipit-source-id: 4c5c5f30cb9bba658e3639dbe193fa08a304d306
2018-12-14 16:33:45 -08:00
e596d23137 Start unittesting our main observer (#15191)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15191

OSS:

Just splitting out basic flags from a unit test, so I can extend them in another test where I need to add additional flags.

Reviewed By: yinghai

Differential Revision: D13159184

fbshipit-source-id: 9823e792cf0ed8d0379235c44564862b7d784845
2018-12-14 16:24:38 -08:00
34f1f2208b Build c10 HIP test
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15233

Reviewed By: ezyang

Differential Revision: D13471002

Pulled By: bddppq

fbshipit-source-id: b42c3bc2b9db672ce50a52eb700cc6ed13d3535f
2018-12-14 15:36:38 -08:00
5e09c7bc80 record unit time in torch.cuda.event (#15221)
Summary: Record unit of time for torch.cuda.Event's elapsed_time

Differential Revision: D13467646

Pulled By: zou3519

fbshipit-source-id: 4f1f4ef5fa4bc5a1b4775dfcec6ab155e5bf8d6e
2018-12-14 15:29:06 -08:00
054456eb93 Preserve module hierarchy on traced modules (#15101)
Summary:
We need this, for example, to properly call `_unpack` when we have a traced module in the hierarchy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15101

Differential Revision: D13468467

Pulled By: jamesr66a

fbshipit-source-id: c2b6740b12cde6e23395d12e42d4fc2c4c7ca3f2
2018-12-14 15:07:51 -08:00
60f02b87be fix an issue where two rules build the same .py files
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15230

Differential Revision: D13471625

Pulled By: zdevito

fbshipit-source-id: a982413a308c7a9bb5b6a82fe96fd3de44f555aa
2018-12-14 14:52:52 -08:00
bd368b867d Do not ifdef __launch_bounds__ out for ROCm. (#15228)
Summary:
The compiler understands it and profits from knowing it by not using too many
VGPRs, as it otherwise defaults to a workgroup size of 256.

Fixes a problem in the bringup of ROCm 2.0 on gfx906.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15228

Differential Revision: D13470950

Pulled By: bddppq

fbshipit-source-id: f9aa44c7c95299a099c0ea9317b9044cc056acc5
2018-12-14 14:47:32 -08:00
dcd1685282 Revert D13440858: [pytorch][PR] Use a pool of per-thread cudnn handles for each device, updated
Differential Revision:
D13440858

Original commit changeset: 1c6af5c53538

fbshipit-source-id: fda42ea75000d4a4e9c4a8eeaaa5518f7ad9c298
2018-12-14 14:35:01 -08:00
9f1d8f2eeb enabled tests in test_nn, test_cuda and test_sparse (#15232)
Summary:
tests work on ROCm 1.9.2 as present on CI (fp16 bringup, hipMemset and sparse improvements)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15232

Differential Revision: D13470991

Pulled By: bddppq

fbshipit-source-id: 45acc4f9ea5baaaf7672b86eb022948055779925
2018-12-14 14:27:57 -08:00
e9fb4d1f11 Fix jit doc codeblocks and tables (#15227)
Summary:
Some of the codeblocks were showing up as normal text and the "unsupported modules" table was formatted incorrectly
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15227

Differential Revision: D13468847

Pulled By: driazati

fbshipit-source-id: eb7375710d4f6eca1d0f44dfc43c7c506300cb1e
2018-12-14 14:27:56 -08:00
b316e44a46 Remove __forceinline__ hipification step. (#15229)
Summary:
The HIP definition now correctly contains the inline attribute.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15229

Differential Revision: D13470962

Pulled By: bddppq

fbshipit-source-id: 34f8361bda5f3dce20a2eeb530c3a25d1b1bdd06
2018-12-14 14:24:05 -08:00
7a61306031 Enable all clang-tidy performance checks (#15198)
Summary:
This PR adds the final set of clang-tidy checks we should add for our codebase: a last set of performance-related checks. Most fixes here are around changing `auto` to `const auto&` in a few places where unnecessary copies were made, and adding `reserve()` calls before loops doing repeated `push_back()`. Also a few cases of calling `std::string::find` with a single-character string literal instead of a single char, which uses a less efficient string search algorithm meant for searching larger substrings.

![image](https://user-images.githubusercontent.com/6429851/49978940-adc1a780-ff01-11e8-99da-a4e431361f07.png)

ezyang apaszke
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15198

Differential Revision: D13468797

Pulled By: goldsborough

fbshipit-source-id: 2bed1ea1c7c162b7f3e0e1026f17125e88c4d5b2
2018-12-14 13:32:47 -08:00
fc2856e9aa Refactor caffe2 CI scripts and add benchmark scripts
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14575

Differential Revision: D13468049

Pulled By: bddppq

fbshipit-source-id: e73bc8742c8a03f498816eee8a72b06a3e19fe48
2018-12-14 13:19:33 -08:00
4327a2d70a Better tests/support for Python/C++ inter-op (#15193)
Summary:
Methods like `module.named_modules()` returns a container of `shared_ptr<nn::Module>`. Currently the `nn::Module` base class does  not have Python bindings. This PR fixes this, and adds more unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15193

Differential Revision: D13458713

Pulled By: goldsborough

fbshipit-source-id: 4091fe1b96a1be8db14c6a4307fbacc2b41ff6fe
2018-12-14 08:42:10 -08:00
fb8487d708 Tensor construction codemod(ResizeLike) - 3/7 (#15122)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15122

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: dzhulgakov

Differential Revision: D13419643

fbshipit-source-id: 65b5a037b94d458b944d51f790ba2829db1fb530
2018-12-14 02:08:37 -08:00
78bf1a9065 Revert D13407930: [pytorch][PR] Support torch.tensor in script
Differential Revision:
D13407930

Original commit changeset: d17f1195a221

fbshipit-source-id: f4458872c48ec4a2c9983b21ed90bcdc0ae665b7
2018-12-13 22:13:07 -08:00
331c4b5b4d caffe2 - make DataRandomFiller usable in unit tests (#15027)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15027

- Make DataRandomFiller able to accept input_dims and input_types for only non-intermediate inputs. Add a helper to fill inputs directly into a workspace

Reviewed By: highker

Differential Revision: D13408345

fbshipit-source-id: 5fc54d33da12e3f0a200e79380d4c695b0339b17
2018-12-13 20:45:52 -08:00
66b26806fc caffe2 - easy - utils to set argument of operator (#15022)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15022

Add setArgument testing utils to make it easy to set argument for an operator

Reviewed By: yinghai

Differential Revision: D13405225

fbshipit-source-id: b5c1859c6819d53c1a44718e2868e3137067df36
2018-12-13 20:45:50 -08:00
9726651d1e caffe2 - easy - test utils for tensor assertion (#15020)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15020

Add test utils for assertion of a tensor (sizes and values)

Reviewed By: salexspb

Differential Revision: D13401146

fbshipit-source-id: bc385df074043e03ea884940b5631b96de4a607e
2018-12-13 20:45:48 -08:00
d0b4ae835d caffe2 - easy - test utils to compare tensors in two workspaces (#15181)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15181

Add test utils to compare tensors in two workspaces

Reviewed By: ZolotukhinM

Differential Revision: D13387212

fbshipit-source-id: e19d932a1ecc696bd0a08ea14d9a7485cce67bb2
2018-12-13 20:45:46 -08:00
a0f68646ac caffe2 - easy - test utils to fill tensors (#15019)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15019

Put some utils to fill tensors to test_utils

Reviewed By: salexspb

Differential Revision: D13386691

fbshipit-source-id: 51d891aad1ca12dc5133c0352df65b8db4f96edb
2018-12-13 20:45:44 -08:00
8fedde5530 caffe2 - easy - test utils to create operator (#15180)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15180

Test utils to create an operator

On top of D13370461

Reviewed By: ZolotukhinM

Differential Revision: D13382773

fbshipit-source-id: a88040ed5a60f31d3e73f1f958219cd7338dc52e
2018-12-13 20:45:42 -08:00
eb6fec3652 caffe2 - easy - Create test_util to make it easier to write C++ unit tests (#15014)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15014

Currently it looks like many of the simple operations, such as comparing tensors, creating tensors, fetching tensors..., are too verbose and take effort to write correctly in unit tests.
Easy-to-use utilities are often important for productivity when writing unit tests. While caffe2 Python unit tests are relatively easy to write at the moment, the C++ side seems lacking.
In this change I create a test_util, starting with assertsTensorEquals, getTensor, createTensor, and we can start putting more easy-to-use utilities there.

Reviewed By: salexspb

Differential Revision: D13370461

fbshipit-source-id: bee467a127e1d032ef19482f98aa5c776cf508c0
2018-12-13 20:45:41 -08:00
81644ed9ab Fix derivative for mvlgamma (#15049)
Summary:
Fixes #15015.

Added tests to validate derivative.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15049

Reviewed By: soumith

Differential Revision: D13434117

Pulled By: zou3519

fbshipit-source-id: 4a292600af9eb08b67c0f8b5482e9512aac95e72
2018-12-13 20:32:57 -08:00
0b9b965c1a Fix numpy conversion for int8 tensor
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15194

Differential Revision: D13459270

Pulled By: li-roy

fbshipit-source-id: 605534add263860a3ad9a7fa70888301ee0bf8e4
2018-12-13 19:38:09 -08:00
fb140c7828 add erf and erfc to fuser/autodiff
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15139

Differential Revision: D13455690

Pulled By: soumith

fbshipit-source-id: b06e5f5d362869c2e5fa11a52f9450d77c30d4cb
2018-12-13 19:17:40 -08:00
bb8ee2de0f Move TensorImpl::CopyFrom to caffe2::Tensor (2/2) (#14858)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14858

This diff doesn't change logic but just takes the existing code and moves it to caffe2::Tensor

Reviewed By: ezyang

Differential Revision: D13365817

fbshipit-source-id: bc73b27a793602cb14200dcdf357aa63233da43c
2018-12-13 18:41:24 -08:00
070f33f154 Move TensorImpl::CopyFrom to caffe2::Tensor (1/2) (#14656)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14656

This diff doesn't move it yet, but prepares it to be moved, i.e. removes all access to class internals.

dzhulgakov: Please comment on if you think it still makes sense to land this even though it's not blocking anymore since we're going to move at::CopyBytes anyhow.

ezyang: There's some changes in the implementation, especially handling undefined dest tensors. Please review carefully.

Reviewed By: ezyang

Differential Revision: D13287688

fbshipit-source-id: 17800ca8a79ab1633f23be58d96f99a160d8ed24
2018-12-13 18:41:23 -08:00
dc72a5e02c For rotated proposals, replace cv::rotatedRectangleIntersection with a correct version that doesn't have underflow problem (#15113)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15113

cv::rotatedRectangleIntersection has a known float underflow bug that would cause failure in ```CV_Assert(intersection.size() <= 8)```

For rotated proposals, replace cv::rotatedRectangleIntersection with a correct version that doesn't have underflow problem.

Otherwise, when ```USE_CPP_GENERATE_PROPOSALS = true```, the training would fail.

Reviewed By: viswanathgs

Differential Revision: D13429770

fbshipit-source-id: 5e95d059f3c668f14059a0a83e8e53d8554cdb99
2018-12-13 18:13:46 -08:00
aecab53778 Support torch.tensor in script (#14913)
Summary:
Adding support for torch.tensor in script.

The input list is typed as t[], because it can be arbitrarily nested. I added a compile-time check that the inner type of the list is a bool, float, or int.

Also adds specialization for Boolean lists, which already existed at the IValue level but had not been added to the compiler yet.
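
A minimal sketch of what this enables (illustrative):

```python
import torch

# torch.tensor called inside TorchScript with a nested list literal; the
# compiler checks that the innermost element type is bool, float, or int.
@torch.jit.script
def make():
    return torch.tensor([[1.0, 2.0], [3.0, 4.0]])

print(make())
```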
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14913

Differential Revision: D13407930

Pulled By: eellison

fbshipit-source-id: d17f1195a22149d5b0d08d76c89a7fab8444f7c5
2018-12-13 17:38:38 -08:00
bbbfda72a0 Remove TensorImpl -> Type dependency
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15086

Reviewed By: dzhulgakov

Differential Revision: D13425628

fbshipit-source-id: 08a8a774d17b071367454e027012a02f96d177d4
2018-12-13 17:10:59 -08:00
1e9c384afb Enable performance-unnecessary-value-param in .clang-tidy (#15026)
Summary:
This PR fixes around 250 places in the codebase where we were making unnecessary copies of objects (some large, some small).

ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15026

Differential Revision: D13458784

Pulled By: goldsborough

fbshipit-source-id: be5148b2ce09493588d70952e6f6d6ff5ec5199b
2018-12-13 16:15:35 -08:00
bdfff2f8c2 Add missing caffe2_hip extension in setup.py
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15189

Reviewed By: orionr

Differential Revision: D13457644

Pulled By: bddppq

fbshipit-source-id: c2363e9b8fd21709b62777e5b2199f01ec1c65f8
2018-12-13 15:59:51 -08:00
de0784510d Remove disabled_features in hipify
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15098

Reviewed By: ezyang

Differential Revision: D13453762

Pulled By: bddppq

fbshipit-source-id: e177042c78f5bf393163d660c25b80285353853d
2018-12-13 15:43:57 -08:00
855d9e1f19 Run ONNX cuda backend test cases via ROCm
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15069

Differential Revision: D13427757

Pulled By: bddppq

fbshipit-source-id: ba0273d75986cd5b146f7041a83c63ddf9c6c0cf
2018-12-13 15:10:00 -08:00
6911ce19d7 Remove _finfo; replace _finfo usage with torch.finfo (#15165)
Summary:
This PR removes the usage of _finfo defined in torch.distributions.utils and changes the call sites
to use torch.finfo instead
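
Example of the replacement API:

```python
import torch

# torch.finfo exposes the same floating-point limits _finfo provided.
print(torch.finfo(torch.float32).eps)    # ~1.19e-07
print(torch.finfo(torch.float32).tiny)   # smallest positive normal number
```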

Differential Revision: D13451936

Pulled By: soumith

fbshipit-source-id: 6dbda3a6179d9407bc3396bf1a2baf3e85bc4cf2
2018-12-13 14:30:27 -08:00
f1f7c16c90 Tensor construction codemod(ResizeLike) - 4/7 (#15088)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15088

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: ezyang

Differential Revision: D13419682

fbshipit-source-id: 3e59403bc1c0e71e5cb66df932ed0c6a0a72e643
2018-12-13 13:39:56 -08:00
cbd1c519c4 Replace non-printable-ascii characters in ProtoDebugString (#14918)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14918

When ProtoBuf-Lite is in use, ProtoDebugString just calls SerializeAsString.
This produces binary output, which is not a very suitable "debug" string.
Specifically, we've observed it causing problems when calling code tries to
add the debug string to a Java exception message (which requires valid UTF-8).
Now, we replace all non-ASCII bytes with "?".

This is not a very fast implementation, but generating debug strings shouldn't
be a performance-sensitive operation in any application.
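
The replacement rule, sketched in Python for illustration (the actual change is in C++):

```python
def ascii_safe(raw):
    # Keep ASCII bytes (< 0x80); replace everything else with '?'.
    return "".join(chr(b) if b < 0x80 else "?" for b in raw)

print(ascii_safe(b"ok \xff\xfe"))  # 'ok ??'
```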

Reviewed By: dzhulgakov

Differential Revision: D13385540

fbshipit-source-id: 8868172baf20efaf53fecf7d666a6980f59b64f5
2018-12-13 13:16:24 -08:00
994f72ee3e Tensor construction codemod(ResizeLike) - 6/7 (#15137)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15137

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: ezyang

Differential Revision: D13419736

fbshipit-source-id: f4ad7b9582c2f809258169b7fef9adbca7063d99
2018-12-13 12:47:33 -08:00
43c0b50c2e Tensor construction codemod(ResizeLike) - 5/7 (#15084)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15084

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: ezyang

Differential Revision: D13419711

fbshipit-source-id: dd2b740c3f13d8087085bafc5571aaf908d1af42
2018-12-13 12:42:52 -08:00
86fbf17ba6 Use std::vector instead of alloca to work around hcc crash
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15175

Differential Revision: D13453708

Pulled By: bddppq

fbshipit-source-id: f8c147ae9f679e395fee9d4c73ebcca052c9a752
2018-12-13 12:34:36 -08:00
f61612206c Fix old tensor OutputTensorCopyFrom usage in ImageInput operator (#15094)
Summary:
cc jerryzh168
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15094

Differential Revision: D13451898

Pulled By: bddppq

fbshipit-source-id: 27906be62fb88aaa13c257441a2e35a285b445ee
2018-12-13 11:48:19 -08:00
e5bd6fe86d Kill non-forward, non-backward functions generated from nn.yaml (#15127)
Summary:
Updating binding to legacy functions.
Remove unused declarations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15127

Differential Revision: D13433405

Pulled By: VitalyFedyunin

fbshipit-source-id: 58544d38affd20818742338c9eb789d9d14ccbaa
2018-12-13 11:34:50 -08:00
bc80deea1b Delete defunct USE_SIMPLE_BASE_CTOR_DTOR (#15144)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15144

Differential Revision: D13440872

Pulled By: ezyang

fbshipit-source-id: 2b1d73fac0c63729ba01d8f129642334ae9d9cf3
2018-12-13 11:20:37 -08:00
e51092a2b8 Fix typo (#15045)
Summary:
Simple typo fix
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15045

Reviewed By: dzhulgakov

Differential Revision: D13413509

Pulled By: houseroad

fbshipit-source-id: be66700c30d038368b1433232a4e3fd9299c83d6
2018-12-13 11:13:19 -08:00
ca4358c8f5 Use a pool of per-thread cudnn handles for each device, updated (#15080)
Summary:
Rebased version of https://github.com/pytorch/pytorch/pull/14861, hopefully addressing ezyang's comments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15080

Differential Revision: D13440858

Pulled By: ezyang

fbshipit-source-id: 1c6af5c53538b81c6b92cf1dda231ed333f28035
2018-12-13 10:24:06 -08:00
214f46faf5 Fix bincount for non-contiguous inputs on CPU (#15109)
Summary:
Fixes #15058.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15109

Differential Revision: D13447448

Pulled By: soumith

fbshipit-source-id: 56e8d42934538fb00465105a2c5ccfeb7c18a651
2018-12-13 09:44:20 -08:00
bf7a2b9125 Unify SparseTensorImpl::size_ and TensorImpl::sizes_
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15130

Differential Revision: D13434981

Pulled By: VitalyFedyunin

fbshipit-source-id: 98bd4d66834a3c3d2ea577adb0c8413852da095d
2018-12-13 08:55:35 -08:00
0bf1383f0a Python <-> C++ Frontend inter-op (#13481)
Summary:
This PR enables C++ frontend modules to be bound into Python and added as submodules of Python modules. For this, I added lots of pybind11 bindings for the `torch::nn::Module` class, and modified the `torch.nn.Module` class in Python to have a new Metaclass that makes `isinstance(m, torch.nn.Module)` return true when `m` is a C++ frontend module. The methods and fields of C++ modules are bound in such a way that they work seamlessly as submodules of Python modules for most operations (one exception I know of: calling `.to()` ends up calling `.apply()` on each submodule with a Python lambda, which cannot be used in C++ -- this may require small changes on Python side).

I've added quite a bunch of tests to verify the bindings and equality with Python. I think I should also try out adding a C++ module as part of some large PyTorch module, like a WLM or something, and see if everything works smoothly.

The next step for inter-op across our system is ScriptModule <-> C++ Frontend Module inter-op. I think this will then also allow using C++ frontend modules from TorchScript.

apaszke zdevito

CC dzhulgakov
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13481

Differential Revision: D12981996

Pulled By: goldsborough

fbshipit-source-id: 147370d3596ebb0e94c82cec92993a148fee50a7
2018-12-13 08:04:02 -08:00
b14d6d730a Reuse KernelSpec for FusionGroups with equivalent graphs (#14541)
Summary:
Before this PR, loop unrolling + the graph fuser was creating multiple
FusionGroups with the same bodies (with different variable names) for
JIT LSTMs. Each FusionGroup got registered to a separate fusion key;
each key resulted in a different compilation for the same
specializations.

This PR makes it so that when registering FusionGroups with the fusion
compiler, the compiler first checks the KernelSpec cache to see if the
FusionGroup's graph exists already. If it does, then return the
corresponding KernelSpec's key to share compiled kernels.

In addition, graphs in the KernelSpec cache are canonicalized before
being cached. I added a flag to the canonicalize pass to remove unique
names of values.
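
An illustrative sketch of the caching idea (not the fuser's real data structures):

```python
import re

kernel_cache = {}

def canonicalize(graph):
    # Rename %values in order of first appearance so that graphs differing
    # only in variable names compare equal.
    names = {}
    def fresh(match):
        return names.setdefault(match.group(0), "%v" + str(len(names)))
    return re.sub(r"%\w+", fresh, graph)

def fusion_key(graph):
    # One compiled kernel per canonical graph form.
    return kernel_cache.setdefault(canonicalize(graph), len(kernel_cache))

assert fusion_key("%a = aten::add(%x, %y)") == fusion_key("%b = aten::add(%p, %q)")
```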

This shortens the compile time for a JIT LSTM (seq_len of 100, loop
unroll factor of 8) from 5.3s to 2.3s. Most of this compile time is
running the graph fuser and/or fusion compiler; while this PR
makes it so that there is only one unique kernel in the forward pass,
there are a lot of different kernels (6) in the backward pass
(after loop unrolling) that should be investigated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14541

Differential Revision: D13324487

Pulled By: zou3519

fbshipit-source-id: b841d82ed35a959b5cfc72db033bf5a7b42cc4fb
2018-12-13 07:54:35 -08:00
aa022313cb Removes THCNumerics usages in RNN.cu (#15085)
Summary:
We don't need THCNumerics here since at::Half can be implicitly converted to float and the cuda math dispatches are handled by `/usr/local/cuda/include/crt/math_functions.hpp` and `cmath`. ATen should be free of THCNumerics after this and when porting kernels from THC, one should not use THCNumerics.

Should close: https://github.com/pytorch/pytorch/issues/11878
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15085

Differential Revision: D13447558

Pulled By: soumith

fbshipit-source-id: 4ff5cbf838edcd01e2d1397e4d7f4f920e9e9fc3
2018-12-13 00:24:17 -08:00
1e0eab5df8 minimize header file includes from _avx2.cc (#14950)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14950

Minimize the number of headers included from _avx2.cc files to avoid accidental compilation of functions defined in the header files reused by other translation units, which can lead to illegal instruction errors.

Reviewed By: dskhudia

Differential Revision: D13394483

fbshipit-source-id: 67149a6fb51f7f047e745bfe395cb6dd4ae7c1ae
2018-12-13 00:18:11 -08:00
4b97a46421 Disable strict-overflow flag to avoid compilation error (#14977)
Summary:
Disable strict-overflow flag to avoid compilation error
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14977

Differential Revision: D13447577

Pulled By: soumith

fbshipit-source-id: 1957bd5aa3c7b79219da3dd53560464977c89526
2018-12-12 22:41:33 -08:00
1e93317b99 Remove "early-release beta" disclaimer from README (#15136)
Summary:
Now that PyTorch 1.0 is out, this should be updated :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15136

Differential Revision: D13447377

Pulled By: soumith

fbshipit-source-id: bd4e662c53d0699f25d4d90c1b4c1e182b4427c2
2018-12-12 22:14:14 -08:00
fabd23cb2d support casting to string (#15110)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15110

support casting to string on CPU

Reviewed By: intermilan

Differential Revision: D13429381

fbshipit-source-id: b737a1ba1237b10f692d5c42b42a544b94ba9fd1
2018-12-12 21:33:58 -08:00
1717ea1da0 Implementation of ChannelShuffle Op for MKLDNN (#15106)
Summary:
The speed-up of a single operation is up to 3X.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15106

Differential Revision: D13429596

Pulled By: bddppq

fbshipit-source-id: f8d987cafeac9bef9c3daf7e43ede8c6a4ee2ce5
2018-12-12 20:25:12 -08:00
895cb8fcea Fix resize for edge case tensors (#14874)
Summary:
Certain tensor shapes failed when being resized. This pull request addresses the bug found in #13404.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14874

Differential Revision: D13429788

Pulled By: soumith

fbshipit-source-id: 8aa6451dbadce46d6d1c47a01cb26e6559bcfc8c
2018-12-12 19:56:23 -08:00
78a77667dd Autoformat build_variables.py (#15152)
Summary:
autoformat `tools/build_variables.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15152

Differential Revision: D13445343

Pulled By: goldsborough

fbshipit-source-id: fd63588de114cb92deda03fa1a0b36f5f9082b2f
2018-12-12 19:30:17 -08:00
fab78827d6 don't compile dnnlowp.cc in avx2 option (#15147)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15147

Forgot to take out dnnlowp.cc from avx2 list in a previous diff.

Reviewed By: dskhudia

Differential Revision: D13440686

fbshipit-source-id: 9ada98b6e885c7d5f22c91a735ff60304480b4cb
2018-12-12 18:57:09 -08:00
d8260239a0 docs: minor spelling tweaks
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15148

Differential Revision: D13443708

Pulled By: suo

fbshipit-source-id: 5e3ec0afd3416ab8ce207f2d04105c49e1c04611
2018-12-12 18:17:14 -08:00
2211a283d2 Export defs.bzl to open source for pytorch (#15132)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15132

Pull Request resolved: https://github.com/facebook/fbshipit/pull/64

Reviewed By: dzhulgakov

Differential Revision: D13424093

fbshipit-source-id: bbebef964b9f3aef8f59cd394eca068680c36b5a
2018-12-12 17:40:29 -08:00
107c9ef518 Add back c2 string_utils include header to benchmark_helper
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15143

Differential Revision: D13439694

fbshipit-source-id: 78698b66d52a0178118cbf3e79a7a5ad1763d47b
2018-12-12 16:38:00 -08:00
6610ace28b use ROCm 1.9.2 fp16 capabilities in rocBLAS and MIOpen interfaces (#14994)
Summary:
* relax the MIOpen if statement to allow fp16/fp32 mixed-precision training, now supported by ROCm 1.9.2
* use the gemm_ex API of rocBLAS in ROCm 1.9.2 instead of the previous hgemm API
* with this: enable all but one half test in test_nn

While there, also fix:
* a group convolution issue with MIOpen, pertaining to properly initializing MIOpen on multi-GPU systems, that we detected while working on this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14994

Differential Revision: D13439869

Pulled By: bddppq

fbshipit-source-id: 75e4eb51a59488882e64b5eabdc30555b25be25e
2018-12-12 16:16:47 -08:00
f34d827007 Optimize CPU GenerateProposals op by lazily generating anchors (3-5x faster) (#15103)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15103

There are two main optimizations in this diff:
1. Previously, we generated all anchors for every spatial grid cell first, and then
applied NMS to pick 2000 anchors according to RPN_PRE_NMS_TOP_N. First sorting the
scores, picking the top 2000, and then lazily generating only the
corresponding anchors is much faster (see the sketch after this list).
2. Transposing bbox_deltas from (num_anchors * 4, H, W) to
(H, W, num_anchors * 4) was also quite slow, taking about 20ms in the RRPN
case when there are lots of anchors, while it's negligible for the RPN case (like
0.1 ms). Instead of transposing, performing all operations in the
(num_anchors, H, W) format speeds things up.
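
A sketch of optimization (1); generate_anchor is a hypothetical stand-in for the per-cell anchor computation:

```python
import numpy as np

def top_k_anchors(scores, k, generate_anchor):
    order = np.argsort(-scores)[:k]              # indices of the k highest scores
    return [generate_anchor(int(i)) for i in order]

scores = np.random.rand(35 * 38 * 38)            # e.g. 35 anchors per grid cell
picked = top_k_anchors(scores, 2000, lambda i: ("anchor", i))
```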

For regular RPN scenario, this gives 5x speedup from 5.84ms to 1.18ms a case
with 35 anchors over a 600x600 image.

For rotated boxes with 245 anchors, the runtime down from 80ms to 27ms per
iter.

Reviewed By: newstzpz

Differential Revision: D13428688

fbshipit-source-id: 6006b332925e01a7c9433ded2ff5dc9e6d96f7d3
2018-12-12 15:53:52 -08:00
90f9e8103c Implement torch.tril_indices and torch.triu_indices (#12653) (#14904)
Summary:
This is an optimized implementation that does the following:

1. Create an empty Tensor of the correct size.
2. Fill the Tensor with the correct values.

The following three designs to fill in the Tensor result in roughly the same performance. Hence, the 2nd option is taken for simpler code, and to return contiguous tensors.

1. Sequential: fill row coordinates first, then columns. This results in two for-loop and more arithmetic operations.
2. Interleaved: fill in index coordinates one by one, which jumps between the two output Tensor rows in every iteration.
3. Transpose: create a n X 2 Tensor, fill the Tensor sequentially, and then transpose it.

<img width="352" alt="screen shot 2018-12-10 at 3 54 39 pm" src="https://user-images.githubusercontent.com/16999635/49769172-07bd3580-fc94-11e8-8164-41839185e9f9.png">

NOTE:

This implementation returns a 2D tensor, instead of a tuple of two tensors. It means that users will not be able to do the following:

```python
x = torch.ones(3, 3)
i = torch.tril_indices(3, 3)
x[i]  # need to first convert the 2D tensor into a tuple of two 1D tensors.
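x[i[0], i[1]]  # works: the two rows act as separate row/column index tensors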
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14904

Reviewed By: zou3519

Differential Revision: D13433027

Pulled By: mrshenli

fbshipit-source-id: 41c876aafcf584832d7069f7c5929ffb59e0ae6a
2018-12-12 15:40:14 -08:00
342e62f1e3 Minor documentation mistake (#15068)
Summary:
keepdim is an optional parameter for torch.max()
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15068

Differential Revision: D13437745

Pulled By: zou3519

fbshipit-source-id: b5198c7d4ae17758cd136f6e5aecc6cb5838f174
2018-12-12 15:24:26 -08:00
5837320b70 Add script standard library documentation + cleanup (#14912)
Summary:
Documents what is supported in the script standard library.

* Adds `my_script_module._get_method('forward').schema()` method to get function schema from a `ScriptModule`
* Removes `torch.nn.functional` from the list of builtins. The only functions not supported are `nn.functional.fold` and `nn.functional.unfold`, but those currently just dispatch to their corresponding aten ops, so from a user's perspective it looks like they work.
* Allow printing of `IValue::Device` by getting its string representation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14912

Differential Revision: D13385928

Pulled By: driazati

fbshipit-source-id: e391691b2f87dba6e13be05d4aa3ed2f004e31da
2018-12-12 12:30:13 -08:00
64b3364209 Move adaptive avg pooling 2d to ATen native (#14714)
Summary:
adaptive_avg_pool1d, adaptive_avg_pool2d, and adaptive_avg_pool3d are neural network functions that are currently implemented in our legacy THNN (CPU) / THCUNN (CUDA) libraries.  It is generally better if these live in our new library ATen, since it is more feature-complete and reduces cognitive overhead.

This change moves adaptive_avg_pool1d and adaptive_avg_pool2d to ATen.

timed relevant cpu tests with this change:
```
[ialex@devgpu064.ash5 ~/pytorch] time python test/test_nn.py
test_AdaptiveAvgPool1d (__main__.TestNN)
test_AdaptiveAvgPool1d_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_single (__main__.TestNN)
test_AdaptiveAvgPool2d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_single (__main__.TestNN)
test_AdaptiveAvgPool3d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none_cuda (__main__.TestNN)
test_adaptive_log_softmax (__main__.TestNN)
test_adaptive_pooling_input_size (__main__.TestNN)
test_adaptive_pooling_size_none (__main__.TestNN)
.s.s.s.s.s.s.s...
----------------------------------------------------------------------
Ran 17 tests in 6.273s

OK (skipped=7)

real	0m7.164s
user	3m1.289s
sys	0m0.905s
```

compared to master:
```
[ialex@devgpu064.ash5 ~/pytorch] time python test/test_nn.py
test_AdaptiveAvgPool1d (__main__.TestNN)
test_AdaptiveAvgPool1d_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_single (__main__.TestNN)
test_AdaptiveAvgPool2d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_single (__main__.TestNN)
test_AdaptiveAvgPool3d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none_cuda (__main__.TestNN)
test_adaptive_log_softmax (__main__.TestNN)
test_adaptive_pooling_input_size (__main__.TestNN)
test_adaptive_pooling_size_none (__main__.TestNN)
.s.s.s.s.s.s.s...
----------------------------------------------------------------------
Ran 17 tests in 7.232s

OK (skipped=7)

real	0m8.065s
user	3m34.714s
sys	0m2.440s
```

also timed relevant cuda tests with this change:
```
[ialex@devgpu064.ash5 ~/pytorch] time python test/test_nn.py
test_AdaptiveAvgPool1d (__main__.TestNN)
test_AdaptiveAvgPool1d_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_single (__main__.TestNN)
test_AdaptiveAvgPool2d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_single (__main__.TestNN)
test_AdaptiveAvgPool3d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none_cuda (__main__.TestNN)
test_adaptive_log_softmax (__main__.TestNN)
test_adaptive_pooling_input_size (__main__.TestNN)
test_adaptive_pooling_size_none (__main__.TestNN)
.................
----------------------------------------------------------------------
Ran 17 tests in 21.049s

OK

real	0m24.106s
user	0m20.890s
sys	0m4.026s
```

compared to master
```
[ialex@devgpu064.ash5 ~/pytorch] time python test/test_nn.py
test_AdaptiveAvgPool1d (__main__.TestNN)
test_AdaptiveAvgPool1d_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_single (__main__.TestNN)
test_AdaptiveAvgPool2d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool2d_tuple_none_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_single (__main__.TestNN)
test_AdaptiveAvgPool3d_single_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_cuda (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none (__main__.TestNN)
test_AdaptiveAvgPool3d_tuple_none_cuda (__main__.TestNN)
test_adaptive_log_softmax (__main__.TestNN)
test_adaptive_pooling_input_size (__main__.TestNN)
test_adaptive_pooling_size_none (__main__.TestNN)
.................
----------------------------------------------------------------------
Ran 17 tests in 23.021s

OK

real	0m27.095s
user	0m20.121s
sys	0m3.668s
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14714

Differential Revision: D13384084

Pulled By: xnder

fbshipit-source-id: 344442103ccbbda72d3c010d2feea00e9985d226
2018-12-12 12:25:22 -08:00
63e77ab6c4 Move numa.{h, cc} to c10/util (#15024)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15024

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14393

att

Reviewed By: dzhulgakov

Differential Revision: D13380559

fbshipit-source-id: abc3fc7321cf37323f756dfd614c7b41978734e4
2018-12-12 12:21:10 -08:00
b34ab435ef Stop erroneously running aten::warn (#15124)
Summary:
Fixes #15119. Before this PR, we were propagating constants through
aten::warn AND running it as part of shape analysis.
This caused aten::warn to be run regardless of whether it is
supposed to be run dynamically. This PR adds an exclusion for aten::warn
in constant propagation and shape analysis, similar to that of prim::RaiseException.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15124

Differential Revision: D13432815

Pulled By: zou3519

fbshipit-source-id: 15ab533ce2accb2da3fd4e569070c7979ce61708
2018-12-12 11:35:23 -08:00
2d485ffb17 Move CUDAGuard, CUDAStream and CUDAGuardImpl to c10/cuda (#14248)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14248

This diff also introduces a horrifying hack to override CUDA's DeviceGuardImpl
with a HIPGuardImplMasqueradingAsCUDA, to accommodate PyTorch's current
behavior of pretending CUDA is HIP when you build with ROCm enabled.

Reviewed By: bddppq

Differential Revision: D13145293

fbshipit-source-id: ee0e207b6fd132f0d435512957424a002d588f02
2018-12-12 11:24:26 -08:00
9943cf2378 Kill Type.storage. (#15075)
Summary:
It's not used.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15075

Reviewed By: ezyang

Differential Revision: D13422487

Pulled By: gchanan

fbshipit-source-id: 272aa0a10e96f3ffb97d571490b517f972b9dcf7
2018-12-12 10:57:54 -08:00
9d2955c39c fix infinite loop when get_max_threads is nonzero but num_threads is 1
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15114

Differential Revision: D13431891

Pulled By: umanwizard

fbshipit-source-id: f968b8e50cf776c346d4a28d72b12e7856c95839
2018-12-12 10:04:18 -08:00
68ad9ae5be Ensure there aren't variables in checked_tensor_unwrap, checked_tensor_list_unwrap (#15105)
Summary:

These functions use unsafeGetTensorImpl(), which doesn't work with Variables (in a silent way that may blow up later).
So let's do early checking.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15105

Reviewed By: ezyang

Differential Revision: D13429149

Pulled By: gchanan

fbshipit-source-id: b85f6f5b7cdb9a6dd0c40205b924c840a3920ba0
2018-12-12 09:58:03 -08:00
0ad39ec5c1 Add better support for bools in the graph fuser (#15057)
Summary:
Fixes #15038.

aten::_cast_Float(tensor, non_blocking) support was added in #14336.
Its second argument is a bool, but because we don't support generating values
of type bool in the fuser codegen, the codegen errored out.

aten::_cast_Float in the fuser never actually uses its non_blocking
argument, so another way to fix this would be to have a special op for a
fused cast. But I thought that we might have fusible ops that do take
bool arguments in the future, so this would be good to have.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15057

Differential Revision: D13432091

Pulled By: zou3519

fbshipit-source-id: 455fe574f5f080aca9a112e346b841a2534a8dc3
2018-12-12 09:39:44 -08:00
f36a84b71b fix some tests that I accidentally disabled (#15077)
Summary:
While moving these scenarios into `_test_dim_ops` I accidentally left an empty loop in the actual tests, causing them to do nothing.
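
The bug pattern, reconstructed for illustration (not the actual test code):

```python
def test_sum_dim(self):
    for dim in range(3):
        pass  # body was moved into _test_dim_ops; the loop now tests nothing
```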
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15077

Differential Revision: D13428759

Pulled By: umanwizard

fbshipit-source-id: 08f53068981d9192c1408878b168e9053f4dc92e
2018-12-12 09:25:34 -08:00
3ae684266a Don't setup x86_64-linux-gnu-gcc as an sccache wrapper. (#15078)
Summary:
When I do this setup in a local Docker development environment,
I get the following error:

    x86_64-linux-gnu-gcc: error trying to exec 'cc1plus': execvp: No such file or directory

Somehow, gcc seems to get confused when it gets run from the wrong
directory.  Best not to do it.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15078

Differential Revision: D13432143

Pulled By: ezyang

fbshipit-source-id: b18e15f493503a4c8205c85f92a214e49762a7bc
2018-12-12 08:01:03 -08:00
00a4c8d41c Use c10::to_string that works cross platform (#15117)
Summary:
Fix master breakage introduced in #15108
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15117

Differential Revision: D13430568

Pulled By: bddppq

fbshipit-source-id: ce10bc552f085d1bf0afbc13119991bee014ac95
2018-12-12 02:58:49 -08:00
1423c0d9f1 Add EmptyNameScope to allow you jump out from current scope. (#14631)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14631

Adds an empty name scope to allow people to jump out of the current namescope.

This could be useful when you want to access a blob from a parent or sibling scope.
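
A hedged usage sketch (assuming the context-manager spelling this diff introduces; `core.ScopedName` prepends the current scope):

```python
from caffe2.python import core

with core.NameScope("gpu_0"):
    print(core.ScopedName("w"))        # "gpu_0/w"
    with core.EmptyNameScope():
        print(core.ScopedName("w"))    # "w": back at the root scope
```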

 Facebook:

e.g.: we encountered a potential use case in D13124249 (it's a large diff, please search for EmptyNameScope in that diff); we need to access a blob declared in the root namescope from a device namescope (device namescope has been used by the parallel_GPU API). `EmptyNameScope` can help us do that with ease.

I referenced `EmptyDeviceScope` (D6103412) while implementing this one.

Reviewed By: yinghai

Differential Revision: D13272240

fbshipit-source-id: d4cde5abcc2336e456b6c6ef086266ef94d86da8
2018-12-12 01:39:50 -08:00
479481b6cb Remove linker and dlopen flags that allowed undefined symbols in rocm build (#15091)
Summary:
Previously the undefined symbols were caused by disabled_modules in tools/amd_build/disabled_features.json (now it's cleared).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15091

Differential Revision: D13429595

Pulled By: bddppq

fbshipit-source-id: b341e83f9e5a8d16440a364e837b045a8a4fd6e1
2018-12-11 23:23:47 -08:00
0dade9862c Fix serialization (#15033)
Summary:
Fixes a bug where a hierarchy of submodules in which one submodule has no parameters, but its own submodules do, didn't get properly (de-)serialized. This had to do with the fact that the old protobuf format couldn't store empty parameters.

Fixes https://github.com/pytorch/pytorch/issues/14891

soumith ezyang ebetica
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15033

Differential Revision: D13411322

Pulled By: goldsborough

fbshipit-source-id: 2ef73b2aa93fa9e46b1cbe1fd47d9f134d6016d5
2018-12-11 22:43:36 -08:00
e20f9bbead Update the output format for benchmark_helper. It outputs the dimension first and all the values in the next line (#15108)
Summary:
This way, it can output an arbitrary blob.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15108

Reviewed By: llyfacebook

Differential Revision: D13429346

Pulled By: sf-wind

fbshipit-source-id: 5e0bba2a46fbe8d997dfc3d55a698484552e3af8
2018-12-11 22:24:56 -08:00
b07ee44f40 Pre-commit flake8/clang-tidy (#15102)
Summary:
Provide a pre-commit hook that does flake8 and clang tidy checks. Enables the clang-tidy script to run in parallel to make it fast enough to be used in a pre-commit hook.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15102

Reviewed By: soumith

Differential Revision: D13429629

Pulled By: zdevito

fbshipit-source-id: bd52fe5652f29b033de8d9926d78350b2da4c2fc
2018-12-11 22:18:18 -08:00
f8455ed754 add gloo support for gather on GPU (#14916)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14916

as titled

Reviewed By: pietern

Differential Revision: D13267832

fbshipit-source-id: 3b89d08af93f74941f17ff892c33fc2a4a023c19
2018-12-11 21:21:10 -08:00
3fa53da61a Fix include paths for UndefinedTensorImpl.h
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14818

Reviewed By: ezyang

Differential Revision: D13348042

fbshipit-source-id: 11bdfc755767ce9d0a6fa95b2cf49d50adde8d60
2018-12-11 21:01:45 -08:00
63db95dd11 Move UndefinedTensorImpl to c10 (meh) (#14817)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14817

unfortunately, we still need this.

Reviewed By: ezyang

Differential Revision: D13348041

fbshipit-source-id: e8dcc89f5c71bd1ea2c9813990dac6e58e63b1fd
2018-12-11 21:01:42 -08:00
2dfdbef91d Fix include paths for TensorImpl.h
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14816

Reviewed By: ezyang

Differential Revision: D13348040

fbshipit-source-id: a7204d89c2dd277d13093b0ed862f40b53dee82f
2018-12-11 21:01:40 -08:00
9e9e87c19e Move TensorImpl to c10 (yay!)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14795

Reviewed By: ezyang

Differential Revision: D13336856

fbshipit-source-id: 5375d0e42312ff7564f4df06210a5e49542d59e3
2018-12-11 21:01:38 -08:00
bff6d42cef Add at::scalar_tensor factory function, use it instead of Type.scalar… (#15074)
Summary:
…_tensor.

This is part of a long series of paring down the Type interface.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15074

Differential Revision: D13421482

Pulled By: gchanan

fbshipit-source-id: 84010ee71fef2cb74d32d5de7858d8ed9f36b885
2018-12-11 20:37:41 -08:00
b710642969 Make ATen HIPify out-of-place, but still reuse CUDA names. (#14866)
Summary:
```
    This diff changes the HIPification of ATen to be out-of-place.
    We now have the following mappings:

    - ATen/cuda => ATen/hip
    - ATen/native/cuda => ATen/native/hip
    - ATen/native/sparse/cuda => ATen/native/sparse/hip
    - THC => THH
    - THCUNN => THHUNN

    The build system is adjusted to know about these new build paths,
    and HIPify is taught how to adjust include paths and
    THC_GENERIC_FILE appropriately.  ATen_hip is now built as
    the ATen_hip library, rather than reusing ATen_cuda.

    However, despite these new filepaths, none of the identifiers in ATen
    have actually changed.  So, e.g., THHGeneral.h still defines functions
    named THC_blahblah, and HIP still shows up as CUDA in PyTorch itself.
    We'll tackle this in a subsequent PR; this diff is just to get the files
    out-of-place.

    Minor extra improvements:

    - Don't edit tmp_install when hipifying
    - HIP no longer builds native_cudnn_cpp; it was unnecessary
    - Caffe2_HIP_INCLUDES is now Caffe2_HIP_INCLUDE, for consistency
      with all the other variables.
    - HIP build now properly respects ATEN_CUDA_FILES_GEN_LIB (it
      did not previously.)
    - You can now override file extension matching in pyHIPIFY
      by explicitly specifying its full name in the matching list.
      This is used so we can HIPify CMakeLists.txt in some situations.

    A little bit of string and ceiling wax:

    - gen.py grows a --rocm flag so that it knows to generate CUDA
      files which actually refer to the HIP headers (e.g., THH.h)
      We'll get rid of this eventually and generate real HIP files,
      but not for this PR.
    - Management of HIP dependencies is now completely deleted
      from the ATen CMakeLists.txt.  The old code was dead (because
      it was shoveled in ATen_CUDA_DEPENDENCY_LIBS and promptly
      ignored by the Caffe2 build system) and didn't actually work.
```
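
For illustration, here is a small Python sketch of the directory mappings listed above (the real rewriting lives in the pyHIPIFY scripts; this is not the actual implementation):

```
# Illustrative only: the mappings are taken from the list above.
HIP_PATH_MAP = [
    # more specific prefixes first, so "THCUNN" is not
    # clobbered by the shorter "THC" rule
    ("ATen/native/sparse/cuda", "ATen/native/sparse/hip"),
    ("ATen/native/cuda", "ATen/native/hip"),
    ("ATen/cuda", "ATen/hip"),
    ("THCUNN", "THHUNN"),
    ("THC", "THH"),
]

def hipify_path(path):
    for cuda_dir, hip_dir in HIP_PATH_MAP:
        if cuda_dir in path:
            return path.replace(cuda_dir, hip_dir)
    return path

assert hipify_path("aten/src/ATen/native/cuda/Loss.cu") == \
    "aten/src/ATen/native/hip/Loss.cu"
```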

Stacked on https://github.com/pytorch/pytorch/pull/14849 review last commit only
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14866

Differential Revision: D13419475

Pulled By: ezyang

fbshipit-source-id: cb4c843df69a1d8369314c9fab1b7719520fa3db
2018-12-11 19:15:27 -08:00
5c2c40ad87 Add error type to raise statement
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15039

Differential Revision: D13419566

Pulled By: zou3519

fbshipit-source-id: f67a3aebce937e3e640e91e81eb3e184cfdf269c
2018-12-11 17:41:44 -08:00
73ee7fda4c Remove deprecated variable_tensor_functions (#15003)
Summary:
Removing the deprecated functions in `torch/csrc/variable_tensor_functions.h` (like `torch::CPU`) and corresponding implementations from `torch/csrc/torch.cpp` from master after the release.

ezyang gchanan soumith
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15003

Differential Revision: D13418086

Pulled By: goldsborough

fbshipit-source-id: a0accdf6f7b0efa1ec07ac7b74b86ff2da37543f
2018-12-11 17:16:11 -08:00
0552326846 add gloo scatter support on GPU (#14917)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14917

as titled

Reviewed By: pietern

Differential Revision: D13271560

fbshipit-source-id: 0187a3390f8ebd72a2c074e7a651432159d427c0
2018-12-11 17:11:13 -08:00
92314c83fa re-enable copy of python files, but be careful that the copy is only … (#14982)
Summary:
…done once

This allow no-op build to work correctly even when BUILD_CAFFE2_OPS is on.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14982

Differential Revision: D13413960

Pulled By: zdevito

fbshipit-source-id: 6e5412a8c375af8a47c76f548cdd31cff15f3853
2018-12-11 16:54:08 -08:00
71e0cb505c Split off fuser tests in test_jit.py to their own test case (#15072)
Summary:
This PR creates TestFuser inside test_jit.py to be a home for graph fuser
specific tests.

This was a useful exercise because now that all the fuser tests are in
one place, I can spot redundant and bitrotting tests for cleanup in a
future PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15072

Differential Revision: D13421458

Pulled By: zou3519

fbshipit-source-id: 80b1a7712feff75a0c186d1664601c4edbbca694
2018-12-11 14:55:06 -08:00
7408ce2f80 Supress warnings on generated tests
Summary: Removes all warnings spew for the TestJitGenerated tests

Differential Revision: D13420919

fbshipit-source-id: f251c12f923088ccc5daa2984c15003a67cbd1c1
2018-12-11 14:00:41 -08:00
04b65dfd1f Issue 14984: Remove divide by zero error in index_put_ (#14986)
Summary:
No check for a zero-element index tensor was done in the accumulate=True (serial) case in the new TensorIterator code since https://github.com/pytorch/pytorch/pull/13420.

https://github.com/pytorch/pytorch/issues/14984
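
A minimal repro sketch of the fixed case (a zero-element index should make the call a no-op, not a divide-by-zero):

```
import torch

t = torch.zeros(5)
idx = torch.empty(0, dtype=torch.long)  # zero-element index tensor
vals = torch.empty(0)
t.index_put_((idx,), vals, accumulate=True)  # no-op after this fix
```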
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14986

Differential Revision: D13417861

Pulled By: colesbury

fbshipit-source-id: e6ed1af8f708b53a35803fc157ed1f043169ec89
2018-12-11 13:38:12 -08:00
109c8d22dc Update onnx coverage script for more accurate result (#15029)
Summary:
The coverage of scalar-input test cases was not accurate. This patch fixes that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15029

Differential Revision: D13419764

Pulled By: zrphercule

fbshipit-source-id: a14a5cbef432bea8c9126156f5deb1125e1aeb47
2018-12-11 13:14:35 -08:00
f2f47de5ad tox.ini -> .flake8 (#15065)
Summary:
We were only using this file to configure flake8, and fbcode linters do not recognize tox.ini, which causes spurious linter warnings.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15065

Differential Revision: D13420774

Pulled By: suo

fbshipit-source-id: e43a46befa36862c8b3c0a90074aec6a66531492
2018-12-11 13:14:34 -08:00
ca7f8fed60 silence unreachable code warnings (#15036)
Summary:
Stack:
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **#15036 silence unreachable code warnings**&nbsp;&nbsp;[💛](https://our.intern.facebook.com/intern/diff/D13411100/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15036

Differential Revision: D13414712

Pulled By: li-roy

fbshipit-source-id: d4aa84571fa94c66f3c5bfa9575a10c6ee398f9e
2018-12-11 13:09:04 -08:00
d825b39061 improve deep equality check in alias annotation test (#15031)
Summary:
Previously we were returning true if either IValue wasn't a tensor, which…is bad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15031

Differential Revision: D13409759

Pulled By: suo

fbshipit-source-id: f8bdcd05d334c1276ce46f55812065d358c1ff5d
2018-12-11 12:14:00 -08:00
02d149b767 Fix race condition in ThreadPool::workOnTasksUntilCompleted (#14833)
Summary:
Resolves #14704
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14833

Differential Revision: D13405211

Pulled By: highker

fbshipit-source-id: 8552d51eeb5d3af0ed66c461e5ddfeb9ae2926bd
2018-12-11 11:46:58 -08:00
c2a754c58b Fix CMakeLists.txt for Int8 python bindings (#15047)
Summary:
Currently in caffe2, one cannot properly fetch the content of Int8 blobs.

Upon digging the source code, it turns out that the relevant source code is not being compiled. Adding the source to CMakeLists.txt fixes this issue.

First time ever doing a pull request. Please let me know if there's any rule I should follow. Thanks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15047

Differential Revision: D13417583

Pulled By: bddppq

fbshipit-source-id: dd39575971a3012635edbf97a045d80e4b62a8eb
2018-12-11 10:48:47 -08:00
687834dcb4 Install cpp tests when built (#15000)
Summary:
This is broken out of https://github.com/pytorch/pytorch/pull/13733/

We want to install cpp tests so they can ultimately be runnable from that location for Caffe2 tests run from PyTorch builds.

cc pjh5 yf225 anderspapitto
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15000

Reviewed By: pjh5

Differential Revision: D13416253

Pulled By: orionr

fbshipit-source-id: 51280be0a22557a742f90c9f303c58c35cbd4a38
2018-12-11 10:07:48 -08:00
5d3a347685 Stashing checkpointing RNG states based on devices of arg tensors (#14518)
Summary:
This PR intends to address apaszke's concerns in https://github.com/pytorch/pytorch/pull/14253#issuecomment-441740016.  Preserving the rng state is now controlled by a kwarg rather than a global state, hopefully in a python 2.7-compatible way.

Additionally, the checkpointing function stashes and restores the RNG states of
1. devices associated with all input tensor args to run_fn as well as
2. the current device.

I could easily change this to only save and restore the RNG states associated with 1. alone.  This would simplify the logic to create a [deduplicated, ordered](https://github.com/pytorch/pytorch/compare/master...mcarilli:checkpointing_rng_touchup?expand=1#diff-58da227fc9b1d56752b7dfad90428fe0R37) list of devices considered active.

I'm wondering if the [get_device_states](https://github.com/pytorch/pytorch/compare/master...mcarilli:checkpointing_rng_touchup?expand=1#diff-58da227fc9b1d56752b7dfad90428fe0R32) and [set_device_states](https://github.com/pytorch/pytorch/compare/master...mcarilli:checkpointing_rng_touchup?expand=1#diff-58da227fc9b1d56752b7dfad90428fe0R47) functions are general enough to reside elsewhere (presumably torch/random.py).  I'm also wondering if the check on [torch.cuda._initialized](https://github.com/pytorch/pytorch/compare/master...mcarilli:checkpointing_rng_touchup?expand=1#diff-58da227fc9b1d56752b7dfad90428fe0R47) would be better placed within `get_device_states`.
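
A rough Python sketch of the two helpers described above, simplified from what the checkpointing code does:

```
import torch

def get_device_states(*args):
    # deduplicated, ordered list of CUDA devices appearing in the tensor args
    devices = []
    for arg in args:
        if torch.is_tensor(arg) and arg.is_cuda and arg.get_device() not in devices:
            devices.append(arg.get_device())
    states = []
    for dev in devices:
        with torch.cuda.device(dev):
            states.append(torch.cuda.get_rng_state())
    return devices, states

def set_device_states(devices, states):
    for dev, state in zip(devices, states):
        with torch.cuda.device(dev):
            torch.cuda.set_rng_state(state)
```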
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14518

Differential Revision: D13356210

Pulled By: ezyang

fbshipit-source-id: afa4cc21ce7862142d5cb1dec3750018df222039
2018-12-11 09:48:45 -08:00
25ddd659c9 Updating submodules
Reviewed By: cdelahousse

fbshipit-source-id: d39b31f12ab2ab570548f3e8a65949332a64a0ff
2018-12-11 07:40:37 -08:00
bf1d411dbf Switch Int8Softmax, Int8Relu, and Int8LeakyRelu to QNNPACK (#14933)
Summary:
Int8Softmax: 4x-5x speedup compared to previous implementation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14933

Differential Revision: D13406820

Pulled By: Maratyszcza

fbshipit-source-id: ea8cbe1b861ddb7ff1b851d06d52c6fd6d04ed01
2018-12-11 00:49:06 -08:00
a1ea7dbe40 Adjust the API call to deserilize the tensorproto (#14132)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14132

as title

Reviewed By: jerryzh168

Differential Revision: D13110697

fbshipit-source-id: 822c9079de11951f90aec3d26f0e4108847e7dac
2018-12-10 22:54:42 -08:00
27d5ae7afb use datatype dependent tolerance in data parallel tests
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14856

Differential Revision: D13413560

Pulled By: soumith

fbshipit-source-id: b3a0cfe93477ed332e6eaa2e39ef5f4cc8b36481
2018-12-10 22:50:27 -08:00
81dc78d871 Update pooling.py (#14998)
Summary:
Strange line in the documentation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14998

Differential Revision: D13413235

Pulled By: soumith

fbshipit-source-id: 80d05ec1185719b785f0aac914bc2369c1174f2f
2018-12-10 22:36:20 -08:00
48a361cc62 Clean up casting ops (#14947)
Summary:
This removes FloatToInt-style names, replacing them with just the destination
name (e.g. FloatToInt -> Float). This makes it more consistent with the
syntax and makes it easier to add type conversions (just add a new
prim::Int op, for instance).

None of these ops get serialized, so this should not affect loading of
old models.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14947

Differential Revision: D13408409

Pulled By: zdevito

fbshipit-source-id: d773fe863f14d9de893f686832769f8cc8903a8e
2018-12-10 22:15:08 -08:00
cff509e2b1 share code between adagrad and rowwise adagrad tests (#14692)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14692

Remove some code duplication

Reviewed By: chocjy

Differential Revision: D13296731

fbshipit-source-id: 5924e037ca64fc4b89234be922bc5ca47fb8bd32
2018-12-10 22:10:39 -08:00
c48b15e41a TBB task graph (#15041)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15041

Adding an alternative implementation of a task graph based on TBB

Reviewed By: dmudiger

Differential Revision: D13412517

fbshipit-source-id: f5efedd680bbe0072bf38d504e5682ab51dd630f
2018-12-10 21:35:04 -08:00
45dfc6764e Enable more caffe2 fp16 rocm tests (#15040)
Summary:
cc rohithkrn petrex
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15040

Reviewed By: houseroad

Differential Revision: D13413068

Pulled By: bddppq

fbshipit-source-id: b2967f16f8da0b9e80083138fb8632c14e9e9b63
2018-12-10 21:30:21 -08:00
5022f9d6ef Enable the build of tests in ATen/core (#15032)
Summary:
Otherwise they won't build
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15032

Reviewed By: yinghai

Differential Revision: D13409801

Pulled By: houseroad

fbshipit-source-id: 95464aa8f3604835997ba1bb7f3c3e51485d1686
2018-12-10 21:24:54 -08:00
962b82dd81 More scaffolding for LegacyTHDispatch. (#14852)
Summary:
1) at::functions are now also exposed in the at::legacy::th namespace and we move relevant calls over to use them (to avoid merge conflicts)
2) LegacyTHDispatch now handles device-type initialization
3) We generate derived LegacyTHDispatchers, e.g. THLegacyCPULongDispatcher, although they are currently empty.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14852

Reviewed By: ezyang

Differential Revision: D13360852

Pulled By: gchanan

fbshipit-source-id: af6705aeba3593ea5dba9bfc62890e5257bc81f8
2018-12-10 19:57:01 -08:00
e9cd781681 Back out "Revert D13043261: [caffe2] Task graph and task future abstractions in executor"
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15030

Reviewed By: bddppq

Differential Revision: D13408998

fbshipit-source-id: 9eb675e09fbc4829eab34df7aa660a0590816feb
2018-12-10 19:30:58 -08:00
83f32eebd9 Tensor construction codemod - 2/3 (#14836)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14836

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: bddppq

Differential Revision: D13335176

fbshipit-source-id: 8d89510670e2cf70559d2f75e68f7181feb0b6d9
2018-12-10 19:30:56 -08:00
5222a1b190 Fixing reading of FBGEMM from env variables
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15023

Reviewed By: orionr

Differential Revision: D13406778

Pulled By: pjh5

fbshipit-source-id: 2265f01170fb7969cbdf4e44ca6ef183f5d8017d
2018-12-10 18:18:38 -08:00
a97cf568a4 Alignas Array struct (#14920)
Summary:
This PR aligns the Array struct such that cuda vector performance improvements can be utilized.

I tested this by using it on our Philox header. Note how the vector store instruction gets used for CUDA vector types and when using alignas on Array, vs. when not using alignas on Array.

With cuda vector type (uint4, uint2, float4): https://godbolt.org/z/UaWOmR
With alignas: https://godbolt.org/z/Eeh0t5
Without alignas: https://godbolt.org/z/QT63gq
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14920

Differential Revision: D13406751

Pulled By: soumith

fbshipit-source-id: 685b1010ef1f576dde30c278b1e9b642f87c843d
2018-12-10 17:58:03 -08:00
7e2b074219 Integrate rocBLAS fp16 api into Caffe2 (#14882)
Summary:
This PR integrates rocBLAS half and mixed precision APIs in to Caffe2.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14882

Differential Revision: D13407840

Pulled By: bddppq

fbshipit-source-id: 75cb0d74da066776fa66575f1d255e879d36121e
2018-12-10 17:54:06 -08:00
92f3616f36 Fix old tensor CopyFrom usage in boolean mask operator
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15025

Differential Revision: D13407323

Pulled By: bddppq

fbshipit-source-id: 1bc1d28ad0c6c71d25d788549be18917e393ee50
2018-12-10 17:23:45 -08:00
4fcc2fffc3 unit test with multiple omp threads (#14958)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14958

Test with multiple threads

Reviewed By: jianyuh

Differential Revision: D13394791

fbshipit-source-id: 931a6c3bda15ebc816807e537dd0841c383e7a6f
2018-12-10 17:23:44 -08:00
9b272c08cf Remove partially initialized Tensor in Deserialization (#14197)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14197

Pull Request resolved: https://github.com/pytorch/pytorch/pull/13642

Previously we passed in a partially initialized Tensor to Deserialize and it filled
it with the result of deserializing a tensor proto. Now we want it to return
a Tensor directly, since it's just a shared pointer to TensorImpl.

Reviewed By: dzhulgakov

Differential Revision: D12874357

fbshipit-source-id: 12b80a763375da23cfa64a74d6bc186d8d03b94f
2018-12-10 17:17:29 -08:00
4a145cd95c Revert D13043261: [caffe2] Task graph and task future abstractions in executor
Differential Revision:
D13043261

Original commit changeset: d89424354aea

fbshipit-source-id: b307e3281c4d83b60ba2bfadcbcf69afb7a41412
2018-12-10 16:03:59 -08:00
0a36fe565d apply() for ScriptModules (#14655)
Summary:
This can be used to initialize state that is not necessarily eligible for serialization or is implementation-specific. Concretely, I'm going to use this to pack the weight matrices for quantized Linear modules according to the FBGEMM APIs.
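
A hedged usage sketch; `pack_weights` and the `_pack` hook are hypothetical stand-ins for the FBGEMM packing described above:

```
import torch

def pack_weights(module):
    if hasattr(module, "_pack"):
        module._pack()  # hypothetical repacking hook

class M(torch.jit.ScriptModule):
    def __init__(self):
        super(M, self).__init__()
        self.fc = torch.nn.Linear(4, 4)

m = M()
m.apply(pack_weights)  # visits submodules first, then m itself
```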
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14655

Differential Revision: D13404438

Pulled By: jamesr66a

fbshipit-source-id: 2d327cef5520fdd716b5b1b29effd60a049e8a4a
2018-12-10 15:40:31 -08:00
9bbb3efe2f Simplify THPPointer implementation for Storage. (#14897)
Summary:
We've virtualized the destructor for storage, so we
no longer have to forward to a particular backend.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14897

Differential Revision: D13399216

Pulled By: ezyang

fbshipit-source-id: 531d29c3f278477cfa8759f30ab4f304d695b659
2018-12-10 15:18:49 -08:00
23cc3daabd Disable getNumGPUs rewrite (#14993)
Summary:
cc iotamudelta

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14993

Differential Revision: D13405804

Pulled By: ezyang

fbshipit-source-id: c4aa9ed29ee2a4f3abf76c1e0fa8babfd738db35
2018-12-10 15:13:55 -08:00
6ad9f7b798 Fix include path for WrapDimMinimal.h
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14794

Reviewed By: dzhulgakov

Differential Revision: D13336842

fbshipit-source-id: ca49a9fd1d409d8a75e43eeb9b9b02c305ebb79a
2018-12-10 15:10:03 -08:00
279ec9ef7a Move WrapDimMinimal to c10
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14793

Reviewed By: ezyang

Differential Revision: D13336841

fbshipit-source-id: 4365a799e1856cc68dd94a273e97663fee5f51db
2018-12-10 15:10:01 -08:00
66315ab323 Stop disabling maybeOverlappingIndices (#14999)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc iotamudelta
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14999

Differential Revision: D13405754

Pulled By: ezyang

fbshipit-source-id: 98459496494390ad1115b4f1f6738d53c14f0745
2018-12-10 15:02:08 -08:00
483ba553bd add gloo allgather support on GPU (#14576)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14576

as titled

Reviewed By: pietern

Differential Revision: D13266063

fbshipit-source-id: e262f77d63724a7504a7112907bbfba49612fe75
2018-12-10 14:32:54 -08:00
029600813e Task graph and task future abstractions in executor
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14116

Reviewed By: dmudiger

Differential Revision: D13043261

fbshipit-source-id: d89424354aea14d1d14eb8320fb3aa34908a4e81
2018-12-10 14:28:56 -08:00
a51fe386c8 caffe2/caffe2/contrib/script (#15007)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15007

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14979

att

Reviewed By: dzhulgakov

Differential Revision: D13286191

fbshipit-source-id: b8a6bc7aea44487aea4dcf7f44c858fd30c6293c
2018-12-10 14:23:31 -08:00
25144c8a09 s/Torch Script/TorchScript/g (#15011)
Summary:
pls
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15011

Differential Revision: D13404158

Pulled By: suo

fbshipit-source-id: e906281463d65c86e4e9073eb0c0a26f4f29e307
2018-12-10 13:48:24 -08:00
110ccbb689 Improve the docs of interpolate(align_corners=) (#14806)
Summary:
ailzhang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14806

Reviewed By: ailzhang

Differential Revision: D13366332

Pulled By: ppwwyyxx

fbshipit-source-id: 08fcea95d5c86b11cdfe464fdd9daa50050871f1
2018-12-10 12:50:38 -08:00
e77de07448 Improve build time of register_symbols.cpp without compiler hacks (#14911)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14911

In optimized modes the compiler tries to inline all the
`unordered_map::operator[]` calls, creating a massive amount of code
which takes several minutes to optimize. Instead, create a table of
PODs and populate the maps using a simple loop.

Reviewed By: soumith, luciang

Differential Revision: D13382948

fbshipit-source-id: b6752921e0f7213595d26b39e4397f6a3897960b
2018-12-10 11:57:11 -08:00
18c93b87c2 Delete defunct THP_API.h header. (#14899)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14899

Differential Revision: D13383687

Pulled By: ezyang

fbshipit-source-id: f2a08a769cc3775ba55f9c58d622a83df622d816
2018-12-10 10:47:24 -08:00
1989157eb6 Disable test_leaf_variable_sharing on ASAN runs
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15001

Reviewed By: orionr

Differential Revision: D13399119

fbshipit-source-id: 6b1d098e55a67b1f5bc6d08a8ee3c1be8234a654
2018-12-10 10:43:05 -08:00
d30b6bf3b6 Revert D13306052: [pytorch][PR] Allow converting CharTensor to np arrays
Differential Revision:
D13306052

Original commit changeset: 202d038f139c

fbshipit-source-id: 11f6bdd687f8ea5ce2e5f28f48d19449a5c403eb
2018-12-10 10:36:17 -08:00
dc1e6d0b98 Non-INTERFACE AT_LINK_STYLE is dead code (#14822)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14822

Differential Revision: D13355574

Pulled By: ezyang

fbshipit-source-id: a7173084f8735424619b2e393df2715a05918b44
2018-12-10 09:42:53 -08:00
54d5c53826 Support torch.load with encoding (#14743)
Summary:
Addresses a common compatibility issue regarding bytes when loading Py2 checkpoints in Py3.

E.g.,
[1] https://github.com/pytorch/pytorch/issues/5994,
[2] https://github.com/CSAILVision/places365/issues/25,
[3] https://discuss.pytorch.org/t/how-to-load-a-saved-model-trained-on-pytorch-0-3-1-python-2-7-on-pyorch-1-0-python-3-7/31212
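
Typical usage after this change (the checkpoint path is a placeholder):

```
import torch

# checkpoint saved under Python 2, loaded under Python 3
state = torch.load('model_py2.pt', encoding='latin1')  # or encoding='bytes'
```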
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14743

Reviewed By: weiyangfb

Differential Revision: D13350888

Pulled By: soumith

fbshipit-source-id: 2df4e828a8b70509118a355307ca3ebe51e108f6
2018-12-10 08:07:36 -08:00
9b2bd284b3 Convert int8 numpy array to CharTensor (#14700)
Summary:
When rewriting `default_collate`, I noticed that `from_numpy`, `as_tensor`, and `tensor` all fail on `np.int8` arrays.
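
With this change, all three conversion paths accept `np.int8` arrays (a minimal sketch):

```
import numpy as np
import torch

a = np.array([1, -2, 3], dtype=np.int8)
t1 = torch.from_numpy(a)  # shares memory with a
t2 = torch.as_tensor(a)
t3 = torch.tensor(a)      # copies
assert t1.dtype == torch.int8
```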
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14700

Reviewed By: weiyangfb

Differential Revision: D13305297

Pulled By: soumith

fbshipit-source-id: 2937110f65ed714ee830d50098db292238e9b2a9
2018-12-10 07:39:06 -08:00
e1b5dbf699 Allow converting CharTensor to np arrays (#14710)
Summary:
The other direction of #14700

cc soumith
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14710

Reviewed By: weiyangfb

Differential Revision: D13306052

Pulled By: soumith

fbshipit-source-id: 202d038f139cf05e01069ff8d05268c66354c983
2018-12-10 07:35:28 -08:00
b039a715ce pre-pack operation of dnnlowp conv with 16-bit accumulation (#14881)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14881

This diff allows us to pre-quantize and pre-pack the weight matrix used in DNNLOWP_ACC16.
The intended use pattern is to run Int8ConvPackWeight in the init_net to generate a packed weight, which Int8Conv with the DNNLOWP_ACC16 engine then consumes, as sketched below.
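
A hedged sketch of that pattern; the operator inputs/outputs and argument names here are assumptions, not the exact schema:

```
from caffe2.python import core

init_net = core.Net("init")
# pre-quantize and pre-pack the weight once, at init time
init_net.Int8ConvPackWeight(["W"], ["W_packed"], engine="DNNLOWP_ACC16")

predict_net = core.Net("predict")
# the conv consumes the packed weight at inference time
predict_net.Int8Conv(["X", "W_packed", "b"], ["Y"],
                     kernel=3, engine="DNNLOWP_ACC16")
```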

Reviewed By: csummersea

Differential Revision: D13374662

fbshipit-source-id: dd02b9a4eb7af1fe208aa857fcd0b445e6e395af
2018-12-10 01:08:21 -08:00
e747acbebb Respect -q of setup.py (#14972)
Summary:
1. Changes the prints along the 'rebuild' pathway to respect the '-q' flag of setup.py.
A clean rebuild now prints only:

    [zdevito@devgpu172.prn2 /data/users/zdevito/pytorch] python setup.py -q rebuild develop
    [0/1] Install the project...
    -- Install configuration: "RelWithDebInfo"
    ninja: no work to do.
    ninja: no work to do.
    ninja: no work to do.
    ninja: no work to do.
    ninja: no work to do.
    ninja: no work to do.

2. Deletes apparently dead calls to `generate_code`. Now that CMake builds these files,
it appears that it is getting called twice and the second version is never used.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14972

Reviewed By: soumith

Differential Revision: D13396330

Pulled By: zdevito

fbshipit-source-id: 83c45143bbc6a6d2c1cfee929291ec059f2b5dc3
2018-12-09 22:47:49 -08:00
fab8085111 _get_device_index supports parsing device strings
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14929

Reviewed By: weiyangfb

Differential Revision: D13394498

Pulled By: soumith

fbshipit-source-id: 948c6118abdf6c1e1a8a17709333954cafb2345e
2018-12-09 21:12:46 -08:00
5fd69e7551 remove mingfeima mkldnn reference from README, as no longer necessary (#14975)
Summary: we now get mkldnn automatically from third_party/ideep

Differential Revision: D13396480

Pulled By: soumith

fbshipit-source-id: 20f819ba4b78cbe9c7d0baeab1c575669cbf6c20
2018-12-09 20:44:10 -08:00
aefc83f46d fixing some rebuild issues (#14969)
Summary:
This fixes rebuild issues with the ninja part of the build. With this patch all ninja files will now report `nothing to do` if nothing has changed assuming `BUILD_CAFFE2_OPS=0`.

1. This only does the python file processing for caffe2 when BUILD_CAFFE2_OPS=1; this part of the build file is written in such a way that it always has to rerun, and it can take substantial time moving files around in the no-op build. In the future this part should be rewritten to use a faster method of copying the files, or should treat copying the files as part of the build rules and only run when the files are out of date.

2. This points `sleef` to a patched version that fixes a dead build output that is causing everything to relink all the time. See https://github.com/shibatch/sleef/pull/231#partial-pull-merging for the upstream change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14969

Reviewed By: soumith

Differential Revision: D13395998

Pulled By: zdevito

fbshipit-source-id: ca85b7be9e99c5c578103c144ef0f2c3b927e724
2018-12-09 16:32:19 -08:00
fc30e2782c Remove deprecated info argument in btrifact (#14935)
Summary:
As specified in title.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14935

Differential Revision: D13394449

Pulled By: soumith

fbshipit-source-id: 569d59414f3a1a43ea641bded4b5433eb53e3490
2018-12-09 15:59:30 -08:00
86e03b8a30 add fix for CUDA 10 (#14971)
Summary:
Linux binaries-only fix for CUDA10
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14971

Differential Revision: D13395932

Pulled By: soumith

fbshipit-source-id: a72d6ab6b98c6c936e6391d55d2e4e45b9f1e6dd
2018-12-09 15:54:27 -08:00
5f2736b84a Fix mismatched test_{full,ones,zeros}_like onnx expect files (#14956)
Summary:
master broken #14903
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14956

Differential Revision: D13395363

Pulled By: bddppq

fbshipit-source-id: 31f0913843292e557807fd5a976f8907fa6cae4b
2018-12-09 08:57:14 -08:00
a1494efdfa fix auto grad summing for IfOp where intermediate output needs renaming (#14772)
Summary:
fix auto grad summing for IfOp where an intermediate output needs renaming.

Bug before this diff:
- we only renamed the output of IfOp without changing the subnet ops' outputs
- this resulted in a "blob not found" error

The unit test provides an example; this diff fixes that for IfOp.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14772

Differential Revision: D13327090

Pulled By: harouwu

fbshipit-source-id: ec40ee88526ace3619c54551e223dd71158a02f8
2018-12-09 08:26:46 -08:00
fa12e1e4d4 Export ones_like, zeros_like and full_like using ONNX ConstantLike op. (#14903)
Summary:
This PR does the following:
1) Updates the ONNX export for `torch.zeros_like` and `torch.full_like` ops to use the ONNX op `ConstantLike`. This reduces the export of the experimental op `ConstantFill`, which may be removed in the future (see https://github.com/onnx/onnx/pull/1434).
2) It also adds export support for `torch.ones_like`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14903

Differential Revision: D13383700

Pulled By: houseroad

fbshipit-source-id: 566d00a943e9497172fcd5a034b638a650ab13a2
2018-12-08 22:49:02 -08:00
517c7c9861 Canonicalize all includes in PyTorch. (#14849)
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.

I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.

I used the following script to do the canonicalization:

```
  import subprocess
  import re
  import os.path

  files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
  for fn in files:
      if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
          continue
      if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
          continue
      with open(fn, 'r') as f:
          c = f.read()
      def fmt(p):
          return "#include <{}>".format(p)
      def repl(m):
          p = m.group(1)
          if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
              return fmt(p)
          if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
              return fmt(p)
          for root in ["aten/src", "torch/lib", ""]:
              for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
                  new_p = os.path.relpath(os.path.join(bad_root, p), root)
                  if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
                      return fmt(new_p)
          print("ERROR: ", fn, p)
          return m.group(0)
      new_c = re.sub(r'#include "([^"]+)"', repl, c)
      if new_c != c:
          print(fn)
          with open(fn, 'w') as f:
              f.write(new_c)
```

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849

Reviewed By: dzhulgakov

Differential Revision: D13363445

Pulled By: ezyang

fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
2018-12-08 19:38:30 -08:00
a7b3197b2d race condition fix of calling mutable_data inside a openmp region (#14921)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14921

Fix race condition introduced in D13188595.
Let's remind ourselves: "never call mutable_data from an OpenMP region!!!"

Reviewed By: jianyuh

Differential Revision: D13387692

fbshipit-source-id: 6a3aeedeeda55a9ede660de8f1f44d4eee76ae2b
2018-12-08 18:17:20 -08:00
e9db9595d2 Add crop argument, can crop rec as well, first resize and then crop
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14894

Reviewed By: llyfacebook

Differential Revision: D13377604

Pulled By: sf-wind

fbshipit-source-id: 333d0d864e6c2dc85f405baa25ed58029d62750f
2018-12-08 11:14:56 -08:00
b0909ea6a0 Switch Int8Sigmoid to QNNPACK (#14883)
Summary:
50x-100x speedup compared to current version.
Also, fixes a bug in the current version when batch size exceeds 1 (current version processes only the first image in this case).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14883

Differential Revision: D13390655

Pulled By: Maratyszcza

fbshipit-source-id: 1b33a97bf2d0866d38faa2b42e64fd2859017898
2018-12-08 02:47:29 -08:00
5e06fa0baf ONNX changes to use int32_t (instead of enum) to store data type
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14926

Reviewed By: houseroad

Differential Revision: D13390642

Pulled By: bddppq

fbshipit-source-id: c2314b24d9384f188fda2b9a5cc16465ad39581e
2018-12-08 01:06:08 -08:00
c8a5ec14dd Remove at references from c10
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14432

Reviewed By: dzhulgakov

Differential Revision: D13223904

fbshipit-source-id: 43b06e33e088e7789ccea6d92267936fe30d8571
2018-12-08 00:28:35 -08:00
25110d61fb Implement std for multiple dimensions on CPU devices. (#14535)
Summary:
Tested on a tensor with 1 billion elements and 3 dimensions on a powerful, highly
multi-core Linux machine.

parallelized: All operations (e.g., `t.std(1)`) that could be done in the old code are now several times faster. All
new operations (e.g., `t.std((0,2))`) are significantly faster than the NumPy equivalents.
`t.std((0, 1, 2))`, a new operation, is logically equivalent to the
old `t.std()`, but faster.

serial: The above comment about old operations now being faster still
holds, but `t.std((d1, ..., dn))` over all dimensions is now a few
times slower than `t.std()`. If this turns out to be important, we can
special-case that to use the old algorithm.

The approach is to create a new method, `TensorIterator::foreach_reduced_elt`,
valid for `TensorIterator`s that represent a dimension reduction. This
method calls a supplied function for each element in the output,
supplying it with the input elements that correspond to that output.

Given that primitive, we can implement reductions like the following pseudocode:

If there is more than one output element:
```
PARALLEL FOR EACH element IN output:
    accumulator = identity
    SERIAL FOR EACH data_point IN element.corresponding_input:
        accumulator.update(data_point)
    element = accumulator.to_output()
```

If there is only one output element, we still want to parallelize, so we
do so along the *input* instead:

```
accumulators[n_threads]
PARALLEL FOR EACH input_chunk IN input.chunks():
    accumulators[thread_num()] = identity
    SERIAL FOR EACH data_point IN input_chunk:
        accumulators[thread_num()].update_with_data(data_point)
accumulator = identity
SERIAL FOR EACH acc in accumulators:
    accumulator.update_with_other_accumulator(acc)
output_element = accumulator.to_output()
```

Note that accumulators and data points do not have to be the same type
in general, since it might be necessary to track arbitrary amounts of
data at intermediate stages.

For example, for `std`, we use a parallel version of Welford's
algorithm, which requires us to track the mean, second moment, and number
of elements, so the accumulator type for `std` contains three pieces of
data.
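
For concreteness, here is a small Python sketch of such a Welford accumulator, including the merge step used when combining per-thread accumulators (illustrative only; the real implementation is in C++):

```
class Welford(object):
    def __init__(self):
        self.mean, self.m2, self.n = 0.0, 0.0, 0

    def update(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def merge(self, other):
        # Chan et al. parallel combination of two accumulators
        n = self.n + other.n
        if n == 0:
            return
        delta = other.mean - self.mean
        self.mean = (self.n * self.mean + other.n * other.mean) / n
        self.m2 += other.m2 + delta * delta * self.n * other.n / n
        self.n = n

    def std(self, unbiased=True):
        # caller must ensure n > 1 for the unbiased estimate
        return (self.m2 / (self.n - 1 if unbiased else self.n)) ** 0.5
```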
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14535

Differential Revision: D13283887

Pulled By: umanwizard

fbshipit-source-id: 8586b7bf00bf9f663c55d6f8323301e257f5ec3f
2018-12-07 20:16:04 -08:00
c2a75926ca Add CAFFE2_API to video processing functions (#14900)
Summary:
Extracted from https://github.com/pytorch/pytorch/pull/13733

Some tests were failing because these methods didn't have an export.

cc pjh5 yf225
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14900

Reviewed By: pjh5

Differential Revision: D13381130

Pulled By: orionr

fbshipit-source-id: 030536f8fb09765c09a7b0bd45400161053f2e18
2018-12-07 19:55:21 -08:00
52942e1f09 Enable unit tests known to work on ROCm (#14011)
Summary:
* Enable unit tests known to work on ROCm.
* Disable a few that are known to be flaky for the time being.
* Use std::abs for Half
* No more special casing for ROCm in TensorMathReduce
* Document an important detail for a hardcoded block size w.r.t. ROCm in TensorMathReduce

ezyang bddppq for awareness
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14011

Differential Revision: D13387679

Pulled By: bddppq

fbshipit-source-id: 4177f2a57b09d866ccbb82a24318f273e3292f71
2018-12-07 18:57:32 -08:00
5be28ade66 Automatic update of fbcode/onnx to aca8473a40cf43f01958c81b648efcee7f3a755a (#14865)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14865

Previous import was 42804705bdbf179d1a98394008417e1392013547

Included changes:
- **[aca8473](https://github.com/onnx/onnx/commit/aca8473)**: Add Erf operator for computing error function (#1675) <bddppq>
- **[3fc82ca](https://github.com/onnx/onnx/commit/3fc82ca)**: Add IsNaN operator. (#1656) <Pranav Sharma>
- **[0685f01](https://github.com/onnx/onnx/commit/0685f01)**: Add Sign Op (#1658) <Rui Zhu>
- **[2a8fae8](https://github.com/onnx/onnx/commit/2a8fae8)**: Fix unused var warning (#1669) <Yinghai Lu>
- **[e212833](https://github.com/onnx/onnx/commit/e212833)**: Update scan (#1653) <G. Ramalingam>

Reviewed By: zrphercule

Differential Revision: D13370727

fbshipit-source-id: 13a93d5acc8d4758f682278ea162ec9124ced22d
2018-12-07 17:37:42 -08:00
11a9248d01 Enable fp16 for MIOPEN operators in Caffe2 (#14905)
Summary:
This PR enables fp16 MIOPEN operators in Caffe2.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14905

Differential Revision: D13383439

Pulled By: bddppq

fbshipit-source-id: 840afa8d08bef2952ca0039dee2423f1542bb330
2018-12-07 17:26:44 -08:00
70598740ec Upgrade MKL-DNN to version 0.17 (#14308)
Summary:
upgrade MKL-DNN to version 0.17
update mkldnn bridge to latest.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14308

Differential Revision: D13383102

Pulled By: yinghai

fbshipit-source-id: c434f0e0ddff2ee2c86db2d6c44a37298fd005a3
2018-12-07 16:44:50 -08:00
478eb70c07 Fix build with OpenCV 4.0 (#14356)
Summary:
Fixes #14355
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14356

Differential Revision: D13356237

Pulled By: bddppq

fbshipit-source-id: 2bf6ee21995c2c7b617c4e78ea7341f975f1b937
2018-12-07 16:40:31 -08:00
4453a1ff88 Remove unused TensorImpl dependencies
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14792

Reviewed By: ezyang

Differential Revision: D13336843

fbshipit-source-id: 12f84799a70c2e90a8b934dd8dc031c09a6782f0
2018-12-07 16:23:48 -08:00
65aa11a876 Remove TensorImpl -> context_base dependency (#14658)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14658

Remove this dependency by moving at::CopyBytes to c10.
The implementations for at::CopyBytes will have to live in aten/caffe2 for now because they're not unified for CUDA yet.
They'll be moved into c10/backend/xxx later.

Reviewed By: dzhulgakov

Differential Revision: D13288655

fbshipit-source-id: 1c92379345308b3cd39a402779d7b7999613fc0d
2018-12-07 16:23:46 -08:00
086a37876b Fix include paths for TensorOptions
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14747

Reviewed By: ezyang

Differential Revision: D13318645

fbshipit-source-id: f5ba77a93f6019fbf5faffb47a2837c95fad474d
2018-12-07 16:23:44 -08:00
459aac4f24 Update graph printouts in JIT docs (#14914)
Summary:
Tracing records variable names and we have new types and stuff in the IR, so this updates the graph printouts in the docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14914

Differential Revision: D13385101

Pulled By: jamesr66a

fbshipit-source-id: 6477e4861f1ac916329853763c83ea157be77f23
2018-12-07 15:08:53 -08:00
5734e96775 Improve hub documentation (#14862)
Summary:
Added a few examples and explanations of how to publish/load models.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14862

Differential Revision: D13384790

Pulled By: ailzhang

fbshipit-source-id: 008166e84e59dcb62c0be38a87982579524fb20e
2018-12-07 14:59:01 -08:00
65da7ddad6 USE_FBGEMM=True by default
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14868

Differential Revision: D13383390

Pulled By: jamesr66a

fbshipit-source-id: 1880c07dfd239e19153bd4fde2ab2c8d0604f956
2018-12-07 14:22:55 -08:00
a0ee3a279c USE_TENSORRT support and TensorRT 5 compatibility
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/13945

Differential Revision: D13317525

Pulled By: yinghai

fbshipit-source-id: 8630dfec1bbc5aac19539e344e7c38a7fd8b051d
2018-12-07 14:01:11 -08:00
febc7ff99f Add __init__.py so files get picked up on install (#14898)
Summary:
This will let us install tests and other Caffe2 python code as a part of running Caffe2 tests in PyTorch.

Broken out of https://github.com/pytorch/pytorch/pull/13733/

cc pjh5 yf225
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14898

Reviewed By: pjh5

Differential Revision: D13381123

Pulled By: orionr

fbshipit-source-id: 0ec96629b0570f6cc2abb1d1d6fce084e7464dbe
2018-12-07 13:40:23 -08:00
efc5e9f71a Replace calls of Type::_th_tensor. (#14877)
Summary:
_th_tensor is moving off Type, so these calls need to be replaced.

Unfortunately, replacing these with a full-fledged solution [e.g. from_storage(..., TensorOptions)] is a bit complicated because the storage itself fully defines the Type (modulo variable).  It's simpler to just wait for the Variable/Tensor merge rather than to solve this now, so instead I changed the call sites to: at::empty({0}, type.options()).set_(storage...).

This isn't great because we are also trying to get rid of Type::options, but this seems to be the lesser of two evils.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14877

Differential Revision: D13374310

Pulled By: gchanan

fbshipit-source-id: eb953ed041507e6190d6f32e383912e5a08311cd
2018-12-07 13:04:48 -08:00
d6c53328f9 Large scale fix of python-related files in torch/csrc/
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14515

Differential Revision: D13247966

Pulled By: goldsborough

fbshipit-source-id: 7a127c508fc576a7a92626dd6b729f660162d628
2018-12-07 13:04:46 -08:00
939877bf4b Implementation of WeightedSum op for mkl-dnn and fix FC op output shape issue.
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14407

Reviewed By: yinghai

Differential Revision: D13364364

Pulled By: wesolwsk

fbshipit-source-id: e69bcd1bc52e35b2f0e45e5dc40184f1bd66605d
2018-12-07 12:35:19 -08:00
265b55d028 Revert D13205604: Move numa.{h, cc} to c10/util
Differential Revision:
D13205604

Original commit changeset: 54166492d318

fbshipit-source-id: 89b6833518c0b554668c88ae38d97fbc47e2de17
2018-12-07 10:01:25 -08:00
1c9df7facf Expose torch.roll function and method (#14880)
Summary: Fixes #14859.
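
Example usage (elements shifted past the last position wrap around to the first):

```
import torch

x = torch.arange(6).view(2, 3)
torch.roll(x, shifts=1, dims=1)
# tensor([[2, 0, 1],
#         [5, 3, 4]])
```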

Differential Revision: D13376915

Pulled By: zou3519

fbshipit-source-id: f1fc0e8492a159431a3fc0a19a41aa10429ecc80
2018-12-07 07:42:47 -08:00
6651fae827 Make autograd engine compatible with hip
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14873

Differential Revision: D13375053

Pulled By: bddppq

fbshipit-source-id: f3051640386667bbf0566856ed433eb83276c39e
2018-12-07 00:12:06 -08:00
6e453e56f9 Fixed ConvT docstring (#14876)
Summary:
Fixes #14099

I attempted to be as consistent as possible with the formatting, which is why my equation reads d*(k - 1) instead of (k - 1)*d.

Also there is an unused variable on line 46: `n = self.in_channels`. I could fix that here too if that's not too out of scope.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14876

Differential Revision: D13374317

Pulled By: soumith

fbshipit-source-id: a9f110acafa58cdb4206956dbe3ab4738d48292d
2018-12-06 23:57:30 -08:00
51d26e76f7 Updating submodules
Reviewed By: yns88

fbshipit-source-id: 7da015701f18f8a0b5a8092aae02a42ede7bfd44
2018-12-06 22:52:22 -08:00
4655b7bc4b Remove weak module test expect files (#14871)
Summary:
This PR removes some expect files that aren't really testing anything
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14871

Differential Revision: D13373762

Pulled By: driazati

fbshipit-source-id: e3537ee83df23b3b3b854f9b1253fd0cc8e9dd33
2018-12-06 21:55:12 -08:00
1a247f872f gradcheck (#14596)
Summary:
- allow gradcheck to take sparse tensors as input
- sparse outputs are not allowed yet in gradcheck
- add backward for `to_dense()` to get around sparse outputs
- call gradcheck from test_sparse, so that we can use `_gen_sparse()` and also easily cover coalesced/uncoalesced test cases (see the sketch below)
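
A minimal sketch of the pattern this enables (a small COO tensor in double precision so the numerical checks pass; exact invocation details may differ by version):

```
import torch
from torch.autograd import gradcheck

i = torch.tensor([[0, 1], [1, 0]])
v = torch.tensor([3.0, 4.0], dtype=torch.double)
x = torch.sparse_coo_tensor(i, v, (2, 2), requires_grad=True)

# route through to_dense() since sparse outputs are not supported yet
assert gradcheck(lambda t: t.to_dense(), (x,))
```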
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14596

Differential Revision: D13271904

Pulled By: weiyangfb

fbshipit-source-id: 5317484104404fd38058884c86e987546011dd86
2018-12-06 18:03:38 -08:00
bfa666eb0d Skipping two c10d tests only if there are multi-GPUs (#14860)
Summary:
Otherwise, these tests will fail, even though they are never meant to run on single-GPU machines.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14860

Differential Revision: D13369060

Pulled By: teng-li

fbshipit-source-id: 8a637a6d57335491ba8602cd09927700b2bbf8a0
2018-12-06 17:28:07 -08:00
ada8f828f9 Move TensorOptions, DefaultTensorOptions to c10
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14746

Reviewed By: ezyang

Differential Revision: D13318644

fbshipit-source-id: b703d7dc67e75d9e9571c80d62a100c5fc4e84df
2018-12-06 15:59:04 -08:00
bd3eb87258 Switch Int8MaxPool operator to QNNPACK (#14832)
Summary:
1.6-2.4X speedup on ARM when compiled with gcc
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14832

Differential Revision: D13358160

Pulled By: Maratyszcza

fbshipit-source-id: 39e9791886fac62650bb53a9df341889f0bb5d49
2018-12-06 15:14:28 -08:00
e6a420114f collect_env.py: get conda magma and mkl information (#14854)
Summary:
Fixes #12371
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14854

Differential Revision: D13363635

Pulled By: zou3519

fbshipit-source-id: f8b5d05038bf5ce451399dfeed558ae298178128
2018-12-06 14:58:14 -08:00
ddca0442b6 Add LogSigmoid support in ONNX symbolic (#14830)
Summary:
Add LogSigmoid:

torch.LogSigmoid(x) = onnx.Log(onnx.Sigmoid(x))
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14830

Differential Revision: D13353891

Pulled By: zrphercule

fbshipit-source-id: bf456170b9e6c4edad07b3333cd5797f8e0fa97f
2018-12-06 14:17:33 -08:00
5f0bff9639 Kill GPU memory logs in normal runs (#14838)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14838

The GPU memory tracking logs are incredibly annoying and merely serve
to pollute output. I `VLOG(1)`ed them. Hopefully, this is non-controversial.

Reviewed By: kuttas

Differential Revision: D13343290

fbshipit-source-id: b3cae99346c97b66e97ea660061e15dc5c99b9fc
2018-12-06 13:51:14 -08:00
f82f4de229 Stop inserting static casts in Hipify (#14853)
Summary:
Latest hcc can now properly cast to the correct type internally, so there is no need to insert static_cast in the hipify scripts anymore.
However, the hcc included in the latest ROCm release (1.9.2) doesn't have this fix, so we leave a flag to continue doing static_cast for those using the official ROCm releases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14853

Differential Revision: D13363171

Pulled By: bddppq

fbshipit-source-id: a36476a8511222ff3c933d31788e8a0ffb04f5ca
2018-12-06 13:19:33 -08:00
b5db6ac9f1 Tensor construction codemod - 3/3 (#14835)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14835

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: bddppq

Differential Revision: D13335184

fbshipit-source-id: 26d8247e16b30bdff045530034af9b72c76d066f
2018-12-06 11:50:59 -08:00
20d1bff292 Tensor construction codemod - 1/3 (#14828)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14828

Codemod generated with clangr shard mode, 25 files per diff,
motivation: https://github.com/pytorch/pytorch/pull/12407

Reviewed By: bddppq

Differential Revision: D13335160

fbshipit-source-id: a3ae4c5a86bfbdaf2d5aa14e0eef57255e829fd4
2018-12-06 11:47:32 -08:00
1d111853ae Move numa.{h, cc} to c10/util (#14393)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14393

att

Reviewed By: ezyang

Differential Revision: D13205604

fbshipit-source-id: 54166492d31827b0343ed070cc36a825dd86e2ed
2018-12-06 11:30:13 -08:00
75a2d8e2de Upgrade CI to ROCm 1.9.2 (#14216)
Summary:
Drop custom hcc/hip as the 1.9.2 release should contain the relevant patches therein.

Most notable feature in 1.9.2 is mixed precision support in rocBLAS and MIOpen. These features will be enabled by subsequent PRs.

bddppq ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14216

Differential Revision: D13354294

Pulled By: bddppq

fbshipit-source-id: 2541d4a196af21c9432c1aff7f6e65b572628028
2018-12-06 10:13:39 -08:00
1c8d41a08d Allow linspace and logspace with steps=1 and start != end like numpy (#14748)
Summary:
`torch.linspace(0, 1, 1)` fails with `RuntimeError: invalid argument 3: invalid number of points at ../aten/src/TH/generic/THTensorMoreMath.cpp:2119`, while `np.linspace(0, 1, 1)` works fine.
Looking at the code, there is even a comment by gchanan asking: "NumPy allows you to pass different points even if n <= 1 -- should we?"
I would say "yes". Currently, I would need to handle the case of `steps == 1` or `steps == 0` separately, making sure to change the `end` when calling `torch.linspace`. This is impractical. If we support `start != end`, there are two possibilities for the result: Either we ensure the first value in the resulting sequence always equals `start`, or we ensure the last value in the resulting sequence always equals `end`. Numpy chose the former, which also allows it to support a boolean `endpoint` flag. I'd say we should follow numpy.

This PR adapts `linspace` and `logspace` to mimic the behavior of numpy, adapts the tests accordingly, and extends the docstrings to make clear what happens when passing `steps=1`.

If you decide against this PR, the error message should become explicit about what I did wrong, and the documentation should be extended to mention this restriction.
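
After this change, mirroring NumPy, the first value of the sequence equals `start` when `steps=1`:

```
import torch

torch.linspace(0, 1, steps=1)  # tensor([0.])
torch.logspace(0, 3, steps=1)  # tensor([1.])
```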
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14748

Differential Revision: D13356136

Pulled By: ezyang

fbshipit-source-id: db85b8f0a98a5e24b3acd766132ab71c91794a82
2018-12-06 09:30:55 -08:00
Jie
d2fdc33411 (#14580)
Summary:
Removes the cast of half to float in torch.sum with a float16 input tensor and
float32 output tensor; instead we cast the data when loading the input in the kernel.

This supposedly saves a kernel launch as well as a full global memory load
of the promoted data type (float).
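
The promoted-dtype path in question, for illustration (assumes a CUDA device):

```
import torch

x = torch.randn(1024, device='cuda', dtype=torch.float16)
s = x.sum(dtype=torch.float32)  # fp16 input, fp32 accumulation and output
```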
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14580

Differential Revision: D13356203

Pulled By: ezyang

fbshipit-source-id: 85e91225b880a65fe3ceb493371b9b36407fdf48
2018-12-06 09:03:46 -08:00
eb3cabffd6 Consistent formatting in losses' docs
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14739

Differential Revision: D13356143

Pulled By: ezyang

fbshipit-source-id: 9ae8316dd8ba6e910247b64cec22db63df10e11c
2018-12-06 09:01:24 -08:00
2e7cc86a62 Add (partial) autodiff support for nll_loss (#14305)
Summary:
Not ready yet; I need some comments/help with this. It's good enough for the immediate goals of https://github.com/pytorch/xla (forward + backward trace fusion), but there are at least two issues with it:

1. If we don't allow it, `test/test_jit.py` fails to cover the change.
2. If we allow the weight to be set, running `test/test_jit.py TestJitGenerated.test_nn_nll_loss` fails with:

```
======================================================================
ERROR: test_nn_nll_loss (__main__.TestJitGenerated)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "test/test_jit.py", line 10001, in do_test
    fn, f_args_variable, kwargs_variable, no_grad=no_grad)
  File "test/test_jit.py", line 9360, in check_against_reference
    outputs_test = self.runAndSaveRNG(func, recording_inputs, kwargs)
  File "test/test_jit.py", line 425, in runAndSaveRNG
    results = func(*inputs, **kwargs)
  File "test/test_jit.py", line 9298, in script_fn
    self.assertExportImport(CU.the_method.graph, tensors)
  File "test/test_jit.py", line 415, in assertExportImport
    self.assertExportImportModule(m, inputs)
  File "test/test_jit.py", line 419, in assertExportImportModule
    self.assertEqual(self.runAndSaveRNG(m.forward, inputs),
  File "test/test_jit.py", line 425, in runAndSaveRNG
    results = func(*inputs, **kwargs)
RuntimeError:
arguments for call are not valid:

  for operator aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor out) -> Tensor:
  expected a value of type Tensor for argument 'total_weight' but found bool
  <internally-created-node>
  ~ <--- HERE

  for operator aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor:
  expected a value of type Tensor for argument 'total_weight' but found bool
  <internally-created-node>
  ~ <--- HERE
for call at:
<internally-created-node>
~ <--- HERE
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14305

Differential Revision: D13356265

Pulled By: ezyang

fbshipit-source-id: 504d783b2d87f923e698a6a4efc0fd9935a94a41
2018-12-06 08:58:54 -08:00
e7bd8457a6 Updating submodules
Reviewed By: yns88

fbshipit-source-id: 2adbb6f97d4b8f067a2538fec855063510b0ca3f
2018-12-06 08:58:53 -08:00
6039c7611f Updating submodules
Reviewed By: yns88

fbshipit-source-id: e0509413215f3b7578b825c52365fec4da625bd5
2018-12-06 02:55:47 -08:00
12addc64a6 Fixed MIOpen RNN Segfault issue and enabled RNN test (#14810)
Summary:
This pull request contains changes for:
1. Added MIOpen RNN API miopenGetRNNLayerBiasSize and miopenGetRNNLayerParamSize.
2. Fixed usage of API miopenGetRNNLayerParam.
3. Modifying the RNN test to run using MIOpen engine.

Differential Revision: D13355699

Pulled By: bddppq

fbshipit-source-id: 6f750657f8049c5446eca893880b397804120b69
2018-12-05 23:54:31 -08:00
39d50ef4f6 Export complete subgraph io info when calling onnxGetBackendCompatibility (#14827)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14827

We need to send complete IO info when doing `onnxGetBackendCompatibility` to backends like Glow. Previously we were missing some info because sometimes we generate more than one node from one C2 op. This fixes the issue.

Reviewed By: jackm321

Differential Revision: D13352049

fbshipit-source-id: 8d8ac70656a0ac42f3a0ccecad61456a4f3b2435
2018-12-05 23:52:06 -08:00
ba287eebca Fix clip gradient with empty input (#14709)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14709

As titled

Reviewed By: Wakeupbuddy

Differential Revision: D13305554

fbshipit-source-id: 380062d4b0e4f9dc0207a27766cac7b8d05384d5
2018-12-05 22:53:25 -08:00
997df9a6ec Remove protobuf dependency in pytorch cmake file. (#14182)
Summary:
Currently, PyTorch doesn't depend on protobuf, so we don't need to include the protobuf dir in the PyTorch CMake file.
Moreover, if we build Caffe2 without the custom protobuf[1], we run into a protobuf mismatch problem.

[1]
92dbd0219f/CMakeLists.txt (L65)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14182

Differential Revision: D13356273

Pulled By: ezyang

fbshipit-source-id: 8120c3452d158dc51d70156433d7b9076c6aed47
2018-12-05 22:49:50 -08:00
3799d32b7b Optimize images (#14084)
Summary:
This is a PR that [ImgBot](https://imgbot.net/) opened on my fork https://github.com/zasdfgbnm/pytorch/pull/1; I am forwarding it here. ImgBot performs lossless compression on images to reduce file size.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14084

Differential Revision: D13356293

Pulled By: ezyang

fbshipit-source-id: 731236d95ad870db8ccb99b03ed306704365242c
2018-12-05 22:46:32 -08:00
e27d77815d Prevent profile_observer_test from being run by CPU test (#14168)
Summary:
Fix CMakeLists.txt so that the CPU test suite won't run profile_observer_test.cc, which currently only supports GPU.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14168

Differential Revision: D13356274

Pulled By: ezyang

fbshipit-source-id: 7d105f2e18675e5fab129864958148b0f18d582c
2018-12-05 22:34:29 -08:00
14fb651b5f CAFFE2_INCLUDE_DIRS points to invalid path (#14306)
Summary:
I know that including CAFFE2_INCLUDE_DIRS in the include paths is not necessary with newer CMake versions. But I had this in one of my old projects, and **CMake gave me an error that "/usr/lib/include" is an invalid path**.

It seems like "${_INSTALL_PREFIX}/lib/include" should be changed to "${_INSTALL_PREFIX}/include", as all Caffe2 headers are in /include rather than /lib/include/.

Please correct me if I am wrong.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14306

Differential Revision: D13356246

Pulled By: ezyang

fbshipit-source-id: e2d5d3c42352e59b245714ad90fd7a9ef48170d7
2018-12-05 22:32:04 -08:00
5e307bd1be use "Extension" instead of the unimported "setuptools.Extension" (#14475)
Summary:
use "Extension" instead of the unimported "setuptools.Extension"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14475

Differential Revision: D13356219

Pulled By: ezyang

fbshipit-source-id: 5a3e7eb73a32d6bf09676efd9eddded5586435cd
2018-12-05 22:18:47 -08:00
d393dd0744 generate ATen core files with LF. (#14667)
Summary:
On Windows, some ATen core files (Type.h, Tensor.h, TensorMethods.h) are generated with CRLF line endings (this may be environment dependent).
As a result, the file comparison in generate_outputs() fails and compilation stops.
This patch forces these files to be generated with LF line endings.
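As an illustration (not the patch itself), Python's `newline` argument is one way to force LF when writing generated files:

```python
# Hypothetical sketch: newline="\n" forces LF line endings even on
# Windows, so a regenerated file compares byte-for-byte with the
# previously generated one.
generated_source = "// generated ATen header\n"
with open("Type.h", "w", newline="\n") as f:
    f.write(generated_source)
```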
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14667

Differential Revision: D13356170

Pulled By: ezyang

fbshipit-source-id: ef8cc3a6cc8bf3c45b78e9eb3df98cf47c0d33bb
2018-12-05 22:14:29 -08:00
2d60afbc90 Remove outdated css file and refs in cpp conf.py (#14779)
Summary:
pytorch_theme.css is no longer necessary for the cpp or html docs site build. The new theme styles are located at https://github.com/pytorch/pytorch_sphinx_theme. The Lato font is also no longer used in the new theme.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14779

Differential Revision: D13356125

Pulled By: ezyang

fbshipit-source-id: c7635eb7512c7dcaddb9cad596ab3dbc96480144
2018-12-05 21:55:45 -08:00
82903dda9b Fixes for some Windows compiler warnings (#14490)
Summary:
Implement some simple fixes to clean up the Windows build by fixing compiler warnings. Three main types of warnings were fixed:

1. GCC-specific pragmas were changed to not be used on Windows.
2. CMake flags that don't exist on Windows were removed from the Windows build.
3. A macro that was defined multiple times on Windows was fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14490

Differential Revision: D13241988

Pulled By: ezyang

fbshipit-source-id: 38da8354f0e3a3b9c97e33309cdda9fd23c08247
2018-12-05 21:27:07 -08:00
a6399121da Shut up "address will always evaluate to 'true'" warnings (#14774)
Summary:
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14774

Differential Revision: D13327969

Pulled By: ezyang

fbshipit-source-id: 43380c89eedaaa89467952401b8fd3f5a9ad754a
2018-12-05 21:18:31 -08:00
f9446e0c94 HIPify less files in PyTorch (#14804)
Summary:
Stacked on #14803
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14804

Differential Revision: D13347986

Pulled By: ezyang

fbshipit-source-id: c93177b4ad51855660d0de36d042bfc542bd4be0
2018-12-05 20:52:38 -08:00
ba0ebe33c1 Unify device argument parsing between torch and c10
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14786

Differential Revision: D13334501

Pulled By: bddppq

fbshipit-source-id: ae3536be1fe0dcd6a1552ec93629ecc9554c0d7c
2018-12-05 18:37:32 -08:00
252e9058d4 Improve assertion failure message (#14813)
Summary:
See #14554.

I can't figure out how the reported issue can happen. The next best
thing is to have more information when this happens again.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14813

Differential Revision: D13351908

Pulled By: pietern

fbshipit-source-id: 61b30fcae2e34da54329d0893ca4921b6ad60f0d
2018-12-05 17:20:25 -08:00
83ad52634a Add FunctionSchema based Operator Registry (#13789)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13789

This enables creation of operators with FunctionSchema and IValue

Reviewed By: smessmer

Differential Revision: D13008791

fbshipit-source-id: 151efc88ac315f4a0ab0171a99774caaf767ef1e
2018-12-05 17:20:24 -08:00
67dcf10631 Increase test timeout (#14814)
Summary:
It is possible that some sort of contention causes process scheduling
delays, which in turn cause the timeout to *not* be hit.

Increased sleep here will decrease the probability of this happening.

Fixes #14555.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14814

Differential Revision: D13351924

Pulled By: pietern

fbshipit-source-id: 1222cf0855408dfcb79f30f94694c790ee998cf9
2018-12-05 17:18:11 -08:00
c02b3e7cea Retry test on address already in use error (#14815)
Summary:
Thanks nairbv for the suggestion.

Also see #14589.

Fixes #14703.
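A rough sketch of the retry pattern (hypothetical names, not the PR's code):

```python
import errno
import socket
import time

def bind_with_retry(port, retries=3):
    # Retry when another process still holds the port (EADDRINUSE),
    # the flaky failure mode this PR works around in the tests.
    for attempt in range(retries):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.bind(("127.0.0.1", port))
            return s
        except OSError as e:
            s.close()
            if e.errno != errno.EADDRINUSE or attempt == retries - 1:
                raise
            time.sleep(0.5 * (attempt + 1))  # back off before retrying
```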
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14815

Differential Revision: D13351913

Pulled By: pietern

fbshipit-source-id: d11a4152505d0ce15592b13e417bb80551476a61
2018-12-05 17:09:46 -08:00
6fccca4278 improve ONNX tests on torch.Linear
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14821

Reviewed By: zrphercule

Differential Revision: D13348773

Pulled By: houseroad

fbshipit-source-id: 611ca6e28f715e5518649c8c16f702ac3433308c
2018-12-05 17:07:10 -08:00
1921 changed files with 41644 additions and 40585 deletions

View File

@ -1,7 +1,7 @@
# IMPORTANT: To update Docker image version, please search and update ":{previous_version}"
# in this file to the new version number, and **ALSO** update the version number below:
# PyTorchDockerVersion:262
# Caffe2DockerVersion:230
# Caffe2DockerVersion:238
docker_config_defaults: &docker_config_defaults
user: jenkins
@ -117,7 +117,7 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults
<<: *setup_ci_environment
- run:
name: Test
no_output_timeout: "90m"
no_output_timeout: "1h"
command: |
set -e
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
@ -800,7 +800,7 @@ jobs:
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:238"
CUDA_VERSION: "8"
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
<<: *caffe2_linux_build_defaults
@ -808,7 +808,7 @@ jobs:
caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test:
environment:
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn6-ubuntu16.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn6-ubuntu16.04:238"
CUDA_VERSION: "8"
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn6-ubuntu16.04"
resource_class: gpu.medium
@ -817,7 +817,7 @@ jobs:
caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:238"
CUDA_VERSION: "9"
BUILD_ENVIRONMENT: "py2-cuda9.0-cudnn7-ubuntu16.04"
<<: *caffe2_linux_build_defaults
@ -825,7 +825,7 @@ jobs:
caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:238"
CUDA_VERSION: "9"
BUILD_ENVIRONMENT: "py2-cuda9.0-cudnn7-ubuntu16.04"
resource_class: gpu.medium
@ -834,7 +834,7 @@ jobs:
caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:238"
CUDA_VERSION: "9.1"
BUILD_ENVIRONMENT: "py2-cuda9.1-cudnn7-ubuntu16.04"
<<: *caffe2_linux_build_defaults
@ -842,7 +842,7 @@ jobs:
caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:238"
CUDA_VERSION: "9.1"
BUILD_ENVIRONMENT: "py2-cuda9.1-cudnn7-ubuntu16.04"
resource_class: gpu.medium
@ -851,14 +851,14 @@ jobs:
caffe2_py2_mkl_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-mkl-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-mkl-ubuntu16.04"
<<: *caffe2_linux_build_defaults
caffe2_py2_mkl_ubuntu16_04_test:
environment:
JOB_BASE_NAME: caffe2-py2-mkl-ubuntu16.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-mkl-ubuntu16.04"
resource_class: large
<<: *caffe2_linux_test_defaults
@ -866,14 +866,14 @@ jobs:
caffe2_py2_gcc4_8_ubuntu14_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-gcc4.8-ubuntu14.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:238"
BUILD_ENVIRONMENT: "py2-gcc4.8-ubuntu14.04"
<<: *caffe2_linux_build_defaults
caffe2_py2_gcc4_8_ubuntu14_04_test:
environment:
JOB_BASE_NAME: caffe2-py2-gcc4.8-ubuntu14.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:238"
BUILD_ENVIRONMENT: "py2-gcc4.8-ubuntu14.04"
resource_class: large
<<: *caffe2_linux_test_defaults
@ -881,14 +881,14 @@ jobs:
caffe2_onnx_py2_gcc5_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-onnx-py2-gcc5-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:238"
BUILD_ENVIRONMENT: "onnx-py2-gcc5-ubuntu16.04"
<<: *caffe2_linux_build_defaults
caffe2_onnx_py2_gcc5_ubuntu16_04_test:
environment:
JOB_BASE_NAME: caffe2-onnx-py2-gcc5-ubuntu16.04-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:238"
BUILD_ENVIRONMENT: "onnx-py2-gcc5-ubuntu16.04"
resource_class: large
<<: *caffe2_linux_test_defaults
@ -896,7 +896,7 @@ jobs:
caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-cuda8.0-cudnn7-ubuntu16.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -904,7 +904,7 @@ jobs:
caffe2_py2_gcc4_9_ubuntu14_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-gcc4.9-ubuntu14.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:238"
BUILD_ENVIRONMENT: "py2-gcc4.9-ubuntu14.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -912,7 +912,7 @@ jobs:
caffe2_py2_clang3_8_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-clang3.8-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-clang3.8-ubuntu16.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -920,7 +920,7 @@ jobs:
caffe2_py2_clang3_9_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-clang3.9-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-clang3.9-ubuntu16.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -928,7 +928,7 @@ jobs:
caffe2_py2_clang7_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-clang7-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-clang7-ubuntu16.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -936,7 +936,7 @@ jobs:
caffe2_py2_android_ubuntu16_04_build:
environment:
JOB_BASE_NAME: caffe2-py2-android-ubuntu16.04-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:238"
BUILD_ENVIRONMENT: "py2-android-ubuntu16.04"
BUILD_ONLY: "1"
<<: *caffe2_linux_build_defaults
@ -944,14 +944,14 @@ jobs:
caffe2_py2_cuda9_0_cudnn7_centos7_build:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-centos7-build
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:238"
BUILD_ENVIRONMENT: "py2-cuda9.0-cudnn7-centos7"
<<: *caffe2_linux_build_defaults
caffe2_py2_cuda9_0_cudnn7_centos7_test:
environment:
JOB_BASE_NAME: caffe2-py2-cuda9.0-cudnn7-centos7-test
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:230"
DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:238"
CUDA_VERSION: "9.0"
BUILD_ENVIRONMENT: "py2-cuda9.0-cudnn7-centos7"
resource_class: gpu.medium

View File

@ -3,26 +3,29 @@
Checks: '
-*
,bugprone-*
,-bugprone-macro-parentheses
,-bugprone-forward-declaration-namespace
,-bugprone-macro-parentheses
,cppcoreguidelines-*
,-cppcoreguidelines-pro-bounds-array-to-pointer-decay
,-cppcoreguidelines-pro-type-static-cast-downcast
,-cppcoreguidelines-pro-bounds-pointer-arithmetic
,-cppcoreguidelines-pro-bounds-constant-array-index
,-cppcoreguidelines-pro-type-cstyle-cast
,-cppcoreguidelines-pro-type-reinterpret-cast
,-cppcoreguidelines-pro-type-vararg
,-cppcoreguidelines-special-member-functions
,-cppcoreguidelines-interfaces-global-init
,-cppcoreguidelines-owning-memory
,hicpp-signed-bitwise
,-cppcoreguidelines-pro-bounds-array-to-pointer-decay
,-cppcoreguidelines-pro-bounds-constant-array-index
,-cppcoreguidelines-pro-bounds-pointer-arithmetic
,-cppcoreguidelines-pro-type-cstyle-cast
,-cppcoreguidelines-pro-type-reinterpret-cast
,-cppcoreguidelines-pro-type-static-cast-downcast
,-cppcoreguidelines-pro-type-union-access
,-cppcoreguidelines-pro-type-vararg
,-cppcoreguidelines-special-member-functions
,hicpp-exception-baseclass
,hicpp-avoid-goto
,modernize-*
,-modernize-use-default-member-init
,-modernize-return-braced-init-list
,-modernize-use-auto
,-modernize-use-default-member-init
,-modernize-use-using
,performance-*
,-performance-noexcept-move-constructor
'
WarningsAsErrors: '*'
HeaderFilterRegex: 'torch/csrc/.*'

View File

1
.gitignore vendored
View File

@ -23,6 +23,7 @@ aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
build/
caffe2/cpp_test/
dist/
docs/src/**/*
docs/cpp/build

4
.gitmodules vendored
View File

@ -60,10 +60,10 @@
url = https://github.com/onnx/onnx.git
[submodule "third_party/onnx-tensorrt"]
path = third_party/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt
url = https://github.com/bddppq/onnx-tensorrt
[submodule "third_party/sleef"]
path = third_party/sleef
url = https://github.com/shibatch/sleef
url = https://github.com/zdevito/sleef
[submodule "third_party/ideep"]
path = third_party/ideep
url = https://github.com/intel/ideep

23
.jenkins/caffe2/bench.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Anywhere except $ROOT_DIR should work
cd "$INSTALL_PREFIX"
if [[ $BUILD_ENVIRONMENT == *-cuda* ]]; then
num_gpus=$(nvidia-smi -L | wc -l)
elif [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
num_gpus=$(rocminfo | grep 'Device Type.*GPU' | wc -l)
else
num_gpus=0
fi
cmd="$PYTHON $CAFFE2_PYPATH/python/examples/resnet50_trainer.py --train_data null --batch_size 64 --epoch_size 6400 --num_epochs 2"
if (( $num_gpus == 0 )); then
cmd="$cmd --use_cpu"
else
cmd="$cmd --num_gpus 1"
fi
eval "$cmd"

View File

@ -2,6 +2,14 @@
set -ex
# TODO: Migrate all centos jobs to use proper devtoolset
if [[ "$BUILD_ENVIRONMENT" == "py2-cuda9.0-cudnn7-centos7" ]]; then
# There is a bug in the pango package on Centos7 that causes undefined
# symbols, upgrading glib2 to >=2.56.1 solves the issue. See
# https://bugs.centos.org/view.php?id=15495
sudo yum install -y -q glib2-2.56.1
fi
pip install --user --no-cache-dir hypothesis==3.59.0
# The INSTALL_PREFIX here must match up with test.sh
@ -124,7 +132,24 @@ CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then
CMAKE_ARGS+=("-DBLAS=MKL")
CMAKE_ARGS+=("-DUSE_MKLDNN=ON")
fi
if [[ $BUILD_ENVIRONMENT == py2-cuda9.0-cudnn7-ubuntu16.04 ]]; then
# removing http:// duplicate in favor of nvidia-ml.list
# which is https:// version of the same repo
sudo rm -f /etc/apt/sources.list.d/nvidia-machine-learning.list
curl -o ./nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0_1-1_amd64.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0_1-1_amd64.deb
sudo dpkg -i ./nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0_1-1_amd64.deb
sudo apt-key add /var/nvinfer-runtime-trt-repo-5.0.2-ga-cuda9.0/7fa2af80.pub
sudo apt-get -qq update
sudo apt-get install libnvinfer5 libnvinfer-dev
rm ./nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0_1-1_amd64.deb
CMAKE_ARGS+=("-DUSE_TENSORRT=ON")
fi
if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
CMAKE_ARGS+=("-DUSE_CUDA=ON")
CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
@ -204,6 +229,11 @@ if [[ -z "$INTEGRATED" ]]; then
exit 1
fi
# This is to save test binaries for testing
mv "$INSTALL_PREFIX/test/" "$INSTALL_PREFIX/cpp_test/"
ls $INSTALL_PREFIX
else
# sccache will be stuck if all cores are used for compiling
@ -212,10 +242,12 @@ else
export MAX_JOBS=`expr $(nproc) - 1`
fi
USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user
USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_TEST=1 BUILD_BINARY=1 python setup.py install --user
# This is to save test binaries for testing
cp -r torch/lib/tmp_install $INSTALL_PREFIX
mkdir -p "$INSTALL_PREFIX/cpp_test/"
cp -r caffe2/test/* "$INSTALL_PREFIX/cpp_test/"
ls $INSTALL_PREFIX

22
.jenkins/caffe2/common.sh Normal file
View File

@ -0,0 +1,22 @@
set -ex
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
# Figure out which Python to use
PYTHON="python"
if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
PYTHON="python${BASH_REMATCH[1]}"
fi
# Find where Caffe2 is installed. This will be the absolute path to the
# site-packages of the active Python installation
INSTALL_PREFIX="/usr/local/caffe2"
SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))")
INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}"
CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2"
# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed
# Caffe2.
export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib"

View File

@ -1,23 +1,6 @@
#!/bin/bash
set -ex
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
TEST_DIR=$ROOT_DIR/caffe2_tests
# Figure out which Python to use
PYTHON="python"
if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
PYTHON="python${BASH_REMATCH[1]}"
fi
# The prefix must mirror the setting from build.sh
INSTALL_PREFIX="/usr/local/caffe2"
# Add the site-packages in the caffe2 install prefix to the PYTHONPATH
SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))")
INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}"
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Skip tests in environments where they are not built/applicable
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
@ -25,41 +8,34 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
exit 0
fi
# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed
# Caffe2.
export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib"
cd "$ROOT_DIR"
if [ -d $TEST_DIR ]; then
echo "Directory $TEST_DIR already exists; please remove it..."
exit 1
fi
mkdir -p $TEST_DIR/{cpp,python}
TEST_DIR="$ROOT_DIR/caffe2_tests"
rm -rf "$TEST_DIR" && mkdir -p "$TEST_DIR"
cd "${WORKSPACE}"
# C++ tests
#############
# C++ tests #
#############
echo "Running C++ tests.."
gtest_reports_dir="${TEST_DIR}/cpp"
junit_reports_dir="${TEST_DIR}/junit_reports"
mkdir -p "$gtest_reports_dir" "$junit_reports_dir"
for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do
mkdir -p "$gtest_reports_dir"
for test in $(find "${INSTALL_PREFIX}/cpp_test" -executable -type f); do
case "$test" in
# skip tests we know are hanging or bad
*/mkl_utils_test|*/aten/integer_divider_test)
continue
;;
*/scalar_tensor_test|*/basic|*/native_test)
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
continue
else
"$test"
fi
;;
*)
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
continue
else
"$test"
fi
;;
*)
# Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). While
# planning to migrate to gtest as the common PyTorch c++ test suite, we
# currently do NOT use the xml test reporter, because Catch doesn't
@ -70,14 +46,17 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do
# output than it is to have XML output for Jenkins.
# Note: in the future, if we want to use xml test reporter once we switch
# to all gtest, one can simply do:
# "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml"
"$test"
"$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml"
;;
esac
done
# Get the relative path to where the caffe2 python module was installed
CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2"
################
# Python tests #
################
pytest_reports_dir="${TEST_DIR}/python"
mkdir -p "$pytest_reports_dir"
# Collect additional tests to run (outside caffe2/python)
EXTRA_TESTS=()
@ -98,7 +77,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py")
fi
# Python tests
# NB: Warnings are disabled because they make it harder to see what
# the actual erroring test is
echo "Running Python tests.."
@ -108,7 +86,7 @@ pip install --user pytest-sugar
-x \
-v \
--disable-warnings \
--junit-xml="$TEST_DIR/python/result.xml" \
--junit-xml="$pytest_reports_dir/result.xml" \
--ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \
--ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \
--ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \

View File

@ -14,18 +14,8 @@ clang --version
# symbolize=1: Gives us much better errors when things go wrong
export ASAN_OPTIONS=detect_leaks=0:symbolize=1
# FIXME: Remove the hardcoded "-pthread" option.
# With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
# succeed because "pthread_create" is in libasan.so. However, libasan doesn't
# have the full pthread implementation. Other advanced pthread functions doesn't
# exist in libasan.so[2]. If we need some pthread advanced functions, we still
# need to link the pthread library.
# [1] https://github.com/Kitware/CMake/blob/8cabaaf054a16ea9c8332ce8e9291bd026b38c62/Modules/FindThreads.cmake#L135
# [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
#
# TODO: Make the ASAN flags a more unified env var
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
CXX_FLAGS="-pthread" \
CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan" \
NO_CUDA=1 USE_MKLDNN=0 \
python setup.py install

View File

@ -65,7 +65,7 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
fi
# Setup wrapper scripts
for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do
for compiler in cc c++ gcc g++; do
(
echo "#!/bin/sh"
echo "exec $SCCACHE $(which $compiler) \"\$@\""

View File

@ -141,6 +141,11 @@ if not "%USE_CUDA%"=="0" (
sccache --show-stats
sccache --zero-stats
rd /s /q %CONDA_PARENT_DIR%\\Miniconda3\\Lib\\site-packages\\torch
for /f "delims=" %%i in ('where /R caffe2\proto *.py') do (
IF NOT "%%i" == "%CD%\caffe2\proto\__init__.py" (
del /S /Q %%i
)
)
copy %CD%\\tmp_bin\\sccache.exe tmp_bin\\nvcc.exe
)

View File

@ -28,7 +28,7 @@ matrix:
script: mypy @mypy-files.txt
- env: CPP_DOC_CHECK
python: "3.6"
install:
install:
- sudo apt-get install -y doxygen
- pip install -r requirements.txt
script: cd docs/cpp/source && ./check-doxygen.sh
@ -41,3 +41,4 @@ matrix:
- llvm-toolchain-trusty
packages: clang-tidy
script: tools/run-clang-tidy-in-ci.sh

View File

@ -65,7 +65,6 @@ option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON)
option(BUILD_C10_EXPERIMENTAL_OPS "Build c10 experimental operators" ON)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
@ -75,7 +74,7 @@ cmake_dependent_option(
"NOT BUILD_SHARED_LIBS" OFF)
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
cmake_dependent_option(
INSTALL_TEST "Install test binaries if BUILD_TEST is on" OFF
INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
"BUILD_TEST" OFF)
option(USE_ACL "Use ARM Compute Library" OFF)
option(USE_ASAN "Use Address Sanitizer" OFF)
@ -93,7 +92,6 @@ option(USE_LEVELDB "Use LEVELDB" ON)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_LMDB "Use LMDB" ON)
option(USE_METAL "Use Metal for iOS build" ON)
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
@ -200,6 +198,10 @@ include(ExternalProject)
# ---[ Dependencies
include(cmake/Dependencies.cmake)
if(USE_FBGEMM)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM")
endif()
# ---[ Whitelist file if whitelist is specified
include(cmake/Whitelist.cmake)

View File

@ -3,15 +3,15 @@
If you are interested in contributing to PyTorch, your contributions will fall
into two categories:
1. You want to propose a new Feature and implement it
- post about your intended feature, and we shall discuss the design and
- Post about your intended feature, and we shall discuss the design and
implementation. Once we agree that the plan looks good, go ahead and implement it.
2. You want to implement a feature or bug-fix for an outstanding issue
- Look at the outstanding issues here: https://github.com/pytorch/pytorch/issues
- Especially look at the Low Priority and Medium Priority issues
- Pick an issue and comment on the task that you want to work on this feature
- Especially look at the Low Priority and Medium Priority issues.
- Pick an issue and comment on the task that you want to work on this feature.
- If you need more context on a particular issue, please ask and we shall provide.
Once you finish implementing a feature or bugfix, please send a Pull Request to
Once you finish implementing a feature or bug-fix, please send a Pull Request to
https://github.com/pytorch/pytorch
If you are not familiar with creating a Pull Request, here are some guides:
@ -24,7 +24,7 @@ If you are not familiar with creating a Pull Request, here are some guides:
To develop PyTorch on your machine, here are some tips:
1. Uninstall all existing PyTorch installs:
```
```bash
conda uninstall pytorch
pip uninstall torch
pip uninstall torch # run this command twice
@ -32,80 +32,81 @@ pip uninstall torch # run this command twice
2. Clone a copy of PyTorch from source:
```
```bash
git clone https://github.com/pytorch/pytorch
cd pytorch
```
3. Install PyTorch in `build develop` mode:
A full set of instructions on installing PyTorch from Source are here:
A full set of instructions on installing PyTorch from source is here:
https://github.com/pytorch/pytorch#from-source
The change you have to make is to replace
```
```bash
python setup.py install
```
with
```
```bash
python setup.py build develop
```
This is especially useful if you are only changing Python files.
This mode will symlink the python files from the current local source tree into the
python install.
This mode will symlink the Python files from the current local source tree into the
Python install.
Hence, if you modify a python file, you do not need to reinstall pytorch again and again.
Hence, if you modify a Python file, you do not need to reinstall PyTorch again and again.
For example:
- Install local pytorch in `build develop` mode
- modify your python file `torch/__init__.py` (for example)
- Install local PyTorch in `build develop` mode
- modify your Python file `torch/__init__.py` (for example)
- test functionality
- modify your python file `torch/__init__.py`
- modify your Python file `torch/__init__.py`
- test functionality
- modify your python file `torch/__init__.py`
- modify your Python file `torch/__init__.py`
- test functionality
You do not need to repeatedly install after modifying python files.
You do not need to repeatedly install after modifying Python files.
In case you want to reinstall, make sure that you uninstall pytorch first by running `pip uninstall torch`
In case you want to reinstall, make sure that you uninstall PyTorch first by running `pip uninstall torch`
and `python setup.py clean`. Then you can install in `build develop` mode again.
## Codebase structure
* [c10](c10) - Core library files that work everywhere, both server
and mobile. We are slowly moving pieces from ATen/core here.
This library is intended only to contain essential functionality,
and appropriate to use in settings where binary size matters. (But
and mobile. We are slowly moving pieces from [ATen/core](aten/src/ATen/core)
here. This library is intended only to contain essential functionality,
and appropriate to use in settings where binary size matters. (But
you'll have a lot of missing functionality if you try to use it
directly.)
* [aten](aten) - C++ tensor library for PyTorch (no autograd support)
* src
* [src](aten/src)
* [TH](aten/src/TH)
[THC](aten/src/THC)
[THNN](aten/src/THNN)
[THCUNN](aten/src/THCUNN) - Legacy library code from the original
Torch. Try not to add things here; we're slowly porting these to
native.
Torch. Try not to add things here; we're slowly porting these to
[native](aten/src/ATen/native).
* generic - Contains actual implementations of operators,
parametrized over `scalar_t`. Files here get compiled N times
parametrized over `scalar_t`. Files here get compiled N times
per supported scalar type in PyTorch.
* ATen
* [core](aten/src/ATen/core) - Core functionality of ATen. This
* [ATen](aten/src/ATen)
* [core](aten/src/ATen/core) - Core functionality of ATen. This
is migrating to top-level c10 folder.
* [native](aten/src/ATen/native) - Modern implementations of
operators. If you want to write a new operator, here is where
it should go. Most CPU operators go in the top level directory,
operators. If you want to write a new operator, here is where
it should go. Most CPU operators go in the top level directory,
except for operators which need to be compiled specially; see
cpu below.
* [cpu](aten/src/ATen/native/cpu) - Not actually CPU
implementations of operators, but specifically implementations
which are compiled with processor-specific instructions, like
AVX. See the README for more details.
AVX. See the [README](aten/src/ATen/native/cpu/README.md) for more
details.
* [cuda](aten/src/ATen/native/cuda) - CUDA implementations of
operators.
* [sparse](aten/src/ATen/native/sparse) - CPU and CUDA
@ -114,34 +115,34 @@ and `python setup.py clean`. Then you can install in `build develop` mode again.
[miopen](aten/src/ATen/native/miopen) [cudnn](aten/src/ATen/native/cudnn)
- implementations of operators which simply bind to some
backend library.
* [torch](torch) - The actual PyTorch library. Everything that is not
in csrc is Python modules, following the PyTorch Python frontend
module structure.
* [csrc](torch/csrc) - C++ files composing the PyTorch library. Files
* [torch](torch) - The actual PyTorch library. Everything that is not
in [csrc](torch/csrc) is a Python module, following the PyTorch Python
frontend module structure.
* [csrc](torch/csrc) - C++ files composing the PyTorch library. Files
in this directory tree are a mix of Python binding code, and C++
heavy lifting. Consult `setup.py` for the canonical list of Python
heavy lifting. Consult `setup.py` for the canonical list of Python
binding files; conventionally, they are often prefixed with
`python_`.
* [jit](torch/csrc/jit) - Compiler and frontend for TorchScript JIT
frontend.
* [autograd](torch/csrc/autograd) - Implementation of reverse-mode automatic
differentation
differentiation.
* [api](torch/csrc/api) - The PyTorch C++ frontend.
* [distributed](torch/csrc/distributed) - Distributed training
support for PyTorch.
* [tools](tools) - Code generation scripts for the PyTorch library.
See README of this directory for more details.
* [test](tests) - Python unit tests for PyTorch Python frontend
See [README](tools/README.md) of this directory for more details.
* [test](tests) - Python unit tests for PyTorch Python frontend.
* [test_torch.py](test/test_torch.py) - Basic tests for PyTorch
functionality
functionality.
* [test_autograd.py](test/test_autograd.py) - Tests for non-NN
automatic differentiation support
automatic differentiation support.
* [test_nn.py](test/test_nn.py) - Tests for NN operators and
their automatic differentiation
their automatic differentiation.
* [test_jit.py](test/test_jit.py) - Tests for the JIT compiler
and TorchScript
and TorchScript.
* ...
* [cpp](test/cpp) - C++ unit tests for PyTorch C++ frontend
* [cpp](test/cpp) - C++ unit tests for PyTorch C++ frontend.
* [expect](test/expect) - Automatically generated "expect" files
which are used to compare against expected output.
* [onnx](test/onnx) - Tests for ONNX export functionality,
@ -149,15 +150,15 @@ and `python setup.py clean`. Then you can install in `build develop` mode again.
* [caffe2](caffe2) - The Caffe2 library.
* [core](caffe2/core) - Core files of Caffe2, e.g., tensor, workspace,
blobs, etc.
* [operators](caffe2/operators) - Operators of Caffe2
* [python](caffe2/python) - Python bindings to Caffe2
* [operators](caffe2/operators) - Operators of Caffe2.
* [python](caffe2/python) - Python bindings to Caffe2.
* ...
## Unit testing
PyTorch's testing is located under `test/`. Run the entire test suite with
```
```bash
python test/run_test.py
```
@ -169,7 +170,7 @@ a number of useful features for local developing. Install it via `pip install py
If you want to just run tests that contain a specific substring, you can use the `-k` flag:
```
```bash
pytest test/test_nn.py -k Loss -v
```
@ -198,16 +199,16 @@ commands. To run this check locally, run `./check-doxygen.sh` from inside
## Managing multiple build trees
One downside to using `python setup.py develop` is that your development
version of pytorch will be installed globally on your account (e.g., if
version of PyTorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used.
If you want to manage multiple builds of PyTorch, you can make use of
[conda environments](https://conda.io/docs/using/envs.html) to maintain
separate Python package environments, each of which can be tied to a
specific build of PyTorch. To set one up:
specific build of PyTorch. To set one up:
```
```bash
conda create -n pytorch-myfeature
source activate pytorch-myfeature
# if you run python now, torch will NOT be installed
@ -219,7 +220,7 @@ python setup.py build develop
If you are working on the C++ code, there are a few important things that you
will want to keep in mind:
1. How to rebuild only the code you are working on, and
1. How to rebuild only the code you are working on.
2. How to make rebuilds in the absence of changes go faster.
### Build only what you need.
@ -229,10 +230,10 @@ not very optimized for incremental rebuilds, this will actually be very slow.
Far better is to only request rebuilds of the parts of the project you are
working on:
- Working on the Python bindings? Run `python setup.py develop` to rebuild
- Working on the Python bindings? Run `python setup.py develop` to rebuild
(NB: no `build` here!)
- Working on `torch/csrc` or `aten`? Run `python setup.py rebuild_libtorch` to
- Working on `torch/csrc` or `aten`? Run `python setup.py rebuild_libtorch` to
rebuild and avoid having to rebuild other dependent libraries we
depend on.
@ -240,18 +241,19 @@ working on:
targets are listed in `dep_libs` in `setup.py`. prepend `build_` to
get a target, and run as e.g. `python setup.py build_gloo`.
- Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
- Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
`make` if you don't have ninja installed).
On the initial build, you can also speed things up with the environment
variables `DEBUG` and `NO_CUDA`.
- `DEBUG=1` will enable debug builds (-g -O0)
- `REL_WITH_DEB_INFO=1` will enable debug symbols with optimizations (-g -O3)
- `NO_CUDA=1` will disable compiling CUDA (in case you are developing on something not CUDA related), to save compile time.
For example:
```
```bash
NO_CUDA=1 DEBUG=1 python setup.py build develop
```
@ -270,9 +272,9 @@ information for the code in `torch/csrc`. More information at:
#### Use Ninja
Python `setuptools` is pretty dumb, and always rebuilds every C file in a
project. If you install the ninja build system with `pip install ninja`,
project. If you install the ninja build system with `pip install ninja`,
then PyTorch will use it to track dependencies correctly.
If pytorch was already built, you will need to run `python setup.py clean` once
If PyTorch was already built, you will need to run `python setup.py clean` once
after installing ninja for builds to succeed.
#### Use CCache
@ -283,9 +285,9 @@ compilation was exactly the same.
Using ccache in a situation like this is a real time-saver. However, by
default, ccache does not properly support CUDA stuff, so here are the
instructions for installing a custom `ccache` fork that has CUDA support:
instructions for installing a custom ccache fork that has CUDA support:
```
```bash
# install and export ccache
if ! ls ~/ccache/bin/ccache
then
@ -339,13 +341,13 @@ than Linux, which are worth keeping in mind when fixing these problems.
1. Symbols are NOT exported by default on Windows; instead, you have to explicitly
mark a symbol as exported/imported in a header file with `__declspec(dllexport)` /
`__declspec(dllimport)`. We have codified this pattern into a set of macros
`__declspec(dllimport)`. We have codified this pattern into a set of macros
which follow the convention `*_API`, e.g., `CAFFE2_API` inside Caffe2 and ATen.
(Every separate shared library needs a unique macro name, because symbol visibility
is on a per shared library basis. See c10/macros/Macros.h for more details.)
The upshot is if you see an "unresolved external" error in your Windows build, this
is probably because you forgot to mark a function with `*_API`. However, there is
is probably because you forgot to mark a function with `*_API`. However, there is
one important counterexample to this principle: if you want a *templated* function
to be instantiated at the call site, do NOT mark it with `*_API` (if you do mark it,
you'll have to explicitly instantiate all of the specializations used by the call
@ -353,7 +355,7 @@ than Linux, which are worth keeping in mind when fixing these problems.
2. If you link against a library, this does not make its dependencies transitively
visible. You must explicitly specify a link dependency against every library whose
symbols you use. (This is different from Linux where in most environments,
symbols you use. (This is different from Linux where in most environments,
transitive dependencies can be used to fulfill unresolved symbols.)
3. If you have a Windows box (we have a few on EC2 which you can request access to) and
@ -363,10 +365,10 @@ than Linux, which are worth keeping in mind when fixing these problems.
Even if you don't know anything about MSVC, you can use cmake to build simple programs on
Windows; this can be helpful if you want to learn more about some peculiar linking behavior
by reproducing it on a small example. Here's a simple example cmake file that defines
by reproducing it on a small example. Here's a simple example cmake file that defines
two dynamic libraries, one linking with the other:
```
```CMake
project(myproject CXX)
set(CMAKE_CXX_STANDARD 11)
add_library(foo SHARED foo.cpp)
@ -378,7 +380,7 @@ target_link_libraries(bar PUBLIC foo)
You can build it with:
```
```bash
mkdir build
cd build
cmake ..
@ -392,44 +394,44 @@ these exciting features lead to exciting bugs in Windows compilers.
To add insult to injury, the error messages will often not tell you
which line of code actually induced the erroring template instantiation.
I've found the most effective way to debug these problems is to
We've found the most effective way to debug these problems is to
carefully read over diffs, keeping in mind known bugs in MSVC/NVCC.
Here are a few well known pitfalls and workarounds:
* This is not actually a bug per se, but in general, code generated by MSVC
is more sensitive to memory errors; you may have written some code
that does a use-after-free or stack overflows; on Linux the code
might work, but on Windows your program will crash. ASAN may not
might work, but on Windows your program will crash. ASAN may not
catch all of these problems: stay vigilant to the possibility that
your crash is due to a real memory problem.
* (NVCC) `c10::optional` does not work when used from device code. Don't use
it from kernels. Upstream issue: https://github.com/akrzemi1/Optional/issues/58
* (NVCC) `c10::optional` does not work when used from device code. Don't use
it from kernels. Upstream issue: https://github.com/akrzemi1/Optional/issues/58
and our local issue #10329.
* `constexpr` generally works less well on MSVC.
* The idiom `static_assert(f() == f())` to test if `f` is constexpr
does not work; you'll get "error C2131: expression did not evaluate
to a constant". Don't use these asserts on Windows.
to a constant". Don't use these asserts on Windows.
(Example: `c10/util/intrusive_ptr.h`)
* (NVCC) Code you access inside a `static_assert` will eagerly be
evaluated as if it were device code, and so you might get an error
that the code is "not accessible".
```
```cpp
class A {
static A singleton_;
static constexpr inline A* singleton() {
return &singleton_;
}
};
static_assert(std::is_same(A*, decltype(A::singelton()))::value, "hmm");
static_assert(std::is_same(A*, decltype(A::singleton()))::value, "hmm");
```
* The compiler will run out of heap if you attempt to compile files that
are too large. Splitting such files into separate files helps.
* The compiler will run out of heap space if you attempt to compile files that
are too large. Splitting such files into separate files helps.
(Example: `THTensorMath`, `THTensorMoreMath`, `THTensorEvenMoreMath`.)
### Running Clang-Tidy
@ -453,8 +455,8 @@ have more checks than older versions. In our CI, we run clang-tidy-6.0.
git revision (you may want to replace `HEAD~1` with `HEAD` to pick up
uncommitted changes). Changes are picked up based on a `git diff` with the
given revision:
```sh
$ python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'
```bash
python tools/clang_tidy.py -d build -p torch/csrc --diff 'HEAD~1'
```
Above, it is assumed you are in the PyTorch root folder. `path/to/build` should
@ -463,26 +465,36 @@ root folder if you used `setup.py build`. You can use `-c <clang-tidy-binary>`
to change the clang-tidy this script uses. Make sure you have PyYaml installed,
which is in PyTorch's `requirements.txt`.
### Pre-commit Tidy/Linting Hook
We use clang-tidy and flake8 to perform additional formatting and semantic checking
of code. We provide a pre-commit git hook for performing these checks, before
a commit is created:
```bash
ln -s ../../tools/git-pre-commit .git/hooks/pre-commit
```
## Caffe2 notes
In 2018, we merged Caffe2 into the PyTorch source repository. While the
In 2018, we merged Caffe2 into the PyTorch source repository. While the
steady state aspiration is that Caffe2 and PyTorch share code freely,
in the meantime there will be some separation.
If you submit a PR to only PyTorch or only Caffe2 code, CI will only
run for the project you edited. The logic for this is implemented
run for the project you edited. The logic for this is implemented
in `.jenkins/pytorch/dirty.sh` and `.jenkins/caffe2/dirty.sh`; you
can look at this to see what path prefixes constitute changes.
This also means if you ADD a new top-level path, or you start
sharing code between projects, you need to modify these files.
There are a few "unusual" directories which, for historical reasons,
are Caffe2/PyTorch specific. Here they are:
are Caffe2/PyTorch specific. Here they are:
- `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `conda`, `modules`,
`scripts` are Caffe2-specific. Don't put PyTorch code in them without
`scripts` are Caffe2-specific. Don't put PyTorch code in them without
extra coordination.
- `mypy*`, `requirements.txt`, `setup.py`, `test`, `tools` are
PyTorch-specific. Don't put Caffe2 code in them without extra
PyTorch-specific. Don't put Caffe2 code in them without extra
coordination.

View File

@ -8,8 +8,6 @@ PyTorch is a Python package that provides two high-level features:
You can reuse your favorite Python packages such as NumPy, SciPy and Cython to extend PyTorch when needed.
We are in an early-release beta. Expect some adventures and rough edges.
- [More about PyTorch](#more-about-pytorch)
- [Installation](#installation)
- [Binaries](#binaries)
@ -33,7 +31,7 @@ We are in an early-release beta. Expect some adventures and rough edges.
See also the [ci.pytorch.org HUD](https://ezyang.github.io/pytorch-ci-hud/build/pytorch-master).
## More about PyTorch
## More About PyTorch
At a granular level, PyTorch is a library that consists of the following components:
@ -44,12 +42,11 @@ At a granular level, PyTorch is a library that consists of the following compone
| **torch.nn** | a neural networks library deeply integrated with autograd designed for maximum flexibility |
| **torch.multiprocessing** | Python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and Hogwild training |
| **torch.utils** | DataLoader, Trainer and other utility functions for convenience |
| **torch.legacy(.nn/.optim)** | legacy code that has been ported over from torch for backward compatibility reasons |
Usually one uses PyTorch either as:
- a replacement for NumPy to use the power of GPUs.
- a deep learning research platform that provides maximum flexibility and speed
- a deep learning research platform that provides maximum flexibility and speed.
Elaborating further:
@ -117,7 +114,7 @@ We've written custom memory allocators for the GPU to make sure that
your deep learning models are maximally memory efficient.
This enables you to train bigger deep learning models than before.
### Extensions without Pain
### Extensions Without Pain
Writing new neural network modules, or interfacing with PyTorch's Tensor API was designed to be straightforward
and with minimal abstractions.
@ -133,7 +130,6 @@ There is no wrapper code that needs to be written. You can see [a tutorial here]
### Binaries
Commands to install from binaries via Conda or pip wheels are on our website:
[https://pytorch.org](https://pytorch.org)
### From Source
@ -154,31 +150,20 @@ If you want to build on Windows, Visual Studio 2017 14.11 toolset and NVTX are a
Especially, for CUDA 8 build on Windows, there will be an additional requirement for VS 2015 Update 3 and a patch for it.
The details of the patch can be found out [here](https://support.microsoft.com/en-gb/help/4020481/fix-link-exe-crashes-with-a-fatal-lnk1000-error-when-you-use-wholearch).
#### Install optional dependencies
#### Install Dependencies
Common
```
conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing
```
On Linux
```bash
export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" # [anaconda root directory]
# Install basic dependencies
conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing
conda install -c mingfeima mkldnn
# Add LAPACK support for the GPU
# Add LAPACK support for the GPU if needed
conda install -c pytorch magma-cuda92 # or [magma-cuda80 | magma-cuda91] depending on your cuda version
```
On macOS
```bash
export CMAKE_PREFIX_PATH=[anaconda root directory]
conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing
```
On Windows
```cmd
conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing
```
#### Get the PyTorch source
#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
@ -187,11 +172,13 @@ cd pytorch
#### Install PyTorch
On Linux
```bash
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py install
```
On macOS
```bash
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
```
@ -210,9 +197,9 @@ call "%VS150COMNTOOLS%\vcvarsall.bat" x64 -vcvars_ver=14.11
python setup.py install
```
### Docker image
### Docker Image
Dockerfile is supplied to build images with cuda support and cudnn v7. You can pass `-e PYTHON_VERSION=x.y` flag to specify which python version is to be used by Miniconda, or leave it unset to use the default. Build as usual
Dockerfile is supplied to build images with cuda support and cudnn v7. You can pass `-e PYTHON_VERSION=x.y` flag to specify which Python version is to be used by Miniconda, or leave it unset to use the default. Build as usual
```
docker build -t pytorch -f docker/pytorch/Dockerfile .
```
@ -259,8 +246,7 @@ Three pointers to get you started:
## Releases and Contributing
PyTorch has a 90 day release cycle (major releases).
Its current state is Beta, we expect no obvious bugs. Please let us know if you encounter a bug by [filing an issue](https://github.com/pytorch/pytorch/issues).
PyTorch has a 90 day release cycle (major releases). Please let us know if you encounter a bug by [filing an issue](https://github.com/pytorch/pytorch/issues).
We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion.

View File

@ -21,9 +21,14 @@ set(ATen_THIRD_PARTY_INCLUDE)
set(ATen_CUDA_SRCS)
set(ATen_CUDA_TEST_SRCS)
set(ATen_CUDA_INCLUDE)
set(ATen_HIP_SRCS)
set(ATen_HIP_TEST_SRCS)
set(ATen_HIP_INCLUDE)
set(ATen_CPU_DEPENDENCY_LIBS)
set(ATen_CUDA_DEPENDENCY_LIBS)
set(ATen_HIP_DEPENDENCY_LIBS)
set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS)
set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS)
SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory")
SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory")
@ -35,22 +40,11 @@ endif()
set(TH_LINK_STYLE STATIC)
add_subdirectory(src/TH)
set(TH_CPU_INCLUDE
# dense
${CMAKE_CURRENT_SOURCE_DIR}/src/TH
${CMAKE_CURRENT_BINARY_DIR}/src/TH
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_BINARY_DIR}/src
${CMAKE_BINARY_DIR}/aten/src)
list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE})
if(USE_CUDA OR USE_ROCM)
set(TH_CUDA_INCLUDE
# dense
${CMAKE_CURRENT_SOURCE_DIR}/src/THC
${CMAKE_CURRENT_BINARY_DIR}/src/THC)
list(APPEND ATen_CUDA_INCLUDE ${TH_CUDA_INCLUDE})
endif()
add_subdirectory(src/THNN)
# Find the HIP package, set the HIP paths, load the HIP CMake.
@ -69,9 +63,11 @@ IF(MSVC)
ENDIF(MSVC)
if(USE_ROCM)
# TODO: AT_HIP_ENABLED (change this once we represent HIP as HIP in
# ATen proper)
SET(AT_CUDA_ENABLED 1)
add_subdirectory(src/THC)
add_subdirectory(src/THCUNN)
add_subdirectory(src/THH)
add_subdirectory(src/THHUNN)
message("ROCm is enabled.")
elseif(USE_CUDA)
SET(AT_CUDA_ENABLED 1)
@ -82,24 +78,23 @@ else()
SET(AT_CUDA_ENABLED 0)
endif()
list(APPEND ATen_CPU_INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/src/THNN
${CMAKE_CURRENT_SOURCE_DIR}/src/THCUNN)
list(APPEND ATen_CPU_INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_SOURCE_DIR}/../third_party/catch/single_include
${CMAKE_CURRENT_BINARY_DIR}/src/ATen)
${CMAKE_CURRENT_SOURCE_DIR}/../third_party/catch/single_include)
add_subdirectory(src/ATen)
# Pass source, includes, and libs to parent
set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)

View File

@ -1,24 +1,24 @@
#pragma once
#include "ATen/Allocator.h"
#include "ATen/CPUGeneral.h"
#include "ATen/Context.h"
#include "ATen/Device.h"
#include "ATen/DeviceGuard.h"
#include "ATen/DimVector.h"
#include "ATen/Dispatch.h"
#include "ATen/Formatting.h"
#include "ATen/Functions.h"
#include "ATen/ScalarOps.h"
#include "ATen/Tensor.h"
#include "ATen/TensorGeometry.h"
#include "ATen/TensorOperators.h"
#include "ATen/Type.h"
#include "ATen/core/ATenGeneral.h"
#include "ATen/core/Generator.h"
#include <ATen/Allocator.h>
#include <ATen/CPUGeneral.h>
#include <ATen/Context.h>
#include <ATen/Device.h>
#include <ATen/DeviceGuard.h>
#include <ATen/DimVector.h>
#include <ATen/Dispatch.h>
#include <ATen/Formatting.h>
#include <ATen/Functions.h>
#include <ATen/ScalarOps.h>
#include <ATen/Tensor.h>
#include <ATen/TensorGeometry.h>
#include <ATen/TensorOperators.h>
#include <ATen/Type.h>
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/Generator.h>
#include <c10/core/Layout.h>
#include "ATen/core/Scalar.h"
#include <ATen/core/Scalar.h>
#include <c10/core/Storage.h>
#include "ATen/core/TensorMethods.h"
#include "ATen/core/TensorOptions.h"
#include <ATen/core/TensorMethods.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Exception.h>

View File

@ -1,6 +1,6 @@
#pragma once
#include "ATen/Config.h"
#include "ATen/core/Half.h"
#include <ATen/Config.h>
#include <ATen/core/Half.h>
// Defines the accumulation type for a scalar type.
// Example:

View File

@ -17,7 +17,14 @@ IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DI
ENDIF()
CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortunately,
# this file generates AT_ROCM_ENABLED() which is required by the miopen
# files, which are compiled even if we are doing a vanilla CUDA build.
# Once we properly split CUDA and HIP in ATen, we can remove this code.
configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
if(USE_ROCM)
configure_file(hip/HIPConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/hip/HIPConfig.h")
endif()
# NB: If you edit these globs, you'll have to update setup.py package_data as well
FILE(GLOB base_h "*.h" "detail/*.h" "cpu/*.h")
@ -28,21 +35,33 @@ FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu")
FILE(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh")
FILE(GLOB cudnn_cpp "cudnn/*.cpp")
FILE(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh")
FILE(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp")
FILE(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip")
FILE(GLOB miopen_h "miopen/*.h")
FILE(GLOB miopen_cpp "miopen/*.cpp")
FILE(GLOB mkl_cpp "mkl/*.cpp")
FILE(GLOB mkldnn_cpp "mkldnn/*.cpp")
FILE(GLOB native_cpp "native/*.cpp")
FILE(GLOB native_sparse_cpp "native/sparse/*.cpp")
FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu")
FILE(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp")
FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp")
FILE(GLOB native_miopen_cpp "native/miopen/*.cpp")
FILE(GLOB native_cuda_cu "native/cuda/*.cu")
FILE(GLOB native_cuda_cpp "native/cuda/*.cpp")
FILE(GLOB native_mkl_cpp "native/mkl/*.cpp")
FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
FILE(GLOB native_sparse_cpp "native/sparse/*.cpp")
FILE(GLOB native_cuda_cu "native/cuda/*.cu")
FILE(GLOB native_cuda_cpp "native/cuda/*.cpp")
FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp")
FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu")
FILE(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp")
FILE(GLOB native_hip_hip "native/hip/*.hip")
FILE(GLOB native_hip_cpp "native/hip/*.cpp")
FILE(GLOB native_miopen_cpp "native/miopen/*.cpp")
FILE(GLOB native_cudnn_hip_cpp "native/cudnn/hip/*.cpp")
FILE(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip")
FILE(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp")
set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})
if(AT_MKL_ENABLED)
@ -52,22 +71,32 @@ if(AT_MKLDNN_ENABLED)
set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp})
endif()
IF(USE_CUDA OR USE_ROCM)
if(USE_CUDA AND USE_ROCM)
message(FATAL_ERROR "ATen doesn't not currently support simultaneously building with CUDA and ROCM")
endif()
IF(USE_CUDA)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu})
set(all_cuda_cpp ${native_sparse_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS})
IF(USE_CUDA)
SET(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp})
IF(CUDNN_FOUND)
SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp})
ENDIF()
ELSEIF(USE_ROCM)
SET(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${miopen_cpp} ${all_cuda_cpp})
SET(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp})
IF(CUDNN_FOUND)
SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp})
ENDIF()
endif()
IF(USE_ROCM)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_sparse_hip_hip})
# TODO: Codegen separate files for HIP and use those (s/cuda_generated_cpp/hip_generated_cpp)
set(all_hip_cpp ${native_sparse_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${cuda_generated_cpp} ${ATen_HIP_SRCS})
set(all_hip_cpp ${native_miopen_cpp} ${native_cudnn_hip_cpp} ${miopen_cpp} ${all_hip_cpp})
endif()
filter_list(generated_h generated_cpp "\\.h$")
filter_list(cuda_generated_h cuda_generated_cpp "\\.h$")
# TODO: When we have hip_generated_cpp
#filter_list(hip_generated_h hip_generated_cpp "\\.h$")
list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..)
# so the build can find the generated header files
@ -81,21 +110,28 @@ IF(BLAS_FOUND)
MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
list(APPEND ATen_CPU_DEPENDENCY_LIBS
"${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
"${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
endif()
if(USE_ROCM)
list(APPEND ATen_HIP_DEPENDENCY_LIBS
"${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
endif()
ELSE ($ENV{TH_BINARY_BUILD})
list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES})
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${BLAS_LIBRARIES}")
endif()
if(USE_ROCM)
list(APPEND ATen_HIP_DEPENDENCY_LIBS "${BLAS_LIBRARIES}")
endif()
ENDIF ($ENV{TH_BINARY_BUILD})
ENDIF(BLAS_FOUND)
IF(LAPACK_FOUND)
list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
# Although Lapack provides CPU implementations (and thus, one might expect
# that ATen_cuda would not need this at all), some of our libraries (magma
# in particular) fall back to CPU BLAS/LAPACK implementations, and so it is very important
@ -104,6 +140,11 @@ IF(LAPACK_FOUND)
# This caused https://github.com/pytorch/pytorch/issues/7353
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
endif()
if(USE_ROCM)
# It's not altogether clear that HIP behaves the same way, but it
# seems safer to assume that it needs it too
list(APPEND ATen_HIP_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
endif()
ENDIF(LAPACK_FOUND)
IF (UNIX AND NOT APPLE)
@ -206,6 +247,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
else()
MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
endif()
@ -252,22 +299,21 @@ IF(USE_CUDA AND NOT USE_ROCM)
ENDIF($ENV{ATEN_STATIC_CUDA})
ENDIF()
IF(USE_ROCM)
### Link in the ROCm libraries BLAS / RNG.
FIND_LIBRARY(ROCBLAS_LIBRARY rocblas HINTS ${ROCBLAS_PATH}/lib)
FIND_LIBRARY(HIPRAND_LIBRARY hiprand HINTS ${HIPRAND_PATH}/lib)
# NB: We're relying on cmake/Dependencies.cmake to appropriately set up HIP dependencies.
# In principle we could duplicate them, but handling the rocblas
# dependency is nontrivial. So better not to copy-paste.
# Look for Note [rocblas cmake bug]
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${ROCBLAS_LIBRARY} ${HIPRAND_LIBRARY})
ENDIF()
# Include CPU paths for CUDA as well
# Include CPU paths for CUDA/HIP as well
list(APPEND ATen_CUDA_INCLUDE ${ATen_CPU_INCLUDE})
list(APPEND ATen_HIP_INCLUDE ${ATen_CPU_INCLUDE})
# We have two libraries: libATen_cpu.so and libATen_cuda.so,
# with libATen_cuda.so depending on libATen_cpu.so. The CPU library
# contains CPU code only. libATen_cpu.so is invariant to the setting
# of USE_CUDA (it always builds the same way); libATen_cuda.so is only
# built when USE_CUDA=1 and CUDA is available.
# built when USE_CUDA=1 and CUDA is available. (libATen_hip.so works
# the same way as libATen_cuda.so)
set(ATen_CPU_SRCS ${all_cpu_cpp})
if(AT_LINK_STYLE STREQUAL "INTERFACE")
# Source code can't be added to an interface library, so it is
@ -291,7 +337,7 @@ else()
set(ATen_CPU_SRCS)
endif()
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
set(ATen_CUDA_SRCS ${all_cuda_cpp})
if(AT_LINK_STYLE STREQUAL "INTERFACE")
# Source code can't be added to an interface library, so it is
@ -299,42 +345,25 @@ if(USE_CUDA OR USE_ROCM)
add_library(ATen_cuda INTERFACE)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB)
else()
# A hack to deal with cuda library dependencies and modern CMake: the
# CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
# one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
# hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with
# it. We will then manually add the cudart library as interface libs.
set(__tmp ${CUDA_LIBRARIES})
set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
torch_cuda_based_add_library(ATen_cuda ${AT_LINK_STYLE} ${ATen_CUDA_SRCS})
set(CUDA_LIBRARIES ${__tmp})
target_link_libraries(ATen_cuda INTERFACE caffe2::cudart)
target_include_directories(
ATen_cuda INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(
ATen_cuda PRIVATE ${ATen_THIRD_PARTY_INCLUDE})
target_include_directories(
ATen_cuda PRIVATE ${ATen_CUDA_INCLUDE})
target_link_libraries(
ATen_cuda PRIVATE ${ATen_CUDA_DEPENDENCY_LIBS} ATEN_CUDA_FILES_GEN_LIB)
# These public dependencies must go after the previous dependencies, as the
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
target_link_libraries(
ATen_cuda PUBLIC ATen_cpu ${ATen_PUBLIC_CUDA_DEPENDENCY_LIBS})
# Set standard properties on the target
torch_set_target_props(ATen_cuda)
caffe2_interface_library(ATen_cuda ATen_cuda_library)
# Make sure these don't get built by parent
set(ATen_CUDA_SRCS)
message(FATAL_ERROR "Non-INTERFACE AT_LINK_STYLE no longer supported")
endif()
endif()
if(USE_ROCM)
set(ATen_HIP_SRCS ${all_hip_cpp})
if(AT_LINK_STYLE STREQUAL "INTERFACE")
# Source code can't be added to an interface library, so it is
# passed back to be compiled into the containing library
add_library(ATen_hip INTERFACE)
# NB: Instead of adding it to this list, we add it by hand
# to caffe2_hip, because it needs to be a PRIVATE dependency
# list(APPEND ATen_HIP_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB)
else()
message(FATAL_ERROR "Non-INTERFACE AT_LINK_STYLE not (yet) supported for ROCm build")
endif()
endif()
if(NOT AT_LINK_STYLE STREQUAL "INTERFACE")
if(USE_CUDA)
if (NOT $ENV{ATEN_STATIC_CUDA})
@ -345,16 +374,22 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE")
if(NOT MSVC)
torch_compile_options(ATen_cpu)
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
torch_compile_options(ATen_cuda)
endif()
if(USE_ROCM)
torch_compile_options(ATen_hip)
endif()
endif()
if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1")
set_property(TARGET ATen_cpu PROPERTY CXX_STANDARD 11)
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
set_property(TARGET ATen_cuda PROPERTY CXX_STANDARD 11)
endif()
if(USE_ROCM)
set_property(TARGET ATen_hip PROPERTY CXX_STANDARD 11)
endif()
endif()
endif()
@ -364,11 +399,12 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen")
# https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake
FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h})
FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h} ${hip_h} ${miopen_h})
string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER})
GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY)
INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR})
ENDFOREACH()
# TODO: Install hip_generated_h when we have it
FOREACH(HEADER ${generated_h} ${cuda_generated_h})
# NB: Assumed to be flat
INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen)
@ -386,10 +422,15 @@ endif()
set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE)
set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/Parallel.h"
#include "ATen/TensorUtils.h"
#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <limits>
#include <utility>
#include <cstring>

View File

@ -1,7 +1,7 @@
#pragma once
#include "TH/TH.h"
#include "c10/util/Exception.h"
#include <TH/TH.h>
#include <c10/util/Exception.h>
// This file creates a fake allocator that just throws exceptions if
// it is actually used.

View File

@ -4,7 +4,7 @@
// linking errors using MSVC
// See https://msdn.microsoft.com/en-us/library/a90k134d.aspx
// This header adds this if using CAFFE2_API
#include "ATen/core/ATenGeneral.h"
#include <ATen/core/ATenGeneral.h>
namespace at {
CAFFE2_API void set_num_threads(int);

View File

@ -1,4 +1,4 @@
#include "ATen/CPUGenerator.h"
#include <ATen/CPUGenerator.h>
#define const_generator_cast(generator) \
dynamic_cast<const CPUGenerator&>(generator)

View File

@ -1,8 +1,8 @@
#pragma once
#include "ATen/Utils.h"
#include "ATen/core/Generator.h"
#include "c10/util/Exception.h"
#include <ATen/Utils.h>
#include <ATen/core/Generator.h>
#include <c10/util/Exception.h>
namespace at {

View File

@ -1,8 +1,8 @@
#include "ATen/Config.h"
#include <ATen/Config.h>
#include "Context.h"
#include <ATen/Context.h>
#include <ATen/core/TensorOptions.h>
#include <c10/core/TensorOptions.h>
#include <thread>
#include <mutex>
@ -10,12 +10,12 @@
#include <string>
#include <stdexcept>
#include "ATen/CPUGenerator.h"
#include "ATen/RegisterCPU.h"
#include "ATen/Tensor.h"
#include <ATen/CPUGenerator.h>
#include <ATen/RegisterCPU.h>
#include <ATen/Tensor.h>
#include <ATen/cpu/FlushDenormal.h>
#include "TH/TH.h" // for USE_LAPACK
#include <TH/TH.h> // for USE_LAPACK
namespace at {

View File

@ -1,19 +1,19 @@
#pragma once
#include <ATen/CPUGeneral.h>
#include "ATen/Type.h"
#include "ATen/TypeExtendedInterface.h"
#include "ATen/Utils.h"
#include "ATen/LegacyTHDispatch.h"
#include "ATen/LegacyTHDispatcher.h"
#include "ATen/core/ATenGeneral.h"
#include "ATen/core/Generator.h"
#include "ATen/core/LegacyTypeDispatch.h"
#include "ATen/core/VariableHooksInterface.h"
#include "ATen/detail/CUDAHooksInterface.h"
#include "ATen/detail/HIPHooksInterface.h"
#include "ATen/detail/ComplexHooksInterface.h"
#include "c10/util/Exception.h"
#include <ATen/Type.h>
#include <ATen/TypeExtendedInterface.h>
#include <ATen/Utils.h>
#include <ATen/LegacyTHDispatch.h>
#include <ATen/LegacyTHDispatcher.h>
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/Generator.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/core/VariableHooksInterface.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/ComplexHooksInterface.h>
#include <c10/util/Exception.h>
#include <memory>
#include <mutex>

View File

@ -1,5 +1,5 @@
#include "ATen/DLConvertor.h"
#include "ATen/Functions.h"
#include <ATen/DLConvertor.h>
#include <ATen/Functions.h>
#include <iostream>
#include <sstream>

View File

@ -1,8 +1,8 @@
#pragma once
#include "ATen/Tensor.h"
#include "ATen/ATen.h"
#include "ATen/dlpack.h"
#include <ATen/Tensor.h>
#include <ATen/ATen.h>
#include <ATen/dlpack.h>
// this convertor will:
// 1) take a Tensor object and wrap it in the DLPack tensor
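A minimal round-trip sketch of the convertor described by this comment, assuming the `at::toDLPack`/`at::fromDLPack` entry points this header declares:
```
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor t = at::ones({2, 3});
  // Wrap the ATen tensor in a DLPack managed tensor (shares storage, no copy).
  DLManagedTensor* managed = at::toDLPack(t);
  // Rebuild an ATen tensor from the DLPack tensor; it adopts the deleter.
  at::Tensor u = at::fromDLPack(managed);
  return u.equal(t) ? 0 : 1;
}
```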

View File

@ -1629,8 +1629,7 @@
- arg: THTensor* result
output: True
- THTensor* self
- arg: real p
python_default_init: AS_REAL(2)
- real p
- arg: long dim
wrap_dim: self
- arg: bool keepdim
@ -1882,44 +1881,6 @@
- THTensor* end
- real weight
]]
[[
name: _th_linspace
cname: linspace
types:
- Float
- Double
backends:
- CPU
- CUDA
variants:
- function
return: argument 0
arguments:
- arg: THTensor* result
output: True
- real start
- real end
- long steps
]]
[[
name: _th_logspace
cname: logspace
types:
- Float
- Double
backends:
- CPU
- CUDA
variants:
- function
return: argument 0
arguments:
- arg: THTensor* result
output: True
- real start
- real end
- long steps
]]
[[
name: _th_histc
cname: histc

View File

@ -1,4 +1,4 @@
#include "ATen/ExpandUtils.h"
#include <ATen/ExpandUtils.h>
namespace at {

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/Tensor.h"
#include "c10/util/Exception.h"
#include <ATen/Tensor.h>
#include <c10/util/Exception.h>
#include <functional>
#include <sstream>

View File

@ -1,6 +1,6 @@
#pragma once
#include <ATen/core/TensorOptions.h>
#include <c10/core/TensorOptions.h>
namespace at {

View File

@ -38,6 +38,7 @@
#include <c10/core/Backend.h>
#include <c10/core/ScalarType.h>
#include <ATen/core/LegacyDeviceTypeInit.h>
#include <ATen/LegacyTHDispatcher.h>
namespace at {
@ -69,16 +70,51 @@ class CAFFE2_API LegacyTHDispatch {
dispatcher_registry[static_cast<int>(b)][static_cast<int>(s)] = std::move(t);
}
LegacyTHDispatcher & getLegacyTHDispatcher(Backend p, ScalarType s) {
auto* dispatcher = getLegacyTHDispatcherOpt(p, s);
if (!dispatcher) AT_ERROR(toString(p), toString(s), "THDispatcher is not enabled.");
return *dispatcher;
}
private:
LegacyTHDispatcher* getLegacyTHDispatcherRaw(Backend p, ScalarType s) {
return dispatcher_registry[static_cast<int>(p)][static_cast<int>(s)].get();
}
LegacyTHDispatcher & getLegacyTHDispatcher(Backend p, ScalarType s) {
auto* type = getLegacyTHDispatcherRaw(p, s);
if (!type) AT_ERROR(toString(p), toString(s), "THDispatcher is not enabled.");
return *type;
LegacyTHDispatcher* getLegacyTHDispatcherOpt(Backend p, ScalarType s) {
if (p != Backend::Undefined) {
initForDeviceType(backendToDeviceType(p));
// NB: there is no Complex for TH, so no initialization to be done.
}
auto dispatcher = getLegacyTHDispatcherRaw(p, s);
if(!dispatcher) {
if (p == Backend::Undefined || s == ScalarType::Undefined) {
AT_ERROR("Requested Undefined THDispatcher which is invalid. Backend:",
toString(p), "ScalarType: ", toString(s));
}
}
return dispatcher;
}
private:
void initForDeviceType(DeviceType p) {
static std::once_flag cpu_once;
static std::once_flag cuda_once;
if (p == DeviceType::CPU) {
std::call_once(cpu_once, [] {
getLegacyDeviceTypeInit().initCPU();
});
} else if (p == DeviceType::CUDA) {
std::call_once(cuda_once, [] {
getLegacyDeviceTypeInit().initCUDA();
});
} else if (p == DeviceType::HIP) {
std::call_once(cuda_once, [] {
getLegacyDeviceTypeInit().initHIP();
});
}
}
// NB: dispatcher_registry has nullptr for all CUDA backends until
// CUDA initialization has occurred
LegacyTHDispatcherUniquePtr dispatcher_registry

View File

@ -1,7 +1,7 @@
#pragma once
#include <c10/core/Scalar.h>
#include "ATen/Tensor.h"
#include <ATen/Tensor.h>
// This is in the c10 namespace because we use ADL to find the functions in it.
namespace c10 {
@ -10,10 +10,10 @@ namespace c10 {
// to implement this without going through Derived Types (which are not part of core).
inline at::Tensor scalar_to_tensor(Scalar s) {
if (s.isFloatingPoint()) {
return at::CPU(kDouble).scalarTensor(s);
return at::scalar_tensor(s, at::CPU(kDouble).options());
} else {
AT_ASSERT(s.isIntegral());
return at::CPU(kLong).scalarTensor(s);
return at::scalar_tensor(s, at::CPU(kLong).options());
}
}
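For reference, a sketch of what the updated helper produces — both branches yield 0-dimensional CPU tensors typed by the scalar's kind:
```
// Illustrative only; c10::scalar_to_tensor as defined above.
at::Tensor d = c10::scalar_to_tensor(at::Scalar(2.5));         // 0-dim, kDouble
at::Tensor l = c10::scalar_to_tensor(at::Scalar(int64_t(7)));  // 0-dim, kLong
```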

View File

@ -32,14 +32,13 @@ namespace {
// values tensor for such an empty tensor.
SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type)
: TensorImpl(type_id, data_type, nullptr, false)
, size_{0}
, sparse_dim_(1)
, dense_dim_(0)
, indices_(at::empty({1, 0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(ScalarType::Long)))
, values_(at::empty({0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(data_type))) {}
IntList SparseTensorImpl::sizes() const {
return size_;
return sizes_;
}
IntList SparseTensorImpl::strides() const {
AT_ERROR("sparse tensors do not have strides");
@ -47,10 +46,6 @@ IntList SparseTensorImpl::strides() const {
bool SparseTensorImpl::is_contiguous() const {
AT_ERROR("sparse tensors do not have is_contiguous");
}
int64_t SparseTensorImpl::size(int64_t d) const {
d = at::maybe_wrap_dim(d, dim(), false);
return size_[d];
}
int64_t SparseTensorImpl::stride(int64_t d) const {
AT_ERROR("sparse tensors do not have strides");
}

View File

@ -1,8 +1,8 @@
#pragma once
#include "ATen/Tensor.h"
#include "ATen/core/TensorImpl.h"
#include "c10/util/Exception.h"
#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/Exception.h>
namespace at {
struct CAFFE2_API SparseTensorImpl : public TensorImpl {
@ -14,11 +14,6 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl {
// _indices.shape: dimensionality: 2, shape: (sparse_dim, nnz)
// _values.shape: dimensionality: 1 + dense_dim. shape: (nnz, shape[sparse_dim:])
// The true size of the sparse tensor (e.g., if you called to_dense()
// on it). When THTensor merges into TensorImpl, this field
// should move to the parent class.
std::vector<int64_t> size_;
int64_t sparse_dim_ = 0; // number of sparse dimensions
int64_t dense_dim_ = 0; // number of dense dimensions
@ -48,7 +43,6 @@ public:
IntList sizes() const override;
IntList strides() const override;
bool is_contiguous() const override;
int64_t size(int64_t d) const override;
int64_t stride(int64_t d) const override;
void resize_dim(int64_t ndim) override;
void set_size(int64_t dim, int64_t new_size) override;
@ -63,7 +57,7 @@ public:
// WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with
// respect to indices and values
void raw_resize_(int64_t sparse_dim, int64_t dense_dim, IntList size) {
size_ = size.vec();
sizes_ = size.vec();
sparse_dim_ = sparse_dim;
dense_dim_ = dense_dim;
refresh_numel();
@ -132,7 +126,7 @@ public:
"shrinking the size of dense dimensions (from ", dense_size_original, " to ", dense_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg);
}
if ((!size.equals(size_)) || (sparse_dim != sparse_dim_) || (dense_dim != dense_dim_)) {
if ((!size.equals(sizes_)) || (sparse_dim != sparse_dim_) || (dense_dim != dense_dim_)) {
auto nnz = values().size(0);
std::vector<int64_t> values_size = {nnz};
auto dense_size = size.slice(sparse_dim);
@ -141,7 +135,7 @@ public:
indices_.resize_({sparse_dim, nnz});
}
size_ = size.vec();
sizes_ = size.vec();
sparse_dim_ = sparse_dim;
dense_dim_ = dense_dim;
refresh_numel();
@ -151,7 +145,7 @@ public:
void resize_and_clear_(int64_t sparse_dim, int64_t dense_dim, IntList size) {
AT_CHECK(sparse_dim + dense_dim == size.size(), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size());
size_ = size.vec();
sizes_ = size.vec();
sparse_dim_ = sparse_dim;
dense_dim_ = dense_dim;
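To make the indices/values invariant documented above concrete, a hypothetical example:
```
// For a sparse tensor of logical shape (3, 4, 5) with sparse_dim = 2,
// dense_dim = 1 and nnz = 7, the invariant gives:
//   indices_: int64 tensor of shape (2, 7)   // sparse_dim x nnz
//   values_:  tensor of shape (7, 5)         // nnz x shape[sparse_dim:]
```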

View File

@ -1,8 +1,8 @@
#pragma once
#include <c10/core/Scalar.h>
#include "ATen/Tensor.h"
#include "ATen/Type.h"
#include <ATen/Tensor.h>
#include <ATen/Type.h>
#include <string>
#include <stdexcept>

View File

@ -1,2 +1,2 @@
#pragma once
#include <ATen/core/TensorOptions.h>
#include <c10/core/TensorOptions.h>

View File

@ -1,7 +1,7 @@
#include "ATen/Config.h"
#include "ATen/TensorUtils.h"
#include <ATen/Config.h>
#include <ATen/TensorUtils.h>
#include "ATen/ATen.h"
#include <ATen/ATen.h>
#include <ostream>
#include <sstream>

View File

@ -1,8 +1,8 @@
#pragma once
#include "ATen/Tensor.h"
#include "ATen/TensorGeometry.h"
#include "ATen/Utils.h"
#include <ATen/Tensor.h>
#include <ATen/TensorGeometry.h>
#include <ATen/Utils.h>
// These functions are NOT in Utils.h, because this file has a dep on Tensor.h

View File

@ -1,5 +1,5 @@
#include "ATen/UndefinedType.h"
#include "c10/util/Exception.h"
#include <ATen/UndefinedType.h>
#include <c10/util/Exception.h>
namespace at {
@ -23,12 +23,6 @@ Device UndefinedType::getDeviceFromPtr(void*) const {
AT_ERROR("getDeviceFromPtr not defined for UndefinedType");
}
Storage UndefinedType::storage(bool resizable) const {
AT_ERROR("storage not defined for UndefinedType");
}
Storage UndefinedType::storage(size_t size, bool resizable) const {
AT_ERROR("storage(size_t) not defined for UndefinedType");
}
Storage UndefinedType::storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter) const {
AT_ERROR("storageFromBlob not defined for UndefinedType");
}

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/TypeDefault.h"
#include "ATen/CheckGenerator.h"
#include <ATen/TypeDefault.h>
#include <ATen/CheckGenerator.h>
#ifdef _MSC_VER
#ifdef Type
@ -18,8 +18,6 @@ struct UndefinedType final : public TypeDefault {
virtual Backend backend() const override;
virtual Allocator* allocator() const override;
virtual Device getDeviceFromPtr(void* data) const override;
virtual Storage storage(bool resizable = false) const override;
virtual Storage storage(size_t size, bool resizable = false) const override;
virtual Storage storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter) const override;
virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const override;
virtual std::unique_ptr<Generator> generator() const override;

View File

@ -1,4 +1,4 @@
#include "ATen/Utils.h"
#include <ATen/Utils.h>
#include <stdarg.h>
#include <stdexcept>
#include <typeinfo>

View File

@ -1,11 +1,11 @@
#pragma once
#include "ATen/core/ATenGeneral.h"
#include <ATen/core/ATenGeneral.h>
#include <c10/core/StorageImpl.h>
#include "ATen/core/UndefinedTensorImpl.h"
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/core/ScalarType.h>
#include "ATen/Formatting.h"
#include <ATen/Formatting.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>
@ -73,6 +73,9 @@ static inline TensorImpl* checked_tensor_unwrap(const Tensor& expr, const char *
AT_ERROR("Expected object of scalar type ", scalar_type, " but got scalar type ", expr.scalar_type(),
" for argument #", pos, " '", name, "'");
}
if (expr.is_variable()) {
AT_ERROR("Expected Tensor (not Variable) for argument #", pos, " '", name, "'");
}
return expr.unsafeGetTensorImpl();
}
@ -88,7 +91,11 @@ static inline std::vector<TensorImpl*> checked_tensor_list_unwrap(ArrayRef<Tenso
}
if (expr.scalar_type() != scalar_type) {
AT_ERROR("Expected object of scalar type ", scalar_type, " but got scalar type ", expr.scalar_type(),
" for sequence elment ", i , " in sequence argument at position #", pos, " '", name, "'");
" for sequence element ", i , " in sequence argument at position #", pos, " '", name, "'");
}
if (expr.is_variable()) {
AT_ERROR("Expected Tensor (not Variable) for sequence element ",
i , " in sequence argument at position #", pos, " '", name, "'");
}
unwrapped.emplace_back(expr.unsafeGetTensorImpl());
}

View File

@ -1,10 +1,14 @@
#pragma once
#include "ATen/core/WrapDimMinimal.h"
#include "ATen/core/TensorImpl.h"
#include <c10/core/WrapDimMinimal.h>
#include <c10/core/TensorImpl.h>
namespace at {
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) {
return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}
static inline int64_t maybe_wrap_dim(int64_t dim, TensorImpl *tensor) {
return maybe_wrap_dim(dim, tensor->dim());
}
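A small sketch of the wrapping behavior these helpers delegate to — negative dims index from the end:
```
#include <ATen/WrapDimUtils.h>

// For a tensor with 4 dims, -1 wraps to 3 and 0 stays 0;
// out-of-range dims raise an error.
int64_t last = at::maybe_wrap_dim(/*dim=*/-1, /*dim_post_expr=*/4); // == 3
```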

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/core/TensorImpl.h"
#include "ATen/WrapDimUtils.h"
#include <c10/core/TensorImpl.h>
#include <ATen/WrapDimUtils.h>
#include <sstream>
#include <bitset>

View File

@ -1,3 +1,3 @@
#pragma once
#include "c10/macros/Macros.h"
#include <c10/macros/Macros.h>

View File

@ -1,2 +1,2 @@
#include "c10/util/Backtrace.h"
#include "c10/util/Type.h"
#include <c10/util/Backtrace.h>
#include <c10/util/Type.h>

View File

@ -6,12 +6,6 @@ FILE(GLOB ATen_CORE_SRCS "*.cpp")
FILE(GLOB ATen_CORE_TEST_SRCS "*_test.cpp")
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
# see the source file for explanation
set_source_files_properties(
${CMAKE_CURRENT_SOURCE_DIR}/register_symbols.cpp
PROPERTIES COMPILE_FLAGS -O0
)
# Pass to parent
set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE)
set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE)

View File

@ -1,12 +0,0 @@
#pragma once
#include <c10/macros/Macros.h>
namespace caffe2 {
class TypeMeta;
} // namespace caffe2
namespace at {
CAFFE2_API void set_default_dtype(caffe2::TypeMeta dtype);
CAFFE2_API const caffe2::TypeMeta& get_default_dtype();
} // namespace at

View File

@ -1,4 +1,4 @@
#include "ATen/core/Formatting.h"
#include <ATen/core/Formatting.h>
#include <cmath>
#include <cstdint>

View File

@ -1,2 +1,2 @@
#pragma once
#include "c10/Half.h"
#include <c10/Half.h>

View File

@ -28,7 +28,7 @@
#include <ATen/core/VariableHooksInterface.h>
#include <c10/util/Exception.h>
#include <ATen/core/LegacyDeviceTypeInit.h>
#include <ATen/core/TensorImpl.h>
#include <c10/core/TensorImpl.h>
namespace at {

View File

@ -1,2 +1,2 @@
#pragma once
#include "c10/macros/Macros.h"
#include <c10/macros/Macros.h>

View File

@ -4,23 +4,29 @@
#include <c10/core/Layout.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include "ATen/core/SparseTensorRef.h"
#include <ATen/core/SparseTensorRef.h>
#include <c10/core/Storage.h>
#include "ATen/core/TensorAccessor.h"
#include "ATen/core/TensorImpl.h"
#include "ATen/core/UndefinedTensorImpl.h"
#include <ATen/core/TensorAccessor.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <ATen/core/LegacyTypeDispatch.h>
namespace c10 {
struct TensorOptions;
}
namespace at {
struct Generator;
struct Type;
class Tensor;
struct TensorOptions;
} // namespace at
namespace at {
class Tensor;
using TensorList = ArrayRef<Tensor>;
// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which
// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr.
//
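A short illustration of the intrusive-refcount semantics the comment describes (a sketch, not part of this diff):
```
at::Tensor a = at::zeros({2});
at::Tensor b = a;   // copies the handle and bumps the TensorImpl refcount
b.add_(1);          // mutates the storage both handles share
// a observes the update too: one TensorImpl, two Tensor handles
```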
@ -292,10 +298,8 @@ public:
Tensor argmax() const;
Tensor argmin(int64_t dim, bool keepdim=false) const;
Tensor argmin() const;
Tensor as_strided(IntList size, IntList stride) const;
Tensor & as_strided_(IntList size, IntList stride);
Tensor as_strided(IntList size, IntList stride, int64_t storage_offset) const;
Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset);
Tensor as_strided(IntList size, IntList stride, c10::optional<int64_t> storage_offset=c10::nullopt) const;
Tensor & as_strided_(IntList size, IntList stride, c10::optional<int64_t> storage_offset=c10::nullopt);
Tensor asin() const;
Tensor & asin_();
Tensor atan() const;
@ -443,17 +447,18 @@ public:
Tensor & squeeze_();
Tensor & squeeze_(int64_t dim);
Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const;
Tensor stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const;
Tensor stft(int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const Tensor & window={}, bool normalized=false, bool onesided=true) const;
int64_t stride(int64_t dim) const;
Tensor sum(ScalarType dtype) const;
Tensor sum() const;
Tensor sum(IntList dim, bool keepdim, ScalarType dtype) const;
Tensor sum(IntList dim, bool keepdim=false) const;
Tensor sum(IntList dim, ScalarType dtype) const;
Tensor sum_to_size(IntList size) const;
Tensor sqrt() const;
Tensor & sqrt_();
Tensor std(bool unbiased=true) const;
Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const;
Tensor std(IntList dim, bool unbiased=true, bool keepdim=false) const;
Tensor prod(ScalarType dtype) const;
Tensor prod() const;
Tensor prod(int64_t dim, bool keepdim, ScalarType dtype) const;
@ -480,7 +485,7 @@ public:
Tensor view_as(const Tensor & other) const;
Tensor where(const Tensor & condition, const Tensor & other) const;
Tensor norm(Scalar p=2) const;
Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const;
Tensor norm(c10::optional<Scalar> p, int64_t dim, bool keepdim=false) const;
Tensor clone() const;
Tensor & resize_as_(const Tensor & the_template);
Tensor pow(Scalar exponent) const;
@ -627,7 +632,7 @@ public:
std::tuple<Tensor,Tensor> eig(bool eigenvectors=false) const;
std::tuple<Tensor,Tensor,Tensor> svd(bool some=true, bool compute_uv=true) const;
Tensor cholesky(bool upper=false) const;
Tensor potrs(const Tensor & input2, bool upper=true) const;
Tensor cholesky_solve(const Tensor & input2, bool upper=false) const;
Tensor potri(bool upper=true) const;
std::tuple<Tensor,Tensor> pstrf(bool upper=true, Scalar tol=-1) const;
std::tuple<Tensor,Tensor> qr() const;
@ -732,4 +737,4 @@ Tensor make_tensor(Args&&... args) {
} // namespace at
#include "ATen/core/TensorMethods.h"
#include <ATen/core/TensorMethods.h>

View File

@ -1,5 +1,5 @@
#include "gtest/gtest.h"
#include "caffe2/core/tensor.h"
#include <gtest/gtest.h>
#include <caffe2/core/tensor.h>
TEST(TensorImplTest, Caffe2Constructor) {
caffe2::Tensor tensor(caffe2::CPU);

View File

@ -1,10 +1,11 @@
#pragma once
#include "ATen/core/Tensor.h"
#include <ATen/core/Tensor.h>
#include <c10/core/Scalar.h>
#include "ATen/core/SparseTensorRef.h"
#include "ATen/core/Type.h"
#include "ATen/core/TensorOptions.h"
#include <c10/macros/Macros.h>
#include <ATen/core/SparseTensorRef.h>
#include <ATen/core/Type.h>
#include <c10/core/TensorOptions.h>
namespace at {
@ -114,16 +115,10 @@ inline Tensor Tensor::argmin(int64_t dim, bool keepdim) const {
inline Tensor Tensor::argmin() const {
return type().argmin(*this);
}
inline Tensor Tensor::as_strided(IntList size, IntList stride) const {
return type().as_strided(*this, size, stride);
}
inline Tensor & Tensor::as_strided_(IntList size, IntList stride) {
return type().as_strided_(*this, size, stride);
}
inline Tensor Tensor::as_strided(IntList size, IntList stride, int64_t storage_offset) const {
inline Tensor Tensor::as_strided(IntList size, IntList stride, c10::optional<int64_t> storage_offset) const {
return type().as_strided(*this, size, stride, storage_offset);
}
inline Tensor & Tensor::as_strided_(IntList size, IntList stride, int64_t storage_offset) {
inline Tensor & Tensor::as_strided_(IntList size, IntList stride, c10::optional<int64_t> storage_offset) {
return type().as_strided_(*this, size, stride, storage_offset);
}
inline Tensor Tensor::asin() const {
@ -567,7 +562,7 @@ inline Tensor & Tensor::squeeze_(int64_t dim) {
inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const {
return type().sspaddmm(*this, mat1, mat2, beta, alpha);
}
inline Tensor Tensor::stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const {
inline Tensor Tensor::stft(int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const Tensor & window, bool normalized, bool onesided) const {
return type().stft(*this, n_fft, hop_length, win_length, window, normalized, onesided);
}
inline int64_t Tensor::stride(int64_t dim) const {
@ -588,6 +583,9 @@ inline Tensor Tensor::sum(IntList dim, bool keepdim) const {
inline Tensor Tensor::sum(IntList dim, ScalarType dtype) const {
return type().sum(*this, dim, dtype);
}
inline Tensor Tensor::sum_to_size(IntList size) const {
return type().sum_to_size(*this, size);
}
inline Tensor Tensor::sqrt() const {
return type().sqrt(*this);
}
@ -597,7 +595,7 @@ inline Tensor & Tensor::sqrt_() {
inline Tensor Tensor::std(bool unbiased) const {
return type().std(*this, unbiased);
}
inline Tensor Tensor::std(int64_t dim, bool unbiased, bool keepdim) const {
inline Tensor Tensor::std(IntList dim, bool unbiased, bool keepdim) const {
return type().std(*this, dim, unbiased, keepdim);
}
inline Tensor Tensor::prod(ScalarType dtype) const {
@ -678,7 +676,7 @@ inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) cons
inline Tensor Tensor::norm(Scalar p) const {
return type().norm(*this, p);
}
inline Tensor Tensor::norm(Scalar p, int64_t dim, bool keepdim) const {
inline Tensor Tensor::norm(c10::optional<Scalar> p, int64_t dim, bool keepdim) const {
return type().norm(*this, p, dim, keepdim);
}
inline Tensor Tensor::clone() const {
@ -1119,8 +1117,8 @@ inline std::tuple<Tensor,Tensor,Tensor> Tensor::svd(bool some, bool compute_uv)
inline Tensor Tensor::cholesky(bool upper) const {
return type().cholesky(*this, upper);
}
inline Tensor Tensor::potrs(const Tensor & input2, bool upper) const {
return type().potrs(*this, input2, upper);
inline Tensor Tensor::cholesky_solve(const Tensor & input2, bool upper) const {
return type().cholesky_solve(*this, input2, upper);
}
inline Tensor Tensor::potri(bool upper) const {
return type().potri(*this, upper);

View File

@ -1,18 +1,18 @@
#pragma once
#include "ATen/core/ATenGeneral.h"
#include <ATen/core/ATenGeneral.h>
#include <c10/core/Allocator.h>
#include "ATen/core/Deprecated.h"
#include "ATen/core/Generator.h"
#include <ATen/core/Deprecated.h>
#include <ATen/core/Generator.h>
#include <c10/core/Layout.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include "ATen/core/SparseTensorRef.h"
#include <ATen/core/SparseTensorRef.h>
#include <c10/util/ArrayRef.h>
#include <c10/Half.h>
#include <c10/core/TensorTypeIdRegistration.h>
#include "ATen/core/Reduction.h"
#include "ATen/core/TensorOptions.h"
#include <ATen/core/Reduction.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Optional.h>
@ -35,9 +35,11 @@ struct Storage;
namespace at {
class Tensor;
using TensorList = ArrayRef<Tensor>;
class Context;
struct Generator;
class Tensor;
static inline void noop_deleter(void*) {}
@ -97,8 +99,6 @@ struct CAFFE2_API Type {
bool is_undefined() const noexcept { return is_undefined_; }
virtual Allocator * allocator() const = 0;
virtual Device getDeviceFromPtr(void * data) const = 0;
virtual Storage storage(bool resizable = false) const = 0;
virtual Storage storage(size_t size, bool resizable = false) const = 0;
virtual Storage storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const = 0;
virtual std::unique_ptr<Generator> generator() const = 0;
@ -135,7 +135,10 @@ struct CAFFE2_API Type {
return backendToDeviceType(backend());
}
virtual Tensor copy(const Tensor & src, bool non_blocking=false, c10::optional<Device> to_device={}) const = 0;
virtual Tensor copy(
const Tensor& src,
bool non_blocking = false,
c10::optional<Device> to_device = {}) const = 0;
virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0;
virtual void backward(
@ -149,7 +152,6 @@ struct CAFFE2_API Type {
virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0;
virtual Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const = 0;
virtual Tensor scalarTensor(Scalar s) const = 0;
bool operator==(const Type& other) const {
return this == &other;
@ -168,7 +170,7 @@ struct CAFFE2_API Type {
/// Constructs the `TensorOptions` from a type and a Device. Asserts that
/// the device type matches the device type of the type.
TensorOptions options(optional<Device> device_opt) const {
TensorOptions options(c10::optional<Device> device_opt) const {
if (!device_opt.has_value()) {
return options(-1);
} else {
@ -203,10 +205,8 @@ struct CAFFE2_API Type {
virtual Tensor argmax(const Tensor & self) const = 0;
virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0;
virtual Tensor argmin(const Tensor & self) const = 0;
virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0;
virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0;
virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0;
virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0;
virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, c10::optional<int64_t> storage_offset) const = 0;
virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, c10::optional<int64_t> storage_offset) const = 0;
virtual Tensor asin(const Tensor & self) const = 0;
virtual Tensor & asin_(Tensor & self) const = 0;
virtual Tensor atan(const Tensor & self) const = 0;
@ -354,17 +354,18 @@ struct CAFFE2_API Type {
virtual Tensor & squeeze_(Tensor & self) const = 0;
virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0;
virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0;
virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const = 0;
virtual Tensor stft(const Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const Tensor & window, bool normalized, bool onesided) const = 0;
virtual int64_t stride(const Tensor & self, int64_t dim) const = 0;
virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0;
virtual Tensor sum(const Tensor & self) const = 0;
virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0;
virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim) const = 0;
virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0;
virtual Tensor sum_to_size(const Tensor & self, IntList size) const = 0;
virtual Tensor sqrt(const Tensor & self) const = 0;
virtual Tensor & sqrt_(Tensor & self) const = 0;
virtual Tensor std(const Tensor & self, bool unbiased) const = 0;
virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0;
virtual Tensor std(const Tensor & self, IntList dim, bool unbiased, bool keepdim) const = 0;
virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0;
virtual Tensor prod(const Tensor & self) const = 0;
virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0;
@ -391,7 +392,7 @@ struct CAFFE2_API Type {
virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0;
virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0;
virtual Tensor norm(const Tensor & self, Scalar p) const = 0;
virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0;
virtual Tensor norm(const Tensor & self, c10::optional<Scalar> p, int64_t dim, bool keepdim) const = 0;
virtual Tensor clone(const Tensor & self) const = 0;
virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0;
virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0;
@ -538,7 +539,7 @@ struct CAFFE2_API Type {
virtual std::tuple<Tensor,Tensor> eig(const Tensor & self, bool eigenvectors) const = 0;
virtual std::tuple<Tensor,Tensor,Tensor> svd(const Tensor & self, bool some, bool compute_uv) const = 0;
virtual Tensor cholesky(const Tensor & self, bool upper) const = 0;
virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) const = 0;
virtual Tensor cholesky_solve(const Tensor & self, const Tensor & input2, bool upper) const = 0;
virtual Tensor potri(const Tensor & self, bool upper) const = 0;
virtual std::tuple<Tensor,Tensor> pstrf(const Tensor & self, bool upper, Scalar tol) const = 0;
virtual std::tuple<Tensor,Tensor> qr(const Tensor & self) const = 0;
@ -588,4 +589,4 @@ protected:
} // namespace at
#include "ATen/core/Tensor.h"
#include <ATen/core/Tensor.h>

View File

@ -1,34 +1 @@
#pragma once
#include "ATen/core/TensorImpl.h"
namespace at {
struct CAFFE2_API UndefinedTensorImpl final : public TensorImpl {
public:
// Without this, we get:
// error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code
// (ostensibly because the constexpr tricks MSVC into trying to compile this
// function for device as well).
#ifdef _WIN32
static inline TensorImpl * singleton() {
#else
static constexpr inline TensorImpl * singleton() {
#endif
return &_singleton;
}
IntList sizes() const override;
IntList strides() const override;
int64_t size(int64_t d) const override;
int64_t stride(int64_t d) const override;
int64_t dim() const override;
const Storage& storage() const override;
int64_t storage_offset() const override;
private:
UndefinedTensorImpl();
static UndefinedTensorImpl _singleton;
public:
friend struct UndefinedType;
};
} // namespace at
#include <c10/core/UndefinedTensorImpl.h>

View File

@ -2,7 +2,7 @@
#include <unordered_set>
#include <vector>
#include <ATen/core/interned_strings.h>
#include "c10/util/Exception.h"
#include <c10/util/Exception.h>
namespace c10 {
class AliasInfo {

View File

@ -43,6 +43,7 @@ _(aten, _cast_Short) \
_(aten, _cat) \
_(aten, _ceil) \
_(aten, _cholesky_helper) \
_(aten, _cholesky_solve_helper) \
_(aten, _convolution) \
_(aten, _convolution_double_backward) \
_(aten, _convolution_nogroup) \
@ -102,7 +103,6 @@ _(aten, _pack_padded_sequence_backward) \
_(aten, _pad_packed_sequence) \
_(aten, _pdist_backward) \
_(aten, _pdist_forward) \
_(aten, _potrs_helper) \
_(aten, _prod) \
_(aten, _prodall) \
_(aten, _range) \
@ -242,6 +242,7 @@ _(aten, ceil) \
_(aten, celu) \
_(aten, chain_matmul) \
_(aten, cholesky) \
_(aten, cholesky_solve) \
_(aten, chunk) \
_(aten, clamp) \
_(aten, clamp_max) \
@ -523,7 +524,6 @@ _(aten, pixel_shuffle) \
_(aten, poisson) \
_(aten, polygamma) \
_(aten, potri) \
_(aten, potrs) \
_(aten, pow) \
_(aten, prelu) \
_(aten, prelu_backward) \
@ -626,6 +626,7 @@ _(aten, sub) \
_(aten, sub_) \
_(aten, rsub) \
_(aten, sum) \
_(aten, sum_to_size) \
_(aten, svd) \
_(aten, symeig) \
_(aten, t) \
@ -683,6 +684,9 @@ _(aten, unsqueeze) \
_(aten, upsample_bilinear2d) \
_(aten, upsample_bilinear2d_backward) \
_(aten, upsample_bilinear2d_forward) \
_(aten, upsample_bicubic2d) \
_(aten, upsample_bicubic2d_backward) \
_(aten, upsample_bicubic2d_forward) \
_(aten, upsample_linear1d) \
_(aten, upsample_linear1d_backward) \
_(aten, upsample_linear1d_forward) \

View File

@ -11,49 +11,6 @@ C10_DEFINE_TYPED_REGISTRY(
std::unique_ptr,
at::Device);
// First dimension of the array is `bool async`: 0 is sync,
// 1 is async (non-blocking)
static CopyBytesFunction g_copy_bytes[2][COMPILE_TIME_MAX_DEVICE_TYPES]
[COMPILE_TIME_MAX_DEVICE_TYPES];
_CopyBytesFunctionRegisterer::_CopyBytesFunctionRegisterer(
DeviceType fromType,
DeviceType toType,
CopyBytesFunction func_sync,
CopyBytesFunction func_async) {
auto from = static_cast<int>(fromType);
auto to = static_cast<int>(toType);
if (!func_async) {
// default to the sync function
func_async = func_sync;
}
CHECK(
g_copy_bytes[0][from][to] == nullptr &&
g_copy_bytes[1][from][to] == nullptr)
<< "Duplicate registration for device type pair "
<< c10::DeviceTypeName(fromType) << ", " << c10::DeviceTypeName(toType);
g_copy_bytes[0][from][to] = func_sync;
g_copy_bytes[1][from][to] = func_async;
}
void CopyBytes(
size_t nbytes,
const void* src,
Device src_device,
void* dst,
Device dst_device,
bool async) {
auto ptr = g_copy_bytes[async ? 1 : 0][static_cast<int>(src_device.type())]
[static_cast<int>(dst_device.type())];
CAFFE_ENFORCE(
ptr,
"No function found for copying from ",
c10::DeviceTypeName(src_device.type()),
" to ",
c10::DeviceTypeName(dst_device.type()));
ptr(nbytes, src, src_device, dst, dst_device);
}
} // namespace at
namespace caffe2 {

View File

@ -11,6 +11,7 @@
#include <c10/util/typeid.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>
#include <c10/core/CopyBytes.h>
namespace caffe2 {
class Event;
@ -156,39 +157,6 @@ inline std::unique_ptr<at::BaseContext> CreateContext(
} // namespace at
// TODO: move it to a separate file in c10 if possible
namespace at {
using CopyBytesFunction = void (*)(
size_t nbytes,
const void* src,
Device src_device,
void* dst,
Device dst_device);
struct CAFFE2_API _CopyBytesFunctionRegisterer {
_CopyBytesFunctionRegisterer(
DeviceType from,
DeviceType to,
CopyBytesFunction func_sync,
CopyBytesFunction func_async = nullptr);
};
#define REGISTER_COPY_BYTES_FUNCTION(from, to, ...) \
namespace { \
static _CopyBytesFunctionRegisterer C10_ANONYMOUS_VARIABLE( \
g_copy_function)(from, to, __VA_ARGS__); \
}
CAFFE2_API void CopyBytes(
size_t nbytes,
const void* src,
Device src_device,
void* dst,
Device dst_device,
bool async);
} // namespace at
namespace caffe2 {
using at::BaseContext;
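The registrar removed above (relocated to c10/core/CopyBytes.h in this diff) follows a common static-registration pattern; a hedged sketch of how a device pair might hook in, assuming the API keeps the shape shown:
```
#include <cstring>
#include <c10/core/CopyBytes.h> // assumed new home of the API per this diff

namespace at { // so the unqualified registrar name in the macro resolves

static void cpu_to_cpu(size_t nbytes, const void* src, Device,
                       void* dst, Device) {
  std::memcpy(dst, src, nbytes);
}

// Sync variant only: per the registrar's constructor, async copies fall
// back to the sync function when no async copier is supplied. Illustrative
// only — a real build already registers the CPU/CPU pair.
REGISTER_COPY_BYTES_FUNCTION(DeviceType::CPU, DeviceType::CPU, cpu_to_cpu);

} // namespace at
```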

View File

@ -1,4 +1,4 @@
#include "ATen/core/interned_strings.h"
#include <ATen/core/interned_strings.h>
#include <cstdint>
#include <cstring>
#include <iostream>
@ -7,9 +7,9 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ATen/core/interned_strings_class.h"
#include "c10/util/Exception.h"
#include "c10/util/Optional.h"
#include <ATen/core/interned_strings_class.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
namespace c10 {

View File

@ -52,14 +52,11 @@ namespace c10 {
_(prim, TupleSlice) \
_(prim, ListConstruct) \
_(prim, ListUnpack) \
_(prim, BoolToTensor) \
_(prim, NumToTensor) \
_(prim, TensorToNum) \
_(prim, ImplicitTensorToNum) \
_(prim, TensorToBool) \
_(prim, IntToFloat) \
_(prim, FloatToInt) \
_(prim, StringToFloat) \
_(prim, Bool) \
_(prim, Int) \
_(prim, Float) \
_(prim, device) \
_(prim, dtype) \
_(prim, shape) \
@ -70,7 +67,6 @@ namespace c10 {
_(prim, AnyDefined) \
_(prim, FusedConcat) \
_(prim, ConstantChunk) \
_(prim, NoneGenerator) \
_(prim, MMTreeReduce) \
_(prim, MMBatchSide) \
_(aten, warn) \
@ -78,6 +74,7 @@ namespace c10 {
_(aten, __round_to_zero_floordiv)\
_(prim, fork) \
_(prim, RaiseException) \
_(prim, Function) \
_(aten, append) \
_(aten, format) \
_(aten, __not__) \
@ -87,6 +84,7 @@ namespace c10 {
_(aten, _set_item) \
_(aten, index_put_) \
_(aten, device) \
_(aten, len) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \

View File

@ -6,8 +6,8 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ATen/core/interned_strings.h"
#include "c10/util/Exception.h"
#include <ATen/core/interned_strings.h>
#include <c10/util/Exception.h>
namespace c10 {

View File

@ -80,4 +80,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
#undef TORCH_FORALL_TAGS
void IValue::dump() const {
std::cout << *this << "\n";
}
} // namespace c10

View File

@ -2,8 +2,8 @@
#include <ATen/core/Scalar.h>
#include <ATen/core/Tensor.h>
#include <ATen/core/TensorImpl.h>
#include <ATen/core/UndefinedTensorImpl.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <ATen/core/blob.h>
#include <c10/util/intrusive_ptr.h>
#include <ATen/core/thread_pool.h>
@ -134,6 +134,8 @@ struct CAFFE2_API IValue final {
return *this;
}
void dump() const;
bool isAliasOf(const IValue& rhs) const {
if (this->tag != rhs.tag) {
// Trivially don't alias if the type is different
@ -510,14 +512,49 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
}
public:
struct CAFFE2_API FutureError final : public std::exception {
FutureError(std::string&& error_msg_)
: error_msg(std::move(error_msg_)) {}
FutureError() = default;
const char* what() const noexcept override {
return error_msg.c_str();
}
std::string error_msg;
};
/**
* Wait on the future until it completes.
*/
void wait() {
if (completed()) {
return;
}
c10::global_work_queue().workOnTasksUntilCompleted(intrusive_from_this());
std::condition_variable finished;
bool fired = false;
// Add a callback to notify the current thread
// when the current future completes.
addCallback([&] {
std::unique_lock<std::mutex> lock(mutex_);
finished.notify_all();
fired = true;
});
// The current thread will be blocked until the above callback fires.
std::unique_lock<std::mutex> lock(mutex_);
while (!fired) {
finished.wait(lock);
}
AT_ASSERT(completed());
}
/**
* Explicitly mark the future as completed with the output value.
*/
void markCompleted(IValue value) {
{
// This is not to protect completed_ but to create a barrier
@ -528,21 +565,39 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
value_ = std::move(value);
}
// There is no need to protect callbacks anymore.
// Once completed_ is set to true, no one can add new callback to the list.
for (auto& callback : callbacks) {
callback();
fireCallbacks();
}
void markCompleted(FutureError&& error_) {
{
// This is not to protect completed_ but to create a barrier
// from possible addCallback() calls
std::unique_lock<std::mutex> lock(mutex_);
AT_ASSERT(!completed());
completed_ = true;
has_error = true;
error = std::move(error_);
}
callbacks.clear();
fireCallbacks();
}
// Get the result of the current future.
IValue value() {
std::unique_lock<std::mutex> lock(mutex_);
AT_ASSERT(completed());
if (has_error) {
throw error;
}
return value_;
}
/**
* Add a callback to the future.
* The callbacks will be executed once the future completes.
* If the future has already completed,
* this function will execute the callback immediately.
*/
void addCallback(std::function<void(void)> callback) {
std::unique_lock<std::mutex> lock(mutex_);
if (completed()) {
@ -558,23 +613,43 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
return completed_;
}
std::mutex& get_mutex() {
return mutex_;
}
CAFFE2_API friend std::ostream& operator<<(
std::ostream& out,
const Future& v);
private:
void fireCallbacks() {
AT_ASSERT(completed());
// There is no need to protect callbacks with the lock.
// Once completed_ is set to true, no one can add a new callback to the list.
for (auto& callback : callbacks) {
callback();
}
callbacks.clear();
}
std::mutex mutex_;
IValue value_; // when finished the value
std::atomic_bool completed_ = {false}; // is this future complete
std::vector<std::function<void(void)>> callbacks;
bool has_error = false;
FutureError error;
};
#undef TORCH_FORALL_TAGS
namespace detail {
struct _guarded_unsigned_long_unique_dummy final {
_guarded_unsigned_long_unique_dummy(int64_t){};
};
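// On platforms where `unsigned long` is the same type as uint32_t or
// uint64_t, a DEFINE_TO(unsigned long, ...) below would collide with the
// fixed-width specializations; this dummy type stands in so the alias
// below only names a distinct type when one actually exists.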
using _guarded_unsigned_long = c10::guts::conditional_t<
std::is_same<unsigned long, uint32_t>::value ||
std::is_same<unsigned long, uint64_t>::value,
_guarded_unsigned_long_unique_dummy,
unsigned long>;
} // namespace detail
#define DEFINE_TO(type, method_name) \
template<> \
@ -587,7 +662,16 @@ inline type IValue::to<type>() const & { \
}
DEFINE_TO(at::Tensor, toTensor)
DEFINE_TO(c10::intrusive_ptr<ivalue::Tuple>, toTuple)
DEFINE_TO(float, toDouble)
DEFINE_TO(double, toDouble)
DEFINE_TO(unsigned char, toInt)
DEFINE_TO(signed char, toInt)
DEFINE_TO(unsigned short, toInt)
DEFINE_TO(short, toInt)
DEFINE_TO(int, toInt)
DEFINE_TO(uint32_t, toInt)
DEFINE_TO(uint64_t, toInt)
DEFINE_TO(detail::_guarded_unsigned_long, toInt)
DEFINE_TO(int64_t, toInt)
DEFINE_TO(bool, toBool)
DEFINE_TO(c10::intrusive_ptr<ivalue::DoubleList>, toDoubleList)
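// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] Why the _guarded_unsigned_long
// trick above is needed: on platforms where `unsigned long` is the very same
// type as uint32_t or uint64_t, a direct DEFINE_TO(unsigned long, toInt)
// would collide with the uint32_t/uint64_t specialization. Routing it
// through a unique dummy type keeps the set of specializations distinct on
// every platform. All names below are illustrative only.
#include <cstdint>
#include <type_traits>

struct dummy_ulong {
  dummy_ulong(int64_t) {}
};

using guarded_ulong = typename std::conditional<
    std::is_same<unsigned long, uint32_t>::value ||
        std::is_same<unsigned long, uint64_t>::value,
    dummy_ulong,           // alias case: specialize on the dummy instead
    unsigned long>::type;  // distinct type: specialize on it directly

template <typename T> int to();  // primary template, intentionally undefined
template <> int to<uint32_t>() { return 32; }
template <> int to<uint64_t>() { return 64; }
template <> int to<guarded_ulong>() { return 99; }  // never a duplicate

int main() { return to<uint64_t>() == 64 ? 0 : 1; }
// ---------------------------------------------------------------------------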

View File

@ -532,6 +532,9 @@ struct CAFFE2_API FutureType : public SingleElementType<TypeKind::FutureType, Fu
ss << "Future[" << getElementType()->python_str() << "]";
return ss.str();
}
TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
return create(contained_types.at(0));
}
private:
FutureType(TypePtr elem) : SingleElementType(elem) {}
};
@ -868,7 +871,6 @@ inline TypePtr unshapedType(const TypePtr& type) {
}
inline TypePtr CompleteTensorType::fromNumberType(TypePtr typ) {
AT_ASSERT(typ->isSubtypeOf(NumberType::get()));
if (typ->isSubtypeOf(IntType::get())) {
return CompleteTensorType::create(at::kLong, at::kCPU, {});
} else if (typ->isSubtypeOf(FloatType::get())) {
@ -902,7 +904,6 @@ TypePtr getTypePtr() {
" could not be converted to any of the known types { ",
C10_FORALL_TYPES(TYPE_STR) "}");
#undef TYPE_STR
return nullptr;
}
template<> inline TypePtr getTypePtr<at::Tensor>() { return DynamicType::get(); }
@ -915,7 +916,7 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListTyp
template<> inline TypePtr getTypePtr<std::vector<double>>() { return ListType::ofFloats(); }
template<> inline TypePtr getTypePtr<std::vector<int64_t>>() { return ListType::ofInts(); }
CAFFE2_API TypePtr inferTypeFrom(const IValue& value);
CAFFE2_API TypePtr incompleteInferTypeFrom(const IValue& value);
using TypeEnv = std::unordered_map<std::string, TypePtr>;
struct MatchTypeReturn {

View File

@ -1,16 +1,38 @@
#include "ATen/core/interned_strings_class.h"
// This file is compiled with -O0 because the fully-macro-expanded
// function is huge and only called once at startup.
#include <ATen/core/interned_strings_class.h>
namespace c10 {
namespace {
struct Entry {
const char* const qual_name;
const char* const unqual_name;
const Symbol sym;
const Symbol ns_sym;
};
constexpr Entry entries[] = {
#define SYMBOL_ENTRY(n, s) {#n "::" #s, #s, n::s, namespaces::n},
FORALL_NS_SYMBOLS(SYMBOL_ENTRY)
#undef SYMBOL_ENTRY
};
} // namespace
InternedStrings::InternedStrings()
: sym_to_info_(static_cast<size_t>(_keys::num_symbols)) {
#define REGISTER_SYMBOL(n, s) \
string_to_sym_[#n "::" #s] = n::s; \
sym_to_info_[n::s] = {namespaces::n, #n "::" #s, #s};
FORALL_NS_SYMBOLS(REGISTER_SYMBOL)
#undef REGISTER_SYMBOL
// Instead of a loop, this could be done by expanding the
// assignments directly into FORALL_NS_SYMBOLS, but that would create
// a huge function (thanks to all the std::string constructors and
// operator[]s) which would take several minutes to optimize. A
// static C array of constexpr-constructible structs, by contrast,
// takes no time at all to compile.
for (const auto& entry : entries) {
string_to_sym_[entry.qual_name] = entry.sym;
sym_to_info_[entry.sym] = {
entry.ns_sym, entry.qual_name, entry.unqual_name};
}
}
} // namespace c10
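// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] The compile-time pattern adopted
// above, in miniature: a macro expands into a constexpr table once, and a
// small runtime loop fills the map, instead of macro-expanding thousands of
// std::string constructions and map insertions. The symbol list is made up.
#include <iostream>
#include <string>
#include <unordered_map>

#define FORALL_SYMS(_) _(add) _(mul) _(relu)  // stand-in symbol list
enum Sym { add, mul, relu, num_syms };

struct Entry {
  const char* name;
  Sym sym;
};
constexpr Entry entries[] = {
#define SYMBOL_ENTRY(s) {#s, s},
    FORALL_SYMS(SYMBOL_ENTRY)
#undef SYMBOL_ENTRY
};

int main() {
  std::unordered_map<std::string, Sym> string_to_sym;
  for (const auto& e : entries) {  // cheap-to-compile runtime loop
    string_to_sym[e.name] = e.sym;
  }
  std::cout << string_to_sym.at("mul") << '\n';  // prints 1
}
// ---------------------------------------------------------------------------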

View File

@ -65,20 +65,6 @@ void ThreadPool::waitWorkComplete() {
}
}
void ThreadPool::workOnTasksUntilCompleted(
c10::intrusive_ptr<ivalue::Future> future) {
if (future->completed()) {
return;
}
std::condition_variable finished;
future->addCallback([&] { finished.notify_all(); });
std::unique_lock<std::mutex> future_lock(future->get_mutex());
while (!future->completed()) {
finished.wait(future_lock);
}
}
void ThreadPool::main_loop(std::size_t index) {
init_thread();

View File

@ -53,7 +53,7 @@ class CAFFE2_API ThreadPool : public c10::TaskThreadPoolBase {
std::mutex mutex_;
std::condition_variable condition_;
std::condition_variable completed_;
bool running_;
std::atomic_bool running_;
bool complete_;
std::size_t available_;
std::size_t total_;
@ -89,9 +89,6 @@ class CAFFE2_API ThreadPool : public c10::TaskThreadPoolBase {
/// @brief Wait for queue to be empty
void waitWorkComplete();
// @brief Wait for the specific future to finish in the queue
void workOnTasksUntilCompleted(c10::intrusive_ptr<ivalue::Future> future);
protected:
virtual void init_thread() {}

View File

@ -116,7 +116,13 @@ ListTypePtr ListType::ofBools() {
return value;
}
TypePtr inferTypeFrom(const IValue& value) {
// Why "incomplete"? You cannot always completely recover a type from
// an IValue: List[List[int]] and List[List[Tensor]] will both
// report ivalue.isGenericList(), so the element type cannot be recovered.
// The only appropriate place to use this is where you know you are
// only dealing with a subset of objects for which the type can be
// recovered, like in the tracer.
TypePtr incompleteInferTypeFrom(const IValue& value) {
if (value.isTensor()) {
return CompleteTensorType::create(value.toTensor());
} else if (value.isDouble()) {
@ -136,11 +142,11 @@ TypePtr inferTypeFrom(const IValue& value) {
} else if (value.isDoubleList()) {
return ListType::ofFloats();
} else if (value.isTuple()) {
return TupleType::create(fmap(value.toTuple()->elements(), inferTypeFrom));
return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom));
} else if (value.isDevice()) {
return DeviceObjType::get();
}
AT_ASSERTM(false, "Unhandled IValue kind in inferTypeFrom");
AT_ERROR("Type cannot be accurately recovered from this IValue.");
}
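// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] Why recovery is "incomplete" in
// the generic-list case: once nested lists are type-erased into a generic
// element, nothing distinguishes List[List[int]] from List[List[Tensor]]
// without probing a live element (impossible for an empty list). Types below
// are illustrative stand-ins, not the real IValue machinery.
#include <iostream>
#include <memory>
#include <vector>

struct Erased {  // stand-in for a type-erased generic IValue element
  virtual ~Erased() = default;
};
template <typename T>
struct Holder : Erased {
  std::vector<T> v;
};

using GenericList = std::vector<std::shared_ptr<Erased>>;

int main() {
  GenericList ints{std::make_shared<Holder<int>>()};
  GenericList floats{std::make_shared<Holder<float>>()};
  GenericList empty;  // element type is unrecoverable here, no matter what
  // All three are just GenericList at this point; the static element type
  // is gone -- hence the tracer-only caveat in the comment above.
  std::cout << ints.size() + floats.size() + empty.size() << '\n';
}
// ---------------------------------------------------------------------------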
c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2) {

View File

@ -1,5 +1,5 @@
#pragma once
#include "vec256.h"
#include <ATen/cpu/vec256/vec256.h>
namespace at { namespace vec256 {
@ -10,10 +10,10 @@ inline scalar_t vec_reduce_all(
vec256::Vec256<scalar_t> acc_vec,
int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
scalar_t acc_arr[Vec::size];
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
for (int64_t i = 1; i < size; i++) {
scalar_t acc_arr_next[Vec::size];
scalar_t acc_arr_next[Vec::size()];
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next);
acc_vec = vec_fun(acc_vec, acc_vec_next);
@ -25,11 +25,11 @@ inline scalar_t vec_reduce_all(
template <typename scalar_t, typename Op>
inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
if (size < Vec::size)
if (size < Vec::size())
return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
int64_t d = Vec::size;
int64_t d = Vec::size();
Vec acc_vec = Vec::loadu(data);
for (; d < size - (size % Vec::size); d += Vec::size) {
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(data + d);
acc_vec = vec_fun(acc_vec, data_vec);
}
@ -37,7 +37,7 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
Vec data_vec = Vec::loadu(data + d, size - d);
acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
}
return vec_reduce_all(vec_fun, acc_vec, Vec::size);
return vec_reduce_all(vec_fun, acc_vec, Vec::size());
}
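// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] A scalar rendering of the loop
// structure above, with a made-up width W standing in for Vec256<T>::size():
// a main loop over full W-wide chunks, then a tail pass over the remaining
// size % W elements (which the real code handles with Vec::set).
#include <cstdio>

constexpr int W = 8;  // hypothetical vector width

float reduce_sum(const float* data, int size) {
  float acc = 0.f;
  int d = 0;
  for (; d < size - (size % W); d += W) {  // same bound as the loops above
    for (int i = 0; i < W; ++i) acc += data[d + i];
  }
  for (; d < size; ++d) {  // tail: fewer than W elements remain
    acc += data[d];
  }
  return acc;
}

int main() {
  float v[19];
  for (int i = 0; i < 19; ++i) v[i] = 1.f;
  std::printf("%g\n", reduce_sum(v, 19));  // prints 19
}
// ---------------------------------------------------------------------------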
template <typename scalar_t, typename MapOp, typename ReduceOp>
@ -47,11 +47,11 @@ inline scalar_t map_reduce_all(
scalar_t* data,
int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
if (size < Vec::size)
if (size < Vec::size())
return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
int64_t d = Vec::size;
int64_t d = Vec::size();
Vec acc_vec = map_fun(Vec::loadu(data));
for (; d < size - (size % Vec::size); d += Vec::size) {
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(data + d);
data_vec = map_fun(data_vec);
acc_vec = red_fun(acc_vec, data_vec);
@ -61,7 +61,7 @@ inline scalar_t map_reduce_all(
data_vec = map_fun(data_vec);
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
}
return vec_reduce_all(red_fun, acc_vec, Vec::size);
return vec_reduce_all(red_fun, acc_vec, Vec::size());
}
template <typename scalar_t, typename MapOp, typename ReduceOp>
@ -72,15 +72,15 @@ inline scalar_t map2_reduce_all(
const scalar_t* data2,
int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
if (size < Vec::size) {
if (size < Vec::size()) {
Vec data_vec = Vec::loadu(data, size);
Vec data2_vec = Vec::loadu(data2, size);
data_vec = map_fun(data_vec, data2_vec);
return vec_reduce_all(red_fun, data_vec, size);
}
int64_t d = Vec::size;
int64_t d = Vec::size();
Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
for (; d < size - (size % Vec::size); d += Vec::size) {
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(data + d);
Vec data2_vec = Vec::loadu(data2 + d);
data_vec = map_fun(data_vec, data2_vec);
@ -92,7 +92,7 @@ inline scalar_t map2_reduce_all(
data_vec = map_fun(data_vec, data2_vec);
acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
}
return vec_reduce_all(red_fun, acc_vec, Vec::size);
return vec_reduce_all(red_fun, acc_vec, Vec::size());
}
template <typename scalar_t, typename Op>
@ -103,7 +103,7 @@ inline void map(
int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
int64_t d = 0;
for (; d < size - (size % Vec::size); d += Vec::size) {
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec output_vec = vec_fun(Vec::loadu(input_data + d));
output_vec.store(output_data + d);
}
@ -122,7 +122,7 @@ inline void map2(
int64_t size) {
using Vec = vec256::Vec256<scalar_t>;
int64_t d = 0;
for (; d < size - (size % Vec::size); d += Vec::size) {
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(input_data + d);
Vec data_vec2 = Vec::loadu(input_data2 + d);
Vec output_vec = vec_fun(data_vec, data_vec2);

View File

@ -1,11 +1,11 @@
#pragma once
#include "intrinsics.h"
#include <ATen/cpu/vec256/intrinsics.h>
#include "vec256_base.h"
#include "vec256_float.h"
#include "vec256_double.h"
#include "vec256_int.h"
#include <ATen/cpu/vec256/vec256_base.h>
#include <ATen/cpu/vec256/vec256_float.h>
#include <ATen/cpu/vec256/vec256_double.h>
#include <ATen/cpu/vec256/vec256_int.h>
#include <algorithm>
#include <cstddef>
@ -15,14 +15,24 @@
namespace at {
namespace vec256 {
// Note [Acceptable use of anonymous namespace in header]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Yes, you saw right: this is an anonymous namespace in a header. This header,
// and all of its subheaders, REQUIRE their code to be entirely inlined into
// the compilation unit that uses them. It's important that these functions have
// internal linkage so that kernels for different architectures don't get
// combined during linking. It's sufficient to label free functions "static",
// but class methods must live in an unnamed namespace to have internal linkage
// (since "static" means something different in the context of classes).
namespace {
template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vec256<T>& vec) {
T buf[Vec256<T>::size];
T buf[Vec256<T>::size()];
vec.store(buf);
stream << "vec[";
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
if (i != 0) {
stream << ", ";
}

View File

@ -6,8 +6,8 @@
#include <type_traits>
#include <bitset>
#include "ATen/Utils.h"
#include "ATen/native/Copy.h"
#include <ATen/Utils.h>
#include <ATen/native/Copy.h>
#include <c10/util/C++17.h>
#if defined(__GNUC__)
@ -20,6 +20,7 @@
namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
template<size_t n> struct int_of_size;
@ -45,15 +46,49 @@ struct Vec256 {
private:
T values[32 / sizeof(T)] = {0};
public:
static constexpr int size = 32 / sizeof(T);
// Note [constexpr static function to avoid odr-usage compiler bug]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Why, you might ask, is size defined to be a static constexpr function,
// rather than a more ordinary 'static constexpr int size;' variable?
// The problem lies in the ODR rules for static constexpr members versus
// static constexpr functions. First, recall that this class (along with all
// of its derivations) lives in an anonymous namespace: it is intended to be
// *completely* inlined at its use-sites, because we need to compile this
// code multiple times for different instruction sets.
//
// Because of this constraint, we CANNOT provide a single definition for
// any static members in this class; since we want to compile the class
// multiple times, there wouldn't actually be any good place to put the
// definition. Now here is the problem: if we ODR-use a static constexpr
// member, we are *obligated* to provide a definition. Without the
// definition, you get a compile error like:
//
// relocation R_X86_64_PC32 against undefined symbol
// `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when making
// a shared object; recompile with -fPIC
//
// If this were C++17, we could replace the static constexpr variable with
// an inline variable, which doesn't require a separate definition. But we
// are not on C++17. So the next best thing is to replace the member with a
// static constexpr (and therefore inline) function, which is not subject
// to the definition requirement either.
//
// Also, technically according to the C++ standard, we don't have to define
// a constexpr variable if we never odr-use it. But it seems that some
// versions of GCC/Clang make buggy determinations of whether or not an
// identifier is odr-used, and in any case it's hard to tell if a variable
// is odr-used or not. So it's best to just cut the problem off at the root.
static constexpr int size() {
return 32 / sizeof(T);
}
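// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] The workaround above, reduced to a
// standalone pre-C++17 example. Both structs are made up for illustration.
#include <iostream>

struct WithVariable {
  static constexpr int size = 4;   // odr-used below, so C++11/14 demand a
};                                 // separate out-of-line definition...
constexpr int WithVariable::size;  // ...namely this line; omit it and the
                                   // link fails with the relocation error
                                   // quoted in the note above.

struct WithFunction {
  static constexpr int size() { return 4; }  // implicitly inline: no
};                                           // out-of-line definition needed

void take(const int& v) { std::cout << v << '\n'; }  // forces an odr-use

int main() {
  take(WithVariable::size);    // links only thanks to the definition above
  take(WithFunction::size());  // always links; the pattern this diff adopts
}
// ---------------------------------------------------------------------------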
Vec256() {}
Vec256(T val) {
for (int i = 0; i != size; i++) {
for (int i = 0; i != size(); i++) {
values[i] = val;
}
}
template<typename... Args,
typename = c10::guts::enable_if_t<(sizeof...(Args) == size)>>
typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
Vec256(Args... vals) {
values = { vals... };
}
@ -61,7 +96,7 @@ public:
static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
int64_t mask = mask_;
Vec256 vec;
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
if (mask & 0x01) {
vec[i] = b[i];
} else {
@ -74,9 +109,9 @@ public:
static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
const Vec256<T>& mask) {
Vec256 vec;
int_same_size_t<T> buffer[size];
int_same_size_t<T> buffer[size()];
mask.store(buffer);
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
if (buffer[i] & 0x01)
{
vec[i] = b[i];
@ -88,14 +123,14 @@ public:
}
static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
Vec256 vec;
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
vec.values[i] = base + i * step;
}
return vec;
}
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size) {
static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
Vec256 vec;
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
if (i < count) {
vec[i] = b[i];
} else {
@ -114,7 +149,7 @@ public:
std::memcpy(vec.values, ptr, count * sizeof(T));
return vec;
}
void store(void* ptr, int count = size) const {
void store(void* ptr, int count = size()) const {
std::memcpy(ptr, values, count * sizeof(T));
}
const T& operator[](int idx) const {
@ -125,14 +160,14 @@ public:
}
Vec256<T> map(T (*f)(T)) const {
Vec256<T> ret;
for (int64_t i = 0; i != size; i++) {
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
}
return ret;
}
Vec256<T> abs() const {
Vec256<T> ret;
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
ret[i] = values[i] < 0 ? -values[i] : values[i];
}
return ret;
@ -214,7 +249,7 @@ public:
}
Vec256<T> pow(const Vec256<T> &exp) const {
Vec256<T> ret;
for (int64_t i = 0; i < size; i++) {
for (int64_t i = 0; i < size(); i++) {
ret[i] = std::pow(values[i], exp[i]);
}
return ret;
@ -222,7 +257,7 @@ public:
#define DEFINE_COMP(binary_pred) \
Vec256<T> operator binary_pred(const Vec256<T> &other) const { \
Vec256<T> vec; \
for (int64_t i = 0; i != size; i++) { \
for (int64_t i = 0; i != size(); i++) { \
if (values[i] binary_pred other.values[i]) { \
std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T)); \
} else { \
@ -242,7 +277,7 @@ public:
template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] + b[i];
}
return c;
@ -250,7 +285,7 @@ template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T
template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] - b[i];
}
return c;
@ -258,7 +293,7 @@ template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T
template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] * b[i];
}
return c;
@ -266,7 +301,7 @@ template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T
template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] / b[i];
}
return c;
@ -276,7 +311,7 @@ template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T
// either input is a NaN.
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (a[i] > b[i]) ? a[i] : b[i];
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
// If either input is NaN, propagate a NaN.
@ -301,7 +336,7 @@ inline T maximum(const T& a, const T& b) {
// either input is a NaN.
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (a[i] < b[i]) ? a[i] : b[i];
if (std::is_floating_point<T>::value && std::isnan(a[i])) {
// If either input is NaN, propagate a NaN.
@ -327,8 +362,8 @@ inline T minimum(const T& a, const T& b) {
template <class T> \
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \
using iT = int_same_size_t<T>; \
iT buffer[Vec256<T>::size]; \
for (int64_t i = 0; i != Vec256<T>::size; i++) { \
iT buffer[Vec256<T>::size()]; \
for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
auto a_val = a[i]; \
auto b_val = b[i]; \
iT *i_a_ptr = reinterpret_cast<iT*>(&a_val); \
@ -350,7 +385,7 @@ inline T fmadd(const T& a, const T& b, const T& c) {
template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
static constexpr int size = Vec256<T>::size;
static constexpr int size = Vec256<T>::size();
int_same_size_t<T> index_arr[size];
vindex.store(static_cast<void*>(index_arr));
T buffer[size];
@ -364,7 +399,7 @@ template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline mask_gather(const Vec256<T>& src, T const* base_addr,
const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
static constexpr int size = Vec256<T>::size;
static constexpr int size = Vec256<T>::size();
T src_arr[size];
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
int_same_size_t<T> index_arr[size];
@ -392,7 +427,7 @@ namespace {
template<typename dst_t, typename src_t>
struct CastImpl {
static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
src_t src_arr[Vec256<src_t>::size];
src_t src_arr[Vec256<src_t>::size()];
src.store(static_cast<void*>(src_arr));
return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
}
@ -412,7 +447,7 @@ Vec256<dst_t> cast(const Vec256<src_t>& src) {
template <typename T>
inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
static constexpr int size = Vec256<T>::size;
static constexpr int size = Vec256<T>::size();
T src_arr[size];
src.store(static_cast<void*>(src_arr));
int_same_size_t<T> buffer[size];
@ -427,9 +462,9 @@ inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& s
// returns: Vec256<float> = {a0, a1, a2, a3, a4, a5, a6, a7}
// Vec256<float> = {b0, b1, b2, b3, b4, b5, b6, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
static constexpr int size = Vec256<T>::size;
static constexpr int size = Vec256<T>::size();
static constexpr int half_size = size / 2;
T a_arr[size];
T b_arr[size];
@ -453,9 +488,9 @@ deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
// returns: Vec256<float> = {a0, b0, a1, b1, a2, b2, a3, b3}
// Vec256<float> = {a4, b4, a5, b5, a6, b6, a7, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
interleave2(const Vec256<T>& a, const Vec256<T>& b) {
static constexpr int size = Vec256<T>::size;
static constexpr int size = Vec256<T>::size();
static constexpr int half_size = size / 2;
T a_arr[size];
T b_arr[size];
@ -475,7 +510,9 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
template <typename src_T, typename dst_T>
void convert(const src_T *src, dst_T *dst, int64_t n) {
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (int64_t i = 0; i < n; i++) {
*dst = static_cast<dst_T>(
static_cast<at::native::inter_copy_type_t<dst_T>>(*src));

View File

@ -1,13 +1,14 @@
#pragma once
#include "intrinsics.h"
#include "vec256_base.h"
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
#if defined(__AVX__) && !defined(_MSC_VER)
#include <sleef.h>
#endif
namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
#if defined(__AVX__) && !defined(_MSC_VER)
@ -16,7 +17,9 @@ template <> class Vec256<double> {
private:
__m256d values;
public:
static constexpr int size = 4;
static constexpr int size() {
return 4;
}
Vec256() {}
Vec256(__m256d v) : values(v) {}
Vec256(double val) {
@ -40,7 +43,7 @@ public:
return Vec256<double>(base, base + step, base + 2 * step, base + 3 * step);
}
static Vec256<double> set(const Vec256<double>& a, const Vec256<double>& b,
int64_t count = size) {
int64_t count = size()) {
switch (count) {
case 0:
return a;
@ -53,22 +56,22 @@ public:
}
return b;
}
static Vec256<double> loadu(const void* ptr, int64_t count = size) {
if (count == size)
static Vec256<double> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
__at_align32__ double tmp_values[size];
__at_align32__ double tmp_values[size()];
std::memcpy(
tmp_values,
reinterpret_cast<const double*>(ptr),
count * sizeof(double));
return _mm256_load_pd(tmp_values);
}
void store(void* ptr, int count = size) const {
if (count == size) {
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
} else if (count > 0) {
double tmp_values[size];
double tmp_values[size()];
_mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(double));
}
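// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] The partial load/store pattern
// used above, without intrinsics: when fewer than a full vector of elements
// is available, memcpy through an aligned temporary instead of reading or
// writing out of bounds. kWidth and load_partial are made-up names.
#include <cstdio>
#include <cstring>

constexpr int kWidth = 4;  // stand-in for Vec256<double>::size()

void load_partial(const double* src, int count, double out[kWidth]) {
  alignas(32) double tmp[kWidth] = {0};           // plays __at_align32__
  std::memcpy(tmp, src, count * sizeof(double));  // only `count` are valid
  std::memcpy(out, tmp, sizeof(tmp));             // stands in for the full
}                                                 // -width _mm256_load_pd

int main() {
  double data[2] = {1.5, 2.5};
  double v[kWidth];
  load_partial(data, 2, v);
  std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);  // 1.5 2.5 0 0
}
// ---------------------------------------------------------------------------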
@ -252,7 +255,7 @@ template <>
void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#pragma unroll
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
}
#pragma unroll

View File

@ -1,13 +1,14 @@
#pragma once
#include "intrinsics.h"
#include "vec256_base.h"
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
#if defined(__AVX__) && !defined(_MSC_VER)
#include <sleef.h>
#endif
namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
#if defined(__AVX__) && !defined(_MSC_VER)
@ -16,7 +17,9 @@ template <> class Vec256<float> {
private:
__m256 values;
public:
static constexpr int size = 8;
static constexpr int size() {
return 8;
}
Vec256() {}
Vec256(__m256 v) : values(v) {}
Vec256(float val) {
@ -43,7 +46,7 @@ public:
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
}
static Vec256<float> set(const Vec256<float>& a, const Vec256<float>& b,
int64_t count = size) {
int64_t count = size()) {
switch (count) {
case 0:
return a;
@ -64,19 +67,19 @@ public:
}
return b;
}
static Vec256<float> loadu(const void* ptr, int64_t count = size) {
if (count == size)
static Vec256<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
__at_align32__ float tmp_values[size];
__at_align32__ float tmp_values[size()];
std::memcpy(
tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
return _mm256_loadu_ps(tmp_values);
}
void store(void* ptr, int64_t count = size) const {
if (count == size) {
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
_mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
} else if (count > 0) {
float tmp_values[size];
float tmp_values[size()];
_mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(float));
}
@ -260,7 +263,7 @@ template <>
void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#pragma unroll
for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
for (i = 0; i <= (n - Vec256<float>::size()); i += Vec256<float>::size()) {
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
}
#pragma unroll

View File

@ -1,7 +1,7 @@
#pragma once
#include "intrinsics.h"
#include "vec256_base.h"
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
namespace at {
namespace vec256 {
@ -22,7 +22,9 @@ public:
template <>
struct Vec256<int64_t> : public Vec256i {
static constexpr int size = 4;
static constexpr int size() {
return 4;
}
using Vec256i::Vec256i;
Vec256() {}
Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
@ -31,7 +33,7 @@ struct Vec256<int64_t> : public Vec256i {
}
template <int64_t mask>
static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
__at_align32__ int64_t tmp_values[size];
__at_align32__ int64_t tmp_values[size()];
a.store(tmp_values);
if (mask & 0x01)
tmp_values[0] = _mm256_extract_epi64(b.values, 0);
@ -51,7 +53,7 @@ struct Vec256<int64_t> : public Vec256i {
return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
}
static Vec256<int64_t>
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size) {
set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
switch (count) {
case 0:
return a;
@ -68,15 +70,15 @@ struct Vec256<int64_t> : public Vec256i {
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
__at_align32__ int64_t tmp_values[size];
__at_align32__ int64_t tmp_values[size()];
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
return loadu(tmp_values);
}
void store(void* ptr, int count = size) const {
if (count == size) {
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int64_t tmp_values[size];
__at_align32__ int64_t tmp_values[size()];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
}
@ -117,7 +119,9 @@ struct Vec256<int64_t> : public Vec256i {
template <>
struct Vec256<int32_t> : public Vec256i {
static constexpr int size = 8;
static constexpr int size() {
return 8;
}
using Vec256i::Vec256i;
Vec256() {}
Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
@ -139,7 +143,7 @@ struct Vec256<int32_t> : public Vec256i {
base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
}
static Vec256<int32_t>
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size) {
set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
switch (count) {
case 0:
return a;
@ -164,15 +168,15 @@ struct Vec256<int32_t> : public Vec256i {
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
__at_align32__ int32_t tmp_values[size];
__at_align32__ int32_t tmp_values[size()];
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
return loadu(tmp_values);
}
void store(void* ptr, int count = size) const {
if (count == size) {
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int32_t tmp_values[size];
__at_align32__ int32_t tmp_values[size()];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
}
@ -212,13 +216,17 @@ template <>
void convert(const int32_t *src, float *dst, int64_t n) {
int64_t i;
// int32_t and float have the same size
#pragma unroll
for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
#ifndef _MSC_VER
# pragma unroll
#endif
for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
auto output_vec = _mm256_cvtepi32_ps(input_vec);
_mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
}
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float>(src[i]);
}
@ -228,13 +236,17 @@ template <>
void convert(const int32_t *src, double *dst, int64_t n) {
int64_t i;
// int32_t has half the size of double
#pragma unroll
for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
#ifndef _MSC_VER
# pragma unroll
#endif
for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
_mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
}
#pragma unroll
#ifndef _MSC_VER
# pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<double>(src[i]);
}
@ -242,7 +254,9 @@ void convert(const int32_t *src, double *dst, int64_t n) {
template <>
struct Vec256<int16_t> : public Vec256i {
static constexpr int size = 16;
static constexpr int size() {
return 16;
}
using Vec256i::Vec256i;
Vec256() {}
Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
@ -255,7 +269,7 @@ struct Vec256<int16_t> : public Vec256i {
}
template <int64_t mask>
static Vec256<int16_t> blend(Vec256<int16_t> a, Vec256<int16_t> b) {
__at_align32__ int16_t tmp_values[size];
__at_align32__ int16_t tmp_values[size()];
a.store(tmp_values);
if (mask & 0x01)
tmp_values[0] = _mm256_extract_epi16(b.values, 0);
@ -303,7 +317,7 @@ struct Vec256<int16_t> : public Vec256i {
base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
}
static Vec256<int16_t>
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size) {
set(Vec256<int16_t> a, Vec256<int16_t> b, int16_t count = size()) {
switch (count) {
case 0:
return a;
@ -344,15 +358,15 @@ struct Vec256<int16_t> : public Vec256i {
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
static Vec256<int16_t> loadu(const void* ptr, int16_t count) {
__at_align32__ int16_t tmp_values[size];
__at_align32__ int16_t tmp_values[size()];
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
return loadu(tmp_values);
}
void store(void* ptr, int count = size) const {
if (count == size) {
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int16_t tmp_values[size];
__at_align32__ int16_t tmp_values[size()];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
}
@ -454,11 +468,11 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
template <typename T>
Vec256<T> inline intdiv_256(const Vec256<T>& a, const Vec256<T>& b) {
T values_a[Vec256<T>::size];
T values_b[Vec256<T>::size];
T values_a[Vec256<T>::size()];
T values_b[Vec256<T>::size()];
a.store(values_a);
b.store(values_b);
for (int i = 0; i != Vec256<T>::size; i++) {
for (int i = 0; i != Vec256<T>::size(); i++) {
values_a[i] /= values_b[i];
}
return Vec256<T>::loadu(values_a);

View File

@ -1,9 +1,9 @@
#pragma once
#include "ATen/Config.h"
#include "ATen/Parallel.h"
#include "ATen/cpu/vec256/functional.h"
#include "ATen/cpu/vec256/vec256.h"
#include <ATen/Config.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/functional.h>
#include <ATen/cpu/vec256/vec256.h>
// This header implements various unary operations using a MKL VML style
// interface.

View File

@ -7,7 +7,11 @@
namespace at { namespace cuda {
template <typename T, int size>
#ifndef __HIP_PLATFORM_HCC__
struct alignas(16) Array {
#else
struct Array {
#endif
T data[size];
C10_HOST_DEVICE T operator[](int i) const {

View File

@ -1,9 +1,9 @@
#pragma once
#include "ATen/cuda/detail/IndexUtils.cuh"
#include "ATen/TensorUtils.h"
#include "THC/THCAtomics.cuh"
#include "ATen/cuda/CUDAContext.h"
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/TensorUtils.h>
#include <THC/THCAtomics.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <math.h>
@ -271,7 +271,7 @@ template <typename Op,
typename IndexType,
int ADims,
int step>
#if __CUDA_ARCH__ >= 350
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
@ -355,7 +355,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims,
int step>
#if __CUDA_ARCH__ >= 350
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
@ -464,7 +464,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims, int CDims,
int step>
#if __CUDA_ARCH__ >= 350
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
@ -587,7 +587,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims, int CDims, int DDims,
int step>
#if __CUDA_ARCH__ >= 350
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void

View File

@ -1,5 +1,7 @@
#include "ATen/cuda/CUDAContext.h"
#include "THC/THCGeneral.hpp"
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCGeneral.hpp>
#include <ATen/cuda/CUDAConfig.h>
namespace at { namespace cuda {

View File

@ -1,16 +1,16 @@
#pragma once
#include "ATen/core/ATenGeneral.h"
#include "ATen/Context.h"
#include "ATen/cuda/CUDAStream.h"
#include "ATen/cuda/Exceptions.h"
#include "c10/cuda/CUDAFunctions.h"
#include <ATen/core/ATenGeneral.h>
#include <ATen/Context.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstdint>
#include "cuda_runtime_api.h"
#include "cusparse.h"
#include "cublas_v2.h"
#include <cuda_runtime_api.h>
#include <cusparse.h>
#include <cublas_v2.h>
namespace at {
namespace cuda {

View File

@ -1,8 +1,8 @@
#pragma once
#include "ATen/cuda/Exceptions.h"
#include <ATen/cuda/Exceptions.h>
#include "cuda.h"
#include <cuda.h>
namespace at {
namespace cuda {

View File

@ -1,13 +1,13 @@
#pragma once
#include "ATen/cuda/ATenCUDAGeneral.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAStream.h"
#include "ATen/cuda/CUDAGuard.h"
#include "ATen/cuda/Exceptions.h"
#include "c10/util/Exception.h"
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/util/Exception.h>
#include "cuda_runtime_api.h"
#include <cuda_runtime_api.h>
#include <cstdint>
#include <utility>
@ -35,7 +35,7 @@ struct AT_CUDA_API CUDAEvent {
~CUDAEvent() {
try {
if (is_created_) {
at::cuda::CUDAGuard device_guard(static_cast<int16_t>(device_index_));
CUDAGuard device_guard(static_cast<int16_t>(device_index_));
cudaEventDestroy(event_);
}
} catch (...) { /* No throw */ }
@ -74,7 +74,7 @@ struct AT_CUDA_API CUDAEvent {
// Note: cudaEventRecord must be called on the same device as the stream.
void record(const CUDAStream& stream) {
at::cuda::CUDAGuard guard(static_cast<int16_t>(stream.device_index()));
CUDAGuard guard(static_cast<int16_t>(stream.device_index()));
if (is_created_) {
AT_ASSERT(device_index_ == stream.device_index());
@ -92,7 +92,7 @@ struct AT_CUDA_API CUDAEvent {
// The event has no actual GPU resources associated with it.
void block(const CUDAStream& stream) {
if (is_created_) {
at::cuda::CUDAGuard guard(static_cast<int16_t>(stream.device_index()));
CUDAGuard guard(static_cast<int16_t>(stream.device_index()));
AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0));
}
}

View File

@ -1,8 +1,8 @@
#include "ATen/Config.h"
#include <ATen/Config.h>
#include "ATen/CUDAGenerator.h"
#include "ATen/Context.h"
#include "THCTensorRandom.h"
#include <ATen/CUDAGenerator.h>
#include <ATen/Context.h>
#include <THC/THCTensorRandom.h>
#include <stdexcept>
// There is only one CUDAGenerator instance. Calls to seed(), manualSeed(),

View File

@ -1,7 +1,7 @@
#pragma once
#include <c10/util/ArrayRef.h>
#include <ATen/cuda/CUDAStream.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/Tensor.h"
#include "ATen/core/Half.h"
#include <ATen/Tensor.h>
#include <ATen/core/Half.h>
#include <cuda.h>
#include <cuda_runtime.h>

View File

@ -1,7 +1,7 @@
#pragma once
#include "c10/util/Exception.h"
#include "c10/cuda/CUDAException.h"
#include <c10/util/Exception.h>
#include <c10/cuda/CUDAException.h>
// See Note [CHECK macro]
#define AT_CUDNN_CHECK(EXPR) \

View File

@ -9,11 +9,11 @@
#include <ATen/native/cuda/CuFFTPlanCache.h>
#include <c10/util/Exception.h>
#include "THC/THC.h"
#include <THC/THC.h>
#include <THC/THCGeneral.hpp>
#if AT_CUDNN_ENABLED()
#include "ATen/cudnn/cudnn-wrapper.h"
#include <ATen/cudnn/cudnn-wrapper.h>
#endif
#include <cuda.h>

View File

@ -1,4 +1,5 @@
#include "IndexUtils.cuh"
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <vector>
namespace at {
namespace cuda {
@ -35,7 +36,7 @@ within the next one.
*/
bool maybeOverlappingIndices(const Tensor& t) {
/* Extract size/stride arrays; only consider size >1 dims. */
SizeAndStride *info = (SizeAndStride *)alloca(sizeof(SizeAndStride) * t.dim());
std::vector<SizeAndStride> info(t.dim());
int dims = t.dim();
int nonSize1Dims = 0;
for (int i = 0; i < dims; ++i) {
@ -58,7 +59,7 @@ bool maybeOverlappingIndices(const Tensor& t) {
}
/* Ascending order (innermost dimension in sorted view is at [0]) */
qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride);
qsort(info.data(), nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride);
for (int i = 0; i < (nonSize1Dims - 1); ++i) {
if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) {
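// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] The shape of the fix above: a raw
// alloca buffer becomes a std::vector, and .data() feeds the unchanged qsort
// call. Values below are made up; only the before/after pattern matters.
#include <cstdlib>
#include <vector>

struct SizeAndStride {
  long size;
  long stride;
};

int compareSizeAndStride(const void* a, const void* b) {
  auto* x = static_cast<const SizeAndStride*>(a);
  auto* y = static_cast<const SizeAndStride*>(b);
  return (x->stride < y->stride) ? -1 : (x->stride > y->stride);
}

int main() {
  std::vector<SizeAndStride> info{{4, 12}, {3, 1}, {2, 4}};  // was alloca'd
  std::qsort(info.data(), info.size(), sizeof(SizeAndStride),
             compareSizeAndStride);  // ascending by stride, as in the diff
  return info[0].stride == 1 ? 0 : 1;
}
// ---------------------------------------------------------------------------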

View File

@ -1,7 +1,7 @@
#pragma once
#include "ATen/ATen.h"
#include "TensorInfo.cuh"
#include <ATen/ATen.h>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <limits>
namespace at {

View File

@ -1,6 +1,6 @@
#pragma once
#include "ATen/ATen.h"
#include <ATen/ATen.h>
// Contents of this file are copied from THCUNN/common.h for the ease of porting
// THCUNN functions into ATen.

View File

@ -9,20 +9,20 @@
/// OffsetCalculator calculates the offset in bytes of a linear index for NARGS
/// operands that share the same shape, but may have different strides.
template <int NARGS>
template <int NARGS, typename index_t = uint32_t>
struct OffsetCalculator {
static constexpr int MAX_DIMS = 25;
// The offset for each argument (in bytes). Wrapper around fixed-size array.
using offset_type = at::cuda::Array<uint32_t, NARGS>;
using offset_type = at::cuda::Array<index_t, NARGS>;
OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides) : dims(dims) {
AT_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims");
for (int i = 0; i < MAX_DIMS; ++i) {
if (i < dims) {
sizes_[i] = IntDivider<uint32_t>(sizes[i]);
sizes_[i] = IntDivider<index_t>(sizes[i]);
} else {
sizes_[i] = IntDivider<uint32_t>(1);
sizes_[i] = IntDivider<index_t>(1);
}
for (int arg = 0; arg < NARGS; arg++) {
strides_[i][arg] = i < dims ? strides[arg][i] : 0;
@ -30,7 +30,7 @@ struct OffsetCalculator {
}
}
C10_HOST_DEVICE offset_type get(uint32_t linear_idx) const {
C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
offset_type offsets;
#pragma unroll
for (int arg = 0; arg < NARGS; arg++) {
@ -55,6 +55,6 @@ struct OffsetCalculator {
}
int dims;
IntDivider<uint32_t> sizes_[MAX_DIMS];
uint32_t strides_[MAX_DIMS][NARGS];
IntDivider<index_t> sizes_[MAX_DIMS];
index_t strides_[MAX_DIMS][NARGS];
};
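// ---------------------------------------------------------------------------
// [editor's sketch, not part of this diff] What OffsetCalculator computes,
// simplified: decompose a linear index into per-dimension coordinates and
// accumulate each operand's stride-weighted byte offset. The real class
// replaces '%' and '/' with IntDivider and fixes MAX_DIMS; everything here
// is an illustrative stand-in.
#include <array>
#include <cstdint>
#include <cstdio>

template <int NARGS>
std::array<uint32_t, NARGS> offsets(uint32_t linear, int dims,
                                    const uint32_t* sizes,
                                    const uint32_t (*strides)[NARGS]) {
  std::array<uint32_t, NARGS> off{};
  for (int d = 0; d < dims; ++d) {
    uint32_t coord = linear % sizes[d];  // IntDivider does this without '%'
    linear /= sizes[d];
    for (int a = 0; a < NARGS; ++a) off[a] += coord * strides[d][a];
  }
  return off;
}

int main() {
  uint32_t sizes[2] = {3, 4};            // a 3x4 tensor, innermost dim first
  uint32_t strides[2][1] = {{4}, {12}};  // bytes: element = 4, row = 12
  auto o = offsets<1>(7, 2, sizes, strides);
  std::printf("%u\n", o[0]);  // linear 7 -> coords (1, 2) -> 1*4 + 2*12 = 28
}
// ---------------------------------------------------------------------------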

View File

@ -1,4 +1,4 @@
#include "Descriptors.h"
#include <ATen/cudnn/Descriptors.h>
#include <ATen/ATen.h>

View File

@ -1,12 +1,12 @@
#pragma once
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/Exceptions.h"
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include "cudnn-wrapper.h"
#include <ATen/cudnn/cudnn-wrapper.h>
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "ATen/cuda/ATenCUDAGeneral.h"
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <cuda.h>
#if CUDNN_VERSION < 7000

View File

@ -1,6 +1,6 @@
#include "Handle.h"
#include <ATen/cudnn/Handle.h>
#include "ATen/cuda/Exceptions.h"
#include <ATen/cuda/Exceptions.h>
#include <unordered_map>
#include <mutex>

View File

@ -1,7 +1,7 @@
#pragma once
#include "cudnn-wrapper.h"
#include "ATen/cuda/ATenCUDAGeneral.h"
#include <ATen/cudnn/cudnn-wrapper.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
namespace at { namespace native {

Some files were not shown because too many files have changed in this diff.