Compare commits


59 Commits

Author SHA1 Message Date
74044638f7 Updating fbgemm 2020-03-23 11:05:29 -07:00
7f73f1d591 add python 3.8 workaround 2020-01-14 09:05:04 -08:00
ac15471de4 clarify when to use as_tuple in torch.nonzero (#32051)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/31798

Differential Revision: D19272332

Pulled By: zou3519

fbshipit-source-id: 954d086a7b9f1a719e0dac303a4253bf7ec8e9f4
2020-01-14 11:07:33 -05:00
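
A minimal sketch of the two calling conventions the clarified docs distinguish (values shown assume standard torch.nonzero behavior):

```
import torch

x = torch.tensor([[0.0, 1.0],
                  [2.0, 0.0]])

# Default: a 2-D LongTensor with one row of indices per nonzero element.
print(torch.nonzero(x))                    # tensor([[0, 1], [1, 0]])

# as_tuple=True: one 1-D index tensor per dimension, handy for indexing.
rows, cols = torch.nonzero(x, as_tuple=True)
print(x[rows, cols])                       # tensor([1., 2.])
```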
49364eb426 Fix typographical error in torch.triu docstring (#32067) (#32122)
Summary:
below --> above

Fixes https://github.com/pytorch/pytorch/issues/32032
Pull Request resolved: https://github.com/pytorch/pytorch/pull/32067

Differential Revision: D19355788

Pulled By: zou3519

fbshipit-source-id: dc7a2538a78cd11e72d47ad923ef50599a5a87e2
2020-01-14 10:02:37 -05:00
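
For reference, a small example of the behavior the corrected wording describes (elements below the main diagonal are zeroed, elements on and above it are kept):

```
import torch

a = torch.ones(3, 3)
print(torch.triu(a))
# tensor([[1., 1., 1.],
#         [0., 1., 1.],
#         [0., 0., 1.]])
```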
bcf2d65446 disable two more tests 2020-01-13 21:57:12 -08:00
f7a33f1eef disable a few more tests because of OSX failures similar to #30604 2020-01-13 13:21:49 -08:00
bd584d52df Disable test_backward_per_tensor in test_fake_quant (#30594)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30594

This test case started breaking; disabling it to clean up the build.
ghstack-source-id: 94736837

Test Plan: Unittest disabling change

Differential Revision: D18758635

fbshipit-source-id: 05df1158ff0ccd75e401f352da529fb663b1cae0
2020-01-13 13:15:20 -08:00
c697af4667 Temporarily disable test_numerical_consistency_per_tensor (#30600)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30600

test_numerical_consistency_per_tensor in test_fake_quant is failing on Windows.
ghstack-source-id: 94742124

Test Plan: CircleCI tests

Differential Revision: D18760287

fbshipit-source-id: 7f59355eab74e811bb370ad2836ed2f1def1f621
2020-01-13 13:15:14 -08:00
0f3f4ec64c Kill hypothesis deadline testing (#30890)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30890

We've received way too many complaints about this functionality making tests flaky, and it's not providing value to us anyway. Let's cut the shit and kill deadline testing

Test Plan: Imported from OSS

Differential Revision: D18857597

Pulled By: jamesr66a

fbshipit-source-id: 67e3412795ef2fb7b7ee896169651084e434d2f6
2020-01-13 13:12:14 -08:00
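
For context, this is the per-test equivalent of the setting being removed globally here; a hedged sketch assuming the standard hypothesis API:

```
from hypothesis import given, settings, strategies as st

# deadline=None disables the per-example time limit that was causing flakiness.
@settings(deadline=None)
@given(st.integers())
def test_without_deadline(x):
    assert isinstance(x, int)
```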
509df600bb Revert "Remove javasphinx extension (#31955)" (#32059)
This reverts commit 8ada95e95092f93780bd56bad568e2491880e9fd.
2020-01-10 14:31:35 -05:00
187101a88e [v1.4.0] Minimal changes in interpolate to support Keypointrcnn (#32010)
* Fix interpolate

* add keypointrcnn test

* update ort version for test

* pin tv version

* Update test.sh

* Get rid of onnxruntime test changes.

* [v1.4.0] Added torchvision tests as part of ORT tests (#31835)

Summary:
Added torchvision tests as part of ORT tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31835

Reviewed By: hl475

Differential Revision: D19278607

Pulled By: houseroad

fbshipit-source-id: 18a6a85ce3019bcc9aee9517af1378964b585afd

* Remove faster_rcnn and mask_rcnn tests.

Co-authored-by: Lara Haidar <haidar.lara@gmail.com>
Co-authored-by: Negin Raoof <neginmr@utexas.edu>
2020-01-10 12:04:29 -05:00
e011d4a16e Restore CUDA half linspace+logspace and add coverage tests (#31959)
This PR restores the implementation of CUDA half linspace+logspace.

I added tests for the following:
- linspace+logspace have the same support for integral types on CPU/CUDA
- Precision tests for CUDA half, float, and double.

The precision for CUDA half seems poor, but I checked the numbers against
previous versions of PyTorch: the CUDA half linspace+logspace outputs
are exactly the same as in 1.2.0.

Equivalent-ish PR on master:
https://github.com/pytorch/pytorch/pull/31962
2020-01-09 10:42:36 -05:00
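
A small sketch of the kind of precision check described above (illustrative only; requires a CUDA device):

```
import torch

if torch.cuda.is_available():
    half = torch.linspace(0, 10, steps=5, dtype=torch.half, device="cuda")
    ref = torch.linspace(0, 10, steps=5, dtype=torch.float, device="cuda")
    # Compare the half output against float as a rough precision reference.
    print((half.float() - ref).abs().max())
```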
8ada95e950 Remove javasphinx extension (#31955)
See PR [31581](https://github.com/pytorch/pytorch/pull/31581) for more details.
2020-01-08 14:09:19 -08:00
21c2481dfe Fix nvcc math functions for MSVC 2019 (#31704) (#31816)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/31108.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31704

Differential Revision: D19256110

Pulled By: mingbowan

fbshipit-source-id: a4aba2830aba002497f70a75ef995e5e7de08393
(cherry picked from commit 7a3ed36309f48cb833f1690991c7b0f59da6ce11)
2020-01-08 16:30:07 -05:00
398e8ba182 Include two caffe2 ops in v1.4.0 (#31716)
* move AliasWithNameOp to caffe2/operators

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/31281

Reviewed By: houseroad

Differential Revision: D19053453

fbshipit-source-id: 350bfd5c001db9c17916dcae7ade8f56db1e9841

* move BatchPermutationOp to caffe2/operators

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/31350

Reviewed By: houseroad

Differential Revision: D19053527

fbshipit-source-id: 50d11f137d0f5c07e8ad899a3a84d56a042bbc32

Co-authored-by: wat3rBro <wangyanghan6@gmail.com>
2020-01-08 13:28:13 -05:00
074b30cdcb Restructure docs organization and naming and add Javadoc (#31581)
* Restructure docs organization and naming and add Javadoc

- Rename “Other Languages” → “Language Bindings”
- Move the Community section to the bottom
- Move "Language Bindings" above "Python API"
- Add Javadoc url in index.rst

* Delete no longer needed java rst files. Remove javasphinx extension.
2020-01-08 10:22:35 -08:00
319bd5d431 Disable flaky TestMomentumSGD.test_fp16momentum_sgd (#31369) (#31637)
Summary:
Related to https://github.com/pytorch/pytorch/issues/31368
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31369

Co-authored-by: Vitaly Fedyunin <vitalyf@fb.com>
2019-12-26 13:20:37 -08:00
5a20bbd377 [v1.4.0] Support optional float parameters (float?, optional<double>). (#31530)
This is going to be used by upsample (which currently uses magic values to represent optionals).

For now, we just introduce a fake function for testing (torch._test_optional_float(x)).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/31517
2019-12-26 10:50:33 -08:00
fa59a9e190 Dump operator names of a script module, v1.4.0 pick request (#30747)
* Dump operator names of a script module

Summary:

Introduce function jit.export_opnames(module), which returns a list of all operator names used in the module and its submodules. One use case is a mobile custom build that links only the operators in the returned list, reducing binary size.

Example:
import torch
m = torch.jit.load("example.pt")
print(torch.jit.export_opnames(m))

The outputs are in alphabetical order:
['aten::_convolution', 'aten::add.Tensor', 'aten::add_.Tensor', 'aten::addmm', 'aten::append.Tensor', 'aten::cat', 'aten::dropout', 'aten::embedding', 'aten::matmul', 'aten::max.dim', 'aten::mul.Tensor', 'aten::permute', 'aten::relu', 'aten::t', 'aten::tanh', 'prim::ListConstruct', 'prim::TupleConstruct', 'prim::TupleUnpack']

2019-12-26 10:49:49 -08:00
143868c3df cherry pick 30320 (#31573) 2019-12-23 22:49:26 -08:00
964929fcc2 hacky way to fix android-ndk build (#31529)
* hacky way to fix android build

* should run!!!

* again!!
2019-12-20 18:01:32 -08:00
cd20ecb472 no xla build/test for v1.4.0 (#31518) 2019-12-20 10:43:36 -08:00
19d4fd4910 Specify ordering on singular values and eigenvalues output from torch.svd/symeig respectively (#30389) (#30575)
Summary:

Changelog:
- Adds a note to docstrings of the both functions specifying the ordering

Fixes https://github.com/pytorch/pytorch/issues/30301
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30389

Differential Revision: D18707608

Pulled By: zou3519

fbshipit-source-id: b0f73631578f39a24fae9af4997c6491de8be9a8
2019-12-19 16:10:07 -08:00
a7d187baa4 [v1.4.0] Fix reading __cuda_array_interface__ inferred strides, add test. (#31450)
This is a simpler fix than https://github.com/pytorch/pytorch/pull/24947, which both fixed the bug and updated the protocol version.
This also adds a test (which the previous PR did not).

So the plan is that master (1.5) will have the new protocol version (and a test), 1.4 will have the old protocol version and the test.
2019-12-19 16:09:37 -08:00
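
For orientation, a hedged sketch of the protocol field involved: per the __cuda_array_interface__ spec, 'strides' may be absent or None for C-contiguous data, and the consumer-side inference of those strides is what this fix corrects (requires a CUDA device):

```
import torch

if torch.cuda.is_available():
    t = torch.arange(6, device="cuda").reshape(2, 3)
    iface = t.__cuda_array_interface__
    # A None/missing 'strides' means C-contiguous; consumers must then infer
    # strides from 'shape' and the item size.
    print(iface["shape"], iface.get("strides"))
```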
0541546ac5 Fix unflatten when dim is a negative integer (#31208) (#31432)
Summary:
Changelog:
- Wrap dim to be a positive integer when dim is negative
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31208

Test Plan:
- Updated tests in test_namedtensor.py

Fixes https://github.com/pytorch/pytorch/issues/31184

Differential Revision: D19036569

Pulled By: zou3519

fbshipit-source-id: 86e01e20988dee7c4b6c73232f66282d687f9a2c
2019-12-19 16:09:28 -08:00
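
A minimal Python sketch of the dim wrapping the fix applies (the C++ change routes negative dims through maybe_wrap_dim):

```
def maybe_wrap_dim(dim, ndim):
    # Negative dims count from the end, mirroring ATen's maybe_wrap_dim.
    return dim + ndim if dim < 0 else dim

assert maybe_wrap_dim(-1, 3) == 2   # unflattening dim -1 of a 3-D tensor acts on dim 2
```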
369ab73efd Fix copy kernel speed regression introduced in #29631 (#31279) (#31322)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/31271

This fixes copy kernel speed regression introduced in https://github.com/pytorch/pytorch/issues/29631.

The previous implementation forces the compiler to instantiate `static_cast_with_inter_type` because it is passed as an argument of a function. This prevents the compiler from performing optimizations such as automatic vectorization, and the function call itself is expensive compared to a single cast instruction.

To check the change, run
```
readelf -Ws /home/xgao/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so | grep static_cast_with_inter_type
```

On nightly build, we have output
```
168217: 0000000001852bf0     5 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIsdE5applyEd
168816: 0000000001852d30    33 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeISt7complexIfEaE5applyEa
168843: 00000000018531f0     7 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIblE5applyEl
168930: 0000000001852c20     3 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIslE5applyEl
168935: 00000000018528d0   124 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIfNS_4HalfEE5applyES1_
169023: 0000000001852f30    17 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeISt7complexIdEhE5applyEh
169713: 00000000018525c0     3 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIahE5applyEh
170033: 0000000001852c10     3 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIsiE5applyEi
170105: 0000000001852bd0     5 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIshE5applyEh
170980: 0000000001852fc0    27 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeISt7complexIdES1_IfEE5applyES3_
171398: 0000000001852810    13 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIdbE5applyEb
171574: 00000000018532e0    35 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIbNS_8BFloat16EE5applyES1_
171734: 0000000001852b20     6 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIlSt7complexIdEE5applyES2_
172422: 0000000001853350    54 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeINS_8BFloat16EaE5applyEa
172704: 00000000018533c0    38 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeINS_8BFloat16EfE5applyEf
172976: 0000000001852890    10 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIflE5applyEl
173038: 0000000001852f80     9 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeISt7complexIdEfE5applyEf
173329: 00000000018531c0    20 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIbfE5applyEf
173779: 00000000018524d0     3 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIhiE5applyEi
174032: 0000000001852960    14 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIfNS_8BFloat16EE5applyES1_
174334: 0000000001852d60    29 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeISt7complexIfEdE5applyEd
174470: 0000000001852c60   124 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIsNS_4HalfEE5applyES1_
174770: 0000000001852bc0    15 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIlNS_8BFloat16EE5applyES1_
176408: 0000000001853980   144 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeINS_4HalfEbE5applyEb
176475: 0000000001852790   128 FUNC    LOCAL  DEFAULT    9 _ZN3c1027static_cast_with_inter_typeIdNS_4HalfEE5applyES1_
....
```

And after this PR, we get empty output
```
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31279

Differential Revision: D19075587

Pulled By: ngimel

fbshipit-source-id: c20088241f39fa40c1d055f0a46eb5b9ece52e71
2019-12-19 16:09:11 -08:00
9f558e1ee6 turn off profiling graph exec (#30750) 2019-12-19 16:08:59 -08:00
f0ddfff200 Fix exception message in Java Tensor (#30776)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/30205

Test Plan: Imported from OSS

Reviewed By: linbinyu

Differential Revision: D18653568

Pulled By: dreiss

fbshipit-source-id: a5fcb809eba641a7fbd0e99e835eceeb248e680c
2019-12-19 16:08:49 -08:00
2de184b5a9 Update persons_of_interest.rst (#30648) 2019-12-19 16:08:39 -08:00
e0eeddfc78 torch.where changes made on 1.3.1 but not on master (#30729)
* Make zeros argument of torch.where same dtype as other argument (#30661)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30661

Cherry-picked from https://github.com/pytorch/pytorch/pull/29080

Test Plan: Imported from OSS

Differential Revision: D18781870

Pulled By: nairbv

fbshipit-source-id: 9de85aa91bf7e0856f35c7c6238a8923315ed27f

Co-authored-by: ifedan

* Added check for torch.where on CPU that both arguments have same dtype (#30662)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30662

Cherry picked from: https://github.com/pytorch/pytorch/pull/29081

Test Plan: Imported from OSS

Differential Revision: D18782295

Pulled By: nairbv

fbshipit-source-id: 897ab25ddf8819ca34f5e86c5d3f41debb56cb04

Co-authored-by: ifedan
2019-12-19 16:01:51 -08:00
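
A small illustration of the check added here, assuming the 1.4-era semantics (later releases may promote dtypes instead of erroring):

```
import torch

cond = torch.tensor([True, False])
x = torch.tensor([1.0, 2.0])                       # float32
y = torch.tensor([3.0, 4.0], dtype=torch.float64)  # float64

print(torch.where(cond, x, x))   # fine: matching dtypes
# torch.where(cond, x, y)        # raises on CPU after this change: dtypes differ
```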
7727b57d08 [v1.4.0 cherrypick] Fix BC for quantized linear (#30629) 2019-12-19 16:01:26 -08:00
9e7dc37f90 Updates to Quantization documentation (#30372)
* added entries to quantization.rst per issue #27938

* more minor tweaks to quantization.rst to reflect the quantization support list (#27938)

* added discussion about setting backend engine to QNNPACK to quantization.rst (#29735)

* added docstrings to the fused functions in torch/nn/intrinsic/modules/fused.py (#26899)

* fixed the docstring for  torch.nn.intrinsic.quantized.ConvReLU3d  (#27451)

* fixed the formatting on fuse_modules() (#26305)

* fixed rendering issue on QConfig (#30283)

* resolved feedback on PR #30288. Thanks Raghu
2019-12-19 16:01:09 -08:00
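
The QNNPACK backend selection discussed in the updated docs boils down to one setting; a hedged sketch:

```
import torch

# Select QNNPACK as the quantized backend if this build supports it.
if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'
print(torch.backends.quantized.engine)
```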
227017059f Fix BC test for v1.4.0 (#31442)
* Fix BC test for v1.4.0

* Print out all the broken ops

* Whitelist the broken ones
2019-12-19 14:16:24 -08:00
aeeccc1486 Disable the rebase logic to make the CI pass (#31399) 2019-12-18 12:21:13 -08:00
0b91246cbd [v1.4.0] Fix coverage and hypothesis conflict (#31429)
Summary:
Temporarily enforcing versions for all envs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31320

Differential Revision: D19122781

Pulled By: VitalyFedyunin

fbshipit-source-id: fe6473b177367371387d4b3b873131e7ecfbc0f8
2019-12-18 12:16:05 -08:00
0856d6f53c use earlier docker image to make sure generated binary size is small (#31142)
* use earlier docker image to make sure generated binary size is small

* fix hypothesis version
2019-12-17 15:03:29 -08:00
336e0d2874 our setup requires actions/checkout@v1 to work correctly (#31371)
* checkout correct branch for linting

* try #2

* try #3

* try #4
2019-12-17 10:56:50 -08:00
3b36f2068d Revert "Merge branch 'v1.4.0' of https://github.com/pytorch/pytorch into lahaidar/cherry_pick_28324"
This reverts commit 6207945564b317f4300264e80d125b9a7225b81e, reversing
changes made to 27a2ecb0a5da9507a2b0a0315a7dfeab4b9f85f9.
2019-12-13 16:20:28 -08:00
6207945564 Merge branch 'v1.4.0' of https://github.com/pytorch/pytorch into lahaidar/cherry_pick_28324 2019-12-13 15:48:08 -08:00
aecae514ab Merge branch 'cherry_pick_28324' of https://github.com/houseroad/pytorch into lahaidar/cherry_pick_28324 2019-12-13 15:45:32 -08:00
27a2ecb0a5 Revert "[v1.4.0 cherrypick] ONNX Interpolate Add Scales Params (#31170)" (#31272)
This reverts commit e36fd7b0bae7b350886bf090f7ce222a0c6218df.
2019-12-13 15:14:42 -08:00
e36fd7b0ba [v1.4.0 cherrypick] ONNX Interpolate Add Scales Params (#31170)
The original PR is #28324

We hope we can cover torchvision models in PyTorch ONNX exporter with release 1.4. This PR is part of it.
2019-12-13 15:08:35 -08:00
799cb646a6 update expect files 2019-12-13 11:36:06 -08:00
f60c63155a ONNX Interpolate Add Scales Params (#28324)
Summary:
Fix for : https://github.com/pytorch/pytorch/issues/27176
Pull Request resolved: https://github.com/pytorch/pytorch/pull/28324

Reviewed By: hl475

Differential Revision: D18309133

Pulled By: houseroad

fbshipit-source-id: 348bb41393442c6b107d88fc2cd3224e0afa3ccf
2019-12-13 11:36:06 -08:00
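
A hedged sketch of the export path this touches; the Upsampler module and output file name are illustrative:

```
import torch
import torch.nn as nn
import torch.nn.functional as F

class Upsampler(nn.Module):
    def forward(self, x):
        # interpolate with explicit scales is the case the exporter change covers
        return F.interpolate(x, scale_factor=2.0, mode="nearest")

torch.onnx.export(Upsampler(), torch.randn(1, 3, 8, 8), "upsampler.onnx",
                  opset_version=11)
```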
954d9ea466 fix test ci by pinning hypothesis and correcting the import (#31201)
* fix test ci by pinning hypothesis and correcting the import, from https://github.com/pytorch/pytorch/pull/31137

* also update for windows build
2019-12-13 11:30:57 -08:00
71185fb2a0 update expect files 2019-12-12 10:54:17 -08:00
a06f26560c Make Conv{1,2,3}dOptions and ConvTranspose{1,2,3}dOptions different classes (#31005)
Summary:
Currently, both `Conv{1,2,3}dOptions` and `ConvTranspose{1,2,3}dOptions` are aliases of the `ConvOptions<{1,2,3}>` class, which causes confusion because the `ConvOptions` class has parameters such as `transposed` that shouldn't be exposed to the end user. (This has caused issues such as https://github.com/pytorch/pytorch/issues/30931.) This PR makes the following improvements:
1. Rename the original `torch::nn::ConvOptions<N>` class to `torch::nn::detail::ConvNdOptions<N>` class, to signify that it's an implementation detail and should not be used publicly.
2. Create new classes `torch::nn::ConvOptions<N>` and `torch::nn::ConvTransposeOptions<N>`, which have parameters that exactly match the constructor of `torch.nn.Conv{1,2,3}d` and `torch.nn.ConvTranspose{1,2,3}d` in Python API.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31005

Differential Revision: D18898048

Pulled By: yf225

fbshipit-source-id: 7663d646304c8cb004ca7f4aa4e70d3612c7bc75
2019-12-12 11:46:33 -05:00
e4cec279c6 ONNX Interpolate Add Scales Params (#28324)
Summary:
Fix for : https://github.com/pytorch/pytorch/issues/27176
Pull Request resolved: https://github.com/pytorch/pytorch/pull/28324

Reviewed By: hl475

Differential Revision: D18309133

Pulled By: houseroad

fbshipit-source-id: 348bb41393442c6b107d88fc2cd3224e0afa3ccf
2019-12-11 22:05:47 -08:00
b8b50aa909 Fix missing virtual destructor (#30927)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30927

Classes that are used polymorphically (e.g. have virtual methods) must have a virtual destructor; otherwise, deleting an instance through a base-class pointer is undefined behavior.
ghstack-source-id: 95144736

Test Plan: waitforsandcastle

Differential Revision: D18870351

fbshipit-source-id: 333af4e95469fdd9103aa9ef17b40cbc4a343f82
2019-12-09 12:47:01 -08:00
db686de13f [1.4.0] Enable len(dataloader) for iterable dataset (#30828)
* enable len(dl) for iterable dataset

* warn if len was called
2019-12-06 18:25:14 -05:00
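
A minimal sketch of the new behavior, assuming an iterable-style dataset that defines __len__ (the reported length may be inexact, hence the warning mentioned above):

```
import torch
from torch.utils.data import DataLoader, IterableDataset

class Stream(IterableDataset):
    def __iter__(self):
        return iter(range(10))
    def __len__(self):
        return 10

dl = DataLoader(Stream(), batch_size=2)
print(len(dl))   # now works; PyTorch warns that the value may be inaccurate
```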
288e463693 Fix a clang 7 compiler bug for c++14 mode (#30891)
This is already fixed in master as part of bc2e6d10fa.

Before this fix, compiling PyTorch with `-std=c++14` failed on clang 7 due to a compiler bug in the optimizer. With this fix, it works and people can compile PyTorch (or PyTorch extensions) with `-std=c++14`.
2019-12-06 14:11:12 -05:00
73783d1048 Update persons_of_interest.rst 2019-12-05 21:27:01 -08:00
8891d4eeb1 fix AvgPool2d for 2^31-1 sized inputs, and get test_cuda_kernel_loop_overflow_large to working state (#30793) 2019-12-04 23:13:17 -05:00
2085a6f329 Add local shutdown to process group agent (#30330)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30330

This is now possible due to previous changes made in `gloo` and `ProcessGroupGloo`. We `abort` the listener thread that is waiting for a message, and join all other threads. The API is changed so that the previous `wait_all_workers` does not destroy the agent, and this is now done in a new `shutdown` method. All callsites are updated appropriately.

ghstack-source-id: 94673884
ghstack-source-id: 94673884

Test Plan: Unit tests pass.

Reviewed By: mrshenli

Differential Revision: D18661775

fbshipit-source-id: 5aaa7c14603e18253394224994f6cd43234301c2
2019-12-04 19:23:58 -08:00
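
The user-facing shape of the change, as a hedged per-worker sketch (names, ranks, and rendezvous env vars are illustrative):

```
import torch.distributed.rpc as rpc

# Each of the world_size processes runs this with its own name/rank;
# MASTER_ADDR/MASTER_PORT rendezvous is assumed to be set.
rpc.init_rpc("worker0", rank=0, world_size=2)
# ... rpc.rpc_sync(...) / rpc.rpc_async(...) calls ...
rpc.shutdown()   # new: tears down the agent; wait_all_workers no longer does
```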
3eda9e7da2 By default ignore RRef leaks during shutdown (#30217)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30217

Before this commit, RRefContext throws an error if it detects any
RRef leak during shutdown. However, this requires applications to
make sure they have freed all references to RRefs in application
code, which can be a bad debugging experience for large
applications. Besides, this also relies on the Python GC to free things
up in time, which might not always happen. After this commit,
RRefContext ignores leaking RRefs during shutdown, as shutdown
is called when the application has finished training and no longer
cares about local state. Hence, it should be OK to just ignore
those leaks and destroy the OwnerRRefs. If an application would like to
enforce no leaks, it can set torch.distributed.rpc.api._ignore_rref_leak
to False.

Test Plan: Imported from OSS

Differential Revision: D18632546

Pulled By: mrshenli

fbshipit-source-id: 2744b2401dafdd16de0e0a76cf8e07777bed0f38
2019-12-04 13:33:31 -05:00
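
Opting back into strict leak checking, as described in the commit message:

```
import torch.distributed.rpc as rpc

# Re-enable the error on leaked RRefs during shutdown (off by default now).
rpc.api._ignore_rref_leak = False
```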
fb8aa0e98c Remove namespace F = torch::nn::functional from torch/nn/modules/batchnorm.h (#30684)
Summary:
This PR removes `namespace F = torch::nn::functional` from `torch/nn/modules/batchnorm.h`, so that people don't have to define `torch::nn::functional` as `F` if they don't want to.

Fixes https://github.com/pytorch/pytorch/issues/30682.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30684

Differential Revision: D18795717

Pulled By: yf225

fbshipit-source-id: c9feffbeb632cc6b4ce3e6c22c0a78533bab69ad
2019-12-04 11:35:19 -05:00
c79b79dadd add default arg for init_method (#30208)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30208

Adds a default arg for init_method so users don't have to pass it in,
and moves it to the `RpcBackendOptions` struct. Removes the `init_method` arg from rpc.init_rpc. Also fixes some docs.
ghstack-source-id: 94500475

Test Plan: Unit tests pass.

Reviewed By: mrshenli

Differential Revision: D18630074

fbshipit-source-id: 04b7dd7ec96f4c4da311b71d250233f1f262135a
2019-12-03 15:26:51 -08:00
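
With the default init_method, only the rendezvous env vars are needed; a hedged single-process sketch:

```
import os
import torch.distributed.rpc as rpc

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")

# No init_method passed; the default from RpcBackendOptions is used.
rpc.init_rpc("worker0", rank=0, world_size=1)
rpc.shutdown()
```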
21acca4528 Exclude undefined tensors in the result of Module::parameters() / named_paramters() / buffers() / named_buffers() (#30626)
Summary:
PR https://github.com/pytorch/pytorch/pull/30523 attempted to fix https://github.com/pytorch/pytorch/issues/30508 and https://github.com/pytorch/pytorch/issues/30462, but the fix wasn't complete. This PR makes the following improvements:
1. Fixes https://github.com/pytorch/pytorch/issues/30508 and https://github.com/pytorch/pytorch/issues/30462 properly by excluding undefined tensors in the result of `Module::parameters()` / `named_parameters()` / `buffers()` / `named_buffers()`, which mirrors the Python API behavior.
2. Audits all use sites of `Module::parameters_` / `buffers_` and change them to `Module::named_parameters(/*recurse=*/false)` / `named_buffers(/*recurse=*/false)` when appropriate, so that use sites of module parameters / buffers never need to worry about undefined tensors.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30626

Differential Revision: D18777507

Pulled By: yf225

fbshipit-source-id: 55b64b69779e1186342efd3c44857f416334ed6b
2019-12-03 15:57:32 -05:00
f710757557 Skip undefined tensors when moving torch::nn module to a different device (#30523)
Summary:
This fixes high-pri issues such as https://github.com/pytorch/pytorch/issues/30508 and https://github.com/pytorch/pytorch/issues/30462.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30523

Differential Revision: D18732904

Pulled By: yf225

fbshipit-source-id: fe5a7a43838000f5803bd9c01ecfba0c3f02df5d
2019-12-03 15:57:32 -05:00
118 changed files with 2320 additions and 2083 deletions

View File

@ -36,6 +36,8 @@ class Conf(object):
# The cpu nightlies are built on the pytorch/manylinux-cuda100 docker image
alt_docker_suffix = self.cuda_version or "100"
docker_distro_suffix = "" if self.pydistro == "conda" else alt_docker_suffix
if self.cuda_version == "101":
return "soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916"
return miniutils.quote("pytorch/" + docker_distro_prefix + "-cuda" + docker_distro_suffix)
def get_name_prefix(self):

View File

@ -24,11 +24,11 @@ CONFIG_TREE_DATA = [
("5", [
XImportant("3.6"), # This is actually the ASAN build
]),
("7", [
("3.6", [
("xla", [XImportant(True)]),
]),
]),
# ("7", [
# ("3.6", [
# ("xla", [XImportant(True)]),
# ]),
# ]),
]),
("cuda", [
("9", [

View File

@ -210,6 +210,7 @@ def instantiate_configs():
android_abi = fc.find_prop("android_abi")
parms_list_ignored_for_docker_image.append(android_abi)
restrict_phases = ["build"]
fc.props["is_important"] = True
elif compiler_name:
gcc_version = compiler_name + (fc.find_prop("compiler_version") or "")

View File

@ -307,27 +307,28 @@ jobs:
time docker pull ${DOCKER_IMAGE} >/dev/null
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
# TODO We may want to move the rebase logic to a separate step after checkout
# Rebase to master only if in xenial_py3_6_gcc5_4 case
if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
set -x
git config --global user.email "circleci.ossci@gmail.com"
git config --global user.name "CircleCI"
git config remote.origin.url https://github.com/pytorch/pytorch.git
git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
export GIT_COMMIT=${CIRCLE_SHA1}
echo "GIT_COMMIT: " ${GIT_COMMIT}
git checkout -f ${GIT_COMMIT}
git reset --hard ${GIT_COMMIT}
git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
set +x
else
echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
fi
# NB: Temporarily disable the rebase logic in v1.4.0, don't merge this change into master
# # TODO We may want to move the rebase logic to a separate step after checkout
# # Rebase to master only if in xenial_py3_6_gcc5_4 case
# if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
# echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
# set -x
# git config --global user.email "circleci.ossci@gmail.com"
# git config --global user.name "CircleCI"
# git config remote.origin.url https://github.com/pytorch/pytorch.git
# git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
# git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
# export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
# echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
# export GIT_COMMIT=${CIRCLE_SHA1}
# echo "GIT_COMMIT: " ${GIT_COMMIT}
# git checkout -f ${GIT_COMMIT}
# git reset --hard ${GIT_COMMIT}
# git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
# set +x
# else
# echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
# fi
git submodule sync && git submodule update -q --init --recursive
@ -1709,20 +1710,6 @@ workflows:
build_environment: "pytorch-linux-xenial-py3-clang5-asan-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan:405"
resource_class: large
- pytorch_linux_build:
name: pytorch_xla_linux_xenial_py3_6_clang7_build
requires:
- setup
build_environment: "pytorch-xla-linux-xenial-py3.6-clang7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-clang7:405"
- pytorch_linux_test:
name: pytorch_xla_linux_xenial_py3_6_clang7_test
requires:
- setup
- pytorch_xla_linux_xenial_py3_6_clang7_build
build_environment: "pytorch-xla-linux-xenial-py3.6-clang7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-clang7:405"
resource_class: large
- pytorch_linux_build:
name: pytorch_linux_xenial_cuda9_cudnn7_py3_build
requires:
@ -1874,33 +1861,18 @@ workflows:
name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build
requires:
- setup
filters:
branches:
only:
- master
- /ci-all\/.*/
build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c:405"
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build
requires:
- setup
filters:
branches:
only:
- master
- /ci-all\/.*/
build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c:405"
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build
requires:
- setup
filters:
branches:
only:
- master
- /ci-all\/.*/
build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c:405"
# Warning: indentation here matters!
@ -2292,7 +2264,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2305,7 +2277,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2318,7 +2290,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2331,7 +2303,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2344,7 +2316,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2505,7 +2477,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2518,7 +2490,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2531,7 +2503,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2544,7 +2516,7 @@ workflows:
filters:
branches:
only: postnightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2718,7 +2690,7 @@ workflows:
branches:
only: postnightly
libtorch_variant: "shared-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2732,7 +2704,7 @@ workflows:
branches:
only: postnightly
libtorch_variant: "shared-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2746,7 +2718,7 @@ workflows:
branches:
only: postnightly
libtorch_variant: "static-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -2760,7 +2732,7 @@ workflows:
branches:
only: postnightly
libtorch_variant: "static-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
@ -3212,7 +3184,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_manywheel_2_7mu_cu101_devtoolset7_nightly_build
build_environment: "manywheel 2.7mu cu101 devtoolset7"
@ -3221,7 +3193,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_manywheel_3_5m_cu101_devtoolset7_nightly_build
build_environment: "manywheel 3.5m cu101 devtoolset7"
@ -3230,7 +3202,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_build
build_environment: "manywheel 3.6m cu101 devtoolset7"
@ -3239,7 +3211,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_manywheel_3_7m_cu101_devtoolset7_nightly_build
build_environment: "manywheel 3.7m cu101 devtoolset7"
@ -3248,7 +3220,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_conda_2_7_cpu_devtoolset7_nightly_build
build_environment: "conda 2.7 cpu devtoolset7"
@ -3365,7 +3337,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_conda_3_5_cu101_devtoolset7_nightly_build
build_environment: "conda 3.5 cu101 devtoolset7"
@ -3374,7 +3346,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_build
build_environment: "conda 3.6 cu101 devtoolset7"
@ -3383,7 +3355,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_conda_3_7_cu101_devtoolset7_nightly_build
build_environment: "conda 3.7 cu101 devtoolset7"
@ -3392,7 +3364,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_libtorch_2_7m_cpu_devtoolset7_nightly_shared-with-deps_build
build_environment: "libtorch 2.7m cpu devtoolset7"
@ -3522,7 +3494,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "shared-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_libtorch_2_7m_cu101_devtoolset7_nightly_shared-without-deps_build
build_environment: "libtorch 2.7m cu101 devtoolset7"
@ -3532,7 +3504,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "shared-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_libtorch_2_7m_cu101_devtoolset7_nightly_static-with-deps_build
build_environment: "libtorch 2.7m cu101 devtoolset7"
@ -3542,7 +3514,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "static-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_libtorch_2_7m_cu101_devtoolset7_nightly_static-without-deps_build
build_environment: "libtorch 2.7m cu101 devtoolset7"
@ -3552,7 +3524,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "static-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
- binary_linux_build:
name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_nightly_shared-with-deps_build
build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
@ -4056,7 +4028,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4068,7 +4040,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4080,7 +4052,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4092,7 +4064,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4104,7 +4076,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4252,7 +4224,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4264,7 +4236,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4276,7 +4248,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4288,7 +4260,7 @@ workflows:
filters:
branches:
only: nightly
docker_image: "pytorch/conda-cuda"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4449,7 +4421,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "shared-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4462,7 +4434,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "shared-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4475,7 +4447,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "static-with-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
@ -4488,7 +4460,7 @@ workflows:
branches:
only: nightly
libtorch_variant: "static-without-deps"
docker_image: "pytorch/manylinux-cuda101"
docker_image: soumith/manylinux-cuda101@sha256:5d62be90d5b7777121180e6137c7eed73d37aaf9f669c51b783611e37e0b4916
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:

View File

@ -11,6 +11,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
source activate testenv >/dev/null
elif [[ "$DESIRED_PYTHON" == 2.7mu ]]; then
export PATH="/opt/python/cp27-cp27mu/bin:\$PATH"
elif [[ "$DESIRED_PYTHON" == 3.8m ]]; then
export PATH="/opt/python/cp38-cp38/bin:\$PATH"
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
python_nodot="\$(echo $DESIRED_PYTHON | tr -d m.u)"
export PATH="/opt/python/cp\$python_nodot-cp\${python_nodot}m/bin:\$PATH"

View File

@ -53,8 +53,10 @@ default_set = set([
'pytorch-macos-10.13-cuda9.2-cudnn7-py3',
# PyTorch Android
'pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build',
'pytorch-linux-xenial-py3-clang5-android-ndk-r19',
# PyTorch Android gradle
'pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-only-x86_32',
# Pytorch iOS builds
'pytorch-ios-11.2.1-x86_64_build',
'pytorch-ios-11.2.1-arm64_build',

View File

@ -19,27 +19,28 @@ jobs:
time docker pull ${DOCKER_IMAGE} >/dev/null
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
# TODO We may want to move the rebase logic to a separate step after checkout
# Rebase to master only if in xenial_py3_6_gcc5_4 case
if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
set -x
git config --global user.email "circleci.ossci@gmail.com"
git config --global user.name "CircleCI"
git config remote.origin.url https://github.com/pytorch/pytorch.git
git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
export GIT_COMMIT=${CIRCLE_SHA1}
echo "GIT_COMMIT: " ${GIT_COMMIT}
git checkout -f ${GIT_COMMIT}
git reset --hard ${GIT_COMMIT}
git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
set +x
else
echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
fi
# NB: Temporarily disable the rebase logic in v1.4.0, don't merge this change into master
# # TODO We may want to move the rebase logic to a separate step after checkout
# # Rebase to master only if in xenial_py3_6_gcc5_4 case
# if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
# echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
# set -x
# git config --global user.email "circleci.ossci@gmail.com"
# git config --global user.name "CircleCI"
# git config remote.origin.url https://github.com/pytorch/pytorch.git
# git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
# git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
# export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
# echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
# export GIT_COMMIT=${CIRCLE_SHA1}
# echo "GIT_COMMIT: " ${GIT_COMMIT}
# git checkout -f ${GIT_COMMIT}
# git reset --hard ${GIT_COMMIT}
# git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
# set +x
# else
# echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
# fi
git submodule sync && git submodule update -q --init --recursive

View File

@ -16,7 +16,7 @@ jobs:
python-version: 3.x
architecture: x64
- name: Checkout PyTorch
uses: actions/checkout@master
uses: actions/checkout@v1
- name: Ensure consistent CircleCI YAML config
run: |
pip install -r requirements.txt
@ -51,7 +51,7 @@ jobs:
python-version: 3.x
architecture: x64
- name: Fetch PyTorch
uses: actions/checkout@master
uses: actions/checkout@v1
- name: Checkout PR tip
run: |
set -eux
@ -87,7 +87,7 @@ jobs:
python-version: 2.x
architecture: x64
- name: Fetch PyTorch
uses: actions/checkout@master
uses: actions/checkout@v1
- name: Checkout PR tip
run: |
set -eux
@ -126,7 +126,7 @@ jobs:
python-version: 3.x
architecture: x64
- name: Checkout PyTorch
uses: actions/checkout@master
uses: actions/checkout@v1
- name: Checkout PR tip
run: |
set -eux

View File

@ -64,7 +64,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then
exit 0
fi
if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then
# if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then
# Hotfix, use hypothesis 3.44.6 on Ubuntu 14.04
# See comments on
# https://github.com/HypothesisWorks/hypothesis-python/commit/eadd62e467d6cee6216e71b391951ec25b4f5830
@ -74,9 +74,9 @@ if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then
sudo pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl
sudo pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl
sudo pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl
else
pip install --user --no-cache-dir hypothesis==3.59.0
fi
# else
# pip install --user --no-cache-dir hypothesis==3.59.0
# fi
# Collect additional tests to run (outside caffe2/python)
EXTRA_TESTS=()
@ -133,7 +133,7 @@ pip install --user pytest-sugar
# torchvision tests #
#####################
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
pip install -q --user git+https://github.com/pytorch/vision.git
pip install -q --user git+https://github.com/pytorch/vision.git@v0.5.0
pip install -q --user ninja
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
@ -141,7 +141,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
# Fix the pip error: Couldn't find a version that satisfies the requirement
sudo pip install --upgrade pip
pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.0.0.dev1104
pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.1.0.dev1228
fi
"$ROOT_DIR/scripts/onnx/test.sh"
fi

View File

@ -49,7 +49,7 @@ if [[ "$BUILD_ENVIRONMENT" != *ppc64le* ]]; then
export PATH="/var/lib/jenkins/.local/bin:$PATH"
# TODO: move this to Docker
pip_install --user hypothesis
pip_install --user "hypothesis==4.53.2"
# TODO: move this to Docker
PYTHON_VERSION=$(python -c 'import platform; print(platform.python_version())'|cut -c1)
@ -214,7 +214,7 @@ test_backward_compatibility() {
pushd test/backward_compatibility
python dump_all_function_schemas.py --filename new_schemas.txt
pip_uninstall torch
pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
pip_install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
python check_backward_compatibility.py --new-schemas new_schemas.txt
popd
set +x

View File

@ -22,7 +22,7 @@ if NOT "%BUILD_ENVIRONMENT%"=="" (
:: Numba is pinned to 0.44.0 to avoid https://github.com/numba/numba/issues/4352
call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0
)
pip install -q ninja future hypothesis "librosa>=0.6.2" psutil pillow
pip install -q ninja future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow
:: No need to install faulthandler since we only test Python >= 3.6 on Windows
:: faulthandler is builtin since Python 3.3

View File

@ -413,7 +413,7 @@ public abstract class Tensor {
*/
public long[] getDataAsLongArray() {
throw new IllegalStateException(
"Tensor of type " + getClass().getSimpleName() + " cannot return data as float array.");
"Tensor of type " + getClass().getSimpleName() + " cannot return data as long array.");
}
/**

View File

@ -16,7 +16,7 @@
// merge the libraries inside Facebook". Well, the problem is that there
// are some downstream applications which are at binary size limit, and
// incorporating all of the extra code from libtorch would push them
// over (admarket/adreview/service:adreviewservice, see also
// over (admarket/adreview/service:adreviewservice, see also
// https://github.com/pytorch/pytorch/pull/29299) So if you want to do that,
// we have to fix all of the services like this.
//
@ -50,10 +50,10 @@ struct CAFFE2_API VariableHooksInterface {
virtual const std::string& name(const Tensor&) const = 0;
};
C10_API void SetVariableHooks(VariableHooksInterface* hooks);
C10_API VariableHooksInterface* GetVariableHooks();
CAFFE2_API void SetVariableHooks(VariableHooksInterface* hooks);
CAFFE2_API VariableHooksInterface* GetVariableHooks();
struct C10_API VariableHooksRegisterer {
struct CAFFE2_API VariableHooksRegisterer {
explicit VariableHooksRegisterer(VariableHooksInterface* hooks) {
SetVariableHooks(hooks);
}

View File

@ -41,7 +41,7 @@ Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const T
auto denom = (mag_square1 * mag_square2).sqrt_();
auto cos = prod_sum / denom;
auto zeros = at::zeros_like(target, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto zeros = at::zeros_like(cos, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto pos = 1 - cos;
auto neg = (cos - margin).clamp_min_(0);
auto output_pos = at::where(target == 1, pos, zeros);
@ -77,8 +77,8 @@ Tensor margin_ranking_loss(const Tensor& input1, const Tensor& input2, const Ten
}
Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction) {
auto zeros = at::zeros_like(target, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto output_pos = target * (at::log(target) - input);
auto zeros = at::zeros_like(output_pos, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto output = at::where(target > 0, output_pos, zeros);
return apply_loss_reduction(output, reduction);
}
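The reordering above (computing the intermediate first and taking zeros_like of it rather than of target) matters once type promotion is involved: the intermediate can carry a different dtype than target, and at::where needs both branches to agree. A hedged Python sketch of that situation (an illustration of the general issue, not a test case taken from this change):

# Illustration only: mixed-dtype inputs promote the intermediate, so the zeros
# tensor has to match the intermediate rather than `target`.
import torch

inp = torch.randn(3, dtype=torch.double)   # log-probabilities
tgt = torch.rand(3, dtype=torch.float)     # probabilities

output_pos = tgt * (torch.log(tgt) - inp)  # promoted to double
zeros = torch.zeros_like(output_pos)       # double, matches output_pos
out = torch.where(tgt > 0, output_pos, zeros)
print(out.dtype)  # torch.float64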

View File

@ -324,13 +324,14 @@ Tensor unflatten(const Tensor& self, int64_t dim, IntArrayRef sizes, DimnameList
"up to the size of dim ", dim, " (", self.names()[dim], ": ", self.size(dim),
") in Tensor", self.names());
int64_t dim_wrap = maybe_wrap_dim(dim, self.dim());
auto outnames = self.names().vec();
outnames.erase(outnames.begin() + dim);
outnames.insert(outnames.begin() + dim, names.begin(), names.end());
outnames.erase(outnames.begin() + dim_wrap);
outnames.insert(outnames.begin() + dim_wrap, names.begin(), names.end());
auto new_sizes = self.sizes().vec();
new_sizes.erase(new_sizes.begin() + dim);
new_sizes.insert(new_sizes.begin() + dim, sizes.begin(), sizes.end());
new_sizes.erase(new_sizes.begin() + dim_wrap);
new_sizes.insert(new_sizes.begin() + dim_wrap, sizes.begin(), sizes.end());
Tensor result;
{

View File

@ -138,5 +138,14 @@ Tensor max_pool3d(
self, kernel_size, stride, padding, dilation, ceil_mode);
return std::get<0>(output_and_indices);
}
Tensor _test_optional_float(const Tensor & self, c10::optional<double> scale) {
if (scale.has_value()) {
return at::full({}, scale.value(), self.options());
} else {
return at::empty({0}, self.options());
}
}
} // namespace native
} // namespace at

View File

@ -134,6 +134,7 @@ std::vector<Tensor> where(const Tensor& condition) {
}
Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) {
TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype());
Tensor ret = at::empty(self.sizes(), self.options());
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(ret.scalar_type(), "where_cpu", [&] {
where_cpu<scalar_t>(ret, condition, self, other);

View File

@ -42,7 +42,22 @@ static void copy_kernel(TensorIterator& iter, bool non_blocking) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] {
using dest_t = scalar_t;
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(1), "copy_", [&] {
cpu_kernel(iter, c10::static_cast_with_inter_type<dest_t, scalar_t>);
// Note (@zasdfgbnm):
//
// The code below can not be simplified as
// cpu_kernel(iter, c10::static_cast_with_inter_type<dest_t, scalar_t>::apply);
//
// because this would force the compiler to instantiate the inline function and generate a function call in the loop
// instead of inlining it, making all the optimizations like vectorization impossible.
// You can verify this by looking at the symbols of `libtorch_cpu.so`:
//
// readelf -Ws libtorch_cpu.so | grep static_cast_with_inter_type
//
// If done correctly, the above command should have no output.
//
// See: https://github.com/pytorch/pytorch/issues/31271
cpu_kernel(iter, [](scalar_t src) -> dest_t {
return c10::static_cast_with_inter_type<dest_t, scalar_t>(src); });
});
});
}

View File

@ -171,8 +171,9 @@ void avg_pool2d_out_cuda_template(
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
const int count = safe_downcast<int, int64_t>(output.numel());
const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const uint32_t num_blocks = cuda::ATenCeilDiv<uint32_t>(count, num_threads);
if (divisor_override.has_value()) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(),
@ -184,7 +185,7 @@ void avg_pool2d_out_cuda_template(
scalar_t *input_data = input.data_ptr<scalar_t>();
avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, false, true>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
input_data,
nbatch,
@ -209,7 +210,7 @@ void avg_pool2d_out_cuda_template(
scalar_t *input_data = input.data_ptr<scalar_t>();
avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, true, false>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
input_data,
nbatch,
@ -233,7 +234,7 @@ void avg_pool2d_out_cuda_template(
scalar_t *input_data = input.data_ptr<scalar_t>();
avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, false, false>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
input_data,
nbatch,
@ -249,10 +250,8 @@ void avg_pool2d_out_cuda_template(
}
}
TORCH_CHECK(cudaGetLastError() == cudaSuccess,
"avg_pool2d_out_cuda_frame failed with error code ",
cudaGetLastError());
THCudaCheck(cudaGetLastError());
if (input.ndimension() == 3) {
output.resize_({nInputPlane, outputHeight, outputWidth});
@ -322,8 +321,9 @@ Tensor& avg_pool2d_backward_out_cuda_template(
gradInput.resize_as_(input);
const int count = safe_downcast<int, int64_t>(input.numel());
const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const int32_t count = safe_downcast<int32_t, int64_t>(input.numel());
const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const uint32_t num_blocks = cuda::ATenCeilDiv<uint32_t>(count, num_threads);
if (divisor_override.has_value()) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(),
@ -335,7 +335,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, false, true>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
gradOutput_data,
nbatch,
@ -360,7 +360,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, true, false>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
gradOutput_data,
nbatch,
@ -384,7 +384,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, false, false>
<<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
gradOutput_data,
nbatch,
@ -400,9 +400,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
}
}
TORCH_CHECK(cudaGetLastError() == cudaSuccess,
"avg_pool2d_backward_out_cuda failed with error code ",
cudaGetLastError());
THCudaCheck(cudaGetLastError());
return gradInput;
}
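For readers unfamiliar with the launch-grid idiom above, the precomputed num_blocks is just an integer ceiling division of the element count by the thread count; a one-line Python equivalent (illustration only):

# Ceiling division used to size the CUDA launch grid (illustration only).
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

assert ceil_div(1000, 256) == 4  # 4 blocks of 256 threads cover 1000 elements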

View File

@ -22,16 +22,88 @@ static inline __host__ __device__ T powi(T a, T b) {
return result;
}
// SFINAE doesn't work well with NVCC under Windows for math functions like pow and sqrt.
// So we need to define the functions with the explicit function signatures.
// As for pow, the following signatures are defined as the device function:
// pow(float, int)
// pow(double, int)
// pow(float, float)
// pow(double, double)
// As for sqrt, the following signatures are defined as the device function:
// sqrt(float)
// sqrt(double)
// As for inverse sqrt, we must define it explicitly in MSVC, otherwise the static cast will be
// applied to the result of the inline function, and thus the result is incorrect.
// e.g. if we use 1.0 / sqrt(2) for 2 ^ (-0.5) in MSVC, we get
// int(2 ^ (-0.5)) = int(1.0 / sqrt(2)) = int(1.0 / int(1.414)) = int(1.0 / 1) = 1
// However, the correct result is
// int(2 ^ (-0.5)) = int(1.0 / 1.414) = 0
#ifdef _MSC_VER
// Functions for pow
// pow for at::Half
static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) {
return static_cast<at::Half>(std::pow(static_cast<float>(base), static_cast<float>(exp)));
}
// pow (floating, floating/int)
template <typename Base_type, typename Exp_type>
static inline __host__ __device__ typename std::enable_if<std::is_floating_point<Base_type>::value && (std::is_same<Base_type, Exp_type>::value || std::is_same<Exp_type, int>::value), Base_type>::type
pow_(Base_type base, Exp_type exp) {
return std::pow(base, exp);
}
// pow (integral, integral)
template <typename Base_type, typename Exp_type>
static inline __host__ __device__ typename std::enable_if<std::is_integral<Base_type>::value && std::is_same<Base_type, Exp_type>::value, Base_type>::type
pow_(Base_type base, Exp_type exp) {
return powi(base, exp);
}
// pow (Otherwise)
template <typename Base_type, typename Exp_type>
static inline __host__ __device__ typename std::enable_if<!std::is_same<Base_type, Exp_type>::value && !std::is_same<Exp_type, int>::value, Base_type>::type
pow_(Base_type base, Exp_type exp) {
return static_cast<Base_type>(std::pow(static_cast<double>(base), static_cast<double>(exp)));
}
// Functions for sqrt
// sqrt (floating)
template <typename T>
static inline __host__ __device__ T sqrt(T x) {
static inline __host__ __device__ typename std::enable_if<std::is_floating_point<T>::value, T>::type sqrt_(T x) {
return std::sqrt(x);
}
// sqrt (integral)
template <typename T>
static inline __host__ __device__ typename std::enable_if<!std::is_floating_point<T>::value, T>::type sqrt_(T x) {
return static_cast<T>(std::sqrt(static_cast<double>(x)));
}
// Function for inverse sqrt
// invsqrt (floating)
template <typename T>
static inline __host__ __device__ typename std::enable_if<std::is_floating_point<T>::value, T>::type invsqrt_(T x) {
return 1.0 / std::sqrt(x);
}
// invsqrt (integral)
template <typename T>
static inline __host__ __device__ typename std::enable_if<!std::is_floating_point<T>::value, T>::type invsqrt_(T x) {
return static_cast<T>(1.0 / std::sqrt(static_cast<double>(x)));
}
#else
template <typename Base_type, typename Exp_type>
static inline __host__ __device__ Base_type pow_(Base_type base, Exp_type exp) {
return std::pow(base, exp);
}
template <typename T>
static inline __host__ __device__ T sqrt_(T x) {
return ::sqrt(x);
}
template <typename T>
static inline __host__ __device__ T invsqrt_(T x) {
return 1.0 / ::sqrt(x);
}
#endif
void pow_tensor_tensor_kernel(TensorIterator& iter) {
if (isFloatingType(iter.dtype())) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "pow_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t {
return std::pow(base, exp);
return pow_(base, exp);
});
});
} else {
@ -49,7 +121,7 @@ void pow_tensor_scalar_kernel_impl(TensorIterator& iter,
const auto d_exp = static_cast<double>(exp);
if (d_exp == 0.5) {
gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type {
return ::sqrt(base);
return sqrt_(base);
});
} else if (d_exp == 2) {
gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type {
@ -61,7 +133,7 @@ void pow_tensor_scalar_kernel_impl(TensorIterator& iter,
});
} else if (d_exp == -0.5) {
gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type {
return 1.0 / ::sqrt(base);
return invsqrt_(base);
});
} else if (d_exp == -1) {
gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type {
@ -73,7 +145,7 @@ void pow_tensor_scalar_kernel_impl(TensorIterator& iter,
});
} else {
gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type {
return std::pow(base, exp);
return pow_(base, exp);
});
}
}

View File

@ -52,7 +52,7 @@ Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t step
} else if (steps == 1) {
r.fill_(start);
} else {
AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "linspace_cuda", [&]() {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(r.scalar_type(), "linspace_cuda", [&]() {
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
@ -84,7 +84,7 @@ Tensor& logspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t step
} else if (steps == 1) {
r.fill_(std::pow(base, start.to<double>()));
} else {
AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "logspace_cuda", [&]() {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(r.scalar_type(), "logspace_cuda", [&]() {
scalar_t scalar_base = static_cast<scalar_t>(base);
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
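A quick, hedged way to exercise the half-precision path enabled by the dispatch change above (assumes a CUDA device is available; this snippet is not part of the change's own test plan):

# Minimal smoke test for half-precision linspace/logspace on CUDA.
import torch

if torch.cuda.is_available():
    x = torch.linspace(0, 1, steps=11, dtype=torch.half, device="cuda")
    y = torch.logspace(0, 3, steps=4, dtype=torch.half, device="cuda")
    print(x.dtype, y)  # torch.float16, tensor([1., 10., 100., 1000.], ...)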

View File

@ -6068,6 +6068,9 @@
CPU: replication_pad3d_backward_cpu
CUDA: replication_pad3d_backward_cuda
- func: _test_optional_float(Tensor self, *, float? scale=None) -> Tensor
variants: function
- func: upsample_linear1d.out(Tensor self, int[1] output_size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
dispatch:

View File

@ -77,6 +77,8 @@ def type_argument_translations(arg):
# Enables float by translating to legacy double.
elif t == 'float':
t = 'double'
elif t == 'float?':
t = 'double?'
# Enables str by translating to legacy std::string.
elif t == 'str':
t = 'std::string'

View File

@ -0,0 +1,25 @@
#include "caffe2/operators/alias_with_name.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(AliasWithName, AliasWithNameOp<CPUContext>);
OPERATOR_SCHEMA(AliasWithName)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.IdenticalTypeAndShape()
.SetDoc(R"DOC(
Similar to AliasOp, storing the alias name as an operator argument.
)DOC")
.Arg("name", "name of the aliasing")
.Arg("is_backward", "weather or not to alias forward or backward")
.Input(0, "input", "Input tensor whose storage will be shared.")
.Output(0, "output", "Tensor of same shape as input, sharing its storage.");
} // namespace caffe2
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
AliasWithName,
"_caffe2::AliasWithName(Tensor input, str name, bool is_backward = False) -> (Tensor output)",
caffe2::AliasWithNameOp<caffe2::CPUContext>);
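A hedged usage sketch of the c10-exported schema above, assuming a build in which the _caffe2 ops are registered (the in-place aliasing behaviour is also exercised by the Python tests further down in this diff):

# Sketch only: requires a PyTorch build with the Caffe2 operators registered.
import torch

x = torch.tensor([3.0, 42.0])
y = torch.ops._caffe2.AliasWithName(x, "my_alias")  # y shares x's storage
x[1] = 6.0
assert y[1].item() == 6.0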

View File

@ -0,0 +1,12 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/alias_with_name.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(AliasWithName, AliasWithNameOp<CUDAContext>);
} // namespace caffe2
C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(
AliasWithName,
caffe2::AliasWithNameOp<caffe2::CUDAContext>);

View File

@ -0,0 +1,46 @@
#ifndef ALIAS_WITH_NAME_OP_H_
#define ALIAS_WITH_NAME_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include "caffe2/core/operator.h"
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(AliasWithName)
namespace caffe2 {
template <class Context>
class AliasWithNameOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit AliasWithNameOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
name_(this->template GetSingleArgument<std::string>(
"name",
"invalid_name")),
is_backward_(
this->template GetSingleArgument<bool>("is_backward", false)) {
CAFFE_ENFORCE(
OperatorBase::HasArgument("name"), "You have to specify argument name");
}
bool RunOnDevice() override {
auto& input = Input(0);
CAFFE_ENFORCE_GE(input.numel(), 0, "Tensor is not initialized");
// This doesn't work anymore as this is a "newstyle" operator
// OutputTensorAlias(0, input);
OperatorBase::SetOutputTensor(0, input.Alias());
return true;
}
protected:
std::string name_;
bool is_backward_;
};
} // namespace caffe2
#endif // ALIAS_WITH_NAME_OP_H_

View File

@ -0,0 +1,169 @@
#include "caffe2/operators/batch_permutation_op.h"
#include <cstring>
#include <vector>
#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif
namespace caffe2 {
template <bool forwards>
void batch_permutation_loop(
const int N,
const int K,
const float* src,
const int* indices,
float* dst) {
long numBytes = K * sizeof(float);
if (forwards) {
#ifdef _OPENMP
#if (_OPENMP >= 201307)
#pragma omp parallel for simd
#else
#pragma omp parallel for
#endif
#endif
for (int n = 0; n < N; n++) {
int origIdx = n * K;
int permuteIdx = indices[n] * K;
std::memcpy(dst + origIdx, src + permuteIdx, numBytes);
}
} else {
std::vector<int> backward_indices(N);
for (size_t i = 0; i < N; ++i) {
backward_indices[indices[i]] = i;
}
for (int n = 0; n < N; n++) {
int permuteIdx = n * K;
int origIdx = backward_indices[n] * K;
std::memcpy(dst + permuteIdx, src + origIdx, numBytes);
}
}
}
template <>
bool BatchPermutationOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& indices = Input(1);
CAFFE_ENFORCE(indices.dim() == 1, "indices must be 1-d");
CAFFE_ENFORCE(
X.dim32(0) == indices.dim32(0),
"X.dim32(0) must be equal to indices.dim32(0)",
"(",
X.dim32(0),
" vs. ",
indices.dim32(0),
")");
auto* Y = Output(0, X.sizes(), at::dtype<float>());
CAFFE_ENFORCE_GT(X.dim32(0), 0);
batch_permutation_loop<true>(
X.dim32(0),
X.numel() / X.dim32(0),
X.data<float>(),
indices.data<int>(),
Y->mutable_data<float>());
return true;
}
template <>
bool BatchPermutationGradientOp<float, CPUContext>::RunOnDevice() {
auto& indices = Input(0);
auto& dY = Input(1);
auto* dX = Output(0, dY.sizes(), at::dtype<float>());
CAFFE_ENFORCE_GT(dY.dim32(0), 0);
batch_permutation_loop<false>(
dY.dim32(0),
dY.numel() / dY.dim32(0),
dY.data<float>(),
indices.data<int>(),
dX->mutable_data<float>());
return true;
}
#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
BatchPermutation,
IDEEPFallbackOp<BatchPermutationOp<float, CPUContext>>);
#endif
REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
BatchPermutationGradient,
BatchPermutationGradientOp<float, CPUContext>);
// Input: X, indices; Output: Y
OPERATOR_SCHEMA(BatchPermutation)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Batch permutation of an input tensor X given input indices. First dimension of
X equals batch size N. The indices input stores a permutation of the batch indices [0, N).
The output Y is a tensor of the same shape as X, with data re-ordered according to
the indices within the batch size.
Example of batch permutation on a 2-D tensor with batch size 4:
X = [
[1, 5, 2, 3, 4, 6, 0],
[4, 3, 3, 5, 2, 3, 1],
[2, 2, 3, 6, 0, 0, 1],
[0, 0, 1, 1, 2, 2, 3]
]
indices = [2, 0, 1, 3]
Y = [
[2, 2, 3, 6, 0, 0, 1],
[1, 5, 2, 3, 4, 6, 0],
[4, 3, 3, 5, 2, 3, 1],
[0, 0, 1, 1, 2, 2, 3]
]
Example of batch permutation on a 3-D tensor with batch size 4:
X = [
[[1, 5, 2], [3, 4, 6, 0]],
[[4, 3, 3], [5, 2, 3, 1]],
[[2, 2, 3], [6, 0, 0, 1]],
[[0, 0, 1], [1, 2, 2, 3]]
]
indices = [2, 0, 1, 3]
Y = [
[[2, 2, 3], [6, 0, 0, 1]],
[[1, 5, 2], [3, 4, 6, 0]],
[[4, 3, 3], [5, 2, 3, 1]],
[[0, 0, 1], [1, 2, 2, 3]]
]
)DOC")
.Input(0, "X", "Input tensor, where 1st dimension equals batch size")
.Input(1, "indices", "Input indices of batch to permute")
.Output(0, "Y", "Output permuted tensor");
// Input: indices, dY (aka "gradOutput"); Output: dX (aka "gradInput")
OPERATOR_SCHEMA(BatchPermutationGradient).NumInputs(2).NumOutputs(1);
class GetBatchPermutationGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"BatchPermutationGradient",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(BatchPermutation, GetBatchPermutationGradient);
} // namespace caffe2
using BatchPermutationOpFloatCPU =
caffe2::BatchPermutationOp<float, caffe2::CPUContext>;
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
BatchPermutation,
"_caffe2::BatchPermutation(Tensor X, Tensor indices) -> Tensor",
BatchPermutationOpFloatCPU);
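As a compact restatement of the operator schema documented above, the following NumPy sketch captures the forward and gradient semantics (helper names are illustrative only and are not part of Caffe2):

# NumPy restatement of the BatchPermutation semantics documented above.
import numpy as np

def batch_permutation(X, indices):
    # Forward: Y[n] = X[indices[n]] for each batch element n.
    return X[indices]

def batch_permutation_gradient(indices, dY):
    # Backward: scatter rows back, dX[indices[n]] = dY[n].
    dX = np.empty_like(dY)
    dX[indices] = dY
    return dX

X = np.arange(8, dtype=np.float32).reshape(4, 2)
indices = np.array([2, 0, 1, 3])
Y = batch_permutation(X, indices)
assert np.array_equal(batch_permutation_gradient(indices, Y), X)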

View File

@ -0,0 +1,113 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/batch_permutation_op.h"
namespace caffe2 {
namespace {
template <bool forward>
__global__ void BatchPermutationKernel(
int N,
int K,
const float* src,
const int* indices,
float* dst) {
if (forward) {
CUDA_1D_KERNEL_LOOP(index, N * K) {
int k = index % K;
int n = index / K;
int idx = indices[n];
CUDA_KERNEL_ASSERT(idx >= 0);
CUDA_KERNEL_ASSERT(idx < N);
dst[index] = src[idx * K + k];
}
} else {
CUDA_1D_KERNEL_LOOP(index, N * K) {
int k = index % K;
int n = index / K;
// NOTE: an alternative implementation if we want to align the index with
// the output tensor (rather than the input tensor).
// int idx = -1;
// for (size_t i = 0; i < N; ++i) {
// if (indices[i] == n) {
// idx = i;
// }
// }
// CUDA_KERNEL_ASSERT(idx >= 0);
// CUDA_KERNEL_ASSERT(idx < N);
// dst[index] = src[idx * K + k];
int idx = indices[n];
CUDA_KERNEL_ASSERT(idx >= 0);
CUDA_KERNEL_ASSERT(idx < N);
dst[idx * K + k] = src[index];
}
}
}
} // namespace
template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& indices = Input(1);
CAFFE_ENFORCE(indices.dim() == 1, "indices must be 1-d");
CAFFE_ENFORCE(
X.dim32(0) == indices.dim32(0),
"X.dim32(0) must be equal to indices.dim32(0)",
"(",
X.dim32(0),
" vs. ",
indices.dim32(0),
")");
auto* Y = Output(0, X.sizes(), at::dtype<float>());
CAFFE_ENFORCE_GT(X.dim32(0), 0);
BatchPermutationKernel<true>
<<<CAFFE_GET_BLOCKS(X.numel()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.dim32(0),
X.numel() / X.dim32(0),
X.data<float>(),
indices.data<int>(),
Y->mutable_data<float>());
return true;
}
template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
auto& indices = Input(0);
auto& dY = Input(1);
auto* dX = Output(0, dY.sizes(), at::dtype<float>());
CAFFE_ENFORCE_GT(dY.dim32(0), 0);
BatchPermutationKernel<false>
<<<CAFFE_GET_BLOCKS(dY.numel()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
dY.dim32(0),
dY.numel() / dY.dim32(0),
dY.data<float>(),
indices.data<int>(),
dX->mutable_data<float>());
return true;
}
REGISTER_CUDA_OPERATOR(
BatchPermutation,
BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
BatchPermutationGradient,
BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2
using BatchPermutationOpFloatCUDA =
caffe2::BatchPermutationOp<float, caffe2::CUDAContext>;
C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(BatchPermutation, BatchPermutationOpFloatCUDA);

View File

@ -0,0 +1,37 @@
#ifndef BATCHPERMUTATION_OP_H_
#define BATCHPERMUTATION_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(BatchPermutation)
namespace caffe2 {
template <typename T, class Context>
class BatchPermutationOp final : public Operator<Context> {
public:
template <class... Args>
explicit BatchPermutationOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice();
};
template <typename T, class Context>
class BatchPermutationGradientOp final : public Operator<Context> {
public:
BatchPermutationGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice();
};
} // namespace caffe2
#endif // BATCHPERMUTATION_OP_H_

View File

@ -0,0 +1,269 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/operators/batch_permutation_op.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "gtest/gtest.h"
namespace caffe2 {
namespace {
// Add the vector as an input to a Workspace depending on the context of the
// workspace
template <typename T>
void AddInputCPU(
const vector<int64_t>& shape,
const vector<T>& values,
const string& name,
Workspace* ws) {
Blob* blob = ws->CreateBlob(name);
auto* tensor = BlobGetMutableTensor(blob, CPU);
tensor->Resize(shape);
EigenVectorMap<T> tensor_vec(tensor->mutable_data<T>(), tensor->numel());
tensor_vec.array() = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>{
values.data(), static_cast<int>(values.size())};
}
template <typename T>
void AddInputGPU(
const vector<int64_t>& shape,
const vector<T>& values,
const string& name,
Workspace* ws) {
Tensor tmp(shape, CPU);
EigenVectorMap<T> tmp_vec(tmp.mutable_data<T>(), tmp.numel());
tmp_vec.array() = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>{
values.data(), static_cast<int>(values.size())};
Blob* blob = ws->CreateBlob(name);
auto* tensor = BlobGetMutableTensor(blob, CUDA);
tensor->CopyFrom(tmp);
}
// Overload 4 different signatures for AddInput because clang does not allow
// template <typename T>
// void AddInput<CPUContext>(...) {...}
template <typename T, class Context>
void AddInput(
const vector<int64_t>& shape,
const vector<T>& values,
const string& name,
Workspace* ws);
template <>
void AddInput<int, CPUContext>(
const vector<int64_t>& shape,
const vector<int>& values,
const string& name,
Workspace* ws) {
AddInputCPU<int>(shape, values, name, ws);
}
template <>
void AddInput<float, CPUContext>(
const vector<int64_t>& shape,
const vector<float>& values,
const string& name,
Workspace* ws) {
AddInputCPU<float>(shape, values, name, ws);
}
template <>
void AddInput<int, CUDAContext>(
const vector<int64_t>& shape,
const vector<int>& values,
const string& name,
Workspace* ws) {
AddInputGPU<int>(shape, values, name, ws);
}
template <>
void AddInput<float, CUDAContext>(
const vector<int64_t>& shape,
const vector<float>& values,
const string& name,
Workspace* ws) {
AddInputGPU<float>(shape, values, name, ws);
}
template <class Context>
DeviceTypeProto GetDeviceType() {
return PROTO_CPU;
}
template <>
DeviceTypeProto GetDeviceType<CUDAContext>() {
return PROTO_CUDA;
}
// Create a BatchPermutationOp with the given inputs (actual values are
// generated sequentially) and run it
template <class Context>
void CreateAndRun(
TensorCPU* outResult,
int N,
vector<int64_t>& shape,
vector<float>& features,
vector<int> indices) {
Workspace ws;
AddInput<float, Context>(shape, features, "X", &ws);
AddInput<int, Context>(vector<int64_t>{N}, indices, "indices", &ws);
OperatorDef def;
def.set_name("test");
def.set_type("BatchPermutation");
def.add_input("X");
def.add_input("indices");
def.add_output("Y");
def.mutable_device_option()->set_device_type(GetDeviceType<Context>());
unique_ptr<OperatorBase> op = CreateOperator(def, &ws);
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Y_blob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Y_blob);
auto& Y = Y_blob->Get<Tensor>();
outResult->CopyFrom(Y);
}
// Create a BatchPermutationOp with the given inputs (actual values are
// generated sequentially) and run it
template <class Context>
void CreateAndRunGradient(
TensorCPU* outResult,
int N,
vector<int64_t>& shape,
vector<float>& features,
vector<int> indices) {
Workspace ws;
AddInput<float, Context>(shape, features, "dY", &ws);
AddInput<int, Context>(vector<int64_t>{N}, indices, "indices", &ws);
OperatorDef def;
def.set_name("test");
def.set_type("BatchPermutationGradient");
def.add_input("indices");
def.add_input("dY");
def.add_output("dX");
def.mutable_device_option()->set_device_type(GetDeviceType<Context>());
unique_ptr<OperatorBase> op = CreateOperator(def, &ws);
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Y_blob = ws.GetBlob("dX");
EXPECT_NE(nullptr, Y_blob);
auto& Y = Y_blob->Get<Tensor>();
outResult->CopyFrom(Y);
}
// Check that the CPU and GPU implementations provide the exact same results
void CheckCPUGPUEqual(vector<int64_t> shape, vector<int> indices) {
// Prepare input data
EXPECT_GT(shape.size(), 1);
int N = shape[0];
int input_size = 1;
for (auto k : shape) {
input_size *= k;
}
int K = input_size / N;
vector<float> features(input_size);
std::iota(features.begin(), features.end(), 0);
// CPU outputs
Tensor y_cpu{CPU};
Tensor y_cpu_grad{CPU};
// CPU BatchPermutation
CreateAndRun<CPUContext>(&y_cpu, N, shape, features, indices);
// CPU BatchPermutationGradient
CreateAndRunGradient<CPUContext>(&y_cpu_grad, N, shape, features, indices);
// Check CPU output values
for (auto i = 0; i < indices.size(); ++i) {
for (auto k = 0; k < K; ++k) {
EXPECT_NEAR(
y_cpu.data<float>()[indices[i] * K + k], features[i * K + k], 1e4);
EXPECT_NEAR(
y_cpu_grad.data<float>()[i * K + k],
features[indices[i] * K + k],
1e4);
}
}
if (!caffe2::HasCudaGPU()) {
VLOG(2) << "No CudaGPU found. Skip GPU test." << std::endl;
return;
}
// GPU outputs
Tensor y_gpu{CPU};
Tensor y_gpu_grad{CPU};
// GPU BatchPermutation
CreateAndRun<CUDAContext>(&y_gpu, N, shape, features, indices);
// Compare CPU and GPU BatchPermutation outputs
EXPECT_EQ(y_cpu.sizes(), y_gpu.sizes());
ConstEigenVectorMap<float> y_cpu_vec(y_cpu.data<float>(), y_cpu.numel());
ConstEigenVectorMap<float> y_gpu_vec(y_gpu.data<float>(), y_gpu.numel());
EXPECT_TRUE(y_cpu_vec.isApprox(y_gpu_vec));
// GPU BatchPermutationGradient
CreateAndRunGradient<CUDAContext>(&y_gpu_grad, N, shape, features, indices);
// Check GPU outputs
for (auto i = 0; i < indices.size(); ++i) {
for (auto k = 0; k < K; ++k) {
EXPECT_NEAR(
y_gpu.data<float>()[indices[i] * K + k], features[i * K + k], 1e4);
EXPECT_NEAR(
y_gpu_grad.data<float>()[i * K + k],
features[indices[i] * K + k],
1e4);
}
}
// Compare CPU and GPU BatchPermutationGradient outputs
EXPECT_EQ(y_cpu_grad.sizes(), y_gpu_grad.sizes());
ConstEigenVectorMap<float> y_cpu_vec_grad(
y_cpu_grad.data<float>(), y_cpu_grad.numel());
ConstEigenVectorMap<float> y_gpu_vec_grad(
y_gpu_grad.data<float>(), y_gpu_grad.numel());
EXPECT_TRUE(y_cpu_vec_grad.isApprox(y_gpu_vec_grad));
}
} // namespace
TEST(BatchPermutationTest, CHECKCPUGPUEqualGenericDimension) {
auto t0 = std::chrono::high_resolution_clock::now();
int batch_size = 8;
int max_dimension = 6;
vector<int64_t> shape = vector<int64_t>{batch_size};
auto seed = std::chrono::system_clock::now().time_since_epoch().count();
std::default_random_engine generator(seed);
for (int i = 2; i < max_dimension; ++i) {
std::uniform_int_distribution<> dis(1, i);
shape.push_back(dis(generator));
CheckCPUGPUEqual(shape, vector<int>{0, 1, 2, 3, 4, 5, 6, 7});
CheckCPUGPUEqual(shape, vector<int>{7, 6, 5, 4, 3, 2, 1, 0});
CheckCPUGPUEqual(shape, vector<int>{1, 3, 5, 7, 0, 2, 4, 6});
CheckCPUGPUEqual(shape, vector<int>{4, 5, 6, 7, 0, 1, 2, 3});
CheckCPUGPUEqual(shape, vector<int>{3, 1, 5, 7, 6, 2, 4, 0});
}
auto t1 = std::chrono::high_resolution_clock::now();
double elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
VLOG(2) << "Time elapsed: " << elapsed << " ms" << std::endl;
return;
}
} // namespace caffe2

View File

@ -136,7 +136,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
bias_data = Input(BIAS).template data<T>();
}
auto f = [&](Tensor* col_buffer) {
auto f = [this, &filter_offset, &bias_data, &X, &buffer_shape, &N, &Xdata, &offset_data, &M, &filter, &output_image_size, &kernel_dim, &Ydata, &input_offset, &offset_offset, &output_offset] (Tensor* col_buffer) {
col_buffer->Resize(buffer_shape);
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2col, followed by gemm.

View File

@ -0,0 +1,31 @@
#!/usr/bin/env python3
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
from caffe2.python import core, utils
from hypothesis import given
class TestAliasWithNameOp(hu.HypothesisTestCase):
@given(
shape=st.lists(st.integers(0, 5), min_size=1, max_size=3),
dtype=st.sampled_from([np.float32, np.int64]),
**hu.gcs
)
def test_alias_with_name_op(self, shape, dtype, dc, gc):
test_input = (100 * np.random.random(shape)).astype(dtype)
test_inputs = [test_input]
alias_op = core.CreateOperator(
"AliasWithName",
["input"],
["output"],
device_option=gc,
)
alias_op.arg.add().CopyFrom(utils.MakeArgument("name", "whatever_name"))
def reference_func(x):
return (x,)
self.assertReferenceChecks(gc, alias_op, test_inputs, reference_func)

View File

@ -139,6 +139,7 @@ class TestMomentumSGD(serial.SerializedTestCase):
[grad, m, lr, w, indices],
sparse)
@unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/31368")
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs)
def test_fp16momentum_sgd(self, n, nesterov, gc, dc):

View File

@ -710,6 +710,15 @@ class TorchIntegration(hu.HypothesisTestCase):
torch.testing.assert_allclose(torch.tensor(expected_output), actual_output)
def test_alias_with_name_is_in_place(self):
device = "cuda" if workspace.has_cuda_support else "cpu"
x = torch.Tensor([3, 42]).to(device)
y = torch.ops._caffe2.AliasWithName(x, "new_name")
x[1] = 6
torch.testing.assert_allclose(x, torch.Tensor([3, 6]).to(device))
# y should also change because y is alias of x
torch.testing.assert_allclose(y, torch.Tensor([3, 6]).to(device))
if __name__ == '__main__':
unittest.main()

View File

@ -36,6 +36,7 @@ torch.optim
- Vincent Quenneville-Belair (`vincentqb <https://github.com/vincentqb>`__)
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
- Wanchao Liang (`wanchaol <https://github.com/wanchaol>`__)
Autograd Engine
~~~~~~~~~~~~~~~
@ -95,6 +96,20 @@ MKLDNN
- Junjie Bai (`bddppq <https://github.com/bddppq>`__)
- Yinghai Lu (`yinghai <https://github.com/yinghai>`__)
Mobile
~~~~~~
- David Reiss (`dreiss <https://github.com/dreiss>`__)
- Jiakai Liu (`ljk53 <https://github.com/ljk53>`__)
Quantization
~~~~~~~~~~~~
- Raghuraman Krishnamoorthi (`raghuramank100 <https://github.com/raghuramank100>`__)
- Jerry Zhang (`jerryzh168 <https://github.com/jerryzh168>`__)
- Lingyi Liu (`lly-zero-one <https://github.com/lly-zero-one>`__)
- James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
XLA
~~~
@ -138,6 +153,9 @@ ONNX <-> PyTorch
~~~~~~~~~~~~~~~~
- Lu Fang (`houseroad <https://github.com/houseroad>`__)
- Lara Haidar (`lara-hdr <https://github.com/lara-hdr>`__)
- Spandan Tiwari (`spandantiwari <https://github.com/spandantiwari>`__)
- Bowen Bao (`BowenBao <https://github.com/BowenBao>`__)
Windows
~~~~~~~

View File

@ -46,7 +46,6 @@ extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.napoleon',

View File

@ -17,13 +17,13 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
notes/*
PyTorch on XLA Devices <http://pytorch.org/xla/>
.. toctree::
:glob:
:maxdepth: 1
:caption: Community
:maxdepth: 1
:caption: Language Bindings
community/*
C++ API <https://pytorch.org/cppdocs/>
Javadoc <https://pytorch.org/javadoc/>
.. toctree::
:maxdepth: 1
@ -78,13 +78,13 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
:caption: torchtext Reference
torchtext <https://pytorch.org/text>
.. toctree::
:maxdepth: 1
:caption: Other Languages
:glob:
:maxdepth: 1
:caption: Community
C++ API <https://pytorch.org/cppdocs/>
packages
community/*
Indices and tables
==================

View File

@ -314,13 +314,14 @@ The :class:`~torch.distributed.optim.DistributedOptimizer` operates as follows:
Simple end to end example
^^^^^^^^^^^^^^^^^^^^^^^^^
Putting it all together, a very simple end to end example using distributed
autograd and distributed optimizer is as follows:
Putting it all together, the following is a simple end to end example using
distributed autograd and the distributed optimizer. If the code is placed into a
file called "dist_autograd_simple.py", it can be run with the command
:code:`MASTER_ADDR="localhost" MASTER_PORT=29500 python dist_autograd_simple.py`:
.. code::
import multiprocessing as mp
from tempfile import NamedTemporaryFile
import torch
import torch.distributed.autograd as dist_autograd
from torch.distributed import rpc
@ -330,52 +331,52 @@ autograd and distributed optimizer is as follows:
def random_tensor():
return torch.rand((3, 3), requires_grad=True)
def _run_process(self_rank, dst_rank, file_name):
self_name = "worker{}".format(self_rank)
def _run_process(rank, dst_rank, world_size):
name = "worker{}".format(rank)
dst_name = "worker{}".format(dst_rank)
# Initialize RPC.
rpc.init_rpc(
self_name=self_name,
self_rank=self_rank,
worker_name_to_id={"worker0": 0, "worker1": 1},
init_method="file://{}".format(file_name),
name=name,
rank=rank,
world_size=world_size
)
# Use a distributed autograd context.
with dist_autograd.context() as context_id:
# Forward pass (create references on remote nodes).
rref1 = rpc.remote(dst_name, random_tensor)
rref2 = rpc.remote(dst_name, random_tensor)
loss = rref1.to_here() + rref2.to_here()
# Forward pass (create references on remote nodes).
rref1 = rpc.remote(dst_name, random_tensor)
rref2 = rpc.remote(dst_name, random_tensor)
loss = rref1.to_here() + rref2.to_here()
# Backward pass (run distributed autograd).
dist_autograd.backward([loss.sum()])
# Backward pass (run distributed autograd).
dist_autograd.backward([loss.sum()])
# Build DistributedOptimizer.
dist_optim = DistributedOptimizer(
optim.SGD,
[rref1, rref2],
lr=0.05,
)
# Build DistributedOptimizer.
dist_optim = DistributedOptimizer(
optim.SGD,
[rref1, rref2],
lr=0.05,
)
# Run the distributed optimizer step.
dist_optim.step()
# Run the distributed optimizer step.
dist_optim.step()
def run_process(self_rank, dst_rank, file_name):
_run_process(self_rank, dst_rank, file_name)
rpc.wait_all_workers()
def run_process(rank, dst_rank, world_size):
_run_process(rank, dst_rank, world_size)
rpc.shutdown()
file_name = NamedTemporaryFile().name
processes = []
# Run two workers.
for i in range(2):
p = mp.Process(target=run_process, args=(i, (i + 1) % 2, file_name))
# Run world_size workers.
world_size = 2
for i in range(world_size):
p = mp.Process(target=run_process, args=(i, (i + 1) % 2, world_size))
p.start()
processes.append(p)
for p in processes:
p.join()
.. _RFC: https://github.com/pytorch/pytorch/issues/23110

View File

@ -1,67 +0,0 @@
DType
=====
.. java:package:: org.pytorch
:noindex:
.. java:type:: public enum DType
Codes representing tensor data types.
Enum Constants
--------------
FLOAT32
^^^^^^^
.. java:field:: public static final DType FLOAT32
:outertype: DType
Code for dtype torch.float32. \ :java:ref:`Tensor.dtype()`\
FLOAT64
^^^^^^^
.. java:field:: public static final DType FLOAT64
:outertype: DType
Code for dtype torch.float64. \ :java:ref:`Tensor.dtype()`\
INT32
^^^^^
.. java:field:: public static final DType INT32
:outertype: DType
Code for dtype torch.int32. \ :java:ref:`Tensor.dtype()`\
INT64
^^^^^
.. java:field:: public static final DType INT64
:outertype: DType
Code for dtype torch.int64. \ :java:ref:`Tensor.dtype()`\
INT8
^^^^
.. java:field:: public static final DType INT8
:outertype: DType
Code for dtype torch.int8. \ :java:ref:`Tensor.dtype()`\
UINT8
^^^^^
.. java:field:: public static final DType UINT8
:outertype: DType
Code for dtype torch.uint8. \ :java:ref:`Tensor.dtype()`\
Fields
------
jniCode
^^^^^^^
.. java:field:: final int jniCode
:outertype: DType

View File

@ -1,297 +0,0 @@
.. java:import:: java.util Locale
.. java:import:: java.util Map
IValue
======
.. java:package:: org.pytorch
:noindex:
.. java:type:: public class IValue
Java representation of a TorchScript value, which is implemented as a tagged union that can be one of the supported types: https://pytorch.org/docs/stable/jit.html#types .
Calling \ ``toX``\ methods for inappropriate types will throw \ :java:ref:`IllegalStateException`\ .
\ ``IValue``\ objects are constructed with \ ``IValue.from(value)``\ , \ ``IValue.tupleFrom(value1, value2, ...)``\ , \ ``IValue.listFrom(value1, value2, ...)``\ , or one of the \ ``dict``\ methods, depending on the key type.
Data is retrieved from \ ``IValue``\ objects with the \ ``toX()``\ methods. Note that \ ``str``\ -type IValues must be extracted with \ :java:ref:`toStr()`\ , rather than \ :java:ref:`toString()`\ .
\ ``IValue``\ objects may retain references to objects passed into their constructors, and may return references to their internal state from \ ``toX()``\ .
Methods
-------
dictLongKeyFrom
^^^^^^^^^^^^^^^
.. java:method:: public static IValue dictLongKeyFrom(Map<Long, IValue> map)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``Dict[int, V]``\ .
dictStringKeyFrom
^^^^^^^^^^^^^^^^^
.. java:method:: public static IValue dictStringKeyFrom(Map<String, IValue> map)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``Dict[str, V]``\ .
from
^^^^
.. java:method:: public static IValue from(Tensor tensor)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``Tensor``\ .
from
^^^^
.. java:method:: public static IValue from(boolean value)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``bool``\ .
from
^^^^
.. java:method:: public static IValue from(long value)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``int``\ .
from
^^^^
.. java:method:: public static IValue from(double value)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``float``\ .
from
^^^^
.. java:method:: public static IValue from(String value)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``str``\ .
isBool
^^^^^^
.. java:method:: public boolean isBool()
:outertype: IValue
isBoolList
^^^^^^^^^^
.. java:method:: public boolean isBoolList()
:outertype: IValue
isDictLongKey
^^^^^^^^^^^^^
.. java:method:: public boolean isDictLongKey()
:outertype: IValue
isDictStringKey
^^^^^^^^^^^^^^^
.. java:method:: public boolean isDictStringKey()
:outertype: IValue
isDouble
^^^^^^^^
.. java:method:: public boolean isDouble()
:outertype: IValue
isDoubleList
^^^^^^^^^^^^
.. java:method:: public boolean isDoubleList()
:outertype: IValue
isList
^^^^^^
.. java:method:: public boolean isList()
:outertype: IValue
isLong
^^^^^^
.. java:method:: public boolean isLong()
:outertype: IValue
isLongList
^^^^^^^^^^
.. java:method:: public boolean isLongList()
:outertype: IValue
isNull
^^^^^^
.. java:method:: public boolean isNull()
:outertype: IValue
isString
^^^^^^^^
.. java:method:: public boolean isString()
:outertype: IValue
isTensor
^^^^^^^^
.. java:method:: public boolean isTensor()
:outertype: IValue
isTensorList
^^^^^^^^^^^^
.. java:method:: public boolean isTensorList()
:outertype: IValue
isTuple
^^^^^^^
.. java:method:: public boolean isTuple()
:outertype: IValue
listFrom
^^^^^^^^
.. java:method:: public static IValue listFrom(boolean... list)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``List[bool]``\ .
listFrom
^^^^^^^^
.. java:method:: public static IValue listFrom(long... list)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``List[int]``\ .
listFrom
^^^^^^^^
.. java:method:: public static IValue listFrom(double... list)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``List[float]``\ .
listFrom
^^^^^^^^
.. java:method:: public static IValue listFrom(Tensor... list)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``List[Tensor]``\ .
listFrom
^^^^^^^^
.. java:method:: public static IValue listFrom(IValue... array)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``List[T]``\ . All elements must have the same type.
optionalNull
^^^^^^^^^^^^
.. java:method:: public static IValue optionalNull()
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``Optional``\ that contains no value.
toBool
^^^^^^
.. java:method:: public boolean toBool()
:outertype: IValue
toBoolList
^^^^^^^^^^
.. java:method:: public boolean[] toBoolList()
:outertype: IValue
toDictLongKey
^^^^^^^^^^^^^
.. java:method:: public Map<Long, IValue> toDictLongKey()
:outertype: IValue
toDictStringKey
^^^^^^^^^^^^^^^
.. java:method:: public Map<String, IValue> toDictStringKey()
:outertype: IValue
toDouble
^^^^^^^^
.. java:method:: public double toDouble()
:outertype: IValue
toDoubleList
^^^^^^^^^^^^
.. java:method:: public double[] toDoubleList()
:outertype: IValue
toList
^^^^^^
.. java:method:: public IValue[] toList()
:outertype: IValue
toLong
^^^^^^
.. java:method:: public long toLong()
:outertype: IValue
toLongList
^^^^^^^^^^
.. java:method:: public long[] toLongList()
:outertype: IValue
toStr
^^^^^
.. java:method:: public String toStr()
:outertype: IValue
toTensor
^^^^^^^^
.. java:method:: public Tensor toTensor()
:outertype: IValue
toTensorList
^^^^^^^^^^^^
.. java:method:: public Tensor[] toTensorList()
:outertype: IValue
toTuple
^^^^^^^
.. java:method:: public IValue[] toTuple()
:outertype: IValue
tupleFrom
^^^^^^^^^
.. java:method:: public static IValue tupleFrom(IValue... array)
:outertype: IValue
Creates a new \ ``IValue``\ of type \ ``Tuple[T0, T1, ...]``\ .

View File

@ -1,55 +0,0 @@
.. java:import:: com.facebook.jni HybridData
Module
======
.. java:package:: org.pytorch
:noindex:
.. java:type:: public class Module
Java wrapper for torch::jit::script::Module.
Methods
-------
destroy
^^^^^^^
.. java:method:: public void destroy()
:outertype: Module
Explicitly destroys the native torch::jit::script::Module. Calling this method is not required, as the native object will be destroyed when this object is garbage-collected. However, the timing of garbage collection is not guaranteed, so proactively calling \ ``destroy``\ can free memory more quickly. See \ :java:ref:`com.facebook.jni.HybridData.resetNative`\ .
forward
^^^^^^^
.. java:method:: public IValue forward(IValue... inputs)
:outertype: Module
Runs the 'forward' method of this module with the specified arguments.
:param inputs: arguments for the TorchScript module's 'forward' method.
:return: return value from the 'forward' method.
load
^^^^
.. java:method:: public static Module load(String modelPath)
:outertype: Module
Loads a serialized TorchScript module from the specified path on the disk.
:param modelPath: path to file that contains the serialized TorchScript module.
:return: new \ :java:ref:`org.pytorch.Module`\ object which owns torch::jit::script::Module.
runMethod
^^^^^^^^^
.. java:method:: public IValue runMethod(String methodName, IValue... inputs)
:outertype: Module
Runs the specified method of this module with the specified arguments.
:param methodName: name of the TorchScript method to run.
:param inputs: arguments that will be passed to TorchScript method.
:return: return value from the method.

View File

@ -1,60 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_float32
=====================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_float32 extends Tensor
:outertype: Tensor
Constructors
------------
Tensor_float32
^^^^^^^^^^^^^^
.. java:constructor:: Tensor_float32(FloatBuffer data, long[] shape)
:outertype: Tensor.Tensor_float32
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_float32
getDataAsFloatArray
^^^^^^^^^^^^^^^^^^^
.. java:method:: @Override public float[] getDataAsFloatArray()
:outertype: Tensor.Tensor_float32
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_float32
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_float32

View File

@ -1,52 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_float64
=====================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_float64 extends Tensor
:outertype: Tensor
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_float64
getDataAsDoubleArray
^^^^^^^^^^^^^^^^^^^^
.. java:method:: @Override public double[] getDataAsDoubleArray()
:outertype: Tensor.Tensor_float64
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_float64
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_float64

View File

@ -1,52 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_int32
===================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_int32 extends Tensor
:outertype: Tensor
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_int32
getDataAsIntArray
^^^^^^^^^^^^^^^^^
.. java:method:: @Override public int[] getDataAsIntArray()
:outertype: Tensor.Tensor_int32
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_int32
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_int32

View File

@ -1,52 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_int64
===================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_int64 extends Tensor
:outertype: Tensor
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_int64
getDataAsLongArray
^^^^^^^^^^^^^^^^^^
.. java:method:: @Override public long[] getDataAsLongArray()
:outertype: Tensor.Tensor_int64
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_int64
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_int64

View File

@ -1,52 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_int8
==================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_int8 extends Tensor
:outertype: Tensor
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_int8
getDataAsByteArray
^^^^^^^^^^^^^^^^^^
.. java:method:: @Override public byte[] getDataAsByteArray()
:outertype: Tensor.Tensor_int8
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_int8
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_int8

View File

@ -1,52 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor.Tensor_uint8
===================
.. java:package:: org.pytorch
:noindex:
.. java:type:: static class Tensor_uint8 extends Tensor
:outertype: Tensor
Methods
-------
dtype
^^^^^
.. java:method:: @Override public DType dtype()
:outertype: Tensor.Tensor_uint8
getDataAsUnsignedByteArray
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. java:method:: @Override public byte[] getDataAsUnsignedByteArray()
:outertype: Tensor.Tensor_uint8
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: @Override Buffer getRawDataBuffer()
:outertype: Tensor.Tensor_uint8
toString
^^^^^^^^
.. java:method:: @Override public String toString()
:outertype: Tensor.Tensor_uint8

View File

@ -1,315 +0,0 @@
.. java:import:: java.nio Buffer
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio ByteOrder
.. java:import:: java.nio DoubleBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.nio IntBuffer
.. java:import:: java.nio LongBuffer
.. java:import:: java.util Arrays
.. java:import:: java.util Locale
Tensor
======
.. java:package:: org.pytorch
:noindex:
.. java:type:: public abstract class Tensor
Representation of a Tensor. Behavior is similar to PyTorch's tensor objects.
Most tensors will be constructed as \ ``Tensor.fromBlob(data, shape)``\ , where \ ``data``\ can be an array or a direct \ :java:ref:`Buffer`\ (of the proper subclass). Helper methods are provided to allocate buffers properly.
To access Tensor data, see \ :java:ref:`dtype()`\ , \ :java:ref:`shape()`\ , and various \ ``getDataAs*``\ methods.
When constructing \ ``Tensor``\ objects with \ ``data``\ as an array, it is not specified whether this data is copied or retained as a reference so it is recommended not to modify it after constructing. \ ``data``\ passed as a \ :java:ref:`Buffer`\ is not copied, so it can be modified between \ :java:ref:`Module`\ calls to avoid reallocation. Data retrieved from \ ``Tensor``\ objects may be copied or may be a reference to the \ ``Tensor``\ 's internal data buffer. \ ``shape``\ is always copied.
Methods
-------
allocateByteBuffer
^^^^^^^^^^^^^^^^^^
.. java:method:: public static ByteBuffer allocateByteBuffer(int numElements)
:outertype: Tensor
Allocates a new direct \ :java:ref:`java.nio.ByteBuffer`\ with native byte order with specified capacity that can be used in \ :java:ref:`Tensor.fromBlob(ByteBuffer,long[])`\ , \ :java:ref:`Tensor.fromBlobUnsigned(ByteBuffer,long[])`\ .
:param numElements: capacity (number of elements) of result buffer.
allocateDoubleBuffer
^^^^^^^^^^^^^^^^^^^^
.. java:method:: public static DoubleBuffer allocateDoubleBuffer(int numElements)
:outertype: Tensor
Allocates a new direct \ :java:ref:`java.nio.DoubleBuffer`\ with native byte order with specified capacity that can be used in \ :java:ref:`Tensor.fromBlob(DoubleBuffer,long[])`\ .
:param numElements: capacity (number of elements) of result buffer.
allocateFloatBuffer
^^^^^^^^^^^^^^^^^^^
.. java:method:: public static FloatBuffer allocateFloatBuffer(int numElements)
:outertype: Tensor
Allocates a new direct \ :java:ref:`java.nio.FloatBuffer`\ with native byte order with specified capacity that can be used in \ :java:ref:`Tensor.fromBlob(FloatBuffer,long[])`\ .
:param numElements: capacity (number of elements) of result buffer.
allocateIntBuffer
^^^^^^^^^^^^^^^^^
.. java:method:: public static IntBuffer allocateIntBuffer(int numElements)
:outertype: Tensor
Allocates a new direct \ :java:ref:`java.nio.IntBuffer`\ with native byte order with specified capacity that can be used in \ :java:ref:`Tensor.fromBlob(IntBuffer,long[])`\ .
:param numElements: capacity (number of elements) of result buffer.
allocateLongBuffer
^^^^^^^^^^^^^^^^^^
.. java:method:: public static LongBuffer allocateLongBuffer(int numElements)
:outertype: Tensor
Allocates a new direct \ :java:ref:`java.nio.LongBuffer`\ with native byte order with specified capacity that can be used in \ :java:ref:`Tensor.fromBlob(LongBuffer,long[])`\ .
:param numElements: capacity (number of elements) of result buffer.
dtype
^^^^^
.. java:method:: public abstract DType dtype()
:outertype: Tensor
:return: data type of this tensor.
dtypeJniCode
^^^^^^^^^^^^
.. java:method:: int dtypeJniCode()
:outertype: Tensor
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(byte[] data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int8 with specified shape and data as array of bytes.
:param data: Tensor elements
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(int[] data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int32 with specified shape and data as array of ints.
:param data: Tensor elements
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(float[] data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.float32 with specified shape and data as array of floats.
:param data: Tensor elements
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(long[] data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int64 with specified shape and data as array of longs.
:param data: Tensor elements
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(long[] shape, double[] data)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.float64 with specified shape and data as array of doubles.
:param shape: Tensor shape
:param data: Tensor elements
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(ByteBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int8 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(IntBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int32 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(FloatBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.float32 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(LongBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.int64 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
fromBlob
^^^^^^^^
.. java:method:: public static Tensor fromBlob(DoubleBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.float64 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
fromBlobUnsigned
^^^^^^^^^^^^^^^^
.. java:method:: public static Tensor fromBlobUnsigned(byte[] data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.uint8 with specified shape and data as array of bytes.
:param data: Tensor elements
:param shape: Tensor shape
fromBlobUnsigned
^^^^^^^^^^^^^^^^
.. java:method:: public static Tensor fromBlobUnsigned(ByteBuffer data, long[] shape)
:outertype: Tensor
Creates a new Tensor instance with dtype torch.uint8 with specified shape and data.
:param data: Direct buffer with native byte order that contains \ ``Tensor.numel(shape)``\ elements. The buffer is used directly without copying, and changes to its content will change the tensor.
:param shape: Tensor shape
getDataAsByteArray
^^^^^^^^^^^^^^^^^^
.. java:method:: public byte[] getDataAsByteArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-int8 tensor.
:return: a Java byte array that contains the tensor data. This may be a copy or reference.
getDataAsDoubleArray
^^^^^^^^^^^^^^^^^^^^
.. java:method:: public double[] getDataAsDoubleArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-float64 tensor.
:return: a Java double array that contains the tensor data. This may be a copy or reference.
getDataAsFloatArray
^^^^^^^^^^^^^^^^^^^
.. java:method:: public float[] getDataAsFloatArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-float32 tensor.
:return: a Java float array that contains the tensor data. This may be a copy or reference.
getDataAsIntArray
^^^^^^^^^^^^^^^^^
.. java:method:: public int[] getDataAsIntArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-int32 tensor.
:return: a Java int array that contains the tensor data. This may be a copy or reference.
getDataAsLongArray
^^^^^^^^^^^^^^^^^^
.. java:method:: public long[] getDataAsLongArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-int64 tensor.
:return: a Java long array that contains the tensor data. This may be a copy or reference.
getDataAsUnsignedByteArray
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. java:method:: public byte[] getDataAsUnsignedByteArray()
:outertype: Tensor
:throws IllegalStateException: if it is called for a non-uint8 tensor.
:return: a Java byte array that contains the tensor data. This may be a copy or reference.
getRawDataBuffer
^^^^^^^^^^^^^^^^
.. java:method:: Buffer getRawDataBuffer()
:outertype: Tensor
numel
^^^^^
.. java:method:: public long numel()
:outertype: Tensor
Returns the number of elements in this tensor.
numel
^^^^^
.. java:method:: public static long numel(long[] shape)
:outertype: Tensor
Calculates the number of elements in a tensor with the specified shape.
shape
^^^^^
.. java:method:: public long[] shape()
:outertype: Tensor
Returns the shape of this tensor. (The array is a fresh copy.)

View File

@ -1,114 +0,0 @@
.. java:import:: android.graphics Bitmap
.. java:import:: android.graphics ImageFormat
.. java:import:: android.media Image
.. java:import:: org.pytorch Tensor
.. java:import:: java.nio ByteBuffer
.. java:import:: java.nio FloatBuffer
.. java:import:: java.util Locale
TensorImageUtils
================
.. java:package:: org.pytorch.torchvision
:noindex:
.. java:type:: public final class TensorImageUtils
Contains utility functions for \ :java:ref:`org.pytorch.Tensor`\ creation from \ :java:ref:`android.graphics.Bitmap`\ or \ :java:ref:`android.media.Image`\ source.
Fields
------
TORCHVISION_NORM_MEAN_RGB
^^^^^^^^^^^^^^^^^^^^^^^^^
.. java:field:: public static float[] TORCHVISION_NORM_MEAN_RGB
:outertype: TensorImageUtils
TORCHVISION_NORM_STD_RGB
^^^^^^^^^^^^^^^^^^^^^^^^
.. java:field:: public static float[] TORCHVISION_NORM_STD_RGB
:outertype: TensorImageUtils
Methods
-------
bitmapToFloat32Tensor
^^^^^^^^^^^^^^^^^^^^^
.. java:method:: public static Tensor bitmapToFloat32Tensor(Bitmap bitmap, float[] normMeanRGB, float[] normStdRGB)
:outertype: TensorImageUtils
Creates a new \ :java:ref:`org.pytorch.Tensor`\ from a full \ :java:ref:`android.graphics.Bitmap`\ , normalized with the mean and std specified in the parameters.
:param normMeanRGB: means for RGB channels normalization, length must equal 3, RGB order
:param normStdRGB: standard deviation for RGB channels normalization, length must equal 3, RGB order
bitmapToFloat32Tensor
^^^^^^^^^^^^^^^^^^^^^
.. java:method:: public static Tensor bitmapToFloat32Tensor(Bitmap bitmap, int x, int y, int width, int height, float[] normMeanRGB, float[] normStdRGB)
:outertype: TensorImageUtils
Creates a new \ :java:ref:`org.pytorch.Tensor`\ from the specified area of an \ :java:ref:`android.graphics.Bitmap`\ , normalized with the mean and std specified in the parameters.
:param bitmap: \ :java:ref:`android.graphics.Bitmap`\ as a source for Tensor data
:param x: - x coordinate of top left corner of bitmap's area
:param y: - y coordinate of top left corner of bitmap's area
:param width: - width of bitmap's area
:param height: - height of bitmap's area
:param normMeanRGB: means for RGB channels normalization, length must equal 3, RGB order
:param normStdRGB: standard deviation for RGB channels normalization, length must equal 3, RGB order
bitmapToFloatBuffer
^^^^^^^^^^^^^^^^^^^
.. java:method:: public static void bitmapToFloatBuffer(Bitmap bitmap, int x, int y, int width, int height, float[] normMeanRGB, float[] normStdRGB, FloatBuffer outBuffer, int outBufferOffset)
:outertype: TensorImageUtils
Writes tensor content from the specified \ :java:ref:`android.graphics.Bitmap`\ , normalized with the mean and std specified in the parameters, to the specified \ :java:ref:`java.nio.FloatBuffer`\ at the specified offset.
:param bitmap: \ :java:ref:`android.graphics.Bitmap`\ as a source for Tensor data
:param x: - x coordinate of top left corner of bitmap's area
:param y: - y coordinate of top left corner of bitmap's area
:param width: - width of bitmap's area
:param height: - height of bitmap's area
:param normMeanRGB: means for RGB channels normalization, length must equal 3, RGB order
:param normStdRGB: standard deviation for RGB channels normalization, length must equal 3, RGB order
imageYUV420CenterCropToFloat32Tensor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. java:method:: public static Tensor imageYUV420CenterCropToFloat32Tensor(Image image, int rotateCWDegrees, int tensorWidth, int tensorHeight, float[] normMeanRGB, float[] normStdRGB)
:outertype: TensorImageUtils
Creates a new \ :java:ref:`org.pytorch.Tensor`\ from the specified area of an \ :java:ref:`android.media.Image`\ , applying optional rotation, scaling (nearest) and center cropping.
:param image: \ :java:ref:`android.media.Image`\ as a source for Tensor data
:param rotateCWDegrees: Clockwise angle through which the input image needs to be rotated to be upright. Range of valid values: 0, 90, 180, 270
:param tensorWidth: return tensor width, must be positive
:param tensorHeight: return tensor height, must be positive
:param normMeanRGB: means for RGB channels normalization, length must equal 3, RGB order
:param normStdRGB: standard deviation for RGB channels normalization, length must equal 3, RGB order
imageYUV420CenterCropToFloatBuffer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. java:method:: public static void imageYUV420CenterCropToFloatBuffer(Image image, int rotateCWDegrees, int tensorWidth, int tensorHeight, float[] normMeanRGB, float[] normStdRGB, FloatBuffer outBuffer, int outBufferOffset)
:outertype: TensorImageUtils
Writes tensor content from the specified \ :java:ref:`android.media.Image`\ , applying optional rotation, scaling (nearest) and center cropping, to the specified \ :java:ref:`java.nio.FloatBuffer`\ at the specified offset.
:param image: \ :java:ref:`android.media.Image`\ as a source for Tensor data
:param rotateCWDegrees: Clockwise angle through which the input image needs to be rotated to be upright. Range of valid values: 0, 90, 180, 270
:param tensorWidth: return tensor width, must be positive
:param tensorHeight: return tensor height, must be positive
:param normMeanRGB: means for RGB channels normalization, length must equal 3, RGB order
:param normStdRGB: standard deviation for RGB channels normalization, length must equal 3, RGB order
:param outBuffer: Output buffer, where tensor content will be written
:param outBufferOffset: Output buffer offset with which tensor content will be written

View File

@ -1,18 +0,0 @@
org.pytorch
===========
.. java:package:: org.pytorch
.. toctree::
:maxdepth: 1
DType
IValue
Module
Tensor
Tensor-Tensor_float32
Tensor-Tensor_float64
Tensor-Tensor_int32
Tensor-Tensor_int64
Tensor-Tensor_int8
Tensor-Tensor_uint8

View File

@ -1,9 +0,0 @@
org.pytorch.torchvision
=======================
.. java:package:: org.pytorch.torchvision
.. toctree::
:maxdepth: 1
TensorImageUtils

View File

@ -1,7 +0,0 @@
Javadoc
=======
.. toctree::
:maxdepth: 2
org/pytorch/package-index

View File

@ -42,6 +42,27 @@ The corresponding implementation is chosen automatically based on the PyTorch bu
Quantization-aware training (through :class:`~torch.quantization.FakeQuantize`) supports both CPU and CUDA.
.. note::
When preparing a quantized model, it is necessary to ensure that qconfig and the engine used for quantized computations match
the backend on which the model will be executed. Quantization currently supports two backends: fbgemm (for use on x86 CPUs,
`<https://github.com/pytorch/FBGEMM>`_) and qnnpack (for use on ARM CPUs, via the QNNPACK library `<https://github.com/pytorch/QNNPACK>`_).
For example, if you are interested in quantizing a model to run on ARM, it is recommended to set the qconfig by calling:
``qconfig = torch.quantization.get_default_qconfig('qnnpack')``
for post training quantization and
``qconfig = torch.quantization.get_default_qat_qconfig('qnnpack')``
for quantization aware training.
In addition, the torch.backends.quantized.engine parameter should be set to match the backend. To use qnnpack for inference, the
backend is set to qnnpack as follows:
``torch.backends.quantized.engine = 'qnnpack'``
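As a concrete illustration, here is a minimal post-training static quantization sketch targeting qnnpack (the toy model, its layer sizes, and the single calibration batch are illustrative assumptions rather than part of the documented API; the qnnpack engine must be available in the build)::

    import torch
    import torch.nn as nn
    import torch.quantization as tq

    # a tiny float model with the QuantStub/DeQuantStub pair required for static quantization
    class SmallModel(nn.Module):
        def __init__(self):
            super(SmallModel, self).__init__()
            self.quant = tq.QuantStub()
            self.conv = nn.Conv2d(3, 8, 3)
            self.relu = nn.ReLU()
            self.dequant = tq.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)              # float -> quantized
            x = self.relu(self.conv(x))
            return self.dequant(x)         # quantized -> float

    model_fp32 = SmallModel().eval()

    # the qconfig and the quantized engine must both target the same backend
    model_fp32.qconfig = tq.get_default_qconfig('qnnpack')
    torch.backends.quantized.engine = 'qnnpack'

    tq.prepare(model_fp32, inplace=True)
    model_fp32(torch.randn(1, 3, 32, 32))  # calibration pass with representative data
    model_int8 = tq.convert(model_fp32)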
Quantized Tensors
---------------------------------------
@ -111,7 +132,7 @@ Operations that are available from the ``torch`` namespace or as methods on Tens
* :func:`~torch.quantize_per_tensor` - Convert float tensor to quantized tensor with per-tensor scale and zero point
* :func:`~torch.quantize_per_channel` - Convert float tensor to quantized tensor with per-channel scale and zero point
* View-based operations like :meth:`~torch.Tensor.view`, :meth:`~torch.Tensor.as_strided`, :meth:`~torch.Tensor.expand`, :meth:`~torch.Tensor.flatten`, :meth:`~torch.Tensor.slice`, python-style indexing, etc - work as on regular tensor (if quantization is not per-channel)
* View-based operations like :meth:`~torch.Tensor.view`, :meth:`~torch.Tensor.as_strided`, :meth:`~torch.Tensor.expand`, :meth:`~torch.Tensor.flatten`, :meth:`~torch.Tensor.select`, python-style indexing, etc - work as on regular tensor (if quantization is not per-channel)
* Comparators
* :meth:`~torch.Tensor.ne` — Not equal
* :meth:`~torch.Tensor.eq` — Equal
@ -132,12 +153,24 @@ Operations that are available from the ``torch`` namespace or as methods on Tens
* :meth:`~torch.Tensor.q_per_channel_scales` — Returns the scales of the per-channel quantized tensor
* :meth:`~torch.Tensor.q_per_channel_zero_points` — Returns the zero points of the per-channel quantized tensor
* :meth:`~torch.Tensor.q_per_channel_axis` — Returns the channel axis of the per-channel quantized tensor
* :meth:`~torch.Tensor.relu` — Rectified linear unit (copy)
* :meth:`~torch.Tensor.relu_` — Rectified linear unit (inplace)
* :meth:`~torch.Tensor.resize_` — In-place resize
* :meth:`~torch.Tensor.sort` — Sorts the tensor
* :meth:`~torch.Tensor.topk` — Returns k largest values of a tensor
``torch.nn.functional``
~~~~~~~~~~~~~~~~~~~~~~
Basic activations are supported; a short usage sketch follows this list.
* :meth:`~torch.nn.functional.relu` — Rectified linear unit (copy)
* :meth:`~torch.nn.functional.relu_` — Rectified linear unit (inplace)
* :meth:`~torch.nn.functional.max_pool2d` - Maximum pooling
* :meth:`~torch.nn.functional.adaptive_avg_pool2d` - Adaptive average pooling
* :meth:`~torch.nn.functional.avg_pool2d` - Average pooling
* :meth:`~torch.nn.functional.interpolate` - Interpolation
* :meth:`~torch.nn.functional.upsample` - Upsampling
* :meth:`~torch.nn.functional.upsample_bilinear` - Bilinear Upsampling
* :meth:`~torch.nn.functional.upsample_nearest` - Upsampling Nearest
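A minimal sketch of applying these functionals directly to a quantized tensor (the scale and zero point below are arbitrary illustrative values)::

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 3, 8, 8)
    # quantize with an illustrative per-tensor scale / zero point
    qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)

    qy = F.relu(qx)                          # output stays quantized (torch.quint8)
    qy = F.max_pool2d(qy, kernel_size=2)
    qy = F.adaptive_avg_pool2d(qy, (1, 1))

    print(qy.dtype)                          # torch.quint8
    print(qy.dequantize())                   # float values for inspection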
``torch.nn.intrinsic``
~~~~~~~~~~~~~~~~~~~~~~
@ -432,7 +465,7 @@ Debugging utilities
.. autofunction:: get_observer_dict
.. autoclass:: RecordingObserver
torch.nn.instrinsic
torch.nn.intrinsic
--------------------------------
This module implements the combined (fused) modules conv + relu which can then be quantized.
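A minimal sketch of producing such a fused module with ``torch.quantization.fuse_modules`` (the model layout is chosen purely for illustration)::

    import torch.nn as nn
    from torch.quantization import fuse_modules

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU()).eval()

    # replaces the (conv, relu) pair with a single torch.nn.intrinsic.ConvReLU2d
    fused = fuse_modules(model, [['0', '1']])
    print(type(fused[0]))  # torch.nn.intrinsic.ConvReLU2d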
@ -546,6 +579,13 @@ Functional interface
.. autofunction:: conv2d
.. autofunction:: conv3d
.. autofunction:: max_pool2d
.. autofunction:: adaptive_avg_pool2d
.. autofunction:: avg_pool2d
.. autofunction:: interpolate
.. autofunction:: upsample
.. autofunction:: upsample_bilinear
.. autofunction:: upsample_nearest
.. automodule:: torch.nn.quantized

View File

@ -55,7 +55,7 @@ This library provides primitives allowing users to create and modify references
.. autofunction:: rpc_async
.. autofunction:: remote
.. autofunction:: get_worker_info
.. autofunction:: wait_all_workers
.. autofunction:: shutdown
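For orientation, a minimal sketch of the lifecycle these functions cover (assuming a hypothetical two-process launch with MASTER_ADDR, MASTER_PORT and RANK exported; ``shutdown`` replaces the former ``wait_all_workers`` entry point)::

    import os
    import torch
    import torch.distributed.rpc as rpc

    rank = int(os.environ["RANK"])  # 0 or 1, set by the launcher

    rpc.init_rpc("worker%d" % rank, rank=rank, world_size=2)

    if rank == 0:
        # synchronously run torch.add on worker1 and fetch the result
        ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2, 2), 1))

    rpc.shutdown()  # graceful shutdown; waits for outstanding work on all workers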
Distributed Autograd Framework
------------------------------

View File

@ -1,131 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "batch_permutation_op.h"
#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif
namespace caffe2 {
#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
BatchPermutation,
IDEEPFallbackOp<BatchPermutationOp<float, CPUContext>>);
#endif
REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
BatchPermutationGradient,
BatchPermutationGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(BatchPermutation)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Permute the batch elements of the input tensor X according to the permutation
specified in the input indices.
Warning: this op does not verify that indices is a valid permutation; gradient
computation is only correct if indices is a permutation.
)DOC")
.Input(
0,
"X",
"Tensor of at least 1D shape (N, D0, D1, ...).")
.Input(
1,
"indices",
"1D tensor of type int with shape (N, ) specifying a valid permutation "
"of the indices in [0, N - 1] (inclusive).")
.Output(
0,
"Y",
"Tensor with the same shape as X where the (D0, D1, ...) dimensional "
"batch elements of X are permuted according to the input indices.");
OPERATOR_SCHEMA(BatchPermutationGradient)
.NumInputs(2)
.NumOutputs(1)
.Input(
0,
"indices",
"See BatchPermutation.")
.Input(
1,
"dY",
"Gradient of forward output 0 (Y).")
.Output(
0,
"dX",
"Gradient of forward input 0 (X).");
template <>
bool BatchPermutationOp<float, CPUContext>::RunOnDevice() {
const auto& X = Input(0);
const auto& indices = Input(1);
CAFFE_ENFORCE_EQ(indices.dim(), 1, "indices must be 1-d");
CAFFE_ENFORCE_EQ(
X.dim32(0), indices.dim32(0),
"X.dim32(0) must be equal to indices.dim32(0)",
"(",
X.dim32(0),
" vs. ",
indices.dim32(0),
")");
auto* Y = Output(0, X.sizes(), at::dtype<float>());
const int N = X.dim32(0);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
const float *src = X.template data<float>();
float *dst = Y->template mutable_data<float>();
#ifdef _OPENMP
#if (_OPENMP >= 201307)
#pragma omp parallel for simd
#else
#pragma omp parallel for
#endif
#endif
for (int i = 0; i < N; i++) {
int idx = indices.template data<int>()[i];
std::memcpy(dst + i * C * H * W, src + idx * C * H * W, sizeof(float) * C * H * W);
}
return true;
}
class GetBatchPermutationGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"BatchPermutationGradient",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(BatchPermutation, GetBatchPermutationGradient);
} // namespace caffe2
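For reference, the forward semantics of this (removed) operator amount to indexing the batch dimension with the given permutation; a minimal PyTorch sketch (not the Caffe2 API) of the same behavior:

    import torch

    # hypothetical example: permute dim 0 of X by `indices`
    X = torch.arange(24, dtype=torch.float32).reshape(4, 2, 3)
    indices = torch.tensor([2, 0, 3, 1])

    Y = X[indices]                    # Y[i] == X[indices[i]]
    assert torch.equal(Y[0], X[2])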

View File

@ -1,112 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "modules/detectron/batch_permutation_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
template <bool forward>
__global__ void BatchPermutationKernel(
int N,
int C,
int H,
int W,
const float* src,
const int* indices,
float* dst) {
CUDA_1D_KERNEL_LOOP(index, N * C * H * W) {
int w = index % W;
int h = (index / W) % H;
int c = (index / W / H) % C;
int n = (index / W / H / C);
int idx = indices[n];
if (forward) {
dst[n * C * H * W + c * H * W + h * W + w] =
src[idx * C * H * W + c * H * W + h * W + w];
} else {
dst[idx * C * H * W + c * H * W + h * W + w] =
src[n * C * H * W + c * H * W + h * W + w];
}
}
}
}
template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& indices = Input(1);
CAFFE_ENFORCE(indices.ndim() == 1, "indices must be 1-d");
CAFFE_ENFORCE(
X.dim32(0) == indices.dim32(0),
"X.dim32(0) must be equal to indices.dim32(0)",
"(",
X.dim32(0),
" vs. ",
indices.dim32(0),
")");
auto* Y = Output(0, X.sizes(), at::dtype<float>());
BatchPermutationKernel<true><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
X.data<float>(),
indices.data<int>(),
Y->mutable_data<float>());
return true;
}
template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
auto& indices = Input(0);
auto& dY = Input(1);
auto* dX = Output(0, dY.sizes(), at::dtype<float>());
BatchPermutationKernel<false><<<
CAFFE_GET_BLOCKS(dY.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
dY.dim32(0),
dY.dim32(1),
dY.dim32(2),
dY.dim32(3),
dY.data<float>(),
indices.data<int>(),
dX->mutable_data<float>());
return true;
}
REGISTER_CUDA_OPERATOR(
BatchPermutation,
BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
BatchPermutationGradient,
BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,53 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef BATCHPERMUTATION_OP_H_
#define BATCHPERMUTATION_OP_H_
#include <cstring>
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
class BatchPermutationOp final : public Operator<Context> {
public:
BatchPermutationOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
};
template <typename T, class Context>
class BatchPermutationGradientOp final : public Operator<Context> {
public:
BatchPermutationGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
// No CPU implementation for now
CAFFE_NOT_IMPLEMENTED;
}
};
} // namespace caffe2
#endif // BATCHPERMUTATION_OP_H_

View File

@ -17,17 +17,23 @@ from torch._C import parse_schema
#
# Whitelist entries can be removed after the date listed on them passes.
white_list = [
('c10_experimental', datetime.date(2020, 1, 1)),
('_batch_norm_impl_index', datetime.date(2019, 11, 15)),
('_batch_norm_impl_index_backward', datetime.date(2019, 11, 15)),
('cudnn_batch_norm', datetime.date(2019, 11, 15)),
('cudnn_batch_norm_backward', datetime.date(2019, 11, 15)),
('_nnpack_spatial_convolution', datetime.date(2019, 11, 12)),
('_aten', datetime.date(2019, 12, 22)),
('_prim::ListConstruct', datetime.date(2019, 11, 22)),
('thnn_conv3d', datetime.date(9999, 1, 1)),
('thnn_conv3d.out', datetime.date(9999, 1, 1)),
('grad', datetime.date(2020, 1, 1)),
("aten::append", datetime.date(9999, 1, 1)),
("prim::AutogradAnyNonZero", datetime.date(9999, 1, 1)),
("aten::grad", datetime.date(9999, 1, 1)),
("_c10_experimental", datetime.date(9999, 1, 1)),
("aten::thnn_conv3d", datetime.date(9999, 1, 1)),
("aten::native_layer_norm_double_backward", datetime.date(9999, 1, 1)),
("aten::cudnn_batch_norm", datetime.date(9999, 1, 1)),
("aten::cudnn_batch_norm_backward", datetime.date(9999, 1, 1)),
("aten::_batch_norm_impl_index_backward", datetime.date(9999, 1, 1)),
("aten::empty_like", datetime.date(9999, 1, 1)),
("aten::_batch_norm_impl_index", datetime.date(9999, 1, 1)),
("aten::index_fill_", datetime.date(9999, 1, 1)),
("aten::index_fill", datetime.date(9999, 1, 1)),
("aten::log_softmax", datetime.date(9999, 1, 1)),
("aten::softmax", datetime.date(9999, 1, 1)),
("aten::thnn_conv3d_forward", datetime.date(9999, 1, 1)),
("aten::thnn_conv3d_backward.output_mask", datetime.date(9999, 1, 1)),
]
@ -43,6 +49,8 @@ def white_listed(schema, white_list):
def check_bc(new_schema_dict):
existing_schemas = torch._C._jit_get_all_schemas()
is_bc = True
broken_ops = []
for existing_schema in existing_schemas:
if white_listed(existing_schema, white_list):
print("skipping schema: ", str(existing_schema))
@ -60,13 +68,17 @@ def check_bc(new_schema_dict):
.format(
str(existing_schema),
"\n\t".join(str(s) for s in new_schemas)))
print('The PR is introducing backward incompatible changes to the '
'operator library. Please contact PyTorch team to confirm '
'whether this change is wanted or not.')
# TODO Print out more details about why candidates don't match.
return False
print('Found backward compatible schemas for all existing schemas')
return True
broken_ops.append(str(existing_schema))
is_bc = False
if is_bc:
print('Found backward compatible schemas for all existing schemas')
else:
print('The PR is introducing backward incompatible changes to the '
'operator library. Please contact PyTorch team to confirm '
'whether this change is wanted or not. \n Broken ops: [\n{}]'
.format("\n".join(broken_ops)))
return is_bc
if __name__ == '__main__':

View File

@ -903,6 +903,15 @@ class TestCase(expecttest.TestCase):
# Don't put this in the try block; the AssertionError will catch it
self.fail(msg="Did not raise when expected to")
def assertNotWarn(self, callable, msg=''):
r"""
Test if :attr:`callable` does not raise a warning.
"""
with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws:
warnings.simplefilter("always") # allow any warning to be raised
callable()
self.assertTrue(len(ws) == 0, msg)
def assertWarns(self, callable, msg=''):
r"""
Test if :attr:`callable` raises a warning.

View File

@ -145,7 +145,7 @@ TEST_F(ModuleTest, RegisterParameterUndefinedTensor) {
{
TestModel model;
model.register_parameter("undefined_tensor", torch::Tensor(), /*requires_grad=*/false);
ASSERT_FALSE(model.named_parameters()["undefined_tensor"].defined());
ASSERT_EQ(model.parameters().size(), 0);
}
{
std::stringstream buffer;
@ -153,7 +153,7 @@ TEST_F(ModuleTest, RegisterParameterUndefinedTensor) {
TestModel model;
model.register_parameter("undefined_tensor", torch::Tensor());
ASSERT_FALSE(model.named_parameters()["undefined_tensor"].defined());
ASSERT_EQ(model.parameters().size(), 0);
ASSERT_EQ(
count_substr_occurrences(
@ -221,6 +221,87 @@ TEST_F(ModuleTest, AsCastsModulesCorrectly) {
ASSERT_EQ(unit.as<AGIUnit>(), &unit);
}
void test_DeviceOrDtypeConversionSkipsUndefinedTensor(
torch::Device to_device, torch::Dtype to_dtype) {
{
// Case 1: Undefined tensors as parameters
Linear module(LinearOptions(10, 20).bias(false));
ASSERT_TRUE(module->weight.defined());
ASSERT_FALSE(module->bias.defined());
module->to(to_device);
ASSERT_TRUE(module->weight.defined());
ASSERT_EQ(module->weight.device().type(), to_device.type());
ASSERT_FALSE(module->bias.defined());
module->to(to_dtype);
ASSERT_TRUE(module->weight.defined());
ASSERT_EQ(module->weight.dtype(), to_dtype);
ASSERT_FALSE(module->bias.defined());
}
{
// Case 2: Undefined tensors as buffers
BatchNorm1d module(BatchNorm1dOptions(5).track_running_stats(false).affine(true));
ASSERT_TRUE(module->weight.defined());
ASSERT_FALSE(module->running_mean.defined());
module->to(to_device);
ASSERT_TRUE(module->weight.defined());
ASSERT_EQ(module->weight.device().type(), to_device.type());
ASSERT_FALSE(module->running_mean.defined());
module->to(to_dtype);
ASSERT_TRUE(module->weight.defined());
ASSERT_EQ(module->weight.dtype(), to_dtype);
ASSERT_FALSE(module->running_mean.defined());
}
}
TEST_F(ModuleTest, DeviceOrDtypeConversionSkipsUndefinedTensor) {
test_DeviceOrDtypeConversionSkipsUndefinedTensor(torch::kCPU, torch::kDouble);
}
TEST_F(ModuleTest, DeviceOrDtypeConversionSkipsUndefinedTensor_CUDA) {
test_DeviceOrDtypeConversionSkipsUndefinedTensor(torch::kCUDA, torch::kDouble);
}
TEST_F(ModuleTest, ParametersAndBuffersAccessorSkipsUndefinedTensor) {
{
Linear module(LinearOptions(10, 20).bias(false));
auto params = module->parameters();
ASSERT_EQ(params.size(), 1);
auto named_params = module->named_parameters();
ASSERT_EQ(named_params.size(), 1);
ASSERT_TRUE(pointer_equal(params[0], named_params["weight"]));
ASSERT_TRUE(pointer_equal(named_params["weight"], module->weight));
}
{
BatchNorm1d module(BatchNorm1dOptions(5).track_running_stats(false).affine(false));
auto buffers = module->buffers();
ASSERT_EQ(buffers.size(), 0);
auto named_buffers = module->named_buffers();
ASSERT_EQ(named_buffers.size(), 0);
}
{
BatchNorm1d module(BatchNorm1dOptions(5).track_running_stats(true).affine(false));
auto buffers = module->buffers();
ASSERT_EQ(buffers.size(), 3);
auto named_buffers = module->named_buffers();
ASSERT_EQ(named_buffers.size(), 3);
ASSERT_TRUE(pointer_equal(buffers[0], named_buffers["running_mean"]));
ASSERT_TRUE(pointer_equal(named_buffers["running_mean"], module->running_mean));
ASSERT_TRUE(pointer_equal(buffers[1], named_buffers["running_var"]));
ASSERT_TRUE(pointer_equal(named_buffers["running_var"], module->running_var));
ASSERT_TRUE(pointer_equal(buffers[2], named_buffers["num_batches_tracked"]));
ASSERT_TRUE(pointer_equal(named_buffers["num_batches_tracked"], module->num_batches_tracked));
}
}
TEST_F(ModuleTest, Conversion_MultiCUDA) {
Linear module(128, 64);
for (auto& parameter : module->parameters()) {

View File

@ -46,7 +46,7 @@ private:
};
inline bool pointer_equal(at::Tensor first, at::Tensor second) {
return first.data_ptr<float>() == second.data_ptr<float>();
return first.data_ptr() == second.data_ptr();
}
inline int count_substr_occurrences(const std::string& str, const std::string& substr) {

View File

@ -1360,7 +1360,7 @@ class DistAutogradTest(RpcAgentTestFixture):
# receive gradients from the node that received an error (and as a
# result it didn't execute the rest of the graph).
dist.barrier()
rpc.wait_all_workers()
rpc.shutdown()
sys.exit(0)

View File

@ -72,6 +72,11 @@ def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True):
@wraps(old_test_method)
def new_test_method(self, *arg, **kwargs):
# Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
# in tests.
import torch.distributed.rpc.api as api
api._ignore_rref_leak = False
self.worker_id = self.rank
if setup_rpc:
@ -83,7 +88,6 @@ def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True):
rpc.init_rpc(
name="worker%d" % self.rank,
backend=self.rpc_backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
@ -123,7 +127,7 @@ def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True):
# since we need to shutdown the RPC agent. If we don't shutdown the
# RPC agent, tests would fail since RPC agent threads, locks and
# condition variables are not properly terminated.
rpc.wait_all_workers()
rpc.shutdown()
return return_value
@ -134,6 +138,7 @@ def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True):
TEST_CONFIG.rpc_backend_name = "PROCESS_GROUP"
TEST_CONFIG.build_rpc_backend_options = lambda test_object: rpc.backend_registry.construct_rpc_backend_options(
test_object.rpc_backend,
init_method=test_object.init_method,
# Use enough 'num_send_recv_threads' until we fix https://github.com/pytorch/pytorch/issues/26359
num_send_recv_threads=16,
)

View File

@ -6,7 +6,7 @@ import hypothesis
from hypothesis import assume
from hypothesis import strategies as st
from hypothesis.extra import numpy as stnp
from hypothesis.searchstrategy import SearchStrategy
from hypothesis.strategies import SearchStrategy
from common_quantized import _calculate_dynamic_qparams, _calculate_dynamic_per_channel_qparams
@ -304,10 +304,11 @@ def tensor_conv(
return X, W, b, groups
# Disable deadline testing if this version of hypothesis supports it, otherwise
# just return the original function
def no_deadline(fn):
try:
return hypothesis.settings(deadline=None)(fn)
except hypothesis.errors.InvalidArgument:
return fn
from hypothesis import settings
settings.register_profile("no_deadline", deadline=None)
settings.load_profile("no_deadline")
# This is really just to get flake8 to not complain when this file
# is imported purely for the side-effectful stuff above
def assert_deadline_disabled():
assert settings().deadline is None

View File

@ -183,31 +183,6 @@ class TestONNXRuntime(unittest.TestCase):
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,), rtol=1e-3, atol=1e-5)
def test_googlenet_quantization(self):
model = torchvision.models.quantization.googlenet(pretrained=True)
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,), rtol=1e-3, atol=1e-5)
def test_inception_quantization(self):
model = torchvision.models.quantization.inception_v3(pretrained=True)
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,), rtol=1e-3, atol=1e-5)
def test_mobilenet_quantization(self):
model = torchvision.models.quantization.mobilenet_v2(pretrained=True)
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,), rtol=1e-3, atol=1e-5)
def test_resnet_quantization(self):
model = torchvision.models.quantization.resnet50(pretrained=True)
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,))
def test_shufflenet_quantization(self):
model = torchvision.models.quantization.shufflenet_v2_x1_0(pretrained=True)
x = torch.randn(2, 3, 224, 224, requires_grad=True)
self.run_test(model, (x,), rtol=1e-3, atol=1e-5)
def test_r3d_18_video(self):
model = torchvision.models.video.r3d_18(pretrained=True)
x = torch.randn(1, 3, 4, 112, 112, requires_grad=True)
@ -238,6 +213,55 @@ class TestONNXRuntime(unittest.TestCase):
# Only support CPU version, since tracer is not working in GPU RNN.
self.run_test(model, (x, model.hidden))
def get_image_from_url(self, url):
import sys
import os
if sys.version_info < (3,):
from urlparse import urlsplit
import urllib2
request = urllib2
else:
from urllib.parse import urlsplit
from urllib import request
from PIL import Image
from torchvision import transforms
from torch._utils_internal import get_writable_path
filename = os.path.basename(urlsplit(url)[2])
data_dir = get_writable_path(os.path.join(os.path.dirname(__file__)))
path = os.path.join(data_dir, filename)
data = request.urlopen(url, timeout=15).read()
with open(path, 'wb') as f:
f.write(data)
image = Image.open(path).convert("RGB")
image = image.resize((300, 200), Image.BILINEAR)
to_tensor = transforms.ToTensor()
return to_tensor(image)
def get_test_images(self):
image_url = "http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg"
image = self.get_image_from_url(url=image_url)
images = [image]
return images
@skipIfUnsupportedMinOpsetVersion(11)
def test_keypoint_rcnn(self):
class KeyPointRCNN(torch.nn.Module):
def __init__(self):
super(KeyPointRCNN, self).__init__()
self.model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn(pretrained=True,
min_size=200,
max_size=300)
def forward(self, images):
output = self.model(images)
# TODO: The keypoints_scores require the use of Argmax that is updated in ONNX.
# For now we are testing all the output of KeypointRCNN except keypoints_scores.
# Enable When Argmax is updated in ONNX Runtime.
return output[0]['boxes'], output[0]['labels'], output[0]['scores'], output[0]['keypoints']
images = self.get_test_images()
self.run_test(KeyPointRCNN(), (images,), rtol=1e-3, atol=1e-5)
def test_word_language_model_RNN_TANH(self):
self.run_word_language_model("RNN_TANH")

View File

@ -3,6 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import concurrent.futures
from datetime import timedelta
import sys
import time
import unittest
from collections import namedtuple
from unittest import mock
@ -18,6 +19,21 @@ from torch.distributed.rpc.api import _use_rpc_pickler
from torch.distributed.rpc.internal import PythonUDF, _internal_rpc_pickler
from rpc_agent_test_fixture import RpcAgentTestFixture
rpc_done = [False, False, False, False]
# TODO: dedupe this with the code in dist_autograd_test.py.
# Send rpc done info and context_id to
# dst_rank = (self.rank + rank_distance) % self.world_size
# we don't need a lock here since the GIL is held while executing remote
# python UDFs, so access is serialized across several workers.
def _set_rpc_done(rank_distance):
global rpc_done
rpc_done[rank_distance] = True
def _check_rpc_done(rank_distance):
while not rpc_done[rank_distance]:
# yield control to other threads
time.sleep(0)
def requires_process_group_agent(message=""):
def decorator(old_func):
@ -127,7 +143,6 @@ def my_tensor_function(a, b):
return a + b
def my_sleep_func(seconds=1):
import time
time.sleep(seconds)
@ -306,7 +321,6 @@ class RpcTest(RpcAgentTestFixture):
rpc.init_rpc(
name="worker1",
backend=backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
@ -327,14 +341,13 @@ class RpcTest(RpcAgentTestFixture):
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
)
rpc.wait_all_workers()
rpc.shutdown()
@dist_init(setup_rpc=False)
def test_reinit(self):
rpc.init_rpc(
name="worker{}".format(self.rank),
backend=self.rpc_backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
@ -357,13 +370,13 @@ class RpcTest(RpcAgentTestFixture):
rpc.init_rpc(
name="worker{}".format(self.rank),
backend=self.rpc_backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
)
rpc.wait_all_workers()
rpc.shutdown()
@unittest.skip("test_invalid_names is flaky, see https://github.com/pytorch/pytorch/issues/25912")
@dist_init(setup_rpc=False)
def test_invalid_names(self):
with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
@ -430,8 +443,8 @@ class RpcTest(RpcAgentTestFixture):
from torch.distributed.rpc.api import _agent
self.assertEqual(_agent, None)
# wait_all_workers() should not do anything as _agent is None
rpc.wait_all_workers()
# shutdown() should not do anything as _agent is None
rpc.shutdown()
# We need this barrier here because although init_process_group is
# blocking, it does not guarantee that all ranks are done with
# initialization after the call. We did run into issues with it where
@ -508,12 +521,11 @@ class RpcTest(RpcAgentTestFixture):
self.assertEqual(ret, torch.ones(n, n) * 2)
@dist_init(setup_rpc=False)
def test_wait_all_workers(self):
def test_shutdown(self):
# Initialize RPC.
rpc.init_rpc(
name="worker%d" % self.rank,
backend=self.rpc_backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
@ -527,7 +539,7 @@ class RpcTest(RpcAgentTestFixture):
args=(torch.ones(n, n), torch.ones(n, n)),
)
self.assertEqual(ret, torch.ones(n, n) * 2)
rpc.wait_all_workers()
rpc.shutdown()
with self.assertRaisesRegex(RuntimeError, "^RPC has not been initialized"):
rpc.rpc_sync(
@ -536,8 +548,8 @@ class RpcTest(RpcAgentTestFixture):
args=(torch.ones(n, n), torch.ones(n, n)),
)
# it's safe to call wait_all_workers() multiple times
rpc.wait_all_workers()
# it's safe to call shutdown() multiple times
rpc.shutdown()
@dist_init
def test_expected_src(self):
@ -701,8 +713,6 @@ class RpcTest(RpcAgentTestFixture):
self.assertEqual(ret, torch.ones(2, 2) + 1)
def _stress_test_rpc(self, f, repeat=1000, args=()):
import time
n = self.rank + 1
dst_rank = n % self.world_size
futs = []
@ -1090,6 +1100,111 @@ class RpcTest(RpcAgentTestFixture):
self.assertEqual(result, sum(vals))
def _test_rref_leak(self, ignore_leak=False):
rpc.init_rpc(
name="worker{}".format(self.rank),
backend=self.rpc_backend,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
)
# This is for the below `dist.barrier`.
# For `RpcAgent` other than `ProcessGroupAgent`,
# no `_default_pg` is initialized.
if not dist.is_initialized():
dist.init_process_group(
backend="gloo",
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
)
# Wait for all init to complete.
dist.barrier()
rref = rpc.remote(
"worker{}".format((self.rank + 1) % self.world_size),
torch.add,
args=(torch.ones(2, 2), 1)
)
if ignore_leak:
import torch.distributed.rpc.api as api
api._ignore_rref_leak = True
rpc.shutdown()
@dist_init(setup_rpc=False)
def test_rref_leak(self):
with self.assertRaisesRegex(RuntimeError, "Leaking RRef"):
self._test_rref_leak()
@dist_init(setup_rpc=False)
def test_ignore_rref_leak(self):
self._test_rref_leak(ignore_leak=True)
@dist_init(setup_rpc=False)
@requires_process_group_agent("PROCESS_GROUP rpc backend specific test, skip")
def test_local_shutdown(self):
# test that we can start RPC and then immediately locally shutdown
# without sending any messages.
rpc.init_rpc(
name="worker%d" % self.rank,
backend=rpc.backend_registry.BackendType[
dist_utils.TEST_CONFIG.rpc_backend_name
],
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
)
# pass in graceful=False to ensure that we don't wait for other workers.
rpc.shutdown(graceful=False)
@dist_init(setup_rpc=False)
@requires_process_group_agent("PROCESS_GROUP rpc backend specific test, skip")
def test_local_shutdown_with_rpc(self):
# test that we can start RPC, send RPCs, and then run local shutdown.
rpc.init_rpc(
name="worker%d" % self.rank,
backend=rpc.backend_registry.BackendType[
dist_utils.TEST_CONFIG.rpc_backend_name
],
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options,
)
n = self.rank + 1
dst_rank = n % self.world_size
ret = rpc.rpc_sync(
"worker{}".format(dst_rank),
torch.add,
args=(torch.ones(n, n), torch.ones(n, n)),
)
# wait for RPCs to be done, so that some workers don't try to shut down
# too early.
rpc.rpc_sync("worker{}".format(dst_rank), _set_rpc_done, args=(1,))
_check_rpc_done(1)
# pass in graceful=False to ensure that we don't wait for other workers.
rpc.shutdown(graceful=False)
@dist_init(setup_rpc=False)
@requires_process_group_agent("PROCESS_GROUP rpc backend specific test, skip")
def test_wait_all_workers_and_shutdown(self):
# This test ensures that both rpc._wait_all_workers() and rpc.shutdown() can be
# called without errors being raised due to attempting to shut down
# multiple times.
rpc.init_rpc(
name="worker%d" % self.rank,
backend=rpc.backend_registry.BackendType[dist_utils.TEST_CONFIG.rpc_backend_name],
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=self.rpc_backend_options
)
from torch.distributed.rpc.api import _wait_all_workers
# intentional call to internal _wait_all_workers.
_wait_all_workers()
rpc.shutdown()
@dist_init(setup_rpc=False)
def test_get_rpc_timeout(self):
timeout = timedelta(seconds=1)
@ -1102,14 +1217,13 @@ class RpcTest(RpcAgentTestFixture):
rpc.init_rpc(
name="worker{}".format(self.rank),
backend=self.rpc_backend,
init_method=self.init_method,
rank=self.rank,
world_size=self.world_size,
rpc_backend_options=rpc_backend_options,
)
set_timeout = rpc.get_rpc_timeout()
self.assertEqual(timeout, set_timeout)
rpc.wait_all_workers()
rpc.shutdown()
@dist_init
@requires_process_group_agent("PROCESS_GROUP rpc backend specific test, skip")

View File

@ -422,6 +422,9 @@ class WorkerSpecificIterableDataset(IterableDataset):
assert worker_info is not None
return iter(range(self.sizes_for_all_workers[worker_info.id]))
def __len__(self):
return sum(self.sizes_for_all_workers)
# Inspired by https://stackoverflow.com/a/26703365
# If all workers will call `sync_once`, they will be blocked until all workers
@ -961,8 +964,8 @@ class TestDataLoader(TestCase):
# non-batched should not convert ints into tensors
self.assertIsInstance(d, torch._six.int_classes)
self.assertEqual(d, i)
with self.assertRaisesRegex(TypeError, "Cannot determine the DataLoader length of a IterableDataset"):
len(dataloader) # DataLoader with iterable-style dataset should error in __len__
# DataLoader should match len of the iterable-style dataset (if implemented)
self.assertEqual(len(dataloader), len(dataset))
# [no auto-batching] multiprocessing loading
num_workers = 3
@ -978,8 +981,26 @@ class TestDataLoader(TestCase):
# non-batched should not convert ints into tensors
self.assertIsInstance(a, torch._six.int_classes)
self.assertEqual(a, b)
with self.assertRaisesRegex(TypeError, "Cannot determine the DataLoader length of a IterableDataset"):
len(dataloader) # DataLoader with iterable-style dataset should error in __len__
# DataLoader should match len of the iterable-style dataset (if implemented)
self.assertEqual(len(dataloader), len(dataset))
# When loading more than len(dataset) data, after accessing len(dataloader),
# we should get a warning. See NOTE [ IterableDataset and __len__ ].
dataset = CountingIterableDataset(20)
dataloader = DataLoader(dataset, num_workers=num_workers,
worker_init_fn=set_faulthander_if_available)
it = iter(dataloader)
for _ in range(40):
self.assertNotWarn(lambda: next(it), "Should not warn before accessing len(dataloader)")
self.assertEqual(len(dataloader), len(dataset))
self.assertEqual(len(dataloader), 20)
it = iter(dataloader)
for _ in range(20):
self.assertNotWarn(lambda: next(it), "Should not warn before exceeding length")
for _ in range(3):
self.assertWarnsRegex(
lambda: next(it),
r"but [0-9]+ samples have been fetched\. For multiprocessing data-loading, this",
"Should always warn after exceeding length")
# [no auto-batching] test that workers exit gracefully
workers = dataloader_iter._workers
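The behavior exercised above can be summarized with a small standalone sketch (this re-states a hypothetical counting dataset similar to the one used in the test): a DataLoader wrapping an iterable-style dataset now reports the dataset's own __len__ instead of raising.

    from torch.utils.data import DataLoader, IterableDataset

    # iterable-style dataset that also implements __len__
    class CountingDataset(IterableDataset):
        def __init__(self, n):
            self.n = n

        def __iter__(self):
            return iter(range(self.n))

        def __len__(self):
            return self.n

    dataset = CountingDataset(20)
    loader = DataLoader(dataset, num_workers=0)
    assert len(loader) == len(dataset) == 20  # previously len(loader) raised TypeError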

View File

@ -5,7 +5,7 @@ import numpy as np
from hypothesis import given
from hypothesis import strategies as st
import hypothesis_utils as hu
from hypothesis_utils import no_deadline
hu.assert_deadline_disabled()
from common_utils import run_tests, TestCase
from torch.quantization import FakeQuantize
from torch.quantization import default_observer, default_per_channel_weight_observer
@ -64,10 +64,8 @@ NP_RANDOM_SEED = 19
tolerance = 1e-6
class TestFakeQuantizePerTensor(TestCase):
# NOTE: Tests in this class are decorated with no_deadline
# to prevent spurious failures due to cuda runtime initialization.
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@ -85,7 +83,7 @@ class TestFakeQuantizePerTensor(TestCase):
X, scale, zero_point, quant_min, quant_max)
np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@ -108,7 +106,8 @@ class TestFakeQuantizePerTensor(TestCase):
Y_prime.backward(dout)
np.testing.assert_allclose(dX.cpu(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance)
@no_deadline
# https://github.com/pytorch/pytorch/issues/30604
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@ -127,7 +126,7 @@ class TestFakeQuantizePerTensor(TestCase):
X, scale, zero_point, quant_min, quant_max)
np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=[torch.quint8])),
@ -206,10 +205,8 @@ class TestFakeQuantizePerTensor(TestCase):
class TestFakeQuantizePerChannel(TestCase):
# NOTE: Tests in this class are decorated with no_deadline
# to prevent spurious failures due to cuda runtime initialization.
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@ -229,7 +226,7 @@ class TestFakeQuantizePerChannel(TestCase):
X, scale, zero_point, axis, quant_min, quant_max)
np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@ -253,11 +250,10 @@ class TestFakeQuantizePerChannel(TestCase):
Y_prime.backward(dout)
np.testing.assert_allclose(dX.cpu().detach().numpy(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance)
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,),
qparams=hu.qparams(dtypes=torch.quint8)))
@unittest.skip("temporarily disable the test")
def test_numerical_consistency_per_channel(self, device, X):
r"""Comparing numerical consistency between CPU quantize/dequantize op and the CPU fake quantize op
"""
@ -275,7 +271,7 @@ class TestFakeQuantizePerChannel(TestCase):
X, scale, zero_point, axis, quant_min, quant_max)
np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)
@no_deadline
@unittest.skip("temporarily disable the test")
@given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),
X=hu.per_channel_tensor(shapes=hu.array_shapes(2, 5,),
qparams=hu.qparams(dtypes=torch.qint8)))

View File

@ -3564,6 +3564,38 @@ graph(%Ra, %Rb):
self.assertTrue(type(block.paramNode()) == torch._C.Node)
self.assertTrue(tested_blocks)
def test_export_opnames(self):
class Foo(torch.jit.ScriptModule):
def __init__(self):
super(Foo, self).__init__()
def one(self, x, y):
# type: (Tensor, Tensor) -> Tensor
return x + y
def two(self, x):
# type: (Tensor) -> Tensor
return 2 * x
@torch.jit.script_method
def forward(self, x):
# type: (Tensor) -> Tensor
return self.one(self.two(x), x)
class Bar(torch.jit.ScriptModule):
def __init__(self):
super(Bar, self).__init__()
self.sub = Foo()
def forward(self, x):
# type: (Tensor) -> Tensor
return self.sub.forward(x)
bar = Bar()
ops = torch.jit.export_opnames(bar)
expected = ['aten::add.Tensor', 'aten::mul.Scalar', 'prim::Constant']
self.assertEqual(ops, expected)
def test_pytorch_jit_env_off(self):
import subprocess
env = os.environ.copy()
@ -7037,6 +7069,15 @@ a")
self.checkScript(func1, (), optimize=True)
self.checkScript(func2, (), optimize=True)
# FIXME: get rid of this once we have actual ops using optional floats
def test_optional_float(self):
def _test_optional_float(x, scale):
# type: (Tensor, Optional[float]) -> torch.Tensor
return torch._test_optional_float(x, scale=scale)
self.assertEqual([0], torch.jit.script(_test_optional_float)(torch.randn(()), None).shape)
self.assertEqual((), torch.jit.script(_test_optional_float)(torch.randn(()), 2.5).shape)
def _test_tensor_number_math(self, device='cpu'):
template = dedent('''
def func(t):

View File

@ -1038,6 +1038,11 @@ class TestNamedTensor(TestCase):
self.assertEqual(out.names, ('N', 'C', 'H', 'W', 'K'))
self.assertEqual(out.shape, (7, 2, 3, 5, 11))
# takes negative positional dim
out = tensor.unflatten(-2, (('C', 2), ('H', 3), ('W', 5)))
self.assertEqual(out.names, ('N', 'C', 'H', 'W', 'K'))
self.assertEqual(out.shape, (7, 2, 3, 5, 11))
with self.assertRaisesRegex(RuntimeError, "don't multiply up to"):
tensor.unflatten('D', (('H', 3), ('W', 5)))

View File

@ -6232,6 +6232,38 @@ class TestNN(NNTestCase):
inp = torch.randn(4, 5, device='cuda', requires_grad=True)
gradgradcheck(F.pdist, (inp,))
def test_cosine_embedding_loss_with_diff_type(self):
for device in device_():
input1 = torch.tensor([[2, 3, 4], [6, 2, 4]], dtype=torch.double, device=device)
input2 = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
target = torch.tensor([1, -1], dtype=torch.int, device=device)
expected = torch.nn.functional.cosine_embedding_loss(input1, input2, target)
for dt1 in torch.testing.get_all_math_dtypes(device):
for dt2 in torch.testing.get_all_math_dtypes(device):
for dt3 in torch.testing.get_all_math_dtypes(device):
# dt3 is used as dtype for target = [1, -1], so let's skip unsigned type
if dt3 == torch.uint8:
continue
input1 = input1.to(dt1)
input2 = input2.to(dt2)
target = target.to(dt3)
result = torch.nn.functional.cosine_embedding_loss(input1, input2, target)
self.assertEqual(result.item(), expected.item(), 0.001)
def test_kl_div_with_diff_type(self):
for device in device_():
input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device)
expected = torch.nn.functional.kl_div(input, target)
for input_dtype in torch.testing.get_all_math_dtypes(device):
for target_dtype in [torch.float32, torch.float64, torch.float16]:
if (torch.device(device).type == 'cpu' and target_dtype == torch.float16):
continue
input = input.to(input_dtype)
target = target.to(target_dtype)
result = torch.nn.functional.kl_div(input, target)
self.assertEqual(result.item(), expected.item(), 0.001)
def test_cosine_embedding_loss_no_reduce(self):
input1 = torch.randn(15, 10, requires_grad=True)
input2 = torch.randn(15, 10, requires_grad=True)

View File

@ -309,6 +309,30 @@ class TestNumbaIntegration(common.TestCase):
torch_ary += 42
self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary) + 42)
@unittest.skipIf(not TEST_NUMPY, "No numpy")
@unittest.skipIf(not TEST_CUDA, "No cuda")
@unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda")
def test_from_cuda_array_interface_inferred_strides(self):
"""torch.as_tensor(numba_ary) should have correct inferred (contiguous) strides"""
# This could, in theory, be combined with test_from_cuda_array_interface but that test
# is overly strict: it checks that the exported protocols are exactly the same, which
# cannot handle differing exported protocol versions.
dtypes = [
numpy.float64,
numpy.float32,
numpy.int64,
numpy.int32,
numpy.int16,
numpy.int8,
numpy.uint8,
]
for dtype in dtypes:
numpy_ary = numpy.arange(6).reshape(2, 3).astype(dtype)
numba_ary = numba.cuda.to_device(numpy_ary)
self.assertTrue(numba_ary.is_c_contiguous())
torch_ary = torch.as_tensor(numba_ary, device="cuda")
self.assertTrue(torch_ary.is_contiguous())
@unittest.skipIf(not TEST_NUMPY, "No numpy")
@unittest.skipIf(not TEST_CUDA, "No cuda")
@unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda")

View File

@ -11,15 +11,13 @@ import torch.backends.mkldnn
from common_utils import TestCase, run_tests
from hypothesis import given
from hypothesis import strategies as st
from hypothesis_utils import no_deadline
import hypothesis_utils as hu
hu.assert_deadline_disabled()
from functools import reduce
class IntrinsicQATModuleTest(TestCase):
# NOTE: Tests in this class are decorated with no_deadline
# to prevent spurious failures due to cuda runtime initialization.
@no_deadline
@given(batch_size=st.integers(2, 4),
input_channels_per_group=st.sampled_from([2, 3, 4]),
height=st.integers(5, 10),

View File

@ -42,7 +42,8 @@ from jit_utils import get_forward
from hypothesis import given
from hypothesis import strategies as st
from hypothesis_utils import no_deadline
import hypothesis_utils as hu
hu.assert_deadline_disabled()
import io
import copy
@ -50,7 +51,6 @@ import copy
" Quantized operations require FBGEMM. FBGEMM is only optimized for CPUs"
" with instruction set support avx2 or newer.")
class EagerModePostTrainingQuantTest(QuantizationTestCase):
@no_deadline
@given(qconfig=st.sampled_from((torch.quantization.default_qconfig, torch.quantization.default_per_channel_qconfig)))
def test_single_layer(self, qconfig):
r"""Quantize SingleLayerLinearModel which has one Linear module, make sure it is swapped
@ -919,7 +919,6 @@ class GraphModePostTrainingQuantTest(QuantizationTestCase):
class FunctionalModuleTest(QuantizationTestCase):
# Histogram Observers are slow, so have no-deadline to ensure test doesn't time out
@no_deadline
@given(train_mode=st.booleans())
def test_functional_module(self, train_mode):
model = ModelWithFunctionals()
@ -1349,7 +1348,6 @@ class RecordHistogramObserverTest(QuantizationTestCase):
self.assertEqual(len(observer_dict['fc1.module.activation_post_process'].get_tensor_value()), 2 * len(self.calib_data))
self.assertEqual(observer_dict['fc1.module.activation_post_process'].get_tensor_value()[0], model(self.calib_data[0][0]))
@no_deadline
@given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)))
def test_observer_scriptable(self, qdtype, qscheme):
@ -1366,7 +1364,6 @@ class RecordHistogramObserverTest(QuantizationTestCase):
loaded = torch.jit.load(buf)
self.assertTrue(torch.equal(obs.get_tensor_value()[0], loaded.get_tensor_value()[0]))
@no_deadline
@given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)),
reduce_range=st.booleans())

View File

@ -10,7 +10,7 @@ from hypothesis import settings, HealthCheck
from hypothesis import assume, given
from hypothesis import strategies as st
import hypothesis_utils as hu
from hypothesis_utils import no_deadline
hu.assert_deadline_disabled()
from common_utils import TEST_WITH_UBSAN, TestCase, run_tests, IS_PPC, IS_MACOS
from common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \
@ -145,7 +145,6 @@ class TestQuantizedOps(TestCase):
message="{} relu failed".format(name))
"""Tests the correctness of the scalar addition."""
@no_deadline
@given(A=hu.tensor(shapes=hu.array_shapes(1, 4, 1, 5),
elements=st.floats(-1e6, 1e6, allow_nan=False),
qparams=hu.qparams()),
@ -506,7 +505,6 @@ class TestQuantizedOps(TestCase):
self.assertEqual(a_ref, a_hat.dequantize(),
message="ops.quantized.max_pool2d results are off")
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4,
min_side=5, max_side=10),
qparams=hu.qparams(dtypes=torch.quint8)),
@ -556,7 +554,6 @@ class TestQuantizedOps(TestCase):
message=error_message.format(name + '.zero_point', scale,
qX_hat.q_zero_point()))
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=5, max_side=10),
qparams=hu.qparams(dtypes=torch.qint8)),
@ -619,7 +616,6 @@ class TestQuantizedOps(TestCase):
message=error_message.format(name + '.zero_point', scale,
X_hat.q_zero_point()))
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=1, max_side=10),
qparams=hu.qparams(dtypes=torch.quint8)),
@ -662,7 +658,6 @@ class TestQuantizedOps(TestCase):
qX_hat.q_zero_point()))
"""Tests adaptive average pool operation on NHWC quantized tensors."""
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=1, max_side=10),
qparams=hu.qparams(dtypes=torch.qint8)),
@ -708,7 +703,6 @@ class TestQuantizedOps(TestCase):
message=error_message.format(name + '.zero_point', scale,
X_hat.q_zero_point()))
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4,
min_side=1, max_side=10),
qparams=hu.qparams()),
@ -733,7 +727,6 @@ class TestQuantizedOps(TestCase):
torch.testing.assert_allclose(quantized_out[0].dequantize(), unquantized_out[0])
torch.testing.assert_allclose(quantized_out[1], unquantized_out[1])
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=1, max_side=10),
qparams=hu.qparams()),
@ -818,7 +811,6 @@ class TestQuantizedOps(TestCase):
cat_q = q_cat_op(tensors_q, dim=ch_axis, scale=scale,
zero_point=zero_point)
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=5, max_side=10),
qparams=hu.qparams()),
@ -874,7 +866,6 @@ class TestQuantizedOps(TestCase):
qX_hat.q_zero_point()))
"""Tests quantize concatenation (both fused and not)."""
@no_deadline
@given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
min_side=1, max_side=10),
qparams=hu.qparams()),
@ -999,7 +990,6 @@ class TestQuantizedOps(TestCase):
" with instruction set support avx2 or newer.")
class TestDynamicQuantizedLinear(TestCase):
"""Tests the correctness of the dynamic quantized linear and linear_relu op."""
@no_deadline
@given(
batch_size=st.integers(1, 4),
input_channels=st.integers(16, 32),
@ -1112,7 +1102,6 @@ class TestDynamicQuantizedLinear(TestCase):
message="torch.ops.quantized.linear_dynamic (fbgemm) results are off")
"""Tests the correctness of the legacy dynamic quantized linear op."""
@no_deadline
@given(
batch_size=st.integers(1, 4),
input_channels=st.integers(16, 32),
@ -1189,7 +1178,6 @@ class TestDynamicQuantizedLinear(TestCase):
class TestQuantizedLinear(unittest.TestCase):
"""Tests the correctness of the quantized linear and linear_relu op."""
@no_deadline
@given(batch_size=st.integers(1, 4),
input_channels=st.integers(16, 32),
output_channels=st.integers(4, 8),

View File

@ -13,7 +13,8 @@ from common_quantized import _calculate_dynamic_qparams, override_quantized_engi
from common_utils import run_tests, IS_PPC, TEST_WITH_UBSAN
from hypothesis import assume, given
from hypothesis import strategies as st
from hypothesis_utils import no_deadline
import hypothesis_utils as hu
hu.assert_deadline_disabled()
import io
import numpy as np
@ -127,7 +128,6 @@ class FunctionalAPITest(QuantizationTestCase):
@no_deadline
@given(batch_size=st.integers(1, 3),
in_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]),
H=st.integers(4, 16),
@ -181,7 +181,6 @@ class FunctionalAPITest(QuantizationTestCase):
W_scale, W_zero_point, Y_scale, Y_zero_point, use_bias,
use_channelwise)
@no_deadline
@given(batch_size=st.integers(1, 3),
in_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]),
D=st.integers(4, 8),
@ -239,7 +238,6 @@ class FunctionalAPITest(QuantizationTestCase):
class DynamicModuleAPITest(QuantizationTestCase):
@no_deadline
@unittest.skipUnless('fbgemm' in torch.backends.quantized.supported_engines,
" Quantized operations require FBGEMM. FBGEMM is only optimized for CPUs"
" with instruction set support avx2 or newer.")
@ -357,7 +355,6 @@ class ModuleAPITest(QuantizationTestCase):
message="ReLU6 module API failed")
@no_deadline
@given(
batch_size=st.integers(1, 5),
in_features=st.integers(16, 32),
@ -421,7 +418,6 @@ class ModuleAPITest(QuantizationTestCase):
self.assertEqual(Z_ref, Z_q)
# Test serialization of quantized Linear Module using state_dict
model_dict = qlinear.state_dict()
self.assertEqual(model_dict['_packed_params.weight'], W_q)
if use_bias:
@ -647,7 +643,6 @@ class ModuleAPITest(QuantizationTestCase):
# Smoke test extra_repr
self.assertTrue(module_name in str(converted_qconv_module))
@no_deadline
@given(batch_size=st.integers(1, 3),
in_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]),
H=st.integers(4, 16),

View File

@ -763,6 +763,45 @@ class _TestTorchMixin(object):
res = torch.where(a > 0)
self.assertEqual(1, len(res))
def test_where_tensor(self):
def rand_tensor(size, dtype, device):
if dtype.is_floating_point:
return torch.rand(size=size, dtype=dtype, device=device)
elif dtype == torch.uint8:
return torch.randint(1, 5, size=size, dtype=dtype, device=device)
elif dtype == torch.bool:
return torch.randint(0, 1, size=size, dtype=dtype, device=device).bool()
else:
return torch.randint(-5, 5, size=size, dtype=dtype, device=device)
def get_tensor(size, dtype, device, contiguous):
if not contiguous and len(size) < 2:
raise RuntimeError("Unable to generate non contiguous tensor with size < 2")
t = rand_tensor(size, dtype, device)
if contiguous:
return t
else:
return t.transpose(0, 1)
height = 5
width = 5
for device in torch.testing.get_all_device_types():
for dt1 in torch.testing.get_all_math_dtypes(device):
for dt2 in torch.testing.get_all_math_dtypes(device):
for contiguous in [True, False]:
x1 = get_tensor((height, width), dt1, device, contiguous)
x2 = get_tensor((height, width), dt2, device, contiguous)
if dt1 != dt2:
self.assertRaisesRegex(RuntimeError, "expected scalar type", lambda: torch.where(x1 == 1, x1, x2))
else:
if x1.is_floating_point():
condition = (x1 < 0.5)
else:
condition = (x1 == 1)
expected = condition.to(x1.dtype) * x1 + (~condition).to(x2.dtype) * x2
result = torch.where(condition, x1, x2)
self.assertEqual(expected, result)
def test_all_any_with_dim(self):
def test(x):
r1 = x.prod(dim=0, keepdim=False).byte()
@ -1772,6 +1811,13 @@ class _TestTorchMixin(object):
x = torch.tensor(2., requires_grad=True)
self.assertRaises(Exception, lambda: y.addcmul(y, y, value=x))
# FIXME: get rid of this once we have actual ops using optional floats
def test_optional_floats(self):
x = torch.randn(())
self.assertEqual(torch._test_optional_float(x), torch.empty((0,)))
self.assertEqual(torch._test_optional_float(x, scale=None), torch.empty((0,)))
self.assertEqual(torch._test_optional_float(x, scale=2.5), torch.full((), 2.5))
def test_copy_broadcast(self):
torch.zeros(5, 6).copy_(torch.zeros(6))
self.assertRaises(RuntimeError, lambda: torch.zeros(5, 6).copy_(torch.zeros(30)))
@ -13661,23 +13707,69 @@ class TestTorchDeviceType(TestCase):
result = torch.cat(concat_list)
self.assertEqual(result.size(0), SIZE1 + SIZE2)
# NOTE [Linspace+Logspace precision override]
# Our linspace and logspace torch.half CUDA kernels are not very precise.
# Since linspace/logspace are deterministic, we can compute an expected
# amount of error (by testing without a precision override), add a tiny
# amount (EPS) to that, and use that value as the override.
LINSPACE_LOGSPACE_EXTRA_EPS = 1e-5
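A rough sketch of how such an override value could be measured empirically: compare a CUDA half linspace against a double-precision CPU reference and take the worst absolute error. The variable names and printing are illustrative; this is not part of the test suite.

    import torch

    # Compare torch.half linspace on CUDA against a float64 CPU reference.
    steps = 10
    ref = torch.linspace(0, 10, steps=steps, dtype=torch.double)
    half = torch.linspace(0, 10, steps=steps, dtype=torch.half, device="cuda")
    max_err = (half.double().cpu() - ref).abs().max().item()
    # The override would then be roughly max_err + LINSPACE_LOGSPACE_EXTRA_EPS.
    print(max_err)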
# Tests that compare a device's computation with the (gold-standard) CPU's.
class TestDevicePrecision(TestCase):
def test_linspace(self, device):
a = torch.linspace(0, 10, 10, device=device)
b = torch.linspace(0, 10, 10)
# The implementation of linspace+logspace goes through a different path
# when the steps arg is equal to 0 or 1. For other values of `steps`
# they call specialized linspace (or logspace) kernels.
LINSPACE_LOGSPACE_SPECIAL_STEPS = [0, 1]
def _test_linspace(self, device, dtype, steps):
a = torch.linspace(0, 10, steps=steps, dtype=dtype, device=device)
b = torch.linspace(0, 10, steps=steps)
self.assertEqual(a, b)
@dtypes(torch.double)
# See NOTE [Linspace+Logspace precision override]
@precisionOverride({torch.half: 0.0039 + LINSPACE_LOGSPACE_EXTRA_EPS})
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float, torch.double)
def test_linspace(self, device, dtype):
self._test_linspace(device, dtype, steps=10)
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float, torch.double)
def test_linspace_special_steps(self, device, dtype):
for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS:
self._test_linspace(device, dtype, steps=steps)
def _test_logspace(self, device, dtype, steps):
a = torch.logspace(1, 1.1, steps=steps, dtype=dtype, device=device)
b = torch.logspace(1, 1.1, steps=steps)
self.assertEqual(a, b)
def _test_logspace_base2(self, device, dtype, steps):
a = torch.logspace(1, 1.1, steps=steps, base=2, dtype=dtype, device=device)
b = torch.logspace(1, 1.1, steps=steps, base=2)
self.assertEqual(a, b)
# See NOTE [Linspace+Logspace precision override]
@precisionOverride({torch.half: 0.0157 + LINSPACE_LOGSPACE_EXTRA_EPS})
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float, torch.double)
def test_logspace(self, device, dtype):
a = torch.logspace(1, 10, 10, dtype=dtype, device=device)
b = torch.logspace(1, 10, 10, dtype=dtype, device='cpu')
self.assertEqual(a, b)
self._test_logspace(device, dtype, steps=10)
# Check non-default base=2
a = torch.logspace(1, 10, 10, 2, dtype=dtype, device=device)
b = torch.logspace(1, 10, 10, 2, dtype=dtype, device='cpu')
self.assertEqual(a, b)
# See NOTE [Linspace+Logspace precision override]
@precisionOverride({torch.half: 0.00201 + LINSPACE_LOGSPACE_EXTRA_EPS})
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float, torch.double)
def test_logspace_base2(self, device, dtype):
self._test_logspace_base2(device, dtype, steps=10)
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float, torch.double)
def test_logspace_special_steps(self, device, dtype):
for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS:
self._test_logspace(device, dtype, steps=steps)
self._test_logspace_base2(device, dtype, steps=steps)
# Note: ROCm fails when using float tensors
@dtypes(torch.double)

View File

@ -328,6 +328,7 @@ def create_python_bindings(python_functions, has_self, is_module=False):
'c10::optional<Scalar>': 'scalarOptional',
'c10::optional<int64_t>': 'toInt64Optional',
'c10::optional<bool>': 'toBoolOptional',
'c10::optional<double>': 'toDoubleOptional',
'IntArrayRef': 'intlist',
'int64_t': 'toInt64',
'bool': 'toBool',

View File

@ -63,6 +63,7 @@ TYPE_MAP = {
'int64_t': 'int',
'int64_t?': 'int?',
'double': 'float',
'double?': 'float?',
'bool': 'bool',
'bool?': 'bool?',
'Generator': 'Generator?',
@ -115,6 +116,7 @@ FROM_IVALUE = {
'bool': '{}.toBool()',
'bool?': '{}.toOptional<bool>()',
'double': '{}.toDouble()',
'double?': '{}.toOptional<double>()',
'int64_t': '{}.toInt()',
'int64_t?': '{}.toOptional<int64_t>()',
'std::string': '{}.toStringRef()',

View File

@ -3751,25 +3751,37 @@ add_docstr(torch.nonzero,
r"""
nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors
**When** :attr:`as_tuple` **is false or unspecified:**
.. note::
:func:`torch.nonzero(..., as_tuple=False) <torch.nonzero>` (default) returns a
2-D tensor where each row is the index for a nonzero value.
:func:`torch.nonzero(..., as_tuple=True) <torch.nonzero>` returns a tuple of 1-D
index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]``
gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor
contains nonzero indices for a certain dimension.
See below for more details on the two behaviors.
**When** :attr:`as_tuple` **is ``False`` (default)**:
Returns a tensor containing the indices of all non-zero elements of
:attr:`input`. Each row in the result contains the indices of a non-zero
element in :attr:`input`. The result is sorted lexicographically, with
the last index changing the fastest (C-style).
If :attr:`input` has `n` dimensions, then the resulting indices tensor
If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor
:attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of
non-zero elements in the :attr:`input` tensor.
**When** :attr:`as_tuple` **is true:**
**When** :attr:`as_tuple` **is ``True``**:
Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`,
each containing the indices (in that dimension) of all non-zero elements of
:attr:`input` .
If :attr:`input` has `n` dimensions, then the resulting tuple contains `n` tensors
of size `z`, where `z` is the total number of
If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n`
tensors of size :math:`z`, where :math:`z` is the total number of
non-zero elements in the :attr:`input` tensor.
As a special case, when :attr:`input` has zero dimensions and a nonzero scalar
@ -3780,8 +3792,8 @@ Args:
out (LongTensor, optional): the output tensor containing indices
Returns:
LongTensor or tuple of LongTensor: If :attr:`as_tuple` is false, the output
tensor containing indices. If :attr:`as_tuple` is true, one 1-D tensor for
LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output
tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for
each dimension, containing the indices of each nonzero element along that
dimension.
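A brief usage sketch of the two modes documented above (the tensor values are chosen only for illustration):

    import torch

    x = torch.tensor([[0.6, 0.0, 0.0],
                      [0.0, 0.4, 0.0],
                      [0.0, 0.0, 1.2]])

    # as_tuple=False (default): a 2-D tensor, each row is the index of a nonzero value.
    print(torch.nonzero(x))
    # tensor([[0, 0],
    #         [1, 1],
    #         [2, 2]])

    # as_tuple=True: one 1-D index tensor per dimension, usable for advanced indexing.
    rows, cols = torch.nonzero(x, as_tuple=True)
    print(x[rows, cols])          # equivalently x[x.nonzero(as_tuple=True)]
    # tensor([0.6000, 0.4000, 1.2000])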
@ -5199,6 +5211,9 @@ i.e., if the last two dimensions of :attr:`input` are ``m`` and ``n``, then the
If :attr:`compute_uv` is ``False``, the returned `U` and `V` matrices will be zero matrices
of shape :math:`(m \times m)` and :math:`(n \times n)` respectively. :attr:`some` will be ignored here.
.. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices,
then the singular values of each matrix in the batch are returned in descending order.
.. note:: The implementation of SVD on CPU uses the LAPACK routine `?gesdd` (a divide-and-conquer
algorithm) instead of `?gesvd` for speed. Analogously, the SVD on GPU uses the MAGMA routine
`gesdd` as well.
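For instance, the descending ordering holds for every matrix in a batch; a small sketch (the batch shape is arbitrary):

    import torch

    a = torch.randn(3, 5, 4)             # a batch of three 5x4 matrices
    u, s, v = torch.svd(a)
    # Singular values of every matrix in the batch come back sorted in descending order.
    assert all(torch.all(s[i][:-1] >= s[i][1:]) for i in range(s.size(0)))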
@ -5279,6 +5294,9 @@ only the upper triangular portion is used by default.
If :attr:`upper` is ``False``, then lower triangular portion is used.
.. note:: The eigenvalues are returned in ascending order. If :attr:`input` is a batch of matrices,
then the eigenvalues of each matrix in the batch are returned in ascending order.
.. note:: Irrespective of the original strides, the returned matrix `V` will
be transposed, i.e. with strides `V.contiguous().transpose(-1, -2).stride()`.
@ -5782,7 +5800,7 @@ The upper triangular part of the matrix is defined as the elements on and
above the diagonal.
The argument :attr:`diagonal` controls which diagonal to consider. If
:attr:`diagonal` = 0, all elements on and below the main diagonal are
:attr:`diagonal` = 0, all elements on and above the main diagonal are
retained. A positive value excludes just as many diagonals above the main
diagonal, and similarly a negative value includes just as many diagonals below
the main diagonal. The main diagonal is the set of indices
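A quick sketch of how the :attr:`diagonal` argument shifts which elements are retained (values are illustrative):

    import torch

    x = torch.arange(1., 10.).reshape(3, 3)

    print(torch.triu(x))               # keeps elements on and above the main diagonal
    # tensor([[1., 2., 3.],
    #         [0., 5., 6.],
    #         [0., 0., 9.]])

    print(torch.triu(x, diagonal=1))   # a positive value also drops the main diagonal
    # tensor([[0., 2., 3.],
    #         [0., 0., 6.],
    #         [0., 0., 0.]])

    print(torch.triu(x, diagonal=-1))  # a negative value keeps one extra sub-diagonal
    # tensor([[1., 2., 3.],
    #         [4., 5., 6.],
    #         [0., 8., 9.]])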

View File

@ -22,6 +22,8 @@ namespace datasets {
template <typename ExampleType_, typename ChunkType_ = std::vector<ExampleType_>>
class ChunkDataReader {
public:
virtual ~ChunkDataReader() = default;
using ChunkType = ChunkType_;
using ExampleType = ExampleType_;

View File

@ -47,7 +47,7 @@ class Cloneable : public virtual Module {
"parameters as the original module after calling reset(). "
"Are you sure you called register_parameter() inside reset() "
"and not the constructor?");
for (const auto& parameter : parameters_) {
for (const auto& parameter : named_parameters(/*recurse=*/false)) {
auto& tensor = *parameter;
auto data = device && tensor.device() != *device ?
tensor.to(*device) : autograd::Variable(tensor).clone();
@ -59,7 +59,7 @@ class Cloneable : public virtual Module {
"buffers as the original module after calling reset(). "
"Are you sure you called register_buffer() inside reset() "
"and not the constructor?");
for (const auto& buffer : buffers_) {
for (const auto& buffer : named_buffers(/*recurse=*/false)) {
auto& tensor = *buffer;
auto data = device && tensor.device() != *device ?
tensor.to(*device) : autograd::Variable(tensor).clone();

View File

@ -648,11 +648,11 @@ void Module::to_impl(Ts&&... ts) {
child.value()->to(ts...);
}
// Then move every parameter to the new dtype/device.
for (auto& parameter : parameters_) {
for (auto& parameter : named_parameters(/*recurse=*/false)) {
parameter->set_data(autograd::Variable(*parameter).to(ts...));
}
// Then move every buffer to the new dtype/device.
for (auto& buffer : buffers_) {
for (auto& buffer : named_buffers(/*recurse=*/false)) {
buffer->set_data(autograd::Variable(*buffer).to(ts...));
}
}

View File

@ -9,8 +9,6 @@
#include <cstdint>
namespace F = torch::nn::functional;
namespace torch {
namespace nn {
@ -178,7 +176,7 @@ class BatchNormImplBase : public NormImplBase<D, Derived, BatchNormOptions> {
}
}
return F::detail::batch_norm(
return torch::nn::functional::detail::batch_norm(
input,
this->running_mean,
this->running_var,

View File

@ -17,9 +17,9 @@ namespace nn {
/// Base class for all (dimension-specialized) convolution modules.
template <size_t D, typename Derived>
class ConvImpl : public torch::nn::Cloneable<Derived> {
class ConvNdImpl : public torch::nn::Cloneable<Derived> {
public:
explicit ConvImpl(ConvOptions<D> options_) : options(std::move(options_)) {
explicit ConvNdImpl(detail::ConvNdOptions<D> options_) : options(std::move(options_)) {
reset();
}
@ -98,7 +98,7 @@ class ConvImpl : public torch::nn::Cloneable<Derived> {
}
/// The options with which this `Module` was constructed.
ConvOptions<D> options;
detail::ConvNdOptions<D> options;
/// The learned kernel (or "weight").
Tensor weight;
@ -112,15 +112,15 @@ class ConvImpl : public torch::nn::Cloneable<Derived> {
/// Applies convolution over a 1-D input.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv1d to learn about
/// the exact behavior of this module.
class TORCH_API Conv1dImpl : public ConvImpl<1, Conv1dImpl> {
class TORCH_API Conv1dImpl : public ConvNdImpl<1, Conv1dImpl> {
public:
Conv1dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<1> kernel_size)
: Conv1dImpl(ConvOptions<1>(input_channels, output_channels, kernel_size)) {
: Conv1dImpl(Conv1dOptions(input_channels, output_channels, kernel_size)) {
}
explicit Conv1dImpl(ConvOptions<1> options_);
explicit Conv1dImpl(Conv1dOptions options_);
Tensor forward(const Tensor& input);
};
@ -135,15 +135,15 @@ TORCH_MODULE(Conv1d);
/// Applies convolution over a 2-D input.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv2d to learn about
/// the exact behavior of this module.
class TORCH_API Conv2dImpl : public ConvImpl<2, Conv2dImpl> {
class TORCH_API Conv2dImpl : public ConvNdImpl<2, Conv2dImpl> {
public:
Conv2dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<2> kernel_size)
: Conv2dImpl(ConvOptions<2>(input_channels, output_channels, kernel_size)) {
: Conv2dImpl(Conv2dOptions(input_channels, output_channels, kernel_size)) {
}
explicit Conv2dImpl(ConvOptions<2> options_);
explicit Conv2dImpl(Conv2dOptions options_);
Tensor forward(const Tensor& input);
};
@ -158,15 +158,15 @@ TORCH_MODULE(Conv2d);
/// Applies convolution over a 3-D input.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv3d to learn about
/// the exact behavior of this module.
class TORCH_API Conv3dImpl : public ConvImpl<3, Conv3dImpl> {
class TORCH_API Conv3dImpl : public ConvNdImpl<3, Conv3dImpl> {
public:
Conv3dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<3> kernel_size)
: Conv3dImpl(ConvOptions<3>(input_channels, output_channels, kernel_size)) {
: Conv3dImpl(Conv3dOptions(input_channels, output_channels, kernel_size)) {
}
explicit Conv3dImpl(ConvOptions<3> options_);
explicit Conv3dImpl(Conv3dOptions options_);
Tensor forward(const Tensor& input);
};
@ -180,9 +180,9 @@ TORCH_MODULE(Conv3d);
/// Base class for all (dimension-specialized) convolution transpose modules.
template <size_t D, typename Derived>
class ConvTransposeImpl : public ConvImpl<D, Derived> {
class ConvTransposeNdImpl : public ConvNdImpl<D, Derived> {
public:
using torch::nn::ConvImpl<D, Derived>::ConvImpl;
using torch::nn::ConvNdImpl<D, Derived>::ConvNdImpl;
/// Pretty prints the `ConvTranspose{1,2,3}d` module into the given `stream`.
void pretty_print(std::ostream& stream) const override {
@ -224,15 +224,15 @@ class ConvTransposeImpl : public ConvImpl<D, Derived> {
/// Applies the ConvTranspose1d function.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.ConvTranspose1d to
/// learn about the exact behavior of this module.
class TORCH_API ConvTranspose1dImpl : public ConvTransposeImpl<1, ConvTranspose1dImpl> {
class TORCH_API ConvTranspose1dImpl : public ConvTransposeNdImpl<1, ConvTranspose1dImpl> {
public:
ConvTranspose1dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<1> kernel_size)
: ConvTranspose1dImpl(ConvTransposeOptions<1>(input_channels, output_channels, kernel_size)) {
: ConvTranspose1dImpl(ConvTranspose1dOptions(input_channels, output_channels, kernel_size)) {
}
explicit ConvTranspose1dImpl(ConvTransposeOptions<1> options_);
explicit ConvTranspose1dImpl(ConvTranspose1dOptions options_);
Tensor forward(const Tensor& input,
const c10::optional<at::IntArrayRef>& output_size = c10::nullopt);
};
@ -244,15 +244,15 @@ TORCH_MODULE(ConvTranspose1d);
/// Applies the ConvTranspose2d function.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.ConvTranspose2d to
/// learn about the exact behavior of this module.
class TORCH_API ConvTranspose2dImpl : public ConvTransposeImpl<2, ConvTranspose2dImpl> {
class TORCH_API ConvTranspose2dImpl : public ConvTransposeNdImpl<2, ConvTranspose2dImpl> {
public:
ConvTranspose2dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<2> kernel_size)
: ConvTranspose2dImpl(ConvTransposeOptions<2>(input_channels, output_channels, kernel_size)) {
: ConvTranspose2dImpl(ConvTranspose2dOptions(input_channels, output_channels, kernel_size)) {
}
explicit ConvTranspose2dImpl(ConvTransposeOptions<2> options_);
explicit ConvTranspose2dImpl(ConvTranspose2dOptions options_);
Tensor forward(const Tensor& input,
const c10::optional<at::IntArrayRef>& output_size = c10::nullopt);
};
@ -264,15 +264,15 @@ TORCH_MODULE(ConvTranspose2d);
/// Applies the ConvTranspose3d function.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.ConvTranspose3d to
/// learn about the exact behavior of this module.
class TORCH_API ConvTranspose3dImpl : public ConvTransposeImpl<3, ConvTranspose3dImpl> {
class TORCH_API ConvTranspose3dImpl : public ConvTransposeNdImpl<3, ConvTranspose3dImpl> {
public:
ConvTranspose3dImpl(
int64_t input_channels,
int64_t output_channels,
ExpandingArray<3> kernel_size)
: ConvTranspose3dImpl(ConvTransposeOptions<3>(input_channels, output_channels, kernel_size)) {
: ConvTranspose3dImpl(ConvTranspose3dOptions(input_channels, output_channels, kernel_size)) {
}
explicit ConvTranspose3dImpl(ConvTransposeOptions<3> options_);
explicit ConvTranspose3dImpl(ConvTranspose3dOptions options_);
Tensor forward(const Tensor& input,
const c10::optional<at::IntArrayRef>& output_size = c10::nullopt);
};

View File

@ -14,7 +14,7 @@ class InstanceNormImpl : public torch::nn::NormImplBase<D, Derived, InstanceNorm
Tensor forward(const Tensor& input) {
this->_check_input_dim(input);
return F::detail::instance_norm(
return torch::nn::functional::detail::instance_norm(
input, this->running_mean, this->running_var, this->weight, this->bias,
this->is_training() || !this->options.track_running_stats(), this->options.momentum(), this->options.eps());
}

View File

@ -9,12 +9,14 @@
namespace torch {
namespace nn {
/// Options for a `D`-dimensional convolution module.
template <size_t D>
struct ConvOptions {
typedef c10::variant<enumtype::kZeros, enumtype::kCircular> padding_mode_t;
namespace detail {
ConvOptions(
typedef c10::variant<enumtype::kZeros, enumtype::kCircular> conv_padding_mode_t;
/// Options for a `D`-dimensional convolution or convolution transpose module.
template <size_t D>
struct ConvNdOptions {
ConvNdOptions(
int64_t in_channels,
int64_t out_channels,
ExpandingArray<D> kernel_size) :
@ -73,6 +75,67 @@ struct ConvOptions {
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(bool, bias) = true;
/// Accepted values are `zeros` and `circular`. Default: `zeros`.
TORCH_ARG(conv_padding_mode_t, padding_mode) = torch::kZeros;
};
} // namespace detail
// ============================================================================
/// Options for a `D`-dimensional convolution module.
template <size_t D>
struct ConvOptions {
using padding_mode_t = detail::conv_padding_mode_t;
ConvOptions(
int64_t in_channels,
int64_t out_channels,
ExpandingArray<D> kernel_size) :
in_channels_(in_channels),
out_channels_(out_channels),
kernel_size_(std::move(kernel_size)) {}
/// The number of channels the input volumes will have.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(int64_t, in_channels);
/// The number of output channels the convolution should produce.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(int64_t, out_channels);
/// The kernel size to use.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, kernel_size);
/// The stride of the convolution.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, stride) = 1;
/// The padding to add to the input volumes.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, padding) = 0;
/// The kernel dilation.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, dilation) = 1;
/// The number of convolution groups.
/// This parameter __can__ be changed after construction.
TORCH_ARG(int64_t, groups) = 1;
/// Whether to add a bias after individual applications of the kernel.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(bool, bias) = true;
/// Accepted values are `zeros` and `circular`. Default: `zeros`.
TORCH_ARG(padding_mode_t, padding_mode) = torch::kZeros;
};
@ -129,8 +192,67 @@ using Conv3dFuncOptions = ConvFuncOptions<3>;
// ============================================================================
template<size_t D>
using ConvTransposeOptions = ConvOptions<D>;
template <size_t D>
struct ConvTransposeOptions {
using padding_mode_t = detail::conv_padding_mode_t;
ConvTransposeOptions(
int64_t in_channels,
int64_t out_channels,
ExpandingArray<D> kernel_size) :
in_channels_(in_channels),
out_channels_(out_channels),
kernel_size_(std::move(kernel_size)) {}
/// The number of channels the input volumes will have.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(int64_t, in_channels);
/// The number of output channels the convolution should produce.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(int64_t, out_channels);
/// The kernel size to use.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, kernel_size);
/// The stride of the convolution.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, stride) = 1;
/// The padding to add to the input volumes.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, padding) = 0;
/// For transpose convolutions, the padding to add to output volumes.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, output_padding) = 0;
/// The number of convolution groups.
/// This parameter __can__ be changed after construction.
TORCH_ARG(int64_t, groups) = 1;
/// Whether to add a bias after individual applications of the kernel.
/// Changing this parameter after construction __has no effect__.
TORCH_ARG(bool, bias) = true;
/// The kernel dilation.
/// For a `D`-dim convolution, must be a single number or a list of `D`
/// numbers.
/// This parameter __can__ be changed after construction.
TORCH_ARG(ExpandingArray<D>, dilation) = 1;
/// Accepted values are `zeros` and `circular`. Default: `zeros`.
TORCH_ARG(padding_mode_t, padding_mode) = torch::kZeros;
};
/// `ConvTransposeOptions` specialized for 1-D convolution.
using ConvTranspose1dOptions = ConvTransposeOptions<1>;
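The options above mirror the keyword arguments of the corresponding Python modules; a minimal Python-side sketch of the same knobs (the concrete channel counts and sizes are illustrative assumptions):

    import torch
    import torch.nn as nn

    # Python analogue of ConvTranspose2dOptions(16, 8, 3).stride(2).padding(1).output_padding(1)
    deconv = nn.ConvTranspose2d(in_channels=16, out_channels=8, kernel_size=3,
                                stride=2, padding=1, output_padding=1, bias=True)
    x = torch.randn(1, 16, 10, 10)
    print(deconv(x).shape)   # torch.Size([1, 8, 20, 20])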

View File

@ -100,7 +100,7 @@ void replicate_grad_edges(
const std::vector<std::shared_ptr<ModuleType>>& replicas,
const std::vector<Device>& devices) {
for (auto& parameter : module->parameters_) {
for (auto& parameter : module->named_parameters(/*recurse=*/false)) {
auto grad_fn = std::make_shared<ReduceAdd>((*parameter).device());
grad_fn->set_next_edges(autograd::collect_next_edges(*parameter));
@ -109,7 +109,7 @@ void replicate_grad_edges(
}
}
for (auto& buffer : module->buffers_) {
for (auto& buffer : module->named_buffers(/*recurse=*/false)) {
if (buffer.value().requires_grad()){
auto grad_fn = std::make_shared<ReduceAdd>((*buffer).device());
grad_fn->set_next_edges(autograd::collect_next_edges(*buffer));

View File

@ -32,15 +32,6 @@ std::string join_name(const std::string& name_prefix, const std::string& name) {
full_name += name;
return full_name;
}
void extend(
std::vector<Tensor>& vector,
const OrderedDict<std::string, Tensor>& dict) {
vector.reserve(vector.size() + dict.size());
for (const auto& item : dict) {
vector.push_back(item.value());
}
}
} // namespace
Module::Module()
@ -141,46 +132,48 @@ void Module::apply(
}
std::vector<Tensor> Module::parameters(bool recurse) const {
if (!recurse) {
return parameters_.values();
}
std::vector<Tensor> result;
apply(
[&result](const Module& module) { extend(result, module.parameters_); });
return result;
return named_parameters(recurse).values();
}
OrderedDict<std::string, Tensor> Module::named_parameters(bool recurse) const {
if (!recurse) {
return parameters_;
}
OrderedDict<std::string, Tensor> result;
apply([&result](const std::string& name, const Module& module) {
for (const auto& parameter : module.parameters_) {
result.insert(join_name(name, parameter.key()), parameter.value());
if (!recurse) {
for (const auto& parameter : parameters_) {
if (parameter.value().defined()) {
result.insert(parameter.key(), parameter.value());
}
}
});
} else {
apply([&result](const std::string& name, const Module& module) {
for (const auto& parameter : module.named_parameters(/*recurse=*/false)) {
TORCH_INTERNAL_ASSERT(parameter.value().defined());
result.insert(join_name(name, parameter.key()), parameter.value());
}
});
}
return result;
}
std::vector<Tensor> Module::buffers(bool recurse) const {
if (!recurse) {
return buffers_.values();
}
std::vector<Tensor> result;
apply([&result](const Module& module) { extend(result, module.buffers_); });
return result;
return named_buffers(recurse).values();
}
OrderedDict<std::string, Tensor> Module::named_buffers(bool recurse) const {
if (!recurse) {
return buffers_;
}
OrderedDict<std::string, Tensor> result;
apply([&result](const std::string& name, const Module& module) {
for (const auto& buffer : module.buffers_) {
result.insert(join_name(name, buffer.key()), buffer.value());
if (!recurse) {
for (const auto& buffer : buffers_) {
if (buffer.value().defined()) {
result.insert(buffer.key(), buffer.value());
}
}
});
} else {
apply([&result](const std::string& name, const Module& module) {
for (const auto& buffer : module.named_buffers(/*recurse=*/false)) {
TORCH_INTERNAL_ASSERT(buffer.value().defined());
result.insert(join_name(name, buffer.key()), buffer.value());
}
});
}
return result;
}
@ -261,7 +254,7 @@ void Module::zero_grad() {
for (auto& child : children_) {
child.value()->zero_grad();
}
for (auto& parameter : parameters_) {
for (auto& parameter : named_parameters(/*recurse=*/false)) {
auto& grad = parameter->grad();
if (grad.defined()) {
grad = grad.detach();
@ -271,10 +264,10 @@ void Module::zero_grad() {
}
void Module::save(serialize::OutputArchive& archive) const {
for (const auto& parameter : parameters_) {
for (const auto& parameter : named_parameters(/*recurse=*/false)) {
archive.write(parameter.key(), parameter.value());
}
for (const auto& buffer : buffers_) {
for (const auto& buffer : named_buffers(/*recurse=*/false)) {
archive.write(buffer.key(), buffer.value(), /*is_buffer=*/true);
}
for (const auto& child : children_) {
@ -287,10 +280,10 @@ void Module::save(serialize::OutputArchive& archive) const {
}
void Module::load(serialize::InputArchive& archive) {
for (auto& parameter : parameters_) {
for (auto& parameter : named_parameters(/*recurse=*/false)) {
archive.read(parameter.key(), parameter.value());
}
for (auto& buffer : buffers_) {
for (auto& buffer : named_buffers(/*recurse=*/false)) {
archive.read(buffer.key(), buffer.value(), /*is_buffer=*/true);
}
for (const auto& child : children_) {

View File

@ -19,8 +19,20 @@ namespace F = torch::nn::functional;
namespace torch {
namespace nn {
Conv1dImpl::Conv1dImpl(
ConvOptions<1> options_)
: ConvImpl(options_.transposed(false).output_padding(0)) {}
Conv1dOptions options_)
: ConvNdImpl(
detail::ConvNdOptions<1>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(false)
.output_padding(0)
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor Conv1dImpl::forward(const Tensor& input) {
if (c10::get_if<enumtype::kCircular>(&options.padding_mode())) {
@ -44,8 +56,20 @@ Tensor Conv1dImpl::forward(const Tensor& input) {
}
Conv2dImpl::Conv2dImpl(
ConvOptions<2> options_)
: ConvImpl(options_.transposed(false).output_padding(0)) {}
Conv2dOptions options_)
: ConvNdImpl(
detail::ConvNdOptions<2>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(false)
.output_padding(0)
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor Conv2dImpl::forward(const Tensor& input) {
if (c10::get_if<enumtype::kCircular>(&options.padding_mode())) {
@ -71,8 +95,20 @@ Tensor Conv2dImpl::forward(const Tensor& input) {
}
Conv3dImpl::Conv3dImpl(
ConvOptions<3> options_)
: ConvImpl(options_.transposed(false).output_padding(0)) {}
Conv3dOptions options_)
: ConvNdImpl(
detail::ConvNdOptions<3>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(false)
.output_padding(0)
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor Conv3dImpl::forward(const Tensor& input) {
if (c10::get_if<enumtype::kCircular>(&options.padding_mode())) {
@ -98,14 +134,14 @@ Tensor Conv3dImpl::forward(const Tensor& input) {
options.groups());
}
template class ConvImpl<1, Conv1dImpl>;
template class ConvImpl<2, Conv2dImpl>;
template class ConvImpl<3, Conv3dImpl>;
template class ConvNdImpl<1, Conv1dImpl>;
template class ConvNdImpl<2, Conv2dImpl>;
template class ConvNdImpl<3, Conv3dImpl>;
// ============================================================================
template <size_t D, typename Derived>
std::vector<int64_t> ConvTransposeImpl<D, Derived>::_output_padding(
std::vector<int64_t> ConvTransposeNdImpl<D, Derived>::_output_padding(
const Tensor& input, const c10::optional<at::IntArrayRef>& output_size,
const ExpandingArray<D>& stride, const ExpandingArray<D>& padding,
const ExpandingArray<D>& kernel_size) {
@ -151,7 +187,20 @@ std::vector<int64_t> ConvTransposeImpl<D, Derived>::_output_padding(
}
ConvTranspose1dImpl::ConvTranspose1dImpl(
ConvTransposeOptions<1> options_) : ConvTransposeImpl(options_.transposed(true)) {}
ConvTranspose1dOptions options_)
: ConvTransposeNdImpl(
detail::ConvNdOptions<1>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(true)
.output_padding(options_.output_padding())
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor ConvTranspose1dImpl::forward(
const Tensor& input, const c10::optional<at::IntArrayRef>& output_size) {
@ -168,7 +217,19 @@ Tensor ConvTranspose1dImpl::forward(
}
ConvTranspose2dImpl::ConvTranspose2dImpl(
ConvTransposeOptions<2> options_) : ConvTransposeImpl(options_.transposed(true)) {}
ConvTranspose2dOptions options_)
: ConvTransposeNdImpl(detail::ConvNdOptions<2>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(true)
.output_padding(options_.output_padding())
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor ConvTranspose2dImpl::forward(
const Tensor& input, const c10::optional<at::IntArrayRef>& output_size) {
@ -185,7 +246,19 @@ Tensor ConvTranspose2dImpl::forward(
}
ConvTranspose3dImpl::ConvTranspose3dImpl(
ConvTransposeOptions<3> options_) : ConvTransposeImpl(options_.transposed(true)) {}
ConvTranspose3dOptions options_)
: ConvTransposeNdImpl(detail::ConvNdOptions<3>(
/*in_channels=*/options_.in_channels(),
/*out_channels=*/options_.out_channels(),
/*kernel_size=*/options_.kernel_size())
.stride(options_.stride())
.padding(options_.padding())
.dilation(options_.dilation())
.transposed(true)
.output_padding(options_.output_padding())
.groups(options_.groups())
.bias(options_.bias())
.padding_mode(options_.padding_mode())) {}
Tensor ConvTranspose3dImpl::forward(
const Tensor& input, const c10::optional<at::IntArrayRef>& output_size) {
@ -201,9 +274,9 @@ Tensor ConvTranspose3dImpl::forward(
output_padding, options.groups(), options.dilation());
}
template class ConvTransposeImpl<1, ConvTranspose1dImpl>;
template class ConvTransposeImpl<2, ConvTranspose2dImpl>;
template class ConvTransposeImpl<3, ConvTranspose3dImpl>;
template class ConvTransposeNdImpl<1, ConvTranspose1dImpl>;
template class ConvTransposeNdImpl<2, ConvTranspose2dImpl>;
template class ConvTransposeNdImpl<3, ConvTranspose3dImpl>;
} // namespace nn
} // namespace torch

View File

@ -36,7 +36,8 @@ PyObject* rpc_init(PyObject* /* unused */) {
auto rpcBackendOptions =
shared_ptr_class_<RpcBackendOptions>(module, "RpcBackendOptions")
.def_readwrite("rpc_timeout", &RpcBackendOptions::rpcTimeout);
.def_readwrite("rpc_timeout", &RpcBackendOptions::rpcTimeout)
.def_readwrite("init_method", &RpcBackendOptions::initMethod);
auto workerInfo =
shared_ptr_class_<WorkerInfo>(
@ -111,9 +112,9 @@ Otherwise, throws an exception.
return PyRRef::unpickle(t);
}));
// future.wait() should not be called after wait_all_workers(), e.g.,
// pythonRpcHandler is cleaned up in wait_all_workers(), after
// wait_all_workers(), python objects returned from rpc python call can not be
// future.wait() should not be called after shutdown(), e.g.,
// pythonRpcHandler is cleaned up in shutdown(), after
// shutdown(), python objects returned from rpc python call can not be
// resolved.
auto futureMessage =
shared_ptr_class_<FutureMessage>(module, "FutureMessage")
@ -154,6 +155,10 @@ Otherwise, throws an exception.
"join",
&ProcessGroupAgent::join,
py::call_guard<py::gil_scoped_release>())
.def(
"shutdown",
&ProcessGroupAgent::shutdown,
py::call_guard<py::gil_scoped_release>())
.def(
"sync",
&ProcessGroupAgent::sync,
@ -164,8 +169,8 @@ Otherwise, throws an exception.
agent->start();
});
module.def("_destroy_rref_context", []() {
RRefContext::getInstance().destroyInstance();
module.def("_destroy_rref_context", [](bool ignoreRRefLeak) {
RRefContext::getInstance().destroyInstance(ignoreRRefLeak);
});
module.def("_cleanup_python_rpc_handler", []() {

View File

@ -127,7 +127,6 @@ ProcessGroupAgent::ProcessGroupAgent(
WorkerInfo(std::move(workerName), pg->getRank()),
c10::guts::make_unique<RequestCallbackImpl>(),
rpcTimeout),
shutdown_{false},
pg_(std::move(pg)),
sendCounts_(pg_->getSize()),
recvCounts_(pg_->getSize()),
@ -180,30 +179,12 @@ const WorkerInfo& ProcessGroupAgent::getWorkerInfo(worker_id_t id) const {
}
void ProcessGroupAgent::join() {
// Every process i sends a SHUTDOWN message to process i + 1. This is
// necessary for now because:
// 1. There is no abort API for ProcessGroup::recvAnysource yet. We have to
// feed it a message or kill the thread.
// 2. A GLOO process cannot send message to itself. (there is an ongoing
// effort to fix this problem).
shutdown_.store(true);
sync();
// This is needed in case no futures were created, otherwise the future
// timeout watchdog would sleep forever.
futureTimeoutCV_.notify_one();
std::unique_lock<std::mutex> lock(futureMutex_);
futureCV_.wait(
lock, [this] { return futures_.empty() && futureTimeouts_.empty(); });
lock.unlock();
pg_->barrier()->wait();
int dst = (pg_->getRank() + 1) % pg_->getSize();
enqueueSend(
SendWork(allWorkerInfo_[dst], Message({}, {}, MessageType::SHUTDOWN)));
threadPool_.waitWorkComplete();
listenerThread_.join();
futureTimeoutThread_.join();
PythonRpcHandler::getInstance().cleanup();
}
bool ProcessGroupAgent::hasPendingMessage() {
@ -269,14 +250,38 @@ void ProcessGroupAgent::sync() {
}
void ProcessGroupAgent::start() {
{
std::lock_guard<std::mutex> futureLock{futureMutex_};
rpcRunning_.store(true);
}
listenerThread_ = std::thread(&ProcessGroupAgent::listenLoop, this);
futureTimeoutThread_ =
std::thread(&ProcessGroupAgent::pollTimedOutRPCs, this);
}
void ProcessGroupAgent::shutdown() {
LOG(INFO) << "Shutting down ProcessGroupAgent.";
std::unique_lock<std::mutex> lock{futureMutex_};
if (!rpcRunning_.exchange(false)) {
return;
}
lock.unlock();
futureTimeoutCV_.notify_one();
futureTimeoutThread_.join();
{
std::unique_lock<std::mutex> lock(recvWorkMutex_);
if (recvWork_) {
recvWork_->abort();
}
}
threadPool_.waitWorkComplete();
listenerThread_.join();
}
std::shared_ptr<FutureMessage> ProcessGroupAgent::send(
const WorkerInfo& to,
Message&& message) {
TORCH_CHECK(rpcRunning_.load(), "ProcessGroupAgent hasn't started.")
TORCH_CHECK(
to.id_ < (worker_id_t)pg_->getSize(),
"Destination rank is out of bound, got ",
@ -456,10 +461,19 @@ void ProcessGroupAgent::enqueueRecv(RecvWork work) {
}
void ProcessGroupAgent::listenLoop() {
while (true) {
while (rpcRunning_.load()) {
// rank, tensor size, message type
std::vector<torch::Tensor> preamble = {torch::empty({3}, {torch::kInt64})};
pg_->recvAnysource(preamble, pg_->getRank())->wait();
auto work = pg_->recvAnysource(preamble, pg_->getRank());
{
std::lock_guard<std::mutex> guard(recvWorkMutex_);
recvWork_ = work;
}
if (!rpcRunning_.load() || !work->wait() /* aborted */) {
return;
}
int64_t* preamble_items = preamble.front().storage().data<int64_t>();
auto srcRank = preamble_items[0];
@ -483,9 +497,12 @@ void ProcessGroupAgent::listenLoop() {
}
void ProcessGroupAgent::pollTimedOutRPCs() {
while (!shutdown_.load()) {
std::chrono::milliseconds sleepTime;
while (true) {
std::unique_lock<std::mutex> lock{futureMutex_};
if (!rpcRunning_.load()) {
return;
}
std::chrono::milliseconds sleepTime;
// Estimate amount of time the first future will time out in, and sleep
// for that long.
// if there are no futures or the first future's RPC timeout is set to 0
@ -505,7 +522,7 @@ void ProcessGroupAgent::pollTimedOutRPCs() {
futureTimeoutCV_.wait_for(lock, sleepTime);
}
if (shutdown_.load()) {
if (!rpcRunning_.load()) {
return;
}

View File

@ -14,7 +14,7 @@ namespace distributed {
namespace rpc {
struct ProcessGroupRpcBackendOptions : public RpcBackendOptions {
ProcessGroupRpcBackendOptions() noexcept = default;
ProcessGroupRpcBackendOptions() = default;
int numSendRecvThreads;
};
@ -57,6 +57,8 @@ class ProcessGroupAgent : public RpcAgent {
void start() override;
void shutdown() override;
protected:
// This method wraps the destination information and the message into a
// SendWork object, and puts the SendWork into a queue. Another thread will
@ -143,10 +145,6 @@ class ProcessGroupAgent : public RpcAgent {
return ++nextId_;
}
// atomic bool indicating if join() has been called and background threads
// should shutdown.
std::atomic_bool shutdown_;
std::shared_ptr<c10d::ProcessGroup> pg_;
// worker name -> rank
std::unordered_map<std::string, int> nameMap_;
@ -159,12 +157,23 @@ class ProcessGroupAgent : public RpcAgent {
MessageCounter recvCounts_;
std::atomic<int64_t> nextId_;
// atomic bool indicating if this agent is running. It is set in
// ProcessGroupAgent::start and unset in ProcessGroupAgent::shutdown and
// ProcessGroupAgent::join. It controls whether several background threads
// should be running.
// We lock access to this in shutdown() and pollTimedOutRPCs() to prevent race
// conditions when notifying condition variables.
std::atomic<bool> rpcRunning_{false};
// one mutex per ProcessGroup rank, as ProcessGroup::send is not thread-safe
// when using the same tag.
std::vector<std::mutex> sendMutexes_;
std::thread listenerThread_;
// A thread to poll existing futures and check for timed out ones.
std::thread futureTimeoutThread_;
// Lock and shared ptr to currently pending work, set in listenLoop() and
// interruptible in shutdown().
std::mutex recvWorkMutex_;
std::shared_ptr<c10d::ProcessGroup::Work> recvWork_;
// A threadPool that processes both SendWork and RecvWork. There are two
// motivations for adding a ThreadPool:
// (1) RPC serialization/deserialization and processing can be expensive,

View File

@ -13,8 +13,9 @@ namespace distributed {
namespace rpc {
struct RpcBackendOptions {
RpcBackendOptions() noexcept = default;
RpcBackendOptions() = default;
std::chrono::milliseconds rpcTimeout;
std::string initMethod;
};
// A globally unique ID to identify an RpcAgent
@ -124,7 +125,11 @@ class TORCH_API RpcAgent {
virtual void sync() = 0;
// start accepting requests
virtual void start() {}
virtual void start() = 0;
// Stop accepting requests and shutdown the RPC framework as soon as possible
// by terminating all RPC threads.
virtual void shutdown() = 0;
// Set the default rpc agent.
static void setDefaultRpcAgent(std::shared_ptr<RpcAgent> defaultRpcAgent);

View File

@ -136,15 +136,16 @@ UserRRef<T>::UserRRef(
template <typename T>
UserRRef<T>::~UserRRef() {
// TODO: queue this in RRefContext instead of doing it here.
auto& ctx = RRefContext::getInstance();
if (ctx.getWorkerId() != ownerId_) {
auto fm = ctx.agent()->send(
ctx.agent()->getWorkerInfo(ownerId_),
RRefUserDelete(rrefId_, forkId_).toMessage());
fm->addCallback(
[](const Message& message) { RRefContext::handleException(message); });
try {
RRefContext::getInstance().delUser(ownerId_, rrefId_, forkId_);
} catch (const std::exception& ex) {
LOG(ERROR) << "Error occurred when deleting UserRRef instance, "
<< "RRefId = " << rrefId_ << ", ForkId = " << forkId_ << " : "
<< ex.what();
} catch (...) {
LOG(ERROR) << "Error occurred when deleting UserRRef instance, "
<< "RRefId = " << rrefId_ << ", ForkId = " << forkId_ << " : "
<< "unknown error";
}
}

View File

@ -13,8 +13,13 @@ RRefContext& RRefContext::getInstance() {
return *context;
}
void RRefContext::destroyInstance() {
RRefContext::getInstance().checkRRefLeaks();
void RRefContext::destroyInstance(bool ignoreRRefLeak) {
auto& ctx = RRefContext::getInstance();
{
std::lock_guard<std::mutex> lock(ctx.destroyedMutex_);
ctx.destroyed_ = true;
}
ctx.checkRRefLeaks(ignoreRRefLeak);
}
void RRefContext::handleException(const Message& message) {
@ -27,7 +32,7 @@ void RRefContext::handleException(const Message& message) {
}
RRefContext::RRefContext(std::shared_ptr<RpcAgent> agent)
: agent_(std::move(agent)) {}
: agent_(std::move(agent)), destroyed_(false) {}
RRefContext::~RRefContext() {
if (!owners_.empty()) {
@ -36,7 +41,7 @@ RRefContext::~RRefContext() {
}
}
void RRefContext::checkRRefLeaks() {
void RRefContext::checkRRefLeaks(bool ignoreRRefLeak) {
if (!forks_.empty()) {
std::stringstream ss;
for (auto& entry : forks_) {
@ -46,7 +51,21 @@ void RRefContext::checkRRefLeaks() {
<< std::endl;
}
}
AT_ERROR(ss.str());
if (ignoreRRefLeak) {
LOG(WARNING)
<< "Detected RRef Leaks during shutdown. This usually "
<< "occurs when the application code still holds references to RRef "
<< "instances when calling shutdown(). If the program has "
<< "completed correctly and the process is exiting, it is OK to "
<< "ignore these leaks. However, if you program will keep running "
<< "after this, these leaks could result in memory leaks on RRef "
<< "owners. Please make sure all RRefs are out of scope and Python "
<< "GC has deleted them before calling shutdown(): \n"
<< ss.str();
} else {
AT_ERROR(ss.str());
}
}
}
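On the Python side, the warning above boils down to waiting on outstanding futures and dropping user-held RRefs before calling shutdown(). A hedged sketch of that ordering; the worker names, the two-process setup, and the usual MASTER_ADDR/MASTER_PORT environment configuration are assumptions here.

    import torch
    import torch.distributed.rpc as rpc

    rpc.init_rpc("worker0", rank=0, world_size=2)

    fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), 1))
    rref = rpc.remote("worker1", torch.add, args=(torch.ones(2), 2))

    fut.wait()             # wait on futures before shutdown, not after
    print(rref.to_here())
    del rref               # let Python GC delete user RRefs so no leak is reported
    rpc.shutdown()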
@ -96,6 +115,21 @@ template std::shared_ptr<UserRRef<py::object>> RRefContext::createUserRRef<
const RRefId& rrefId,
const ForkId& forkId);
void RRefContext::delUser(
const worker_id_t owner,
const RRefId& rrefId,
const ForkId& forkId) {
std::lock_guard<std::mutex> lock(destroyedMutex_);
if (!destroyed_) {
auto fm = agent_->send(
agent_->getWorkerInfo(owner),
RRefUserDelete(rrefId, forkId).toMessage());
fm->addCallback(
[](const Message& message) { RRefContext::handleException(message); });
}
}
template <typename T>
std::shared_ptr<RRef> RRefContext::getOrCreateRRef(const RRefForkData& rfd) {
auto& ownerId = rfd.ownerId_;

View File

@ -16,7 +16,7 @@ namespace rpc {
class RRefContext {
public:
static RRefContext& getInstance();
static void destroyInstance();
static void destroyInstance(bool ignoreRRefLeak = true);
static void handleException(const Message& message);
@ -111,6 +111,11 @@ class RRefContext {
void addPendingUser(const ForkId& forkId, const std::shared_ptr<RRef>& rref);
void delPendingUser(const ForkId& forkId);
void delUser(
const worker_id_t owner,
const RRefId& rrefId,
const ForkId& forkId);
private:
RRefContext(std::shared_ptr<RpcAgent>);
@ -123,7 +128,7 @@ class RRefContext {
void finishForkRequest(const ForkId& forkId, worker_id_t parent);
// If there is any leak on any RRef, this method will throw an error.
void checkRRefLeaks();
void checkRRefLeaks(bool ignoreRRefLeak);
static std::atomic<local_id_t> nextLocalId_;
@ -157,6 +162,9 @@ class RRefContext {
// owner learns about the forked child.
std::unordered_map<ForkId, std::shared_ptr<RRef>, ForkId::Hash>
pendingChildren_;
std::mutex destroyedMutex_;
bool destroyed_;
};
} // namespace rpc

View File

@ -751,5 +751,33 @@ std::tuple<std::string, RawDataExportMap> export_onnx(
graph_encoder.get_raw_data_export_map());
}
namespace {
void export_opnames(const script::Module& m, std::set<std::string>& opnames) {
for (const auto& method : m.get_methods()) {
const auto& func = method.function();
for (const auto& node : func.graph()->nodes()) {
auto op = findOperatorFor(node);
if (op) {
auto opname = node->schema().operator_name();
std::string namestr = opname.name;
if (!opname.overload_name.empty()) {
namestr += "." + opname.overload_name;
}
opnames.emplace(namestr);
}
}
}
for (const auto& sub_m : m.children()) {
export_opnames(sub_m, opnames);
}
}
} // namespace
std::vector<std::string> export_opnames(const script::Module& m) {
std::set<std::string> names;
export_opnames(m, names);
return std::vector<std::string>(names.begin(), names.end());
}
} // namespace jit
} // namespace torch

View File

@ -65,5 +65,8 @@ using ExportModuleExtraFilesHook =
std::function<script::ExtraFilesMap(const script::Module&)>;
TORCH_API void SetExportModuleExtraFilesHook(ExportModuleExtraFilesHook hook);
// Returns a list of names of all operators in the module and its submodules.
TORCH_API std::vector<std::string> export_opnames(const script::Module& m);
} // namespace jit
} // namespace torch
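A usage sketch for the new export_opnames API, assuming the function is exposed to Python as torch.jit.export_opnames (the Python binding name is an assumption and is not shown in this diff):

    import torch

    class M(torch.nn.Module):
        def forward(self, x):
            return torch.relu(x) + 1

    scripted = torch.jit.script(M())
    # List the operators used by the module and its submodules,
    # e.g. for building a selective operator registry for mobile builds.
    print(torch.jit.export_opnames(scripted))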

Some files were not shown because too many files have changed in this diff Show More