Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-23 06:34:55 +08:00)

Compare commits: 95 commits, comparing cslpull83 ... lts/releas
Commits in this comparison (author, date, and message columns did not survive extraction):
ee3bf2d540, d6324a6cb8, 0ccb597b7b, 6f066db384, d1003adc21, d8a16550da, f979cc3e73, 133673e8d2,
4a1a8b285e, 922e369e66, de08992db2, 4d12cbe96e, c03217661e, f89c4e16f1, d8bdf37df2, 96211d662b,
cbd2e80f36, b53d4488f0, 9111c86737, 85f5ded3be, fcb8767b6b, 522d74cbaa, 76ca1e3212, aded5e16a6,
ebc758a502, 67fe7bf275, 1ba2a7acbd, 4dc163c9e6, cbdca40f3b, 0905ddaba6, 4d34b5b413, e0495a7aa1,
8f69e4993b, 99f5ffee1a, 56b43f4fec, 6c394614f0, 7c3c293ea7, 9d43171746, f3c950e04e, b6f49807db,
d84e05be49, c6139b7915, 30baaef738, 264d0ecf83, 51233ea4b0, 31a1a00ae8, bb98a99638, 295c7cf1de,
3233861ec4, 47f4b3f7d4, e450f1498f, 6fd01f9440, b33e434d55, a3e4bf60bb, e991cdaf58, 4596a8ec8a,
512f289884, c439f85b16, 30712fca7e, debf62d95c, e30dc8d21b, 4e590c9ced, 6e9f2c8df0, 37c1f4a7fe,
49b74a52a4, 11c78e9cb3, d6943ea58d, 02b61b49ea, d553478c98, 63333e2a25, 8e7eebfc9a, f8afb8bdd0,
0851cc42b0, 804f7b6018, 32758d30b3, bcb64a8084, f07991d396, c458cd4852, f7c4afc0f4, 20554c00b6,
3464d64f08, c6972eb3ac, 25562d3d41, cd63c37bc6, c79decdbba, c307a3f336, f071020756, 4f436f8570,
ae11589710, 9e5bcc1020, fa8578241d, 1368809532, 4073248fc2, 75153cb730, 5bb69b080c
@@ -52,6 +52,14 @@ CONFIG_TREE_DATA = OrderedDict(
             "3.7",
         ],
     )),
+    macos_arm64=([None], OrderedDict(
+        wheel=[
+            "3.8",
+        ],
+        conda=[
+            "3.8",
+        ],
+    )),
     # Skip CUDA-9.2 builds on Windows
     windows=(
         [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
@@ -42,7 +42,21 @@ class Conf(object):
             "rocm:" + self.gpu_version.strip("rocm") if self.gpu_version.startswith("rocm") else self.gpu_version)
         docker_distro_suffix = alt_docker_suffix if self.pydistro != "conda" else (
             "cuda" if alt_docker_suffix.startswith("cuda") else "rocm")
-        return miniutils.quote("pytorch/" + docker_distro_prefix + "-" + docker_distro_suffix)
+
+        docker_digest_map = {
+            "binary_linux_manywheel_3_6m_cu111_devtoolset7_nightly_build" :
+                "@sha256:3a9c1537a6ae97a36ea29c6bad6e9bbd3dbd18e4f34fbce30f176bcbb10c12d8",
+            "binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_build" :
+                "@sha256:2277f8c324c3928cc0baa574591a32cbec1f32979d40f8c15b41584171819169",
+            "binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_build" :
+                "@sha256:0e8df61551e084c9fe26ac1c9c009136ea4376c62ea55884e1c96bb74415353f",
+            "binary_linux_manywheel_3_6m_cpu_devtoolset7_nightly_build" :
+                "@sha256:2277f8c324c3928cc0baa574591a32cbec1f32979d40f8c15b41584171819169"
+        }
+        build_name = self.gen_build_name("build", nightly=True)
+        docker_digest = docker_digest_map.get(build_name, "")
+
+        return miniutils.quote("pytorch/" + docker_distro_prefix + "-" + docker_distro_suffix + docker_digest)

     def get_name_prefix(self):
         return "smoke" if self.smoke else "binary"

@@ -100,7 +114,7 @@ class Conf(object):
             if self.os == "windows":
                 job_def["executor"] = "windows-with-nvidia-gpu"
             else:
-                job_def["resource_class"] = "gpu.medium"
+                job_def["resource_class"] = "gpu.nvidia.small"

         os_name = miniutils.override(self.os, {"macos": "mac"})
         job_name = "_".join([self.get_name_prefix(), os_name, phase])

@@ -164,7 +178,7 @@ def gen_build_env_list(smoke):
            c.find_prop("gpu"),
            c.find_prop("package_format"),
            [c.find_prop("pyver")],
-           c.find_prop("smoke"),
+           c.find_prop("smoke") and not (c.find_prop("os_name") == "macos_arm64"),  # don't test arm64
            c.find_prop("libtorch_variant"),
            c.find_prop("gcc_config_variant"),
            c.find_prop("libtorch_config_variant"),

@@ -216,7 +230,9 @@ def get_jobs(toplevel_key, smoke):
     configs = gen_build_env_list(smoke)
     phase = "build" if toplevel_key == "binarybuilds" else "test"
     for build_config in configs:
-        jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
+        # don't test for macos_arm64 as it's cross compiled
+        if phase != "test" or build_config.os != "macos_arm64":
+            jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))

     return jobs_list
@@ -1,9 +1,8 @@
 PHASES = ["build", "test"]

 CUDA_VERSIONS = [
     "101",
     "102",
-    "112",
     "111",
 ]

 ROCM_VERSIONS = [

@@ -13,7 +12,7 @@ ROCM_VERSIONS = [

 ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]

-GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS
+GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS]

 STANDARD_PYTHON_VERSIONS = [
     "3.6",
@@ -32,21 +32,6 @@ CONFIG_TREE_DATA = [
         ]),
     ]),
     ("cuda", [
-        ("9.2", [
-            ("3.6", [
-                X(True),
-                ("cuda_gcc_override", [
-                    ("gcc5.4", [
-                        ('build_only', [XImportant(True)]),
-                    ]),
-                ]),
-            ])
-        ]),
-        ("10.1", [
-            ("3.6", [
-                ('build_only', [X(True)]),
-            ]),
-        ]),
         ("10.2", [
             ("3.6", [
                 ("shard_test", [XImportant(True)]),

@@ -92,13 +77,6 @@ CONFIG_TREE_DATA = [
             ]),
         ]),
     ]),
-    ("rocm", [
-        ("3.9", [
-            ("3.6", [
-                ('build_only', [XImportant(True)]),
-            ]),
-        ]),
-    ]),
     ]),
 ]

@@ -182,7 +182,7 @@ def gen_dependent_configs(xenial_parent_config):
         (["multigpu"], "large"),
         (["nogpu", "NO_AVX2"], None),
         (["nogpu", "NO_AVX"], None),
-        (["slow"], "medium"),
+        (["slow"], "nvidia.small"),
     ]

     configs = []

@@ -340,7 +340,7 @@ def instantiate_configs():

     gpu_resource = None
     if cuda_version and cuda_version != "10":
-        gpu_resource = "medium"
+        gpu_resource = "nvidia.small"

     c = Conf(
         distro_name,
@@ -75,29 +75,6 @@ class AndroidGradleJob:


 WORKFLOW_DATA = [
-    AndroidJob(["x86_32"], "pytorch_linux_build", is_master_only=False),
-    AndroidJob(["x86_64"], "pytorch_linux_build"),
-    AndroidJob(["arm", "v7a"], "pytorch_linux_build"),
-    AndroidJob(["arm", "v8a"], "pytorch_linux_build"),
-    AndroidGradleJob(
-        "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-        "pytorch_android_gradle_build-x86_32",
-        ["pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build"],
-        is_master_only=False,
-        is_pr_only=True),
-    AndroidGradleJob(
-        "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
-        "pytorch_android_gradle_custom_build_single",
-        [DOCKER_REQUIREMENT_NDK],
-        is_master_only=False,
-        is_pr_only=True),
-    AndroidGradleJob(
-        "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build",
-        "pytorch_android_gradle_build",
-        ["pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-         "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build",
-         "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build",
-         "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build"]),
 ]

@@ -60,8 +60,6 @@ class BazelJob:


 WORKFLOW_DATA = [
-    BazelJob("build", {"resource_class": "large"}),
-    BazelJob("test"),
 ]

@@ -164,7 +164,7 @@ WORKFLOW_DATA = [
         is_master_only=True,
         requires=["binary_linux_manywheel_3_7m_cu102_devtoolset7_build"],
         extra_props={
-            "resource_class": "gpu.medium",
+            "resource_class": "gpu.nvidia.small",
             "use_cuda_docker_runtime": miniutils.quote((str(1))),
         },
     ),
@@ -14,14 +14,9 @@ IMAGE_NAMES = [
     "pytorch-linux-bionic-py3.6-clang9",
     "pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
     "pytorch-linux-bionic-py3.8-gcc9",
-    "pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
-    "pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
     "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
-    "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
     "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
-    "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
-    "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
     "pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
     "pytorch-linux-xenial-py3-clang5-asan",
     "pytorch-linux-xenial-py3-clang7-onnx",
     "pytorch-linux-xenial-py3.8",

@@ -29,8 +24,6 @@ IMAGE_NAMES = [
     "pytorch-linux-xenial-py3.6-gcc5.4",  # this one is used in doc builds
     "pytorch-linux-xenial-py3.6-gcc7.2",
     "pytorch-linux-xenial-py3.6-gcc7",
-    "pytorch-linux-bionic-rocm3.9-py3.6",
-    "pytorch-linux-bionic-rocm3.10-py3.6",
 ]

@@ -37,7 +37,7 @@ class GeConfigTestJob:

     def gen_tree(self):

-        resource_class = "gpu.medium" if self.use_cuda_docker else "large"
+        resource_class = "gpu.nvidia.small" if self.use_cuda_docker else "large"
         docker_image = DOCKER_IMAGE_CUDA_10_2 if self.use_cuda_docker else DOCKER_IMAGE_BASIC
         full_name = "_".join(self.get_all_parts(False))
         build_env = self.build_env_override or "-".join(self.get_all_parts(True))
@@ -1,5 +1,4 @@
 from cimodel.data.simple.util.versions import MultiPartVersion
-import cimodel.lib.miniutils as miniutils

 XCODE_VERSION = MultiPartVersion([12, 0, 0])

@@ -61,10 +60,6 @@ class IOSJob:


 WORKFLOW_DATA = [
-    IOSJob(XCODE_VERSION, ArchVariant("x86_64"), is_org_member_context=False),
-    IOSJob(XCODE_VERSION, ArchVariant("arm64")),
-    IOSJob(XCODE_VERSION, ArchVariant("arm64", "metal"), extra_props={"use_metal": miniutils.quote(str(int(True)))}),
-    IOSJob(XCODE_VERSION, ArchVariant("arm64", "custom"), extra_props={"op_list": "mobilenetv2.yaml"}),
 ]

@@ -4,12 +4,6 @@ PyTorch Mobile PR builds (use linux host toolchain + mobile build options)

 import cimodel.lib.miniutils as miniutils
 import cimodel.data.simple.util.branch_filters
 from cimodel.data.simple.util.docker_constants import (
     DOCKER_IMAGE_ASAN,
     DOCKER_REQUIREMENT_ASAN,
     DOCKER_IMAGE_NDK,
     DOCKER_REQUIREMENT_NDK
 )


 class MobileJob:

@@ -52,27 +46,6 @@ class MobileJob:


 WORKFLOW_DATA = [
-    MobileJob(
-        DOCKER_IMAGE_ASAN,
-        [DOCKER_REQUIREMENT_ASAN],
-        ["build"]
-    ),
-
-    # Use LLVM-DEV toolchain in android-ndk-r19c docker image
-    MobileJob(
-        DOCKER_IMAGE_NDK,
-        [DOCKER_REQUIREMENT_NDK],
-        ["custom", "build", "dynamic"]
-    ),
-
-    # Use LLVM-DEV toolchain in android-ndk-r19c docker image
-    # Most of this CI is already covered by "mobile-custom-build-dynamic" job
-    MobileJob(
-        DOCKER_IMAGE_NDK,
-        [DOCKER_REQUIREMENT_NDK],
-        ["code", "analysis"],
-        True
-    ),
 ]

@@ -54,22 +54,6 @@ class AndroidNightlyJob:
 BASE_REQUIRES = [DOCKER_REQUIREMENT_NDK]

 WORKFLOW_DATA = [
-    AndroidNightlyJob(["x86_32"], "pytorch_linux_build", requires=BASE_REQUIRES),
-    AndroidNightlyJob(["x86_64"], "pytorch_linux_build", requires=BASE_REQUIRES),
-    AndroidNightlyJob(["arm", "v7a"], "pytorch_linux_build", requires=BASE_REQUIRES),
-    AndroidNightlyJob(["arm", "v8a"], "pytorch_linux_build", requires=BASE_REQUIRES),
-    AndroidNightlyJob(["android_gradle"], "pytorch_android_gradle_build",
-                      with_docker=False,
-                      requires=[
-                          "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                          "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build",
-                          "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build",
-                          "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build"]),
-    AndroidNightlyJob(["x86_32_android_publish_snapshot"], "pytorch_android_publish_snapshot",
-                      extra_props={"context": "org-member"},
-                      with_docker=False,
-                      requires=["nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_android_gradle_build"],
-                      no_build_suffix=True),
 ]

@@ -59,9 +59,7 @@ BUILD_CONFIGS = [
 ]


-WORKFLOW_DATA = BUILD_CONFIGS + [
-    IOSNightlyJob("binary", is_upload=True),
-]
+WORKFLOW_DATA = []


 def get_workflow_jobs():
.circleci/config.yml (2920 lines changed): file diff suppressed because it is too large.
@@ -28,7 +28,6 @@ login() {

 # Retry on timeouts (can happen on job stampede).
 retry login "${registry}"

 # Logout on exit
 trap "docker logout ${registry}" EXIT

@@ -45,5 +44,6 @@ trap "docker logout ${registry}" EXIT

 docker push "${image}:${tag}"

+trap "rm -rf ${IMAGE_NAME}:${tag}.tar" EXIT
 docker save -o "${IMAGE_NAME}:${tag}.tar" "${image}:${tag}"
 aws s3 cp "${IMAGE_NAME}:${tag}.tar" "s3://ossci-linux-build/pytorch/base/${IMAGE_NAME}:${tag}.tar" --acl public-read
@@ -44,6 +44,10 @@ install_ubuntu() {
     wget \
     vim

+  # Should resolve issues related to various apt package repository cert issues
+  # see: https://github.com/pytorch/pytorch/issues/65931
+  apt-get install -y libgnutls30
+
   # Cleanup package manager
   apt-get autoclean && apt-get clean
   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

@@ -108,10 +112,7 @@ esac

 # Install Valgrind separately since the apt-get version is too old.
 mkdir valgrind_build && cd valgrind_build
 VALGRIND_VERSION=3.16.1
-if ! wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2
-then
-  wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2
-fi
+wget https://ossci-linux.s3.amazonaws.com/valgrind-${VALGRIND_VERSION}.tar.bz2
 tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2
 cd valgrind-${VALGRIND_VERSION}
 ./configure --prefix=/usr/local
@@ -13,7 +13,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
      CONDA_FILE="Miniconda2-latest-Linux-x86_64.sh"
      ;;
    3)
-     CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
+     if [ "$ANACONDA_PYTHON_VERSION" = "3.6" ]; then
+       # Latest release of Conda that still supports python-3.6
+       CONDA_FILE="Miniconda3-py37_4.10.3-Linux-x86_64.sh"
+     else
+       CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
+     fi
      ;;
    *)
      echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION"

@@ -56,7 +61,9 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   pushd /opt/conda

   # Track latest conda update
-  as_jenkins conda update -y -n base conda
+  if [ "$ANACONDA_PYTHON_VERSION" != "3.6" ]; then
+    as_jenkins conda update -y -n base conda
+  fi

   # Install correct Python version
   as_jenkins conda install -y python="$ANACONDA_PYTHON_VERSION"

@@ -106,7 +113,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   as_jenkins pip install --progress-bar off pytest \
     scipy==1.1.0 \
     scikit-image \
-    librosa>=0.6.2 \
+    "librosa>=0.6.2,<0.9.0" \
     psutil \
     numba \
     llvmlite \

@@ -115,7 +122,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     coverage \
     hypothesis==4.53.2 \
     mypy==0.770 \
-    tb-nightly
+    tb-nightly \
+    "numpy==1.18.5"

   # Update scikit-learn to a python-3.8 compatible version
   if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then
@@ -106,10 +106,28 @@ def gen_build_workflows_tree():
        binary_build_definitions.get_nightly_uploads,
    ]

+   # Schedule LTS branch to build every 2 weeks
+   # (on the 1st and 15th of every month at 1:30AM)
+   # on workflow "binary_builds".
+   lts_binary_builds_schedule = [
+       {
+           "schedule": {
+               "cron": "\"30 1 1,15 * *\"",
+               "filters": {
+                   "branches": {
+                       "only": [
+                           "lts/release/1.8"
+                       ]
+                   }
+               }
+           },
+       }
+   ]
+
    return {
        "workflows": {
            "binary_builds": {
                "when": r"<< pipeline.parameters.run_binary_tests >>",
+               "triggers": lts_binary_builds_schedule,
                "jobs": [f() for f in binary_build_functions],
            },
            "build": {
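For readers who do not parse cron expressions on sight, the quoted schedule string above uses the standard five-field cron layout (CircleCI evaluates these in UTC). This annotated breakdown is added for orientation and is not part of the diff:

# "30 1 1,15 * *"
#  |  | |    | '--  day of week: any
#  |  | |    '----  month: every month
#  |  | '---------  day of month: only the 1st and the 15th
#  |  '-----------  hour: 01
#  '--------------  minute: 30   (i.e. 01:30)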
@@ -61,7 +61,7 @@ git --no-pager log --max-count 1
 popd

 # Clone the Builder master repo
-retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
+retry git clone -q https://github.com/pytorch/builder.git -b lts/release/1.8 "$BUILDER_ROOT"
 pushd "$BUILDER_ROOT"
 echo "Using builder from "
 git --no-pager log --max-count 1
@@ -7,6 +7,10 @@ source /env
 # Defaults here so they can be changed in one place
 export MAX_JOBS=${MAX_JOBS:-$(( $(nproc) - 2 ))}

+if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+  export BUILD_SPLIT_CUDA="ON"
+fi
+
 # Parse the parameters
 if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
   build_script='conda/build_pytorch.sh'
@@ -76,7 +76,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
   )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
   pip install "\$pkg"
-  retry pip install -q future numpy protobuf typing-extensions six
+  retry pip install -q future numpy protobuf==3.19.4 typing-extensions six
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   pkg="\$(ls /final_pkgs/*-latest.zip)"
@@ -20,6 +20,24 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   unzip "$pkg" -d /tmp
   cd /tmp/libtorch
 elif [[ "$PACKAGE_TYPE" == conda ]]; then
+  # install dependencies before installing package
+  NUMPY_PIN=">=1.19"
+  if [[ "$DESIRED_PYTHON" == "3.9" ]]; then
+    NUMPY_PIN=">=1.20"
+  fi
+
+  retry conda install -y "numpy${NUMPY_PIN}" dataclasses typing-extensions future pyyaml six
+
+  cuda_ver="$DESIRED_CUDA"
+
+  # install cpuonly or cudatoolkit explicitly
+  if [[ "$cuda_ver" == 'cpu' ]]; then
+    retry conda install -c pytorch -y cpuonly
+  else
+    toolkit_ver="${cuda_ver:2:2}.${cuda_ver:4}"
+    retry conda install -y -c nvidia -c pytorch -c conda-forge "cudatoolkit=${toolkit_ver}"
+  fi
+
   conda install -y "$pkg"
 else
   pip install "$pkg" -v
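The toolkit_ver substring expansion above simply re-inserts the dot that the cuNNN naming drops. A small bash sketch (not part of the diff) of what it evaluates to for the CUDA flavors used on this branch:

cuda_ver="cu111"; echo "${cuda_ver:2:2}.${cuda_ver:4}"   # -> 11.1
cuda_ver="cu102"; echo "${cuda_ver:2:2}.${cuda_ver:4}"   # -> 10.2
cuda_ver="cu101"; echo "${cuda_ver:2:2}.${cuda_ver:4}"   # -> 10.1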
@@ -73,7 +73,7 @@ PIP_UPLOAD_FOLDER='nightly/'
 # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
 export DATE="$(date -u +%Y%m%d)"
 #TODO: We should be pulling semver version from the base version.txt
-BASE_BUILD_VERSION="1.8.0.dev$DATE"
+BASE_BUILD_VERSION="1.8.3.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
 # Use 'git -C' to make doubly sure we're in the correct directory for checking
 # the git tag

@@ -85,7 +85,7 @@ if tagged_version >/dev/null; then
   # Turns tag v1.6.0-rc1 -> v1.6.0
   BASE_BUILD_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
 fi
-if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu102" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
+if [[ "$(uname)" == 'Darwin' ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}"
 else
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
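Taken together, the two hunks above mean an untagged build on this branch now produces version strings roughly like the following (the date is illustrative; cu102 wheels also pick up a +cu102 suffix now that the old special case is gone):

# macOS builds and conda packages
PYTORCH_BUILD_VERSION=1.8.3.dev20211023
# Linux/Windows wheels, suffixed with the DESIRED_CUDA flavor
PYTORCH_BUILD_VERSION=1.8.3.dev20211023+cpu
PYTORCH_BUILD_VERSION=1.8.3.dev20211023+cu102
PYTORCH_BUILD_VERSION=1.8.3.dev20211023+cu111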
@@ -15,6 +15,41 @@ else
   export VC_YEAR=2019
 fi

+if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+  export BUILD_SPLIT_CUDA="ON"
+fi
+
+echo "Free Space for CUDA DEBUG BUILD"
+if [[ "$CIRCLECI" == 'true' ]]; then
+  if [[ -d "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0" ]]; then
+    rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0"
+  fi
+
+  if [[ -d "C:\\Program Files (x86)\\Microsoft.NET" ]]; then
+    rm -rf "C:\\Program Files (x86)\\Microsoft.NET"
+  fi
+
+  if [[ -d "C:\\Program Files\\dotnet" ]]; then
+    rm -rf "C:\\Program Files\\dotnet"
+  fi
+
+  if [[ -d "C:\\Program Files (x86)\\dotnet" ]]; then
+    rm -rf "C:\\Program Files (x86)\\dotnet"
+  fi
+
+  if [[ -d "C:\\Program Files (x86)\\Microsoft SQL Server" ]]; then
+    rm -rf "C:\\Program Files (x86)\\Microsoft SQL Server"
+  fi
+
+  if [[ -d "C:\\Program Files (x86)\\Xamarin" ]]; then
+    rm -rf "C:\\Program Files (x86)\\Xamarin"
+  fi
+
+  if [[ -d "C:\\Program Files (x86)\\Google" ]]; then
+    rm -rf "C:\\Program Files (x86)\\Google"
+  fi
+fi
+
 set +x
 export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
 export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
@@ -111,14 +111,6 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"

-# Add the version handler by search and replace.
-# XXX: Consider moving this to the docs Makefile or site build
-if [ "$is_master_doc" = true ]; then
-  find "$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>\1 \▼</a>@g"
-else
-  find "$install_path" -name "*.html" -print0 | xargs -0 perl -pi -w -e "s@master\s+\((\d\.\d\.[A-Fa-f0-9]+\+[A-Fa-f0-9]+)\s+\)@<a href='http://pytorch.org/docs/versions.html'>$version \▼</a>@g"
-fi
-
 # Prevent Google from indexing $install_path/_modules. This folder contains
 # generated source files.
 # NB: the following only works on gnu sed. The sed shipped with mac os is different.
@@ -24,7 +24,9 @@ retry sudo apt-get -y install \
 echo "== DOCKER VERSION =="
 docker version

-retry sudo pip -q install awscli==1.16.35
+if ! command -v aws >/dev/null; then
+  retry sudo pip3 -q install awscli==1.19.64
+fi

 if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
   DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"

@@ -48,43 +50,50 @@ else
 fi

 add_to_env_file() {
-  local content
-  content=$1
-  # BASH_ENV should be set by CircleCI
-  echo "${content}" >> "${BASH_ENV:-/tmp/env}"
+  local name=$1
+  local value=$2
+  case "$value" in
+    *\ *)
+      # BASH_ENV should be set by CircleCI
+      echo "${name}='${value}'" >> "${BASH_ENV:-/tmp/env}"
+      ;;
+    *)
+      echo "${name}=${value}" >> "${BASH_ENV:-/tmp/env}"
+      ;;
+  esac
 }

-add_to_env_file "IN_CI=1"
-add_to_env_file "COMMIT_SOURCE=${CIRCLE_BRANCH:-}"
-add_to_env_file "BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}"
-add_to_env_file "CIRCLE_PULL_REQUEST=${CIRCLE_PULL_REQUEST}"
+add_to_env_file IN_CI 1
+add_to_env_file COMMIT_SOURCE "${CIRCLE_BRANCH:-}"
+add_to_env_file BUILD_ENVIRONMENT "${BUILD_ENVIRONMENT}"
+add_to_env_file CIRCLE_PULL_REQUEST "${CIRCLE_PULL_REQUEST}"


 if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then
-  add_to_env_file "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2"
+  add_to_env_file SCCACHE_BUCKET ossci-compiler-cache-circleci-v2

   SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
   MEMORY_LIMIT_MAX_JOBS=8  # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM
   MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} ))
-  add_to_env_file "MAX_JOBS=${MAX_JOBS}"
+  add_to_env_file MAX_JOBS "${MAX_JOBS}"

   if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
-    add_to_env_file "TORCH_CUDA_ARCH_LIST=5.2"
+    add_to_env_file TORCH_CUDA_ARCH_LIST 6.1
   fi

   if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
     # This IAM user allows write access to S3 bucket for sccache & bazels3cache
     set +x
-    add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
-    add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
-    add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
+    add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
+    add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
+    add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
     set -x
   else
     # This IAM user allows write access to S3 bucket for sccache
     set +x
-    add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
-    add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
-    add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
+    add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
+    add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
+    add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
     set -x
   fi
 fi
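A quick sketch of the new two-argument helper in action (the sample values are invented for illustration): values containing spaces are written single-quoted to BASH_ENV, everything else is written bare.

add_to_env_file MAX_JOBS 8
add_to_env_file COMMIT_SOURCE "release/1.8"
add_to_env_file BUILD_ENVIRONMENT "pytorch linux xenial build"
# appends to "${BASH_ENV:-/tmp/env}":
#   MAX_JOBS=8
#   COMMIT_SOURCE=release/1.8
#   BUILD_ENVIRONMENT='pytorch linux xenial build'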
@@ -93,5 +102,7 @@ fi
 set +x
 export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-}
 export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-}
-eval "$(aws ecr get-login --region us-east-1 --no-include-email)"
+export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+export AWS_REGION=us-east-1
+aws ecr get-login-password --region $AWS_REGION|docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
 set -x
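The same get-login-password pattern is applied to the docker build jobs later in this diff. As a rough illustration of the account-id extraction (the id shown is the registry id that already appears elsewhere in this config):

# `aws sts get-caller-identity` prints JSON containing a line such as:
#     "Account": "308535385114",
# grep Account | cut -f4 -d\" picks the fourth quote-delimited field:
echo '    "Account": "308535385114",' | grep Account | cut -f4 -d\"   # -> 308535385114
# which is then used to form the registry host:
#     308535385114.dkr.ecr.us-east-1.amazonaws.com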
.circleci/scripts/vs_install_cmath.ps1 (new file, 5 lines):
@@ -0,0 +1,5 @@
+$CMATH_DOWNLOAD_LINK = "https://raw.githubusercontent.com/microsoft/STL/12c684bba78f9b032050526abdebf14f58ca26a3/stl/inc/cmath"
+$VC14_28_INSTALL_PATH="C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.28.29910\include"
+
+curl.exe --retry 3 -kL $CMATH_DOWNLOAD_LINK --output "$home\cmath"
+Move-Item -Path "$home\cmath" -Destination "$VC14_28_INSTALL_PATH" -Force
@@ -111,11 +111,11 @@ commands:
       git config --global user.email "circleci.ossci@gmail.com"
       git config --global user.name "CircleCI"
       git config remote.origin.url https://github.com/pytorch/pytorch.git
-      git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
-      git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
+      git config --add remote.origin.fetch +refs/heads/release/1.8:refs/remotes/origin/release/1.8
+      git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.8:refs/remotes/origin/release/1.8 --depth=100 --quiet
       # PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base
       if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then
-        CIRCLE_PR_BASE_BRANCH=master
+        CIRCLE_PR_BASE_BRANCH=release/1.8
       fi
       export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH`
       echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
@@ -8,9 +8,6 @@
 version: 2.1

 parameters:
   run_binary_tests:
     type: boolean
     default: false
   run_build:
     type: boolean
     default: true

@@ -34,6 +31,12 @@ executors:
       resource_class: windows.xlarge
       image: windows-server-2019-vs2019:stable
       shell: bash.exe

+  windows-2xlarge-cpu-with-nvidia-cuda:
+    machine:
+      resource_class: windows.2xlarge
+      image: windows-server-2019-vs2019:stable
+      shell: bash.exe
+
   windows-medium-cpu-with-nvidia-cuda:
     machine:
@@ -45,7 +45,7 @@
   binary_linux_test:
     <<: *binary_linux_test_upload_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
       - checkout

@@ -108,7 +108,7 @@
   smoke_linux_test:
     <<: *binary_linux_test_upload_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -161,6 +161,7 @@
     <<: *binary_mac_params
     macos:
       xcode: "12.0"
+    resource_class: "large"
     steps:
       # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
       - checkout

@@ -198,6 +199,44 @@
           root: /Users/distiller/project
           paths: final_pkgs

+      - store_artifacts:
+          path: /Users/distiller/project/final_pkgs
+
+  binary_macos_arm64_build:
+    <<: *binary_mac_params
+    macos:
+      xcode: "12.3.0"
+    steps:
+      # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+      - checkout
+      - run:
+          <<: *binary_checkout
+      - run:
+          <<: *binary_populate_env
+      - brew_update
+      - run:
+          <<: *binary_install_miniconda
+
+      - run:
+          name: Build
+          no_output_timeout: "90m"
+          command: |
+            # Do not set -u here; there is some problem with CircleCI
+            # variable expansion with PROMPT_COMMAND
+            set -ex -o pipefail
+            export CROSS_COMPILE_ARM64=1
+            script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh"
+            cat "$script"
+            source "$script"
+
+      - persist_to_workspace:
+          root: /Users/distiller/project
+          paths: final_pkgs
+
+      - store_artifacts:
+          path: /Users/distiller/project/final_pkgs
+

   binary_ios_build:
     <<: *pytorch_ios_params
     macos:

@@ -250,11 +289,16 @@
         default: ""
       executor:
         type: string
-        default: "windows-xlarge-cpu-with-nvidia-cuda"
+        default: "windows-2xlarge-cpu-with-nvidia-cuda"
     executor: <<parameters.executor>>
     steps:
       # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
       - checkout
+      - run:
+          name: _HACK_ Install CUDA compatible cmath
+          no_output_timeout: 1m
+          command: |
+            powershell .circleci/scripts/vs_install_cmath.ps1
       - run:
           <<: *binary_checkout
       - run:

@@ -270,6 +314,8 @@
       - persist_to_workspace:
           root: "C:/w"
           paths: final_pkgs
+      - store_artifacts:
+          path: C:/w/final_pkgs

   binary_windows_test:
     <<: *binary_windows_params
@@ -8,7 +8,7 @@
 # then install the one with the most recent version.
 update_s3_htmls: &update_s3_htmls
   machine:
-    image: ubuntu-1604:202007-01
+    image: ubuntu-2004:202104-01
   resource_class: medium
   steps:
     - checkout
@@ -4,7 +4,7 @@
         type: string
         default: ""
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     resource_class: large
     environment:
       IMAGE_NAME: << parameters.image_name >>

@@ -20,7 +20,10 @@
             set +x
             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
-            eval $(aws ecr get-login --no-include-email --region us-east-1)
+            export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+            export AWS_REGION=us-east-1
+            aws ecr get-login-password --region $AWS_REGION|docker login --username AWS \
+              --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
             set -x
             # Check if image already exists, if it does then skip building it
             if docker manifest inspect "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${IMAGE_NAME}:${DOCKER_TAG}"; then

@@ -53,7 +56,7 @@
             cd .circleci/docker && ./build_docker.sh
   docker_for_ecr_gc_build_job:
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - run:

@@ -65,9 +68,12 @@
             set +x
             export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
-            eval $(aws ecr get-login --no-include-email --region us-east-1)
+            export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+            export AWS_REGION=us-east-1
+            aws ecr get-login-password --region $AWS_REGION|docker login --username AWS \
+              --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
             set -x
-            docker push 308535385114.dkr.ecr.us-east-1.amazonaws.com/gc/ecr
+            docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/gc/ecr
   ecr_gc_job:
     parameters:
       project:
@@ -1,7 +1,7 @@
   pytorch_doc_push:
     resource_class: medium
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     parameters:
       branch:
         type: string

@@ -30,7 +30,7 @@
       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -75,7 +75,7 @@
       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -174,7 +174,7 @@
       PYTHON_VERSION: "3.6"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -263,7 +263,7 @@
       PYTHON_VERSION: "3.6"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -299,7 +299,7 @@
       PYTHON_VERSION: "3.6"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -341,7 +341,7 @@
       PYTHON_VERSION: "3.6"
     resource_class: large
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -497,7 +497,7 @@
   pytorch_linux_bazel_build:
     <<: *pytorch_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -535,7 +535,7 @@
   pytorch_linux_bazel_test:
     <<: *pytorch_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag

@@ -582,7 +582,7 @@
       DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
     resource_class: medium
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       - checkout
       - calculate_docker_image_tag
@@ -2,7 +2,7 @@ jobs:
   pytorch_linux_build:
     <<: *pytorch_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
       - checkout

@@ -83,7 +83,7 @@ jobs:
   pytorch_linux_test:
     <<: *pytorch_params
     machine:
-      image: ubuntu-1604:202007-01
+      image: ubuntu-2004:202104-01
     steps:
       # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
       - checkout

@@ -256,6 +256,11 @@ jobs:
     executor: <<parameters.executor>>
     steps:
       - checkout
+      - run:
+          name: _HACK_ Install CUDA compatible cmath
+          no_output_timeout: 1m
+          command: |
+            powershell .circleci/scripts/vs_install_cmath.ps1
       - run:
           name: Install Cuda
          no_output_timeout: 30m
.github/scripts/generate_pytorch_version.py (vendored, 5 lines changed):
@@ -60,11 +60,6 @@ class PytorchVersion:
         self.no_build_suffix = no_build_suffix

     def get_post_build_suffix(self):
-        # CUDA 10.2 is the version to be uploaded to PyPI so it doesn't have a
-        # version suffix
-        if ((self.gpu_arch_type == "cuda" and self.gpu_arch_version == "10.2")
-                or self.no_build_suffix):
-            return ""
         if self.gpu_arch_type == "cuda":
             return f"+cu{self.gpu_arch_version.replace('.', '')}"
         return f"+{self.gpu_arch_type}{self.gpu_arch_version}"
.github/workflows/lint.yml (vendored, 8 lines changed):
@@ -60,6 +60,9 @@ jobs:
         run: |
           set -eux
           python torch/testing/check_kernel_launches.py |& tee ${GITHUB_WORKSPACE}/cuda_kernel_launch_checks.txt
+      - name: Ensure no direct cub include
+        run: |
+          (! git grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/CubUtils.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/CubUtils.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))

   flake8-py3:
     runs-on: ubuntu-18.04

@@ -93,9 +96,12 @@ jobs:
           check_name: 'flake8-py3'
           linter_output_path: 'flake8-output.txt'
           commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }}
-          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)'
+          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w+\d+) (?<errorDesc>.*)'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Catch any other warnings
+        run: |
+          [ ! -s flake8-output.txt ]

   clang-tidy:
     if: github.event_name == 'pull_request'
.gitmodules (vendored, 2 lines changed):
@@ -121,7 +121,7 @@
 [submodule "third_party/XNNPACK"]
 	ignore = dirty
 	path = third_party/XNNPACK
-	url = https://github.com/google/XNNPACK.git
+	url = https://github.com/malfet/XNNPACK.git
 [submodule "third_party/fmt"]
 	ignore = dirty
 	path = third_party/fmt
@@ -169,7 +169,7 @@ if [ -z "$MAX_JOBS" ]; then
 fi

 # Target only our CI GPU machine's CUDA arch to speed up the build
-export TORCH_CUDA_ARCH_LIST="5.2"
+export TORCH_CUDA_ARCH_LIST="6.1"

 if [[ "$BUILD_ENVIRONMENT" == *ppc64le* ]]; then
   export TORCH_CUDA_ARCH_LIST="6.0"

@@ -182,7 +182,7 @@ fi

 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.8 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi

@@ -54,7 +54,7 @@ function file_diff_from_base() {
   set +e
   git fetch origin master --quiet
   set -e
-  git diff --name-only "$(git merge-base origin/master HEAD)" > "$1"
+  git diff --name-only "$(git merge-base origin/release/1.8 HEAD)" > "$1"
 }

 function get_bazel() {
@@ -19,8 +19,14 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
   retry bash ${WORKSPACE_DIR}/miniconda3.sh -b -p ${WORKSPACE_DIR}/miniconda3
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
-source ${WORKSPACE_DIR}/miniconda3/bin/activate
-retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses
+# shellcheck disable=SC1091
+source "${WORKSPACE_DIR}"/miniconda3/bin/activate
+
+# NOTE: mkl 2021.3.0+ cmake requires sub-command PREPEND, may break the build
+retry conda install -y \
+    mkl=2021.2.0 mkl-include=2021.2.0 \
+    numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 \
+    cmake cffi ninja typing_extensions dataclasses

 # The torch.hub tests make requests to GitHub.
 #
@@ -4,7 +4,7 @@

 source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"

 conda install -y six
-pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil
+pip install -q hypothesis "librosa>=0.6.2,<0.9.0" "numba<=0.49.1" psutil "scipy==1.6.3"

 # TODO move this to docker
 pip install unittest-xml-reporting pytest
@@ -14,7 +14,7 @@ get_runtime_of_command () {
   if [[ $runtime == *"Error"* ]]; then
     exit 1
   fi
-  runtime=${runtime#+++ $@}
+  runtime=${runtime#+++ "$@"}
   runtime=$(python -c "print($runtime)")

   echo $runtime
@@ -282,8 +282,10 @@ test_xla() {
   echo "Running Python Tests"
   ./test/run_tests.sh

-  echo "Running MNIST Test"
-  python test/test_train_mnist.py --tidy
+  # Disabled due to MNIST download issue.
+  # See https://github.com/pytorch/pytorch/issues/53267
+  # echo "Running MNIST Test"
+  # python test/test_train_mnist.py --tidy

   echo "Running C++ Tests"
   pushd test/cpp

@@ -300,7 +302,8 @@ test_backward_compatibility() {
   pushd test/backward_compatibility
   python -m venv venv
   . venv/bin/activate
-  pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+  # check for backward compatibility with torch 1.8.1
+  pip_install --pre torch==1.8.1 -f https://download.pytorch.org/whl/test/cpu/torch_test.html
   pip show torch
   python dump_all_function_schemas.py --filename nightly_schemas.txt
   deactivate
@@ -5,7 +5,11 @@ if "%BUILD_ENVIRONMENT%"=="" (
 )
 if "%REBUILD%"=="" (
   IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
-  curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  if "%PYTHON_VERSION%"=="3.6" (
+    curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-py37_4.10.3-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  ) else (
+    curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  )
   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
 )
 call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
@@ -13,7 +13,11 @@ if "%BUILD_ENVIRONMENT%"=="" (
 )
 if NOT "%BUILD_ENVIRONMENT%"=="" (
   IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
-  curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  if "%PYTHON_VERSION%"=="3.6" (
+    curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-py37_4.10.3-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  ) else (
+    curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  )
   if %errorlevel% neq 0 ( exit /b %errorlevel% )
   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
   if %errorlevel% neq 0 ( exit /b %errorlevel% )

@@ -39,7 +43,7 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% )
 popd

 :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
-pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage
+pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2,<0.9.0" psutil pillow unittest-xml-reporting pytest coverage
 if %errorlevel% neq 0 ( exit /b %errorlevel% )

 set DISTUTILS_USE_SDK=1
@@ -11,7 +11,6 @@
 #include <ATen/DeviceGuard.h>
 #include <ATen/DimVector.h>
 #include <ATen/Dispatch.h>
-#include <ATen/DynamicLibrary.h>
 #include <ATen/Formatting.h>
 #include <ATen/Functions.h>
 #include <ATen/NamedTensor.h>
@@ -25,9 +25,16 @@ static void* checkDL(void* x) {

   return x;
 }
-DynamicLibrary::DynamicLibrary(const char* name) {
+DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
   // NOLINTNEXTLINE(hicpp-signed-bitwise)
-  handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
+  handle = dlopen(name, RTLD_LOCAL | RTLD_NOW);
+  if (!handle) {
+    if (alt_name) {
+      handle = checkDL(dlopen(alt_name, RTLD_LOCAL | RTLD_NOW));
+    } else {
+      AT_ERROR("Error in dlopen or dlsym: ", dlerror());
+    }
+  }
 }

 void* DynamicLibrary::sym(const char* name) {

@@ -45,7 +52,7 @@ DynamicLibrary::~DynamicLibrary() {

 // Windows

-DynamicLibrary::DynamicLibrary(const char* name) {
+DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
   // NOLINTNEXTLINE(hicpp-signed-bitwise)
   HMODULE theModule;
   bool reload = true;
@@ -8,7 +8,7 @@ namespace at {
 struct DynamicLibrary {
   AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);

-  TORCH_API DynamicLibrary(const char* name);
+  TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr);

   TORCH_API void* sym(const char* name);

@@ -12,6 +12,11 @@
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

 namespace at {
+#if AT_MKLDNN_ENABLED()
+namespace native { namespace mkldnn {
+void clear_computation_cache();
+}} // namespace native::mkldnn
+#endif

 namespace {
 // Number of threads set by the user

@@ -58,6 +63,9 @@ void set_num_threads(int nthreads) {
   TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
   pool->set_thread_count(nthreads);
 #endif
+#if AT_MKLDNN_ENABLED()
+  at::native::mkldnn::clear_computation_cache();
+#endif
 }

 // Explicitly calling omp_get_max_threads() as the size of the parallel
@@ -3,6 +3,12 @@
 namespace at {
 namespace autocast {

+namespace {
+  bool is_autocast_eligible(const Tensor& tensor) {
+    return (tensor.is_cuda() || tensor.is_xla()) && tensor.is_floating_point();
+  }
+} // namespace
+
 TORCH_API bool is_enabled();
 TORCH_API void set_enabled(bool enabled);
 TORCH_API void clear_cache();

@@ -21,7 +27,7 @@ inline at::ScalarType prioritize(at::ScalarType current, const Tensor& nextArg)
     AT_ERROR("promote type is double in at::autocast::prioritize");
     return current;
   }
-  if (nextArg.is_cuda() && nextArg.is_floating_point()) {
+  if (is_autocast_eligible(nextArg)) {
     auto next = nextArg.scalar_type();
     if (next == at::kDouble) {
       return current; // ignores double tensors

@@ -70,7 +76,7 @@ inline at::ScalarType promote_type(at::ScalarType current, Arg0 arg0, Args... ar
 Logic to apply cached casting to any Tensor argument.
 ****************************************************/
 inline bool is_eligible(const Tensor& arg) {
-  return (arg.defined() && arg.is_cuda() && arg.is_floating_point() && (arg.scalar_type() != at::kDouble));
+  return (arg.defined() && is_autocast_eligible(arg) && (arg.scalar_type() != at::kDouble));
 }

 // Overload to catch Tensor args
@@ -22,6 +22,8 @@ _(aten, __xor__) \
 _(aten, _abs) \
 _(aten, _addmv) \
 _(aten, _addr) \
+_(aten, _amp_foreach_non_finite_check_and_unscale_) \
+_(aten, _amp_update_scale) \
 _(aten, _arange) \
 _(aten, _argmax) \
 _(aten, _argmin) \
aten/src/ATen/cuda/CubUtils.cuh (new file, 10 lines):
@@ -0,0 +1,10 @@
+#pragma once
+
+// include cub in a safe manner
+#undef CUB_NS_POSTFIX //undef to avoid redefinition warnings
+#undef CUB_NS_PREFIX
+#define CUB_NS_PREFIX namespace at{ namespace native{
+#define CUB_NS_POSTFIX }}
+#include <cub/cub.cuh>
+#undef CUB_NS_POSTFIX
+#undef CUB_NS_PREFIX
@@ -23,10 +23,17 @@ at::DynamicLibrary& getNVRTCLibrary() {
   constexpr auto minor = ( CUDA_VERSION / 10 ) % 10;
 #if defined(_WIN32)
   auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll";
+  std::string alt_libname;
 #else
-  static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor);
+  static auto lib_version = std::to_string(major) + "." + std::to_string(minor);
+  static auto libname = std::string("libnvrtc.so.") + lib_version;
+#ifdef NVRTC_SHORTHASH
+  static auto alt_libname = std::string("libnvrtc-") + C10_STRINGIZE(NVRTC_SHORTHASH) + ".so." + lib_version;
+#else
+  std::string alt_libname;
+#endif
 #endif
-  static at::DynamicLibrary lib(libname.c_str());
 #endif
+  static at::DynamicLibrary lib(libname.c_str(), alt_libname.empty() ? nullptr : alt_libname.c_str());
   return lib;
 }

@@ -238,7 +238,12 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c
          (groups > 1
           || (weight.size(-1) > 3 && weight.size(-2) > 3)
           || input.size(0) > 1
-          || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480)); // for some case, native is faster
+          || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
+         // OneDNN < 1.8.1 produce incorrect results in this case (see #50042)
+         // TODO(VitalyFedyunin): Remove this patch after OneDNN 1.8.1 merged in
+         && !(groups > 0 && groups % 24 == 0 && weight.size(0) == groups && weight.size(1) == 1)
+         );

 #endif
   return false;
 }
@@ -17,17 +17,9 @@ Tensor embedding(const Tensor & weight, const Tensor & indices,
   auto indices_arg = TensorArg(indices, "indices", 1);
   checkScalarTypes("embedding", indices_arg, {kLong, kInt});

-  auto zerofill_padding = [&](Tensor& embedding) {
-    if (padding_idx >= 0) {
-      embedding.masked_fill_((indices == padding_idx).reshape({-1, 1}), 0);
-    }
-  };
-
   // TODO: use tensor.index() after improving perf
   if (indices.dim() == 1) {
-    auto out = weight.index_select(0, indices);
-    zerofill_padding(out);
-    return out;
+    return weight.index_select(0, indices);
   }

   auto size = indices.sizes().vec();

@@ -35,9 +27,7 @@ Tensor embedding(const Tensor & weight, const Tensor & indices,
     size.push_back(d);
   }

-  auto out = weight.index_select(0, indices.reshape(-1));
-  zerofill_padding(out);
-  return out.view(size);
+  return weight.index_select(0, indices.reshape(-1)).view(size);
 }

 Tensor embedding_backward(
@@ -45,9 +45,7 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) {
   // Next, shuffle by permuting the new upscale_factor dims alongside the height and width dims.
   std::vector<int64_t> permutation(self.sizes().begin(), self_sizes_batch_end);
   // std::iota is used to maintain the batch dims within the permutation.
-  // Since 2 dims were added, the correct batch dim offsets are now:
-  // -added_dims_shape.size(), ..., -7, -6.
-  std::iota(permutation.begin(), permutation.end(), -added_dims_shape.size());
+  std::iota(permutation.begin(), permutation.end(), 0);
   permutation.insert(permutation.end(), {-5 /* oc */, -2 /* h */, -4 /* 1st upscale_factor */, -1 /* w */,
                                          -3 /* 2nd upscale_factor */});
   const auto input_permuted = input_reshaped.permute(permutation);

@@ -98,9 +96,7 @@ Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) {
   // Next, unshuffle by permuting the downscale_factor dims alongside the channel dim.
   std::vector<int64_t> permutation(self.sizes().begin(), self_sizes_batch_end);
   // std::iota is used to maintain the batch dims within the permutation.
-  // Since 2 dims were added, the correct batch dim offsets are now:
-  // -added_dims_shape.size(), ..., -7, -6.
-  std::iota(permutation.begin(), permutation.end(), -added_dims_shape.size());
+  std::iota(permutation.begin(), permutation.end(), 0);
   permutation.insert(permutation.end(), {-5 /* c */, -3 /* 1st downscale_factor */, -1 /*2nd downscale_factor */,
                                          -4 /* oh */, -2 /* ow */});
   const auto input_permuted = input_reshaped.permute(permutation);
@@ -26,7 +26,7 @@ static void upsample_bicubic2d_out_frame(
         const scalar_t* in = &idata[output_y * input_width + output_x];
         scalar_t* out = &odata[output_y * output_width + output_x];

-        for (int64_t c = 0; c < channels; ++c) {
+        for (int64_t c = 0; c < channels * nbatch; ++c) {
           out[0] = in[0];
           in += input_width * input_height;
           out += output_width * output_height;
@@ -19,6 +19,27 @@ namespace {

 using namespace vec256;

+// Note: Explicit implementation of copysign for Half and BFloat16
+// is needed to workaround g++-7/8 crash on aarch64, but also makes
+// copysign faster for the half-precision types
+template<typename T>
+T copysign(T a, T b) {
+  return std::copysign(a, b);
+}
+
+// Implement copysign for half precision floats using bit ops
+// Sign is the most significant bit for both half and bfloat16 types
+template<>
+c10::Half copysign(c10::Half a, c10::Half b) {
+  return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
+}
+
+template<>
+c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
+  return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
+}
+
+
 // Note: Undefined behavior when performing addition is intentionally
 // ignored.
 void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) {

@@ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) {
         floordiv += scalar_t(1.0);
       }
     } else {
-      floordiv = std::copysign(scalar_t(0), a / b);
+      floordiv = copysign(scalar_t(0), a / b);
     }
     return floordiv;
   });

@@ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) {
   });
 }

-template<typename T>
-T copysign(T a, T b) {
-  return std::copysign(a, b);
-}
-
-// Implement copysign for half precision floats using bit ops
-// Sign is the most significant bit for both half and bfloat16 types
-template<>
-c10::Half copysign(c10::Half a, c10::Half b) {
-  return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
-}
-
-template<>
-c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
-  return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
-}
-
 void copysign_kernel(TensorIterator& iter) {
   AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
     cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {
@ -17,10 +17,8 @@
#include <THC/THCThrustAllocator.cuh>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <THC/THCAtomics.cuh>

#include <cub/cub.cuh>

#include <c10/macros/Macros.h>
@ -848,92 +846,5 @@ Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) {
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
struct NonZeroOp
|
||||
{
|
||||
__host__ __device__ __forceinline__ bool operator()(const T& a) const {
|
||||
return (a!=T(0));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename scalar_t>
|
||||
void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
|
||||
Tensor self_ = self.contiguous();
|
||||
int N = self_.numel();
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// compute number of nonzero elements
|
||||
size_t temp_storage_bytes=0;
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto num_nonzeros = allocator.allocate(sizeof(int));
|
||||
cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, scalar_t*> itr(self_.data_ptr<scalar_t>(), NonZeroOp<scalar_t>());
|
||||
cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
|
||||
auto temp_storage = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
|
||||
int num_nonzeros_h;
|
||||
C10_CUDA_CHECK(cudaMemcpyAsync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
|
||||
//need to synchronize to make sure data is available on the host
|
||||
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
//expected output size is num_nonzeros x ndim
|
||||
//we are producing output with size {num_nonzeros, ndim} and strides {num_nonzeros, 1} (that is, transposed ndim x num_nonzeros output)
|
||||
//we are able to directly use passed output with this size and strides, and we can also (per contract)
|
||||
//resize passed output with incorrect sizes anyway we want.
|
||||
//However, out with correct sizes and incorrect strides will have to be copied to from the intermediate we've produced.
|
||||
bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous();
|
||||
at::Tensor out_temp = need_to_copy ?
|
||||
at::native::empty_cuda({self.dim(), num_nonzeros_h}, optTypeMetaToScalarType(out.options().dtype_opt()),
|
||||
out.options().layout_opt(), out.options().device_opt(), out.options().pinned_memory_opt()) :
|
||||
out.resize_({self.dim(), num_nonzeros_h});
|
||||
//Scalars are expected to produce output of size (1,0), so we can't write to it
|
||||
if (self.dim() > 0) {
|
||||
cub::CountingInputIterator<int64_t> counting_itr(0);
|
||||
temp_storage_bytes = 0;
|
||||
cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr,
|
||||
out_temp.data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
|
||||
temp_storage = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr,
|
||||
out_temp.data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
|
||||
if (num_nonzeros_h > 0 && self.dim() > 1){
|
||||
int64_t div = 1;
|
||||
auto thrust_allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
|
||||
for (int dim = self.dim()-1; dim >= 0; dim--){
|
||||
int64_t dim_size = self.sizes()[dim];
|
||||
thrust::transform(
|
||||
thrust::cuda::par(thrust_allocator).on(stream),
|
||||
thrust::device_ptr<int64_t>(out_temp.data_ptr<int64_t>()),
|
||||
thrust::device_ptr<int64_t>(out_temp.data_ptr<int64_t>()) + num_nonzeros_h,
|
||||
thrust::device_ptr<int64_t>(out_temp.data_ptr<int64_t>()) + num_nonzeros_h * dim,
|
||||
[=] C10_HOST_DEVICE (const int64_t val) {return (val/div) % dim_size;}
|
||||
);
|
||||
div *= dim_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (need_to_copy) {
|
||||
out.copy_(out_temp.t());
|
||||
} else {
|
||||
//transpose out so it is correct size
|
||||
Tensor out_ = out_temp.t();
|
||||
out.set_(out_);
|
||||
}
|
||||
}
|
||||
|
||||
Tensor& nonzero_out_cuda(Tensor& out, const Tensor& self){
|
||||
TORCH_CHECK(self.numel() < std::numeric_limits<int>::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \
|
||||
file a support request");
|
||||
TORCH_CHECK(out.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype());
|
||||
TORCH_CHECK(self.device() == out.device(), "expected self and out to be on the same device, but got out on ",
|
||||
out.device(), " and self on ", self.device());
|
||||
AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
|
||||
self.scalar_type(), "nonzero_cuda",
|
||||
[&] {nonzero_cuda_out_impl<scalar_t>(self, out);});
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor nonzero_cuda(const Tensor& self){
|
||||
Tensor out = at::native::empty_cuda({0}, kLong, self.options().layout_opt(), self.options().device_opt(), self.options().pinned_memory_opt());
|
||||
return nonzero_out_cuda(out, self);
|
||||
}
|
||||
|
||||
|
||||
} // native
|
||||
} // at
|
||||
|
118
aten/src/ATen/native/cuda/Nonzero.cu
Normal file
@ -0,0 +1,118 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <ATen/cuda/detail/KernelUtils.h>
|
||||
#include <ATen/cuda/detail/OffsetCalculator.cuh> //for MAX_DIMS
|
||||
#include <ATen/cuda/CubUtils.cuh>
|
||||
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
namespace{
|
||||
template<typename T>
|
||||
struct NonZeroOp
|
||||
{
|
||||
__host__ __device__ __forceinline__ bool operator()(const T& a) const {
|
||||
return (a!=T(0));
|
||||
}
|
||||
};
|
||||
|
||||
//TODO: actually support int64_t index_t
|
||||
template<typename index_t>
|
||||
struct TensorDims {
|
||||
index_t sizes[MAX_DIMS];
|
||||
};
|
||||
|
||||
template<typename index_t>
|
||||
__global__ void write_indices(int64_t * inp, TensorDims<index_t> dims, int ndim, index_t n){
|
||||
CUDA_KERNEL_LOOP(index, n) { // this assumed int (not int64_t) index
|
||||
index_t div = 1;
|
||||
int64_t idx_flat = inp[index];
|
||||
for (int dim = ndim-1; dim >= 0; dim--){
|
||||
auto dim_size = dims.sizes[dim];
|
||||
inp[index + dim*n] = (idx_flat/div) % dim_size;
|
||||
div *= dim_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} //anonymous namespace
|
||||
|
||||
template<typename scalar_t>
|
||||
void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
|
||||
Tensor self_ = self.contiguous();
|
||||
int N = self_.numel();
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// compute number of nonzero elements
|
||||
size_t temp_storage_bytes=0;
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto num_nonzeros = allocator.allocate(sizeof(int));
|
||||
cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, scalar_t*> itr(self_.data_ptr<scalar_t>(), NonZeroOp<scalar_t>());
|
||||
cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
|
||||
auto temp_storage = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
|
||||
int num_nonzeros_h;
|
||||
C10_CUDA_CHECK(cudaMemcpyAsync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
|
||||
//need to synchronize to make sure data is available on the host
|
||||
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
//expected output size is num_nonzeros x ndim
|
||||
//we are producing output with size {num_nonzeros, ndim} and strides {num_nonzeros, 1} (that is, transposed ndim x num_nonzeros output)
|
||||
//we are able to directly use passed output with this size and strides, and we can also (per contract)
|
||||
//resize passed output with incorrect sizes anyway we want.
|
||||
//However, out with correct sizes and incorrect strides will have to be copied to from the intermediate we've produced.
|
||||
bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous();
|
||||
at::Tensor out_temp = need_to_copy ?
|
||||
at::native::empty_cuda({self.dim(), num_nonzeros_h}, optTypeMetaToScalarType(out.options().dtype_opt()),
|
||||
out.options().layout_opt(), out.options().device_opt(), out.options().pinned_memory_opt()) :
|
||||
out.resize_({self.dim(), num_nonzeros_h});
|
||||
//Scalars are expected to produce output of size (1,0), so we can't write to it
|
||||
if (self.dim() > 0) {
|
||||
cub::CountingInputIterator<int64_t> counting_itr(0);
|
||||
temp_storage_bytes = 0;
|
||||
cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr,
|
||||
out_temp.data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
|
||||
temp_storage = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr,
|
||||
out_temp.data_ptr<int64_t>(), (int*)num_nonzeros.get(), N, stream);
|
||||
if (num_nonzeros_h > 0 && self.dim() > 1){
|
||||
TensorDims<int> dims;
|
||||
for (int i=0; i<self.dim(); i++){
|
||||
dims.sizes[i] = self.sizes()[i];
|
||||
}
|
||||
const int nthreads = 256;
|
||||
const int nblocks = (num_nonzeros_h + nthreads -1)/nthreads;
|
||||
write_indices<<<nblocks, nthreads, 0, stream>>>(out_temp.data_ptr<int64_t>(),
|
||||
dims, self.dim(), num_nonzeros_h);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
}
|
||||
if (need_to_copy) {
|
||||
out.copy_(out_temp.t());
|
||||
} else {
|
||||
//transpose out so it is correct size
|
||||
Tensor out_ = out_temp.t();
|
||||
out.set_(out_);
|
||||
}
|
||||
}
|
||||
|
||||
Tensor& nonzero_out_cuda(Tensor& out, const Tensor& self){
|
||||
TORCH_CHECK(self.numel() < std::numeric_limits<int>::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \
|
||||
file a support request");
|
||||
TORCH_CHECK(out.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype());
|
||||
TORCH_CHECK(self.device() == out.device(), "expected self and out to be on the same device, but got out on ",
|
||||
out.device(), " and self on ", self.device());
|
||||
TORCH_CHECK(self.dim() <= MAX_DIMS, "nonzero is not supported for tensor with more than ", MAX_DIMS, " dimensions");
|
||||
AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
|
||||
self.scalar_type(), "nonzero_cuda",
|
||||
[&] {nonzero_cuda_out_impl<scalar_t>(self, out);});
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor nonzero_cuda(const Tensor& self){
|
||||
Tensor out = at::native::empty_cuda({0}, kLong, self.options().layout_opt(), self.options().device_opt(), self.options().pinned_memory_opt());
|
||||
return nonzero_out_cuda(out, self);
|
||||
}
|
||||
} //namespace::native
|
||||
} //namespace::at
|
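
A NumPy sketch of what nonzero_cuda_out_impl computes, assuming the same flat-index selection followed by the div/mod expansion done in write_indices, with the intermediate kept in the transposed {ndim, n} layout (function name ours):

    import numpy as np

    def nonzero_ref(x: np.ndarray) -> np.ndarray:
        flat_idx = np.flatnonzero(x)            # counterpart of DeviceSelect::Flagged
        out = np.empty((x.ndim, flat_idx.size), dtype=np.int64)  # {ndim, n} buffer
        div = 1
        for dim in range(x.ndim - 1, -1, -1):   # same div/mod loop as write_indices
            out[dim] = (flat_idx // div) % x.shape[dim]
            div *= x.shape[dim]
        return out.T                            # final (n, ndim) result

    a = np.array([[0, 3], [5, 0]])
    assert np.array_equal(nonzero_ref(a), np.argwhere(a))
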
83
aten/src/ATen/native/cuda/Randperm.cu
Normal file
@ -0,0 +1,83 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAApplyUtils.cuh>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/native/TensorFactories.h>
|
||||
#include <ATen/cuda/CubUtils.cuh>
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
Tensor& randperm_out_cuda(Tensor& result, int64_t n, c10::optional<Generator> generator) {
|
||||
TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
|
||||
TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'");
|
||||
check_supported_max_int_with_precision(n, result);
|
||||
|
||||
result.resize_({n});
|
||||
|
||||
if (n < 30000) { // For small inputs, we offload it to CPU instead.
|
||||
auto result_cpu = at::empty({n}, result.options().device(kCPU));
|
||||
randperm_out(result_cpu, n, generator);
|
||||
return result.copy_(result_cpu);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// This if condition should never be true because if n >= 30000 and the tensor has a Half type,
|
||||
// check_supported_max_int_with_precision should have reported an error. This snippet is commented out but left here
|
||||
// for the sake of clarity, because Half in thrust is spotty, and we do not want future change unaware of this.
|
||||
if (result.scalar_type() == at::ScalarType::Half) { // Half in thrust is spotty. Avoid.
|
||||
auto result_float = at::empty({n}, initialTensorOptions().device(Device(DeviceType::CUDA)));
|
||||
return result.copy_(randperm_out_cuda(result_float, n, generator));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Generate random values for the keys array
|
||||
AT_DISPATCH_ALL_TYPES(
|
||||
result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
TORCH_CHECK(n <= std::numeric_limits<int>::max(),
|
||||
"randperm of tensors larger than INT_MAX is not supported yet in pytorch");
|
||||
|
||||
auto keys = at::empty(result.sizes(), result.options()).random_(generator);
|
||||
auto range = at::arange(n, result.options());
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
|
||||
// shuffled_data points to the underlying data of the output tensor if the tensor is contiguous; otherwise it
|
||||
// points to a new tensor.
|
||||
Tensor shuffled;
|
||||
scalar_t *shuffled_data;
|
||||
if (result.is_contiguous()) {
|
||||
shuffled_data = result.data_ptr<scalar_t>();
|
||||
} else {
|
||||
shuffled = at::empty(n, result.options());
|
||||
shuffled_data = shuffled.data_ptr<scalar_t>();
|
||||
}
|
||||
|
||||
// Use the sorted order of keys to rearrange the result array
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
nullptr, temp_storage_bytes,
|
||||
keys.data_ptr<scalar_t>(), keys_tmp.data_ptr<scalar_t>(),
|
||||
range.data_ptr<scalar_t>(), shuffled_data, n,
|
||||
0, sizeof(scalar_t) * 8, at::cuda::getCurrentCUDAStream());
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
auto dataPtr = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
dataPtr.get(), temp_storage_bytes,
|
||||
keys.data_ptr<scalar_t>(), keys_tmp.data_ptr<scalar_t>(),
|
||||
range.data_ptr<scalar_t>(), shuffled_data, n,
|
||||
0, sizeof(scalar_t) * 8, at::cuda::getCurrentCUDAStream());
|
||||
|
||||
if (!result.is_contiguous()) {
|
||||
result.copy_(shuffled);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}} // namespace at::native
|
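
A NumPy sketch of the shared idea behind both the Thrust and the cub::DeviceRadixSort::SortPairs versions of randperm: draw random keys, then reorder an identity sequence by sorting on those keys (names ours):

    import numpy as np

    def randperm_by_keysort(n: int, seed: int = 0) -> np.ndarray:
        rng = np.random.default_rng(seed)
        keys = rng.integers(0, np.iinfo(np.int64).max, size=n)  # random sort keys
        values = np.arange(n)                                    # 0..n-1, like thrust::sequence
        return values[np.argsort(keys, kind='stable')]           # "SortPairs": reorder values by keys

    perm = randperm_by_keysort(10)
    assert sorted(perm) == list(range(10))
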
@ -4,7 +4,7 @@
|
||||
#include <THC/THCNumerics.cuh>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <THC/THCGeneral.h>
|
||||
#include <cub/device/device_scan.cuh>
|
||||
#include <ATen/cuda/CubUtils.cuh>
|
||||
|
||||
|
||||
namespace at { namespace native {
|
||||
|
@ -8,11 +8,6 @@
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
#include <THC/THCGeneral.h>
|
||||
#include <THC/THCThrustAllocator.cuh>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/sequence.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
@ -76,64 +71,6 @@ Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, c10::optional<Sc
|
||||
return t;
|
||||
}
|
||||
|
||||
Tensor& randperm_out_cuda(Tensor& result, int64_t n, c10::optional<Generator> generator) {
|
||||
TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
|
||||
TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'");
|
||||
check_supported_max_int_with_precision(n, result);
|
||||
|
||||
result.resize_({n});
|
||||
|
||||
if (n < 30000) { // For small inputs, we offload it to CPU instead.
|
||||
auto result_cpu = at::empty({n}, result.options().device(kCPU));
|
||||
randperm_out(result_cpu, n, generator);
|
||||
return result.copy_(result_cpu);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// This if condition should never be true because if n >= 30000 and the tensor has a Half type,
|
||||
// check_supported_max_int_with_precision should have reported an error. This snippet is commented out but left here
|
||||
// for the sake of clarity, because Half in thrust is spotty, and we do not want future change unaware of this.
|
||||
if (result.scalar_type() == at::ScalarType::Half) { // Half in thrust is spotty. Avoid.
|
||||
auto result_float = at::empty({n}, initialTensorOptions().device(Device(DeviceType::CUDA)));
|
||||
return result.copy_(randperm_out_cuda(result_float, n, generator));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Generate random values for the keys array
|
||||
AT_DISPATCH_ALL_TYPES(
|
||||
result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
auto keys = at::empty(result.sizes(), result.options()).random_(generator);
|
||||
auto keys_data = thrust::device_ptr<scalar_t>(keys.data_ptr<scalar_t>());
|
||||
|
||||
// shuffled_data points to the underlying data of the output tensor if the tensor is contiguous; otherwise it
|
||||
// points to a new tensor.
|
||||
Tensor shuffled;
|
||||
thrust::device_ptr<scalar_t> shuffled_data;
|
||||
if (result.is_contiguous()) {
|
||||
shuffled_data = thrust::device_ptr<scalar_t>(result.data_ptr<scalar_t>());
|
||||
} else {
|
||||
shuffled = at::empty(n, result.options());
|
||||
shuffled_data = thrust::device_ptr<scalar_t>(shuffled.data_ptr<scalar_t>());
|
||||
}
|
||||
|
||||
auto state = globalContext().getTHCState();
|
||||
THCThrustAllocator thrustAlloc(state);
|
||||
auto policy = thrust::cuda::par(thrustAlloc).on(at::cuda::getCurrentCUDAStream());
|
||||
|
||||
thrust::sequence(policy, shuffled_data, shuffled_data + n);
|
||||
|
||||
// Use the sorted order of keys to rearrange the result array
|
||||
thrust::sort_by_key(policy, keys_data, keys_data + n, shuffled_data);
|
||||
|
||||
if (!result.is_contiguous()) {
|
||||
result.copy_(shuffled);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
namespace {
|
||||
|
@ -113,31 +113,46 @@ __global__ void upsample_trilinear3d_out_frame(
|
||||
template <typename scalar_t, typename accscalar_t>
|
||||
C10_LAUNCH_BOUNDS_1(1024)
|
||||
__global__ void upsample_trilinear3d_backward_out_frame(
|
||||
const size_t nc_,
|
||||
const int depth1,
|
||||
const int height1,
|
||||
const int width1,
|
||||
const int depth2,
|
||||
const int height2,
|
||||
const int width2,
|
||||
const int num_kernels,
|
||||
const accscalar_t rdepth,
|
||||
const accscalar_t rheight,
|
||||
const accscalar_t rwidth,
|
||||
const bool align_corners,
|
||||
scalar_t* __restrict__ idata,
|
||||
const scalar_t* __restrict__ odata) {
|
||||
const size_t i_numel = nc_ * depth1 * height1 * width1;
|
||||
const size_t o_numel = nc_ * depth2 * height2 * width2;
|
||||
PackedTensorAccessor64<scalar_t, 5> idata,
|
||||
const PackedTensorAccessor64<scalar_t, 5> odata,
|
||||
scalar_t* idata_ptr) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; index += blockDim.x * gridDim.x) {
|
||||
size_t index_temp = index;
|
||||
const int w2 = index_temp % width2; // 0:width2-1
|
||||
index_temp /= width2;
|
||||
const int h2 = index_temp % height2; // 0:height2-1
|
||||
index_temp /= height2;
|
||||
const int t2 = index_temp % depth2; // 0:depth2-1
|
||||
const int nc = index_temp / depth2;
|
||||
const int batchsize = idata.size(0);
|
||||
const int channels = idata.size(1);
|
||||
const int depth1 = idata.size(2);
|
||||
const int height1 = idata.size(3);
|
||||
const int width1 = idata.size(4);
|
||||
const int depth2 = odata.size(2);
|
||||
const int height2 = odata.size(3);
|
||||
const int width2 = odata.size(4);
|
||||
|
||||
const size_t i_numel = batchsize * channels * depth1 * height1 * width1;
|
||||
|
||||
if (index < num_kernels) {
|
||||
const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1
|
||||
const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1
|
||||
const int t2 = index / (height2 * width2); // 0:depth2-1
|
||||
// special case: just copy
|
||||
if (depth1 == depth2 && height1 == height2 && width1 == width2) {
|
||||
const int t1 = t2;
|
||||
const int h1 = h2;
|
||||
const int w1 = w2;
|
||||
|
||||
for (int n = 0; n < batchsize; n++) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
const scalar_t val = odata[n][c][t1][h1][w1];
|
||||
idata[n][c][t2][h2][w2] = val;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
//
|
||||
const accscalar_t t1r = area_pixel_compute_source_index<accscalar_t>(
|
||||
rdepth, t2, align_corners, /*cubic=*/false);
|
||||
const int t1 = t1r;
|
||||
@ -159,55 +174,60 @@ __global__ void upsample_trilinear3d_backward_out_frame(
|
||||
const accscalar_t w1lambda = w1r - w1;
|
||||
const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
|
||||
//
|
||||
const scalar_t d2val = odata[index];
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
|
||||
true);
|
||||
for (int n = 0; n < batchsize; n++) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
const scalar_t d2val = odata[n][c][t2][h2][w2];
|
||||
const size_t nc = n * channels + c;
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
|
||||
true);
|
||||
fastAtomicAdd(
|
||||
idata_ptr,
|
||||
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
|
||||
i_numel,
|
||||
static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
|
||||
true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
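
A hedged NumPy sketch of how a single output voxel scatters its gradient to the eight surrounding input cells inside the per-(n, c) loop above; it assumes the simple align_corners=True source-index formula rather than area_pixel_compute_source_index, and all names are ours:

    import numpy as np

    def scatter_voxel(grad_in, grad_out, n, c, t2, h2, w2, rdepth, rheight, rwidth):
        """Scatter grad_out[n, c, t2, h2, w2] to the 8 surrounding cells of grad_in[n, c]."""
        D1, H1, W1 = grad_in.shape[2:]
        t1r, h1r, w1r = rdepth * t2, rheight * h2, rwidth * w2   # source coordinates
        t1, h1, w1 = int(t1r), int(h1r), int(w1r)
        t1p = 1 if t1 < D1 - 1 else 0      # clamp the +1 neighbour at the border
        h1p = 1 if h1 < H1 - 1 else 0
        w1p = 1 if w1 < W1 - 1 else 0
        t1l, h1l, w1l = t1r - t1, h1r - h1, w1r - w1             # interpolation weights
        g = grad_out[n, c, t2, h2, w2]
        for dt, wt in ((0, 1 - t1l), (t1p, t1l)):
            for dh, wh in ((0, 1 - h1l), (h1p, h1l)):
                for dw, ww in ((0, 1 - w1l), (w1p, w1l)):
                    grad_in[n, c, t1 + dt, h1 + dh, w1 + dw] += wt * wh * ww * g
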
@ -350,21 +370,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
|
||||
// so it has to be initialized to zero.
|
||||
grad_input.zero_();
|
||||
|
||||
// const size_t num_kernels = nbatch * channels * output_depth * output_height * output_width;
|
||||
const size_t num_kernels = grad_output.numel();
|
||||
const int num_kernels = output_depth * output_height * output_width;
|
||||
const int num_threads = std::min(
|
||||
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (num_kernels > 0) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
grad_output.scalar_type(),
|
||||
"upsample_trilinear3d_backward_out_frame",
|
||||
[&] {
|
||||
using accscalar_t = at::acc_type<scalar_t, true>;
|
||||
|
||||
auto idata = grad_input.data_ptr<scalar_t>();
|
||||
auto odata = grad_output.data_ptr<scalar_t>();
|
||||
auto idata = grad_input.packed_accessor64<scalar_t, 5>();
|
||||
auto odata = grad_output.packed_accessor64<scalar_t, 5>();
|
||||
scalar_t* idata_ptr = grad_input.data_ptr<scalar_t>();
|
||||
|
||||
const accscalar_t rdepth = area_pixel_compute_scale<accscalar_t>(
|
||||
input_depth, output_depth, align_corners, scales_d);
|
||||
@ -374,26 +393,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
upsample_trilinear3d_backward_out_frame<scalar_t, accscalar_t>
|
||||
<<<cuda::ATenCeilDiv(num_kernels, static_cast<size_t>(num_threads)),
|
||||
<<<cuda::ATenCeilDiv(num_kernels, num_threads),
|
||||
num_threads,
|
||||
0,
|
||||
stream>>>(
|
||||
nbatch * channels,
|
||||
input_depth,
|
||||
input_height,
|
||||
input_width,
|
||||
output_depth,
|
||||
output_height,
|
||||
output_width,
|
||||
num_kernels,
|
||||
rdepth,
|
||||
rheight,
|
||||
rwidth,
|
||||
align_corners,
|
||||
idata,
|
||||
odata);
|
||||
odata,
|
||||
idata_ptr);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -18,4 +18,14 @@ RegisterEngineAllocator cpu_alloc(
}
);

namespace at { namespace native { namespace mkldnn {

void clear_computation_cache() {
// Reset computation_cache for forward convolutions
// As it also caches max number of OpenMP workers
ideep::convolution_forward::t_store().clear();
}

}}} // namespace at::native::mkldnn

#endif // AT_MKLDNN_ENALBED()
@ -346,6 +346,9 @@ class TORCH_API Tensor {
|
||||
/// Returns if a `Tensor` has XPU backend.
|
||||
bool is_xpu() const;
|
||||
|
||||
/// Returns if a `Tensor` has XLA backend.
|
||||
bool is_xla() const;
|
||||
|
||||
/// Returns if a `Tensor` has HIP backend.
|
||||
bool is_hip() const;
|
||||
|
||||
|
@ -91,6 +91,10 @@ bool is_xpu(Tensor self) {
|
||||
return self.is_xpu();
|
||||
}
|
||||
|
||||
bool Tensor::is_xla() const {
|
||||
return impl_->is_xla();
|
||||
}
|
||||
|
||||
NamedTensorMeta* Tensor::get_named_tensor_meta() {
|
||||
return static_cast<NamedTensorMeta*>(impl_->named_tensor_meta());
|
||||
}
|
||||
@ -112,6 +116,10 @@ bool is_cuda(Tensor self) {
|
||||
return self.is_cuda();
|
||||
}
|
||||
|
||||
bool is_xla(Tensor self) {
|
||||
return self.is_xla();
|
||||
}
|
||||
|
||||
bool Tensor::is_hip() const {
|
||||
// NB: this is not a native function to avoid dispatching overhead.
|
||||
return impl_->is_hip();
|
||||
|
@ -133,7 +133,9 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
|
||||
ASSERT_EQ(buffer1[i].z, buffer2[i].z);
|
||||
ASSERT_EQ(buffer1[i].w, buffer2[i].w);
|
||||
}
|
||||
// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved
|
||||
|
||||
#if 0
|
||||
// unaligned
|
||||
for (int i = 0; i < 16; i++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
@ -151,4 +153,5 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -517,6 +517,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
key_set_.has(DispatchKey::QuantizedXPU);
|
||||
}
|
||||
|
||||
bool is_xla() const {
|
||||
return key_set_.has(DispatchKey::XLA);
|
||||
}
|
||||
|
||||
bool is_hip() const {
|
||||
// NB: This method is not virtual and avoid dispatches for performance reasons.
|
||||
return key_set_.has(DispatchKey::HIP) ||
|
||||
|
@ -16,7 +16,7 @@ int32_t driver_version() {
return driver_version;
}

int device_count_impl() {
int device_count_impl(bool fail_if_no_driver) {
int count;
auto err = cudaGetDeviceCount(&count);
if (err == cudaSuccess) {
@ -34,6 +34,11 @@ int device_count_impl() {
case cudaErrorInsufficientDriver: {
auto version = driver_version();
if (version <= 0) {
if (!fail_if_no_driver) {
// No CUDA driver means no devices
count = 0;
break;
}
TORCH_CHECK(
false,
"Found no NVIDIA driver on your system. Please check that you "
@ -95,9 +100,9 @@ DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl();
auto result = device_count_impl(/*fail_if_no_driver=*/false);
TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
return device_count_impl();
return result;
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
// msg() returns the message without the stack trace
@ -110,7 +115,7 @@ DeviceIndex device_count() noexcept {

DeviceIndex device_count_ensure_non_zero() {
// Call the implementation every time to throw the exception
int count = device_count_impl();
int count = device_count_impl(/*fail_if_no_driver=*/true);
// Zero gpus doesn't produce a warning in `device_count` but we fail here
TORCH_CHECK(count, "No CUDA GPUs are available");
return static_cast<DeviceIndex>(count);
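
A pure-Python sketch of the branch this hunk adds; the arguments stand in for the values cudaGetDeviceCount and driver_version() produce in the C++ above, and every name here is hypothetical:

    def device_count_impl(count, err, driver_version, fail_if_no_driver):
        if err == "success":
            return count
        if err == "insufficient_driver" and driver_version <= 0:
            if not fail_if_no_driver:
                return 0                       # no driver simply means no devices
            raise RuntimeError("Found no NVIDIA driver on your system.")
        raise RuntimeError(f"unexpected CUDA error: {err}")

    # device_count(): cached, never throws -> reports 0 on a driverless machine
    assert device_count_impl(0, "insufficient_driver", 0, fail_if_no_driver=False) == 0
    # device_count_ensure_non_zero(): re-checks and surfaces the error
    try:
        device_count_impl(0, "insufficient_driver", 0, fail_if_no_driver=True)
    except RuntimeError as e:
        assert "NVIDIA driver" in str(e)
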
@ -590,6 +590,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
|
||||
list(APPEND Caffe2_GPU_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
|
||||
PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
|
||||
)
|
||||
endif()
|
||||
|
||||
if(USE_ROCM)
|
||||
@ -741,6 +745,10 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT})
|
||||
# Wrapper library for people who link against torch and expect both CPU and CUDA support
|
||||
# Contains "torch_cpu" and "torch_cuda"
|
||||
add_library(torch ${DUMMY_EMPTY_FILE})
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
# When we split torch_cuda, we want a dummy torch_cuda library that contains both parts
|
||||
add_library(torch_cuda ${DUMMY_EMPTY_FILE})
|
||||
endif()
|
||||
if(HAVE_SOVERSION)
|
||||
set_target_properties(torch PROPERTIES
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
@ -1233,11 +1241,12 @@ endif()
|
||||
|
||||
caffe2_interface_library(torch_cpu torch_cpu_library)
|
||||
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
|
||||
caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
|
||||
elseif(USE_CUDA)
|
||||
if(USE_CUDA)
|
||||
caffe2_interface_library(torch_cuda torch_cuda_library)
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
|
||||
caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
|
||||
endif()
|
||||
elseif(USE_ROCM)
|
||||
caffe2_interface_library(torch_hip torch_hip_library)
|
||||
endif()
|
||||
@ -1245,22 +1254,26 @@ endif()
|
||||
caffe2_interface_library(torch torch_library)
|
||||
|
||||
install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
elseif(USE_CUDA)
|
||||
|
||||
if(USE_CUDA)
|
||||
install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
endif()
|
||||
elseif(USE_ROCM)
|
||||
install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
endif()
|
||||
install(TARGETS torch torch_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
|
||||
|
||||
target_link_libraries(torch PUBLIC torch_cpu_library)
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
target_link_libraries(torch PUBLIC torch_cuda_cu_library)
|
||||
target_link_libraries(torch PUBLIC torch_cuda_cpp_library)
|
||||
elseif(USE_CUDA)
|
||||
|
||||
if(USE_CUDA)
|
||||
target_link_libraries(torch PUBLIC torch_cuda_library)
|
||||
if(BUILD_SPLIT_CUDA)
|
||||
target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library)
|
||||
target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library)
|
||||
endif()
|
||||
elseif(USE_ROCM)
|
||||
target_link_libraries(torch PUBLIC torch_hip_library)
|
||||
endif()
|
||||
|
@ -47,9 +47,9 @@ OP_TEMPLATE = CT.from_file(

try:
# use faster C loader if available
from yaml import CLoader as Loader
from yaml import CSafeLoader as Loader
except ImportError:
from yaml import Loader # type: ignore[misc]
from yaml import SafeLoader as Loader # type: ignore[misc]

def write(filename, s):
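
A small runnable sketch of the loader fallback this hunk switches to (the sample YAML string is ours):

    import yaml

    try:
        from yaml import CSafeLoader as Loader   # libyaml-backed, fast and safe
    except ImportError:
        from yaml import SafeLoader as Loader    # pure-Python fallback

    data = yaml.load("name: add\nargs: [self, other]", Loader=Loader)
    assert data == {"name": "add", "args": ["self", "other"]}
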
@ -188,6 +188,20 @@ find_library(CUDA_CUDA_LIB cuda
find_library(CUDA_NVRTC_LIB nvrtc
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 lib/x64)
if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH)
execute_process(
COMMAND "${PYTHON_EXECUTABLE}" -c
"import hashlib;hash=hashlib.sha256();hash.update(open('${CUDA_NVRTC_LIB}','rb').read());print(hash.hexdigest()[:8])"
RESULT_VARIABLE _retval
OUTPUT_VARIABLE CUDA_NVRTC_SHORTHASH)
if(NOT _retval EQUAL 0)
message(WARNING "Failed to compute shorthash for libnvrtc.so")
set(CUDA_NVRTC_SHORTHASH "XXXXXXXX")
else()
string(STRIP "${CUDA_NVRTC_SHORTHASH}" CUDA_NVRTC_SHORTHASH)
message(STATUS "${CUDA_NVRTC_LIB} shorthash is ${CUDA_NVRTC_SHORTHASH}")
endif()
endif()

# Create new style imported libraries.
# Several of these libraries have a hardcoded path if CAFFE2_STATIC_LINK_CUDA
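
The quoted one-liner in the hunk above, unpacked into a plain Python sketch (function name ours):

    import hashlib

    def nvrtc_shorthash(path: str) -> str:
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        return digest[:8]   # first 8 hex chars become NVRTC_SHORTHASH
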
@ -338,6 +352,12 @@ if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
|
||||
set_property(
|
||||
TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a")
|
||||
# Add explicit dependency to cudart_static to fix
|
||||
# libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent'
|
||||
# error adding symbols: DSO missing from command line
|
||||
set_property(
|
||||
TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
|
||||
"${CUDA_cudart_static_LIBRARY}" rt dl)
|
||||
endif()
|
||||
else()
|
||||
set_property(
|
||||
|
@ -1,6 +1,7 @@
|
||||
sphinx==3.1.2
|
||||
breathe==4.25.0
|
||||
exhale==0.2.3
|
||||
docutils==0.16
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
|
||||
bs4
|
||||
lxml
|
||||
|
@ -1,4 +1,5 @@
|
||||
sphinx==2.4.4
|
||||
docutils==0.16
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
|
||||
sphinxcontrib.katex
|
||||
matplotlib
|
||||
|
@ -1,34 +0,0 @@
|
||||
{% extends "!layout.html" %}
|
||||
<link rel="canonical" href="{{ theme_canonical_url }}{{ pagename }}.html" />
|
||||
|
||||
{% block menu %}
|
||||
|
||||
{{ super() }}
|
||||
{% endblock %}
|
||||
|
||||
{% block footer %}
|
||||
{{ super() }}
|
||||
<script>
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
|
||||
|
||||
ga('create', 'UA-90545585-1', 'auto');
|
||||
ga('send', 'pageview');
|
||||
|
||||
</script>
|
||||
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-117752657-2"></script>
|
||||
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
|
||||
gtag('js', new Date());
|
||||
gtag('config', 'UA-117752657-2');
|
||||
</script>
|
||||
|
||||
<img height="1" width="1" style="border-style:none;" alt="" src="https://www.googleadservices.com/pagead/conversion/795629140/?label=txkmCPmdtosBENSssfsC&guid=ON&script=0"/>
|
||||
{% endblock %}
|
@ -1,18 +1,31 @@
|
||||
{% extends "!layout.html" %}
|
||||
|
||||
<link rel="canonical" href="{{ theme_canonical_url }}{{ pagename }}.html" />
|
||||
|
||||
{% block menu %}
|
||||
{% if release == "master" %}
|
||||
<div>
|
||||
<a style="color:#F05732" href="{{ theme_canonical_url }}{{ pagename }}.html">
|
||||
You are viewing unstable developer preview docs.
|
||||
Click here to view docs for latest stable release.
|
||||
</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
{{ super() }}
|
||||
{% endblock %}
|
||||
|
||||
{% block sidebartitle %}
|
||||
<div class="version">
|
||||
<a href='https://pytorch.org/docs/versions.html'>{{ version }} ▼</a>
|
||||
</div>
|
||||
{% include "searchbox.html" %}
|
||||
{% endblock %}
|
||||
|
||||
|
||||
{% block footer %}
|
||||
{{ super() }}
|
||||
<script script type="text/javascript">
|
||||
var collapsedSections = ['Notes', 'Language Bindings', 'Libraries', 'Community'];
|
||||
</script>
|
||||
<script>
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
|
@ -4,7 +4,7 @@ Complex Numbers
===============

Complex numbers are numbers that can be expressed in the form :math:`a + bj`, where a and b are real numbers,
and *j* is a solution of the equation :math:`x^2 = −1`. Complex numbers frequently occur in mathematics and
and *j* is a solution of the equation :math:`x^2 = -1`. Complex numbers frequently occur in mathematics and
engineering, especially in signal processing. Traditionally many users and libraries (e.g., TorchAudio) have
handled complex numbers by representing the data in float tensors with shape :math:`(..., 2)` where the last
dimension contains the real and imaginary values.
@ -75,8 +75,6 @@ napoleon_use_ivar = True
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
if RELEASE:
|
||||
templates_path = ['_templates-stable'] + templates_path
|
||||
|
||||
# TODO: document these and remove them from here.
|
||||
|
||||
@ -170,6 +168,8 @@ if RELEASE:
|
||||
html_title = " ".join((project, torch.__version__, "documentation"))
|
||||
else:
|
||||
html_title = " ".join((project, torch.__version__[:version_end], "documentation"))
|
||||
version = torch.__version__
|
||||
release = version
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
|
74
docs/source/ddp_comm_hooks.rst
Normal file
@ -0,0 +1,74 @@
|
||||
DDP Communication Hooks
|
||||
=======================
|
||||
|
||||
DDP communication hook is a generic interface to control how to communicate
|
||||
gradients across workers by overriding the vanilla allreduce in
|
||||
`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.>`_.
|
||||
A few built-in communication hooks are provided,
|
||||
and users can easily apply any of these hooks to optimize communication.
|
||||
Besides, the hook interface can also support user-defined communication
|
||||
strategies for more advanced use cases.
|
||||
|
||||
.. warning ::
|
||||
DDP communication hook is experimental and subject to change.
|
||||
|
||||
.. warning ::
|
||||
DDP communication hooks can only support single process single device mode
|
||||
on NCCL backend.
|
||||
|
||||
How to Use a Communication Hook?
|
||||
--------------------------------
|
||||
|
||||
To use a communication hook, the user just needs to let the DDP model register
|
||||
the hook before the training loop as below.
|
||||
|
||||
:func:`torch.nn.parallel.DistributedDataParallel.register_comm_hook`.
|
||||
:noindex:
|
||||
|
||||
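
For illustration only (this example is not part of the new file), registering one of the built-in hooks on an already-constructed DDP model could look roughly like the following, where ``model`` is assumed to be a ``DistributedDataParallel`` instance created after process group initialization::

    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

    # state=None lets the hook fall back to the default process group.
    model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)
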
Default Communication Hooks
|
||||
---------------------------
|
||||
|
||||
Default communication hooks are simple **stateless** hooks, so the input state
|
||||
in ``register_comm_hook`` is either a process group or ``None``.
|
||||
|
||||
.. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
|
||||
:members:
|
||||
|
||||
PowerSGD Communication Hook
|
||||
---------------------------
|
||||
|
||||
PowerSGD (`Vogels et al., NeurIPS 2019 <https://arxiv.org/abs/1905.13727>`_)
|
||||
is a gradient compression algorithm, which can provide very high compression
|
||||
rates and accelerate bandwidth-bound distributed training.
|
||||
This algorithm needs to maintain both some hyperparameters and the internal
|
||||
state. Therefore, PowerSGD communication hook is a **stateful** hook,
|
||||
and the user needs to provide a state object defined as below.
|
||||
|
||||
PowerSGD State
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
|
||||
.. autoclass:: PowerSGDState
|
||||
|
||||
PowerSGD Hooks
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
.. warning ::
|
||||
PowerSGD typically requires extra memory of the same size as the model's
|
||||
gradients to enable error feedback, which can compensate for biased
|
||||
compressed communication and improve accuracy.
|
||||
|
||||
.. warning ::
|
||||
The current implementation may cause gradient overflow for FP16 input.
|
||||
|
||||
.. autofunction:: powerSGD_hook
|
||||
.. autofunction:: batched_powerSGD_hook
|
||||
|
||||
Acknowledgements
|
||||
----------------
|
||||
|
||||
Many thanks to PowerSGD paper author **Thijs Vogels** for the code review on
|
||||
PowerSGD communication hook, as well as the
|
||||
`comparison experiments <https://observablehq.com/@tvogels/powersgd-benchmark>`_,
|
||||
which show that the performance of PowerSGD communication hook is on par with
|
||||
the implementation in the original `paper <https://arxiv.org/abs/1905.13727>`_.
|
@ -58,16 +58,16 @@ distributed (NCCL only when building with CUDA). MPI is an optional backend that
|
||||
included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI
|
||||
installed.)
|
||||
|
||||
.. warning ::
|
||||
As of PyTorch v1.7, Windows support for the distributed package only covers collective
|
||||
communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore,
|
||||
the `init_method` argument in :func:`init_process_group` must point to a file. This works
|
||||
for both local and shared file systems:
|
||||
.. note ::
|
||||
As of PyTorch v1.8, Windows supports all collective communications backend but NCCL,
|
||||
If the `init_method` argument of :func:`init_process_group` points to a file it must adhere
|
||||
to the following schema:
|
||||
|
||||
- Local file system, ``init_method="file:///d:/tmp/some_file"``
|
||||
- Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"``
|
||||
|
||||
Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance.
|
||||
Same as on Linux platform, you can enable TcpStore by setting environment variables,
|
||||
MASTER_ADDR and MASTER_PORT.
|
||||
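
As a hedged sketch (not part of the diff), a single-process Gloo initialization that uses a file-based ``init_method`` on Windows might look like::

    import torch.distributed as dist

    # single-process example; rank/world_size normally come from the launcher
    dist.init_process_group(
        backend="gloo",
        init_method="file:///d:/tmp/some_file",
        rank=0,
        world_size=1,
    )
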
|
||||
Which backend to use?
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
@ -330,13 +330,13 @@ as they should never be created manually, but they are guaranteed to support two
|
||||
|
||||
Synchronous and asynchronous collective operations
|
||||
--------------------------------------------------
|
||||
Every collective operation function supports the following two kinds of operations,
|
||||
Every collective operation function supports the following two kinds of operations,
|
||||
depending on the setting of the ``async_op`` flag passed into the collective:
|
||||
|
||||
**Synchronous operation** - the default mode, when ``async_op`` is set to ``False``.
|
||||
When the function returns, it is guaranteed that
|
||||
the collective operation is performed. In the case of CUDA operations, it is not guaranteed
|
||||
that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any
|
||||
that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any
|
||||
further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives,
|
||||
function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of
|
||||
synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream
|
||||
@ -347,12 +347,12 @@ See the below script to see examples of differences in these semantics for CPU a
|
||||
returns a distributed request object. In general, you don't need to create it manually and it
|
||||
is guaranteed to support two methods:
|
||||
|
||||
* ``is_completed()`` - in the case of CPU collectives, returns ``True`` if completed. In the case of CUDA operations,
|
||||
returns ``True`` if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the
|
||||
default stream without further synchronization.
|
||||
* ``is_completed()`` - in the case of CPU collectives, returns ``True`` if completed. In the case of CUDA operations,
|
||||
returns ``True`` if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the
|
||||
default stream without further synchronization.
|
||||
* ``wait()`` - in the case of CPU collectives, will block the process until the operation is completed. In the case
|
||||
of CUDA collectives, will block until the operation has been successfully enqueued onto a CUDA stream and the
|
||||
output can be utilized on the default stream without further synchronization.
|
||||
of CUDA collectives, will block until the operation has been successfully enqueued onto a CUDA stream and the
|
||||
output can be utilized on the default stream without further synchronization.
|
||||
|
||||
**Example**
|
||||
|
||||
@ -368,7 +368,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
|
||||
handle = dist.all_reduce(output, async_op=True)
|
||||
# Wait ensures the operation is enqueued, but not necessarily complete.
|
||||
handle.wait()
|
||||
# Using result on non-default stream.
|
||||
# Using result on non-default stream.
|
||||
with torch.cuda.stream(s):
|
||||
s.wait_stream(torch.cuda.default_stream())
|
||||
output.add_(100)
|
||||
@ -382,7 +382,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
|
||||
Collective functions
|
||||
--------------------
|
||||
|
||||
.. autofunction:: broadcast
|
||||
.. autofunction:: broadcast
|
||||
|
||||
.. autofunction:: broadcast_object_list
|
||||
|
||||
@ -426,7 +426,7 @@ you can find an implementation of those in the `torch.distributed.nn.*` module.
|
||||
Functions here are synchronous and will be inserted in the autograd graph, so
|
||||
you need to ensure that all the processes that participated in the collective operation
|
||||
will do the backward pass for the backward communication to effectively happen and
|
||||
don't cause a deadlock.
|
||||
don't cause a deadlock.
|
||||
|
||||
Please notice that currently the only backend where all the functions are guaranteed to work is ``gloo``.
|
||||
.. autofunction:: torch.distributed.nn.broadcast
|
||||
|
@ -176,6 +176,15 @@ Probability distributions - torch.distributions
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:hidden:`LKJCholesky`
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. currentmodule:: torch.distributions.lkj_cholesky
|
||||
.. autoclass:: LKJCholesky
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
:hidden:`Laplace`
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -337,7 +346,7 @@ Probability distributions - torch.distributions
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
:hidden:`Weibull`
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -209,7 +209,7 @@ can be found below.
|
||||
node.replace_all_uses_with(new_node)
|
||||
|
||||
For simple transformations that only consist of substitutions, you can also
|
||||
make use of the `subgraph rewriter. <https://github.com/pytorch/pytorch/blob/master/torch/fx/subgraph_rewriter.py>`__
|
||||
make use of the `subgraph rewriter. <https://github.com/pytorch/pytorch/blob/release/1.8/torch/fx/subgraph_rewriter.py>`__
|
||||
|
||||
Subgraph Rewriting With replace_pattern()
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -397,7 +397,7 @@ Examples of the Interpreter Pattern
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- `Shape
|
||||
Propagation <https://github.com/pytorch/pytorch/blob/master/torch/fx/experimental/shape_prop.py>`__
|
||||
Propagation <https://github.com/pytorch/pytorch/blob/release/1.8/torch/fx/passes/shape_prop.py>`__
|
||||
- `Performance Profiler <https://github.com/pytorch/tutorials/pull/1319>`__
|
||||
|
||||
|
||||
@ -725,8 +725,7 @@ For example, let’s examine the following program:
|
||||
::
|
||||
|
||||
def func_to_trace(x):
|
||||
dim0 = x.size[0]
|
||||
if dim0 == 3:
|
||||
if x.sum() > 0:
|
||||
return torch.relu(x)
|
||||
else:
|
||||
return torch.neg(x)
|
||||
@ -735,7 +734,7 @@ For example, let’s examine the following program:
|
||||
"""
|
||||
<...>
|
||||
File "dyn.py", line 6, in func_to_trace
|
||||
if dim0 == 3:
|
||||
if x.sum() > 0:
|
||||
File "pytorch/torch/fx/proxy.py", line 155, in __bool__
|
||||
return self.tracer.to_bool(self)
|
||||
File "pytorch/torch/fx/proxy.py", line 85, in to_bool
|
||||
@ -743,8 +742,8 @@ For example, let’s examine the following program:
|
||||
torch.fx.proxy.TraceError: symbolically traced variables cannot be used as inputs to control flow
|
||||
"""
|
||||
|
||||
The condition to the ``if`` statement relies on the value of ``dim0``,
|
||||
which eventually relies on the value of ``x``, a function input. Since
|
||||
The condition to the ``if`` statement relies on the value of ``x.sum()``,
|
||||
which relies on the value of ``x``, a function input. Since
|
||||
``x`` can change (i.e. if you pass a new input tensor to the traced
|
||||
function), this is *dynamic control flow*. The traceback walks back up
|
||||
through your code to show you where this situation happens.
|
||||
@ -807,8 +806,8 @@ code. This is a valid pattern that is supported by symbolic tracing.
|
||||
Many instances of dynamic control flow are semantically static control
|
||||
flow. These instances can be made to support symbolic tracing by
|
||||
removing the data dependencies on input values, for example by moving
|
||||
values to ``Module`` attributes or by passing constant values during
|
||||
symbolic tracing:
|
||||
values to ``Module`` attributes or by binding concrete values to arguments
|
||||
during symbolic tracing:
|
||||
|
||||
::
|
||||
|
||||
@ -818,11 +817,7 @@ symbolic tracing:
|
||||
|
||||
fx.symbolic_trace(f) # Fails!
|
||||
|
||||
def wrapper(flag):
|
||||
return lambda x: f(x, flag)
|
||||
|
||||
new_f = wrapper(flag=True)
|
||||
fx.symbolic_trace(new_f)
|
||||
fx.symbolic_trace(f, concrete_args={'flag': True})
|
||||
|
||||
In the case of truly dynamic control flow, the sections of the program
|
||||
that contain this code can be traced as calls to the Method (see
|
||||
@ -834,7 +829,7 @@ Non-\ ``torch`` Functions
|
||||
|
||||
FX uses ``__torch_function__`` as the mechanism by which it intercepts
|
||||
calls (see the `technical
|
||||
overview <https://github.com/pytorch/pytorch/blob/master/torch/fx/OVERVIEW.md#technical-details>`__
|
||||
overview <https://github.com/pytorch/pytorch/blob/release/1.8/torch/fx/OVERVIEW.md#technical-details>`__
|
||||
for more information about this). Some functions, such as builtin Python
|
||||
functions or those in the ``math`` module, are things that are not
|
||||
covered by ``__torch_function__``, but we would still like to capture
|
||||
@ -968,7 +963,18 @@ Miscellanea
|
||||
``ones_like`` or ``zeros_like`` may be a viable substitute.
|
||||
- Nondeterministic constructors (``rand``, ``randn``) will have a
|
||||
single random value embedded in the trace. This is likely not the
|
||||
intended behavior.
|
||||
intended behavior. One workaround is to wrap ``torch.randn`` in a ``torch.fx.wrap`` function and call that instead.
|
||||
|
||||
::
|
||||
|
||||
@torch.fx.wrap
|
||||
def torch_randn(x, shape):
|
||||
return torch.randn(shape)
|
||||
|
||||
def f(x):
|
||||
return x + torch_randn(x, 5)
|
||||
fx.symbolic_trace(f)
|
||||
|
||||
- This behavior may be fixed in a future release.
|
||||
|
||||
- Type annotations
|
||||
@ -1004,6 +1010,7 @@ API Reference
|
||||
|
||||
.. autoclass:: torch.fx.Tracer
|
||||
:members:
|
||||
:inherited-members:
|
||||
|
||||
.. autoclass:: torch.fx.Proxy
|
||||
|
||||
|
@ -71,6 +71,7 @@ Features described in this documentation are classified by release status:
onnx
optim
complex_numbers
ddp_comm_hooks
pipeline
quantization
rpc
@ -80,59 +80,10 @@ The corresponding implementation is chosen automatically based on the PyTorch bu
Quantization API Summary
---------------------------------------

PyTorch provides two different modes of quantization: Eager Mode Quantization and FX Graph Mode Quantization.
PyTorch provides two different modes of quantization: Eager Mode Quantization and FX Graph Mode Quantization. Please see the master (unstable) docs for FX Graph Mode Quantization.

Eager Mode Quantization is a beta feature. The user needs to do fusion and specify where quantization and dequantization happen manually, and it only supports modules, not functionals.

FX Graph Mode Quantization is a new automated quantization framework in PyTorch, and currently it is a prototype feature. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although users might need to refactor the model to make it compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``). Note that FX Graph Mode Quantization is not expected to work on arbitrary models, since a model might not be symbolically traceable. We will integrate it into domain libraries like torchvision, and users will be able to quantize models similar to the ones in supported domain libraries with FX Graph Mode Quantization. For arbitrary models we will provide general guidelines, but to actually make it work, users might need to be familiar with ``torch.fx``, especially with how to make a model symbolically traceable.

New users of quantization are encouraged to try out FX Graph Mode Quantization first; if it does not work, they may follow the guideline of `using FX Graph Mode Quantization <https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide_tutorial.html>`_ or fall back to eager mode quantization.

The following table compares the differences between Eager Mode Quantization and FX Graph Mode Quantization:

+-----------------+-------------------+-------------------+
|                 |Eager Mode         |FX Graph           |
|                 |Quantization       |Mode               |
|                 |                   |Quantization       |
+-----------------+-------------------+-------------------+
|Release          |beta               |prototype          |
|Status           |                   |                   |
+-----------------+-------------------+-------------------+
|Operator         |Manual             |Automatic          |
|Fusion           |                   |                   |
+-----------------+-------------------+-------------------+
|Quant/DeQuant    |Manual             |Automatic          |
|Placement        |                   |                   |
+-----------------+-------------------+-------------------+
|Quantizing       |Supported          |Supported          |
|Modules          |                   |                   |
+-----------------+-------------------+-------------------+
|Quantizing       |Manual             |Automatic          |
|Functionals/Torch|                   |                   |
|Ops              |                   |                   |
+-----------------+-------------------+-------------------+
|Support for      |Limited Support    |Fully              |
|Customization    |                   |Supported          |
+-----------------+-------------------+-------------------+
|Quantization Mode|Post Training      |Post Training      |
|Support          |Quantization:      |Quantization:      |
|                 |Static, Dynamic,   |Static, Dynamic,   |
|                 |Weight Only        |Weight Only        |
|                 |                   |                   |
|                 |Quantization Aware |Quantization Aware |
|                 |Training:          |Training:          |
|                 |Static             |Static             |
+-----------------+-------------------+-------------------+
|Input/Output     |``torch.nn.Module``|``torch.nn.Module``|
|Model Type       |                   |(May need some     |
|                 |                   |refactors to make  |
|                 |                   |the model          |
|                 |                   |compatible with FX |
|                 |                   |Graph Mode         |
|                 |                   |Quantization)      |
+-----------------+-------------------+-------------------+

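To make the "Manual" entries in the table concrete, here is a minimal, hedged sketch of eager-mode post-training static quantization (the module ``M`` and the random calibration input are illustrative, not part of the documentation):

::

    import torch

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # manual quant/dequant placement, as noted in the table above
            self.quant = torch.quantization.QuantStub()
            self.conv = torch.nn.Conv2d(1, 1, 1)
            self.relu = torch.nn.ReLU()
            self.dequant = torch.quantization.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)
            x = self.relu(self.conv(x))
            return self.dequant(x)

    model_fp32 = M().eval()
    model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    # manual operator fusion, as noted in the table above
    model_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']])
    model_prepared = torch.quantization.prepare(model_fused)
    model_prepared(torch.randn(4, 1, 8, 8))   # calibrate on representative data
    model_int8 = torch.quantization.convert(model_prepared)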
Eager Mode Quantization
^^^^^^^^^^^^^^^^^^^^^^^

@ -388,94 +339,6 @@ To learn more about quantization aware training, please see the `QAT
tutorial
<https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html>`_.

(Prototype) FX Graph Mode Quantization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Quantization types supported by FX Graph Mode can be classified in two ways:

1.
   - Post Training Quantization (apply quantization after training, quantization parameters are calculated based on sample calibration data)
   - Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data)

2.
   - Weight Only Quantization (only weight is statically quantized)
   - Dynamic Quantization (weight is statically quantized, activation is dynamically quantized)
   - Static Quantization (both weight and activations are statically quantized)

These two ways of classification are independent, so theoretically we can have 6 different types of quantization.

The supported quantization types in FX Graph Mode Quantization are:

- Post Training Quantization

  - Weight Only Quantization
  - Dynamic Quantization
  - Static Quantization

- Quantization Aware Training

  - Static Quantization


There are multiple quantization types in post training quantization (weight only, dynamic and static) and the configuration is done through `qconfig_dict` (an argument of the `prepare_fx` function).
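Before the full example below, a small hedged sketch of what a more granular `qconfig_dict` can look like (the submodule name ``"head"`` is an assumed placeholder): the empty-string key sets the global qconfig, and ``"module_name"`` entries override it per submodule.

::

    import torch

    # hypothetical configuration: quantize everything with the fbgemm defaults,
    # but leave the submodule named "head" unquantized (a None qconfig skips it)
    qconfig_dict = {
        "": torch.quantization.get_default_qconfig("fbgemm"),
        "module_name": [("head", None)],
    }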
API Example::

    import torch.quantization.quantize_fx as quantize_fx
    import copy

    model_fp = UserModel(...)

    #
    # post training dynamic/weight_only quantization
    #

    # we need to deepcopy if we still want to keep model_fp unchanged after quantization since quantization apis change the input model
    model_to_quantize = copy.deepcopy(model_fp)
    model_to_quantize.eval()
    qconfig_dict = {"": torch.quantization.default_dynamic_qconfig}
    # prepare
    model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
    # no calibration needed when we only have dynamic/weight_only quantization
    # quantize
    model_quantized = quantize_fx.convert_fx(model_prepared)

    #
    # post training static quantization
    #

    model_to_quantize = copy.deepcopy(model_fp)
    qconfig_dict = {"": torch.quantization.get_default_qconfig('qnnpack')}
    model_to_quantize.eval()
    # prepare
    model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
    # calibrate (not shown)
    # quantize
    model_quantized = quantize_fx.convert_fx(model_prepared)

    #
    # quantization aware training for static quantization
    #

    model_to_quantize = copy.deepcopy(model_fp)
    qconfig_dict = {"": torch.quantization.get_default_qat_qconfig('qnnpack')}
    model_to_quantize.train()
    # prepare
    model_prepared = quantize_fx.prepare_qat_fx(model_to_quantize, qconfig_dict)
    # training loop (not shown)
    # quantize
    model_quantized = quantize_fx.convert_fx(model_prepared)

    #
    # fusion
    #
    model_to_quantize = copy.deepcopy(model_fp)
    model_fused = quantize_fx.fuse_fx(model_to_quantize)
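The ``# calibrate (not shown)`` step above simply runs representative data through the prepared model in inference mode; a hedged sketch (``calibration_data_loader`` is a placeholder name, not part of the example):

::

    with torch.no_grad():
        for inputs, _ in calibration_data_loader:   # hypothetical loader
            model_prepared(inputs)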
Please see the following tutorials for more information about FX Graph Mode Quantization:
- `User Guide on Using FX Graph Mode Quantization <https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide_tutorial.html>`_
- `FX Graph Mode Post Training Static Quantization <https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_static_tutorial.html>`_
- `FX Graph Mode Post Training Dynamic Quantization <https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic_tutorial.html>`_

Quantized Tensors
---------------------------------------
@ -484,6 +484,7 @@ Sparse tensor functions
+++++++++++++++++++++++

.. autofunction:: torch.sparse_coo_tensor
   :noindex:
.. autofunction:: torch.sparse.sum
.. autofunction:: torch.sparse.addmm
.. autofunction:: torch.sparse.mm

@ -563,5 +563,4 @@ Utilities
    promote_types
    use_deterministic_algorithms
    are_deterministic_algorithms_enabled
    vmap
    _assert
setup.py
@ -552,6 +552,50 @@ class build_ext(setuptools.command.build_ext.build_ext):
        with open('compile_commands.json', 'w') as f:
            f.write(new_contents)

class concat_license_files():
    """Merge LICENSE and LICENSES_BUNDLED.txt as a context manager

    LICENSE is the main PyTorch license, LICENSES_BUNDLED.txt is auto-generated
    from all the licenses found in ./third_party/. We concatenate them so there
    is a single license file in the sdist and wheels with all of the necessary
    licensing info.
    """
    def __init__(self):
        self.f1 = 'LICENSE'
        self.f2 = 'third_party/LICENSES_BUNDLED.txt'

    def __enter__(self):
        """Concatenate files"""
        with open(self.f1, 'r') as f1:
            self.bsd_text = f1.read()

        with open(self.f1, 'a') as f1:
            with open(self.f2, 'r') as f2:
                self.bundled_text = f2.read()
                f1.write('\n\n')
                f1.write(self.bundled_text)

    def __exit__(self, exception_type, exception_value, traceback):
        """Restore content of f1"""
        with open(self.f1, 'w') as f:
            f.write(self.bsd_text)


try:
    from wheel.bdist_wheel import bdist_wheel
except ImportError:
    # This is useful when wheel is not installed and bdist_wheel is not
    # specified on the command line. If it _is_ specified, parsing the command
    # line will fail before wheel_concatenate is needed
    wheel_concatenate = None
else:
    # Need to create the proper LICENSE.txt for the wheel
    class wheel_concatenate(bdist_wheel):
        """ check submodules on sdist to prevent incomplete tarballs """
        def run(self):
            with concat_license_files():
                super().run()


class install(setuptools.command.install.install):
    def run(self):
@ -724,6 +768,7 @@ def configure_extension_build():
        'build_ext': build_ext,
        'clean': clean,
        'install': install,
        'bdist_wheel': wheel_concatenate,
    }

    entry_points = {
@ -3,9 +3,11 @@
#include <test/cpp/jit/test_utils.h>

#include <ATen/core/qualified_name.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/frontend/resolver.h>
#include <torch/csrc/jit/serialization/import.h>
#include <torch/csrc/jit/serialization/import_source.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <torch/torch.h>

namespace torch {
@ -341,6 +343,20 @@ TEST(ModuleAPITest, Define) {
  AT_ASSERT(result.toTensor().item<float>() == 6);
}

TEST(ModuleAPITest, Freezing) {
  Module m("m");
  m.register_parameter("foo", torch::ones({}), false);
  m.define(R"(
    def forward(self, x, b : int = 4):
      return self.foo + x + b
  )");
  m.eval();
  auto frozen_mod = torch::jit::freeze(m);
  auto forward_g = frozen_mod.get_method("forward").graph();
  testing::FileCheck().check_not("GetAttr")->run(*forward_g);
}
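For readers following along in Python, a hedged sketch of the same freezing behavior that this C++ test checks (the module and names are illustrative, not from the repository):

::

    import torch

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # mirrors register_parameter("foo", torch::ones({}), false) above
            self.foo = torch.nn.Parameter(torch.ones(()), requires_grad=False)

        def forward(self, x, b: int = 4):
            return self.foo + x + b

    m = torch.jit.script(M()).eval()
    frozen = torch.jit.freeze(m)
    # freezing inlines parameters as constants, so the forward graph should no
    # longer contain prim::GetAttr nodes -- the same property the FileCheck
    # assertion above verifies
    assert "GetAttr" not in str(frozen.graph)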
TEST(ModuleAPITest, To_CUDA) {
  Module m("test");
  {
@ -14,7 +14,7 @@ import torch.distributed as dist
from typing import List, Any, Type, cast
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.optim import SGD
from torch.testing._internal.common_distributed import skip_if_no_gpu, MultiProcessTestCase
from torch.testing._internal.common_distributed import skip_if_no_gpu, skip_if_not_multigpu, MultiProcessTestCase
from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object
from torch.testing._internal.common_distributed import skip_if_rocm

@ -367,6 +367,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
        all_trainable()
        some_trainable()

    @skip_if_not_multigpu
    def test_collect_shards(self):
        """ Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer"""
        self.dist_init(self.rank)
test/distributed/test_c10d.py (Executable file → Normal file)
@ -20,6 +20,10 @@ change. This file contains two types of randomized tests:
   it's fine to increment the seed of the failing test (but you shouldn't need
   to increment it more than once; otherwise something is probably actually
   wrong).

3. `test_geometric_sample`, `test_binomial_sample` and `test_poisson_sample`
   are validated against `scipy.stats.` which are not guaranteed to be identical
   across different versions of scipy (namely, they yield invalid results in 1.7+)
"""

import math
@ -588,6 +592,20 @@ BAD_EXAMPLES = [
        {'scale': torch.tensor([0., 1.], requires_grad=True)},
        {'scale': torch.tensor([1., -1.], requires_grad=True)},
    ]),
    Example(LKJCholesky, [
        {
            'dim': -2,
            'concentration': 0.1
        },
        {
            'dim': 1,
            'concentration': 2.,
        },
        {
            'dim': 2,
            'concentration': 0.,
        },
    ]),
    Example(Laplace, [
        {
            'loc': torch.tensor([1., 1.], requires_grad=True),
@ -1376,7 +1394,7 @@ class TestDistributions(TestCase):
        self.assertFalse(RelaxedOneHotCategorical(probs=p, temperature=temp).sample().requires_grad)
        self.assertEqual(RelaxedOneHotCategorical(probs=p, temperature=temp).sample((2, 2)).size(), (2, 2, 3))
        self.assertEqual(RelaxedOneHotCategorical(probs=p, temperature=temp).sample((1,)).size(), (1, 3))
        self._gradcheck_log_prob(RelaxedOneHotCategorical, (temp, p))
        self._gradcheck_log_prob(lambda t, p: RelaxedOneHotCategorical(t, p, validate_args=False), (temp, p))

    def test_relaxed_one_hot_categorical_2d(self):
        probabilities = [[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]]
@ -1390,8 +1408,8 @@ class TestDistributions(TestCase):
        self.assertEqual(RelaxedOneHotCategorical(temp, p).sample().size(), (2, 3))
        self.assertEqual(RelaxedOneHotCategorical(temp, p).sample(sample_shape=(3, 4)).size(), (3, 4, 2, 3))
        self.assertEqual(RelaxedOneHotCategorical(temp, p).sample((6,)).size(), (6, 2, 3))
        self._gradcheck_log_prob(RelaxedOneHotCategorical, (temp, p))
        self._gradcheck_log_prob(RelaxedOneHotCategorical, (temp_2, p))
        self._gradcheck_log_prob(lambda t, p: RelaxedOneHotCategorical(t, p, validate_args=False), (temp, p))
        self._gradcheck_log_prob(lambda t, p: RelaxedOneHotCategorical(t, p, validate_args=False), (temp_2, p))

    @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
    def test_argmax_relaxed_categorical(self):
@ -1627,10 +1645,11 @@ class TestDistributions(TestCase):
                         'LogNormal(loc={}, scale={})'.format(mean, std))

    def test_logisticnormal(self):
        set_rng_seed(1)  # see Note [Randomized statistical tests]
        mean = torch.randn(5, 5).requires_grad_()
        std = torch.randn(5, 5).abs().requires_grad_()
        mean_1d = torch.randn(1).requires_grad_()
        std_1d = torch.randn(1).requires_grad_()
        std_1d = torch.randn(1).abs().requires_grad_()
        mean_delta = torch.tensor([1.0, 0.0])
        std_delta = torch.tensor([1e-5, 1e-5])
        self.assertEqual(LogisticNormal(mean, std).sample().size(), (5, 6))
@ -1648,9 +1667,11 @@ class TestDistributions(TestCase):
                                   1. / (1. + 1. + math.exp(1))]),
                     atol=1e-4, rtol=0)

        self._gradcheck_log_prob(LogisticNormal, (mean, std))
        self._gradcheck_log_prob(LogisticNormal, (mean, 1.0))
        self._gradcheck_log_prob(LogisticNormal, (0.0, std))
        # TODO: gradcheck seems to mutate the sample values so that the simplex
        # constraint fails by a very small margin.
        self._gradcheck_log_prob(lambda m, s: LogisticNormal(m, s, validate_args=False), (mean, std))
        self._gradcheck_log_prob(lambda m, s: LogisticNormal(m, s, validate_args=False), (mean, 1.0))
        self._gradcheck_log_prob(lambda m, s: LogisticNormal(m, s, validate_args=False), (0.0, std))

    @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
    def test_logisticnormal_logprob(self):
@ -2578,7 +2599,7 @@ class TestDistributions(TestCase):

        for dim in range(2, 5):
            log_probs = []
            lkj = LKJCholesky(dim, concentration=1.)
            lkj = LKJCholesky(dim, concentration=1., validate_args=True)
            for i in range(2):
                sample = lkj.sample()
                sample_tril = tril_matrix_to_vec(sample, diag=-1)
@ -2591,6 +2612,8 @@ class TestDistributions(TestCase):
            # for dim=2, pdf = 0.5 (jacobian adjustment factor is 0.)
            self.assertTrue(all([x == torch.tensor(0.5).log() for x in log_probs]))
            self.assertEqual(log_probs[0], log_probs[1])
            invalid_sample = torch.cat([sample, sample.new_ones(1, dim)], dim=0)
            self.assertRaises(ValueError, lambda: lkj.log_prob(invalid_sample))

    def test_independent_shape(self):
        for Dist, params in EXAMPLES:
@ -4498,6 +4521,35 @@ class TestValidation(TestCase):
            for param in params:
                Dist(validate_args=True, **param)

    def test_invalid_log_probs_arg(self):
        # Check that validation errors are indeed disabled,
        # but they might raise another error
        for Dist, params in EXAMPLES:
            if Dist == TransformedDistribution:
                # TransformedDistribution has a distribution instance
                # as the argument, so we cannot do much about that
                continue
            for param in params:
                d_nonval = Dist(validate_args=False, **param)
                d_val = Dist(validate_args=True, **param)
                for v in torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]):
                    # samples with incorrect shape must throw ValueError only
                    try:
                        log_prob = d_val.log_prob(v)
                    except ValueError:
                        pass
                    # get sample of correct shape
                    val = torch.full(d_val.batch_shape + d_val.event_shape, v)
                    # check samples with incorrect support
                    try:
                        log_prob = d_val.log_prob(val)
                    except ValueError as e:
                        if e.args and 'must be within the support' in e.args[0]:
                            try:
                                log_prob = d_nonval.log_prob(val)
                            except RuntimeError:
                                pass

    @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
    def test_invalid(self):
        for Dist, params in BAD_EXAMPLES:
@ -1508,7 +1508,7 @@ class TestFrozenOptimizations(JitTestCase):
        bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
        mod = torch.nn.Sequential(conv, bn)
        # set optimize to False here, by default freezing runs optimize_frozen_module
        frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize=False)
        frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize_numerics=False)
        # inspect frozen mod
        FileCheck().check("batch_norm").run(frozen_mod.graph)
        torch.jit.optimize_frozen_module(frozen_mod)
@ -182,7 +182,7 @@ class TestModels(TestCase):
        self.exportTest(toC(FakeQuantNet()), toC(x))

    @skipIfUnsupportedMinOpsetVersion(10)
    def test_qat_resnet(self):
    def test_qat_resnet_pertensor(self):
        # Quantize ResNet50 model
        x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))
        qat_resnet50 = resnet50()
@ -202,6 +202,27 @@ class TestModels(TestCase):

        self.exportTest(toC(qat_resnet50), toC(x))

    @skipIfUnsupportedMinOpsetVersion(13)
    def test_qat_resnet_per_channel(self):
        # Quantize ResNet50 model
        x = torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)
        qat_resnet50 = resnet50()

        qat_resnet50.qconfig = quantization.QConfig(
            activation=quantization.default_fake_quant,
            weight=quantization.default_per_channel_weight_fake_quant)
        quantization.prepare_qat(qat_resnet50, inplace=True)
        qat_resnet50.apply(torch.quantization.enable_observer)
        qat_resnet50.apply(torch.quantization.enable_fake_quant)

        _ = qat_resnet50(x)
        for module in qat_resnet50.modules():
            if isinstance(module, quantization.FakeQuantize):
                module.calculate_qparams()
        qat_resnet50.apply(torch.quantization.disable_observer)

        self.exportTest(toC(qat_resnet50), toC(x))

    @disableScriptTest()  # None type in outputs
    def test_googlenet(self):
        x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))
@ -11,13 +11,12 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None):

    for opset_version in opset_versions:
        self.opset_version = opset_version
        self.use_new_jit_passes = True
        self.onnx_shape_inference = True
        run_model_test(self, model, False,
                       input=inputs, rtol=rtol, atol=atol)

        if self.is_script_test_enabled and opset_version > 11:
            TestModels.use_new_jit_passes = True
            TestModels.onnx_shape_inference = True

            outputs = model(inputs)
            script_model = torch.jit.script(model)
            run_model_test(self, script_model, False, example_outputs=outputs,
@ -94,7 +94,8 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
                   example_outputs=None, do_constant_folding=True,
                   dynamic_axes=None, test_with_inputs=None,
                   input_names=None, output_names=None,
                   fixed_batch_size=False, dict_check=True):
                   fixed_batch_size=False, dict_check=True,
                   training=None):
    model.eval()
    if input is None:
        input = torch.randn(batch_size, 3, 224, 224, requires_grad=True)
@ -125,7 +126,7 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
                          example_outputs=output, do_constant_folding=do_constant_folding,
                          keep_initializers_as_inputs=self.keep_initializers_as_inputs,
                          dynamic_axes=dynamic_axes, input_names=input_names,
                          output_names=output_names, fixed_batch_size=fixed_batch_size, training=None,
                          output_names=output_names, fixed_batch_size=fixed_batch_size, training=training,
                          onnx_shape_inference=self.onnx_shape_inference,
                          use_new_jit_passes=self.use_new_jit_passes)
    # compute onnxruntime output prediction
@ -230,14 +231,16 @@ class TestONNXRuntime(unittest.TestCase):

    def run_test(self, model, input, rtol=1e-3, atol=1e-7, do_constant_folding=True,
                 batch_size=2, use_gpu=True, dynamic_axes=None, test_with_inputs=None,
                 input_names=None, output_names=None, fixed_batch_size=False, dict_check=True):
                 input_names=None, output_names=None, fixed_batch_size=False, dict_check=True,
                 training=None):
        def _run_test(m):
            return run_model_test(self, m, batch_size=batch_size,
                                  input=input, use_gpu=use_gpu, rtol=rtol, atol=atol,
                                  do_constant_folding=do_constant_folding,
                                  dynamic_axes=dynamic_axes, test_with_inputs=test_with_inputs,
                                  input_names=input_names, output_names=output_names,
                                  fixed_batch_size=fixed_batch_size, dict_check=dict_check)
                                  fixed_batch_size=fixed_batch_size, dict_check=dict_check,
                                  training=training)
        if self.is_script_test_enabled and self.use_new_jit_passes:
            script_model = torch.jit.script(model)
            _run_test(script_model)
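The new ``training`` argument threaded through these helpers ultimately reaches ``torch.onnx.export``; a hedged sketch of what that looks like outside the test harness (the model and shapes are illustrative):

::

    import io
    import torch

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Dropout(0.5))
    buf = io.BytesIO()
    torch.onnx.export(
        model, (torch.randn(2, 4),), buf,
        opset_version=12,
        training=torch.onnx.TrainingMode.TRAINING,  # keep training-only ops such as Dropout
        do_constant_folding=False,                  # constant folding is an eval-mode optimization
    )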
@ -673,6 +676,49 @@ class TestONNXRuntime(unittest.TestCase):
        x = {"test_key_in": torch.randn(1, 2, 3)}
        self.run_test(MyModel(), (x, {}))

    @disableScriptTest()
    def test_dict_output(self):
        class DictModelOutput(OrderedDict):
            tensor_out: torch.Tensor
            tuple_out: Optional[Tuple[torch.Tensor]] = None
            list_out: Optional[List[torch.Tensor]] = None

        class MyModel(torch.nn.Module):
            def forward(self, a, b, c, d):
                return DictModelOutput(
                    tensor_out=a,
                    tuple_out=(b, c),
                    list_out=[d],
                )

        a = torch.randn(2, 3)
        b = torch.randn(2, 3)
        c = torch.randn(2, 3)
        d = torch.randn(2, 3)
        self.run_test(MyModel(), (a, b, c, d))

    def test_tuple_output(self):
        class MyModel(torch.nn.Module):
            def forward(self, a, b, c, d):
                return a, (b, c), d

        a = torch.randn(2, 3)
        b = torch.randn(2, 3)
        c = torch.randn(2, 3)
        d = torch.randn(2, 3)
        self.run_test(MyModel(), (a, b, c, d))

    def test_nested_tuple_output(self):
        class MyModel(torch.nn.Module):
            def forward(self, a, b, c, d):
                return a, ((b,), (c, d))

        a = torch.randn(2, 3)
        b = torch.randn(2, 3)
        c = torch.randn(2, 3)
        d = torch.randn(2, 3)
        self.run_test(MyModel(), (a, b, c, d))

    @disableScriptTest()
    def test_optional_inputs_with_no_optionals(self):
        class NoOptionalModel(torch.nn.Module):
@ -2716,7 +2762,7 @@ class TestONNXRuntime(unittest.TestCase):
        self.run_test(ScatterModel(), input=(input, indices, values))

        @torch.jit.script
        def scatter_sum(src: torch.Tensor, index: torch.Tensor):
        def scatter_sum(src: torch.Tensor, index: torch.Tensor):
            size = src.size()
            out = torch.zeros(size, dtype=src.dtype)
            return out.scatter_add_(1, index, src)
@ -5587,6 +5633,42 @@ class TestONNXRuntime(unittest.TestCase):
        mat2 = torch.ones(2, 3)
        self.run_test(M(), input=(cond, mat1, mat2))

    @skipIfUnsupportedMinOpsetVersion(10)  # ONNX IsInf op is added in opset 10.
    def test_isinf(self):
        class M(torch.nn.Module):
            def forward(self, x):
                return x.isinf()

        x = torch.tensor([[1, 2, float('inf')], [2, float('nan'), float('inf')]])
        self.run_test(M(), (x, ))

    @skipIfUnsupportedMinOpsetVersion(9)  # ONNX IsNaN op is added in opset 9.
    def test_isnan(self):
        class M(torch.nn.Module):
            def forward(self, x):
                return x.isnan()

        x = torch.tensor([[1, 2, float('inf')], [2, float('nan'), float('inf')]])
        self.run_test(M(), (x, ))

    @skipIfUnsupportedMinOpsetVersion(9)
    def test_any(self):
        class M(torch.nn.Module):
            def forward(self, x):
                return x.any()

        x = torch.tensor([[True, False], [False, False]])
        self.run_test(M(), (x, ))

    @skipIfUnsupportedMinOpsetVersion(9)
    def test_all(self):
        class M(torch.nn.Module):
            def forward(self, x):
                return x.all()

        x = torch.tensor([[True, False], [False, False]])
        self.run_test(M(), (x, ))

    def test_dropout(self):
        class M(torch.nn.Module):
            def __init__(self):
@ -5847,6 +5929,67 @@ class TestONNXRuntime(unittest.TestCase):

        self.assertRaises(TypeError, run_model)

    @skipIfUnsupportedMinOpsetVersion(9)
    def test_embedding(self):
        class EmbedModel(torch.nn.Module):
            def forward(self, input, emb):
                return torch.nn.functional.embedding(input, emb, padding_idx=1)

        model = EmbedModel()
        x = torch.randint(4, (4, ))
        x[2] = x[0] = 1
        embedding_matrix = torch.rand(10, 3)
        self.run_test(model, (x, embedding_matrix))

        x = torch.randint(4, (4, 3, 2))
        x[2] = 1
        x[0][1] = 1
        self.run_test(model, (x, embedding_matrix))
        self.run_test(model, (x, embedding_matrix), training=torch.onnx.TrainingMode.TRAINING)

        class EmbedModelWithoutPaddingIdx(torch.nn.Module):
            def forward(self, input, emb):
                return torch.nn.functional.embedding(input, emb)

        model = EmbedModelWithoutPaddingIdx()
        x = torch.randint(4, (4, 3, 2))
        self.run_test(model, (x, embedding_matrix))

    @skipIfUnsupportedMinOpsetVersion(9)
    def test_embedding_module(self):
        class EmbedModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.emb = torch.nn.Embedding(4, 3, padding_idx=1)
                self.emb2 = torch.nn.Embedding(4, 3, padding_idx=1)
                with torch.no_grad():
                    self.emb2.weight[1] = torch.ones(3)

            def forward(self, input):
                return self.emb(input), self.emb2(input)

        model = EmbedModel()
        x = torch.randint(4, (4, ))
        x[2] = x[0] = 1
        self.run_test(model, (x, ))

        x = torch.randint(4, (4, 3, 2))
        x[2] = 1
        x[0][1] = 1
        self.run_test(model, (x, ))

        class EmbedModelWithoutPaddingIdx(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.emb = torch.nn.Embedding(4, 3)

            def forward(self, input):
                return self.emb(input)

        model = EmbedModelWithoutPaddingIdx()
        x = torch.randint(4, (4, 3, 2))
        self.run_test(model, (x, ))

    def _dispatch_rnn_test(self, name, *args, **kwargs):
        if name == 'elman':
            self._elman_rnn_test(*args, **kwargs)
@ -5998,6 +6141,20 @@ class TestONNXRuntime(unittest.TestCase):
        x = torch.randn(6, 4, 3, 3)
        self.run_test(FakeQuantizePerTensorModel(), (x))

    @skipIfUnsupportedMinOpsetVersion(13)
    def test_fake_quantize_per_channel(self):
        class FakeQuantizePerChannelModel(torch.nn.Module):
            def forward(self, input):
                amax = torch.ones(4)
                scale = amax / 127.
                zero_point = torch.zeros_like(amax, dtype=torch.long)
                # Quantize twice to test different branches
                y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255)
                return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127)

        x = torch.randn(6, 4, 3, 3)
        self.run_test(FakeQuantizePerChannelModel(), (x))

    def test_batchnorm_training(self):
        class MyModule(torch.nn.Module):
            def __init__(self):
@ -2,6 +2,8 @@ import unittest
import onnxruntime  # noqa
import torch

from torch.cuda.amp import autocast

from test_pytorch_common import skipIfUnsupportedMinOpsetVersion
from test_pytorch_common import skipIfNoCuda

@ -24,6 +26,43 @@ class TestONNXRuntime_cuda(unittest.TestCase):
        x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
        self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5)

    @skipIfUnsupportedMinOpsetVersion(9)
    @skipIfNoCuda
    def test_layer_norm_fp16(self):
        class LayerNormModel(torch.nn.Module):
            def __init__(self):
                super(LayerNormModel, self).__init__()
                self.layer_norm = torch.nn.LayerNorm([10, 10])

            def forward(self, x):
                return self.layer_norm(x)

        x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
        self.run_test(LayerNormModel(), x, rtol=1e-3, atol=1e-5)


    @skipIfUnsupportedMinOpsetVersion(12)
    @skipIfNoCuda
    def test_softmaxCrossEntropy_fusion_fp16(self):
        class FusionModel(torch.nn.Module):
            def __init__(self):
                super(FusionModel, self).__init__()
                self.loss = torch.nn.NLLLoss(reduction='none')
                self.m = torch.nn.LogSoftmax(dim=1)

            @autocast()
            def forward(self, input, target):
                output = self.loss(self.m(2 * input), target)
                return output

        N, C = 5, 4
        input = torch.randn(N, 16, dtype=torch.float16, device=torch.device('cuda'))
        target = torch.empty(N, dtype=torch.long, device=torch.device('cuda')).random_(0, C)

        # using test data containing default ignore_index=-100
        target[target == 1] = -100
        self.run_test(FusionModel(), (input, target))

TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp
TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test
@ -2,7 +2,7 @@

import sys
import os

import unittest
# torch
import torch
import torch.nn as nn
@ -11,7 +11,7 @@ import torch.nn.quantized.dynamic as nnqd
import torch.nn.intrinsic.quantized as nniq

# Testing utils
from torch.testing._internal.common_utils import TestCase
from torch.testing._internal.common_utils import TestCase, IS_AVX512_VNNI_SUPPORTED
from torch.testing._internal.common_quantized import override_qengines, qengine_is_fbgemm

def remove_prefix(text, prefix):
@ -216,6 +216,7 @@ class TestSerialization(TestCase):
    # TODO: graph mode quantized conv3d module

    @override_qengines
    @unittest.skipIf(IS_AVX512_VNNI_SUPPORTED, "This test fails on machines with AVX512_VNNI support. Ref: GH Issue 59098")
    def test_lstm(self):
        class LSTMModule(torch.nn.Module):
            def __init__(self):
@ -872,7 +872,7 @@ class TestFakeQuantize(TestCase):
        scale, zero_point = float(scale), int(zero_point)
        quant_min, quant_max = obs._calculate_qmin_qmax()

        Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask(
        Y_test = torch.fake_quantize_per_tensor_affine(
            X, scale, zero_point, quant_min, quant_max)
        Y_ref = _fake_quantize_per_tensor_affine_reference(
            X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
@ -899,7 +899,7 @@ class TestFakeQuantize(TestCase):
        quant_min, quant_max = obs._calculate_qmin_qmax()

        # forward pass
        Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask(
        Y_test = torch.fake_quantize_per_tensor_affine(
            X, scale, zero_point, quant_min, quant_max)
        Y_ref = _fake_quantize_per_tensor_affine_reference(
            X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
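For reference, a hedged sketch of calling the public per-tensor op that these updated tests now exercise directly (the scale, zero_point, and quantization range values are arbitrary illustrations):

::

    import torch

    x = torch.randn(4)
    # quantize-dequantize x with scale 0.1 and zero_point 0 over the uint8 range
    y = torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)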
@ -1246,7 +1246,7 @@ class TestFakeQuantize(TestCase):

        Y = _fake_quantize_per_channel_affine_reference(
            X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max)
        Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
        Y_prime = torch.fake_quantize_per_channel_affine(
            X, scale, zero_point, axis, quant_min, quant_max)
        np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)

@ -1339,7 +1339,7 @@ class TestFakeQuantize(TestCase):
        zero_point = zero_point.to(torch.int64)
        quant_min, quant_max = obs._calculate_qmin_qmax()
        X.requires_grad_()
        Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
        Y_prime = torch.fake_quantize_per_channel_affine(
            X, scale, zero_point, axis, quant_min, quant_max)
        dout = torch.rand(X.shape, dtype=torch.float).to(device)
        dX = _fake_quantize_per_channel_affine_grad_reference(
Some files were not shown because too many files have changed in this diff.