Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60155
For intermediate tensors, we need to convert them to static images when doing GPU -> CPU synchronization.
ghstack-source-id: 131540760
Test Plan:
- CI
- buck test pp-macos
Reviewed By: SS-JIA
Differential Revision: D29126278
fbshipit-source-id: cd50b5f104e0161ec7fcfcc2c51785f241e48704
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59953
The following modifications were made to the equalization
observers due to design changes:
- [InputEqualizationObserver] Replaced `calculate_qparams()` with
`calculate_scaled_minmax()` since we will need to return the scaled
min/max values to update the following input quantization observer
- [WeightEqualizationObserver] We no longer need a row observer since
this will be taken care of by the following weight quantization observer
- [WeightEqualizationObserver] Following the previous comment, we no
longer need to calculate the scaled qparam values. Instead, we will use
the equalization scale to later scale the weights and the qparams will
be taken care of by the weight quantization observer.
Test Plan:
`python test/test_quantization.py
TestEqualizeFx.test_input_weight_eq_observer`
Imported from OSS
Reviewed By: supriyar
Differential Revision: D29135332
fbshipit-source-id: be7e468273c8b62fc183b1e1ec50f6bd6d8cf831
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59739
Created an EqualizationQConfig specifically for equalization.
This inherits from QConfig and is used to distinguish between inserting
an input observer and an output observer. Since the output observer
field is included in the EqualizationQConfig, we no longer need an
output observer field in the _InputEqualizationObserver.
Test Plan:
compiles
Imported from OSS
Reviewed By: ezyang
Differential Revision: D29135298
fbshipit-source-id: 3dde9c029c291467ff0a0845f0fc9c44573fc6f6
Summary:
**Overview:**
This refactors the `ZeroRedundancyOptimizer` implementation to assume single-process single-device (SPSD) instead of accommodating single-process multiple-device (SPMD). `DistributedDataParallel` [retired SPMD recently](https://github.com/pytorch/pytorch/issues/47012), so this change follows the same spirit.
**Changes:**
The parent-class `Optimizer` constructor permits the input argument `params` to be both an `iterable` of `torch.Tensor` and an `iterable` of `dict`. The latter usage is for initializing the optimizer with multiple `param_group`s to start. However, currently, `ZeroRedundancyOptimizer` only supports the former usage, requiring explicit calls to `add_param_group()` for multiple `param_group`s. Given the existing implementation, the type error would be silent and not manifest until much later (e.g. since `super().__init__()` would have no issue). Hence, I added a series of checks to begin the `__init__()` function (encapsulated in `_verify_and_init_params()`). A postcondition of this validation is that `self._all_params` is a non-empty list of all model parameters.
Additionally, I added a check for SPSD usage assuming that all model parameters exist on the same device. This logic is included in `_verify_same_param_device()` and is called immediately after the `params` type-checking. Support for SPSD with model parameters sharded across devices may be added in the future.
Related to the aforementioned postcondition on `self._all_params`, previously there was undefined behavior resulting from the typing of the passed-in `params` input argument. If `params` was a `List`, then the usage of `self._reference_is_trainable_mask` was as expected. However, if `params` was a generator (e.g. as in the canonical usage of passing `model.parameters()`), then the ensuing behavior was divergent. This is because after a generator is iterated over, it is empty (see the sketch below). As a result, when we set `self._all_params = params` [in the old code](68d690ffbd/torch/distributed/optim/zero_redundancy_optimizer.py (L165)), `self._all_params` is empty, reducing `training_mask` to always be the empty list. This causes missed calls to `_update_trainable()` in `step()`. (A consequence of this is that `test_pytorch_parity()`, which is renamed to `test_local_optimizer_parity()`, now outputs warnings about the trainable parameters changing.)
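As a standalone illustration of the generator pitfall (a sketch, not the ZeRO code itself):
```python
import torch

# A generator can only be consumed once; a second pass yields nothing.
params = (p for p in [torch.nn.Parameter(torch.randn(2))])
first_pass = list(params)   # captures the parameter
second_pass = list(params)  # [] -- the generator is already exhausted
```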
The existing implementation assumes that all parameters share the same dense type when allocating the bucket buffers. This change preserves this assumption, which may be removed in the future. I added a check for this in `_verify_same_dense_param_type()` to avoid erroring silently later on. Note that it is insufficient to simply check for the same `dtype` since dense and sparse tensors may share the same `dtype` but require differing storage sizes. One solution is to use `torch.typename()` as the means for comparison.
---
The primary change in this refactor is with respect to `self._per_device_params` and `self.buckets`. `self._per_device_params` mapped `torch.device` to `List[List[Parameter]]`. The keys were the devices that the model parameters exist on, and the values designated which ranks are assigned to updating those parameters. `self.buckets` mapped `torch.device` to `List[torch.Tensor]`. The keys were the same as `self._per_device_params`, and the values were the buckets for that device. These two data structures were only ever used together. Hence, because the notions of device and rank are now in 1:1 correspondence, we can eliminate the former completely and only use rank. As such, I removed `self._per_device_params` and made `self.buckets` directly a list of buckets (i.e. `torch.Tensor`s).
Iteration over the parameters of a rank for a given device could be simplified to just iteration over the parameters of a rank. Hence, I relied on `self.partition_parameters()` now for that iteration. Refer to `_setup_flat_buffers()` and `step()` for these changes.
One convenient side effect of removing `self._per_device_params` is that there is no longer the re-computation of the parameter partitions mentioned at the end of this [PR](https://github.com/pytorch/pytorch/pull/59410).
---
I changed the data structure `self._index_to_param_cache` from a `dict` to a `List` because the domain is `0`, `1`, ..., `k-1` where `k` is the number of parameters. This should yield marginal improvements in memory usage and access speed.
`_sync_param_groups()` is a static method, meaning it can be called either via `self._sync_param_groups()` or `ZeroRedundancyOptimizer._sync_param_groups()` when inside the class. I made the usage consistently `self._sync_param_groups()` rather than have instances of both.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59834
Test Plan:
I ran through the existing test suite on an AI AWS cluster:
```
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4 python test/distributed/optim/test_zero_redundancy_optimizer.py
```
Note: The only test where `parameters_as_bucket_view` is `True` is `test_step_with_closure()`, meaning that that is the test that exercises the core changes of removing `self._per_device_params` and changing `self.buckets`.
Also, I added tests for the `ZeroRedundancyOptimizer` constructor changes and the assumption checks.
Reviewed By: mrshenli
Differential Revision: D29177065
Pulled By: andwgu
fbshipit-source-id: 0ff004ae3959d6d3b521024028c7156bfddc93d8
Summary:
A few more quality of life improvements for NNC's python bindings:
- Use standard `torch.dtype`s (rather than `te.Dtype`)
- Make names optional (they don't seem to matter)
- Make shapes optional
- A few implicit conversions to make code cleaner
Followup to https://github.com/pytorch/pytorch/issues/59920
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60058
Reviewed By: bertmaher
Differential Revision: D29151953
Pulled By: jansel
fbshipit-source-id: c8286e329eb4ee3921ca0786e17248cf6a898bd8
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60003
**Summary**
`infer_concrete_type_builder` in `_recursive.py` assumes `__constants__`
is a `set` if it exists as an attribute on the module being scripted.
Instead, it should create a set out of whatever `__constants__` is.
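As a rough sketch of the more permissive handling (using a hypothetical module, not the actual `_recursive.py` code):
```python
import torch

class M(torch.nn.Module):
    # __constants__ given as a list rather than a set
    __constants__ = ["dim"]

    def __init__(self):
        super().__init__()
        self.dim = 1

# Build a set from whatever __constants__ is instead of assuming it already is one.
constants = set(getattr(M(), "__constants__", ()))
print(constants)  # {'dim'}
```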
**Test Plan**
Ran code from the issue.
**Fixes**
This commit fixes #59947.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D29174243
Pulled By: SplitInfinity
fbshipit-source-id: aeb8bded80038da35478714b6a697a766ac447f5
Summary:
Before this PR `CUDA11OrLater` was incorrectly set to `False` when `torch.version.cuda == "11.0"`.
`torch.version.cuda` returns major and minor CUDA versions, it doesn't return patch info.
LooseVersion comparison was calling `[11, 0] >= [11, 0, 0]` which evaluates to `False`.
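For illustration (not PyTorch code), the comparison can be reproduced directly:
```python
from distutils.version import LooseVersion

print(LooseVersion("11.0").version)    # [11, 0]
print(LooseVersion("11.0.0").version)  # [11, 0, 0]
print(LooseVersion("11.0") >= LooseVersion("11.0.0"))  # False, since [11, 0] < [11, 0, 0]
```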
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60010
Reviewed By: mruberry
Differential Revision: D29147107
Pulled By: ezyang
fbshipit-source-id: bd9ed076337b4d32bf1c3376b8f7ae15dbc4d08d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60057
This ensures that if a function was `wrap`'d before symbolic tracing and then passed into the transformer, it will still be wrapped.
Test Plan: Added test to `test_fx.py`
Reviewed By: jamesr66a
Differential Revision: D29151191
fbshipit-source-id: 93560be59505bdcfe8d4f013e21d4719788afd59
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60044
In #59709 I attempted to fix the expecttest machinery to work in Python
3.8. However, I noticed that it would fail to do substitutions in this
case:
```
self.assertExpectedInline(
foo(),
"""bar"""
)
```
This is because the triple quoted string is not on the same line as the
backtrace line number (at the very beginning), and for safety reasons
the preexisting regex refused to search beyond the first line. This
wasn't a big deal prior to Python 3.8 because the flipped version of
the regex simply required the triple quoted string to be flush with
the end of the statement (which it typically was!) But it is a big deal
now that we only have the start of the statement.
I couldn't think of a way to fix this in the current model, so I decided
to call in the big guns. Instead of trying to do the regex with only
the start xor end line number, I now require you provide BOTH line numbers,
and we will only regex within this range. The way we compute these line
numbers is by parsing the Python test file with ast, and then searching
through statements until we find one that is consistent with the line
number reported by the backtrace. If we don't find anything, we
conservatively assume that the string lies exactly in the backtrace
(and you'll probably fail the substitution in that case.)
The resulting code is quite a lot simpler (no more reversed regex) and
hopefully more robust, although I suppose we are going to have to do
some field testing.
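A rough sketch of the line-range computation described above (my own illustration, not the actual expecttest code; assumes Python 3.8+ for `end_lineno`):
```python
import ast

def statement_line_range(source: str, lineno: int):
    """Find (start, end) line numbers of the innermost statement containing lineno."""
    best = None
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.stmt) and node.lineno <= lineno <= node.end_lineno:
            # Keep the narrowest statement that still contains the line.
            if best is None or node.end_lineno - node.lineno < best[1] - best[0]:
                best = (node.lineno, node.end_lineno)
    # Conservatively assume the string lies exactly on the reported line.
    return best if best is not None else (lineno, lineno)
```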
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: walterddr
Differential Revision: D29146943
Pulled By: ezyang
fbshipit-source-id: 2c24abc3acd4275c5b3a8f222d2a60cbad5e8c78
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59773
Current quantize_per_tensor takes float scale and int zero_point, which does not work with Proxy,
this PR adds a quantize_per_tensor overload that takes Tensor scale and zero_point instead.
Test Plan:
Tested locally that following runs without errors:
```python
import torch
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.fx.experimental import normalize
class TestModule(torch.nn.Module):
    def forward(self, x):
        return x + x
mod = TestModule()
mod.eval()
config = {"": torch.quantization.get_default_qconfig("fbgemm")}
mod = prepare_fx(mod, config)
mod = convert_fx(mod)
mod = torch.fx.Transformer(mod).transform()
```
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D29019862
fbshipit-source-id: c0176040f3b73f0a30516ed17d261b44cc658407
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58290
This is a helper function to get some Python source code loaded
on each interpreter without having to use the standard import system
or packages. Useful for debugging or for writing wrapper classes for
handling loaded modules.
Test Plan: Imported from OSS
Reviewed By: wconstab
Differential Revision: D28435306
Pulled By: zdevito
fbshipit-source-id: b85c16346b9001cd7350d65879cb990098060813
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60070
PyTorch pull request https://github.com/pytorch/pytorch/pull/57333 changed high_resolution_clock to system_clock but missed one location in profiler_kineto.cpp.
On some platforms (e.g. Windows), high_resolution_clock and system_clock do not map to the same underlying clock and therefore we get mixed timestamps on some platforms.
Reviewed By: wesolwsk
Differential Revision: D29155809
fbshipit-source-id: a6de6b4d550613f26f5577487c3c53716896e219
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60048
This changes clang-tidy in lint.yml to pull the raw diff from GitHub and parse that rather than use the PRs base revision. The base revision can cause the spurious inclusion of files not changed in the PR as in https://github.com/pytorch/pytorch/pull/59967/checks?check_run_id=2832565901. We could be smarter about how we query git, but this approach ends up being simpler since we just need to search for the diff headers in the .diff file.
See https://github.com/pytorch/pytorch/pull/60049/checks?check_run_id=2834140350 for an example CI run with this on
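For illustration, a minimal sketch of the diff-header parsing idea (assuming the standard `diff --git a/<path> b/<path>` header format; the actual script may differ):
```python
import re

def changed_files(diff_text: str):
    # Collect the b/ paths from each "diff --git" header in the raw .diff.
    return re.findall(r"^diff --git a/\S+ b/(\S+)$", diff_text, flags=re.MULTILINE)
```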
Test Plan: Imported from OSS
Reviewed By: samestep
Differential Revision: D29148886
Pulled By: driazati
fbshipit-source-id: ca23446d5cc8938d1345f272afe77b9ee8898b74
Summary:
Currently, S3 test stats don't support parsing PR stats.
Changes to s3_stats_parser:
1. Test stats are uploaded to `test_times/{sha1}/{job}` and `pr_test_times/{pr}/{sha1}/{job}` separately, so we need parsing logic for both.
2. PR stats need a timestamp attached for ordering, since PR commits can be force-pushed.
Changes to run_test.py:
1. Reorder tests based on previous PR stats if available.
2. Fall back to the file-change option if reordering is not enabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60026
Test Plan:
- CI.
- local repro: please run:
```
CIRCLE_JOB="pytorch_linux_bionic_py3_6_clang9_noarch_test" CIRCLE_PR_NUMBER=60057 IN_CI=1 ENABLE_PR_HISTORY_REORDERING=1 python test/run_test.py
```
Reviewed By: samestep
Differential Revision: D29164754
Pulled By: walterddr
fbshipit-source-id: 206688e0fb0b78d1c9042c07243da1fbf88a924b
Summary:
Fixes https://github.com/pytorch/pytorch/issues/60016
For CUDA 9.2:
- OptionalBase did not check `is_arrayref`
- constexpr does not seem to be allowed to raise an exception on CUDA 9.2
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60017
Reviewed By: malfet
Differential Revision: D29139515
Pulled By: ejguan
fbshipit-source-id: 4f4f6d9fe6a5f2eadf913de0a9781cc9f2e6ac6f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60114
Same as title.
Test Plan: Unit test (test_kernel_launch_checks.py) is passing.
Reviewed By: ngimel
Differential Revision: D29169538
fbshipit-source-id: ba4518dcb1a4713144d92faec2bb5bdf656ff7c5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59719
Added filestore functionality to the c10d backend. FileStore will create a temporary file in the /tmp directory to use if it is selected as the store type. Appropriate tests were added as well.
FileStore was modified to expose the path field for testing. It was also modified so that the numWorkers field in the constructor is optional (defaulting to -1). A negative value indicates there is not a fixed number of workers; in this case, no attempt is made to clean up the file at the end.
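For illustration, a hedged sketch using the Python FileStore binding (the path is illustrative; per the description above, a negative worker count means no fixed number of workers and no file cleanup):
```python
import torch.distributed as dist

store = dist.FileStore("/tmp/example_filestore", -1)
store.set("key", "value")
print(store.get("key"))  # b'value'
```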
Test Plan: Unit tests for creating a c10d backend with filestore and simple error handling.
Reviewed By: cbalioglu, H-Huang
Differential Revision: D28997436
fbshipit-source-id: 24c9b2c9b13ea6c947e8b1207beda892bdca2217
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60110
file_diff_from_base is currently broken for ghstack PRs, since it fails
to find a merge base.
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: driazati
Differential Revision: D29168767
Pulled By: seemethere
fbshipit-source-id: 580a909aa392541769cbbfdc6acce1e6c5d1c341
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60113
The tutorial link in the docs was to an fb-only colab.
Test Plan: Imported from OSS
Reviewed By: SplitInfinity
Differential Revision: D29169818
Pulled By: suo
fbshipit-source-id: 374807c234a185bd515b8ffe1300e6cf8d821636
Summary: We updated the training scripts and re-trained the ResNeXt model with the msuru_suru_union and ig_msuru_suru_union datasets.
Test Plan:
Main command line to run:
*./deeplearning/projects/classy_vision/fb/projects/msuru_suru/scripts/train_cluster.sh*
Config we used is *msuru_suru_config.json*, which is "Normal ResNeXt101 with finetunable head".
Experiments:
- msuru_suru_union f279939874
- Train/test split
- msuru_suru_union_dataset_train_w_shard: 143,632,674 rows
- msuru_suru_union_dataset_test_w_shard: 1,831,236 rows
- Results
{F625232741}
{F625232819}
- ig_msuru_suru_union f279964200
- Train/test split
- ig_msuru_suru_union_dataset_train_w_shard: 241,884,760 rows
- ig_msuru_suru_union_dataset_test_w_shard: 3,477,181 rows
- Results
{F625234126}
{F625234457}
Differential Revision: D29154971
fbshipit-source-id: d534d830020f4f8e596bb6b941966eb84a1e8adb
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60024
Disables windows GPU jobs on CircleCI since they have been migrated to
GHA
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: malfet
Differential Revision: D29137287
Pulled By: seemethere
fbshipit-source-id: 204e0c9232201a36a557cd0843e31d34269cc722
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59988
As we broaden operator support, putting all the implementations into
kernel.cpp is getting unwieldy. Let's factor them out into the "operators"
subdirectory.
This diff is big but it's entirely code movement; I didn't change anything,
other than to expose a few utilities in kernel.h.
ghstack-source-id: 131405139
Test Plan: CI
Reviewed By: ZolotukhinM
Differential Revision: D29115916
fbshipit-source-id: ba0df1d8dd4a108b584da3baf168407e966b2c78
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60064
This implements a host saturation optimization to maximize the utilization of the available devices.
It uses a greedy heuristic to replicate all partitions on the used devices to another set of idle devices with enough memory.
The added unittest shows an example as follows:
```
partition_0: 192 bytes; partition_1: 48 bytes
dev_0: 200 bytes, [partition_0]
dev_1: 200 bytes, [partition_1]
dev_2: 100 bytes,
dev_3: 100 bytes,
dev_4: 200 bytes,
dev_5: 100 bytes
```
Before host saturation, `partition_0` is assigned to dev_0 and `partition_1` is assigned to dev_1.
After host saturation, `partition_0` is replicated to dev_4 simply because it's the only device that can hold all partitions on dev_0. `partition_1` is replicated to dev_2 because it has the smallest memory that is still large enough to hold all partitions on dev_1.
Test Plan:
```
buck test mode/opt //caffe2/test:test_fx_experimental -- --exact 'caffe2/test:test_fx_experimental - test_saturate_host (test_fx_experimental.TestFXExperimental)'
Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/8444249343103429
✓ ListingSuccess: caffe2/test:test_fx_experimental - main (1.322)
✓ Pass: caffe2/test:test_fx_experimental - test_saturate_host (test_fx_experimental.TestFXExperimental) (1.322)
Summary
Pass: 1
ListingSuccess: 1
```
An e2e test will be added to `test_fx_glow.py` in a followup diff.
Reviewed By: gcatron
Differential Revision: D29039998
fbshipit-source-id: 57518aadf668f7f05abd6ff73224c16b5d2a12ac
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60001
Fix the aten::to schema to reflect that the output may alias input.
Test Plan: Added new unit tests.
Reviewed By: ezyang
Differential Revision: D29121620
fbshipit-source-id: c29b6aa22d367ffedf06e47116bc46b3e188c39c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60056
Previously we put the whole graph as a single partition onto a device with maximum memory if possible, but the code assumed that the first logical device always has the maximum memory.
This diff fixes this issue and updates the unittest to reflect such a corner case.
Test Plan:
```
buck test mode/opt //caffe2/test:test_fx_experimental -- --exact 'caffe2/test:test_fx_experimental - test_find_single_partition (test_fx_experimental.TestFXExperimental)'
Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/6473924507772744
✓ ListingSuccess: caffe2/test:test_fx_experimental - main (1.357)
✓ Pass: caffe2/test:test_fx_experimental - test_find_single_partition (test_fx_experimental.TestFXExperimental) (1.206)
Summary
Pass: 1
ListingSuccess: 1
```
Reviewed By: gcatron
Differential Revision: D29118715
fbshipit-source-id: cac6a1f0d2f47717446dcc80093bbcf362663859
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60043
And add a unit test
Test Plan: new unit test
Reviewed By: navahgar
Differential Revision: D29146547
fbshipit-source-id: 31532926032dbef70d163930f3d8be160f5eacc3
Summary:
Add an early exit in the kernel to avoid reading out of bounds.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59981
Reviewed By: ezyang
Differential Revision: D29147349
Pulled By: ngimel
fbshipit-source-id: b36a6a9e2526c609ff98fb5a44468f3257e0af67
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58570
**What the PR does**
Generate a fast-path `at::meta::{op}` API for calling meta functions without having to go through the dispatcher. This will be important for perf for external backends that want to use meta functions for shape checking (which seems likely to be what we end up doing for LazyTensorCore).
**Details**
In order to avoid naming collisions I had to make two small changes:
- rename `MetaFunctions.h` template -> `NativeMetaFunctions.h` (this is the file that declares the impl() function for every structured operator).
- rename the meta class: `at::meta::{op}::meta()` -> `at::meta::structured_{op}::meta()`
I also deleted a few unnecessary includes, since any file that includes NativeFunctions.h will automatically include NativeMetaFunctions.h.
**Why I made the change**
This change isn't actually immediately used anywhere; I already started writing it because I thought it would be useful for structured composite ops, but that isn't actually true (see [comment](https://github.com/pytorch/pytorch/pull/58266#issuecomment-843213147)). The change feels useful and unambiguous though so I think it's safe to add. I added explicit tests for C++ meta function calls just to ensure that I wrote it correctly - which is actually how I hit the internal linkage issue in the PR below this in the stack.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711299
Pulled By: bdhirsh
fbshipit-source-id: d410d17358c2b406f0191398093f17308b3c6b9e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58569
This should allow external C++ files that aren't compiled into `libtorch.so`/`libtorch_cpu.so` (including all of fbcode) to use fast path functions like `at::cpu::add()`, which skip the dispatcher.
So, after spending way too much time trying to figure out why I was getting linker errors when calling `at::meta::{op}` and `at::cpu::{op}` from C++ test files, I realized that we're not including the header files for C++ for the namespaced operator definitions. I.e. `RegisterCPU.cpp`, which provides definitions for the `at::cpu::{op}` fast path functions, wasn't including the `CPUFunctions.h` header.
Why that breaks stuff: the `CPUFunctions.h` header file is what marks each function with the `TORCH_API` macro, so without including it, when we build `libtorch.so` and `libtorch_cpu.so`, the compiler will look at the definition in `RegisterCPU.cpp`, not see a `TORCH_API`, and decide that the function should get internal linkage.
An alternative would be to directly mark the function definitions in `RegisterCPU.cpp` with `TORCH_API`, but this seemed cleaner.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711300
Pulled By: bdhirsh
fbshipit-source-id: 535f245c20e977ff566d6da0757b3cefa137040b
Summary: They will be needed when RPC gets merged into libtorch
Test Plan: CI later in the stack
Reviewed By: mrshenli
Differential Revision: D29132956
fbshipit-source-id: 8637640d56a1744a5dca5eb7d4b8ad0860c6b67c
Summary: This is needed to avoid FaultyPG from including and depending on RequestCallbackImpl, which is Python-only. The other RPC agents accept an explicit (upcast) pointer as an argument, and we can do the same for FaultyPG.
Test Plan: Later in the stack.
Reviewed By: mrshenli
Differential Revision: D29132955
fbshipit-source-id: bb7554b84bcbf39750af637e6480515ac8b92b86
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60032
There will be more sparse tests coming. This PR creates a separate folder for the sparse tests
Test Plan: `python test/test_ao.py`
Reviewed By: raghuramank100
Differential Revision: D29139265
fbshipit-source-id: d0db915f00e6bc8d89a5651f08f72e362a912a6b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59538
Four mealv2 models can be exported with torch 1.8.1, but export fails on torch master, which introduced relu6 a few months back.
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb, ansley
Differential Revision: D29046607
Pulled By: SplitInfinity
fbshipit-source-id: d9cf7050e4ac0dad892441305ffebc19ba84e2be
Co-authored-by: David <jiafa@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59537
PyTorch's sum over an empty tensor gives 0, while ONNX produces an error.
torch.sum is translated into the onnx::ReduceSum op. Per the definition of ReduceSum, update the keepdims attribute for this scenario.
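For reference, the PyTorch behavior the exported graph needs to reproduce (illustrative only):
```python
import torch

print(torch.sum(torch.empty(0)))            # tensor(0.)
print(torch.sum(torch.empty(0, 3), dim=0))  # tensor([0., 0., 0.])
```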
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb, ansley
Differential Revision: D29046604
Pulled By: SplitInfinity
fbshipit-source-id: 6f5f3a66cb8eda8b5114b8474dda6fcdbae73469
Co-authored-by: fatcat-z <jiz@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59816
Add two new DataPipes: one that takes web file URLs and yields streams, and one that takes streams and yields bytes.
Test Plan:
Add test_web_iterable_datapipe in test/test_datapipes.py. The test starts a local HTTP server for serving test files. The tests below pass locally:
1. create and load 16M localhost file URLs (each of size 10 bytes)
2. create and load a 64GB localhost file
For the sake of testing time, both the stress test and the large-file test are disabled in the unit test.
Imported from OSS
Reviewed By: VitalyFedyunin
Differential Revision: D29051186
fbshipit-source-id: f8e44491e670560bf445af96f94d98230436f396
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59419
This introduces ExclusivelyOwned, which allows isolated
pieces of code that can make ownership guarantees to opt out of
reference counting operations on `intrusive_ptr` and `Tensor`
entirely. To elaborate, if you know you are the exclusive owner of an
`intrusive_ptr` or `Tensor`, moving it into an `ExclusivelyOwned` will
avoid performing atomic reference counting operations at destruction
time. The documentation comment should provide sufficient explanation; please request changes if not.
ghstack-source-id: 131376658
Test Plan:
Added `ExclusivelyOwned_test.cpp`. It passes. When I ran it
under valgrind, valgrind reported no leaks.
Inspected assembly from `inspect` functions in
`ExclusivelyOwned_test.cpp` in an optimized (opt-clang) build. As
expected, `ExclusivelyOwned` calls `release_resources()` and the
`TensorImpl` virtual destructor without including any atomic reference
counting operations.
Reviewed By: ezyang
Differential Revision: D28885314
fbshipit-source-id: 20bf6c82b0966aaa635ab0233974781ed15f93c1
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59776
Overall design: https://github.com/pytorch/pytorch/issues/55207.
In this PR, I've added support to ShardedTensor such that it also creates RRefs
pointing to the remote shards if the RPC framework is initialized.
As a result, this provides more flexibility for ShardedTensor, so that users
can use collectives with local shards or use the RPC framework to interact with
remote shards.
ghstack-source-id: 131381914
Test Plan:
1) unit tests
2) waitforbuildbot
Reviewed By: SciPioneer
Differential Revision: D29020844
fbshipit-source-id: acb308d0029a5e486c464d93189b5de1ba680c85
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59543
Building on top of previous PR: https://github.com/pytorch/pytorch/pull/59521
This diff adds support for mean reduction on CUDA (forward only currently).
The CUDA backward implementation will be added in a subsequent PR.
Next steps:
- CUDA backward support for mean
- 2D data input support
- more testing
- benchmarking
Test Plan: update unit test to cover this part as well.
Reviewed By: ngimel
Differential Revision: D28922838
fbshipit-source-id: 72b7e5e79db967116b96ad010f290c9f057232d4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59788
This one line is all we need to "migrate" PyTorch to the "new API" of TensorPipe that splits the CUDA-specific stuff in a separate top-level header. (The motivation behind that is that it will allow us to "stack" the CUDA code on top of the CPU one).
ghstack-source-id: 131326166
Test Plan: None yet
Reviewed By: beauby
Differential Revision: D28875277
fbshipit-source-id: ecfd0b7fc0218ab7899bfe64ffe73c1417b897db
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59377
This PR demonstrates that now the CUDA parts of the TensorPipe agent just "plug on top" of the CPU-only parts. Thus ideally the CPU-only parts could go in libtorch while the CUDA-only parts could go in libtorch_cuda. Unfortunately we can't do that just yet, because the TensorPipe agent depends on c10d (for its Store and its ProcessGroup), which lives in libtorch_python.
ghstack-source-id: 131326168
Test Plan: CI
Reviewed By: cbalioglu
Differential Revision: D28796429
fbshipit-source-id: 41b2eb8400c0da282f3750a4eea21ad83ee4a175
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59376
This is an experiment. The end goal is to separate the CUDA-specific aspects of the TensorPipe agent so that they can be plugged "on top" of the CPU-only parts. This will then allow to move the TP agent to libtorch (because libtorch is split into a CPU and a CUDA part; now it's in libtorch_python), although unfortunately other conditions need to also be met for this to happen.
The only instance where we had CPU and CUDA logic within the same code, guarded by `#ifdef USE_CUDA`, is the serialization/deserialization code. I'm thus introducing a sort-of registry in order to "decentralize it". It's not a c10::Registry, because that's overkill (it uses an unordered_map, with strings as keys): here we can just use an array with integers as "keys".
ghstack-source-id: 131326167
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28796428
fbshipit-source-id: b52df832e0c0abf489a9e418353103496382ea41
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58852
Enable implicit operator versioning via number of arguments from Mobile.
1. By default, TS doesn't emit instructions for trailing default args, and the provided number of specified args is serialized to bytecode. In the interpreter, the default values are fetched from the operator schema. The implementation landed in #56845; please refer to #56845 for details.
2. Since there is bytecode schema change, the bytecode version is bumped from 5 to 6.
3. The corresponding backport function is provided, for forward compatibility use. Note that because there is instruction change, a global flag is used as the switch to control the two versions.
Test Plan: Imported from OSS
Reviewed By: raziel
Differential Revision: D28789746
Pulled By: iseeyuan
fbshipit-source-id: 6e5f16460c79b2bd3312de02d0f57b79f50bf66b
Summary: This adds support for embeddingBagBytewise with fp32 scale/bias to FXGlow.
Test Plan: buck run //glow/fb/fx/fx_glow:test_fx_glow
Reviewed By: jfix71
Differential Revision: D29075288
fbshipit-source-id: 4145486505a903129678216b133bbb8ad71f4fef
Summary:
This should fix `to_sparse` test issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59971
Test Plan:
CI
Also: directly examine the RuntimeError thrown from test_unsupported_backward
- Before:
```
NotImplementedError: Could not run 'aten::sum' with arguments from the 'SparseCPU' backend.
```
- After:
```
to_dense() not supported for float16 on CPU
```
Reviewed By: soulitzer
Differential Revision: D29112558
Pulled By: walterddr
fbshipit-source-id: c2acd22cd18d5b34d25209b8415feb3ba28fa104
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59840
Moving these tests to their own standalone file. No meaningful code changes.
ghstack-source-id: 131359162
Test Plan: CI
Reviewed By: cbalioglu
Differential Revision: D29012664
fbshipit-source-id: 348870016509a6ed7e69240fa82bccef4a12d674
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59948
1. We have two Interpreters, one for vanilla ops and one for acc ops. Some of the logic between them is similar, and in this diff we extract the shared logic into a base Interpreter. This lets any future general feature change benefit both Interpreters.
2. Make the TRT Interpreter not depend on concrete tensor args. We will use `InputTensorSpec` to create the necessary inputs for the acc tracer.
3. Add unit tests for the acc op converters.
Test Plan:
```
buck test mode/opt caffe2/torch/fb/fx2trt:test_linear
buck test mode/opt caffe2/torch/fb/fx2trt:test_batchnorm
buck test mode/opt caffe2/torch/fb/fx2trt:test_convolution
buck test mode/opt caffe2/torch/fb/fx2trt:test_reshape
buck test mode/opt caffe2/torch/fb/fx2trt:test_relu
buck test mode/opt caffe2/torch/fb/fx2trt:test_add
buck test mode/opt caffe2/torch/fb/fx2trt:test_maxpool
```
Reviewed By: jackm321
Differential Revision: D28749682
fbshipit-source-id: 830d845aede7203f6e56eb1c4e6776af197a0fc3
Summary:
Fixes https://github.com/pytorch/pytorch/issues/3025
## Background
This PR implements a function similar to numpy's [`isin()`](https://numpy.org/doc/stable/reference/generated/numpy.isin.html#numpy.isin).
The op supports integral and floating point types on CPU and CUDA (+ half & bfloat16 for CUDA). Inputs can be one of:
* (Tensor, Tensor)
* (Tensor, Scalar)
* (Scalar, Tensor)
Internally, one of two algorithms is selected based on the number of elements vs. test elements. The heuristic for deciding which algorithm to use is taken from [numpy's implementation](fb215c7696/numpy/lib/arraysetops.py (L575)): if `len(test_elements) < 10 * len(elements) ** 0.145`, then a naive brute-force checking algorithm is used. Otherwise, a stablesort-based algorithm is used.
I've done some preliminary benchmarking to verify this heuristic on a devgpu, and determined for a limited set of tests that a power value of `0.407` instead of `0.145` is a better inflection point. For now, the heuristic has been left to match numpy's, but input is welcome for the best way to select it or whether it should be left the same as numpy's.
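For illustration, the selection rule described above boils down to the following (mirroring numpy's rule, not the final kernel code):
```python
def use_brute_force(num_elements: int, num_test_elements: int) -> bool:
    # Brute force when the number of test elements is small relative to elements.
    return num_test_elements < 10 * num_elements ** 0.145

print(use_brute_force(10_000, 30))   # True  -> naive pairwise comparison
print(use_brute_force(10_000, 500))  # False -> stable-sort-based algorithm
```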
Tests are adapted from numpy's [isin and in1d tests](7dcd29aaaf/numpy/lib/tests/test_arraysetops.py).
Note: my locally generated docs look terrible for some reason, so I'm not including the screenshot for them until I figure out why.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/53125
Test Plan:
```
python test/test_ops.py # Ex: python test/test_ops.py TestOpInfoCPU.test_supported_dtypes_isin_cpu_int32
python test/test_sort_and_select.py # Ex: python test/test_sort_and_select.py TestSortAndSelectCPU.test_isin_cpu_int32
```
Reviewed By: soulitzer
Differential Revision: D29101165
Pulled By: jbschlosser
fbshipit-source-id: 2dcc38d497b1e843f73f332d837081e819454b4e
Summary:
Previous is https://github.com/pytorch/pytorch/issues/57781
We now add two CUDA bindings to avoid using ctypes, which fixes a Windows issue.
However, we use ctypes to allocate the stream and create its pointer
(we can do this with a 0-dim tensor too if it feels better).
CC. ezyang rgommers ngimel mruberry
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59527
Reviewed By: albanD
Differential Revision: D29053062
Pulled By: ezyang
fbshipit-source-id: 661e7e58de98b1bdb7a0871808cd41d91fe8f13f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59959
**Summary**
This commit replaces the warning on the `torch.package` documentation
page about the module not being publicly released (which will no longer
be true as of 1.9) with one that warns about security issues caused by
the use of the `pickle` module.
**Test Plan**
1) Built the docs locally.
2) Continuous integration.
<img width="877" alt="Captura de Pantalla 2021-06-14 a la(s) 11 22 05 a m" src="https://user-images.githubusercontent.com/4392003/121940300-c98cab00-cd02-11eb-99dc-08e29632079a.png">
Test Plan: Imported from OSS
Reviewed By: suo
Differential Revision: D29108429
Pulled By: SplitInfinity
fbshipit-source-id: 3a0aeac0dc804a31203bc5071efb1c5bd6ef9725
Summary:
This PR is to upgrade onednn to v2.2.3 (including v2.2 and v2.2.3 changes) which has the following main changes about CPU:
v2.2 changes:
- Improved performance of compute functionality for future Intel Core processors with Intel AVX2 and Intel DL Boost instruction support (code name Alder Lake).
- Improved fp32 inner product forward propagation performance for processors with Intel AVX-512 support.
- Improved dnnl_gemm performance for cases with n=1 on all supported processors.
v2.2.3 changes:
- Fixed a bug in the int8 depthwise convolution primitive with groups and 1d spatial size for processors with Intel AVX-512 and Intel AVX2 support.
- Fixed a correctness issue for the PReLU primitive on Intel Processor Graphics.
- Fixed a correctness issue in reorder for blocked layouts with zero padding.
- Improved performance of the weights reorders used by the BRGEMM-based convolution primitive for processors with Intel AVX-512 support.
More changes can be found in https://github.com/oneapi-src/oneDNN/releases.
The ideep version used is pytorch-rls-v2.2.3.
The oneDNN version used is v2.2.3.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57928
Reviewed By: bdhirsh
Differential Revision: D29037857
Pulled By: VitalyFedyunin
fbshipit-source-id: db74534858bdcf5d6c7dcf58e224fc756188bc31
Summary:
Makes it possible for the first registered parametrization to depend on several parameters rather than just one. Examples of this type of parametrization are `torch.nn.utils.weight_norm` and low-rank parametrizations via the multiplication of an `n x k` tensor by a `k x m` tensor with `k <= m, n`.
Follows the plan outlined in https://github.com/pytorch/pytorch/pull/33344#issuecomment-768574924. A short summary of the idea is: we call `right_inverse` when registering a parametrization to generate the tensors that we are going to save. If `right_inverse` returns a sequence of tensors, then we save them as `original0`, `original1`... If it returns a `Tensor` or a sequence of length 1, we save it as `original`.
We only allow many-to-one parametrizations for the first parametrization registered; subsequent parametrizations need to be one-to-one.
There were a number of choices in the implementation:
If `right_inverse` returns a sequence of parameters, then we unpack it in the forward. This allows writing code such as:
```python
class Sum(nn.Module):
    def forward(self, X, Y):
        return X + Y

    def right_inverse(Z):
        return Z, torch.zeros_like(Z)
```
rather than having to manually unpack a list or a tuple within the `forward` function.
At the moment the errors are a bit all over the place. This is to avoid having to check some properties of `forward` and `right_inverse` when they are registered. I left this like this for now, but I believe it'd be better to call these functions when they are registered to make sure the invariants hold and throw errors as soon as possible.
The invariants are the following:
1. The following code should be well-formed
```python
X = module.weight
Y = param.right_inverse(X)
assert isinstance(Y, Tensor) or isinstance(Y, collections.Sequence)
Z = param(Y) if isinstance(Y, Tensor) else param(*Y)
```
in other words, if `Y` is a `Sequence` of `Tensor`s (we also check that the elements of the sequence are Tensors), then it is of the same length as the number of parameters `param.forward` accepts.
2. Always: `X.dtype == Z.dtype and X.shape == Z.shape`. This is to protect the user from shooting themselves in the foot, as it's too odd for a parametrization to change the metadata of a tensor.
3. If it's one-to-one: `X.dtype == Y.dtype`. This is to be able to do `X.set_(Y)` so that if a user first instantiates the optimiser and then puts the parametrisation, then we reuse `X` and the user does not need to add a new parameter to the optimiser. Alas, this is not possible when the parametrisation is many-to-one. The current implementation of `spectral_norm` and `weight_norm` does not seem to care about this, so this would not be a regression. I left a warning in the documentation though, as this case is a bit tricky.
I still need to go over the formatting of the documentation; I'll do that tomorrow.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58488
Reviewed By: soulitzer
Differential Revision: D29100708
Pulled By: albanD
fbshipit-source-id: b9e91f439cf6b5b54d5fa210ec97c889efb9da38
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57575
This PR does two things:
1. reverts "Manual revert of D27369251 (f88a3fff65) (#56080)" in commit
92a09fb87a567100122b872613344d3a422abc9f.
2. fixes DifferentiableGraph outputs with a wrong requires_grad flag
When fixing requires_grad on outputs from DifferentiableGraph, the proper flag is
retrieved from profiling information. We previously only retrieved the profiling
information from the first profile node among all its uses. However, in cases where
control flow is present, we need to iteratively search for a profile node with
profiling information available, in case the first use is in an inactive code
path.
e.g.
```
graph(%0 : Tensor,
      %1 : Bool):
  ..., %2 : Tensor = prim::DifferentiableGraph_0(%0)
  %3 : Tensor = prim::If(%1)
    block0():
      %4 : Tensor = prim::DifferentiableGraph_1(%2)
      -> (%4)
    block1():
      %5 : Tensor = prim::DifferentiableGraph_2(%2)
      -> (%5)
  -> (%3)

with prim::DifferentiableGraph_0 = graph(%0 : Tensor):
  ...
  %out : Tensor = aten::operation(...)
  ...
  return (..., %out)

with prim::DifferentiableGraph_1 = graph(%0 : Tensor):
  %temp : Tensor = prim::profile[profiled_type=Tensor](%0)
  ...

with prim::DifferentiableGraph_2 = graph(%0 : Tensor):
  %temp : Tensor = prim::profile[profiled_type=Float(...)](%0)
  ...
```
Test Plan: Imported from OSS
Reviewed By: bdhirsh
Differential Revision: D29038773
Pulled By: Krovatkin
fbshipit-source-id: 6c0a851119f6b8f2f1afae5c74532407aae238fe
Summary:
Some minor quality of life improvements for the NNC python bindings:
- expose `call_raw()`
- support passing integers to `call()` (for dynamic shapes)
- implicit conversions to cleanup `[BufferArg(x) for x in [A, B, C]]` into just `[A, B, C]`
- don't silently default to "ir_eval" for unknown mode (e.g. "LLVM")
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59920
Reviewed By: ZolotukhinM
Differential Revision: D29090904
Pulled By: jansel
fbshipit-source-id: 154ace82725ae2046cfe2e6eb324fd37f5d209a7
Summary:
Currently, if we do a softmax that is not along the last dim, the calculation falls back to a [scalar version](d417a094f3/aten/src/ATen/native/SoftMax.cpp (L14-L64)). We find that we actually have the chance to vectorize the calculation along the inner_size dim.
Changes we made:
- Use vectorized softmax_kernel instead of host_softmax when not along the last dim.
Performance data on a 28-core Intel 8280 CPU with input size [32, 81, 15130], doing softmax along the second dim (81):
- FP32 Baseline: 24.67 ms
- FP32 optimized: 9.2 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59195
Reviewed By: ailzhang
Differential Revision: D28854796
Pulled By: cpuhrsch
fbshipit-source-id: 18477acc3963754c59009b1794f080496ae16c3d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59758
The underlying call to tp_getattr is const safe but CPython
has not fixed it due to BC problems. No reason not to advertise
the better type here though!
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D29017911
Pulled By: ezyang
fbshipit-source-id: 8d55983fe6416c03eb69c6367bcc431c30000133
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59709
Fixes #59705.
Python 3.8 fixed tracebacks to report the beginning of the line
that raised an error, rather than the end. This makes for a simpler
implementation (no more string reversing) but need to actually
implement. This wasn't caught by tests because we hard coded line
numbers to do substitutions, so I also added a little smoketest to
detect future changes to traceback line number behavior.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: bdhirsh
Differential Revision: D28994919
Pulled By: ezyang
fbshipit-source-id: 1fb0a782e17c55c13d668fabd04766d2b3811962
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59852
This whole stack does not change anything to the codegened code
Test Plan: Imported from OSS
Reviewed By: ezyang
Differential Revision: D29063814
Pulled By: albanD
fbshipit-source-id: a751047526f8d58f4760ee6f9ae906675bed5d75
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59850
This whole stack does not change anything to the codegened code
Test Plan: Imported from OSS
Reviewed By: ailzhang
Differential Revision: D29063816
Pulled By: albanD
fbshipit-source-id: ca3067443d8e6282c1077d3dafa3b4f330d43b28
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59849
This whole stack does not change anything to the codegened code
Test Plan: Imported from OSS
Reviewed By: ailzhang
Differential Revision: D29063815
Pulled By: albanD
fbshipit-source-id: c4baa72594bd2fe50ac67f513916f2b2ccb7488c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59848
This whole stack does not change anything to the codegened code
Test Plan: Imported from OSS
Reviewed By: ailzhang
Differential Revision: D29063818
Pulled By: albanD
fbshipit-source-id: c68734672eeacd212d7bd9bebe3d53aaa20c3c24
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59847
This whole stack does not change anything to the codegened code
Test Plan: Imported from OSS
Reviewed By: ailzhang
Differential Revision: D29063817
Pulled By: albanD
fbshipit-source-id: 284c3e057029b7a67f43a1b034bb30863bd68c71
Summary:
Implements a number of changes discussed with soulitzer offline.
In particular:
- Initialise `u`, `v` in `__init__` rather than in `_update_vectors`
- Initialise `u`, `v` to some reasonable vectors by doing 15 power iterations at the start
- Simplify the code of `_reshape_weight_to_matrix` (and make it faster) by using `flatten`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59564
Reviewed By: ailzhang
Differential Revision: D29066238
Pulled By: soulitzer
fbshipit-source-id: 6a58e39ddc7f2bf989ff44fb387ab408d4a1ce3d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59903
D29034650 (cf0c4ac258) probably breaks something because it changes a `for` loop on ~Line 1200 from `[size,max)` to `[0,max)`. This fixes that
Test Plan: Sandcastle
Reviewed By: ngimel
Differential Revision: D29081688
fbshipit-source-id: 21f08e3f244fc02cf97d137b3cc80d4378d17185
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).
New submodule commit: ae8ad8fd04
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59874
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: jspark1105
Differential Revision: D29064980
fbshipit-source-id: 593f08361817fb771afcf2732f0f647d7c2c72c3
Summary:
Instead of having specific logic to handle run-specific-test-case, we provide a flag to override include or bring-to-front behavior with the SPECIFIED_TEST_CASES_FILE.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59704
Reviewed By: janeyx99
Differential Revision: D29038425
Pulled By: walterddr
fbshipit-source-id: 803d3555813437c7f287a22f7704106b0c609919
Summary:
This reverts https://github.com/pytorch/pytorch/issues/58778, since triggering our primary CircleCI workflow only via pytorch-probot has been causing more problems than it's worth.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59889
Reviewed By: walterddr, seemethere
Differential Revision: D29070418
Pulled By: samestep
fbshipit-source-id: 0b47121b190c2e9efa27f38000ca362e634876dc
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59684
Same reasoning as in the below diff.
ghstack-source-id: 131167212
Test Plan: CI
Reviewed By: cbalioglu
Differential Revision: D28981326
fbshipit-source-id: 264a7f787ea8be76f743a2eaca67ae1d3bd8073a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59683
Replaces usages of throw std::runtime_error("foo") with the better
TORCH_CHECK(false, "foo"), which allows C++ stacktraces to show up when
TORCH_SHOW_CPP_STACKTRACES=1. This will hopefully provide much better debugging
information when debugging crashes/flaky tests.
ghstack-source-id: 131167210
Test Plan: CI
Reviewed By: cbalioglu
Differential Revision: D28981327
fbshipit-source-id: 677f569e28600263cab18759eb1b282e0391aa7b
Summary:
Use `vrndq_f32`, which corresponds to the `VRINTZ` instruction; it rounds a floating-point value towards zero, matching `std::trunc` behaviour.
This makes the trunc implementation correct even for values that fit into float32 but cannot be converted to int32, for example `-1.0e+20`; see the following [gist](https://gist.github.com/malfet/c612c9f4b3b5681ca1b2a69930825871):
```
inp= 3.1 2.7 -2.9 -1e+20
old_trunc= 3 2 -2 -2.14748e+09
new_trunc= 3 2 -2 -1e+20
```
Fixes `test_reference_numerics_hard_trunc_cpu_float32` on M1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59858
Reviewed By: kimishpatel
Differential Revision: D29052008
Pulled By: malfet
fbshipit-source-id: 6b567f39151538be1aa3890e3b4e1e978e598657
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58873
BackendDebugInfoRecorder
Prior to this PR:
In order to generate debug handles corresponding to the graph being
lowered, the backend's preprocess will call generate_debug_handles and will
get a map of Node*-to-debug_handles.
In order to facilitate this, to_backend will own a
BackendDebugInfoRecorder and initialize a thread-local pointer to it.
The generate_debug_handles function will query the thread-local pointer to see if
there is a valid BackendDebugInfoRecorder for the context. If there is,
it will generate debug handles.
After this PR:
The signature of preprocess is changed such that backends have to register a
preprocess that accepts an instance of BackendDebugInfoRecorder by
reference. generate_debug_handles is no longer a free function but becomes
part of the API of BackendDebugInfoRecorder. Now the backend's preprocess
function will call generate_debug_handles on BackendDebugInfoRecorder
instead of the free function.
Reason for this change:
The RAII approach that initializes a thread-local pointer results in a loose
contract with backends, which may result in backends not storing
debug information. Making it part of the API results in
backends having to be aware of BackendDebugInfoRecorder and explicitly
choosing not to generate/store debug information if they so choose.
Test Plan:
backend tests
Imported from OSS
Reviewed By: jbschlosser, raziel
Differential Revision: D28648613
fbshipit-source-id: c9b7e7bf0f78e87023ea7bc08612cf893b08cb98
Summary:
Based from https://github.com/pytorch/pytorch/pull/50466
Adds the initial implementation of `torch.cov` similar to `numpy.cov`. For simplicity, we removed support for many parameters in `numpy.cov` that are either redundant such as `bias`, or have simple workarounds such as `y` and `rowvar`.
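A hedged usage sketch of the simplified API described above:
```python
import torch

# Rows are variables, columns are observations, as in numpy.cov.
x = torch.randn(3, 100)
print(torch.cov(x).shape)  # torch.Size([3, 3]), analogous to numpy.cov(x)
```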
cc PandaBoi
TODO
- [x] Improve documentation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58311
Reviewed By: mruberry
Differential Revision: D28994140
Pulled By: heitorschueroff
fbshipit-source-id: 1890166c0a9c01e0a536acd91571cd704d632f44
Summary:
Python 3.6 reaches EOL at the end of this year, so we should use a newer Python in CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59729
Reviewed By: bdhirsh
Differential Revision: D29006807
Pulled By: janeyx99
fbshipit-source-id: c79214b02a72656058ba5d199141f8838212b3b6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59605
Enables targeting of individual function invocations by execution order.
For example, given a module such as
```
class M1(torch.nn.Module):
    def forward(self, x):
        x = torch.add(x, x)
        x = torch.add(x, x)
        return x

class M2(torch.nn.Module):
    def __init__(self):
        self.m1 = M1()

    def forward(self, x):
        x = self.m1(x)
        return x
```
We can now target the first add of `m1` with
```
qconfig_dict = {
"module_name_function_order": ("m1", torch.add, 0, custom_qconfig),
}
```
Test Plan:
```
python test/test_quantization.py TestQuantizeFx.test_qconfig_module_name_function_order
```
Imported from OSS
Reviewed By: hx89
Differential Revision: D28951077
fbshipit-source-id: 311d423724a31193d4fa4bbf3a712b46464b5a29
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59697
The c10d build process selectively adds files based on the `USE_C10D_FOO` flags (where `FOO` is one of `GLOO`, `NCCL` or `MPI`). Replicating this logic inside libtorch will be harder, since libtorch uses a simpler approach (i.e., it lists the files in `build_variables.bzl`). So instead we could always include all files, and "disable" each file as needed using `#ifdef`s. Note that this is not a new approach: we already do the same for all the files of the TensorPipe agent based on the flag `USE_TENSORPIPE`.
ghstack-source-id: 131169540
Test Plan: CI
Reviewed By: agolynski
Differential Revision: D28987577
fbshipit-source-id: 4c6195de4e9a58101dad9379537e8d055dfd38af
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59696
Some files in c10d refer to dist autograd. However, on Windows, dist autograd isn't built. Hence we need to "mask out" those references under Windows. This was already partly done, but when moving c10d to libtorch some issues came up, possibly due to the different way in which linking happens. Hence I masked out the remaining references.
ghstack-source-id: 131169541
Test Plan: CI
Reviewed By: agolynski
Differential Revision: D28987579
fbshipit-source-id: c29c5330f8429d699554972d30f99a89b2e3971d
Summary:
nvrtc has a hard limit on the size of kernel parameters, and llvm has
a tendency to OOM with huge parameter lists, so let's limit the number of
inputs to something sensible.
Test Plan:
tested on pyper OOM test case:
```
flow-cli test-locally --mode=opt-split-dwarf f278102738 --name "PyPer OOM repro f277966799 f63b1f9c5c0c" --run-as-secure-group oncall_pytorch_jit --entitlement default
```
Reviewed By: ZolotukhinM
Differential Revision: D29019751
fbshipit-source-id: b27f2bb5000e31a7b49ea86a6928faa0ae2ead24
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59754
Also, if inputs are contiguous, use their Placeholders
directly rather than generating contiguous Tensors from them.
The rationale for this change is that aten::matmul and aten::conv2d
support transposed inputs; if NNC generates a physical transpose to
perform an external call, performance will be strictly worse than not
fusing (sometimes dramatically so, as in the attached benchmark).
Test Plan: benchmark
Reviewed By: ZolotukhinM
Differential Revision: D29010209
fbshipit-source-id: da6d71b155c83e8d6e306089042b6b0af8f80900
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59603
D28698997 (10345010f7) was reverted because I forgot to replace the
```
VLOG(1) << "Found schema mismatch";
n->schema().dump();
```
block in `aten::clamp_min` with `LogAndDumpSchema(n)`, and that led the bazel build to fail. I don't know why it makes the bazel build fail, though.
Test Plan: OSS CI.
Reviewed By: ajyu
Differential Revision: D28950177
fbshipit-source-id: 9bb1c6619e6b68415a3349f04933c2fcd24cc9a2
Summary:
angle should return 0 for positive values, pi for negative values, and keep NaNs in place, which can be accomplished using two blendv functions.
Fixes a number of unary test failures on M1/aarch64
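For reference, the intended semantics of `torch.angle` on real-valued inputs (the printed values in the comment are illustrative):
```
import torch

# 0 for positive values, pi for negative values, NaN propagated as-is.
x = torch.tensor([2.0, -3.0, float("nan")])
print(torch.angle(x))  # tensor([0.0000, 3.1416, nan])
```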
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59832
Reviewed By: kimishpatel
Differential Revision: D29046402
Pulled By: malfet
fbshipit-source-id: cb93ad2de140f7a54796387fc11053c507a1d4e9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59833
**Summary**
This commit adds an explanation section to the `torch.package`
documentation. This section clarifies and illuminates various aspects of
the internals of `torch.package` that might be of interest to users.
**Test Plan**
Continuous integration.
Test Plan: Imported from OSS
Reviewed By: Lilyjjo
Differential Revision: D29050626
Pulled By: SplitInfinity
fbshipit-source-id: 78e0cda00f69506ef2dfc52d6df63694b502269e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59503
**Summary**
This commit adds a "how do I..." section to the `torch.package`
documentation. This section contains short guides about how to solve
real-world problems that frequently recur while using `torch.package`.
**Test Plan**
Continuous integration.
<img width="877" alt="Captura de Pantalla 2021-06-04 a la(s) 9 19 54 p m" src="https://user-images.githubusercontent.com/4392003/120879911-98321380-c57b-11eb-8664-c582c92b7837.png">
Test Plan: Imported from OSS
Reviewed By: Lilyjjo
Differential Revision: D29050629
Pulled By: SplitInfinity
fbshipit-source-id: 2b7800732e0a3c1c947f110c05562aed5174a87f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59491
**Summary**
This commit adds a preamble to the `torch.package` documentation page
that explains briefly what `torch.package` is.
**Test Plan**
Continuous integration.
<img width="881" alt="Captura de Pantalla 2021-06-04 a la(s) 3 57 01 p m" src="https://user-images.githubusercontent.com/4392003/120872203-d535e000-c552-11eb-841d-b38df19bc992.png">
Test Plan: Imported from OSS
Reviewed By: Lilyjjo
Differential Revision: D29050630
Pulled By: SplitInfinity
fbshipit-source-id: 70a3fd43f076751c6ea83be3ead291686c641158
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59711
This is the exact same PR as before.
This was reverted because the PR below was faulty.
Test Plan: Imported from OSS
Reviewed By: zou3519
Differential Revision: D28995762
Pulled By: albanD
fbshipit-source-id: 65940ad93bced9b5f97106709d603d1cd7260812
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59710
This is the exact same PR as before.
The version that landed was actually outdated compared to the github PR and that's why it failed on master... Sorry for the noise.
Test Plan: Imported from OSS
Reviewed By: zou3519
Differential Revision: D28995764
Pulled By: albanD
fbshipit-source-id: 8f7ae3356a886d45787c5e6ca53a4e7b033e306e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59728
I noticed Sandcastle jobs failing with:
```
fbcode/caffe2/torch/csrc/api/include/torch/nn/modules/rnn.h:19:35: error: using namespace directive in global context in header [-Werror,-Wheader-hygiene]
using namespace torch::nn::utils::rnn;
```
(cf. V3 of D28939167 or https://www.internalfb.com/intern/sandcastle/job/36028797455955174/).
Removing `using namespace ...` fixes the problem.
~~... also applied code formatting ...~~
Test Plan: Sandcastle
Reviewed By: jbschlosser
Differential Revision: D29000888
fbshipit-source-id: 10917426828fc0c82b982da435ce891dc2bb6eec
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59722
Reintroduce sharing constants between bytecode and torchscript (same as #58629) after the fix in #59642
Test Plan: Imported from OSS
Reviewed By: iseeyuan
Differential Revision: D29002345
Pulled By: cccclai
fbshipit-source-id: d9c8e474ff57d0509580183206df038a24ad27e3
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59521
This diff adds support for mean reduction on CPU (forward + backward).
The CUDA implementation will be added in a subsequent PR. We are using `cub::DeviceSegmentedReduce` for the other aggregations and are looking into how to support mean with it; otherwise we will write a custom kernel for it.
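A minimal sketch of the semantics being added, written with plain PyTorch ops rather than the new kernel's API (the segment layout via `lengths` is an illustrative assumption):
```
import torch

# Segmented mean reduction over a 1-D input: `lengths` splits `data` into
# consecutive segments, and each segment is reduced to its mean.
data = torch.tensor([1., 2., 3., 4., 5., 6.])
lengths = [2, 4]                                   # segments [1, 2] and [3, 4, 5, 6]
means = torch.stack([s.mean() for s in torch.split(data, lengths)])
print(means)                                       # tensor([1.5000, 4.5000])
```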
Next Steps:
- cuda support for mean
- 2d data input support
- more testing
- benchmarking
Test Plan: updated unit test. Still relying on manual data for ease of debugging. Will add more tests that covers edge cases once major features are complete.
Reviewed By: ngimel
Differential Revision: D28922547
fbshipit-source-id: 2fad53bbad2cce714808ff95759cbdbd45bb4ce6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59742
It looks like Windows workers were failing out due to some leftovers
from previous builds, this should hopefully remedy some of those errors
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: janeyx99
Differential Revision: D29009076
Pulled By: seemethere
fbshipit-source-id: 426d54df14ec580cb24b818c48e2f4bd36159181
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59799
This is a redo of #58574, easier to create a new PR than to fix rebase
conflicts, as there have been a large number of refactors to the
underlying code.
Removes some code which was incorrectly added by #57519 but never
actually used for anything.
Test Plan:
```
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
```
Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D29031955
fbshipit-source-id: f407d181070cb283382965952821e3647c705544
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59095
These tests were disabled; I'm unsure why. I've re-enabled them and
reworked them to expand testing to different devices and dtypes
Test Plan:
python test/test_quantization.py TestFakeQuantizeOps.test_numerical_consistency
Imported from OSS
Reviewed By: bdhirsh
Differential Revision: D29018745
fbshipit-source-id: 28188f32bafd1f1704c00ba49d09ed719dd1aeb2
Summary:
This would lower the number of unnecessary commits to pytorch/test-infra by only exporting a different stats file when the stats vary enough. This way, if the slow test cases we gather from S3 are the same and their times differ only trivially, then we do not bother exporting a different stats file when the --ignore-small-diffs option is enabled.
We instead export the stats already in test-infra, so that when it tries to commit, it realizes it would be an empty commit and does not add to the git history.
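A rough sketch of the comparison this option performs (names and structure are illustrative and not taken from `tools/export_slow_tests.py`):
```
def stats_differ_enough(old_times, new_times, threshold_sec):
    """Return True only if the set of slow tests changed or some recorded
    test time moved by more than the threshold."""
    if set(old_times) != set(new_times):
        return True
    return any(abs(old_times[name] - new_times[name]) > threshold_sec
               for name in new_times)
```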
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59759
Test Plan: Run `python tools/export_slow_tests.py --ignore-small-diffs <threshold>`.
Reviewed By: walterddr
Differential Revision: D29032712
Pulled By: janeyx99
fbshipit-source-id: 41d522a4c5f710e776acd1512d41be9791d0cf63
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59596
Parallelize batch matmul across the batch dim. This was found to improve perf for
some use cases on mobile.
ghstack-source-id: 130989569
Test Plan: CI unit tests
Reviewed By: albanD
Differential Revision: D26833417
fbshipit-source-id: 9b84d89d29883a6c9d992d993844dd31a25f76b1
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59789
The bot messed up in D28867855 (96651458eb) so I've got to do it manually.
Test Plan: CI
Reviewed By: beauby
Differential Revision: D29027901
fbshipit-source-id: 9438e0cfbe932fbbd1e252ab57e2b1b23f9e44cf
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/tensorpipe](https://github.com/pytorch/tensorpipe).
New submodule commit: e942ea1513
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59374
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: lw
Differential Revision: D28867855
fbshipit-source-id: e1325046003f5c546f02024ff4c427c91721cd7e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59560
`at::cuda::CUDAStream` has the `query` and `synchronize` methods, but `c10::Stream` does not, and I couldn't find any generic way to accomplish this. Hence I added helpers to do this to the DeviceGuardImpl interface, and then defined these methods on `c10::Stream`. (I had to do it out-of-line to circumvent a circular dependency).
ghstack-source-id: 130932249
Test Plan: CI
Reviewed By: ezyang
Differential Revision: D28931377
fbshipit-source-id: cd0c19cf021e305d0c0cf9af364afb445d010248
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59111
Create a util function for initializing subgroups. By default, each subgroup contains all the ranks within a machine. This util function can be used by both local SGD and SyncBatchNorm optimization.
Additionally, clang format `distributed/__init__.py` after importing `_rank_not_in_group` which is used by the unit test, and also clang format `distributed_c10d.py`.
Note that this API does not accept an additional overall main group. Like the APEX API `create_syncbn_process_group` [here](https://nvidia.github.io/apex/_modules/apex/parallel.html), it always uses the global world size and should only be used when CUDA is available.
Closes: https://github.com/pytorch/pytorch/issues/53962
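A minimal usage sketch, assuming the util is exposed as `torch.distributed.new_subgroups()` / `new_subgroups_by_enumeration()` (names taken from the test plan below; check the landed signature before relying on this):
```
import torch.distributed as dist

# Assumes an initialized default process group with CUDA available,
# e.g. via dist.init_process_group("nccl", ...).
# By default each machine's local ranks form one subgroup.
cur_subgroup, subgroups = dist.new_subgroups()

# Or spell the subgroups out explicitly by rank lists.
cur_subgroup, subgroups = dist.new_subgroups_by_enumeration([[0, 1], [2, 3]])
```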
ghstack-source-id: 130975027
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups_group_size_exceeds_world_size
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups_world_size_not_divisible_by_group_size
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups_by_enumeration
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups_by_enumeration_input_rank_exceeds_world_size
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_new_subgroups_overlap_not_allowed
Reviewed By: rohan-varma
Differential Revision: D28495672
fbshipit-source-id: fdcc405411dd409634eb51806ee0a320d1ecd4e0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59667
Use `TORCH_CHECK` instead of throwing `std::runtime_error` in monitored barrier so
that it works with `TORCH_SHOW_CPP_STACKTRACES` to reveal the entire call stack
where the monitored barrier failed, which can help determine where the
particular rank encountered an issue.
ghstack-source-id: 130993689
Test Plan: CI
Reviewed By: cbalioglu
Differential Revision: D28974510
fbshipit-source-id: 6a6958995c1066cddcd647ca88c74473079b69fc
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59714
Bytecode v6 uses implicit operator versioning based on the number of specified arguments. Both the read and write code paths are available. This PR enables reading v6 models. The default writing format is not changed yet and will be bumped in a later PR.
Test: CI.
Local: change the writing version to 6 temporarily and run the unit tests in LiteInterpreterTest. There are a number of end-to-end tests that write v6 bytecode, then read and run it.
Test Plan: Imported from OSS
Reviewed By: raziel, cccclai
Differential Revision: D29007538
Pulled By: iseeyuan
fbshipit-source-id: cb089d5d4c5b26c5b5cd3a5e0954e8c7c4c69aac
Summary:
Addresses https://github.com/pytorch/pytorch/issues/59548
**Overview:**
Recently, we changed ZeRO's partitioning algorithm to first sort the parameters by decreasing size and then greedily allocate to shards. See [here](ea1de87f4b).
The current tests `test_sharding()` and `test_add_param_group()` check for a uniform partitioning, which is not achieved with the old naive greedy partitioning algorithm for general world sizes but is achieved with the new sorted-greedy algorithm. This reliance is not ideal, but for now, we opt to simply add comments to document the dependency.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59713
Test Plan:
I tested for world sizes of 1, 2, 3, and 4 via the AI AWS cluster:
```
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=1 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_sharding
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=2 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_sharding
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=3 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_sharding
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_sharding
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=1 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_add_param_group
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=2 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_add_param_group
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=3 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_add_param_group
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4 python test/distributed/optim/test_zero_redundancy_optimizer.py -- TestZeroRedundancyOptimizerDistributed.test_add_param_group
```
However, because the train queue (which offers instances with 8 GPUs) is not working at the moment, I was unable to test for world sizes of 5+. Nonetheless, I believe that they should still work.
First, consider `test_sharding()`. Given the sorted-greedy algorithm, each shard will be assigned one of the parameters with size `9`, then one of the parameters with size `7`, then `5`, and finally `3`. Hence, each will have a uniform partition. Now, consider `test_add_param_group()`. Similarly, the same allocation behavior occurs, only the last shard is not assigned the final parameter with size `3` to begin. However, after adding the new `param_group` with the parameter with size `3`, a re-partitioning occurs. The first `param_group` is partitioned as before, and the parameter with size `3` in the new `param_group` is assigned to the last shard since it has the minimal total size. Thus, in the end, all shards have a uniform partition.
Reviewed By: mrshenli
Differential Revision: D28996460
Pulled By: andwgu
fbshipit-source-id: 22bdc638d8569ed9a20836812eac046d628d6df2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59578
This is a verbose warning produced by a single `CAFFE_ENFORCE_GT()` check:
```
third-party\toolchains\vs2017_15.9\buildtools\vc\tools\msvc\14.16.27023\include\xstddef(271): warning C4018: '>': signed/unsigned mismatch
xplat\caffe2\c10\util\logging.h(208): note: see reference to function template instantiation 'bool std::greater<void>::operator ()<const T1&,const T2&>(_Ty1,_Ty2) const' being compiled
with
[
T1=int,
T2=unsigned int,
_Ty1=const int &,
_Ty2=const unsigned int &
]
xplat\caffe2\caffe2\operators\conv_pool_op_base.h(539): note: see reference to function template instantiation 'void c10::enforce_detail::enforceThatImpl<std::greater<void>,int,unsigned int,>(Pred,const T1 &,const T2 &,const char *,int,const char *,const void *)' being compiled
with
[
Pred=std::greater<void>,
T1=int,
T2=unsigned int
]
xplat\caffe2\caffe2\operators\conv_pool_op_base.h(536): note: while compiling class template member function 'std::vector<caffe2::TensorShape,std::allocator<_Ty>> caffe2::ConvPoolOpBase<caffe2::CPUContext>::TensorInferenceForSchema(const caffe2::OperatorDef &,const std::vector<_Ty,std::allocator<_Ty>> &,int)'
with
[
_Ty=caffe2::TensorShape
]
xplat\caffe2\caffe2\operators\conv_pool_op_base.h(631): note: see reference to function template instantiation 'std::vector<caffe2::TensorShape,std::allocator<_Ty>> caffe2::ConvPoolOpBase<caffe2::CPUContext>::TensorInferenceForSchema(const caffe2::OperatorDef &,const std::vector<_Ty,std::allocator<_Ty>> &,int)' being compiled
with
[
_Ty=caffe2::TensorShape
]
xplat\caffe2\caffe2\operators\pool_op.cc(1053): note: see reference to class template instantiation 'caffe2::ConvPoolOpBase<caffe2::CPUContext>' being compiled
xplat\caffe2\c10\core\memoryformat.h(63): note: see reference to class template instantiation 'c10::ArrayRef<int64_t>' being compiled
```
Use a signed `0` because `.dims_size()` returns a signed integer.
Test Plan: Confirm warning no longer present in Windows build logs
Reviewed By: simpkins
Differential Revision: D28941905
fbshipit-source-id: acdc1281df2fe7f30b14cfad917cbbe8f3336d29
Summary:
Removes unused variables and functions and performs other minor mods sufficient to introduce `-Wall` as a default build flag. This should enhance code safety in the future.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59668
Test Plan: Sandcastle
Reviewed By: ngimel
Differential Revision: D28974453
fbshipit-source-id: 011c720dd6e65fdbbd87aa90bf57d67bfef32216
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58950
Use tensor iterator's API to set grain size in order to parallelize gelu op.
ghstack-source-id: 130947174
Test Plan: test_gelu
Reviewed By: ezyang
Differential Revision: D28689819
fbshipit-source-id: 0a02066d47a4d9648323c5ec27d7e0e91f4c303a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58949
To parallelize ops, the grain-size setting is exposed at the for_each level.
That is too deep in the stack: cpu_kernel_vec does not know what the
op is, yet you would want to parallelize an op depending on its type. Non-trivial
ops can benefit from threads even when the number of elements in the tensor is not high.
This API exposes the grain-size setting at the tensor iterator level so that the operator
creating the iterator has control over it.
ghstack-source-id: 130947175
Test Plan: CI + will add more test
Reviewed By: ezyang
Differential Revision: D26857523
fbshipit-source-id: 09fc2953061069967caa9c78b010cb1b68fcc6c9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59712
When a worker process fails internally due to a signal, the TerminationHandler writes an error reply file. Recently the error reply file was changed for MAST jobs. The JSON value of ``timestamp`` is a string, even though in the thrift struct it is an int: https://fburl.com/diffusion/upa228u5
This diff adds support for casting the string timestamp to an int.
Test Plan: buck test mode/dev-nosan //caffe2/test/distributed/elastic/multiprocessing/errors:api_test
Reviewed By: suphoff
Differential Revision: D28995827
fbshipit-source-id: 333448cfb4d062dc7fe751ef5839e66bfcb3ba00
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59663
This PR fixes an edge case bug in `DynamicRendezvousHandler` where the state of the rendezvous is not always entirely updated when one or more nodes are not alive anymore.
Test Plan: Run the existing and newly-introduced unit tests.
Reviewed By: tierex
Differential Revision: D28971809
fbshipit-source-id: ebbb6a5f2b04f045c3732d6cf0f8fdc7c2381a7c
Summary:
Switches most of the simple for loops outside of `jit` directories to use `c10::irange`.
Generated with D28874212.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59481
Test Plan: Sandcastle
Reviewed By: ngimel
Differential Revision: D28909681
fbshipit-source-id: ec9ab1bd602933238d9d0f73d4d8d027b75d9d85
Summary:
Make sure that tests explicitly run without TF32 don't use TF32 operations
Fixes https://github.com/pytorch/pytorch/issues/52278
After the TF32 accuracy tolerance was increased to 0.05, this is the only remaining change required to fix the above issue (for TestNN.test_Conv3d_1x1x1_no_bias_cuda)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59624
Reviewed By: heitorschueroff
Differential Revision: D28996279
Pulled By: ngimel
fbshipit-source-id: 7f1b165fd52cfa0898a89190055b7a4b0985573a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59657
Introduce tests that exercise the elastic agent with the c10d and etcd-v2 rendezvous backends.
Added a port allocation method that uses sockets to find an available port for the c10d backend. This way, agents that are created will all share the specified address/port and can communicate.
Added a method that abstracts the backend to use when running a test. This way, any tests can quickly be switched to run on the backend of choice (c10d, etcd, or etcd-v2)
Test Plan: Tests various components of the elastic agent with 3 different backends: etcd, etcd-v2, and c10d.
Reviewed By: tierex
Differential Revision: D28972604
fbshipit-source-id: fd4cff6417fefdf0de9d7a114820914b968006a8
Summary:
This is only important for builds where cuDNN is linked statically into libtorch_cpu.
Before this PR, PyTorch wheels often accidentally contained several partial copies of the cudnn_static library.
Splitting the interface into header-only (cudnn-public) and library+headers (cudnn-private) targets prevents that from happening.
Preliminary step towards enabling optional linking whole cudnn_library to workaround issue reported in https://github.com/pytorch/pytorch/issues/50153
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59721
Reviewed By: ngimel
Differential Revision: D29000967
Pulled By: malfet
fbshipit-source-id: f054df92b265e9494076ab16c247427b39da9336
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59435
Sometimes we need to compare 10+ digits. Currently tensorboard only saves float32. Provide an option to save float64
Reviewed By: yuguo68
Differential Revision: D28856352
fbshipit-source-id: 05d12e6f79b6237b3497b376d6665c9c38e03cf7
Summary:
Related issue: https://github.com/pytorch/pytorch/issues/58833
__changes__
- slowpath tests: pass tensors of every dtype & device and compare the behavior with the regular functions, including in-place variants
- check the number of `cudaLaunchKernel` calls
- rename `ForeachUnaryFuncInfo` -> `ForeachFuncInfo`: this change is mainly for the future binary/pointwise test refactors
cc: ngimel ptrblck mcarilli
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58960
Reviewed By: ejguan
Differential Revision: D28926135
Pulled By: ngimel
fbshipit-source-id: 4eb21dcebbffffaf79259e31961626e0707fb8d1
Summary:
Depends on https://github.com/pytorch/pytorch-probot/pull/22. Adds a new label called `ci/no-build` that disables the CircleCI `build` workflow on PRs. The current behavior should be the same in the absence of `ci/no-build`.
Specifically, after this PR lands, for anyone who isn't rebased onto the latest `master`, I believe this will happen:
- when they push to their PR, the CircleCI app triggers CI
- the `pytorch-probot` app sees that their PR doesn't have the `ci/no-build` tag, so it also triggers CI
- the latter should auto-cancel the former
After checking with https://github.com/pytorch/pytorch/issues/59087, it looks like this would cause the "errored" number to go up and then go down as Circle jobs are canceled (saying "Your CircleCI tests were canceled") and then restarted:
<img width="868" alt="Screen Shot 2021-05-27 at 12 39 20 PM" src="https://user-images.githubusercontent.com/8246041/119887123-9667b080-bee8-11eb-8acb-e1967899c9d5.png">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58778
Reviewed By: malfet
Differential Revision: D28995335
Pulled By: samestep
fbshipit-source-id: 8d7543b911e4bbbeef14639baf9d9108110b97c8
Summary:
Echo on https://github.com/pytorch/pytorch/pull/58260#discussion_r637467625
Similar to `test_unsupported_dtype`, which only checks that an exception is raised on the first sample, we should do the same thing for unsupported backward as well. The goal of both tests is to remind developers to
1. add a new dtype to the support list if it is fully runnable without failure (over all samples)
2. replace the skip mechanism, which would otherwise ignore tests indefinitely without warning
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59455
Test Plan: CI.
Reviewed By: mruberry
Differential Revision: D28927169
Pulled By: walterddr
fbshipit-source-id: 2993649fc17a925fa331e27c8ccdd9b24dd22c20
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59508
An assert that was triggering in a previous version is now relaxed to
take 0-dim tensors into account.
Test Plan: Imported from OSS
Reviewed By: bertmaher
Differential Revision: D28918342
Pulled By: ZolotukhinM
fbshipit-source-id: c09b62c9725d1603b0ec11fcc051e7c932af06ae
Summary:
Fixes https://github.com/pytorch/pytorch/issues/35379
- Adds `retains_grad` attribute backed by cpp as a native function. The python bindings for the function are skipped to be consistent with `is_leaf`.
- Tried writing it without native function, but the jit test `test_tensor_properties` seems to require that it be a native function (or alternatively maybe it could also work if we manually add a prim implementation?).
- Python API now uses `retain_grad` implementation from cpp (see the usage sketch below)
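A quick usage sketch of the new attribute (the values in the comments are illustrative output):
```
import torch

x = torch.randn(3, requires_grad=True)
y = x * 2                    # non-leaf intermediate tensor
print(y.retains_grad)        # False: .grad is not kept for non-leaves by default
y.retain_grad()
print(y.retains_grad)        # True
y.sum().backward()
print(y.grad)                # populated because retain_grad() was called
```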
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59362
Reviewed By: jbschlosser
Differential Revision: D28969298
Pulled By: soulitzer
fbshipit-source-id: 335f2be50b9fb870cd35dc72f7dadd6c8666cc02
Summary:
Fixes https://github.com/pytorch/pytorch/issues/4661
- Add warnings in engine's `execute` function so it can be triggered through both cpp and python codepaths
- Adds an RAII guard version of `c10::Warning::set_warnAlways` and replaces all prior usages of `set_warnAlways` with the new one
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59412
Reviewed By: jbschlosser
Differential Revision: D28969294
Pulled By: soulitzer
fbshipit-source-id: b03369c926a3be18ce1cf363b39edd82a14245f0
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).
New submodule commit: 77a4792062
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59505
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: dskhudia
Differential Revision: D28918331
fbshipit-source-id: def60efe55843023e70b94726cde1faf6857be0b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59360
Pull Request resolved: https://github.com/pytorch/kineto/pull/206
Replace ClientTraceActivity with GenericActivity.
In addition:
* Add a couple of new activity types for user annotations
* Simplify code for GPU-side user annotations
* Add accessor to containing trace span object in activities. Later we can replace this with a trace context / trace session object.
* Simplified MemoryTraceLogger
* Added early exit for cupti push/pop correlation ID
Reviewed By: ilia-cher
Differential Revision: D28231675
fbshipit-source-id: 7129f2493016efb4d3697094f24475e2c39e6e65
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59553
Added a test for 0x0 sparse coo input for sparse_unary_ufuncs.
This test fails for `conj` on master.
Modified `unsupportedTypes` for test_sparse_consistency: complex dtypes
pass, but float16 doesn't pass for `conj` because `to_dense()` doesn't
work with float16.
Fixes https://github.com/pytorch/pytorch/issues/59549
Test Plan: Imported from OSS
Reviewed By: jbschlosser
Differential Revision: D28968215
Pulled By: anjali411
fbshipit-source-id: 44e99f0ce4aa45b760d79995a021e6139f064fea
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59633
Fixes #59614
This fix isn't 100% correct but it appears to stem the bleeding.
A better fix would be understand how to detect when function
implementations don't uphold required invariants, leading to
refcount disaster.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: gchanan
Differential Revision: D28962183
Pulled By: ezyang
fbshipit-source-id: 6ec71994666289dadef47bac363e6902df90b094
Summary:
Make an assert message in Pytorch's JIT provide better information by
printing the debug name of a value in `PythonPrintImpl::useOf` if it's not
found in any tables.
Test Plan:
Tested printing a `module.code` where the module had an invalid value used
as an operand. Before it asserted without any more details, afterwards it
printed the debug name which made it easy to track down the offending value.
Reviewed By: SplitInfinity
Differential Revision: D28856026
fbshipit-source-id: 479f66c458a0a2d9a161ade09f20382e7b19d60e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59599
This will fix the flakiness for these tests internally when running under TSAN. We don't need multiprocessing since we should restrict the testing to the `wait_for_workers` and `world_size` parameters of the tcp store master store.
Test Plan: Imported from OSS
Reviewed By: mrshenli
Differential Revision: D28947838
Pulled By: H-Huang
fbshipit-source-id: d3e3904aa7ac81ae4c744a193a3b7167c2227bc8
Summary:
This fixes multiple bugs introduced by the VSX optimized code in https://github.com/pytorch/pytorch/pull/41541
- min/max/clamp now consistently return nan when any value is NaN as on other architectures
- The non-complex angle functions return PI for negative values now
- The complex angle functions have been corrected and optimized
- The float32 log function implementation returned a wrong result when inf was passed (and maybe for other inputs), so it was replaced by the Sleef function just as for float64
Fixes https://github.com/pytorch/pytorch/issues/59248
Fixes https://github.com/pytorch/pytorch/issues/57537
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59382
Reviewed By: jbschlosser
Differential Revision: D28944626
Pulled By: ezyang
fbshipit-source-id: 1ae2782b9e34e458a19cec90617037654279e0e0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59485
... when a variable is not allowed to require grad
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D28933808
fbshipit-source-id: ef3536049d3a4a2f6e2f4b1787f0c17763f5828c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59483
... for functions that are not implemented
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D28933806
fbshipit-source-id: dadae1af6609f15419cf0f47a98361dc87dff849
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59264
Previously, batchnorm 1d unsqueezed twice but only squeezed once before returning when the input
Tensor is 2-dimensional; this PR adds the missing squeeze
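The shape contract this restores, illustrated with the float `BatchNorm1d` module (a sketch only; the code path fixed here may be a different implementation of the same op):
```
import torch

bn = torch.nn.BatchNorm1d(4)
x = torch.randn(8, 4)          # 2-D input: (N, C)
out = bn(x)
print(out.shape)               # torch.Size([8, 4]) -- same rank as the input,
                               # i.e. the internal unsqueeze must be undone before returning
```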
Test Plan: Imported from OSS
Reviewed By: supriyar
Differential Revision: D28810597
fbshipit-source-id: 879873bbf39ed3607762684694f6e81b423740c2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59635
The diff corrects the launcher tests. The follow-up is to determine why the tests passed during the ``use_env`` removal diff
Test Plan: buck test mode/dev-tsan //caffe2/test/distributed/launcher:run_test -- --exact 'caffe2/test/distributed/launcher:run_test - test_launch_user_script_python_caffe2_bc (run_test.ElasticLaunchTest)' --run-disabled
Reviewed By: cbalioglu
Differential Revision: D28963813
fbshipit-source-id: a9f9b80787fb5c2f40a69ce31c8c2f3138654cad
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59523
Should use snake case instead of camel case for the consistency.
ghstack-source-id: 130759655
Test Plan: buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_grad_div_uneven_inputs
Reviewed By: cbalioglu
Differential Revision: D28922896
fbshipit-source-id: e04298284a78b2e71b562f790a878731962f873a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59576
If the gradients before allreduce are large, then the sum after allreduce may overflow, especially for FP16. Therefore, apply the division before allreduce.
This fix is applied to both C++ and Python comm hooks.
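A hedged sketch of the divide-before-allreduce pattern as a Python DDP comm hook; accessor names such as `get_tensor()` vary across releases, so this is illustrative rather than the landed implementation:
```
import torch.distributed as dist

def allreduce_with_div_first(process_group, bucket):
    # Divide by world size *before* the allreduce so that summing FP16
    # gradients across ranks cannot overflow.
    group = process_group if process_group is not None else dist.group.WORLD
    tensor = bucket.get_tensor() / group.size()
    fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()
    return fut.then(lambda f: f.value()[0])
```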
ghstack-source-id: 130754510
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_hook_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_default_ddp_comm_hooks_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_compress_wrapper_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_builtin_ddp_comm_hooks_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_default_ddp_comm_hooks_nccl_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_compress_wrapper_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_builtin_ddp_comm_hooks_nccl_grad_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl_grad_is_view
Reviewed By: rohan-varma
Differential Revision: D28941327
fbshipit-source-id: 932e8ddbdb2bfd609a78943f6dc390d3d6ca333f
Summary:
The run-specified-test-cases option would allow us to specify a list of test cases to run by having a CSV with minimally two columns: test_filename and test_case_name.
This PR also adds .json to some files we use for better clarity.
Usage:
`python test/run_test.py --run-specified-test-cases <csv_file>` where the csv file can look like:
```
test_filename,test_case_name,test_total_time,windows_only_failure_sha_count,total_sha_count,windows_failure_count,linux_failure_count,windows_total_count,linux_total_count
test_cuda,test_cudnn_multiple_threads_same_device,8068.8409659525,46,3768,53,0,2181,6750
test_utils,test_load_standalone,8308.8062920459,14,4630,65,0,2718,8729
test_ops,test_forward_mode_AD_acosh_cuda_complex128,91.652619369806,11,1971,26,1,1197,3825
test_ops,test_forward_mode_AD_acos_cuda_complex128,91.825633094915,11,1971,26,1,1197,3825
test_profiler,test_source,60.93786725749,9,4656,21,3,2742,8805
test_profiler,test_profiler_tracing,203.09352795241,9,4662,21,3,2737,8807
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59487
Test Plan:
Without specifying the option, everything should be as they were before.
Running `python test/run_test.py --run-specified-test-cases windows_smoke_tests.csv` resulted in this paste P420276949 (you can see internally). A snippet looks like:
```
(pytorch) janeyx@janeyx-mbp pytorch % python test/run_test.py --run-specified-test-cases windows_smoke_tests.csv
Loading specified test cases to run from windows_smoke_tests.csv.
Processed 28 test cases.
Running test_cpp_extensions_jit ... [2021-06-04 17:24:41.213644]
Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', 'test_cpp_extensions_jit.py', '-k', 'test_jit_cuda_archflags'] ... [2021-06-04 17:24:41.213781]
s
----------------------------------------------------------------------
Ran 1 test in 0.000s
OK (skipped=1)
...
```
With pytest, an example executable would be:
`Running test_dataloader ... [2021-06-04 17:37:57.643039]
Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', '-m', 'pytest', 'test_dataloader.py', '-v', '-k', 'test_segfault or test_timeout'] ... [2021-06-04 17:37:57.643327]`
Reviewed By: jbschlosser
Differential Revision: D28961233
Pulled By: janeyx99
fbshipit-source-id: 6b7ddc6e61856aa0002e1a0afc845770e4f8400b
Summary:
Pull Request: https://github.com/pytorch/pytorch/pull/59586
Task: https://www.internalfb.com/tasks/?t=90847711
**Overview:**
Suppose we have `n` items with positive integer sizes and `k` buckets. We want to assign items to buckets with the goal of uniformity. The precise criteria for uniformity can vary: e.g. minimize the maximum size, maximize the minimum size, etc. This is known as [multiway number partitioning](https://en.wikipedia.org/wiki/Multiway_number_partitioning). ZeRO's partitioning task reduces to solving this problem. In particular, this is the subproblem to be solved for each `param_group` in `self.param_groups`, where the parameters are the items and the ranks give the buckets.
The existing implementation uses the linear-time [greedy number partitioning algorithm](https://en.wikipedia.org/wiki/Greedy_number_partitioning#Linear-time_algorithm), which assigns the next tensor-parameter to the process with the smallest total parameter size so far. In this task, I explore the [extension](https://en.wikipedia.org/wiki/Greedy_number_partitioning#Improved_algorithm) where each parameter group is sorted by decreasing size before applying the greedy algorithm, requiring linearithmic time (as dominated by the sort).
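As a self-contained sketch of the two variants (plain Python over item sizes, not the `ZeroRedundancyOptimizer` code itself):
```
import heapq

def partition(sizes, world_size, sort_first):
    """Greedy number partitioning: give each item to the bucket with the
    smallest total so far; optionally sort items by decreasing size first."""
    if sort_first:
        sizes = sorted(sizes, reverse=True)
    buckets = [[] for _ in range(world_size)]
    heap = [(0, rank) for rank in range(world_size)]   # (total size, bucket index)
    heapq.heapify(heap)
    for size in sizes:
        total, rank = heapq.heappop(heap)
        buckets[rank].append(size)
        heapq.heappush(heap, (total + size, rank))
    return buckets

sizes = [1, 1, 1, 1, 4, 4, 4]
print([sum(b) for b in partition(sizes, 2, sort_first=False)])  # [10, 6]
print([sum(b) for b in partition(sizes, 2, sort_first=True)])   # [8, 8] -- more uniform
```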
**Experiments**
The mean number of parameters represents a perfectly uniform allocation and hence the ideal allocation (which may be even better than the optimal partition). In the following tables, I present the maximum number of parameters for any one process and the difference from the mean in parentheses for ResNet-50, ResNet-152, and BERT (the bare BERT model). The best-performing partitioning strategy for each model is bolded.
Two processes:
| Model | Max Num Params - Greedy (Diff) | Max Num Params - Greedy-Sorted (Diff) | Mean Num Params |
| --- | --- | --- | --- |
| ResNet-50 | 13,249,600 (471,084) | **12,794,816 (16,300)** | 12,778,516 |
| ResNet-152 | 30,567,488 (471,084) | **30,111,424 (15,020)** | 30,096,404 |
| BERT | **54,749,184 (8,064)** | 55,327,488 (586,368) | 54,741,120 |
Four processes:
| Model | Max Num Params - Greedy (Diff) | Max Num Params - Greedy-Sorted (Diff) | Mean Num Params |
| --- | --- | --- | --- |
| ResNet-50 | 7,524,864 (1,135,606) | **6,436,864 (47,606)** | 6,389,258 |
| ResNet-152 | 16,232,192 (1,183,990) | **15,090,152 (41,950)** | 15,048,202 |
| BERT | **28,151,040 (780,480)** | 28,352,256 (981,696) | 27,370,560 |
---
I also investigated the latency of `optimizer.step()` for the different partitioning algorithms. I measured the latency for 30 iterations and took the mean latency per process (excluding the first iteration due to cache coldness). In the following tables, I present the maximum of those mean latencies over all processes and the standard deviation of the latencies contributing to that maximum. Again, the best-performing partitioning strategy for each model is bolded. All entries are presented in seconds and used `gloo` backend.
Two processes:
| Model | Max `optimizer.step()` Time - Greedy (Std.) | Max `optimizer.step()` Time - Greedy-Sorted (Std.) |
| --- | --- | --- |
| ResNet-50 | **0.060 (0.002)** | 0.061 (0.002) |
| ResNet-152 | 0.166 (0.003) | **0.160 (0.004)** |
| BERT | 0.220 (0.009) | **0.199 (0.006)** |
Four processes:
| Model | Max `optimizer.step()` Time - Greedy | Max `optimizer.step()` Time - Greedy-Sorted |
| --- | --- | --- |
| ResNet-50 | 0.094 (0.004) | **0.093 (0.004)** |
| ResNet-152 | **0.228 (0.011)** | 0.231 (0.009) |
| BERT | **0.328 (0.015)** | 0.329 (0.021) |
Based on the standard deviations, the differences in the latency measurements across the different algorithms appear to be within the uncertainty in the measurement itself. Hence, it is difficult to argue that one algorithm is clearly the fastest.
---
`zero.py` is my experiment script, and I use the AI AWS cluster. The run command looks like:
```
srun -p $DEV_QUEUE --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4 python zero.py -b nccl greedy 2 4
```
This runs the experiment script on an instance with 4 GPUs using `nccl` backend, outputting to a directory named `greedy/`, and using world sizes of 2 and 4. An analogous command can be used after modifying `partition_parameters()`, e.g. replacing `greedy` with `greedy_sorted` as the output directory name. Then, to run the analysis script:
```
python analyze.py greedy greedy_sorted
```
For more details on the experiment code, refer to: https://www.internalfb.com/diff/D28946756
**Notes:**
There exists an optimal solution to this partitioning problem. An algorithm that finds such a solution is the [complete greedy algorithm (CGA)](https://en.wikipedia.org/wiki/Greedy_number_partitioning#An_exact_algorithm), which reduces to the brute-force combinatorial search in the worst case. There exist heuristics to improve the `k = 2` case (i.e. when there are two processes); however, given that `n` in typical use cases is very large, any algorithm that is quadratic or slower is unrealistic. Other exact algorithms are similarly exponential in the worst case, rendering them intractable. Given this, I do not currently see a need for future proofing the partitioning algorithm against the introduction of algorithms beyond the naive greedy and the sorted greedy algorithms.
---
In the current ZeRO implementation, the core `partition_parameters()` computation happens twice upon initialization (i.e. call to `__init__()`): first from a call to `_param_to_rank()` (i.e. an access to `_param_to_rank`) and then from a call to `_update_trainable()`. `_update_trainable()` sees that no optimizer has been constructed yet, so it clears the cache, eliminating the first `partition_parameters()` computation and performing a redundant re-computation.
Here is a typical trace:
- [The ZeRO optimizer object is initialized, calling `__init__()`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L142))
- [In `__init__()`, `self._device` is set, so it accesses `self._per_device_params`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L182))
- [`self._per_device_params` is not cached, so it accesses `self._param_to_rank`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L340))
- [`self._param_to_rank` is not cached, so it calls `partition_parameters()`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L353)) (first call to `partition_parameters()`)
- [`__init__()` later calls `_update_trainable()`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L185))
- [In `_update_trainable()`, `self` does not have `attr` `"optim"`, so it clears the cached objects (notably, `self._partition_parameters_cache`).](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L591))
- [`_update_trainable()` calls `self.partition_parameters()`.](d125694d0b/torch/distributed/optim/zero_redundancy_optimizer.py (L593)) (second call to `partition_parameters()`)
Based on the discussion [here](https://github.com/pytorch/pytorch/pull/59410), this recomputation is unintentional and should be addressed in a future diff.
Test Plan: I verified that the total number of parameters across the processes was consistent after the partitioning algorithm change. Otherwise, no additional modifications were made to existing tests.
Reviewed By: mrshenli
Differential Revision: D28946755
fbshipit-source-id: 7ad66a21a963555b3b2e693ba8069d2dddc94c60
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59107
Adding documentation, test coverage, and a missing method to the `DirectoryReader` class. `DirectoryReader` was previously named `MockZipReader`, and is used for operating on opened package archives via a `PackageImporter`.
Test Plan: Imported from OSS
Reviewed By: SplitInfinity
Differential Revision: D28760410
Pulled By: Lilyjjo
fbshipit-source-id: aa9d0a68e19738a6d5555bb04ce33af6a53f1268
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58938
When run `test_datapipe.py`, python `gc` would report lots of `ResourceWarning`s due to unclosed stream. It's not only annoying, there are two potential problems:
- Performance regression, because `gc` requires additional memory and computation to track references
- Python `gc` runs periodically, so we may encounter a "too many open files" error due to the OS limit
To reduce the warnings:
- Explicitly close byte stream
- Modify `test_datapipe.py` to use context manager
Small fix:
- Reorder import in `test_datapipe.py`
Further investigation:
Can we directly use context manager in `LoadFileFromDisk` and `ReadFileFromTar` to eliminate this Error?
- Probably not. It's feasible only if the pipeline is synchronous and without prefetching; when we enable these two features, the scope guard of the context manager doesn't work.
- We may need to implement some reference counter attached to these file byte streams so that they close themselves.
Test Plan: Imported from OSS
Reviewed By: jbschlosser
Differential Revision: D28689862
Pulled By: ejguan
fbshipit-source-id: bb2a85defb8a4ab5384db902ef6ad062185c2653
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59577
Collapse all dimensions of the tensor into the batch dimension and use 1 as the channel count. Fixes the 1D over-calculation case
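Conceptually, the shape bookkeeping looks like the following Python sketch (illustrative only; the actual change lives in the XNNPACK integration code):
```
import torch

x = torch.randn(8, 16, 32)            # arbitrary-rank input
batch, channels = x.numel(), 1        # every dimension folded into "batch", channels = 1
flat = x.reshape(batch, channels)     # the layout the backend kernel effectively sees
print(flat.shape)                     # torch.Size([4096, 1])
```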
Test Plan:
buck test fbandroid/mode/server fbandroid/mode/asan_ubsan fbsource//xplat/caffe2:pt_xnnpack_test
buck test fbsource//xplat/caffe2:pt_xnnpack_test
Reviewed By: kimishpatel
Differential Revision: D28942141
fbshipit-source-id: b36f820a900b6a2ed649d6b9bac79d3392d3537c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59096
RegisterBackendSelect brings ~100 extra ops into the runtime. This interferes with the compatibility API and also adds a nontrivial amount of binary size.
Test Plan: Model Unittests/CI
Reviewed By: iseeyuan
Differential Revision: D28588100
fbshipit-source-id: ffd0b5b9cbe20f27dbf3be418a6c1f80c7396fdb
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59573
To do mobile selective build, we have several options:
1. static dispatch;
2. dynamic dispatch + static analysis (to create the dependency graph);
3. dynamic dispatch + tracing;
We are developing 3. For open source, we used to only support 1, and
currently we support both 1 and 2.
This file is only used for 2. It was introduced when we deprecated
the static dispatch (1). The motivation was to make sure we have a
low-friction selective build workflow for dynamic dispatch (2).
As the name indicates, it is the *default* dependency graph that users
can try if they don't bother to run the static analyzer themselves.
We have a CI to run the full workflow of 2 on every PR, which creates
the dependency graph on-the-fly instead of using the committed file.
Since the workflow to automatically update the file has been broken
for a while, it started to confuse other pytorch developers as people
are already manually editing it, and it might be broken for some models
already.
We reintroduced the static dispatch recently, so we decide to deprecate
this file now and automatically turn on static dispatch if users run
selective build without providing the static analysis graph.
The tracing-based selective build will be the ultimate solution we'd
like to provide for OSS, but it will take some more effort to polish
and release.
Differential Revision: D28941020
Test Plan: Imported from OSS
Reviewed By: dhruvbird
Pulled By: ljk53
fbshipit-source-id: 9977ab8568e2cc1bdcdecd3d22e29547ef63889e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59541
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/621
Fixing 2 issues. These are actually 2 independent issues, one in Caffe2 and another in FBGEMM, so there is no need to wait until FBGEMM is synchronized with PyTorch
1) conv 16-bit accumulation doesn't support fast gconv path, so TakeGConvFastPath_ should honor it
2) packed_index_ generates indices up to (G/GTogether_) * F * R * S * OC_per_G * GTogether_ * paddedICPerG, which can exceed the G * kernel_prod * OC_per_G * paddedICPerG allocated in PackWeightMatrixForGConv (kernel_prod = F * R * S): e.g., when G=3, GTogether_=2, we allocate 3 * F * R * S * OC_per_G * paddedICPerG but we access up to 2 * F * R * S * OC_per_G * 2 * paddedICPerG
BTW, not sure how we haven't known about this issue for so long. Any idea will be really appreciated.
Test Plan:
In a BDW machine,
buck test //caffe2/caffe2/quantization/server:conv_groupwise_dnnlowp_acc16_op_test -- --run-disabled
Reviewed By: dskhudia
Differential Revision: D28927214
fbshipit-source-id: 3ec98ea2fc177545392a0148daca592d80f40ad3
Summary:
Do not reorder tests unless running in CI (IN_CI); reordering makes local development test ordering nondeterministic. Most of us branch out from viable/strict, not the head of master.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59565
Reviewed By: ejguan
Differential Revision: D28943906
Pulled By: walterddr
fbshipit-source-id: e742e7ce4b3fc017d7563b01e93c4cd774d0a537
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58925
Cleans up documentation on natively supported backends. In particular:
* adds a section title
* deduplicates information about fbgemm/qnnpack
* clarifies what `torch.backends.quantized.engine` does
* adds code samples with default settings for `fbgemm` and `qnnpack` (a sketch of such a sample follows below)
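A minimal sketch of the kind of sample being added, assuming eager-mode post-training quantization (the exact snippets in the docs may differ):
```
import torch

backend = "fbgemm"   # x86 servers; use "qnnpack" for ARM / mobile
torch.backends.quantized.engine = backend

model = torch.nn.Sequential(torch.nn.Linear(4, 4)).eval()
model.qconfig = torch.quantization.get_default_qconfig(backend)
prepared = torch.quantization.prepare(model)
prepared(torch.randn(2, 4))                       # calibration pass
quantized = torch.quantization.convert(prepared)
```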
Test Plan: Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D28681840
Pulled By: vkuzo
fbshipit-source-id: 51a6ab66934f657553351f6c84a638fd5f7b4e12
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59574
Remove the `work` attribute from the Reducer class in favor of `future_work`.
Additionally, remove the `copy_grad_to_bucket` method since it is now a one-line implementation, and create a new C++ comm hook called `_AllReduceCommHookWithDivFactor` to replace the allreduce and also support handling uneven input.
1) Compared with the reverted https://github.com/pytorch/pytorch/pull/58937, updated `_AllReduceCommHookWithDivFactor` in `default_comm_hooks.cpp` to apply division first and hence avoid FP16 overflow.
2) Compared with the reverted https://github.com/pytorch/pytorch/pull/59520, disabled `test_DistributedDataParallel_non_default_stream` on AMD, because now applying division first hurts the gradient averaging accuracy on AMD.
See [07:48:26]:
https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm4.2-py3.6-test1/1129/console
#Original PR Issue: https://github.com/pytorch/pytorch/issues/41266
ghstack-source-id: 130752393
Test Plan:
buck test caffe2/test/distributed:distributed_gloo_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_grad_div_uneven_inputs
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_grad_is_view
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_DistributedDataParallel_non_default_stream
Reviewed By: rohan-varma
Differential Revision: D28940800
fbshipit-source-id: 1ba727ac951ebc1e7875dc1a1be8108a2c8d9462
Summary:
Enable test stats upload on PR.
Uses PR number as part of the key so that it can be properly indexed and later parsed if PR has been merged/closed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59567
Reviewed By: ejguan
Differential Revision: D28943654
Pulled By: walterddr
fbshipit-source-id: f3a7a25ae14c6877067e1b347e3a8658d80d1544
Summary:
The run-specified-test-cases option would allow us to specify a list of test cases to run by having a CSV with minimally two columns: test_filename and test_case_name.
This PR also adds .json to some files we use for better clarity.
Usage:
`python test/run_test.py --run-specified-test-cases <csv_file>` where the csv file can look like:
```
test_filename,test_case_name,test_total_time,windows_only_failure_sha_count,total_sha_count,windows_failure_count,linux_failure_count,windows_total_count,linux_total_count
test_cuda,test_cudnn_multiple_threads_same_device,8068.8409659525,46,3768,53,0,2181,6750
test_utils,test_load_standalone,8308.8062920459,14,4630,65,0,2718,8729
test_ops,test_forward_mode_AD_acosh_cuda_complex128,91.652619369806,11,1971,26,1,1197,3825
test_ops,test_forward_mode_AD_acos_cuda_complex128,91.825633094915,11,1971,26,1,1197,3825
test_profiler,test_source,60.93786725749,9,4656,21,3,2742,8805
test_profiler,test_profiler_tracing,203.09352795241,9,4662,21,3,2737,8807
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59487
Test Plan:
Without specifying the option, everything should be as they were before.
Running `python test/run_test.py --run-specified-test-cases windows_smoke_tests.csv` resulted in this paste P420276949 (you can see internally). A snippet looks like:
```
(pytorch) janeyx@janeyx-mbp pytorch % python test/run_test.py --run-specified-test-cases windows_smoke_tests.csv
Loading specified test cases to run from windows_smoke_tests.csv.
Processed 28 test cases.
Running test_cpp_extensions_jit ... [2021-06-04 17:24:41.213644]
Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', 'test_cpp_extensions_jit.py', '-k', 'test_jit_cuda_archflags'] ... [2021-06-04 17:24:41.213781]
s
----------------------------------------------------------------------
Ran 1 test in 0.000s
OK (skipped=1)
...
```
With pytest, an example executable would be:
`Running test_dataloader ... [2021-06-04 17:37:57.643039]
Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', '-m', 'pytest', 'test_dataloader.py', '-v', '-k', 'test_segfault or test_timeout'] ... [2021-06-04 17:37:57.643327]`
Reviewed By: samestep
Differential Revision: D28913223
Pulled By: janeyx99
fbshipit-source-id: 0d1f9910973426b8756815c697b483160517b127
Summary:
It would be most accurate if sharding occurred after all other changes to selected_tests were complete.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59583
Reviewed By: ejguan
Differential Revision: D28944737
Pulled By: janeyx99
fbshipit-source-id: a851473948a5ec942ffeeedeefdc645536a3d9f7
Summary:
This PR greatly simplifies `mypy-strict.ini` by strictly typing everything in `.github` and `tools`, rather than picking and choosing only specific files in those two dirs. It also removes `warn_unused_ignores` from `mypy-strict.ini`, for reasons described in https://github.com/pytorch/pytorch/pull/56402#issuecomment-822743795: basically, that setting makes life more difficult depending on what libraries you have installed locally vs in CI (e.g. `ruamel`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59117
Test Plan:
```
flake8
mypy --config mypy-strict.ini
```
Reviewed By: malfet
Differential Revision: D28765386
Pulled By: samestep
fbshipit-source-id: 3e744e301c7a464f8a2a2428fcdbad534e231f2e
Summary:
Closes https://github.com/pytorch/pytorch/issues/51455
I think the current implementation is aggregating over the correct dimensions. The shape of `normalized_shape` is only used to determine the dimensions to aggregate over. The actual values of `normalized_shape` are used when `elementwise_affine=True` to initialize the weights and biases.
This PR updates the docstring to clarify how `normalized_shape` is used. Here is a short script comparing the implementations for tensorflow and pytorch:
```python
import numpy as np
import torch
import torch.nn as nn
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization
rng = np.random.RandomState()
x = rng.randn(10, 20, 64, 64).astype(np.float32)
# slightly non-trival
x[:, :10, ...] = x[:, :10, ...] * 10 + 20
x[:, 10:, ...] = x[:, 10:, ...] * 30 - 100
# Tensorflow Layer norm
x_tf = tf.convert_to_tensor(x)
layer_norm_tf = LayerNormalization(axis=[-3, -2, -1], epsilon=1e-5)
output_tf = layer_norm_tf(x_tf)
output_tf_np = output_tf.numpy()
# PyTorch Layer norm
x_torch = torch.as_tensor(x)
layer_norm_torch = nn.LayerNorm([20, 64, 64], elementwise_affine=False)
output_torch = layer_norm_torch(x_torch)
output_torch_np = output_torch.detach().numpy()
# check tensorflow and pytorch
torch.testing.assert_allclose(output_tf_np, output_torch_np)
# manual comutation
manual_output = ((x_torch - x_torch.mean(dim=(-3, -2, -1), keepdims=True)) /
(x_torch.var(dim=(-3, -2, -1), keepdims=True, unbiased=False) + 1e-5).sqrt())
torch.testing.assert_allclose(output_torch, manual_output)
```
To get to the layer normalization as shown here:
<img width="157" alt="Screen Shot 2021-05-29 at 2 13 52 PM" src="https://user-images.githubusercontent.com/5402633/120080691-1e37f100-c088-11eb-9060-4f263e4cd093.png">
One needs to pass in a `normalized_shape` of length `x.dim() - 1`, containing the channel size and all spatial dimensions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59178
Reviewed By: ejguan
Differential Revision: D28931877
Pulled By: jbschlosser
fbshipit-source-id: 193e05205b9085bb190c221428c96d2ca29f2a70
Summary:
- TORCH_CHECK doesn't handle printf-style format strings, so it would output something like `got %ld tensors and %ld gradients21`
- `got 2 tensors and 1 gradients` is the expected message here
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59532
Reviewed By: ejguan
Differential Revision: D28934680
Pulled By: albanD
fbshipit-source-id: 2d27a754ae81310b9571ae2a2ea09d0f8d8a3d81
Summary:
This adds a comment above `should_drop` to prevent someone from inadvertently breaking JIT coverage by renaming the function without updating the correct references.
The current JIT plug-in uses `should_drop` to figure out which code is going to be JIT'd. If the function is named differently, the plug-in would also need to be updated.
Question: I understand this may not be the cleanest solution. Would a cleaner solution be to create a dummy function that would simply exist for the JIT plug-in? I did not immediately do that as that may be adding unnecessary code complexity in torch.jit.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57961
Reviewed By: samestep
Differential Revision: D28933587
Pulled By: janeyx99
fbshipit-source-id: 260aaf7b11f07de84a81d6c3554c4a5ce479d623
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59333
Code comment should explain this in sufficient detail. In brief, making it 16 bytes should get it to be passed in registers.
ghstack-source-id: 130631329
Test Plan: Updated optional_test and added static_assert in Optional.cpp.
Reviewed By: ezyang
Differential Revision: D28843027
fbshipit-source-id: 3029f05e03a9f04ca7337962e7770cdeb9a608d9
Summary: Implemented two observers (InputEqualObserver and WeightEqualObserver) which will be inserted into the graph during prepare_fx().
Test Plan: python test/test_quantization.py TestEqualizeFx
Reviewed By: supriyar
Differential Revision: D28836954
fbshipit-source-id: 25517dc82ae67698ed8b2dc334e3323286976104
Summary:
As per title. Resolves https://github.com/pytorch/pytorch/issues/56683.
`gradgradcheck` will fail once `target.requires_grad() == True` because of the limitations of the current double backward implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59447
Reviewed By: agolynski
Differential Revision: D28910140
Pulled By: albanD
fbshipit-source-id: 20934880eb4d22bec34446a6d1be0a38ef95edc7
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59522
If the gradients before allreduce are large, then the sum after allreduce may overflow, especially for FP16. Therefore, apply the division before allreduce.
This fix is applied to both C++ and Python comm hooks.
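For intuition only (not part of this diff), a minimal standalone sketch of why dividing before the reduction matters for FP16; the allreduce sum is emulated here with a plain stacked sum:
```python
import torch

world_size = 64
# Each rank holds an FP16 gradient of value 2000; FP16 overflows above ~65504.
grads = [torch.full((4,), 2000.0, dtype=torch.float16) for _ in range(world_size)]

sum_then_divide = torch.stack(grads).sum(0) / world_size               # sum reaches 128000 -> inf
divide_then_sum = torch.stack([g / world_size for g in grads]).sum(0)  # stays finite

print(sum_then_divide)  # tensor([inf, inf, inf, inf], dtype=torch.float16)
print(divide_then_sum)  # tensor([2000., 2000., 2000., 2000.], dtype=torch.float16)
```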
ghstack-source-id: 130686229
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_hook_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_default_ddp_comm_hooks_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_compress_wrapper_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_builtin_ddp_comm_hooks_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_default_ddp_comm_hooks_nccl_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_compress_wrapper_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_builtin_ddp_comm_hooks_nccl_grad_is_view
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl_grad_is_view
Reviewed By: rohan-varma
Differential Revision: D28922548
fbshipit-source-id: 442bd3cc7a35a8b948f626062fa7ad2e3704c5be
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58730
The sources for the profilers are not needed in the mobile build, and unnecessarily add weight to the build. Remove them from the lite-interpreter build.
ghstack-source-id: 130684568
Test Plan: Build + BSB
Reviewed By: kimishpatel, raziel
Differential Revision: D28563725
fbshipit-source-id: 9d6f76176c2d2bbc25703281af1a076b1f2b4f19
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59520
Remove `work` attribute from Reducer class in favor of `future_work`.
Additionally, remove the `copy_grad_to_bucket` method since it's now only a one-line implementation, and create a new C++ comm hook called `_AllReduceCommHookWithDivFactor` to replace allreduce and also support handling uneven inputs.
Compared with the reverted https://github.com/pytorch/pytorch/pull/58937, updated `_AllReduceCommHookWithDivFactor` in `default_comm_hooks.cpp` to apply division first and hence avoid FP16 overflow.
#Original PR Issue: https://github.com/pytorch/pytorch/issues/41266
ghstack-source-id: 130685351
Test Plan:
buck test caffe2/test/distributed:distributed_gloo_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_grad_div_uneven_inputs
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16
buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_grad_is_view
Reviewed By: walterddr
Differential Revision: D28922305
fbshipit-source-id: 6388a96eda7a06f292873afed6d1362096c13e1c
Summary:
This is somewhat more verbose, but it's more correct and addresses this warning on Visual Studio 2017:
```
xplat\caffe2\caffe2\core\common.h(76): warning C4067: unexpected tokens following preprocessor directive - expected a newline
```
Test Plan: Built locally with fix
Reviewed By: simpkins
Differential Revision: D28868632
fbshipit-source-id: f6a583e8275162adedb2a4bc5ed0f64847020871
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58331
This PR is the final part of a stack that addresses the GitHub issue #41614; it introduces the multi-tenancy feature to the `TCPStore` class allowing two server stores to be instantiated with the same host:port pair.
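For illustration (not from this diff), a minimal sketch of the new behavior; the positional arguments follow the existing `torch.distributed.TCPStore(host, port, world_size, is_master, ...)` constructor, and the `multi_tenant` keyword spelling is an assumption about how the C++ `multiTenant` option is surfaced in Python:
```python
import torch.distributed as dist

# Two *server* stores bound to the same host:port pair; without multi-tenancy
# the second construction would fail with an "address already in use" error.
store_a = dist.TCPStore("localhost", 29500, 1, True, multi_tenant=True)
store_b = dist.TCPStore("localhost", 29500, 1, True, multi_tenant=True)

store_a.set("key", "value")
print(store_b.get("key"))  # b'value' -- both handles talk to the same underlying server
```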
ghstack-source-id: 130676394
Test Plan:
- Run the existing and newly-introduced tests.
- Run several smoke tests including the short code snippet referred in GitHub issue #41614.
Reviewed By: H-Huang
Differential Revision: D28453850
fbshipit-source-id: f9066b164305de0f8c257e9d5736e93fd7e21ec6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58330
This PR is part of a stack that addresses the GitHub issue #41614; it introduces a major refactoring of the `TCPStore` class in preparation of the multi-tenancy feature.
- All TCP sockets are wrapped with a new `TCPSocket` RAII type.
- `BackgroundThread` and daemon types are moved from header to cpp file.
- Server, client, and callback sockets are refactored into their own internal types `TCPServer`, `TCPClient` and `TCPCallbackClient`.
- Calls to `tcputil::send*` and `tcputil::recv*` are wrapped in `TCPClient` for easier readability and maintenance purposes.
- Two `TODO` statements are put to reference future improvements. Based on feedback, I will either create separate GitHub issues for them or address them as part of this stack.
ghstack-source-id: 130676392
Test Plan: Run the existing tests since there are no user-facing behavioral changes.
Reviewed By: H-Huang
Differential Revision: D28448981
fbshipit-source-id: 415b21e74b3cd51d673c1d5c349c6a2cb21dd667
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58329
This PR is part of a stack that addresses the GitHub issue #41614; it introduces:
- A new `multiTenant` constructor option for the `TCPStore` class indicating whether multiple store instances can be initialized with the same host:port pair.
- Updates to the C10d distributed (elastic) rendezvous and the `init_process_group` method to leverage the new `multiTenant` feature.
Note that the multi-tenancy feature itself is implemented in the fourth PR of this stack. In this PR passing `true` to `multiTenant` results only with a warning output.
ghstack-source-id: 130676389
Test Plan: Run the existing tests since there are no behavioral changes.
Reviewed By: rohan-varma
Differential Revision: D28424978
fbshipit-source-id: fb1d1d81b8b5884cc5b54486700a8182a69c1f29
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58328
This PR is part of a stack that addresses the GitHub issue #41614; it introduces a new `TCPStore` constructor that takes its optional parameters via a newly introduced `TCPStoreOptions` structure. This gives the API callers the flexibility to specify only the desired options while skipping the rest.
The main motivation behind this change is the introduction of the `multiTenant` constructor option in the second PR of this stack.
ghstack-source-id: 130676384
Test Plan: Run the existing tests since there are no behavioral changes.
Reviewed By: H-Huang
Differential Revision: D28417742
fbshipit-source-id: e6ac2a057f7ad1908581176ee6d2c2554c3c74a9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58937
Remove `work` attribute from Reducer class in favor of `future_work`.
Additionally, remove the `copy_grad_to_bucket` method since it's now only a one-line implementation, and create a new C++ comm hook called `_AllReduceCommHookWithDivFactor` to replace allreduce and also support handling uneven inputs.
#Original PR Issue: https://github.com/pytorch/pytorch/issues/41266
ghstack-source-id: 130673249
Test Plan:
buck test caffe2/test/distributed:distributed_gloo_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_accumulate_gradients_no_sync
buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_grad_div_uneven_inputs
Reviewed By: agolynski
Differential Revision: D28677383
fbshipit-source-id: 85e0620378b7e9d837e436e94b9d807631d7d752
Summary:
Implements an idea by ngimel to improve the performance of `torch.flip` via a clever hack into TI to bypass the fact that TI is not designed to work with negative indices.
Something that might be added is vectorisation support on CPU, given how simple the implementation is now.
Some low-hanging fruits that I did not implement:
- Write it as a structured kernel
- Migrate the tests to opinfos
- Have a look at `cumsum_backward` and `cumprod_backward`, as I think that they could be implemented faster with `flip`, now that `flip` is fast.
**Edit**
This operation already has OpInfos and it cannot be migrated to a structured kernel because it implements quantisation
Summary of the PR:
- x1.5-3 performance boost on CPU
- x1.5-2 performance boost on CUDA
- Comparable performance across dimensions, regardless of the strides (thanks TI)
- Simpler code
<details>
<summary>
Test Script
</summary>
```python
from itertools import product
import torch
from torch.utils.benchmark import Compare, Timer
def get_timer(size, dims, num_threads, device):
x = torch.rand(*size, device=device)
timer = Timer(
"torch.flip(x, dims=dims)",
globals={"x": x, "dims": dims},
label=f"Flip {device}",
description=f"dims: {dims}",
sub_label=f"size: {size}",
num_threads=num_threads,
)
return timer.blocked_autorange(min_run_time=5)
def get_params():
sizes = ((1000,)*2, (1000,)*3, (10000,)*2)
for size, device in product(sizes, ("cpu", "cuda")):
threads = (1, 2, 4) if device == "cpu" else (1,)
list_dims = [(0,), (1,), (0, 1)]
if len(size) == 3:
list_dims.append((0, 2))
for num_threads, dims in product(threads, list_dims):
yield size, dims, num_threads, device
def compare():
compare = Compare([get_timer(*params) for params in get_params()])
compare.trim_significant_figures()
compare.colorize()
compare.print()
compare()
```
</details>
<details>
<summary>
Benchmark PR
</summary>

</details>
<details>
<summary>
Benchmark master
</summary>

</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58747
Reviewed By: agolynski
Differential Revision: D28877076
Pulled By: ngimel
fbshipit-source-id: 4fa6eb519085950176cb3a9161eeb3b6289ec575
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57661
The Pickle "specification" (pickletools.py) states that the argument to
a BINUNICODE opcode must be UTF-8 encoded. However, if a PyTorch custom
class returns a non-UTF-8 std::string from its pickle method the
libtorch Pickler will write it to the output pickle without complaining.
Python's _Unpickler (the Python implementation of Unpickler) always
throws an exception when trying to deserialize these invalid pickles.
We still want to be able to dump these pickle files. Update
DumpUnpickler to create its own opcode dispatch table (initialized as a
clone of the _Unpickler dispatch table) and patch in a custom function
for the BINUNICODE op. We try to emulate the default behavior, but any
UnicodeDecodeError is caught and replaced with a dummy object. This
could violate the assumptions of a user that expects a str in that
position, so we disable this behavior by default.
Update model_dump to recognize this special object and allow it to be
rendered.
Test Plan: Dumped and viewed a model with an invalid string in an object state.
Reviewed By: malfet
Differential Revision: D28531392
Pulled By: dreiss
fbshipit-source-id: ab5aea20975a0ef53ef52a880deaa2c5a626e4a2
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/kineto](https://github.com/pytorch/kineto).
New submodule commit: 88e3332ab9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54604
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: malfet
Differential Revision: D27297755
fbshipit-source-id: 5f5dd2429fb561530e6a59285c6ae708e5818ce9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59472
Previously, the lite interpreter would refuse to load any model
with a version greater than kProducedBytecodeVersion. Now, we're
able to independently advance the loading and saving code, so we
can roll out changes without breaking forward compatibility.
Test Plan:
CI.
Loaded a bytecode v5 model even with setting kProducedBytecodeVersion
to v4.
Reviewed By: raziel
Differential Revision: D28904350
fbshipit-source-id: 598c22f0adf47d4ed3e976bcbebdf3959dacb1df
Summary:
After the change async error warnings look as follows:
```
$ python -c "import torch;torch.eye(3,3,device='cuda:777')"
Traceback (most recent call last):
File "<string>", line 1, in <module>
RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59467
Reviewed By: ngimel
Differential Revision: D28904360
Pulled By: malfet
fbshipit-source-id: 2a8fa5affed5b4ffcaa602c8ab2669061cde7db0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59430
With constant support added, we can now have fusion groups with only
scalar inputs. So, we need to get the device type from the nodes in the graph
rather than just the inputs.
ghstack-source-id: 130613871
Test Plan: new unit test; also see test_tracer test_trace_of_script
Reviewed By: navahgar
Differential Revision: D28891989
fbshipit-source-id: f9e824acbd4856216b85a135c8cb60a2eac3c628
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59473
Switches windows instance types to prep for usage of the AWS built
Windows AMI with pre-installed Tesla Driver.
Unfortunately, neither c5.4xlarge nor g3.8xlarge is supported by this
AMI, but luckily we can swap those out for pretty comparable
alternatives like c5d.4xlarge and p3.2xlarge.
For CPU workflows this shouldn't provide any real difference since the
CPU / Memory is the same with c5d.4xlarge. For GPU workflows the GPU
with the p3.2xlarge is a Tesla V100 which should suit our needs.
<details>
<summary> nvidia-smi.exe (p3.2xlarge) </summary>
```
PS C:\Users\Administrator> & 'C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe'
Fri Jun 4 18:53:10 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31 Driver Version: 462.31 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... TCC | 00000000:00:1E.0 Off | 0 |
| N/A 42C P0 23W / 300W | 0MiB / 16258MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
```
</details>
It might eventually make sense to also switch linux to these instance types but do bear in mind that p3.2xlarge for linux is ~$0.75 more expensive than g3.8xlarge
* [Price comparison for g3.8xlarge vs. p3.2xlarge](https://instances.vantage.sh/?compare_on=true&selected=p3.2xlarge,g3.8xlarge)
* [Price comparison for c5.4xlarge vs. c5d.4xlarge](https://instances.vantage.sh/?compare_on=true&selected=c5.4xlarge,c5d.4xlarge)
AMI that I'm planning on using as the new base AMI with included Tesla driver: https://aws.amazon.com/marketplace/pp/prodview-jrxucanuabmfm?qid=1622830809415&sr=0-2&ref_=srh_res_product_title#pdp-pricing
Info about c5 instances can be found here: https://aws.amazon.com/ec2/instance-types/c5/
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: agolynski
Differential Revision: D28913659
Pulled By: seemethere
fbshipit-source-id: 11b4d332e82b078a6801b312dc4ace2928838fc8
Summary:
**Overview:**
This consolidates `c10d` and `dist` to only `dist` as the alias for `torch.distributed` in `test_store.py`. Both aliases were used most likely due to incremental additions to the test file and not intentional.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59456
Test Plan:
```
python test/distributed/test_store.py
```
Reviewed By: agolynski
Differential Revision: D28910169
Pulled By: andwgu
fbshipit-source-id: f830dead29e9de48aaf2845dfa5861c9cccec15d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59459
For any binary that can be used both with and without cuda, it's better to allow just including the cuda flavor of the interpreter. The previous logic would fail in this case, as it only allows using the cuda flavor if torch::cuda::is_available() reports true. Now, we unconditionally allow the cuda flavor to be used if it's present.
Test Plan: Added new unit test to exercise this scenario, ran locally on devvm without cuda.
Reviewed By: dzhulgakov
Differential Revision: D28902176
fbshipit-source-id: 5c7c90d84987848471bb6dd5318db15314e0b442
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59355
Add a `CheckKnob()` function for doing run-time checks of feature roll-out
knobs. This provides an API for safely controlling the roll-out of new
functionality in the code.
Test Plan: Included some basic unit tests.
Reviewed By: voznesenskym
Differential Revision: D26536430
fbshipit-source-id: 2e53234c6d9ce624848fc8b2c76f6833f344f48b
Summary:
Testing to see if checking out submodules during the build phase will help.
This tentatively addresses https://github.com/pytorch/pytorch/issues/58867, but since the repro is not reliable we can't be sure.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59450
Reviewed By: malfet
Differential Revision: D28908537
Pulled By: walterddr
fbshipit-source-id: 21ad1392a5066554b5c633f31616ab3e6541c54d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54987
Based off of ezyang (https://github.com/pytorch/pytorch/pull/44799) and bdhirsh (https://github.com/pytorch/pytorch/pull/43702) 's prototype:
Here's a summary of the changes in this PR:
This PR adds a new dispatch key called Conjugate. This enables us to make conjugate operation a view and leverage the specialized library functions that fast path with the hermitian operation (conj + transpose).
1. Conjugate operation will now return a view with conj bit (1) for complex tensors and returns self for non-complex tensors as before. This also means `torch.view_as_real` will no longer be a view on conjugated complex tensors and is hence disabled. To fill the gap, we have added `torch.view_as_real_physical` which would return the real tensor agnostic of the conjugate bit on the input complex tensor. The information about conjugation on the old tensor can be obtained by calling `.is_conj()` on the new tensor.
2. NEW API (illustrated in the short sketch below):
a) `.conj()` -- now returning a view.
b) `.conj_physical()` -- does the physical conjugate operation. If the conj bit for input was set, you'd get `self.clone()`, else you'll get a new tensor with conjugated value in its memory.
c) `.conj_physical_()`, and `out=` variant
d) `.resolve_conj()` -- materializes the conjugation. returns self if the conj bit is unset, else returns a new tensor with conjugated values and conj bit set to 0.
e) `.resolve_conj_()` in-place version of (d)
f) `view_as_real_physical` -- as described in (1), it's functionally same as `view_as_real`, just that it doesn't error out on conjugated tensors.
g) `view_as_real` -- existing function, but now errors out on conjugated tensors.
3. Conjugate Fallback
a) Vast majority of PyTorch functions would currently use this fallback when they are called on a conjugated tensor.
b) This fallback is well equipped to handle the following cases:
- functional operation e.g., `torch.sin(input)`
- Mutable inputs and in-place operations e.g., `tensor.add_(2)`
- out-of-place operation e.g., `torch.sin(input, out=out)`
- Tensorlist input args
- NOTE: Meta tensors don't work with conjugate fallback.
4. Autograd
a) `resolve_conj()` is an identity function w.r.t. autograd
b) Everything else works as expected.
5. Testing:
a) All method_tests run with conjugate view tensors.
b) OpInfo tests that run with conjugate views
- test_variant_consistency_eager/jit
- gradcheck, gradgradcheck
- test_conj_views (that only run for `torch.cfloat` dtype)
NOTE: functions like `empty_like`, `zeros_like`, `randn_like`, `clone` don't propagate the conjugate bit.
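A short sketch (not part of this PR text) exercising the API surface listed under (2) above, assuming the semantics described there:
```python
import torch

x = torch.tensor([1 + 2j, 3 - 4j])
y = x.conj()                  # now a view: no copy, only the conj bit is set
print(y.is_conj())            # True
z = y.resolve_conj()          # materializes the conjugated values; conj bit cleared
print(z.is_conj())            # False
w = x.conj_physical()         # eager conjugation: new tensor, conj bit unset
print(torch.equal(z, w))      # True
```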
Follow up work:
1. conjugate view RFC
2. Add neg bit to re-enable view operation on conjugated tensors
3. Update linalg functions to call into specialized functions that fast path with the hermitian operation.
Test Plan: Imported from OSS
Reviewed By: VitalyFedyunin
Differential Revision: D28227315
Pulled By: anjali411
fbshipit-source-id: acab9402b9d6a970c6d512809b627a290c8def5f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59279
There were some issues with how we handle 0-dim cases in lowerings and
also in how we generate reductions in that special case. This PR fixes
those issues and reenables a bunch of tests.
Differential Revision: D28819780
Test Plan: Imported from OSS
Reviewed By: navahgar
Pulled By: ZolotukhinM
fbshipit-source-id: f3feff35a1ce11821ada2f8d04ae9d4be10dc736
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59268
There's no reason we can't give `convert` this signature: `Tensor::unsafeGetTensorImpl() const` returns a non-const TensorImpl pointer. (See https://github.com/zdevito/ATen/issues/27#issuecomment-330717839)
ghstack-source-id: 130548716
Test Plan: CI
Reviewed By: SS-JIA
Differential Revision: D28811477
fbshipit-source-id: 269f58980c1f68b29d4be3cba4cd340299ce39af
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59469
Clarify that using record_shapes=True may cause extra tensor copies.
Test Plan: Imported from OSS
Reviewed By: ngimel
Differential Revision: D28905089
Pulled By: ilia-cher
fbshipit-source-id: 7642cb16f6697b6d255a2b82348d4c17486680d0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59460
Original commit changeset: 6e01a96d3746
Test Plan: Verify new tests run in sandcastle and existing CI is OK
Reviewed By: H-Huang
Differential Revision: D28900869
fbshipit-source-id: a8962ec48c66bba3b4b8f001ece7231953b29e82
Summary:
sample_inputs_diff constructs all five positional arguments for [diff ](https://pytorch.org/docs/stable/generated/torch.diff.html) but uses only the first three. This doesn't seem to be intentional.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59181
Test Plan: This change expands coverage of diff's OpInfo sample inputs. Related tests still pass.
Reviewed By: mruberry
Differential Revision: D28878359
Pulled By: saketh-are
fbshipit-source-id: 1466f6c6c341490885c85bc6271ad8b3bcdf3a3e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59409
Remove use_env from torch.distributed.run, and clarify the backward-compatibility story around that parameter in a comment.
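For context (not part of this diff), scripts launched with torch.distributed.run read the local rank from the environment rather than from a `--local_rank` argument, e.g.:
```python
import os
import torch

local_rank = int(os.environ["LOCAL_RANK"])  # set by torch.distributed.run for each worker
device = torch.device("cuda", local_rank) if torch.cuda.is_available() else torch.device("cpu")
```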
Test Plan: n/a
Reviewed By: cbalioglu
Differential Revision: D28876485
fbshipit-source-id: 5f10365968d204985ce517b83c392c688995d76e
Summary:
Adds `is_inference` as a native function w/ manual cpp bindings.
Also changes instances of `is_inference_tensor` to `is_inference` to be consistent with other properties such as `is_complex`.
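A quick sketch (not from this PR) of the renamed property in use:
```python
import torch

with torch.inference_mode():
    t = torch.ones(3)

print(t.is_inference())              # True: created under inference mode
print(torch.ones(3).is_inference())  # False
```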
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58729
Reviewed By: mruberry
Differential Revision: D28874507
Pulled By: soulitzer
fbshipit-source-id: 0fa6bcdc72a4ae444705e2e0f3c416c1b28dadc7
Summary:
Context https://github.com/pytorch/pytorch/issues/58545
The logic is that we are going to keep it consistent for both
torch.randperm and torch.randint
1. Generators can have either a fully-specified or a non-fully-specified device
2. As long as the generator's device type matches the result's, we don't error out
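A small sketch of the resulting rule (assumes a CUDA build; only the device *type* of the generator has to match the result's device):
```python
import torch

g = torch.Generator(device="cuda")                        # non-fully-specified device (no index)
torch.randperm(8, generator=g, device="cuda:0")           # OK: device types match
torch.randint(0, 10, (4,), generator=g, device="cuda:0")  # OK as well
# torch.randperm(8, generator=g, device="cpu")            # would raise: device types differ
```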
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59352
Test Plan:
```
python test/test_tensor_creation_ops.py -k TestRandomTensorCreation
```
Reviewed By: ngimel
Differential Revision: D28855920
Pulled By: zhouzhuojie
fbshipit-source-id: f8141a2c4b2f177e1aa7baec6999b65916cba02c
Summary:
This is required in https://github.com/pytorch/pytorch/pull/57110#issuecomment-828357947
We need to provide means to synchronize on externally allocated streams for dlpack support in python array data api.
cc mruberry rgommers leofang asi1024 kmaehashi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57781
Reviewed By: mrshenli
Differential Revision: D28326365
Pulled By: ezyang
fbshipit-source-id: b67858c8033949951b49a3d319f649884dfd0a91
Summary:
Before this change, only a dynamically linked OpenBLAS compiled with OpenMP could
be found.
Also get rid of the hardcoded code path for libgfortran.a in FindLAPACK.cmake.
Only affects aarch64 linux builds
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59428
Reviewed By: agolynski
Differential Revision: D28891314
Pulled By: malfet
fbshipit-source-id: 5af55a14c85ac66551ad2805c5716bbefe8d55b2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59413
For CUDA 10.2 builds linked with the gold linker we were observing
crashes when exceptions were being raised
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: malfet
Differential Revision: D28888054
Pulled By: seemethere
fbshipit-source-id: f9b38147591721803ed3cac607510fe5bbc49d6d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59375
The CPU and CUDA channels used to be separate classes in TensorPipe, but they recently got merged in order to support cross-device-type channels. We used to need two separate registries in PyTorch, but now we can merge them. This simplifies some registration logic, and will help in future PRs.
ghstack-source-id: 130583770
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28796427
fbshipit-source-id: b7db983293cbbddd1aedec6428de08d8944b0344
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59299
After recent changes, LazyStreamContext had effectively become always eager and was equivalent to a vector of streams. So it makes more sense now to remove this abstraction and use a more self-descriptive type.
This PR migrates the TensorPipe agent. The previous PR migrated the RequestCallback internals.
ghstack-source-id: 130583773
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28789174
fbshipit-source-id: a27d2b1f40ab3cf2ac0dd946232fd0eecda6d450
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59298
After recent changes, LazyStreamContext had effectively become always eager and was equivalent to a vector of streams. So it makes more sense now to remove this abstraction and use a more self-descriptive type.
This PR migrates the RequestCallback internals. The next PR migrates the TensorPipe agent.
ghstack-source-id: 130583774
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28789175
fbshipit-source-id: fa581a50f9a6a1e42c2ad8c808a9b099bea7433e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59297
PyTorch requires users to manually record tensors with the CUDA caching allocator when switching streams. We weren't doing it.
Also, the usage of an Event can be simplified by using `s1.wait(s2)`.
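For reference, a minimal Python sketch of the bookkeeping described above (the actual fix is in the agent's C++ code):
```python
import torch

side_stream = torch.cuda.Stream()
x = torch.empty(1024, device="cuda")  # allocated on the current (default) stream

with torch.cuda.stream(side_stream):
    y = x * 2

# Tell the caching allocator that x is used on side_stream, so its memory is not
# recycled by the default stream before side_stream has finished with it.
x.record_stream(side_stream)
# The `s1.wait(s2)` simplification corresponds to wait_stream in Python:
torch.cuda.current_stream().wait_stream(side_stream)
```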
ghstack-source-id: 130583777
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28832902
fbshipit-source-id: cd4f40ff811fa1b0042deedda2456e22f33b92bd
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58753
TSAN was (rightfully!) detecting and complaining about a race due to the fact that upon init the TP agent exchanges the device maps between nodes using RPC requests (and by doing so it accesses the device maps) and then sets the reverse device maps (thus possibly modifying the set of devices). This resulted in a data race, i.e., simultaneously reading and writing the set of devices without synchronizing.
One solution is to add a mutex around the devices, which works, but is "annoying". An alternative solution is to make the set of devices immutable (i.e., `const`). For that to work, we need to exchange the device maps without using RPC calls. We can do so using the process group that we need to create anyways.
Since now there's a lot more logic in Python, I've moved (and restructured) all safety checks over there, and removed them from C++.
ghstack-source-id: 130583775
Test Plan: Unit tests
Reviewed By: mrshenli
Differential Revision: D28603754
fbshipit-source-id: 88533e65d72d1eb806dc41bec8d55def5082e290
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57355
We had started fixing OwnerRRef to make it CUDA-compatible, by properly synchronizing CUDA streams/events where appropriate. However, since we started using CUDAFuture (or, well, ivalue::Future nowadays, after they got merged) this is all done automatically for us, hence we can undo these "fixes" as they're now duplicated.
ghstack-source-id: 130583771
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28118182
fbshipit-source-id: 4b1dd9fe88c23802b1df573941d1b73af48bb67b
Summary:
This commit removes the warning that suggests that users script their
dictionaries before passing them into TorchScript code. The ScriptDict feature
is not fully ready, so it does not make sense to recommend this yet.
Test Plan:
Sandcastle.
In addition, the PyPER test broken by the original diff passes:
```
buck test mode/opt //caffe2/torch/fb/training_toolkit/backend/tests:test_model_materializer_full_sync_lwt -- --exact 'caffe2/torch/fb/training_toolkit/backend/tests:test_model_materializer_full_sync_lwt - caffe2.torch.fb.training_toolkit.backend.tests.test_model_materializer_full_sync_lwt.ModelMaterializerFullSyncLwtTest: test_materialization_determinism_cpu' --run-disabled
```
Differential Revision: D28891351
fbshipit-source-id: 2a3a00cde935d670fb1dc7fd8c709ae9c2ad8cdc
Summary:
The default NEON-accelerated implementation of reciprocal uses vrecpeq_f32, which yields a Newton-Raphson approximation rather than the actual value.
Use regular NEON-accelerated division for the reciprocal and reciprocal square root operations instead.
This fixes `test_reference_numerics_hard_frac_cpu_float32`, `test_reference_numerics_normal_rsqrt_cpu_float32`, etc.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59361
Reviewed By: mruberry
Differential Revision: D28870456
Pulled By: malfet
fbshipit-source-id: e634b0887cce7efb046ea1fd9b74424e0eceb164
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).
New submodule commit: 9cb33bcfe5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59337
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: caogao
Differential Revision: D28846199
fbshipit-source-id: b78f087129edef97247d4ceea77cfede0c6800fe
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59026
#Closes: https://github.com/pytorch/pytorch/issues/51480
Enabled the train and eval methods in RemoteModule to call the underlying train/eval methods on the actual
nn.Module.
ghstack-source-id: 130421137
Test Plan:
Call these two updated methods in method test_send_remote_module_over_the_wire in remote_module_test.py. To test the correctness, after running method train, the training mode should be set to True; after running method eval, the training mode of the remote module should be set to False.
Related test output:
✓ Pass: caffe2/test/distributed/rpc:process_group_agent - test_send_remote_module_over_the_wire (fb.test_process_group_agent.ProcessGroupThreeWorkersRemoteModuleTestWithFork) (23.059)
✓ Pass: caffe2/test/distributed/rpc:thrift_agent - test_send_remote_module_over_the_wire (fb.test_thrift_agent.ThriftThreeWorkersRemoteModuleTestWithFork) (27.965)
✓ Pass: caffe2/test/distributed/rpc:process_group_agent - test_send_remote_module_over_the_wire (test_process_group_agent.ProcessGroupThreeWorkersRemoteModuleTestWithSpawn) (74.481)
✓ Pass: caffe2/test/distributed/rpc:thrift_agent - test_send_remote_module_over_the_wire (fb.test_thrift_agent.ThriftThreeWorkersRemoteModuleTestWithSpawn) (77.243)
✓ Pass: caffe2/test/distributed/rpc:tensorpipe_agent - test_send_remote_module_over_the_wire (fb.test_tensorpipe_agent.TensorPipeThreeWorkersRemoteModuleTestWithFork) (58.644)
✓ Pass: caffe2/test/distributed/rpc:tensorpipe_agent - test_send_remote_module_over_the_wire (test_tensorpipe_agent.TensorPipeThreeWorkersRemoteModuleTestWithSpawn) (90.229)
Reviewed By: pritamdamania87, SciPioneer
Differential Revision: D28721078
fbshipit-source-id: aa45c1e5755f583200144ecfec3704f28221972c
Summary:
Resubmit of https://github.com/pytorch/pytorch/issues/59108, closes https://github.com/pytorch/pytorch/issues/24754, closes https://github.com/pytorch/pytorch/issues/24616
This reuses `linalg_vector_norm` to calculate the norms. I just add a new kernel that turns the norm into a normalization factor, then multiply the original tensor using a normal broadcasted `mul` operator. The result is less code, and better performance to boot.
#### Benchmarks (CPU):
| Shape | Dim | Before | After (1 thread) | After (8 threads) |
|:------------:|:---:|--------:|-----------------:|------------------:|
| (10, 10, 10) | 0 | 11.6 us | 4.2 us | 4.2 us |
| | 1 | 14.3 us | 5.2 us | 5.2 us |
| | 2 | 12.7 us | 4.6 us | 4.6 us |
| (50, 50, 50) | 0 | 330 us | 120 us | 24.4 us |
| | 1 | 350 us | 135 us | 28.2 us |
| | 2 | 417 us | 130 us | 24.4 us |
#### Benchmarks (CUDA)
| Shape | Dim | Before | After |
|:------------:|:---:|--------:|--------:|
| (10, 10, 10) | 0 | 12.5 us | 12.1 us |
| | 1 | 13.1 us | 12.2 us |
| | 2 | 13.1 us | 11.8 us |
| (50, 50, 50) | 0 | 33.7 us | 11.6 us |
| | 1 | 36.5 us | 15.8 us |
| | 2 | 41.1 us | 15 us |
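For reference, a small sketch (not part of the PR) of the relationship described above: compute the vector norm once, clamp it, and apply it with a broadcasted division.
```python
import torch
import torch.nn.functional as F

x = torch.randn(50, 50, 50)
out = F.normalize(x, p=2, dim=1)

norm = torch.linalg.vector_norm(x, ord=2, dim=1, keepdim=True)
manual = x / norm.clamp_min(1e-12)   # 1e-12 is F.normalize's default eps
torch.testing.assert_allclose(out, manual)
```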
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59250
Reviewed By: mruberry
Differential Revision: D28820359
Pulled By: ngimel
fbshipit-source-id: 572486adabac8135d52a9b8700f9d145c2a4ed45
Summary:
Partially addresses https://github.com/pytorch/pytorch/issues/55340
**Overview**
This factors out `FileStoreTest`, `HashStoreTest`, `PrefixFileStoreTest`, `TCPStoreTest`, `PrefixTCPStoreTest`, `PythonStoreTest`, `RendezvousTest`, `RendezvousEnvTest`, `RendezvousFileTest`, and `RendezvousTCPTest` from `test_c10d_common.py` to a new file `test_store.py`.
Additionally, unused import/initialization statements are removed from `test_c10d_common.py`, and the minimal set of import/initialization statements are used for `test_store.py`.
Also, this changes `.jenkins/pytorch/multigpu-test.sh`, `.jenkins/pytorch/win-test-helpers/test_distributed.bat`, and `test/run_test.py` to include the new `test_store.py`.
**Testing**
All commands shown are run on an AI AWS cluster.
I check the Store tests:
```
python test/distributed/test_store.py
```
I also check `test_c10d_common.py` since it is the source of the refactored code. In addition, I check `test_c10d_nccl.py` and `test_c10d_gloo.py` since they import from `test_c10d_common.py`; those two should be the only test files depending on `test_c10d_common.py`.
```
python test/distributed/test_c10d_common.py
python test/distributed/test_c10d_nccl.py
python test/distributed/test_c10d_gloo.py
```
`test_c10d_gloo.py` produces warnings about how using sparse tensors in TorchScript is experimental, but the warnings do not result from this PR's changes.
**Testing Issues** (To Be Revisited)
```
WORLD_SIZE=4 BACKEND=gloo gpurun pytest test/distributed/test_c10d_gloo.py
```
Running the above command fails three tests (written as `[Test]`: `[Error]`):
- `ProcessGroupGlooWrapperTest.test_collective_hang`: `RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [10.200.24.101]:15580`
- `CommTest.test_broadcast_coalesced_gloo_cuda`: `RuntimeError: cuda runtime error (3) : initialization error at ../aten/src/THC/THCGeneral.cpp:54`
- `CommTest.test_sequence_num_incremented_gloo_default`: `RuntimeError: cuda runtime error (3) : initialization error at ../aten/src/THC/THCGeneral.cpp:54`
However, running each of the following yields no errors:
```
WORLD_SIZE=4 BACKEND=gloo gpurun pytest test/distributed/test_c10d_gloo.py -k test_collective_hang
WORLD_SIZE=4 BACKEND=gloo gpurun pytest test/distributed/test_c10d_gloo.py -k test_broadcast_coalesced_gloo_cuda
WORLD_SIZE=4 BACKEND=gloo gpurun pytest test/distributed/test_c10d_gloo.py -k test_sequence_num_incremented_gloo_default
```
This suggests the existence of some inadvertent state dependency between tests (e.g. improper cleanup). I have not explored this further yet. In particular, I do not have a solid understanding of the tests to be able to explain why using `pytest` and `gpurun` induces the failure (since notably, running the `.py` directly shows no issue).
Similarly, running the following yields 47 errors:
```
WORLD_SIZE=4 BACKEND=nccl gpurun pytest test/distributed/test_c10d_nccl.py
```
The errors seem to all be simply complaining about the usage of `fork()` instead of `spawn()` for CUDA multiprocessing. Though, most of the tests in `test_c10d_nccl.py` ask for at least 2 CUDA devices, so I think that the `gpurun` is warranted (assuming that the test file does not need to be run partially on different machines).
Both `test_c10d_common.py` and `test_store.py` work fine with `pytest`.
**Other Notes**
I noticed that `torch.distributed` is imported both as `dist` and as `c10d` and that `c10d` is used throughout the Store tests. I was curious if this is intentional (as opposed to using `dist` to refer to `torch.distributed`). Also, the original [issue](https://github.com/pytorch/pytorch/issues/55340) suggests that the Store tests do not use multiprocessing, but I saw that `torch.multiprocessing` is still used in `TCPStoreTest`.
The links for the Store files in the `CONTRIBUTING.md` [file](https://github.com/pytorch/pytorch/blob/master/torch/distributed/CONTRIBUTING.md) are broken. This can fixed in a separate PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59271
Reviewed By: jbschlosser, mrshenli
Differential Revision: D28856920
Pulled By: andwgu
fbshipit-source-id: 630950cba18d34e6b5de661f5a748f2cddc1b446
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56017
Fixes #55686
This patch is seemingly straightforward but some of the changes are very
subtle. For the general algorithmic approach, please first read the
quoted issue. Based on the algorithm, there are some fairly
straightforward changes:
- New boolean on TensorImpl tracking if we own the pyobj or not
- PythonHooks virtual interface for requesting deallocation of pyobj
when TensorImpl is being released and we own its pyobj, and
implementation of the hooks in python_tensor.cpp
- Modification of THPVariable to MaybeOwned its C++ tensor, directly
using swolchok's nice new class
And then, there is python_variable.cpp. Some of the changes follow the
general algorithmic approach:
- THPVariable_NewWithVar is simply adjusted to handle MaybeOwned and
initializes as owned (like before)
- THPVariable_Wrap adds the logic for reverting ownership back to
PyObject when we take out an owning reference to the Python object
- THPVariable_dealloc attempts to resurrect the Python object if
the C++ tensor is live, and otherwise does the same old implementation
as before
- THPVariable_tryResurrect implements the resurrection logic. It is
modeled after CPython code so read the cited logic and see if
it is faithfully replicated
- THPVariable_clear is slightly updated for MaybeOwned and also to
preserve the invariant that if owns_pyobj, then pyobj_ is not null.
This change is slightly dodgy: the previous implementation has a
comment mentioning that the pyobj nulling is required to ensure we
don't try to reuse the dead pyobj. I don't think, in this new world,
this is possible, because the invariant says that the pyobj only
dies if the C++ object is dead too. But I still unset the field
for safety.
And then... there is THPVariableMetaType. colesbury explained in the
issue why this is necessary: when destructing an object in Python, you
start off by running the tp_dealloc of the subclass before moving up
to the parent class (much in the same way C++ destructors work). The
deallocation process for a vanilla Python-defined class does irreparable
harm to the PyObject instance (e.g., the finalizers get run) making it
no longer valid to attempt resurrection later in the tp_dealloc chain.
(BTW, the fact that objects can resurrect but in an invalid state is
one of the reasons why it's so frickin' hard to write correct __del__
implementations). So we need to make sure that we actually override
the tp_dealloc of the bottom most *subclass* of Tensor to make sure
we attempt a resurrection before we start finalizing. To do this,
we need to define a metaclass for Tensor that can override tp_dealloc
whenever we create a new subclass of Tensor. By the way, it was totally
not documented how to create metaclasses in the C++ API, and it took
a good bit of trial and error to figure it out (and the answer is now
immortalized in https://stackoverflow.com/q/67077317/23845 -- the things
that I got wrong in earlier versions of the PR included setting
tp_basicsize incorrectly, incorrectly setting Py_TPFLAGS_HAVE_GC on
the metaclass--you want to leave it unset so that it inherits, and
determining that tp_init is what actually gets called when you construct
a class, not tp_call as another not-to-be-named StackOverflow question
suggests).
Aside: Ordinarily, adding a metaclass to a class is a user visible
change, as it means that it is no longer valid to mixin another class
with a different metaclass. However, because _C._TensorBase is a C
extension object, it will typically conflict with most other
metaclasses, so this is not BC breaking.
The desired new behavior of a subclass tp_dealloc is to first test if
we should resurrect, and otherwise do the same old behavior. In an
initial implementation of this patch, I implemented this by saving the
original tp_dealloc (which references subtype_dealloc, the "standard"
dealloc for all Python defined classes) and invoking it. However, this
results in an infinite loop, as it attempts to call the dealloc function
of the base type, but incorrectly chooses the subclass type (because it is
not a subtype_dealloc, as we have overridden it; see
b38601d496/Objects/typeobject.c (L1261) )
So, with great reluctance, I must duplicate the behavior of
subtype_dealloc in our implementation. Note that this is not entirely
unheard of in Python binding code; for example, Cython
c25c3ccc4b/Cython/Compiler/ModuleNode.py (L1560)
also does similar things. This logic makes up the bulk of
THPVariable_subclass_dealloc
To review this, you should pull up the CPython copy of subtype_dealloc
b38601d496/Objects/typeobject.c (L1230)
and verify that I have specialized the implementation for our case
appropriately. Among the simplifications I made:
- I assume PyType_IS_GC, because I assume that Tensor subclasses are
only ever done in Python and those classes are always subject to GC.
(BTW, yes! This means I have broken anyone who has extended PyTorch
tensors from the C API directly. I'm going to guess no one has actually
done this.)
- I don't bother walking up the type bases to find the parent dealloc;
I know it is always THPVariable_dealloc. Similarly, I can get rid
of some parent type tests based on knowledge of how
THPVariable_dealloc is defined
- The CPython version calls some private APIs which I can't call, so
I use the public PyObject_GC_UnTrack APIs.
- I don't allow the finalizer of a Tensor to change its type (but
more on this shortly)
One alternative I discussed with colesbury was instead of copy pasting
the subtype_dealloc, we could transmute the type of the object that was
dying to turn it into a different object whose tp_dealloc is
subtype_dealloc, so the stock subtype_dealloc would then be applicable.
We decided this would be kind of weird and didn't do it that way.
TODO:
- More code comments
- Figure out how not to increase the size of TensorImpl with the new
bool field
- Add some torture tests for the THPVariable_subclass_dealloc, e.g.,
involving subclasses of Tensors that do strange things with finalizers
- Benchmark the impact of taking the GIL to release C++ side tensors
(e.g., from autograd)
- Benchmark the impact of adding a new metaclass to Tensor (probably
will be done by separating out the metaclass change into its own
change)
- Benchmark the impact of changing THPVariable to conditionally own
Tensor (as opposed to unconditionally owning it, as before)
- Add tests that this actually indeed preserves the Python object
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D27765125
Pulled By: ezyang
fbshipit-source-id: 857f14bdcca2900727412aff4c2e2d7f0af1415a
Summary:
Fixed the awkward Configerator initialization issue that broke some
tests. Trying again.
Test Plan: predictor comparisons
Reviewed By: ZolotukhinM
Differential Revision: D28859795
fbshipit-source-id: 826801db24e86b1c3594a86e3ac32f0a84c496f7
Summary:
Added GPU tests in previous diffs but had to disable them as they only
pass locally on devgpu, but not in sandcastle.
note: local testing requires mode/dev-nosan or else ASAN interferes with CUDA.
Test Plan: Verify tests passing in sandcastle.
Reviewed By: malfet
Differential Revision: D28538996
fbshipit-source-id: 1a6ccea07cfe2f150eee068594e636add620cd91
Summary:
Also fix `TestProducerVersion` by removing assumption that major and minor are single digit
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59345
Reviewed By: robieta
Differential Revision: D28853720
Pulled By: malfet
fbshipit-source-id: 4b6d03c6b0c9d652a5aef792aaa84eaa522d10e8
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59354
Check if the model has `bytecode.pkl` and provide a proper error message before loading the model. Tested by loading a model.pt and a model.ptl.
```
>>> from torch.jit.mobile import _load_for_lite_interpreter
>>> _load_for_lite_interpreter("/Users/chenlai/Documents/pytorch/data/mobilenet_v2.pt")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/chenlai/pytorch/torch/jit/mobile/__init__.py", line 48, in _load_for_lite_interpreter
cpp_module = torch._C._load_for_lite_interpreter(f, map_location) # type: ignore[attr-defined]
RuntimeError: The model is not generated from the api _save_for_lite_interpreter. Please regenerate the module by scripted_module._save_for_lite_interpreter('model.ptl'). Refer to https://pytorch.org/tutorials/prototype/lite_interpreter.html for more details.
```
iOS:

Android:

Differential Revision: D28856713
Test Plan: Imported from OSS
Reviewed By: dhruvbird
Pulled By: cccclai
fbshipit-source-id: c3f9a3b64459dda6811d296371c8a2eaf22f8b20
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59284
Logs a few python-side errors to DDP logging.
TODO: Most python errors actually have to do with user input correctness, so they throw before reducer is constructed and thus there is no logger. For this case, should we allow `logger` to be created optionally without a reducer, just for the purpose of logging errors, so that we can gain insight into these errors in scuba?
ghstack-source-id: 130412973
Test Plan: CI
Reviewed By: SciPioneer
Differential Revision: D28820290
fbshipit-source-id: 610e5dba885b173c52351f7ab25c923edce639e0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59281
Adds ability to log when reducer/ddp encounters an error. We add fields "has_error" and "error" to indicate that an error has
occurred in this iteration, and the other fields (performance stats) are not
guaranteed to be updated.
Errors encountered in python-side DDP will be added in the next diff.
ghstack-source-id: 130412974
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28652717
fbshipit-source-id: 9772abc2647a92dac6a325da6976ef5eb877c589
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59066
Per title
ghstack-source-id: 130338812
Test Plan: ci
Reviewed By: SciPioneer
Differential Revision: D28734666
fbshipit-source-id: 89ca7f8e625c4068ba0ed9800be2619e469ae515
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59065
Cleaner to use work.result() instead of sending back the tensor from
this function.
ghstack-source-id: 130338813
Test Plan: CI
Reviewed By: SciPioneer
Differential Revision: D28551203
fbshipit-source-id: d871fed78be91f0647687ea9d6fc86e576dc53a6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58933
**Summary**
This commit makes load_library calls no-ops inside packages run with
deploy. Libraries containing custom C++ operators and classes are statically linked in C++
and don't need to be loaded. This commit takes advantage of the fact that sys.executable is
set to torch_deploy in deploy and uses that to exit early in load_library if
the program is running inside deploy.
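Roughly, the early exit looks like the following sketch (names simplified; the real check lives in the torch ops loader):
```python
import ctypes
import sys

def load_library(path):
    # Inside torch::deploy, sys.executable is set to "torch_deploy" and custom-op
    # libraries are already statically linked, so loading becomes a no-op.
    if sys.executable == "torch_deploy":
        return
    ctypes.CDLL(path)
```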
**Test Plan**
This commit adds a test to `generate_examples`/`test_deploy` that
packages and runs a function that calls `load_library`. The library
doesn't exist, but that's okay because the function should be a no-op
anyway.
Test Plan: Imported from OSS
Reviewed By: Lilyjjo
Differential Revision: D28687159
Pulled By: SplitInfinity
fbshipit-source-id: 4a61fc636698e44f204334e338c5ce35257e7ae2
Summary:
Adjusts type hints for optimize_for_mobile to be consistent with the defaults. Right now, using optimize_for_mobile and passing only a script_module gives a type error complaining that preserved_methods can't be None.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59282
Test Plan:
Imported from GitHub, without a `Test Plan:` line.
Open source tests ran the lints. Internal CI should be enough here.
Reviewed By: jbschlosser
Differential Revision: D28838159
Pulled By: JacobSzwejbka
fbshipit-source-id: dd1e9aff00a759f71d32025d8c5b01e612c869a5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59261
Split converters to different files instead of putting them in a single file.
Reviewed By: jackm321
Differential Revision: D28613989
fbshipit-source-id: f25ca3732c457af51a07ef466915a4a08bd45e6e
Summary:
For Facebook employees, this fixes some internal failures from https://www.internalfb.com/tasks/?t=92100671
This was not a problem before https://github.com/pytorch/pytorch/pull/58271 because these cycles used to just be leaked (so nothing was cleared/dealloced).
Now that we properly clean up these cycles, we have to fix the assert in the clear.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59301
Reviewed By: jbschlosser
Differential Revision: D28841564
Pulled By: albanD
fbshipit-source-id: e2ec51f6abf44c4e3a83c293e90352295a43ba37
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/53423
Closes #40166
This change exposes a new API, rpc.barrier() which blocks the main processes of all workers running RPC until the whole group completes this function. Optionally rpc.barrier can take in a set of worker_names and only synchronize across those worker names.
Example:
```python
import os
import torch.multiprocessing as mp
import torch.distributed.rpc as rpc
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "5678"
world_size = 4
odd_num_workers = [f"worker{i}" for i in range(world_size) if i % 2]
even_num_workers = [f"worker{i}" for i in range(world_size) if not i % 2]
def worker(i):
    print(i)
    rpc.init_rpc(f"worker{i}", rank=i, world_size=world_size)
    if i % 2:
        print(f"start barrier {i}")
        rpc.barrier(set(odd_num_workers))
    else:
        print(f"start barrier {i}")
        rpc.barrier(set(even_num_workers))
    rpc.shutdown()
    print(f"shutdown{i}")
if __name__ == '__main__':
    with mp.Pool(processes=world_size) as pool:
        pool.map(worker, range(world_size))
```
Test Plan: Imported from OSS
Reviewed By: rohan-varma
Differential Revision: D27737145
Pulled By: H-Huang
fbshipit-source-id: 369196bc62446f506d1fb6a3fa5bebcb0b09da9f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59070
This log is too verbose, especially in the case we call monitored
barrier before every collective as we do in ProcessGroupWrapper.
ghstack-source-id: 130052822
Test Plan: CI
Reviewed By: SciPioneer
Differential Revision: D28738189
fbshipit-source-id: f2899537caa4c13508da31134d5dd0f4fd6a1f3a
Summary:
Resubmit of https://github.com/pytorch/pytorch/issues/58811, Closes gh-24745
The existing PR (gh-50655) has been stalled because `TensorIterator` doesn't guarantee iteration order in the same way that `TH_TENSOR_APPLY` does. For contiguous test cases this isn't an issue; but it breaks down for example with channels last format. I resolve this by adding a new `TensorIteratorConfig` parameter, `enforce_linear_iteration`, which disables dimension reordering. I've also added a test case for non-contiguous tensors to verify this works.
This PR also significantly improves performance by adding multithreading support to the algorithm. As part of this, I wrote a custom `count_nonzero` that gives per-thread counts which is necessary to write the outputs in the right location.
| Shape | Before | After (1 thread) | After (8 threads) |
|:----------:|--------:|-----------------:|------------------:|
| 256,128,32 | 2610 us | 2150 us | 551 us |
| 128,128,32 | 1250 us | 1020 us | 197 us |
| 64,128,32 | 581 us | 495 us | 99 us |
| 32,128,32 | 292 us | 255 us | 83 us |
| 16,128,32 | 147 us | 126 us | 75 us |
| 8,128,32 | 75 us | 65 us | 65 us |
| 4,128,32 | 39 us | 33 us | 33 us |
| 2,128,32 | 20 us | 18 us | 18 us |
| 1,128,32 | 11 us | 9 us | 9 us |
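A quick check (not from the PR) of the iteration-order guarantee discussed above: nonzero indices come back in row-major (logical) order even when the memory layout is channels-last.
```python
import torch

x = torch.zeros(1, 2, 3, 4).to(memory_format=torch.channels_last)
x[0, 1, 0, 0] = 1   # earlier in channels-last memory order
x[0, 0, 0, 1] = 1   # earlier in logical row-major order

print(torch.nonzero(x))
# tensor([[0, 0, 0, 1],
#         [0, 1, 0, 0]])
```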
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59149
Reviewed By: mruberry
Differential Revision: D28817466
Pulled By: ngimel
fbshipit-source-id: f08f6c003c339368fd53dabd28e9ada9e59de732
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58895
There doesn't seem to be any reason we can't use expect_contiguous here.
ghstack-source-id: 130283300
Test Plan: CI
Reviewed By: ngimel
Differential Revision: D28666399
fbshipit-source-id: b4a9bcb01ff1c30d991765140c8df34c3ac3a89b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59310
We recently updated the GK to deliver GPU models only to iOS 11.0+ devices. Will do a cleanup in following diffs to remove shader functions written for iOS 10.0.
ghstack-source-id: 130374598
Test Plan: CI
Reviewed By: linbinyu
Differential Revision: D28805864
fbshipit-source-id: 4cde34ff9fbbe811a69686a0f29b56d69aeefbee
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59287
D27211605 added a warning in `toIValue` that warns users to script their
dictionaries before passing them to TorchScript functions in order to get some
performance benefits and reference semantics. However, this warning is emitted
every time `toIValue` is called (e.g. when a dictionary is passed to
TorchScript function), which can lead to noisy log output. This diff changes
this to use `TORCH_WARN_ONCE` instead.
Test Plan: Sandcastle, OSS CI.
Reviewed By: hyuen
Differential Revision: D28824468
fbshipit-source-id: e651eade4380abaf77c6c8a81ec4e565b0c2c714
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59324
Also updates section on pinning pytorch/builder with an example
[skip ci]
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: malfet
Differential Revision: D28840049
Pulled By: seemethere
fbshipit-source-id: e5d6722713680e969893d9df97ec269fc9c00411
Summary:
Hello,
depending on the build environment you may encounter
```c++
error: reference to 'optional' is ambiguous
```
when using the Torch-C++-API.
This PR adds `c10::` to avoid possible ambiguities with **std::optional** and does not introduce any functional change.
Fixes https://discuss.pytorch.org/t/linker-failed-with-ambiguous-references/36255 .
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45736
Reviewed By: dzhulgakov
Differential Revision: D24125123
Pulled By: VitalyFedyunin
fbshipit-source-id: df21420f0a2d0270227c28976a7a4218315cc107
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59042
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724867
fbshipit-source-id: 9f87d51020caa20d5408cb2820947e23d92d5fc3
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57080
The ONNX optimizer is removed in ONNX 1.9.
This PR removes the ONNX optimizer from the C++ code path and uses a `try-except` block in Python to keep it compatible with both ONNX 1.8 and 1.9.
Test Plan: Imported from OSS
Reviewed By: heitorschueroff
Differential Revision: D28467330
Pulled By: malfet
fbshipit-source-id: 5e4669dd0537648898e593f9e253da18d6dc7568
Co-authored-by: neginraoof <neginmr@utexas.edu>
Co-authored-by: Nikita Shulga <nshulga@fb.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59283
We were observing 403s when attempting to install dependencies from
chocolatey, leading us to believe that we were getting rate limited by
chocolatey.
We've opted to install our dependencies in our base AMIs instead,
considering we would install them on every workflow anyway. This also
comes with moving the Windows 10 SDK installation to the base AMI as
well, since we were observing failures there too due to failed
dependency installations.
Also moves the Windows 10 SDK installation to our Visual Studio installation script, which is activated by passing an environment variable
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: janeyx99
Differential Revision: D28822962
Pulled By: seemethere
fbshipit-source-id: b5e35ffe4537db55deb027376bd2d418683707a5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59212
Reland of https://github.com/pytorch/pytorch/pull/58428
Until now, the TP agent expected the output of a remote function to be on the same streams as the inputs. In other words, it used the lazy stream context of the inputs to synchronize the output tensors. This was true in the most common case of a synchronous remote function. However it wasn't true for async functions, for fetching RRefs, ... The more generic way is to use the CUDA events held by the Future to perform this synchronization. (These events may be on the input streams, or they may not be!).
ghstack-source-id: 130202842
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623885
fbshipit-source-id: 29333bcb75d077ab801eac92017d0e381e8f5569
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59211
Reland of https://github.com/pytorch/pytorch/pull/58674
I found this missing parameter while debugging failures in the next PR.
I'm very unhappy about this change. I think this future, which we know for sure won't contain tensors, shouldn't have to worry about CUDA devices. And yet, it does. This means that basically any future anywhere might have to worry about it, and this just doesn't scale, and thus it's bad.
ghstack-source-id: 130202843
Test Plan: Should fix the next diff.
Reviewed By: mrshenli
Differential Revision: D28623886
fbshipit-source-id: 6c82ed7c785ac3bf32fff7eec67cdd73b96aff28
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59210
Reland of https://github.com/pytorch/pytorch/pull/58427
Running the UDF (be it Python or JIT) is the first step of (most?) RPC calls, which is where the inputs are consumed. The lazy stream context contains the streams used by the inputs, thus it must be made current before any UDF call. I opt to do this as "close" as possible to the place the UDF is invoked, to make the relationship as explicit as possible.
ghstack-source-id: 130202847
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623889
fbshipit-source-id: ed38242f813dac075d162685d52ae89f408932f9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59209
Reland of https://github.com/pytorch/pytorch/pull/58426
The operations in RequestCallback can return CUDA tensors, thus the futures used to hold them must be CUDA-aware.
ghstack-source-id: 130202844
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623887
fbshipit-source-id: 53561b8ae011458d8f848f0a03830925aff2f0c2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59208
Reland of https://github.com/pytorch/pytorch/pull/58425
Now that callbacks can provide pre-extracted DataPtrs, let's do so. This will become of crucial importance in the next PR, where some of these futures will become CUDA-aware, and thus they will try to extract DataPtrs on their own, but they would fail to do so here because Message isn't "inspectable".
ghstack-source-id: 130202845
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623888
fbshipit-source-id: 1aa4bde8014870c071685ba8f72d5f3f01f0a512
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59207
Reland of https://github.com/pytorch/pytorch/pull/58424
In CUDA mode, Future must inspect its value and extract DataPtrs. However some types are not supported, for example the C++/JIT custom classes, which include Message, which is widely used in RPC. Hence for these scenarios we allow the user to perform the custom DataPtr extraction on their own, and pass the pre-extracted DataPtrs.
Note that `markCompleted` already allowed users to pass in pre-extracted DataPtrs, hence this PR simply extends this possibility to the `then` method too.
ghstack-source-id: 130202846
Test Plan: Used in next PR.
Reviewed By: mrshenli
Differential Revision: D28623890
fbshipit-source-id: 468c5308b40774ba0a778b195add0e0845c1929e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59206
Reland of https://github.com/pytorch/pytorch/pull/58423
This is part 2 of the previous PR. Here we address the remaining occurrences of "raw" Message, namely the ones within toMessageImpl. And since they're the last ones, we make the constructor of Message private, to prevent new usages from emerging.
ghstack-source-id: 130202848
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623892
fbshipit-source-id: f815cf6b93e488c118e5d2298473e6e9d9f4c132
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59205
Reland of https://github.com/pytorch/pytorch/pull/58422
Similar to Future (which I tackled recently), Message is an ivalue type (a "custom class" one), and the natural way to represent it is inside an intrusive_ptr. However in the RPC code we had a mix of usages, often passing Message by value. This has undesirable consequences, as it could easily trigger a copy by accident, which I believe is why in many places we accepted _rvalue references_ to Message, in order to force the caller to move. In my experience this is non-idiomatic in C++ (normally a function signature specifies how the function consumes its arguments, and it's up to the caller to then decide whether to copy or move).
By moving to intrusive_ptr everywhere I think we eliminate and simplify many of the problems above.
In this PR I do half of the migration, by updating everything except the `toMessageImpl` methods, which will come in the next PR.
ghstack-source-id: 130202849
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28623891
fbshipit-source-id: c9aeea3440679a11741ca78c06b03c57cb815a5e
Summary:
This PR
* adds the breakpad build to most of the remaining docker images (except the mobile + slim ones)
* pins to a [fork of breakpad](https://github.com/google/breakpad/compare/master...driazati:master?expand=1) to enable daisy chaining on signal handlers
* renames the API to be nicer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59236
Reviewed By: malfet
Differential Revision: D28792511
Pulled By: driazati
fbshipit-source-id: 83723e74b7f0a00e1695210ac2620a0c91ab4bf2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59242
# Original PR Issue: https://github.com/pytorch/pytorch/issues/58274
This can be a workaround: Instead of passing a script `RemoteModule` over RPC, pass its `module_rref` field over RPC, and then construct a new `RemoteModule` on the receiver end.
ghstack-source-id: 130268018
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed/rpc:process_group_agent -- test_send_remote_module_over_the_wire_script_not_supported
buck test mode/dev-nosan caffe2/test/distributed/rpc:process_group_agent -- test_remote_module_py_pickle_not_supported_script
buck test mode/dev-nosan caffe2/test/distributed/rpc:process_group_agent -- test_create_remote_module_by_module_rref
Reviewed By: vipannalla
Differential Revision: D28794905
fbshipit-source-id: 1a677ff0d4b47c078ad47b50d7102a198a1fc39b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59041
Static quantization support for custom modules was removed in a previous refactor
(https://github.com/pytorch/pytorch/pull/57519) since it was not covered by the test case.
This PR re-enables the test case and fixes the support.
Test Plan: Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724866
fbshipit-source-id: 1974675b88b56a2173daf86965d6f3fb7ebd783b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59040
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724870
fbshipit-source-id: c0f748711b825cd46bdfcc05c054c77a41e8207a
Summary:
This PR fixes `torch.linalg.inv_ex` with MAGMA backend.
`info` tensor was returned on CPU device even for CUDA inputs.
Now it's on the same device as input.
Fixes https://github.com/pytorch/pytorch/issues/58769
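A small illustrative check of the fixed behavior (a hedged sketch, not the test added in this PR; it assumes a CUDA build with the MAGMA backend):
```python
import torch

if torch.cuda.is_available():
    a = torch.eye(3, device="cuda")
    inv, info = torch.linalg.inv_ex(a)
    # After this fix, `info` lives on the same device as the input
    # instead of always being returned on the CPU.
    assert info.device == a.device
```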
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59223
Reviewed By: ngimel
Differential Revision: D28814876
Pulled By: mruberry
fbshipit-source-id: f66c6f06fb8bc305cb2e22b08750a25c8888fb65
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59039
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724874
fbshipit-source-id: bd984716b2da1d6879c3e92fa827574783a41567
Summary:
Graphs tests are sometimes flaky in CI ([example](https://app.circleci.com/pipelines/github/pytorch/pytorch/328930/workflows/0311199b-a0be-4802-a286-cf1e73f96c70/jobs/13793451)) because when the GPU runs near its max memory capacity (which is not unusual during a long test), sometimes, to satisfy new allocations that don't match any existing unused blocks, the caching allocator may call `synchronize_and_free_events` to wait on block end-of-life events and cudaFree unused blocks, then re-cudaMalloc a new block. For ungraphed ops this isn't a problem, but synchronizing or calling cudaFree while capturing is illegal, so `synchronize_and_free_events` raises an error if called during capture.
The graphs tests themselves don't use much memory, so calling torch.cuda.empty_cache() at some point before their captures should ensure memory is available and the captures never need `synchronize_and_free_events`.
I was already calling empty_cache() near the beginning of several graphs tests. This PR extends it to the ones I forgot.
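The pattern being applied, as a minimal sketch using the current public CUDA Graphs API (`torch.cuda.graph`), which may differ from the prototype interface these tests exercise (warmup is omitted for brevity):
```python
import torch

# Free cached blocks up front so the allocator never needs to synchronize
# or cudaFree while a capture is in progress.
torch.cuda.empty_cache()

g = torch.cuda.CUDAGraph()
static_x = torch.zeros(8, device="cuda")
with torch.cuda.graph(g):        # cudaFree/synchronize are illegal inside here
    static_y = static_x * 2.0
g.replay()
```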
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59233
Reviewed By: mruberry
Differential Revision: D28816691
Pulled By: ngimel
fbshipit-source-id: 5cd83e48e43b1107daed5cfa2efff0fdb4f99dff
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59038
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724869
fbshipit-source-id: e8501c9720b5ddb654e78bc8fa08de0466c1d52b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59018
Fixes #58044.
This PR:
- adds `ATEN_FN(op)` and `ATEN_FN2(op, overload)` macros that resolve to
a non-overloaded function in aten::_ops that calls the desired operator
(without default arguments).
The motivation for this is two-fold:
1) Using aten operators with templates is hard if the operator is
overloaded (e.g. add.Tensor and add.Scalar).
2) Method-only operators require special handling; pointers-to-method
are different from function pointers. `ATEN_FN2(add_, Tensor)` returns
a function instead of a method.
There is some interesting behavior for out= operations.
`ATEN_FN2(sin, "out")` gives a function that is *faithful* to the schema;
that is, the order of arguments is exactly what it looks like in the
schema. This makes it so that you can directly register
`ATEN_FN2(sin,"out")` (or a function wrapping it using the same signature)
as an override for a DispatchKey.
Test Plan:
- New tests that ATEN_FN2 works on function and method-only operators
- New test that ATEN_FN works
- New test that ATEN_FN macro returns a "faithful" function.
Codegen output:
Operators.h and Operators.cpp are both here:
https://gist.github.com/zou3519/c2c6a900410b571f0d7d127019ca5175
Reviewed By: bdhirsh
Differential Revision: D28721206
Pulled By: zou3519
fbshipit-source-id: a070017f98e8f4038cb0c64be315eef45d264217
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59254
index_add can take an int or long index tensor, whereas index_put only takes a long index tensor.
In the deterministic path of index_add_cuda, we use index_put. Hence we need to convert the index tensor to long.
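A standalone illustration of the conversion (a hedged sketch with made-up tensors, not the actual kernel code):
```python
import torch

x = torch.zeros(5)
index = torch.tensor([0, 2, 4], dtype=torch.int32)  # index_add_ accepts int32
src = torch.ones(3)
# index_put_ only accepts int64 indices, so the deterministic path
# promotes the index tensor before delegating to it.
x.index_put_((index.long(),), src, accumulate=True)
```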
Test Plan:
buck test mode/opt //caffe2/test:torch_cuda -- test_index_add_deterministic
✓ ListingSuccess: caffe2/test:torch_cuda - main (14.748)
✓ Pass: caffe2/test:torch_cuda - test_index_add_deterministic_cuda (test_torch.TestTorchDeviceTypeCUDA) (27.717)
✓ Pass: caffe2/test:torch_cuda - main (27.717)
Reviewed By: ngimel
Differential Revision: D28804038
fbshipit-source-id: de12932a7738f2805f3bceb3ec024497625bce6a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59037
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724865
fbshipit-source-id: 6c6824d0af7dd47d4c111d6a08e373bc65f33e08
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59036
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724862
fbshipit-source-id: 5900420127fcc14846bc34c9ac29ff7e6a703f1e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59035
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724872
fbshipit-source-id: d32752c635917c9820e5e7cc414ba9d48a258a19
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59034
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724873
fbshipit-source-id: 870e0822843ad1d035f41eaa015bdde9ccf6ec23
Summary:
The current implementation of DistributedSampler generates a python list to hold all of the indices, and then returns a slice of this list for the given rank (creating a partial copy of the list). When the underlying dataset is large, both of these choices waste a large amount of memory. It is much more efficient to create a tensor to hold the indices, and then index into that tensor instead of creating slices.
In the case of a sampler with `shuffle=False`, it would be possible to avoid creating the `indices` tensor entirely (since the index will always match the value), but I have opted instead here to keep the implementation as similar to the existing version as possible. One possible benefit of this approach is that memory usage will not significantly change based on changing this parameter. Still, it might be better to simply return the indices directly without the underlying array.
Additionally, the logic around calculating the number of samples is unnecessarily complex. When dropping the last batch, this can be a simple floor division.
In a simple test script which creates a sampler for a dataset with 100,000,000 items, memory usage is reduced by 98% compared to the existing implementation.
Fixes https://github.com/pytorch/pytorch/issues/45427
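A minimal sketch of the idea (a hypothetical helper, not the actual `DistributedSampler` code): hold the indices in a tensor and stride into it per rank rather than materializing and slicing a Python list.
```python
import torch

def rank_indices(dataset_len, num_replicas, rank, shuffle=True, seed=0, epoch=0):
    if shuffle:
        g = torch.Generator()
        g.manual_seed(seed + epoch)
        indices = torch.randperm(dataset_len, generator=g)
    else:
        indices = torch.arange(dataset_len)
    # drop_last-style count is a simple floor division
    num_samples = dataset_len // num_replicas
    # strided tensor indexing avoids copying a list slice per rank
    return indices[rank:num_samples * num_replicas:num_replicas]
```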
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51841
Reviewed By: albanD
Differential Revision: D28240105
Pulled By: rohan-varma
fbshipit-source-id: 4c6aa493d0f75c07ec14c98791b3a531300fb1db
Summary:
Fixes https://github.com/pytorch/pytorch/issues/57878.
This adds `NCCL_ASYNC_ERROR_HANDLING` as a DDP relevant environment variable and includes a check for that variable in the test `test_dump_DDP_relevant_env_vars()`. Notably, the modified test now checks for the new variable but does not check for any of the other previously-existing relevant environment variables that were not already tested for (e.g. `NCCL_BLOCKING_WAIT`).
The change was tested via the following on an AI AWS cluster:
`WORLD_SIZE=2 BACKEND=nccl gpurun pytest test/distributed/test_distributed_spawn.py -k test_dump_DDP_relevant_env_vars -vs`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59109
Reviewed By: H-Huang, SciPioneer
Differential Revision: D28761148
Pulled By: andwgu
fbshipit-source-id: 7be4820e61a670b001408d0dd273f65029b1d2fe
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59033
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724861
fbshipit-source-id: 97b38e851b6bf581510a24636b1d8d6f1d977f5a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59032
To remove the Quantizer class and split the prepare and convert functions into separate files
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724868
fbshipit-source-id: 6df639f20076b480812b6dcf0fc7d2c87ca29d8b
Summary:
Related Issue: https://github.com/pytorch/pytorch/issues/57691
This PR introduces an API for checking environment variables:
```c++
optional<bool> check_env(const char *name)
```
Reads the environment variable name and returns
- `optional<true>`, if set equal to "1"
- `optional<false>`, if set equal to "0"
- `nullopt`, otherwise
Issues a warning if the environment variable was set to any value other than 0 or 1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59052
Test Plan:
Manually run the following test case:
- Apply this diff to the repo
```
diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp
index d008643f70..990d254f0d 100644
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@@ -9,6 +9,9 @@
#include <torch/csrc/THP.h>
+#include <c10/util/Optional.h>
+#include <c10/util/env.h>
+
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
PyObject *THPException_FatalError;
@@ -23,18 +26,7 @@ bool THPException_init(PyObject *module)
namespace torch {
static bool compute_cpp_stack_traces_enabled() {
- auto envar = std::getenv("TORCH_SHOW_CPP_STACKTRACES");
- if (envar) {
- if (strcmp(envar, "0") == 0) {
- return false;
- }
- if (strcmp(envar, "1") == 0) {
- return true;
- }
- TORCH_WARN("ignoring invalid value for TORCH_SHOW_CPP_STACKTRACES: ", envar,
- " valid values are 0 or 1.");
- }
- return false;
+ return c10::utils::check_env("TORCH_SHOW_CPP_STACKTRACES").value_or(false);
}
bool get_cpp_stacktraces_enabled() {
```
This patch replaces the prior `std::getenv` usage in `torch/csrc/Exceptions.cpp` to use the new api.
- Run the following python3 script
```python
import torch
print(torch.__version__) # should print local version (not release)
a1 = torch.tensor([1,2,3])
a2 = torch.tensor([2])
a1 @ a2
```
using the following commands
```bash
python3 test.py # should not output CPP trace
TORCH_SHOW_CPP_STACKTRACES=1 python3 test.py # should output CPP trace
```
Reviewed By: ngimel
Differential Revision: D28799873
Pulled By: 1ntEgr8
fbshipit-source-id: 3e23353f48679ba8ce0364c049420ba4ff86ff09
Summary:
There are two main changes here:
- THPVariable will now actually visit its grad_fn if there are no other references to the c++ Tensor and no other references to the grad_fn. The critical observation compared to the existing comment (thanks Ed!) is that if we also check that the c++ Tensor object is not referenced somewhere else, we're sure that no one can change the grad_fn refcount between the traverse and the clear.
- THPVariable doesn't need a special clear for these new cases, as we're the only owner of the c++ Tensor, so the cdata.reset() will necessarily free the Tensor and all its resources.
The two tests are to ensure:
- That the cycles are indeed collectible by the gc
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58271
Reviewed By: ngimel
Differential Revision: D28796461
Pulled By: albanD
fbshipit-source-id: 62c05930ddd0c48422c79b03118db41a73c1355d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59028
Previously we had an env and a quant_env in convert, which was a bit confusing.
In this PR we merge them into a single Dict[str, Tuple[Node, torch.dtype]].
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28724863
fbshipit-source-id: 722a682c70d300a6ccd2b988786a1ac2d45e880e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59106
Should make debugging a bit easier
Test Plan:
Example error in https://www.internalfb.com/intern/aibench/details/884106485190261 (open log for Portal or Portal+):
```
The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
File "code/__torch__/torch/backends/_nnapi/prepare.py", line 29, in forward
_0 = uninitialized(__torch__.torch.classes._nnapi.Compilation)
if torch.__is__(self.comp, None):
_1 = (self).init(args, )
~~~~~~~~~~ <--- HERE
else:
pass
File "code/__torch__/torch/backends/_nnapi/prepare.py", line 97, in init
comp = __torch__.torch.classes._nnapi.Compilation.__new__(__torch__.torch.classes._nnapi.Compilation)
_22 = (comp).__init__()
_23 = (comp).init(self.ser_model, self.weights, )
~~~~~~~~~~ <--- HERE
self.comp = comp
return None
Traceback of TorchScript, original code (most recent call last):
File "/data/users/dhaziza/fbsource/fbcode/buck-out/dev/gen/mobile-vision/d2go/projects/facegen/tools/export_to_app#link-tree/torch/backends/_nnapi/prepare.py", line 47, in forward
def forward(self, args: List[torch.Tensor]) -> List[torch.Tensor]:
if self.comp is None:
self.init(args)
~~~~~~~~~ <--- HERE
comp = self.comp
assert comp is not None
File "/data/users/dhaziza/fbsource/fbcode/buck-out/dev/gen/mobile-vision/d2go/projects/facegen/tools/export_to_app#link-tree/torch/backends/_nnapi/prepare.py", line 42, in init
self.weights = [w.contiguous() for w in self.weights]
comp = torch.classes._nnapi.Compilation()
comp.init(self.ser_model, self.weights)
~~~~~~~~~ <--- HERE
self.comp = comp
RuntimeError: [enforce fail at nnapi_model_loader.cpp:171] result == ANEURALNETWORKS_NO_ERROR. NNAPI returned error: 4
```
Reviewed By: axitkhurana
Differential Revision: D28287450
fbshipit-source-id: ccd10301e1492f8879f9d6dd57b60c4e683ebb9e
Summary:
Closes https://github.com/pytorch/pytorch/issues/24754, closes https://github.com/pytorch/pytorch/issues/24616, closes https://github.com/pytorch/pytorch/issues/50874
This reuses `linalg_vector_norm` to calculate the norms. I just add a new kernel that turns the norm into a normalization factor, then multiply the original tensor by it using a normal broadcasted `mul` operator. The result is less code, and better performance to boot.
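A rough functional sketch of that approach (illustrative only, not the actual ATen kernel; the function name and eps handling are assumptions):
```python
import torch

def normalize_sketch(x, p=2.0, dim=1, eps=1e-12):
    norm = torch.linalg.vector_norm(x, ord=p, dim=dim, keepdim=True)
    factor = 1.0 / norm.clamp_min(eps)  # the new "normalization factor" step
    return x * factor                   # ordinary broadcasted mul
```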
#### Benchmarks (CPU):
| Shape | Dim | Before | After (1 thread) | After (8 threads) |
|:------------:|:---:|--------:|-----------------:|------------------:|
| (10, 10, 10) | 0 | 11.6 us | 4.2 us | 4.2 us |
| | 1 | 14.3 us | 5.2 us | 5.2 us |
| | 2 | 12.7 us | 4.6 us | 4.6 us |
| (50, 50, 50) | 0 | 330 us | 120 us | 24.4 us |
| | 1 | 350 us | 135 us | 28.2 us |
| | 2 | 417 us | 130 us | 24.4 us |
#### Benchmarks (CUDA)
| Shape | Dim | Before | After |
|:------------:|:---:|--------:|--------:|
| (10, 10, 10) | 0 | 12.5 us | 12.1 us |
| | 1 | 13.1 us | 12.2 us |
| | 2 | 13.1 us | 11.8 us |
| (50, 50, 50) | 0 | 33.7 us | 11.6 us |
| | 1 | 36.5 us | 15.8 us |
| | 2 | 41.1 us | 15 us |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59108
Reviewed By: mrshenli
Differential Revision: D28767060
Pulled By: ngimel
fbshipit-source-id: 93dcbe5483f71cc6a6444fbd5b1aa1f29975d857
Summary:
Fixes https://github.com/pytorch/pytorch/issues/57508
Earlier, a few CUDA `gradgrad` checks (see the list of ops below) were disabled because they were too slow. There have been improvements since then (see https://github.com/pytorch/pytorch/issues/57508 for reference), and this PR focuses on:
1. Time taken by `gradgrad` checks on CUDA for the ops listed below.
2. Enabling the tests again if the times sound reasonable
Ops considered: `addbmm, baddbmm, bmm, cholesky, symeig, inverse, linalg.cholesky, linalg.cholesky_ex, linalg.eigh, linalg.qr, lu, qr, solve, triangular_solve, linalg.pinv, svd, linalg.svd, pinverse, linalg.householder_product, linalg.solve`.
For numbers (on time taken) on a separate CI run: https://github.com/pytorch/pytorch/pull/57802#issuecomment-836169691.
cc: mruberry albanD pmeier
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57802
Reviewed By: ngimel
Differential Revision: D28784106
Pulled By: mruberry
fbshipit-source-id: 9b15238319f143c59f83d500e831d66d98542ff8
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58656
Ref gh-56794
`dim_apply` is problematic because it calls `Tensor.select` inside of a parallel
region. Instead, replace it with `TensorIterator` by squashing the
apply-dimension. This is similar to the `_dim_apply` function already used by
the sort kernels:
8c91acc161/aten/src/ATen/native/cpu/SortingKernel.cpp (L27)
Test Plan: Imported from OSS
Reviewed By: mruberry
Differential Revision: D28776441
Pulled By: ngimel
fbshipit-source-id: 14449d4b12ed4576f879bb65a35e881ce1a953b1
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59157
Currently view is represented as a copy since we don't support inplace
operations in NNC (similar to `aten::reshape`). The lowering for
`aten::expand_as` is exactly the same as for `aten::expand`, since
we're building the TE expression based on the output shape anyway.
Differential Revision: D28774224
Test Plan: Imported from OSS
Reviewed By: Chillee
Pulled By: ZolotukhinM
fbshipit-source-id: 0a1593c4c6500dcc5a374213adb734180ae1f72e
Summary:
Per title. Now `norm` with fp16/bfloat16 inputs and fp32 outputs on cuda won't do an explicit cast.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59134
Reviewed By: mruberry
Differential Revision: D28775729
Pulled By: ngimel
fbshipit-source-id: 896daa4f02e8a817cb7cb99ae8a93c02fa8dd5e9
Summary:
The triangular_solve only returns the first input, since the second input is just a copy of the first one. Why does that exist?
Also, I fixed the permute lowering - I was previously doing the inverse application of the permute.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59131
Reviewed By: ansley
Differential Revision: D28768169
Pulled By: Chillee
fbshipit-source-id: 8e78611c6145fb2257cb409ba98c14ac55cdbccf
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58992
Currently, we define Torchbind custom classes in the same place that we define Python bindings.
This is nice from a code location perspective, but has two downsides:
1. These custom classes are not available in a C++-only build.
2. These break when included in torch::deploy.
Some explanation on the second issue: torch::deploy creates many Python
interpreters, and creates a full copy of all the bindings for each one. This
will run the static initialization code once for each copy of the bindings,
leading to multiple registration of the custom classes (and therefore an
error).
This PR splits out the relevant custom class binding code into its own source
file to be included in libc10d, which can be compiled and statically
initialized a single time and linked against from the c10d python bindings.
ghstack-source-id: 130168942
Test Plan: CI
Reviewed By: wconstab
Differential Revision: D28690832
fbshipit-source-id: 3c5e3fff28abb8bcdb4a952794c07de1ee2ae5a8
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59017
See the comment in ThreadLocal.h for context.
I used a slightly dirty preprocessor hack to minimize the number of changes.
The hope is that we'll be able to revert all of these soon.
Test Plan:
CI.
Built FB4A with gnustl and saw no references to cxa_thread_atexit
in the PyTorch libraries.
Reviewed By: ilia-cher
Differential Revision: D28720762
fbshipit-source-id: 0f13c7ac5a108b95f8fde6dbc63c6b8bdb8599de
Summary: This wasn't picking up C10_ANDROID. Not sure how to prevent stuff like this.
Test Plan: Build for Android+gnustl, saw proper ThreadLocal being defined.
Reviewed By: swolchok
Differential Revision: D28720763
fbshipit-source-id: 58eb4ea80ad32a856fcea6d65e5c1c37ebf3bd55
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58932
This adds all the operators necessary for mobilenet. I kind of wanted to get these landed to unblock ZolotukhinM, but I'm happy to split these up into multiple PRs if it makes reviewing easier. In terms of testing, I'm going to add an automated shape analysis OpInfo test.
Test Plan: Imported from OSS
Reviewed By: mrshenli
Differential Revision: D28727246
Pulled By: eellison
fbshipit-source-id: c17f9b7bdf7a43ddf99212b281ae2dd311259374
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56966
This PR adds a toggle to shape analysis which won't inline complete tensor shapes as constants into the shape compute graph, which is a good stress test on the partial evaluation pipeline.
Test Plan: Imported from OSS
Reviewed By: bdhirsh
Differential Revision: D28444664
Pulled By: eellison
fbshipit-source-id: a62e424515a8837a4b596546efa93af5e8e61f10
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59088
Clean up comments and organize the tests better
Test Plan:
python test/test_quantization.py
Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D28750064
fbshipit-source-id: 4c36922e25e3adea3aaa8b4d9185dc28b17aa57c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58655
Ref gh-56794
The two pass reduction calls `copy_` and `select` inside a parallel region. The
`copy_` can just be moved outside of the parallel region, but avoiding the
`select` call is more complicated because it's needed to construct the
`TensorIterator`. Instead, I factor out a `serial_for_each` free-function that
just takes pointers and strides. Then manually advance the pointer to the
thread-specific slice of data.
Test Plan: Imported from OSS
Reviewed By: mruberry
Differential Revision: D28735330
Pulled By: ngimel
fbshipit-source-id: 8e096eb5801af9381ebd305e3ae7796a79b86298
Summary:
This PR also adds a few minor logic changes to the custom PyTorch PR tests logic.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59016
Reviewed By: mrshenli
Differential Revision: D28732437
Pulled By: malfet
fbshipit-source-id: 14b7ed837209d77e0e175d92959aeb0f086e6737
Summary:
Let index/index_put implementation in aten take care of moving the indices to the correct device, don't make python wrapper do that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59059
Reviewed By: mruberry
Differential Revision: D28750562
Pulled By: ngimel
fbshipit-source-id: 2f2b5f875733898f1c0b30b544c89808f91e4a6f
Summary:
Depends on https://github.com/pytorch/pytorch-probot/pull/22. Adds a new label called `ci/no-build` that disables the CircleCI `build` workflow on PRs. The current behavior should be the same in the absence of `ci/no-build`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58778
Reviewed By: malfet
Differential Revision: D28615349
Pulled By: samestep
fbshipit-source-id: 1ed521761ca4ffa32db954a51918f693beddb3f3
Summary:
This removes our cancel_redundant_workflows job in favor of GitHub's built in [`concurrency`](https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency) keyword which limits runs of a particularly named group. Since the group names have to be unique per job per PR, it should end up looking something like `filename-job_name-{pr number | sha (for non-PR workflows)}`. There's also a script to check workflows and ensure that it is being properly gated so people don't forget to add the key in the future.
`ruamel.YAML` also didn't like some of the spacing, so that is changed, but it also makes things more consistent.
This also has a minor change of renaming the workflow templates from `.in` to `.j2` which is the standard Jinja2 extension that the VSCode extension automatically picks up for syntax highlighting / errors.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59019
Test Plan: pushed a commit `reset` and then immediately another commit `test`: the jobs from `reset` are cancelled: https://github.com/pytorch/pytorch/actions/runs/880099510
Reviewed By: samestep
Differential Revision: D28722419
Pulled By: driazati
fbshipit-source-id: c547a161877a0583be9d7edb29244b086b6bcad1
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59063
`TensorMeta::maybe_get_output()` returns `const Tensor&`; there is no need to copy the Tensor.
ghstack-source-id: 130044287
Test Plan: CI
Reviewed By: ngimel
Differential Revision: D28735225
fbshipit-source-id: f2bdf39b28de245ec4664718490e7e0b36bc8819
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58697
1. Add a symbolic function for aten::roll() op in symbolic_opset9.py.
2. Add a test with multiple scenarios as well.
Test Plan: Imported from OSS
Reviewed By: driazati, bhosmer
Differential Revision: D28714807
Pulled By: SplitInfinity
fbshipit-source-id: eae85f2dcf02737c9256a180f6905a935ca3f57e
Co-authored-by: fatcat-z <jiz@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58696
It seems the JIT produces an output for aten::_set_item on lists but
not on dicts. Previously the code would crash because it assumed it
was operating on a list.
The different behavior can be seen with the following test:
```python
import typing

import torch

class DictModule(torch.nn.Module):
    def forward(self, x_in: torch.Tensor) -> typing.Dict[str, torch.Tensor]:
        x_out = {}
        x_out["test_key_out"] = x_in
        return x_out

x_in = torch.tensor(1)
dms = torch.jit.script(DictModule())
torch.onnx.export(dms, (x_in,), "/dev/null", example_outputs=(dms(x_in),))
```
Before this change:
`RuntimeError: outputs_.size() == 1INTERNAL ASSERT FAILED at "../torch/csrc/jit/ir/ir.h":452, please report a bug to PyTorch.`
After this change:
`RuntimeError: Exporting the operator prim_DictConstruct to ONNX opset version 9 is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub.`
This is a more useful error message.
Test Plan: Imported from OSS
Reviewed By: driazati
Differential Revision: D28714804
Pulled By: SplitInfinity
fbshipit-source-id: 1e5dc5fb44d1e3f971a22a79b5cf009d7590bf84
Co-authored-by: Gary Miguel <garymiguel@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58693
`ONNX::SequenceEmpty` requires dtype to be provided, and it defaults to float. We update the dtype of a previously created `ONNX::SequenceEmpty` node when the dtype is later discovered, through a downstream `ONNX::SequenceInsert` node, to be something other than float. This PR improves the algorithm to cover the nested loop case.
Test Plan: Imported from OSS
Reviewed By: driazati
Differential Revision: D28714808
Pulled By: SplitInfinity
fbshipit-source-id: e45ab3a12d0fec637733acbd3cd0438ff80d2cd4
Co-authored-by: BowenBao <bowbao@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58692
This is a fix for exporting fairseq models, see:
```python
model = torch.hub.load(github, 'conv.wmt14.en-fr', tokenizer='moses', bpe='subword_nmt')
model = torch.hub.load(github, 'conv.wmt17.en-de', tokenizer='moses', bpe='subword_nmt')
```
With this fix, and after commenting out one line in the model script (`GradMultiply`), these two models can be exported successfully with perf met.
The original PR https://github.com/pytorch/pytorch/pull/57708 has merging issue, use this one instead.
Test Plan: Imported from OSS
Reviewed By: driazati
Differential Revision: D28714809
Pulled By: SplitInfinity
fbshipit-source-id: 71c2de6cec7ee05af68560996acf47d97af46fb2
Co-authored-by: David <jiafa@microsoft.com>
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58691
Note the first commit in this PR has its own pull request here since it seemed self-contained:
https://github.com/pytorch/pytorch/pull/57082
* [ONNX] simplify batch_first logic in RNN tests
* [ONNX] support GRU with packed input in scripting mode
This required two changes:
* Add as_tensor to symbolic_opset9.py
* Change torch::jit::pushPackingPastRnn to recognize and properly
replace another use of the batch_sizes output of prim::PackPadded.
Previously the code assumed that the first use was as input to the
RNN operator. However in some cases, it is also used to compute
max_batch_size. For example in this code:
https://github.com/pytorch/pytorch/blob/febff45/torch/nn/modules/rnn.py#L815-L815
With these changes the GRU tests now pass in scripting mode for opset
version >= 11.
Test Plan: Imported from OSS
Reviewed By: driazati
Differential Revision: D28714805
Pulled By: SplitInfinity
fbshipit-source-id: f19647a04533d9ec76399a8793b3f712ea0337d2
Co-authored-by: Gary Miguel <garymiguel@microsoft.com>
Summary:
Addresses upcoming changes that are part of ROCm 4.2 and affect PyTorch JIT.
- ROCM_VERSION macro must be available to both device and host compilation passes.
- Unifies some of CUDA and HIP differences in the code generated.
- NAN / POS_INFINITY / NEG_INFINITY
- Do not hipify `extern __shared__` -> `HIP_DYNAMIC_SHARED()` macro [deprecated]
- Differentiates bf16 codegen for HIP.
- Optionally provides missing macros when using hiprtc precompiled header feature.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57400
Reviewed By: ejguan
Differential Revision: D28421065
Pulled By: malfet
fbshipit-source-id: 215f476773c61d8b0d9d148a4e5f5d016f863074
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52659
**Summary**
This commit adds `torch._C.ScriptDict`, a dictionary type that has reference
semantics across the Python/TorchScript boundary. That is, modifications
made to instances of `torch._C.ScriptDict` in TorchScript are visible in
Python even when it is not returned from the function. Instances can be
constructed by passing an instance of a Python dictionary to
`torch.jit.script`. In the case of an empty dictionary, its type is
assumed to be `Dict[str, Tensor]` to be consistent with the handling of
empty dictionaries in TorchScript source code.
`torch._C.ScriptDict` is implemented using a modified version of pybind's `stl_bind.h`-style bindings attached to `ScriptDict`, `ScriptDictIterator` and `ScriptDictKeyIterator`, wrapper classes around `c10::impl::GenericDict` and `c10::impl::GenericDict::iterator`. These bindings allow instances of `torch._C.ScriptDict` to be used as if they were a regular Python `dict`. Reference semantics are achieved by simply retrieving the `IValue` contained in `ScriptDict` in `toIValue` (invoked when converting Python arguments to `IValues` before calling TorchScript code).
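A minimal sketch of the reference semantics described above (hedged; exact behavior follows the description in this commit rather than any separate documentation):
```python
from typing import Dict

import torch

@torch.jit.script
def add_entry(d: Dict[str, torch.Tensor]):
    d["new_key"] = torch.ones(2)

d = torch.jit.script({"x": torch.zeros(2)})  # constructs a torch._C.ScriptDict
add_entry(d)
assert "new_key" in d  # the mutation made in TorchScript is visible in Python
```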
**Test Plan**
This commit adds `TestScriptDict` to `test_list_dict.py`, a set of tests
that check that all of the common dictionary operations are supported
and that instances have reference semantics across the
Python/TorchScript boundary.
Differential Revision: D27211605
Test Plan: Imported from OSS
Reviewed By: gmagogsfm
Pulled By: SplitInfinity
fbshipit-source-id: 446d4e5328375791aa73eb9e8b04dfe3465af960
Summary:
Closes gh-24745
The existing PR (gh-50655) has been stalled because `TensorIterator` doesn't guarantee iteration order in the same way that `TH_TENSOR_APPLY` does. For contiguous test cases this isn't an issue; but it breaks down for example with channels last format. I resolve this by adding a new `TensorIteratorConfig` parameter, `enforce_linear_iteration`, which disables dimension reordering. I've also added a test case for non-contiguous tensors to verify this works.
This PR also significantly improves performance by adding multithreading support to the algorithm. As part of this, I wrote a custom `count_nonzero` that gives per-thread counts which is necessary to write the outputs in the right location.
| Shape | Before | After (1 thread) | After (8 threads) |
|:----------:|--------:|-----------------:|------------------:|
| 256,128,32 | 2610 us | 2220 us | 496 us |
| 128,128,32 | 1250 us | 976 us | 175 us |
| 64,128,32 | 581 us | 486 us | 88 us |
| 32,128,32 | 292 us | 245 us | 80 us |
| 16,128,32 | 147 us | 120 us | 71 us |
| 8,128,32 | 75 us | 61 us | 61 us |
| 4,128,32 | 39 us | 32 us | 32 us |
| 2,128,32 | 20 us | 17 us | 17 us |
| 1,128,32 | 11 us | 9 us | 9 us |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58811
Reviewed By: anjali411
Differential Revision: D28700259
Pulled By: ngimel
fbshipit-source-id: 9b279ca7c36d8e348b7e5e4be0dd159e05aee159
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58977
* The test was flaky because part of it ran asynchronously
* Remove the async part to test only the functionality added
Test Plan:
regular test:
`buck test mode/dev //caffe2/aten:test_thread_pool_guard -- --exact 'caffe2/aten:test_thread_pool_guard - TestThreadPoolGuard.TestRunWithGuard' --run-disabled`
stress test:
`buck test mode/dev //caffe2/aten:test_thread_pool_guard -- --exact 'caffe2/aten:test_thread_pool_guard - TestThreadPoolGuard.TestRunWithGuard' --run-disabled --jobs 18 --stress-runs 10 --record-results`
Reviewed By: kimishpatel
Differential Revision: D28703064
fbshipit-source-id: be19da3f42f44288afc726bdb2f40342eee26e01
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59027
Add underscores to some of the internal names
Test Plan:
python test/test_profiler.py -v
Imported from OSS
Reviewed By: mrshenli
Differential Revision: D28724294
fbshipit-source-id: 1f6252e4befdf1928ac103d0042cbbf40616f74a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57068
When training with histogram observer on, we got this runtime error:
```
torch/quantization/observer.py", line 942, in forward
self.bins)
self.histogram.resize_(combined_histogram.shape)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
self.histogram.copy_(combined_histogram)
self.min_val.resize_(combined_min.shape)
RuntimeError: cannot resize variables that require grad
```
Since this is the histogram observer, which is used to collect histogram information, it should not need gradients. So we turn off grad before resizing, using the `detach_()` method.
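A standalone illustration of the fix (hypothetical tensor names, not the observer code itself):
```python
import torch

histogram = torch.zeros(10, requires_grad=True)  # buffer that mistakenly requires grad
combined = torch.randn(20)

# histogram.resize_(combined.shape)  # RuntimeError: cannot resize variables that require grad
histogram.detach_()                   # drop grad tracking in place
histogram.resize_(combined.shape)     # resizing now succeeds
histogram.copy_(combined)
```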
Test Plan:
- arc lint
- Train with histogram observer turned on, training finished successfully
f264139727
Reviewed By: supriyar
Differential Revision: D27147212
fbshipit-source-id: abed5b9c4570ffc6bb60e58e64791cfce66856cd
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57067
auto format the code
Test Plan: lint
Reviewed By: jerryzh168
Differential Revision: D27147213
fbshipit-source-id: 008871d276c8891b2411549e17617e5c27d16ee3
Summary:
Relates to https://github.com/pytorch/pytorch/issues/58826.
Currently we don't have the exact build time for non-binary jobs collected. collecting this reports the exact test time from pytorch checkout finish till build stage successful.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58998
Test Plan: CI - validate result on scuba table
Reviewed By: janeyx99
Differential Revision: D28747962
Pulled By: walterddr
fbshipit-source-id: 715d91d597bc004977fdceaf245263c9c8aacc84
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59007
Create folders for each test category and move the tests.
Will follow-up with a cleanup of test_quantization.py
Test Plan:
python test/test_quantization.py
Imported from OSS
Reviewed By: HDCharles
Differential Revision: D28718742
fbshipit-source-id: 4c2dbbf36db35d289df9708565b7e88e2381ff04
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59000
These tests span both QAT and PTQ APIs so factor them out
Test Plan:
python test/test_quantization.py TestModelNumericsEager
Imported from OSS
Reviewed By: HDCharles
Differential Revision: D28713910
fbshipit-source-id: b2ad27cf59abb7cc0c4e4da705f8c9220410f8ad
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58999
Rename the test files to be more explicit that they are for eager mode
Test Plan:
python test/test_quantization.py
Imported from OSS
Reviewed By: HDCharles
Differential Revision: D28713909
fbshipit-source-id: b4ccd06c841fe96edf8c065a0bceae15fed260f9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58963
some tests are used to check the op level numerics of the fake quantize operations
Test Plan:
python test/test_quantization.py
Imported from OSS
Reviewed By: HDCharles
Differential Revision: D28696599
fbshipit-source-id: 98f9b0c993dd43050176125461ddd5288142989b
Summary:
`makeDeviceForHostname` and `makeDeviceForInterface` are almost
duplicates, except for different default argument values.
Create a generic `makeGlooDevice` anonymous function that takes both a
host name and an interface name, and call it from both
makeDeviceFor[Hostname|Interface].
Also solve two other minor issues:
- do not call `getenv("GLOO_DEVICE_TRANSPORT")` during library load
time
- raise an exception rather than crash if GLOO_DEVICE_TRANSPORT is set to an unknown value
Fixes #{issue number}
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58996
Reviewed By: pbelevich
Differential Revision: D28713324
Pulled By: malfet
fbshipit-source-id: cb33b438078d163e3ec6f047f2e5247b07d94f8d
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/58965
Test Plan:
```
// This script is just for playing around
buck run mode/opt -c python.package_style=inplace deeplearning/trt/fx2trt:fx2trt_quantized_test
// To check accuracy
buck run mode/opt -c python.package_style=inplace deeplearning/trt/fx2trt:uru_10x10_to_trt_eval.py
```
Reviewed By: mortzur
Differential Revision: D28445702
fbshipit-source-id: 5357a02a78cb7f9cf772e7a91a08166ef90cc4f8
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58608
D28523254 (705dd9ffac) assures us that this was safe: we renamed away all the internal uses of add_input/add_output. (Also, practically everything I found internally could borrow, and the stuff that couldn't wouldn't compile because it is passed unnamed temporaries.)
ghstack-source-id: 129882758
Test Plan: CI
Reviewed By: ezyang
Differential Revision: D28524585
fbshipit-source-id: 437235d5cc55c3737c928991a996b8f5e1c5beaa
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58607
Don't let code that tries to pass temporaries to these variants compile.
ghstack-source-id: 129882759
Test Plan: CI
Reviewed By: ezyang
Differential Revision: D28524227
fbshipit-source-id: e5ce80f048480c67645198eaa0e43532567d4adb
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58606
Removes the pit of non-success around using the owning variants; gives us the option to make add_{in,out}put borrow in the future as a pit of success if we decide that's not bc-breaking.
ghstack-source-id: 129882760
Test Plan: CI
Reviewed By: ngimel
Differential Revision: D28523976
fbshipit-source-id: ab5eb7bf5d672a0f8c4a50eb8a21c156d4189709
Summary:
The `factory_kwargs` kwarg was previously undocumented in `nn.Quantize`. Further, the `Attributes` section of the docs was improperly filled in, resulting in bad formatting. This section doesn't apply since `nn.Quantize` doesn't have parameters, so it has been removed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59025
Reviewed By: anjali411
Differential Revision: D28723889
Pulled By: jbschlosser
fbshipit-source-id: ba86429f66d511ac35042ebd9c6cc3da7b6b5805
Summary:
The `UninitializedBuffer` class was previously left out of `nn.rst`, so it was not included in the generated documentation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59021
Reviewed By: anjali411
Differential Revision: D28723044
Pulled By: jbschlosser
fbshipit-source-id: 71e15b0c7fabaf57e8fbdf7fbd09ef2adbdb36ad
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56504
Having callbacks registered but disabled via their
`shouldRun` callback defeats the `shouldRunRecordFunction`
optimization (no relation between the two things, despite the
shared prefix on the names) that aims to skip `RecordFunction`
construction.
This diff attempts to safely rectify this issue: we drop support for
`shouldRun` callbacks (this is bc-breaking; does anything use these
externally? do I need to add the support back and just stop using it
internally?), add support for enabling and disabling callbacks, and
(for global callbacks) make doing so thread-safe.
There is an interesting subtlety with `std::atomic` that came up: it
is neither copyable nor movable, which precludes putting it into
`std::vector`. I manually overrode this because the thread safety
reasons it is neither copyable nor movable don't apply here; we
already state that adding or removing callbacks (the operations that
might copy/move an atomic) are not thread-safe and should be done at
initialization time.
ghstack-source-id: 129614296
Test Plan:
Existing CI should cover correctness, right? Inspected
perf report of a simple benchmark that runs nn.Linear in a loop on
CUDA, where internally have Kineto initialized and thus had a
shouldRun observer previously; we are no longer going through the
dispatcher's slow RecordFunction path or spending measurable time
constructing RecordFunction instances.
Reviewed By: ilia-cher
Differential Revision: D27834944
fbshipit-source-id: 93db1bc0a28b5372f7307490c908457e7853fa92
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58517
Building upon the sharding specifications, this PR introduces the
initial skeleton of ShardedTensor and allows building a ShardedTensor by
specifying ChunkedShardingSpec.
In follow up PRs, I'll add further support for GenericShardingSpec.
ghstack-source-id: 129917841
Test Plan:
1) unit tests.
2) waitforbuildbot
Reviewed By: SciPioneer
Differential Revision: D28526012
fbshipit-source-id: 8e62847b58957d284e40f57a644302c171289138
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58568
I split out the file rename into a separate commit to make the diff easier. The template file name is `aten_xla_type.h` -> `{DispatchKey}NativeFunctions.h`
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711298
Pulled By: bdhirsh
fbshipit-source-id: 2fa7d2abede560a2c577300f0b5a1f7de263d897
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58064
**Summary**
This PR tries to remove all xla-specific logic from the codegen except for two places:
- renaming the `aten_xla_type.h/cpp` template files; Going to do that in a separate PR just to make the diff easier to understand
- CPU fallback logic (everything in `aten_xla_type_default.h/cpp` and `gen_external_aten_fallbacks.py`). I'm trying to kill all of that logic in a subsequent PR by making the CPU fallback a boxed kernel, so it felt unnecessary to go through it all and remove the xla references here.
**Notable changes**
The xla codegen includes some custom logging in each kernel wrapper, so I added a few new knobs to the external yaml, that we now test. I have a corresponding [xla-side PR](https://github.com/pytorch/xla/pull/2944) with the new yaml changes, which look like this:
```
per_op_log: XLA_FN_TRACK(3)
per_argument_log: TF_VLOG(3)
cpu_fallback_counter: XLA_COUNTER("aten::{name}", 1)
extra_headers: >
#include <tensorflow/compiler/xla/xla_client/debug_macros.h>
#include <tensorflow/compiler/xla/xla_client/metrics.h>
#include <tensorflow/compiler/xla/xla_client/tf_logging.h>
#include <torch_xla/csrc/function_call_tracker.h>
#include <torch_xla/csrc/aten_xla_type.h>
#include <torch_xla/csrc/aten_xla_type_default.h>
```
Test Plan: Imported from OSS
Reviewed By: anjali411
Differential Revision: D28711095
Pulled By: bdhirsh
fbshipit-source-id: 90a48440f2e865a948184e2fb167ea240ada47bb
Summary:
## Motivation
The utils in namespace `c10` require `__assert_fail` when NDEBUG is defined in kernel code.
The `__assert_fail` declaration in pytorch is not compatible with the SYCL specification.
This causes a compile error when using these utils in SYCL kernels.
## Solution
Add the `__assert_fail` declaration for SYCL kernels to pytorch when compiling the SYCL kernels with `c10` utils.
## Additional context
`__assert_fail` in SYCL kernel
`extern SYCL_EXTERNAL void __assert_fail(const char *expr, const char *file, unsigned int line, const char *func);`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58906
Reviewed By: anjali411
Differential Revision: D28700863
Pulled By: ezyang
fbshipit-source-id: 81896d022b35ace8cd16474128649eabedfaf138
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58974
I don't know how we overlooked this for so long...
ghstack-source-id: 129932134
Test Plan:
Predictor test of model 184778294_0 using multiple request replay
threads. It's not clear to me why multithreading matters, except that perhaps
it makes it easier to get an unknown shape in the profile.
Reviewed By: navahgar
Differential Revision: D28702660
fbshipit-source-id: 565550b1d2e571d62d0c8b21150193f2a7ace334
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58062
Make a templated function to ensure BatchSparseToDense supports int32 lengths/indices
Test Plan:
```
buck test //caffe2/caffe2/python/operator_test:batch_sparse_to_dense_op_test
```
Reviewed By: khabinov
Differential Revision: D28271423
fbshipit-source-id: 41b88b7a3663616b533aaf4731ff35cdf6ec4c85
Summary:
This PR introduces a docker base image to speed up the `clang-tidy` job's dependencies stage. Originally I was looking into using the native GitHub Actions cache, but the dependencies are spread across many apt and pip installation steps, so consolidating them into a docker image might work better. It shortens the deps installation time from 4min down to 1min by pulling from the docker base image.
Base image used: https://github.com/pytorch/test-infra/pull/15
```
FROM nvidia/cuda:10.2-devel-ubuntu18.04
RUN apt-get update && apt-get upgrade -y
RUN apt install -y software-properties-common wget
RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
RUN apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main"
RUN apt-add-repository ppa:git-core/ppa
RUN apt-get update && apt-get upgrade -y && apt-get install -y git python3-dev python3-pip build-essential cmake clang-tidy-11
RUN update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
RUN pip3 install pyyaml typing_extensions dataclasses
```
Previous successful run of clang-tidy: https://github.com/pytorch/pytorch/runs/2671193875?check_suite_focus=true
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58964
Reviewed By: samestep
Differential Revision: D28712536
Pulled By: zhouzhuojie
fbshipit-source-id: 0c48a605efe8574c104da6a0cad1a8b7853ba35e
Summary:
* Open the json config file safely using a context manager (a with block).
* This makes sure that the file is closed even if an exception is raised (see the sketch below).
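A minimal sketch of the pattern being adopted (the file name here is hypothetical):
```python
import json

# Opening the config inside a with-block guarantees the file handle is
# closed even if json.load raises.
with open("config.json") as f:
    config = json.load(f)
```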
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58077
Reviewed By: anjali411
Differential Revision: D28711177
Pulled By: H-Huang
fbshipit-source-id: 597ba578311b1f1d6706e487872db4e784c78c3c
Summary:
Fixes https://github.com/pytorch/pytorch/issues/55741 by adding a comment regarding the behavior of `kaiming_uniform_`
The docstring is correct in this case. For example:
```python
import math
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
in_channels = 120
groups = 2
kernel = (3, 8)
m = nn.Conv2d(in_channels=in_channels, groups=groups,
              out_channels=100, kernel_size=kernel)
k = math.sqrt(groups / (in_channels * math.prod(kernel)))
print(f"k: {k:0.6f}")
print(f"min weight: {m.weight.min().item():0.6f}")
print(f"max weight: {m.weight.max().item():0.6f}")
```
outputs:
```
k: 0.026352
min weight: -0.026352
max weight: 0.026352
```
And when we plot the distribution, it is uniform with the correct bounds:
```python
_ = plt.hist(m.weight.detach().numpy().ravel())
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/58931
Reviewed By: anjali411
Differential Revision: D28689863
Pulled By: jbschlosser
fbshipit-source-id: 98eebf265dfdaceed91f1991fc4b1592c0b3cf37
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58092
Fixes #58044.
This PR:
- adds `ATEN_FN(op)` and `ATEN_FN2(op, overload)` macros that resolve to
a non-overloaded function in aten::_ops that calls the desired operator
(without default arguments).
The motivation for this is two-fold:
1) Using aten operators with templates is hard if the operator is
overloaded (e.g. add.Tensor and add.Scalar).
2) Method-only operators require special handling; pointers-to-method
are different from function pointers. `ATEN_FN2(add_, Tensor)` returns
a function instead of a method.
There is some interesting behavior for out= operations.
`ATEN_FN2(sin, "out")` gives a function that is *faithful* to the schema;
that is, the order of arguments is exactly what it looks like in the
schema. This makes it so that you can directly register
`ATEN_FN2(sin,"out")` (or a function wrapping it using the same signature)
as an override for a DispatchKey.
Test Plan:
- New tests that ATEN_FN2 works on function and method-only operators
- New test that ATEN_FN works
- New test that ATEN_FN macro returns a "faithful" function.
Codegen output:
Operators.h and Operators.cpp are both here:
https://gist.github.com/zou3519/c2c6a900410b571f0d7d127019ca5175
Reviewed By: mruberry
Differential Revision: D28643215
Pulled By: zou3519
fbshipit-source-id: 7b2b8459f1b2eb5ad01ee7b0d2bb77639f77940e
Summary:
Two main changes:
1. Change the argument of the collection of backport_v{i}_to_v{i-1} from (reader, writer) to (input_model_stream, output_model_stream), so it's easier to backport a model in option 2.
> 2) [Both format and content change] Use torch.jit.load() to load the stream,
and save it to output_model_stream.
2. Fix an issue in the test `backportAllVersionCheck`. Previously it declared `std::ostringstream oss` and used `oss.clear()` to reset the stringstream. However, the `clear()` function doesn't reset the stream content, and causes a problematic stream. As a mitigation, checks are added to prevent a corrupted stream on each iteration of the while loop.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58790
ghstack-source-id: 129929960
Test Plan:
CI
```
buck test mode/dev //caffe2/test/cpp/jit:jit
```
Reviewed By: raziel, iseeyuan
Differential Revision: D28620961
fbshipit-source-id: b0cbe0e88645ae278eb3999e2a84800702b5f985
Summary:
Context:
The error message when `broadcasts_input` is marked incorrectly is uninformative [See Error Currently]
https://github.com/pytorch/pytorch/pull/57941#discussion_r631749435
Error Currently
```
Traceback (most recent call last):
File "/home/kshiteej/Pytorch/pytorch_i0_promotion/test/test_ops.py", line 326, in test_variant_consistency_eager
_test_consistency_helper(samples, variants)
File "/home/kshiteej/Pytorch/pytorch_i0_promotion/test/test_ops.py", line 310, in _test_consistency_helper
variant_forward = variant(cloned,
File "/home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.8/unittest/case.py", line 227, in __exit__
self._raiseFailure("{} not raised".format(exc_name))
File "/home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.8/unittest/case.py", line 164, in _raiseFailure
raise self.test_case.failureException(msg)
AssertionError: RuntimeError not raised
```
Error After PR
```
Traceback (most recent call last):
File "/home/kshiteej/Pytorch/pytorch_i0_promotion/test/test_ops.py", line 329, in test_variant_consistency_eager
_test_consistency_helper(samples, variants)
File "/home/kshiteej/Pytorch/pytorch_i0_promotion/test/test_ops.py", line 313, in _test_consistency_helper
variant_forward = variant(cloned,
File "/home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.8/unittest/case.py", line 227, in __exit__
self._raiseFailure("{} not raised".format(exc_name))
File "/home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.8/unittest/case.py", line 164, in _raiseFailure
raise self.test_case.failureException(msg)
AssertionError: RuntimeError not raised : inplace variant either allowed resizing or you have marked the sample SampleInput(input=Tensor, args=(tensor([[[ 2.1750, -8.5027, -3.1403, -6.9942, 3.2609],
[-2.5057, -5.9123, -5.4633, 6.1203, -8.2124],
[-3.5802, -8.4869, -6.0700, 2.3431, -8.1955],
[-7.3316, 1.3248, -6.8661, 7.1483, -8.0719],
[ 4.5977, -4.0448, -6.2044, -2.1314, -8.4956]],
[[ 3.2769, -8.4360, 1.2826, 7.1749, 4.7653],
[-0.2816, -2.5997, -4.7659, -3.7814, 3.9704],
[-2.1778, -3.8117, -6.0276, -0.8423, -5.9646],
[ 8.6544, -3.0922, 0.2558, -4.9318, -4.7596],
[ 4.5583, 4.3830, 5.8793, 0.9713, -2.1481]],
[[-1.0447, 0.9334, 7.6405, -4.8933, -7.4010],
[ 7.7168, -8.4266, -5.5980, -6.9368, 7.1309],
[-8.7720, -5.0890, -0.4975, 1.9518, 1.7074],
[-8.5783, 8.5510, -8.5459, -3.5451, 8.4319],
[ 8.5052, -8.9149, -6.6298, -1.2750, -5.7367]],
[[-6.5625, 8.2795, -4.9311, 1.9501, -7.1777],
[-8.4035, 1.1136, -7.6418, -7.0726, -2.8281],
[ 4.2668, -0.2883, -6.2246, 2.3396, 1.2911],
[ 4.6550, -1.9525, 4.4873, -3.8061, -0.8653],
[-3.4256, 4.4423, 8.2937, -5.3456, -4.2624]],
[[ 7.6128, -6.3932, 4.7131, -5.4938, 6.4792],
[-6.5385, 2.4385, 4.5570, 3.7803, -8.3281],
[-2.9785, -4.4745, -1.1778, -8.9324, 1.3663],
[ 3.7437, 3.5171, -6.3135, -8.4519, -2.7033],
[-5.0568, -8.4630, -4.2870, -3.7284, -1.5238]]], device='cuda:0',
dtype=torch.float32, requires_grad=True),), broadcasts_input=True) incorrectly with `broadcasts_self=True
```
**NOTE**:
Printing the sample looks very verbose and it may be hard to figure out which sample is incorrectly configured if there are multiple samples with similar input shapes.
Two Options to make this error less verbose
* Don't print the sample and just print `inplace variant either allowed resizing or you have marked one of the sample incorrectly with broadcasts_self=True`
* Have some mechanism to name samples which will be printed in the `repr` (which will need extra machinery)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58295
Reviewed By: ngimel
Differential Revision: D28627308
Pulled By: mruberry
fbshipit-source-id: b3bdeacac3cf9c0d984f0b85410ecce474291d20
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58966
Same as title.
Test Plan: CI since updated the check
Reviewed By: ngimel
Differential Revision: D28699577
fbshipit-source-id: 436fdc648a4c653081ff0e1b6b809c4af742055a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58897
We don't need to be building debug info on PRs since it just fills up S3/CircleCI storage with useless 800 MB zips. This flips it so it's only run on master + release branches. See #58898 for CI signal
Also see pytorch/builder counterpart (unlike the last debuginfo PR there is no hard dependency between these two so there won't be any churn on un-rebased PRs): https://github.com/pytorch/builder/pull/778
Test Plan: Imported from OSS
Reviewed By: seemethere, samestep
Differential Revision: D28689413
Pulled By: driazati
fbshipit-source-id: 77a37e84afe492215008d5e023ceab0c24adb33c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58889
fixes https://github.com/pytorch/pytorch/issues/58796
Planning on re-testing locally tomorrow morning to confirm, but this change should fix the non-determinism in the codegen output that was causing `ccache` not to re-use its cached output.
I built from the commit referenced in https://github.com/pytorch/pytorch/issues/58796 a few times and ran `diff -Naur` on the codegen output in `build/aten/src/ATen`. After a few tries, `NativeFunctions.h` had a few diffs. The diffs were all related to the ordering of functional/inplace/out variants of a NativeFunctionGroup, which looked non-deterministic.
That looks like it's coming from my calling `set()` to filter out duplicate NativeFunction declarations. The earlier version of the codegen also called `set()` to filter out duplicates, but it did so individually for each `NativeFunction` object, before merging the groups (I'm not too sure why this didn't introduce non-determinism before, though). With the refactor from https://github.com/pytorch/pytorch/pull/57361, we're calling `set()` on the declarations from every operator for a given DispatchKey, which is probably what introduced the nondeterminism.
Test Plan: Imported from OSS
Reviewed By: gchanan
Differential Revision: D28675941
Pulled By: bdhirsh
fbshipit-source-id: bb66de00aafeeb9720d85e8156ac9f7539aed0d6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58892
The torchscript model after backport is missing the `constants` archive. Add it back, and extend the unit test to run the torchscript part.
ghstack-source-id: 129853819
Test Plan:
```
buck test mode/dev //caffe2/test/cpp/jit:jit -- --exact 'caffe2/test/cpp/jit:jit - LiteInterpreterTest.BackPortByteCodeModelAllVersions'
```
Reviewed By: raziel, iseeyuan
Differential Revision: D28664507
fbshipit-source-id: 5f98723231cc64ed203c062ee6f00d8adbdccf77
Summary:
Currently calling `scalar.to<std::complex<double>>()` for example compiles but throws an error at runtime. Instead, marking the non-specialized cases as `= delete` means the code fails to compile and you catch the error sooner.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58726
Reviewed By: zou3519, seemethere
Differential Revision: D28646057
Pulled By: ezyang
fbshipit-source-id: 9e4e3d1b4586eeecbb73db61bba56560b2657351
Summary:
To make build behaviour aligned with other third_party/ libraries,
introduce the `USE_SYSTEM_PYBIND11` build option (d55b25a633), which is set to OFF by
default, meaning PyTorch will be built with the bundled pybind11 even if
another version is already installed locally.
Fixes https://github.com/pytorch/pytorch/issues/58750
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58951
Reviewed By: driazati
Differential Revision: D28690411
Pulled By: malfet
fbshipit-source-id: e56b5a8f2a23ee1834b2a6d3807f287149decf8c
Summary: Relax test deadlines for c2 tests. We run on loaded machines, and timings are unreliable.
Test Plan: Fixes existing tests
Reviewed By: mruberry
Differential Revision: D28690006
fbshipit-source-id: 457707e81a1ec92548c1f23ea7a0022fa0a3bfda
Summary:
This PR resolves the second issue outlined in https://github.com/pytorch/pytorch/issues/58376, which has previously been discussed in https://github.com/pytorch/pytorch/issues/50722.
`cudaMemGetInfo` is bound/exposed to the Python API. An example function call is provided below:
```
device_free, device_total = torch.cuda.mem_get_info(torch.device('cuda:0'))
print(device_free, device_total)
```
In `CUDACachingAllocator.cpp`, in contrast to my initial PR, the newly defined function `std::pair<size_t, size_t> raw_cuda_mem_get_info(int device)` has been moved from the `CUDACaching` namespace to the `cuda` namespace. In addition, as suggested by ezyang, `det` has been removed from all function names.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58635
Reviewed By: zou3519
Differential Revision: D28649093
Pulled By: ezyang
fbshipit-source-id: d8b7c53e52cf73f35495d8651863c5bb408d7a6a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54548
We don't need to inline most of this class; doing so bloats code size and build time.
ghstack-source-id: 129765666
Test Plan:
Existing CI
buildsizebot some mobile apps
Reviewed By: jamesr66a
Differential Revision: D27277317
fbshipit-source-id: 7643aa35e4d794fee0a48a3bbe0890c2e428ae78
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58924
Was observing behavior where 7zip was nowhere to be found after a build
was completed. Let's just have 7zip be installed within the workflow as
well just to be completely sure 7zip is there.
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: samestep
Differential Revision: D28681241
Pulled By: seemethere
fbshipit-source-id: f649c1713edcdeb82c84fd67866700caa2726d71
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57481
This diff introduces the function name to InlinedCallStack.
Since we are using InlinedCallStack for debug information in the lite
interpreter as well as delegate backends, where InlinedCallStack cannot
be constructed from model source code, we need to save the function name.
In the absence of a function name, Function* is used to get the name of the
function. This is when JIT compiles code at runtime.
When that is not possible, this diff introduces a way to obtain the function
name.
Test Plan:
test_backend
test_cs_debug_info_serialization
Imported from OSS
Differential Revision: D28159097
Reviewed By: raziel, ZolotukhinM
Pulled By: kimishpatel
fbshipit-source-id: deacaea3325e27273f92ae96cf0cd0789bbd6e72
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57441
Previous diffs did not save the operator name in the debug info. For delegated
backends that only identify an op for profiling with a debug handle, the operator
name should be stored as well.
Furthermore, to complete the debug information, also serialize the function name.
Test Plan:
Existing lite interpreter and backend tests
Imported from OSS
Differential Revision: D28144581
Reviewed By: raziel
Pulled By: kimishpatel
fbshipit-source-id: 415210f147530a53b444b07f1d6ee699a3570d99
Summary:
Adds a note to the autograd docs explaining the difference between several often-conflated mechanisms.
Also adds a link to this note from the docs in `grad_mode` and `nn.module`.
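A minimal sketch of the mechanisms such a note typically distinguishes (assuming it covers module eval mode, no-grad mode, and per-tensor requires_grad):
```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Dropout(p=0.5))

# eval() switches layer behavior (dropout, batchnorm); it does not stop
# autograd from recording operations.
model.eval()

# no_grad() disables gradient recording for the enclosed computation.
with torch.no_grad():
    out = model(torch.randn(1, 4))

# requires_grad controls recording on a per-tensor basis.
first_weight = next(model.parameters())
first_weight.requires_grad_(False)
```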
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58513
Reviewed By: gchanan
Differential Revision: D28651129
Pulled By: soulitzer
fbshipit-source-id: af9eb1749b641fc1b632815634eea36bf7979156
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58759
* Makes `pthreadpool()->run` respect `_NoPThreadPoolGuard`: tasks run on the same thread instead of being parallelized when the guard is present.
Test Plan:
buck build //xplat/caffe2:aten_test_test_thread_pool_guard
./buck-out/last/aten_test_test_thread_pool_guard
Reviewed By: kimishpatel
Differential Revision: D28597425
fbshipit-source-id: 0365ad9947c239f5b37ce682802d4d401b8b0a48
Summary:
The downstream cub sort doesn't support inplace sorting; this PR adds a check to bail out to allocating a new tensor instead of silently corrupting the returned indices.
CC ngimel zasdfgbnm
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58327
Reviewed By: mruberry
Differential Revision: D28661244
Pulled By: ngimel
fbshipit-source-id: 40617a7d3bfcebbe187bb706b6b753371bb99097
Summary:
This is based on https://github.com/pytorch/pytorch/issues/48224.
To make `foreach` more flexible, this PR pushes unsupported cases to the slow path (see the sketch below).
Also, this adds some tests to verify that
- `foreach` functions work with tensors of different dtypes and/or memory layouts in 7bd4b2c89f
- `foreach` functions work with tensors on different devices in a list, but are on the same device if the indices are the same: def4b9b5a1
Future plans:
1. Improve the coverage of unittests using `ops` decorator & updating `foreach_unary_op_db` and creating `foreach_(binary|pointwise|minmax)_db`.
2. Support broadcasting in slow path. Ref: https://github.com/pytorch/pytorch/pull/52448
3. Support type promotion in fast path. Ref https://github.com/pytorch/pytorch/pull/52449
CC: ngimel mcarilli ptrblck
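A minimal sketch of the user-visible behavior, assuming the private `torch._foreach_*` entry points; whether a given call hits the fused fast path or a per-tensor slow path is an internal detail:
```python
import torch

# Homogeneous list: eligible for the fused fast path.
same = [torch.ones(3) for _ in range(4)]
print(torch._foreach_add(same, 1))

# Mixed dtypes in the list: after this change such cases are meant to
# fall back to a per-tensor slow path instead of being rejected.
mixed = [torch.ones(3), torch.ones(3, dtype=torch.float64)]
print(torch._foreach_add(mixed, 1))
```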
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56993
Reviewed By: zou3519
Differential Revision: D28630580
Pulled By: ngimel
fbshipit-source-id: e26ee74a39a591025e18c1ead48948cb7ec53c19
Summary:
1) remove pushing back to strides vector for 1D tensors, those strides are never used in the loop anyway
2) avoid calling get_data_ptrs unless necessary
3) don't call into assert_no_partial_overlap if tensorImpls are the same (assert_no_partial_overlap has this comparison too, but after a couple of nested function calls)
4) use is_non_overlapping_and_dense instead of is_contiguous in memory overlap checks (which, for some reason, is faster than is_contiguous, though I hoped that once is_contiguous was de-virtualized it would be the same).
Altogether, brings instruction count down from ~110K to 102735 for the following binary inplace benchmark:
```
In [2]: timer = Timer("m1.add_(b);", setup="at::Tensor m1=torch::empty({1}); at::Tensor b = torch::empty({1});", language="c++", timer=timeit.default_timer)
...: stats=timer.collect_callgrind(number=30, repeats=3)
...: print(stats[1].as_standardized().stats(inclusive=False))
```
similar improvements for unary inplace.
Upd: returned stride packing for now; the count is now 104295, so packing is worth ~52 instructions. We should think about how to remove it safely.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58810
Reviewed By: bhosmer
Differential Revision: D28664514
Pulled By: ngimel
fbshipit-source-id: 2e03cf90b37a411d9994a7607402645f1d8f3c93
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/58872
Test Plan: verify tests running on CI as expected
Reviewed By: suo
Differential Revision: D28646660
fbshipit-source-id: eb7d784844fb7bc447b4232e2f1e479d4d5aa72f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58281
When TORCH_DISTRIBUTED_DEBUG=DETAIL is enabled, this PR causes process groups created by `new_group` and `init_process_group` that are nccl or gloo to be wrapped in `ProcessGroupWrapper`.
As a result, the user will get back a `ProcessGroupWrapper` that they can use in the exact same way as a regular nccl/gloo pg, but will be more helpful in terms of debugging desync/hangs.
Besides doing collective desync checks, which should be transparent if there are indeed no issues in the user application, there are no semantic differences in using the wrapper pg. Note that there is a performance implication here but that is a tradeoff we are making when DETAIL debug mode is enabled.
Open to suggestions on how to test better. Currently I verified locally that enabling TORCH_DISTRIBUTED_DEBUG=detail creates the wrapper and all tests still pass, but that doesn't run in CI. On the other hand testing everything with debug=detail and the regular tests might be too much, so we have only added it to a few tests for now. We also do have tests in the below diff.
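A minimal single-process sketch of how the wrapper gets created, assuming the environment variable is set before the process group is initialized (the init_method address is a placeholder):
```python
import os
import torch.distributed as dist

os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"

# With DETAIL enabled, process groups created by init_process_group and
# new_group for gloo/nccl come back wrapped in ProcessGroupWrapper.
dist.init_process_group(
    backend="gloo",
    init_method="tcp://127.0.0.1:29500",
    rank=0,
    world_size=1,
)
pg = dist.new_group(ranks=[0])
```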
ghstack-source-id: 129817857
Test Plan: ci
Reviewed By: SciPioneer
Differential Revision: D28402301
fbshipit-source-id: c4d3438320f6f0986e128c738c9d4a87bbb6eede
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58492
Update graph rewrite to specify how values in replacement pattern should
map to values in original pattern for fuse_linear pass
(Note: this ignores all push blocking failures!)
Test Plan:
python test/test_quantization.py TestQuantizeJitPasses.test_fuse_linear
Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D28512464
fbshipit-source-id: 250a69cebc11eb4328a34c8f685b36e337439aae
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58300
Current state: graph rewrites that fuse nodes or add nodes
result in new nodes without the debug information that was available on the
original nodes. Thus we lose this information during graph rewriting.
This PR changes graph rewriting API to let user specify how the values
in the replacement pattern map to values in the pattern to be matched.
Then the graph rewriting will copy source range and inlined callstack
from the matched nodes onto the nodes being inserted.
(Note: this ignores all push blocking failures!)
Test Plan:
python test/test_jit.py
TestJit.test_pattern_based_rewrite_with_source_range_preserved
Imported from OSS
Reviewed By: malfet
Differential Revision: D28512465
fbshipit-source-id: 863173c29de726be85b3acbd3ddf3257eea36d13
Summary:
The JIT will typically need two warmup runs to do profiling and optimization.
This is not the perfect solution but it will substantially reduce the number of surprised people when the docs say torch.utils.benchmark.Timer takes care of warmup.
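A minimal sketch of the kind of Timer use this affects; the scripted function here stands in for any TorchScript workload whose first runs trigger profiling:
```python
import torch
from torch.utils.benchmark import Timer

@torch.jit.script
def f(x):
    return torch.relu(x) + 1

# Timer's internal warmup now runs the statement enough times for the
# JIT profiling/optimization passes to kick in before measurement.
t = Timer(stmt="f(x)", globals={"f": f, "x": torch.randn(1000)})
print(t.timeit(100))
```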
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58801
Reviewed By: desertfire
Differential Revision: D28644244
Pulled By: robieta
fbshipit-source-id: cc54ed019e882a379d6e4a0c6a01fd5873dd41c3
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57768
Note that this PR implements formulas only for ops that are supported by OpInfo.
Test Plan: Imported from OSS
Reviewed By: zou3519, malfet
Differential Revision: D28387766
Pulled By: albanD
fbshipit-source-id: b4ba1cf1ac1dfd46cdd889385c9c2d5df3cf7a71
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58881
A new parameter was recently added to `repeat_interleave` in https://github.com/pytorch/pytorch/pull/58417
However, this introduced an ambiguity when making the call below:
`some_tensor.repeat_interleave(some_integer_value)`
Making it optional avoids the issue.
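A minimal sketch of the call pattern in question (assuming, as in the current API, that the newly added argument is the keyword-only `output_size`):
```python
import torch

t = torch.arange(3)
# Single positional integer: with the new argument made optional, this
# plain form resolves without ambiguity.
print(t.repeat_interleave(2))  # tensor([0, 0, 1, 1, 2, 2])
# The keyword-only argument can still be supplied explicitly.
print(t.repeat_interleave(2, output_size=6))
```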
Reviewed By: ezyang, ngimel
Differential Revision: D28653820
fbshipit-source-id: 5bc0b1f326f069ff505554b51e3b24d60e69c843
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58887
There are some callsites of `torch.distributed.rpc.XXX` APIs that are compiled
or not based on `USE_RPC`. However, `torch::deploy`, at least for now,
is compiled with `USE_RPC=1`, but the `torch.distributed.rpc.XXX` APIs used by
the aforementioned pieces of code are not available (i.e.
`torch.distributed.rpc.is_available()` returns `False`). This can cause
Torchscript compilation to fail, even if the code being compiled doesn't use
RPC.
This commit fixes this problem (at least temporarily) by predicating the use
of all these `torch.distributed.rpc` APIs on the value of
`torch.distributed.rpc.is_available()`.
Test Plan: Ran packaged XLM-R model with C++ benchmark.
Reviewed By: suo
Differential Revision: D28660925
fbshipit-source-id: fbff7c7ef9596549105e79f702987a53b04ba6f9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58224
Adds C++ implementation of ProcessGroupWrapper. It wraps
an underlying ProcessGroup and does debug checks before dispatching the
collective to the underlying pg. The design mostly follows https://github.com/pytorch/pytorch/issues/22071.
Concretely, on each collective, we:
1. Verify op type consistency. This can help catch mismatched ops in the user application (i.e. allreduce on one rank and allgather on another)
2. Verify tensor shapes. This can help catch bugs where the tensor inputs are malformed, whereas normally in NCCL this would just lead to a hang. The shapes verification for allgather/allreduce_coalesced is omitted because they actually accept different shape tensors and don't error out.
This is done through an abstraction called `CollectiveFingerPrint` which uses a helper process group to do the above verification. Concretely, we gather the data we need for each of the above checks into tensors, and allgather them, and verify their equivalence.
Once all of this passes we simply dispatch the collective to the underlying pg.
Added `ProcessGroupWrapperTest` in python to comprehensively test these changes.
ghstack-source-id: 129735687
Test Plan: ci
Reviewed By: zhaojuanmao
Differential Revision: D28023981
fbshipit-source-id: 1defc203c5efa72ca0476ade0d1d8d05aacd4e64
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58829
- Delete copying and moving of MemoryPlanner.
- Remove `inline` in some of the member functions because member functions implemented in classes are inline by default.
- Clean up and update comments.
- Reorganize some code
Reviewed By: edvgha
Differential Revision: D28555476
fbshipit-source-id: 7ea8efc0e2ed93a6788a742470b9e753a85df677
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58605
Found a few more by grepping.
ghstack-source-id: 129730281
Test Plan: CI
Reviewed By: ngimel
Differential Revision: D28523254
fbshipit-source-id: 317baea88885586c5106c8335ebde0d8802a3532
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58457
This variable had concurrent read/write access without any
synchronization. The issue was caught and reported by TSAN.
ghstack-source-id: 129311384
Test Plan:
1) Verify test locally.
2) waitforbuildbot.
Reviewed By: ezyang
Differential Revision: D28498116
fbshipit-source-id: 89af068467fed64c131d743504c0cecf3017d638
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58792
Enables support for fused modules like ConvReLU or LinearReLU in eager-mode cross-layer equalization.
Test Plan:
`python test/test_quantization.py TestEqualizeEager`
Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D28647242
fbshipit-source-id: 286e057ce70aa7de45d575afd6c13e55120ff18a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58798
In #58623 there was a bug in `make quicklint` where ShellCheck would run on the entire repo when there were no files. This PR fixes that by refactoring out common stuff (like skipping quicklint when there are no files and letting checks do their own file filtering) and pushing the logic into a runner class.
Test Plan: Imported from OSS
Reviewed By: samestep
Differential Revision: D28649889
Pulled By: driazati
fbshipit-source-id: b19f32cdb63396c806cb689b2f6daf97e1724d44
Summary:
Per title
`unroll_contiguous_scalar_checks` tries to verify that all arguments (including outputs) are contiguous except maybe 1 scalar (with stride 0). Then it calls the passed lambda with index of the scalar arg if this verification succeeded, or 0 if args were not contiguous/there was no scalar. Depending on the value of this index (with 0=not found) a different function can be called (in vectorized kernels it’s vectorized loop if args are contiguous + scalar, and basic loop if not). It makes sense for vectorized kernel (vectorized loop can still be used in some broadcasted cases), but all other (cpu_kernel, serial_cpu_kernel, cpu_kernel_multiple_outputs) don’t even use idx argument in lambda, so regardless of what `unroll_contiguous_scalar_checks` does, they'll do the same thing. No point in calling `unroll_contiguous_scalar_checks` then.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58830
Reviewed By: zou3519, mruberry
Differential Revision: D28632668
Pulled By: ngimel
fbshipit-source-id: c6db3675933184e17cc249351c4f170b45d28865
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57501
Add an API, _get_model_ops_and_info, to get the root operators and versioning info of a model in both C++ and Python; the input can be either a file path or a buffer.
ghstack-source-id: 129620112
Test Plan: unit test.
Reviewed By: xcheng16, raziel
Differential Revision: D28162765
fbshipit-source-id: 4413c1e906b8a872e4a717d849da37347adbbea4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58685
This moves debug packages out of the artifacts dir before running tests (as a counterpart to https://github.com/pytorch/builder/pull/770). Doing it this way allows us to keep the CI configs simple since there's one directory to use for artifacts / upload to S3.
See #58684 for actual CI signals (the ones on this PR are all cancelled since it depends on the builder branch set in the next PR up the stack)
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb
Differential Revision: D28646995
Pulled By: driazati
fbshipit-source-id: 965265861968906770a6e6eeecfe7c9458631b5a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57769
_all_gather_base saves the copies incurred by all_gather, so it is more efficient.
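A minimal sketch of the difference, assuming an already-initialized process group (`_all_gather_base` is a private API):
```python
import torch
import torch.distributed as dist

world_size = dist.get_world_size()
inp = torch.ones(2)

# all_gather fills a list of per-rank tensors, which costs extra copies.
outs = [torch.empty(2) for _ in range(world_size)]
dist.all_gather(outs, inp)

# _all_gather_base writes directly into one flat output tensor.
flat = torch.empty(2 * world_size)
dist._all_gather_base(flat, inp)
```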
Test Plan: unit test
Reviewed By: SciPioneer
Differential Revision: D28227193
fbshipit-source-id: ddd8590095a5b45676497a71ed792a457f9825c6
Summary:
This PR introduces a helper function named `torch.nn.utils.skip_init()` that accepts a module class object + `args` / `kwargs` and instantiates the module while skipping initialization of parameter / buffer values. See discussion at https://github.com/pytorch/pytorch/issues/29523 for more context. Example usage:
```python
import torch
m = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1)
print(m.weight)
m2 = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1, device='cuda')
print(m2.weight)
m3 = torch.nn.utils.skip_init(torch.nn.Linear, in_features=5, out_features=1)
print(m3.weight)
```
```
Parameter containing:
tensor([[-3.3011e+28, 4.5915e-41, -3.3009e+28, 4.5915e-41, 0.0000e+00]],
requires_grad=True)
Parameter containing:
tensor([[-2.5339e+27, 4.5915e-41, -2.5367e+27, 4.5915e-41, 0.0000e+00]],
device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[1.4013e-45, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]],
requires_grad=True)
```
Bikeshedding on the name / namespace is welcome, as well as comments on the design itself - just wanted to get something out there for discussion.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57555
Reviewed By: zou3519
Differential Revision: D28640613
Pulled By: jbschlosser
fbshipit-source-id: 5654f2e5af5530425ab7a9e357b6ba0d807e967f
Summary:
Currently the cpp_extension build in benchmarks is misleading, as it has the same name as torch.utils.cpp_extension.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58708
Test Plan:
Run from `./benchmarks/operator_benchmark/pt_extension` folder:
```
python setup.py install
python cpp_extension_test.py
```
Note: CI doesn't matter here, as the benchmarks/ folder is currently not compiled/tested in CI.
Reviewed By: robieta
Differential Revision: D28585582
Pulled By: walterddr
fbshipit-source-id: fc071040cf3cb52ee6c9252b2c5a0c3043393f57
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58855
We have successfully migrated windows CPU builds to Github Actions so
let's go ahead and disable them in CircleCI
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: zhouzhuojie
Differential Revision: D28642875
Pulled By: seemethere
fbshipit-source-id: 8ffe9338e58952531a70002891a19ea33363d958
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58768
Fixes gh-58757
This PR has a fix for the CPU version of the addmm op. Just for context, before this PR only CSR @ vector was supported. I found a minor bug in `addmm_out_sparse_csr_dense_cpu` in the non-MKL code path, which is solved in this PR.
Moreover, I discovered a limitation in the current MKL implementation. It only works well (acceptable tolerance for the output error) with square matrices. I looked into this issue in depth and found that it could be a limitation of the MKL API.
I used this [gist code](https://gist.github.com/aocsa/0606e833cd16a8bfb7d37a5fbb3a5b14) based on [this](https://github.com/baidu-research/DeepBench/blob/master/code/intel/spmm/spmm_bench.cpp) to test this behavior.
As you can see, the output error (last column) is acceptable when the matrices are square, but not acceptable when the matrices are not square. I reported the issue here: https://github.com/pytorch/pytorch/issues/58770
Looking forward to your comments.
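A minimal sketch of the CSR-times-dense-matrix case this touches (shapes and values are arbitrary):
```python
import torch

crow_indices = torch.tensor([0, 2, 4])
col_indices = torch.tensor([0, 1, 0, 1])
values = torch.tensor([1., 2., 3., 4.])
csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 2))

dense = torch.randn(2, 3)
bias = torch.zeros(2, 3)
# CPU addmm with a CSR left operand and a dense matrix right operand.
out = torch.addmm(bias, csr, dense)
```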
Test Plan: Imported from OSS
Reviewed By: zou3519
Differential Revision: D28629563
Pulled By: malfet
fbshipit-source-id: 5ee00ae667336e0d9301e5117057213f472cbc86
Summary:
Library linking order matters during static linking.
Not sure whether it's a bug or a feature, but if cublas is referenced
before CuDNN, it will be partially statically linked into the library,
even if it is not used.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58287
Reviewed By: janeyx99
Differential Revision: D28433165
Pulled By: malfet
fbshipit-source-id: 8dffa0533075126dc383428f838f7d048074205c
Summary:
While trying to build PyTorch with BLIS as the backend library,
we found a build issue due to some missing include files.
This was caused by a missing directory in the search path.
This patch adds that path in FindBLIS.cmake.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58166
Reviewed By: zou3519
Differential Revision: D28640460
Pulled By: malfet
fbshipit-source-id: d0cd3a680718a0a45788c46a502871b88fbadd52
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).
New submodule commit: 4b8aaad426
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58161
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: jspark1105
Differential Revision: D28385619
fbshipit-source-id: ace938b1e43760b4bedd596ebbd355168a8706b7
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57409
Full design: https://github.com/pytorch/pytorch/issues/55207
In https://github.com/pytorch/pytorch/issues/55207, we proposed
`MeshShardingSpec` as a generic sharding mechanism. However, that proposal does
not provide the flexibility to specify shards which have uneven
sizes/partitions and assumes even partitioning. Uneven partitioning is one of
the requirements of an internal use case.
As a result, we instead introduce a `GenericShardingSpec`, which allows
specifying any arbitrary partitioning of a multi-dimensional tensor. Basically,
it specifies the start offsets of each shard and the length of each dim of the
shard, allowing for greater flexibility.
ghstack-source-id: 129604155
Test Plan:
1) unit tests
2) waitforbuildbot
Reviewed By: SciPioneer
Differential Revision: D28137616
fbshipit-source-id: 61255762485fb8fa3ec3a43c27bbb222ca25abff
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55728
Full design: https://github.com/pytorch/pytorch/issues/55207
This PR introduces ChunkShardingSpec (SingleShardingSpec in the design). We used
the name ChunkShardingSpec since it is very similar to `torch.chunk` in terms
of how a Tensor is split up, and it feels clearer than SingleShardingSpec.
ghstack-source-id: 129603318
Test Plan: waitforbuildbot
Reviewed By: SciPioneer
Differential Revision: D27694108
fbshipit-source-id: c8764abe6a4d5fc56d023fda29b74b5af2a73b49
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58417
Same as title.
Test Plan:
Rely on CI signal.
Update unit test to exercise new code path as well.
Reviewed By: ngimel
Differential Revision: D28482927
fbshipit-source-id: 3ec8682810ed5c8547b1e8d3869924480ce63dcd
Summary:
fixes https://github.com/pytorch/pytorch/issues/58632.
Added several skips that relate to test asserts and MKL. Will address them in a separate PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58666
Reviewed By: seemethere, janeyx99
Differential Revision: D28607966
Pulled By: walterddr
fbshipit-source-id: 066d4afce2672e4026334528233e69f68da04965
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57156
generate_debug_handles: to be able to generate debug handles for preprocess functions written in Python.
Test Plan:
CI
Imported from OSS
Differential Revision: D28062328
Reviewed By: raziel
Pulled By: kimishpatel
fbshipit-source-id: 8795d089edc00a292a2221cfe80bbc671468055c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55462
Handles and symbolicates exception callstacks thrown from a backend.
The objective of this diff is to improve error reporting when
exceptions are raised from a lowered backend. We would effectively like to
get the same model-level stack trace that you would get without having
lowered some module to a backend.
For example:
```
class AA(nn.Module):
    def forward(self, x, y):
        return x + y

class A(nn.Module):
    def __init__(...):
        self.AA0 = AA()

    def forward(self, x, y):
        return self.AA0.forward(x, y) + 3

class B(nn.Module):
    def forward(self, x):
        return x + 2

class C(nn.Module):
    def __init__(...):
        self.A0 = A()
        self.B0 = B()

    def forward(self, x, y):
        return self.A0.forward(x, y) + self.B0.forward(x)
```
If we then do C().forward(torch.rand((2,3)), torch.rand((14,2))), we
will likely see an error stack like:
```
C++ exception with description "The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
File "<string>", line 3, in forward
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
~~~~~~~~~~~~~~~ <--- HERE
File "<string>", line 3, in forward
def forward(self, x, y):
return self.AA0.forward(x, y) + 3
~~~~~~~~~~~~~~~~ <--- HERE
File "<string>", line 3, in forward
def forward(self, x, y):
return x + y
~~~~~ <--- HERE
```
We would like to see the same error stack if we lowered C.A0 to some
backend.
With this diff we get something like:
```
Module hierarchy:top(C).A0(backend_with_compiler_demoLoweredModule).AA0(AA)
Traceback of TorchScript (most recent call last):
File "<string>", line 3, in FunctionName_UNKNOWN
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
~~~~~~~~~~~~~~~ <--- HERE
File "<string>", line 5, in FunctionName_UNKNOWN
typed_inputs: List[Any] = [x, y, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
def forward(self, x, y):
return self.AA0.forward(x, y) + 3
~~~~~~~~~~~~~~~~ <--- HERE
File "<string>", line 3, in FunctionName_UNKNOWN
def forward(self, x, y):
return x + y
~~~~~ <--- HERE
```
This is achieved in 3 parts:
Part 1:
A. BackendDebugInfoRecorder:
During backend lowering, in `to_backend`, a BackendDebugInfoRecorder is instantiated before calling the preprocess
function corresponding to the backend. This facilitates recording of
debug info (such as source range + inlined callstack) for the lowered module.
B. Instantiate WithBackendDebugInfoRecorder with BackendDebugInfoRecorder.
This initializes thread local pointer to BackendDebugInfoRecorder.
C. generate_debug_handles:
In the preprocess function, the backend will call generate_debug_handles
for each method being lowered separately. generate_debug_handles
takes the `Graph` of the method being lowered and returns a map
of Node*-to-debug_handles. The backend is responsible for storing debug
handles appropriately so as to raise exceptions (and later do profiling)
using debug handles when the exception being raised corresponds to a
particular Node that was lowered.
Inside generate_debug_handles, we will query the current
BackendDebugHandleInfoRecorder, which is issuing debug handles. This debug
handle manager will issue debug handles as well as record the
debug_handles-to-<source range, inlined callstack> map.
D. Back in `to_backend`, once the preprocess function has finished
lowering the module, we will call `stopRecord` on
BackendDebugInfoRecorder. This will return the debug info map. This
debug info is then stored inside the lowered module.
Part 2:
Serialization:
During serialization for bytecode (lite interpreter), we will do two
things:
1. Extract all the source ranges that are contained inside the
debug_handles-to-<source range, inlined callstack> map for the lowered
module. These are the source ranges corresponding to the debug handles,
including what is in the inlined callstack. Since we replaced the original
module with the lowered module, we won't be serializing code for the original
module and thus have no source range. That is why the source ranges have
to be stored separately. We will lump all the source ranges for all the
lowered modules into one single debug_pkl file.
2. Then we will serialize the debug_handles-to-<source range, inlined
callstack> map.
Now during deserialization we will be able to reconstruct the
debug_handles-to-<source range, inlined callstack> map. Given that all
debug_handles are unique, we do not need any module information.
Test Plan:
Tests are added in test_backend.cpp
Imported from OSS
Differential Revision: D27621330
Reviewed By: raziel
Pulled By: kimishpatel
fbshipit-source-id: 0650ec68cda0df0a945864658cab226a97ba1890
Summary:
This gets rid of a lot of the try/else rigamarole.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58788
Reviewed By: ZolotukhinM
Differential Revision: D28621054
Pulled By: Chillee
fbshipit-source-id: d0d8a1b6466eb318d939a1ed172b78f492ee0d5b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58687
We want to validate if the usages are all okay.
ghstack-source-id: 129639560
Test Plan: Tested on master: the build fails. Then tested with D28549578 (db67699ae6) applied: the build succeeds.
Reviewed By: JacobSzwejbka
Differential Revision: D28579734
fbshipit-source-id: 1ac65474762855562109adc0bac2897b59f637ce
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58718
`PackageImporter` does not populate `module.__spec__.origin`, which causes an
unhandled `Exception` to be raised when using `importlib.resources.path` to get
a path to a binary file resource in the package in python <3.8.6.
This commit fixes this issue by setting `module.__spec__.origin` to
"<package_importer>". The actual value is not important as far as I can tell;
the simple fact that it is not `None` allows `importlib` to avoid raising an
`Exception` in `importlib.resources.path`.
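A minimal sketch of the access pattern that used to raise (package and resource names are hypothetical):
```python
import importlib.resources

# On Python < 3.8.6 this raised inside importlib when the imported
# module's __spec__.origin was None; with origin set to a placeholder
# string the context manager yields a usable filesystem path.
with importlib.resources.path("my_package", "weights.bin") as path:
    print(path)
```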
Test Plan:
This commit adds a unit test to `test_resources.py` that tests that
`importlib.resources.path` can be used within a package.
Reviewed By: suo
Differential Revision: D28589117
fbshipit-source-id: 870d606a30fce6884ae48b03ff71c0864e4b325f
Summary:
Will not land before the release, but it would be good to have this function documented in master for its use in distributed debuggability.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58322
Reviewed By: SciPioneer
Differential Revision: D28595405
Pulled By: rohan-varma
fbshipit-source-id: fb00fa22fbe97a38c396eae98a904d1c4fb636fa
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58623
This splits out everything shellcheck related into its own job that generates and checks GHA workflows, then shellchecks those + jenkins scripts. This PR also integrates shellcheck into the changed-only stuff in `actions_local_runner.py` so that shellcheck won't do anything unless someone edits a shell script in their local checkout. This is the final piece to clean up the output of `make quicklint` and speeds it up by a good bit (before it was shellchecking everything which took a few seconds):
```
$ make quicklint -j $(nproc)
✓ quick-checks: Ensure no unqualified noqa
✓ quick-checks: Ensure canonical include
✓ quick-checks: Ensure no unqualified type ignore
✓ quick-checks: Ensure no direct cub include
✓ quick-checks: Ensure no tabs
✓ quick-checks: Ensure no non-breaking spaces
✓ shellcheck: Regenerate workflows
✓ quick-checks: Ensure no versionless Python shebangs
✓ quick-checks: Ensure correct trailing newlines
✓ shellcheck: Assert that regenerating the workflows didn't change them
✓ mypy (skipped typestub generation)
✓ cmakelint: Run cmakelint
✓ quick-checks: Ensure no trailing spaces
✓ flake8
✓ shellcheck: Extract scripts from GitHub Actions workflows
✓ shellcheck: Run Shellcheck
real 0.92
user 6.12
sys 2.45
```
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb
Differential Revision: D28617293
Pulled By: driazati
fbshipit-source-id: af960ed441db797d07697bfb8292aff5010ca45b
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58784
DDP communication hooks are already supported on the Gloo backend. We no longer need to skip these tests on Gloo.
Original PR issue: https://github.com/pytorch/pytorch/issues/58467
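A minimal single-process sketch of the feature being exercised, assuming an initialized gloo process group (the address is a placeholder):
```python
import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(
    backend="gloo",
    init_method="tcp://127.0.0.1:29501",
    rank=0,
    world_size=1,
)

# Registering a built-in communication hook; with this change such hooks
# are exercised on gloo in the test suite, not just on nccl.
model = DDP(torch.nn.Linear(8, 8))
model.register_comm_hook(state=None, hook=default_hooks.allreduce_hook)
```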
ghstack-source-id: 129635828
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_comm_hook_logging
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_allreduce
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_allreduce_process_group
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_powerSGD
Reviewed By: rohan-varma
Differential Revision: D28617214
fbshipit-source-id: 3bafb0c837a15ad203a8570f90750bc5177d5207
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58307
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129598791
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28445922
fbshipit-source-id: ce12743980296bab72a0cb83a8baff0bb6d80091
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58305
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129598793
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28445712
fbshipit-source-id: 0822f1408a0a71c8f8934e6d90659ae3baa085ac
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58566
Validates the keys of the qconfig_dict, prepare_custom_config_dict, convert_custom_config_dict, and
fuse_custom_config_dict. If the user passes in an invalid key or makes a typo, we will throw an error and let the user know what keys are supported.
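A minimal sketch of the kind of dictionary whose top-level keys are now validated (the keys shown follow the FX prepare workflow; treat the exact set as illustrative):
```python
from torch.quantization import get_default_qconfig

qconfig = get_default_qconfig("fbgemm")

# Recognized top-level keys include "" (global) and "module_name";
# a misspelled key such as "module_nmae" should now produce an error
# naming the supported keys instead of being silently ignored.
qconfig_dict = {
    "": qconfig,
    "module_name": [("conv1", qconfig)],
}
```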
Test Plan:
Imported from OSS
python test/test_quantization.py
Reviewed By: jerryzh168
Differential Revision: D28540923
fbshipit-source-id: 5958c32017b7d16abd219aefc8e92c42543897c2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58428
Until now, the TP agent expected the output of a remote function to be on the same streams as the inputs. In other words, it used the lazy stream context of the inputs to synchronize the output tensors. This was true in the most common case of a synchronous remote function. However it wasn't true for async functions, for fetching RRefs, ... The more generic way is to use the CUDA events held by the Future to perform this synchronization. (These events may be on the input streams, or they may not be!).
ghstack-source-id: 129567045
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474982
fbshipit-source-id: c0034eb3f2a2ea525efb63a31b839bc086060e7e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58674
I found this missing parameter while debugging failures in the next PR.
I'm very unhappy about this change. I think this future, which we know for sure won't contain tensors, shouldn't have to worry about CUDA devices. And yet, it does. This means that basically any future anywhere might have to worry about it, and this just doesn't scale, and thus it's bad.
ghstack-source-id: 129567042
Test Plan: Should fix the next diff.
Reviewed By: mrshenli
Differential Revision: D28574083
fbshipit-source-id: 5c89902cdc5cc12f1ebeea860b90cd9c3d7c7da1
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58427
Running the UDF (be it Python or JIT) is the first step of (most?) RPC calls, which is where the inputs are consumed. The lazy stream context contains the streams used by the inputs, thus it must be made current before any UDF call. I opt to do this as "close" as possible to the place the UDF is invoked, to make the relationship as explicit as possible.
ghstack-source-id: 129567052
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474983
fbshipit-source-id: 358292764d0a6832081c34bf6736f0961475ff3d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58426
The operations in RequestCallback can return CUDA tensors, thus the futures used to hold them must be CUDA-aware.
ghstack-source-id: 129567051
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474981
fbshipit-source-id: 492b8e71a43da5f63b4b7a31f820427cde9736e4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58425
Now that callbacks can provide pre-extracted DataPtrs, let's do so. This will become of crucial importance in the next PR, where some of these futures will become CUDA-aware, and thus they will try to extract DataPtrs on their own, but they would fail to do so here because Message isn't "inspectable".
ghstack-source-id: 129567057
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474877
fbshipit-source-id: e68d7d45f1c1dc6daa5e05cf984cfc93d2dce0d0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58424
In CUDA mode, Future must inspect its value and extract DataPtrs. However some types are not supported, for example the C++/JIT custom classes, which include Message, which is widely used in RPC. Hence for these scenarios we allow the user to perform the custom DataPtr extraction on their own, and pass the pre-extracted DataPtrs.
Note that `markCompleted` already allowed users to pass in pre-extracted DataPtrs, hence this PR simply extends this possibility to the `then` method too.
ghstack-source-id: 129567044
Test Plan: Used in next PR.
Reviewed By: mrshenli
Differential Revision: D28474880
fbshipit-source-id: 91a0dde5e29d1afac55650c5dfb306873188d785
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58423
This is part 2 of the previous PR. Here we address the remaining occurrences of "raw" Message, namely the ones within toMessageImpl. And since they're the last ones, we make the constructor of Message private, to prevent new usages from emerging.
ghstack-source-id: 129567049
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474879
fbshipit-source-id: 498652a8b80a953396cd5d4b275c0b2e869c9ecf
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58422
Similar to Future (which I tackled recently), Message is an ivalue type (a "custom class" one), and the natural way to represent it is inside an intrusive_ptr. However in the RPC code we had a mix of usages, often passing Message by value. This has undesirable consequences, as it could easily trigger a copy by accident, which I believe is why in many places we accepted _rvalue references_ to Message, in order to force the caller to move. In my experience this is non-idiomatic in C++ (normally a function signature specifies how the function consumes its arguments, and it's up to the caller to then decide whether to copy or move).
By moving to intrusive_ptr everywhere I think we eliminate and simplify many of the problems above.
In this PR I do half of the migration, by updating everything except the `toMessageImpl` methods, which will come in the next PR.
ghstack-source-id: 129567053
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474878
fbshipit-source-id: 5b76d45e05f6fa58c831e369c5c964d126187a6c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58421
Here I make it impossible to create Futures that do not use intrusive_ptr, by making the constructor private. This makes it safer (by "forcing" people to do the right thing) and prevents a proliferation of new shared_ptrs or of accidental copies/moves.
ghstack-source-id: 129567047
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28474484
fbshipit-source-id: 82c487e1bb7c27a2e78cb5d594e00e54c752bf09
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58420
In https://github.com/pytorch/pytorch/pull/57636 I migrated most uses of Future to an intrusive_ptr. I thought I had all of them but I missed a couple. These are the remaining ones. (The next PR will make it impossible to add new usages of shared_ptr).
ghstack-source-id: 129567071
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28477285
fbshipit-source-id: 75008276baa59e26b450e942c009ec7e78f89b13
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57861
The very last methods left that still didn't return Futures were the autograd ones, but they're very easy to port.
We've now finished the conversion of RequestCallback to be fully Future-based!
ghstack-source-id: 129567055
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28286173
fbshipit-source-id: 1de58cee1b4513fb25b7e089eb9c45e2dda69fcb
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57860
The other methods for RRefs just did bookkeeping and are trivially easy to migrate to Futures (which is done mainly for consistency at this point).
ghstack-source-id: 129567068
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28286175
fbshipit-source-id: 1d97142803f73fe522435ca75200403c78babc68
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57859
Just like with assigning OwnerRRefs, we can also deduplicate the code paths for fetching their values. In fact this was duplicated three times, with different ways of post-processing the value (once for JIT, once for Python, once for autograd). Thanks to future, we can have that logic once, and then connect it to different follow-up steps.
ghstack-source-id: 129567050
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28286172
fbshipit-source-id: e0742a99cf555755e848057ab6fee5285ff0df2a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57858
Just a small deduplication, which moves complexity out of the way and ensures consistent error checking.
ghstack-source-id: 129567056
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28286174
fbshipit-source-id: 6eab8d3f30405d49c51f8b9220453df8773ff410
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57857
There used to be a whole lot of methods: `processPythonCall`, `processScriptCall`, `processScriptRemoteCall`, `processPythonRemoteCall`, `processScriptCallOp`, `processBaseScriptRemoteCall` and `processScriptRemoteCallOp`. Thanks to the previous simplification, we can now drop all but the first four, which map nicely 1:1 to the four message types we need to handle. Also their signatures become much simpler: they take an RPC command and return a future.
ghstack-source-id: 129567070
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253848
fbshipit-source-id: e0e45345c414a96900f9d70ee555359d28908833
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57856
Thanks to Futures providing a "common language" between various steps, we can now deduplicate the creation of OwnerRRef, by having two different ways of creating the result (JIT and Python) but then connecting them to a single method that wraps and stores that result in an OwnerRRef.
ghstack-source-id: 129567072
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253845
fbshipit-source-id: a156e56cac60eb22f557c072b61ebac421cfad43
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57855
We already had a helper to run Python functions, which was nice (it de-duplicated some code). This helper was however taking a callback which, as I said, isn't as nice as it returning a Future. Hence here I change this.
ghstack-source-id: 129567054
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253846
fbshipit-source-id: d854d4aa163798fb015cd6d46932f9ff1d18262e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57854
Because OwnerRRefs used to be created before their value was computed, we had to figure out their type ahead of time. After the previous diff, we inverted the order of operations, and we can now first compute the result and then create the OwnerRRef. Which means we can just inspect the value to get its type. Much simpler, and much less likely to get it wrong.
ghstack-source-id: 129567060
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253843
fbshipit-source-id: f13c9b294f477ae66fcbdbc85c642fdc69b2740f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57853
A bunch of methods received an OwnerRRef to "fill in". I think it will be more flexible to do it the other way around, and have these methods return a value (wrapped in a Future), which can then be "connected" to an OwnerRRef, but which can also potentially be consumed in different ways.
ghstack-source-id: 129567059
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253844
fbshipit-source-id: 7e3772312dbacfc75a6ac0f62189fc9828001fc7
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57852
Another great example of the benefits of Futures. Thanks to the "right abstraction" (i.e., the `thenAsync` method), adding support for async execution becomes trivial, and the code much simpler than what it used to be.
ghstack-source-id: 129567063
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253842
fbshipit-source-id: b660151ca300f3d6078db0f3e380c80a4d8f5190
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57851
The same as the previous PR, but for JIT functions.
ghstack-source-id: 129567069
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253841
fbshipit-source-id: 2b8affde16c106f5c76efa8be49af070213708bf
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57850
What I envision is a modular decomposed code, with separate steps which each consume/produce Futures, and which can be chained together to obtain the desired results. One common "starting point" for these chains is the execution of a remote function (Python or JIT or otherwise). I'm thus creating a helper function for one of these, the JIT operators (by deduplicating the places where we used to run them). More will follow.
This deduplication will also help to add CUDA support to JIT RPC, since the execution of the JIT function/operators is where we need to set our custom streams.
ghstack-source-id: 129567058
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28253847
fbshipit-source-id: 24ab67ad89c8796861e9bbcb78878b26704c0c48
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57849
Some methods are currently returning bool, but I'll soon want them to return a Future. I could have them return a tuple of bool and Future, but that's a bit heavy. Instead it turns out we can very easily make them return void, which will simplify things.
ghstack-source-id: 129567061
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28224476
fbshipit-source-id: 26dc796b7e38f03aa269cf0731b0059d58e57e94
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57848
This PR looks large, but all it does is add a dozen lines and remove a lot of other ones.
One first advantage of using Futures is that we can easily chain some "post-processing" to them. Until now we needed to pass the message ID around everywhere because it was set separately by each method. Instead, we could simply add a follow-up step to the final future which sets this ID, and remove all the former logic.
ghstack-source-id: 129567065
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28224477
fbshipit-source-id: 7b6e21646262abe5bbbf268897e2d792e5accc27
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57847
This is the first PR of a stack that aims to simplify RequestCallback, and I want to start by explaining my intentions.
With the introduction of CUDA support in the TensorPipe agent, we found out that other layers higher up in the stack (RRefs, dist autograd, ...) were not "ready" to support CUDA. One cause of this was that in PyTorch most CUDA state is thread-local, and the RequestCallback class (and others) might execute different steps of an operation on multiple threads. The solution to this problem is to preserve or recreate the CUDA state when switching between threads (propagating streams, or recording events and then waiting on them). If we were to manually do this everywhere it would be tedious, error-prone, and hard to maintain.
In fact, we already have a primitive that can do this for us: CUDAFuture (now known as just Future). If whenever we switch threads we were to pack the values in a CUDAFuture and then unpack them on the other threads, all CUDA stuff would be taken care of for us.
If our code leveraged CUDAFuture at its core, this would become the "idiomatic" thing to do, the natural behavior. Future changes would thus also be inclined to follow this pattern, hence automatically doing the right thing.
I also think that, even without these concerns about CUDA, there are benefits to using Futures more extensively. Currently RequestCallback uses a mix of Futures and callbacks. These are two tools for the same job, and thus mixing them creates confusion. Futures are more powerful than simple callbacks (they can be passed around, inspected, chained, waited on, ...) and thus should be preferred. They also lead to more readable code, as each step can be defined and chained in logical order, whereas callbacks must either be nested, or defined inline, or defined before and used later (thus making the code out-of-order).
In short: I intend to rework RequestCallback to use Futures much more. I believe it will greatly simplify the code, help readability, and prove invaluable to support CUDA.
---
Until now, we had the final result future being created at the very beginning, and then passed around everywhere, so that the various methods could "fill in" its value. I think it's much lighter to instead allow each method to create or obtain its futures however it wants, and have it return them. I.e., have these futures "bubble up" from the lower layers, rather than them being "pushed down" from the upper ones.
In this initial PR, I move the place where we create this "final result future", but I still keep it around. I will then, in later PRs, slowly migrate each method so that it returns a future, and in the end I will avoid creating the final result future.
ghstack-source-id: 129567062
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28224478
fbshipit-source-id: dbdc66b6458645a4a164c02f00d8618fa64da028
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57846
In later PRs I'll need to create already-completed futures (it'll make sense then, I hope). Here are a few helpers for that, which I'm adding separately to reduce the noise later.
ghstack-source-id: 129567064
Test Plan: See later.
Reviewed By: mrshenli
Differential Revision: D28253664
fbshipit-source-id: f091e1d3ea353bb5bfbd2f582f1b8f84e4b0114f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57638
In RPC there are a few instances of "fastpaths" which do `if (fut.isCompleted()) { do_sth(); } else { fut.addCallback(do_sth); }`. I intend to get rid of them, for reasons I'll clarify later but which in a nutshell have to do with CUDA correctness and readability. Note that dropping the fastpath introduces no change in behavior (because `addCallback` invokes the callback inline anyways), thus the only perf concern comes from the fact that the fastpath avoids constructing and passing around a `std::function`. I don't think this is a significant performance hit. Regardless, this PR preemptively addresses this concern, by tweaking `addCallback` (and similar methods) so they can handle raw lambdas, and so that they do _not_ wrap them into `std::function`s if they are invoked inline.
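For reference, a minimal sketch of the fastpath pattern in question, written against the Python `torch.futures` API purely for illustration (the actual change is in the C++ `ivalue::Future`):
```
import torch

def do_sth(fut):
    print("value:", fut.value())

fut = torch.futures.Future()
fut.set_result(42)

# The "fastpath" pattern being removed: special-case already-completed futures.
if fut.done():
    do_sth(fut)
else:
    fut.then(do_sth)

# Without the fastpath: a callback added to an already-completed future runs
# right away, so the observable behavior is unchanged.
fut.then(do_sth)
```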
ghstack-source-id: 129567067
Test Plan: CI
Reviewed By: mrshenli
Differential Revision: D28222808
fbshipit-source-id: eb1c7114cf7aca3403cb708f14287cab0907ecfa
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57637
I had proposed a similar method in https://github.com/pytorch/pytorch/pull/48790, although that PR was exposing it to Python and thus required a bit more work. This PR only introduces this method as a C++ API. Python can be added later.
This new method is useful when one wants to use `then` but the callback itself performs some async operation, and one wants to "reconcile" the future produced inside the callback with the one produced outside.
ghstack-source-id: 129567066
Test Plan: Used (and thus tested) later in the stack.
Reviewed By: mrshenli
Differential Revision: D28222809
fbshipit-source-id: 869f11ab390b15e80c0855750e616f41248686c5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58702
Off by one error when determining if some ranks failed or not with
`wait_all_ranks=True`. This wasn't caught by tests because the tests only
tested failure scenarios, not success scenarios with `wait_all_ranks=True`.
ghstack-source-id: 129559840
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28583235
fbshipit-source-id: a8f376efb13a3f36c788667acab86543c80aff59
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55978
This is needed for broadcasting two of the same symbolic shape
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb
Differential Revision: D27755328
Pulled By: eellison
fbshipit-source-id: d38d9458a9e28d31558f0bc55206516b78131032
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55926
This is necessary for code like conv2d where we wish to share a generic convolution shape function logic with that of conv2d, but for conv2d always infer that the output has dimension 4. I'm also hoping the refinement algorithm here could be refactored out and used to support refining tensor types from user annotations. I have a lengthy comment explaining how this works, and the logic outside of data structures is pretty small and contained. Additionally, you might check out https://fb.quip.com/X7EVAdQ99Zzm for a very similar description of how to refine values based on comparison operators.
Test Plan: Imported from OSS
Reviewed By: ZolotukhinM
Differential Revision: D27750997
Pulled By: eellison
fbshipit-source-id: d962415af519ac37ebc9de88f2e1ea60a1374f7c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55925
This sets up the initial handling of symbolic shapes. As in the test, it doesn't work perfectly yet because it needs a couple other optimization passes. The basic description is pretty simple: we resolve tensor dimension indices to the same Value *, and before extracting out the output Tensor shape we substitute in symbolic shapes. We don't substitute during optimization because they are represented as negative numbers so we don't want them inadvertently used in Constant prop or something else.
Test Plan: Imported from OSS
Reviewed By: ZolotukhinM
Differential Revision: D27750996
Pulled By: eellison
fbshipit-source-id: 6984e7276b578f96b00fc2025cef0e13f594b6e6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54809
I'm going to post on dev-discuss soon with a more thorough explanation of the design and advantages of this shape analysis, so I'm leaving out that for now.
There is still a ton left to do; I'm posting this initial version so we can get something on master that multiple people can work on. List of many remaining steps to do:
- [ ] Add symbolic shapes support
- [ ] Bind shape functions for operators in C++
- [ ] Make classes of operators share the same shape function (e.g. pointwise, broadcast two inputs)
- [ ] Refactor APIs
- [ ] Only iteratively optimize shape function while a change has been made
- [ ] Expand coverage to common ops
- [ ] Add shape analysis pass on Graph that handles Ifs and Loops
- [ ] Allow concurrent reads to the operator map
- [ ] Successive applications of same inputs to same shape function (e.g. series of pointwise ops)
For this review, I am mostly looking for comments related to the implementation of symbolic_shape_analysis.cpp, with the caveats listed above. I am not really looking for comments related to api/registration/graph level analysis as those are all planned to be changed. I am fine landing this as is or waiting until necessary components of the TODOs above are finished.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D27750998
Pulled By: eellison
fbshipit-source-id: 4338b99e8651df076291c6b781c0e36a1bcbec03
Summary:
This PR adds an alternative way of calling `torch.einsum`. Instead of specifying the subscripts as letters in the `equation` parameter, one can now specify the subscripts as a list of integers as in `torch.einsum(operand1, subscripts1, operand2, subscripts2, ..., [subscripts_out])`. This would be equivalent to `torch.einsum('<subscripts1>,<subscripts2>,...,->[<subscript_out>]', operand1, operand2, ...)`
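A quick illustration of the equivalence described above (operand shapes chosen arbitrarily for the example):
```
import torch

a = torch.randn(3, 4)
b = torch.randn(4, 5)

# Classic string form: matrix multiplication.
out_str = torch.einsum('ij,jk->ik', a, b)

# Equivalent sublist form: subscripts are given as lists of ints, with an
# optional output sublist at the end.
out_int = torch.einsum(a, [0, 1], b, [1, 2], [0, 2])

assert torch.allclose(out_str, out_int)
```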
TODO
- [x] Update documentation
- [x] Add more error checking
- [x] Update tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56625
Reviewed By: zou3519
Differential Revision: D28062616
Pulled By: heitorschueroff
fbshipit-source-id: ec50ad34f127210696e7c545e4c0675166f127dc
Summary:
Finds a couple of bugs:
1. permute needs to wrap dimensions
2. slice needs to wrap dimensions
3. frac doesn't work correctly for negative values
4. Permute has some other failures.
This PR also fixes 1 + 2.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58719
Reviewed By: SplitInfinity
Differential Revision: D28590457
Pulled By: Chillee
fbshipit-source-id: a67fce67799602f9396bfeef615e652364918fbd
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58603
No longer need these checks
ghstack-source-id: 129498227
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28549893
fbshipit-source-id: a89bf8c3fc3aba311a70fd37e5a6aa5dc14b41b9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58595
No longer needed since this list is always of size 1.
ghstack-source-id: 129498229
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28548426
fbshipit-source-id: 7d6dba92fff685ec7f52ba7a3d350e36405e2578
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58594
This comment was misplaced after some changes, move it to the right
place.
ghstack-source-id: 129498228
Test Plan: ci
Reviewed By: zhaojuanmao
Differential Revision: D28548100
fbshipit-source-id: a9163fc3b25a9d9b8b6d4bfa2a77af290108fc09
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58593
Per title
ghstack-source-id: 129498230
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28528465
fbshipit-source-id: 89e4bfcb4a0275dc17090a934d4c0a41a3c54046
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58699
Give `call_function`/`call_method` nodes random colors based on their target name. This coloring is stable according to the name of the target. Also handle tensor_meta more elegantly for quantized types, including printing q_scale/q_zero_point if they're used.
Test Plan: Tested locally
Reviewed By: chenccfb, 842974287
Differential Revision: D28580333
fbshipit-source-id: ad9961e1106a1bfa5a018d009b0ddb8802d2163c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58592
Completely removes VariableIndex from reducer code, as it is not
needed. replica_index is always 0 so simplify the code to only use the
parameter index. Next, we should also remove all of the nested data structures
that were needed when num_replicas > 1 was possible.
ghstack-source-id: 129498226
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28528440
fbshipit-source-id: e0568399264ab4f86de3b7a379a4f0831f8f42e9
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58689
This doesn't seem to be mobile related, but ends up getting called from multiple places, so is hard to get rid of entirely.
ghstack-source-id: 129413850
Test Plan: Build
Reviewed By: iseeyuan
Differential Revision: D28543374
fbshipit-source-id: 867b3e2fafdcbf6030d7029a82a2b711bcecefc5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57985
Fixes https://github.com/pytorch/pytorch/issues/57756
This PR introduces a new `pyobj_interpreter_` field on TensorImpl which tracks what Python interpreter (if any) owns the TensorImpl. This makes it illegal to bind a TensorImpl from multiple Python interpreters, and means that we can now directly store PyObject pointer on TensorImpl even in the presence of multiple Python interpreters, as is the case in torchdeploy. This is a necessary step for PyObject preservation, which cannot be easily implemented when there are multiple Python interpreters.
Although the PR is not that long, there is a very subtle portion of the implementation devoted to ensuring that the tagging process is thread safe, since multiple threads can concurrently try to tag a PyObject. Check Note [Python interpreter tag] and Note [Memory ordering on Python interpreter tag] for detailed discussion of how this is handled. You will have to check this code carefully in code review; I did not torture test the multithreaded paths in any meaningful way.
In a follow up PR, I will pack the interpreter and PyObject fields into single atomic word on 64-bit.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: wconstab
Differential Revision: D28390242
Pulled By: ezyang
fbshipit-source-id: a6d9b244ee6b9c7209e1ed185e336297848e3017
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58604
Minor bug fix. Schemas should be defined with the schema macro not the name one.
Test Plan: ci and buck test fbsource//xplat/pytorch_models/build/cair_messaging_2021_05_17/v2:cair_messaging_2021_05_17_test
Reviewed By: dhruvbird, iseeyuan
Differential Revision: D28549578
fbshipit-source-id: 0c64eb8c60f1aee8213a1fc1fb7231226b905795
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57386
Here is the PR for what's discussed in the RFC https://github.com/pytorch/pytorch/issues/55374 to enable autocast for the CPU device. Currently, this PR only enables BF16 as the lower-precision datatype.
Changes:
1. Enable the new API `torch.cpu.amp.autocast` for autocast on the CPU device: this includes the Python API, C++ API, a new DispatchKey, etc. (a usage sketch follows this list).
2. Consolidate the implementation of each cast policy so it is shared between CPU and GPU devices.
3. Add the operation lists for the corresponding cast policies for CPU autocast.
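A minimal usage sketch of the new API (BF16 being the only lower-precision dtype enabled by this PR):
```
import torch

model = torch.nn.Linear(8, 8)
x = torch.randn(2, 8)

# Ops on the lower-precision list run in bfloat16 inside the region;
# ops on the fp32 list keep running in float32.
with torch.cpu.amp.autocast():
    y = model(x)

print(y.dtype)  # torch.bfloat16 for autocast-eligible ops such as linear
```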
Test Plan: Imported from OSS
Reviewed By: soulitzer
Differential Revision: D28572219
Pulled By: ezyang
fbshipit-source-id: db3db509973b16a5728ee510b5e1ee716b03a152
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58303
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129471191
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28444032
fbshipit-source-id: f6a9e9effb43c273f464ef6ff410274962f3ab23
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58280
All core PyTorch uses of TensorIterator::nullary_op look like they can safely borrow.
ghstack-source-id: 129471193
Test Plan: Existing CI
Reviewed By: bhosmer
Differential Revision: D28429695
fbshipit-source-id: 404cf6db31e45e5cf7ae6d2f113c5a8eff6f7c3d
Summary:
Enable quantization on XPU devices. Keep the model as-is if it is on an XPU device.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54857
Reviewed By: ailzhang
Differential Revision: D28501381
Pulled By: jerryzh168
fbshipit-source-id: 6d3e9b04075393248b30776c69881f957a1a837c
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58453
Move the class method generate_qconfig_map to qconfig_utils. More PRs will follow to move functions out of Quantizer and eventually remove the Quantizer object.
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28497965
fbshipit-source-id: 3c78cfe676965d20a8834a859ffed4d8e9ecade4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58445
Previously the output of a statically quantized fp16 operator was not quantized in QuantizeHandler, which is not consistent with the behavior of static int8 operators, and it also did not work well with reference functions. This PR changes the static fp16 QuantizeHandler to quantize (i.e. call to(torch.float16)) inside the QuantizeHandler, which also makes future support for reference functions easier.
Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
Imported from OSS
Reviewed By: vkuzo
Differential Revision: D28495830
fbshipit-source-id: 2140eab8ab2dd08f6570d9e305485e3029e1f47d
Summary:
I build using [Bazel](https://bazel.build/).
When I use `pytorch_android` in latest Android app, I get the following error due to dependencies:
```
$ bazel build //app/src/main:app
WARNING: API level 30 specified by android_ndk_repository 'androidndk' is not available. Using latest known API level 29
INFO: Analyzed target //app/src/main:app (0 packages loaded, 0 targets configured).
INFO: Found 1 target...
ERROR: /home/H1Gdev/android-bazel-app/app/src/main/BUILD.bazel:3:15: Merging manifest for //app/src/main:app failed: (Exit 1): ResourceProcessorBusyBox failed: error executing command bazel-out/k8-opt-exec-2B5CBBC6/bin/external/bazel_tools/src/tools/android/java/com/google/devtools/build/android/ResourceProcessorBusyBox --tool MERGE_MANIFEST -- --manifest ... (remaining 11 argument(s) skipped)
Use --sandbox_debug to see verbose messages from the sandbox ResourceProcessorBusyBox failed: error executing command bazel-out/k8-opt-exec-2B5CBBC6/bin/external/bazel_tools/src/tools/android/java/com/google/devtools/build/android/ResourceProcessorBusyBox --tool MERGE_MANIFEST -- --manifest ... (remaining 11 argument(s) skipped)
Use --sandbox_debug to see verbose messages from the sandbox
Error: /home/H1Gdev/.cache/bazel/_bazel_H1Gdev/29e18157a4334967491de4cc9a879dc0/sandbox/linux-sandbox/914/execroot/__main__/app/src/main/AndroidManifest.xml:19:18-86 Error:
Attribute application@appComponentFactory value=(androidx.core.app.CoreComponentFactory) from [maven//:androidx_core_core] AndroidManifest.xml:19:18-86
is also present at [maven//:com_android_support_support_compat] AndroidManifest.xml:19:18-91 value=(android.support.v4.app.CoreComponentFactory).
Suggestion: add 'tools:replace="android:appComponentFactory"' to <application> element at AndroidManifest.xml:5:5-19:19 to override.
May 19, 2021 10:45:03 AM com.google.devtools.build.android.ManifestMergerAction main
SEVERE: Error during merging manifests
com.google.devtools.build.android.AndroidManifestProcessor$ManifestProcessingException: Manifest merger failed : Attribute application@appComponentFactory value=(androidx.core.app.CoreComponentFactory) from [maven//:androidx_core_core] AndroidManifest.xml:19:18-86
is also present at [maven//:com_android_support_support_compat] AndroidManifest.xml:19:18-91 value=(android.support.v4.app.CoreComponentFactory).
Suggestion: add 'tools:replace="android:appComponentFactory"' to <application> element at AndroidManifest.xml:5:5-19:19 to override.
at com.google.devtools.build.android.AndroidManifestProcessor.mergeManifest(AndroidManifestProcessor.java:186)
at com.google.devtools.build.android.ManifestMergerAction.main(ManifestMergerAction.java:217)
at com.google.devtools.build.android.ResourceProcessorBusyBox$Tool$5.call(ResourceProcessorBusyBox.java:93)
at com.google.devtools.build.android.ResourceProcessorBusyBox.processRequest(ResourceProcessorBusyBox.java:233)
at com.google.devtools.build.android.ResourceProcessorBusyBox.main(ResourceProcessorBusyBox.java:177)
Warning:
See http://g.co/androidstudio/manifest-merger for more information about the manifest merger.
Target //app/src/main:app failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 2.221s, Critical Path: 1.79s
INFO: 2 processes: 2 internal.
FAILED: Build did NOT complete successfully
```
This is due to a conflict between `AndroidX` and the `Support Library`, on which `pytorch_android_torch` depends.
(In the case of `Gradle`, it is avoided by `android.useAndroidX`.)
I created an [Android application](https://github.com/H1Gdev/android-bazel-app) for comparison.
At first, I updated `AppCompat` from the `Support Library` to `AndroidX`, but `pytorch_android` and `pytorch_android_torchvision` didn't seem to need these dependencies, so I removed them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58527
Reviewed By: xta0
Differential Revision: D28585234
Pulled By: IvanKobzarev
fbshipit-source-id: 78aa6b1525543594ae951a6234dd88a3fdbfc062
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57397
Introduces two main classes in C++ runtime:
ScriptProfile is the implementation for enabling and disabling interpreter
profiling in C++. This should be only used from Python, and we will add
corresponding Python API in the next diff.
InstructionSpan is a utility class to instrument execution of each single
instruction. A start timestamp is recorded in the constructor, and an end
timestamp is recorded in the destructor. During destruction, this will send
runtime data to all enabled ScriptProfile instances.
Test Plan:
build/bin/test_jit --gtest_filter='ScriptProfileTest.Basic'
Imported from OSS
Reviewed By: gmagogsfm
Differential Revision: D28133579
fbshipit-source-id: e7e30e96151367022793ab3ad323f01c51ad4a3b
Summary:
Temporary fix for https://github.com/pytorch/pytorch/issues/42218.
Numerically, grid_sampler should be fine in fp32 or fp16. So grid_sampler really belongs on the promote list. But performancewise, native grid_sampler backward kernels use gpuAtomicAdd, which is notoriously slow in fp16. So the simplest functionality fix is to put grid_sampler on the fp32 list.
In https://github.com/pytorch/pytorch/pull/58618 I implement the right long-term fix (refactoring kernels to use fp16-friendly fastAtomicAdd and moving grid_sampler to the promote list). But that's more invasive, and for 1.9 ngimel says this simple temporary fix is preferred.
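A sketch of the user-visible effect of this temporary fix, assuming a CUDA device is available:
```
import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 8, 8, device="cuda")
grid = torch.rand(1, 4, 4, 2, device="cuda") * 2 - 1

with torch.cuda.amp.autocast():
    out = F.grid_sample(x, grid, align_corners=False)

# With grid_sampler on autocast's fp32 list, the op runs (and returns) in
# float32 inside the autocast region, avoiding the slow fp16 gpuAtomicAdd path.
print(out.dtype)  # torch.float32
```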
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58679
Reviewed By: soulitzer
Differential Revision: D28576559
Pulled By: ngimel
fbshipit-source-id: d653003f37eaedcbb3eaac8d7fec26c343acbc07
Summary:
This PR does several things to relax test tolerance
- Do not use TF32 in cuda matmul in test_c10d. See https://github.com/pytorch/pytorch/issues/52941.
- Do not use TF32 in cuda matmul in test_linalg. Increase atol for float and cfloat. See https://github.com/pytorch/pytorch/issues/50453
The tolerance is increased because most linear algebra operators are not that stable in single precision.
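For context, the global switch that controls TF32 use in cuda matmuls looks like this (the test suite may wrap it in its own helpers rather than flipping it directly):
```
import torch

# Force cuBLAS matmuls to run in true fp32 rather than TF32 on Ampere GPUs,
# so results can be compared against fp32 references at tighter tolerances.
torch.backends.cuda.matmul.allow_tf32 = False
```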
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56114
Reviewed By: ailzhang
Differential Revision: D28554467
Pulled By: ngimel
fbshipit-source-id: 90416be8e4c048bedb16903b01315584d344ecdf
Summary: Tests are frequently failing with "exceeded the deadline of 1000.00ms"; we expect this to happen, so remove the deadline
Test Plan: N/A: Fix breakages
Reviewed By: robieta
Differential Revision: D28581051
fbshipit-source-id: 4825ada9af151fa5d57c45c549138c15ba613705
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58676
We only generate asm for small matmuls, but we were computing the # of
flops using an int32, which is too small.
Test Plan:
```
buck test mode/dev //caffe2/test:static_runtime -- --exact 'caffe2/test:static_runtime - test_mlp (test_static_runtime.TestStaticModule)'
```
Reviewed By: navahgar
Differential Revision: D28562157
fbshipit-source-id: a07ceba5209ef6022ead09140380c116994755cf
Summary:
This warning makes downstream users of OpInfo error out when they use this OpInfo, unless they actually run the operation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58682
Reviewed By: mruberry
Differential Revision: D28577334
Pulled By: Chillee
fbshipit-source-id: f10e64f8ad3fb50907531d8cb89ce5b0d06ac076
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58506
We were experiencing 500 errors when downloading large artifacts, so let's just use S3 for those larger artifacts just in case
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Test Plan: Imported from OSS
Reviewed By: zhouzhuojie
Differential Revision: D28520792
Pulled By: seemethere
fbshipit-source-id: 3aa15c4872fe46c9491ac31dc969bf71175378aa
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58503
add gflags to force using deploy for torchscript models
Test Plan: Add parametrization to PredictorContainer test to exercise gflag override and test deploy codepath. Add test case to exercise new torch.package codepath.
Reviewed By: suo
Differential Revision: D28246793
fbshipit-source-id: 88a2c8322c89284e3c8e14fee5f20e9d8a4ef300
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57081
Changes in this diff:
1. Enable the passthrough autograd function when find_unused_parameters=True.
2. With the above, move prepare_for_backward, which does the unused parameter checking logic, to the beginning of the backwards pass, only when find_unused_parameters=True.
3. Enhance the process of unused parameter checking to account for outputs not being used in the loss.
The way (3) is implemented is by triggering the autograd hook corresponding to parameters that did not participate in loss computation. Since they did not participate, the autograd hook is triggered with a gradient of None, and the reducer handles this appropriately to ensure that the gradient is not touched.
Tested by ensuring that when a model output is not used in loss, the corresponding grad is not modified. Also verified that the grads are the same in local vs DDP training case. Also verified that gradients are not touched in this case, i.e. if grad is originally None, it stays as None, not zero, after.
Note that in this diff we are not enabling the pass through autograd function for regular case find_unused_parameters=False because that has a much bigger blast radius and needs additional careful analysis especially with regard to the performance.
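A minimal single-process sketch of the scenario described above (the `TwoHead` model and the Gloo setup are illustrative only): only `head_a` participates in the loss, and the gradients of `head_b` stay `None` rather than being zeroed.
```
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

class TwoHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.trunk = torch.nn.Linear(8, 8)
        self.head_a = torch.nn.Linear(8, 1)
        self.head_b = torch.nn.Linear(8, 1)

    def forward(self, x):
        h = self.trunk(x)
        return self.head_a(h), self.head_b(h)

# Single-process Gloo setup, just to make the sketch runnable.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

ddp = DDP(TwoHead(), find_unused_parameters=True)
out_a, _ = ddp(torch.randn(4, 8))
out_a.sum().backward()

# head_b produced an output that was not used in the loss: its grads are left
# untouched (None) instead of being set to zero.
print(ddp.module.head_b.weight.grad)  # None
dist.destroy_process_group()
```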
ghstack-source-id: 129425139
Test Plan: CI
Reviewed By: zhaojuanmao
Differential Revision: D28048628
fbshipit-source-id: 71d7b6af8626804710017a4edd753787aa9bba61
Summary:
A `NULL` return from `PyObject_GetAttrString` should never be ignored without handling the exception, as the behavior of subsequent Python C API calls is undefined until `PyErr_Fetch` or `PyErr_Clear` is called.
This accidentally led to the `list` type being incorrectly identified as `Tensor`.
Fixes https://github.com/pytorch/pytorch/issues/58520
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58631
Reviewed By: albanD
Differential Revision: D28559454
Pulled By: malfet
fbshipit-source-id: 46f044b5f0f94264779a6108474d04a8ba851c53
Summary:
This PR fixes a bug in test_fx_experimental where code generated for ops with kwarg-only Tensor parameters would fail to execute because they would be called as positional parameters.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58587
Reviewed By: ailzhang
Differential Revision: D28548365
Pulled By: heitorschueroff
fbshipit-source-id: 8f1746053cbad1b11e817b0099db545d8dd22232
Summary:
Since v1.7, oneDNN (MKL-DNN) has supported the use of Compute Library
for the Arm architecture to provide optimised convolution primitives
on AArch64.
This change enables the use of Compute Library in the PyTorch build.
Following the approach used to enable the use of CBLAS in MKLDNN,
it is enabled by setting the env vars USE_MKLDNN and USE_MKLDNN_ACL.
The location of the Compute Library build must be set using `ACL_ROOT_DIR`.
This is an extension of the work in https://github.com/pytorch/pytorch/pull/50400
which added support for the oneDNN/MKL-DNN backend on AArch64.
_Note: this assumes that Compute Library has been built and installed at
ACL_ROOT_DIR. Compute library can be downloaded here:
`https://github.com/ARM-software/ComputeLibrary`_
Fixes #{issue number}
Pull Request resolved: https://github.com/pytorch/pytorch/pull/55913
Reviewed By: ailzhang
Differential Revision: D28559516
Pulled By: malfet
fbshipit-source-id: 29d24996097d0a54efc9ab754fb3f0bded290005
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58573
Users can create invalid imports, like:
```
HG: in a top-level package
if False:
from .. import foo
```
Since this code is never executed, it will not cause the module to fail to
load. But our dependency analysis walks every `import` statement in the AST,
and will attempt to resolve the (incorrectly formed) import, throwing an exception.
For posterity, the code that triggered this: https://git.io/JsCgM
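For context, a minimal sketch (not the actual torch.package analysis) of why an import under `if False:` is still seen by AST-based dependency scanning:
```
import ast

src = """
if False:
    from .. import foo   # never executed, but still present in the AST
"""

for node in ast.walk(ast.parse(src)):
    if isinstance(node, ast.ImportFrom):
        # level counts the leading dots; resolving ".." relative to a
        # top-level package is what used to raise during packaging.
        print(node.module, [a.name for a in node.names], node.level)
```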
Differential Revision: D28543980
Test Plan: Added a unit test
Reviewed By: Chillee
Pulled By: suo
fbshipit-source-id: 03b7e274633945b186500fab6f974973ef8c7c7d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58572
Right now, we have three categories of error (broken, denied, unhandled). This
PR unifies them into a single "error" field in the node, with optional context.
It also generalizes how formatting of the error in PackagingError occurs.
Differential Revision: D28543982
Test Plan: sandcastle
Reviewed By: Chillee
Pulled By: suo
fbshipit-source-id: d99d37699ec2e172e3798763e60aafe9a66ed6f4
Summary:
Do not put quotes around arguments that do not have spaces in them in add_to_env_file.
The ENV file is used both by bash and by docker, and docker does not drop the
quotes when they are present.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58637
Reviewed By: wconstab
Differential Revision: D28561159
Pulled By: malfet
fbshipit-source-id: 0843aad22703b6c3adebeb76175de1cfc1a974b5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58493
In fbcode, we want torch::deploy to be a target that works with or without cuda, depending only on whether cuda is linked in the final binary. To enable this, we build both flavors of libinterpreter, and choose which to load at runtime depending on whether cuda is available in the application. This comes at a cost to binary size, as it includes two copies of libinterpreter instead of one. However, it does not require _loading_ two copies of libinterpreter into memory at runtime, so the memory footprint of the interpreter (which we make N copies of) is not impacted.
In oss/cmake, this change is a no-op. cuda is already handled there by building just one libinterpreter, but building cuda or not for the whole pytorch build based on a global cmake flag.
Test Plan: test in fbcode with new gpu mode unit tests, verify existing oss CI passes
Reviewed By: suo
Differential Revision: D28512178
fbshipit-source-id: 61354bf78b1932605a841388fcbc4bafc0c4bbb4
Summary:
- dump final graph in glow
- print operator stats via to_glow API:
  1) node stats for final glow graph
  2) operator stats in TorchGlowBackend for torch::jit::graph to lower
Reviewed By: khabinov
Differential Revision: D28444501
fbshipit-source-id: 743755c320071edc4c045ad004adeb16b4a9c323
Summary:
Functional API is used in large scale distributed training to enable multithreaded training instead of multiprocess, as it gives more optimal resource utilization and efficiency.
In this PR, we provide code migration and refactoring for functional API for ASGD algorithm.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58410
Reviewed By: ailzhang
Differential Revision: D28546702
Pulled By: iramazanli
fbshipit-source-id: 4f62b6037d53f35b19f98340e88af2ebb6243a4f
Summary:
In order to make it more convenient for maintainers to review the ATen AVX512 implementation, the namespace `vec256` is being renamed to `vec` in this PR. Modifying 77 files & creating 2 new files only took a few minutes, as these changes aren't significant, and fewer files will have to be reviewed while reviewing https://github.com/pytorch/pytorch/issues/56992.
The struct `Vec256` is not being renamed to `Vec`, but `Vectorized` instead, because there are some `using Vec=` statements in the codebase, so renaming it to `Vectorized` was more convenient. However, I can still rename it to `Vec`, if required.
### Changes made in this PR -
Created `aten/src/ATen/cpu/vec` with subdirectory `vec256` (vec512 would be added via https://github.com/pytorch/pytorch/issues/56992).
The changes were made in this manner -
1. First, a script was run to rename `vec256` to `vec` & `Vec` to `Vectorized` -
```
# Ref: https://stackoverflow.com/a/20721292
cd aten/src
grep -rli 'vec256\/vec256\.h' * | xargs -i@ sed -i 's/vec256\/vec256\.h/vec\/vec\.h/g' @
grep -rli 'vec256\/functional\.h' * | xargs -i@ sed -i 's/vec256\/functional\.h/vec\/functional\.h/g' @
grep -rli 'vec256\/intrinsics\.h' * | xargs -i@ sed -i 's/vec256\/intrinsics\.h/vec\/vec256\/intrinsics\.h/g' @
grep -rli 'namespace vec256' * | xargs -i@ sed -i 's/namespace vec256/namespace vec/g' @
grep -rli 'Vec256' * | xargs -i@ sed -i 's/Vec256/Vectorized/g' @
grep -rli 'vec256\:\:' * | xargs -i@ sed -i 's/vec256\:\:/vec\:\:/g' @
grep -rli 'at\:\:vec256' * | xargs -i@ sed -i 's/at\:\:vec256/at\:\:vec/g' @
cd ATen/cpu
mkdir vec
mv vec256 vec
cd vec/vec256
grep -rli 'cpu\/vec256\/' * | xargs -i@ sed -i 's/cpu\/vec256\//cpu\/vec\/vec256\//g' @
grep -rli 'vec\/vec\.h' * | xargs -i@ sed -i 's/vec\/vec\.h/vec\/vec256\.h/g' @
```
2. `vec256` & `VEC256` were replaced with `vec` & `VEC` respectively in 4 CMake files.
3. In `pytorch_vec/aten/src/ATen/test/`, `vec256_test_all_types.h` & `vec256_test_all_types.cpp` were renamed.
4. `pytorch_vec/aten/src/ATen/cpu/vec/vec.h` & `pytorch_vec/aten/src/ATen/cpu/vec/functional.h` were created.
Both currently have one line each & would have 5 when AVX512 support would be added for ATen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58438
Reviewed By: malfet
Differential Revision: D28509615
Pulled By: ezyang
fbshipit-source-id: 63840df5f23b3b59e203d25816e2977c6a901780
Summary:
This is supposed to be a no-op (as the .cu file does not contain any CUDA code)
that reduces compilation time by 2.5x:
```
$ time /usr/local/cuda/bin/nvcc /home/nshulga/git/pytorch/aten/src/THC/THCTensorMathMagma.cu -c ...
real 0m7.701s
$ time /usr/local/cuda/bin/nvcc /home/nshulga/git/pytorch/aten/src/THC/THCTensorMathMagma.cpp -c ...
real 0m2.657s
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58521
Reviewed By: ngimel
Differential Revision: D28526946
Pulled By: malfet
fbshipit-source-id: ed42a9db3349654b75dcf63605bb4256154f01ff
Summary:
This PR adds the ability to publish the xml test data of custom PyTorch PR tests. This PR also adds a few fixes to the custom PyTorch PR tests logic.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58436
Reviewed By: seemethere, mruberry
Differential Revision: D28512958
Pulled By: malfet
fbshipit-source-id: d3a1a251d3d126c923d5f733dccfb31a4b701b7e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58531
fix data type of alltoall(v) when recording communication metadata via DebugInfo in NCCL PG
Reviewed By: chaekit
Differential Revision: D28529372
fbshipit-source-id: 2917653f73f5fe4f6dc901803235994ca042bba2
Summary:
This PR is step 0 of adding PyTorch convolution bindings using the cuDNN frontend. The cuDNN frontend is the recommended way of using the cuDNN v8 API. It is supposed to have faster release cycles, so that, for example, if people find that a specific kernel has a bug, they can report it, that kernel will be blocked in the cuDNN frontend, and frameworks can just update the submodule without waiting for a whole cuDNN release.
The work is not complete, and this PR is only step 0.
**What this PR does:**
- Add cudnn-frontend as a submodule.
- Modify cmake to build that submodule.
- Add bindings for convolution forward in `Conv_v8.cpp`, which is disabled by a macro by default.
- Tested manually by enabling the macro and run `test_nn.py`. All tests pass except those mentioned below.
**What this PR doesn't:**
- Only convolution forward, no backward. The backward will use v7 API.
- No 64-bit indexing support for some configurations. This is a known issue of cuDNN, and will be fixed in a later cuDNN version. PyTorch will not implement any workaround for this issue; instead, the v8 API should be disabled on problematic cuDNN versions.
- No test beyond PyTorch's unit tests.
- Not tested for correctness on real models.
- Not benchmarked for performance.
- Benchmark cache is not thread-safe. (This is marked as `FIXME` in the code, and will be fixed in a follow-up PR)
- cuDNN benchmark is not supported.
- There are failing tests, which will be resolved later:
```
FAILED test/test_nn.py::TestNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float16 - AssertionError: False is not true : Tensors failed to compare as equal!With rtol=0.001 and atol=1e-05, found 32 element(s) (out of 32) whose difference(s) exceeded the margin of error (in...
FAILED test/test_nn.py::TestNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float32 - AssertionError: False is not true : Tensors failed to compare as equal!With rtol=1.3e-06 and atol=1e-05, found 32 element(s) (out of 32) whose difference(s) exceeded the margin of error (...
FAILED test/test_nn.py::TestNNDeviceTypeCUDA::test_conv_large_cuda - RuntimeError: CUDNN_BACKEND_OPERATION: cudnnFinalize Failed cudnn_status: 9
FAILED test/test_nn.py::TestNN::test_Conv2d_depthwise_naive_groups_cuda - AssertionError: False is not true : Tensors failed to compare as equal!With rtol=0 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 an...
FAILED test/test_nn.py::TestNN::test_Conv2d_deterministic_cudnn - RuntimeError: not supported yet
FAILED test/test_nn.py::TestNN::test_ConvTranspose2d_groups_cuda_fp32 - RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
FAILED test/test_nn.py::TestNN::test_ConvTranspose2d_groups_cuda_tf32 - RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
```
Although this is not a complete implementation of cuDNN v8 API binding, I still want to merge this first. This would allow me to do small and incremental work, for the ease of development and review.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51390
Reviewed By: malfet
Differential Revision: D28513167
Pulled By: ngimel
fbshipit-source-id: 9cc20c9dec5bbbcb1f94ac9e0f59b10c34f62740
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58284
- Passing at::Tensor by value can incur a lot of refcount bumps overhead. Passing-by-reference is much more efficient.
- Use Tensor::expect_contiguous() where possible to remove refcount bump overhead when input tensor is already contiguous.
Reviewed By: supriyar, swolchok
Differential Revision: D28432300
fbshipit-source-id: 089ceed08f0d54f109e441f8a1314d726e8481ce
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57660
Ignore trailing elements so we're compatible with both old and new
models.
Test Plan: Dumped an old model. Unit test.
Reviewed By: malfet
Differential Revision: D28531391
Pulled By: dreiss
fbshipit-source-id: 197a55ab0e6a7d8e25cbee83852e194afacc988e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57659
Faster since we don't do an automatic pprint, and shorter, simpler code.
Test Plan: Dumped some models.
Reviewed By: malfet
Differential Revision: D28531398
Pulled By: dreiss
fbshipit-source-id: 47f1f646d4576af9f7e680933e0512f616dab5c0
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57658
Since there is no Python change here and we only do the analysis when
rendering the open section, this should have no impact on page size or
load time! (Well, a constant impact on page size due to the added
code.) Before I made it lazy, I observed that it increased load time by
over 100ms for a large model.
Test Plan: Dumped a CUDA model and saw the size summary.
Reviewed By: malfet
Differential Revision: D28531394
Pulled By: dreiss
fbshipit-source-id: f77012b7bab069de861a4ba23486c665e1306aa0
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57657
Test Plan: Clicked around a model with some dicts in it.
Reviewed By: malfet
Differential Revision: D28531397
Pulled By: dreiss
fbshipit-source-id: 069690f147e91eadd76fec5f5ca4eec057abcb98
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57656
This came up when dumping a CUDA model.
Test Plan: Dumped a CUDA model.
Reviewed By: malfet
Differential Revision: D28531396
Pulled By: dreiss
fbshipit-source-id: fe0e94248c8085a8b760d253ba0b517f153b3442
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57655
Now lots of code is shared between tensor and qtensor rendering. Net
lines of code is actually +1, but it should result in a savings if/when
we implement some of those todos.
Test Plan: Clicked around in Chrome.
Reviewed By: malfet
Differential Revision: D28531395
Pulled By: dreiss
fbshipit-source-id: 190a04ed587b54d27f3410246763cd636c0634be
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57654
I learned how to use children in React/Preact. :) Now it's not
necessary to give every hidable section its own id and synchronize the
"shown=false" with "style='display:none;'".
This also means that the hidden elements aren't rendered to the DOM
unless the hider is open.
Test Plan: Clicked around in Chrome.
Reviewed By: malfet
Differential Revision: D28531393
Pulled By: dreiss
fbshipit-source-id: bc86c823ae4b7e80c000f50c5429d89dff6ae64d
Summary:
* Fix lots of links.
* Minor improvements for consistency, clarity or grammar.
* Update jit_python_reference to note the limitations on __exit__.
(Related to https://github.com/pytorch/pytorch/issues/41420).
* Fix a comment in exit_transforms.cpp: removed the word "not" which
made the comment say the opposite of the truth.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57991
Reviewed By: malfet
Differential Revision: D28522247
Pulled By: SplitInfinity
fbshipit-source-id: fc63a59d19ea6c89f957c9f7d451be17d1c5fc91
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58060
A generic way to check whether an Operator belongs to a predefined map and, if so, to access the map value via public method(s). In general the value can be anything, for example the Operator's schema.
Test Plan: buck test caffe2/test/cpp/jit:jit -- OperatorMap
Reviewed By: Krovatkin
Differential Revision: D28357933
fbshipit-source-id: ba3248cf06c07f16aebafccb7ae71c1245afb083
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58524
Per title
ghstack-source-id: 129311600
Test Plan: CI
Reviewed By: SciPioneer
Differential Revision: D28528223
fbshipit-source-id: 239a15de4b602e35ed9b15b8a4bea3c28b61de12
Summary:
Move all cuFFT related parts to SpectralOps.cpp
Leave only _fft_fill_with_conjugate_symmetry_cuda_ in SpecralOps.cu
Keep `CUDAHooks.cpp` in torch_cuda_cpp by introducing `at::cuda::detail::THCMagma_init` functor and registering it from global constructor in `THCTensorMathMagma.cu`
Move entire detail folder to torch_cuda_cpp library.
This is a no-op that helps greatly reduce binary size for CUDA-11.x builds by avoiding cufft/cudnn symbol duplication between torch_cuda_cpp (which makes most of the cuFFT calls) and torch_cuda_cu (which only needed it to compile SpectralOps.cu)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58459
Reviewed By: ngimel
Differential Revision: D28499001
Pulled By: malfet
fbshipit-source-id: 425a981beb383c18a79d4fbd9b49ddb4e5133291
Summary:
This is an automated pull request to update the first-party submodule for [pytorch/tensorpipe](https://github.com/pytorch/tensorpipe).
New submodule commit: a0c6aa1422
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58477
Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Reviewed By: lw
Differential Revision: D28506522
fbshipit-source-id: 2da92feae212a568cfe441d33e4966ffe6c182e5
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58347
Back out "Revert D27652484 (ac04cc775b): [nnc] Enable CPU fusion inside Facebook"
Original commit changeset: ecfef3ee1e71
ghstack-source-id: 129279584
Test Plan: Tests for bugfix included in this stack
Reviewed By: navahgar
Differential Revision: D28461013
fbshipit-source-id: 79a80b6ffb653ab952ff5efaa143d3362bb7d966
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58510
In some cases that I don't fully understand, we're getting a stride that is:
```
{2:1, 1:1, 0:*}
```
(in this debug output, M:N means stride index M, stride value N). This shape
should be considered incomplete, since we don't actually know the values of the
stride, but VaryingShape::isComplete considers it complete because it only
checks the presence of elements in the vector, not whether those elements are
themselves complete.
ghstack-source-id: 129279583
Test Plan:
new unit test in test/cpp/jit
To see the failure in the context of a real model:
```
./fblearner/predictor/loadgen/download-requests.sh 272478342_0 10 ~/local/requests/272478342_0.recordio
buck-out/gen/fblearner/predictor/loadgen/replay_model_requests --model_id=272478342_0 --replay_record_source=recordio:/data/users/bertrand/requests/272478342_0.recordio --remote_port=9119 --output_file=/data/users/bertrand/responses/272478342_0_actual.recordio --output_type=recordio
buck-out/gen/fblearner/predictor/loadgen/replay_model_requests --model_id=272478342_0 --replay_record_source=recordio:/data/users/bertrand/requests/272478342_0.recordio --remote_port=9119 --output_file=/data/users/bertrand/responses/272478342_0_actual.recordio --output_type=recordio
```
Reviewed By: Krovatkin
Differential Revision: D28520062
fbshipit-source-id: 3ca900337d86480a40fbd90349a698cbb2fa5f11
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58346
If `dim` is a variable, NNC doesn't know how to translate the result,
since the shape is unknown. This issue manifested as a `bad_variant_access`
when we try to pull an int constant out of that arg.
Note that, while the PE will pick up the resultant shape, it won't set guards accordingly.
ghstack-source-id: 129078971
Test Plan: new fuser test
Reviewed By: navahgar
Differential Revision: D28460956
fbshipit-source-id: 57ef918ef309ee57bfdf86717b910b6549750454
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57396
A new type SourceRef is introduced to represent a unique identifier for source text. The type holds a refcount to the underlying source, and supports comparators and hash functions, so that it can be used in C++ and Python maps. In later
diffs we will use this to aggregate and print profiling information.
Test Plan: Imported from OSS
Reviewed By: nikithamalgifb
Differential Revision: D28133578
fbshipit-source-id: c3d5199a8269c5006c85a145b281bcaaf3e2dc1c
Summary:
Hardcoded names often get out of date; for example, in AdaptiveAveragePooling those names contained a cudnn_ prefix
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58502
Reviewed By: samestep
Differential Revision: D28518917
Pulled By: malfet
fbshipit-source-id: 9b16adae85a179e335da4facb4e769b9f67824bc
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58097
Move unused parameters to the end of the bucket order when rebuilding buckets for static graph
Test Plan: unit tests
Reviewed By: rohan-varma
Differential Revision: D28366689
fbshipit-source-id: fbd224aeb761d5aa3bab35a00d64974eb4455b2e
Summary:
As title, the main changes are:
1. Enable sharing the constant table, reducing model size by up to 50%.
2. Bump the bytecode version from v4 to v5.
3. Add the unit test back. (It was partially removed because the `script_module_v5.ptl` bytecode version is v5; when the current runtime is v4 and it tries to load a v5 model, it raises an error because the version is not within the supported range.)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57888
ghstack-source-id: 129255867
Test Plan:
CI
```
buck test papaya/toolkit/frontend/torch/...
buck test mode/opt papaya/integration/service/test/smartkeyboard:smartkeyboard_system_test
```
Reviewed By: raziel, iseeyuan
Differential Revision: D28309381
fbshipit-source-id: 6f5cf4296eaadde913d55f27d5bfb9d1dea2fbaf
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58357
Finding a miscompilation in a large program can be tedious; this
script automates the process of bisecting based on the number of fused
instructions. Since fusing aten::cat without the corresponding
prim::ListConstruct will cause an assertion failure, we treat that case as a
"skip" and ignore it for the purpose of bisection.
ghstack-source-id: 129079484
Test Plan:
Tried it on some failing testcases, plus I wrote a simple bash
script to simulate "failure" and "skip" and verified a few different cases.
Reviewed By: huiguoo
Differential Revision: D28463808
fbshipit-source-id: 64836f1d37a573549179410316ea7168e3dc1f23
Summary:
This adds it to the internal build target and makes it ready for the selective build
workflow.
Test Plan: CI builds
Reviewed By: z-a-f
Differential Revision: D28103697
fbshipit-source-id: 19c8b27aae4de1cece8d88d13ea51ca4ac7d79b6
Summary:
Followup to https://github.com/pytorch/pytorch/issues/58491:
- use an f-string to remove the literal `generated` string from the generator script, so Phabricator no longer thinks it is a generated file (a minimal sketch follows this list)
- remove the special logic for `test_runner_type` and instead explicitly specify for every workflow
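A minimal sketch of the f-string trick (the exact marker handling in the script may differ): assemble the `@generated` token at runtime so it never appears verbatim in the generator's own source.
```
# Hypothetical sketch: build the marker from pieces so the literal token is
# absent from this file, and code-review tooling does not classify the
# generator itself as a generated file.
marker = f"@{'gen' + 'erated'}"
header = f"# {marker} by .github/scripts/generate_ci_workflows.py"
print(header)
```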
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58498
Test Plan:
```
make generate-gha-workflows
```
Also, check that Phabricator doesn't classify `.github/scripts/generate_ci_workflows.py` as "Generated changes" in this diff.
Reviewed By: seemethere
Differential Revision: D28516291
Pulled By: samestep
fbshipit-source-id: 8736eaad5d28082490be0a9b2e271c9493c2ba9d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58278
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129002042
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28428809
fbshipit-source-id: 23ccf508c4413371a88085271f11c7d0cc861a9e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58277
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129002044
Test Plan: Existing CI
Reviewed By: ngimel
Differential Revision: D28428441
fbshipit-source-id: 243b746aeb5fdf8b95c8e591c066c5eab140deb6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58276
Borrowing is more efficient, and we can see in all these cases that the TensorIterator doesn't outlive the input & output Tensors.
ghstack-source-id: 129002045
Test Plan: Existing CI
Reviewed By: ngimel
Differential Revision: D28428234
fbshipit-source-id: 9eada7725a070799b55e6683509e359505a2b80a
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58273
Borrowing is more efficient, and structured kernels can always borrow.
ghstack-source-id: 129002041
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28427914
fbshipit-source-id: eed27a10603b412af5357d3554477ba407abba73
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58272
Borrowing is more efficient, and structured kernels can always borrow.
ghstack-source-id: 129002046
Test Plan: Existing CI
Reviewed By: ezyang
Differential Revision: D28427768
fbshipit-source-id: 6314a682556c6914c843aaacf2d75b2adb164e9a
Summary:
This PR simplifies `.github/scripts/generate_ci_workflows.py` by using the same strategy as https://github.com/pytorch/pytorch/issues/54344, representing workflows as plain data to avoid duplicating the definition of the `generate_workflow_file` function. This will make the script easier to maintain if/when that function is modified and/or more workflow types are added.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58491
Test Plan:
The Lint job in CI; specifically:
```
make generate-gha-workflows
mypy --config mypy-strict.ini
```
Reviewed By: malfet, seemethere
Differential Revision: D28511918
Pulled By: samestep
fbshipit-source-id: aaf415a954d938a29aee7c9367c9bc2b9f44bb01
Summary:
https://github.com/pytorch/pytorch/issues/55292 introduced a perf regression for nonzero on cuda; this fixes it. nvcc is still pretty bad about unrolling loops with boundaries that are not known at compile time, which made the `write_indices` kernel ~5x slower than it should be.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58468
Reviewed By: mruberry
Differential Revision: D28511147
Pulled By: ngimel
fbshipit-source-id: fe7303ec77da1abbe5e874093eca247b3919616f
Summary:
Testing both that a lint job ran and that it succeeded depends on lint passing for the PR, which can create confusion if it doesn't (e.g. a flake8 failure also causes this job to fail, and it's not immediately clear why). With this PR we just check for the presence of the job names to see that something ran.
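Roughly, the weaker check looks like the sketch below, assuming the job names for the commit have already been fetched into a plain list; the expected names are hypothetical:
```
from typing import Iterable

EXPECTED_JOBS = {"flake8-py3", "clang-tidy"}  # hypothetical job names

def assert_lint_jobs_ran(job_names: Iterable[str]) -> None:
    # Only verify that the expected jobs exist for the commit, not that
    # they passed, so an unrelated lint failure does not also fail this check.
    missing = EXPECTED_JOBS - set(job_names)
    if missing:
        raise RuntimeError(f"expected lint jobs did not run: {sorted(missing)}")

assert_lint_jobs_ran(["flake8-py3", "clang-tidy", "mypy"])
```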
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58490
Reviewed By: samestep
Differential Revision: D28511229
Pulled By: driazati
fbshipit-source-id: 3036deff9f9d0ef2e78b44a9a43b342acdcfa296
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58461
Improves the logic which calculates whether a node has any tensors
in its arguments by terminating the recursion early when possible.
In a future PR, we should probably ditch this entire approach and switch to
using dtype propagation.
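A minimal sketch of the early-exit shape of that recursion; this is a simplification for illustration, not the actual pass, which walks `torch.fx` node arguments and handles more cases:
```
import torch

def any_tensor_in_args(args) -> bool:
    # Return as soon as the first tensor-carrying argument is found,
    # instead of exhaustively scanning every nested container.
    for arg in args:
        if isinstance(arg, torch.Tensor):
            return True
        if isinstance(arg, (list, tuple)):
            if any_tensor_in_args(arg):
                return True  # early termination
        elif isinstance(arg, dict):
            if any_tensor_in_args(arg.values()):
                return True
    return False

print(any_tensor_in_args([1, "s", (torch.randn(2), 0)]))  # True
```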
Test Plan:
```
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
```
Imported from OSS
Reviewed By: jerryzh168
Differential Revision: D28499455
fbshipit-source-id: bedd844022b90e1fcb7d7a3cb4cc65440dc9cc59
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58056
This PR addresses an action item in #3428: disabling search engine
indexing of the master documentation. This is desirable because we want to
direct users to our stable documentation (instead of the master
documentation), since they are more likely to have a stable version of
PyTorch installed.
Test Plan:
1. run `make html`, check that the noindex tags are there
2. run `make html-stable`, check that the noindex tags aren't there
Reviewed By: bdhirsh
Differential Revision: D28490504
Pulled By: zou3519
fbshipit-source-id: 695c944c4962b2bd484dd7a5e298914a37abe787
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58444
DDP communication hooks are already supported on the Gloo and MPI backends, so we no longer need to skip these tests on Gloo/MPI.
TODO: `test_ddp_hook_parity_powerSGD` fails on the Gloo backend. Filed a bug #58467.
ghstack-source-id: 129209528
Test Plan:
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_comm_hook_logging
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_allreduce
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_allreduce_process_group
buck test mode/dev-nosan caffe2/test/distributed:distributed_gloo_fork -- test_ddp_hook_parity_powerSGD
Reviewed By: rohan-varma
Differential Revision: D28494073
fbshipit-source-id: 6ba14082f98696bc4bd8c02395cb58b9c1795015
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58353
There are long-tail operators in register_prim_ops_fulljit.cpp that may be used in the mobile build. In this PR:
1. All of the ops that are likely to be used in mobile are moved to register_prim_ops.cpp.
2. Note that this move is conservative. If an op is likely to have a full-jit dependency, or cannot be made selective, it is kept. Later, if it needs to be used in mobile (rare), it will be adapted and moved case by case.
3. All the moved ops are marked selective. The registration function is changed from `Operator()` to `OperatorGenerator()`. Size regression is not expected.
Test Plan:
* Internal size tests
* CI
Reviewed By: dhruvbird
Differential Revision: D28463158
Pulled By: iseeyuan
fbshipit-source-id: 34536b8a569f1274329ccf1dac809fe9b891b4ff
Summary:
Two changes:
1. Build lite interpreter as default for iOS
2. Switch the previous lite interpreter test to full jit build test
Test Plan: Imported from OSS
Differential Revision: D27698039
Reviewed By: xta0
Pulled By: cccclai
fbshipit-source-id: 022b554f4997ae577681f2b79a9ebe9236ca4f7d
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58412
Second try: avoid ctor/dtor handling this time, since it is kind of pointless if the rethrow will still terminate(), and it upsets -Werror=terminate.
Original commit changeset: 1775bed18269
Test Plan: existing unit tests and CI
Reviewed By: suo
Differential Revision: D28478588
fbshipit-source-id: 84191cecc3ef52e23f11bfea07bbb9773ebc5df4
Summary:
torch_cuda_cu depends on torch_cuda_cpp, so it should be linked first.
Otherwise the linker keeps lots of cuDNN symbols for no good reason.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58437
Reviewed By: janeyx99
Differential Revision: D28496472
Pulled By: malfet
fbshipit-source-id: 338605ff755591476070c172a6ea0a0dcd0beb23
Summary: When run on very heavily loaded machines, some of these tests are timing out. It's not an issue with the tests; it's an issue with the environment. I've removed the timeout so we at least keep unit test coverage.
Test Plan: N/A: Fix breakages
Reviewed By: ngimel
Differential Revision: D28492334
fbshipit-source-id: aed3ee371763161aab2d356f5623c7df053fda6f
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58416
https://github.com/pytorch/pytorch/pull/57519 had a regression not caught by CI: it added an assertion which failed on various model output types.
This PR removes the assertion and adds the logic to observe graph
outputs in a way that supports arbitrary output formats.
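A hedged sketch of the general idea: rather than asserting a single output shape, recurse over whatever container the graph returns and handle each tensor leaf individually (the helper below is illustrative, not the pass's actual code):
```
import torch

def map_output_leaves(output, visit):
    # Apply `visit` to every tensor leaf of an arbitrarily nested output;
    # non-tensor leaves (ints, None, ...) pass through untouched.
    if isinstance(output, torch.Tensor):
        return visit(output)
    if isinstance(output, (list, tuple)):
        return type(output)(map_output_leaves(o, visit) for o in output)
    if isinstance(output, dict):
        return {k: map_output_leaves(v, visit) for k, v in output.items()}
    return output

# e.g. visiting each leaf instead of asserting the output is a single tensor:
out = {"logits": torch.randn(2, 3), "aux": [torch.randn(2), None]}
observed = map_output_leaves(out, lambda t: t.clone())
```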
Test Plan:
```
python test/test_quantization.py TestQuantizeFx.test_output_lists_and_dicts
```
Imported from OSS
Reviewed By: z-a-f
Differential Revision: D28479946
fbshipit-source-id: bcce301f98a057b134c0cd34ab0ca96ba457863f
Summary:
Build the lite interpreter as the default for Android; this should wait until https://github.com/pytorch/pytorch/pull/56002 lands.
Mainly two changes:
1. Use lite interpreter as default for Android
2. Switch the lite interpreter build test to full jit build test
Test Plan: Imported from OSS
Differential Revision: D27695530
Reviewed By: IvanKobzarev
Pulled By: cccclai
fbshipit-source-id: e1b2c70fee6590accc22c7404b9dd52c7d7c36e2
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58313
Same as title.
I am planning to send a follow-up diff to this op, so I am sending a formatting diff ahead to keep the PR simple.
Test Plan: Rely on existing signals since this is simple formatting diff.
Reviewed By: ngimel
Differential Revision: D28447685
fbshipit-source-id: c7cd473b61e40e6f50178aca88b9af197a759099
Summary:
Fixes https://github.com/pytorch/pytorch/issues/57679
##### Release Notes
This is part of the end of the deprecation of inplace/view:
- `detach_` will now raise an error when invoked on any view created by `split`, `split_with_sizes`, or `chunk`. You should use the non-inplace `detach` instead (see the example after this list).
- The error message for when an in-place operation (that is not detach) is performed on a view created by `split`, `split_with_sizes`, and `chunk` has been changed from "This view is **an** output of a function..." to "This view is **the** output of a function...".
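For illustration, the new behavior on a view produced by `chunk` (exact error wording may differ):
```
import torch

x = torch.randn(4, requires_grad=True)
a, b = x.chunk(2)

# a.detach_()  # now raises a RuntimeError for views created by
#              # split/split_with_sizes/chunk
a = a.detach()  # the out-of-place detach is the supported replacement
```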
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58285
Reviewed By: bdhirsh
Differential Revision: D28441980
Pulled By: soulitzer
fbshipit-source-id: e2301d7b8cbc3dcdd328c46f24bcb9eb7f3c0d87
Summary:
This is the only line (not in `third_party`) matching the regex `^#!.*python2`, and [it is not the first line of its file](https://github.com/koalaman/shellcheck/wiki/SC1128), so it has no effect. As a followup to https://github.com/pytorch/pytorch/issues/58275, this PR removes that shebang to reduce confusion, so now all Python shebangs in this repo are `python3`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58409
Reviewed By: walterddr
Differential Revision: D28478469
Pulled By: samestep
fbshipit-source-id: c17684c8651e45d3fc383cbbc04a31192d10f52f
Summary:
This fixes test_lkj_cholesky_log_prob when the default codepath is used, i.e. the test is executed as follows:
```
ATEN_CPU_CAPABILITY=default python3 distributions/test_distributions.py -v -k test_lkj_cholesky_log_prob
```
Fixes https://github.com/pytorch/pytorch/issues/58381
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58429
Reviewed By: neerajprad
Differential Revision: D28484340
Pulled By: malfet
fbshipit-source-id: 32afcc75e5250f5a11d66b4fa194ea1c784454a6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58323
Currently there is no way to run NNAPI on Apple platforms.
Disabling the binding with the preprocessor makes it easier to enable NNAPI in the internal build without affecting iOS size.
This should be reverted soon and migrated to selective build.
Test Plan: Build Size Bot on later diff.
Reviewed By: axitkhurana
Differential Revision: D28435179
fbshipit-source-id: 040eeb74532752630d329b15d5f95c538c2e3f9e
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58344
Remove a helper function that's more trouble than it's worth.
ghstack-source-id: 129131889
Test Plan: ci and {P414950111}
Reviewed By: dhruvbird
Differential Revision: D28460607
fbshipit-source-id: 31bd6c1cc169785bb360e3113d258b612cad47fc
Summary:
Since https://github.com/pytorch/pytorch/issues/58299 changed the calculate-docker-image job from `ubuntu-18.04` to `linux.2xlarge`, it has sometimes been failing with this message:
```
Warning: Unable to clean or reset the repository. The repository will be recreated instead.
Deleting the contents of '/home/ec2-user/actions-runner/_work/pytorch/pytorch'
Error: Command failed: rm -rf "/home/ec2-user/actions-runner/_work/pytorch/pytorch/.azure_pipelines"
```
- https://github.com/pytorch/pytorch/runs/2587348894
- https://github.com/pytorch/pytorch/runs/2592943274
- https://github.com/pytorch/pytorch/runs/2600707737
This PR hopes to fix that issue by adding the "Chown workspace" step that we already use for the other jobs in the Linux CI workflow.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58398
Reviewed By: seemethere
Differential Revision: D28476902
Pulled By: samestep
fbshipit-source-id: a7dbf0ad9c18ac44cc1a3cef7647f56489958fe6
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57510
This is a re-write of https://github.com/pytorch/pytorch/pull/56835, which is significantly shorter thanks to the data model change in the PR below this one in the stack. See the original description in the linked PR for details.
The functional changes in this PR are the same as in the above linked one, so the description is the same with a few small changes:
- I don't bother generating `at::xla::{op}` entries for CPU fallbacks. After looking around, I see precedent for that. For example, we don't have `at::cpu::{op}` entries for composite ops: if you really want to bypass the dispatcher you need to call `at::compositeimplicitautograd::{op}`. Maybe we should revisit that later if we find an important use case for having full namespace coverage, but that doesn't seem worth half-fixing for external backends in this PR.
Test Plan: Imported from OSS
Reviewed By: navahgar
Differential Revision: D28474364
Pulled By: bdhirsh
fbshipit-source-id: 4d58b60e5debad6f1ff06420597d8df8505b2876
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57361
Data model change in the codegen, which splits backend-specific information out of `NativeFunction`
### Overview
Currently in the codegen, native_functions.yaml has backend-specific information about each operator that is encoded directly into the data model, in the `NativeFunction` object. That's reasonable, since the native_functions.yaml is the source of truth for information about an operator, and the data model encodes that information into types.
Now that external backends can use the codegen though, that information is technically incomplete/inaccurate. In another PR, I tried patching the information on the `NativeFunction` object with the additional external information, by updating the `dispatch` entry to contain the external backend kernel name and dispatch key.
Instead, this PR tries to split out that information. The `NativeFunction` class contains all information about an operator from native_functions.yaml that's backend-independent and is known never to change regardless of what extra information backends provide. We also build up a backend "index", which is basically a mapping from [backend] -> [backend-specific-metadata]. Reading in an external backend yaml just involves updating that index with the new backend.
There were a few places where `NativeFunction` used the dispatch table directly, which I encoded as properties directly on the `NativeFunction` object (e.g. `is_abstract`). They were mostly around whether or not the operator has a composite kernel, which isn't something that's going to change for any external backends.
This has a few advantages:
- We can more easily re-use the existing logic in `native_function.py` and `register_dispatch_key.py` for both native and external backends, since they both involve a NativeFunction + a particular backend index
- The data in the data model will be the same regardless of how the codegen is run. Running the codegen with a new external backend doesn't change the data inside of NativeFunction or an existing backend index. It just adds a new index for that backend.
- There are several codegen areas that don't care about backend-specific information: mostly the tracing and autograd codegen. We can reason about the codegen there more easily, knowing that backend-specific info is entirely uninvolved.
An alternative to this split would be to augment the NativeFunction objects with external backend information at the time that we create them. So the external codegen could read both native_functions.yaml and the external backend's yaml at the same time, and construct a NativeObject with a full dispatch table (including the XLA entry), and the correct setting of structured (taking into account both yamls). One disadvantage to this approach is that NativeFunction objects now contain different stuff depending on how you ran the codegen, and you have to make sure that any changes to the codegen can properly handle all the different variants.
### Data Model Changes
Removed 3 classes, which are used by the external codegen:
- ExternalBackendFunction
- ExternalBackendFunctionsGroup
- ExternalBackendMetadata
And added two new ones:
- BackendIndex
- BackendMetadata
`BackendIndex` contains any info that's specific to that backend, plus a mapping from operator names to backend specific metadata about the operator. One example of backend-specific info that's not operator-dependent is the fact that XLA prefers to implement functional kernels instead of out kernels (and so when they eventually mark an op as structured, they're going to mark the functional op and not the out op).
`BackendMetadata` contains info specific to an (operator, backend) pair. Right now, that's just (a) the name of the kernel, and (b) whether or not that operator is structured.
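A rough sketch of that shape; the field names below are approximations for illustration, not the codegen's exact definitions:
```
from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass(frozen=True)
class BackendMetadata:
    # Info specific to one (operator, backend) pair.
    kernel: str       # name of this backend's kernel for the op
    structured: bool  # whether this backend implements the op as structured

@dataclass
class BackendIndex:
    # Backend-wide info plus a per-operator metadata map.
    dispatch_key: str
    external: bool
    index: Dict[str, BackendMetadata] = field(default_factory=dict)

    def get_kernel(self, op_name: str) -> Optional[str]:
        m = self.index.get(op_name)
        return None if m is None else m.kernel

xla = BackendIndex(dispatch_key="XLA", external=True)
xla.index["add.Tensor"] = BackendMetadata(kernel="xla_add", structured=False)
print(xla.get_kernel("add.Tensor"))  # "xla_add"
print(xla.get_kernel("mul.Tensor"))  # None -> callers see Optional results
```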
### Questions
I wanted to get this PR up earlier so I could get feedback, but there are a few things I want to call out:
**Dealing with `structured`.**
This PR separates out the notion of `structured` into two bits of information:
- Does [operator] have a meta() function. This is backend-agnostic, and is represented by the `structured` property on `NativeFunction`, same as before. This is used, e.g., to decide what signatures to add to `MetaFunctions.h`.
- Does [operator, backend] have an impl() function. This is backend dependent; even though technically all in-tree backends are forced to write impl() functions for an operator when we port the op to structured in native_functions.yaml, out-of-tree backends can decide to opt in independently. This is represented as a property on `BackendMetadata`. This is used in most other cases, e.g. in `RegisterDispatchKey` when we're deciding whether or not to gen a structured or unstructured wrapper.
I also baked `is_structured_dispatch_key` directly into each BackendIndex. So for operators marked "structured" in native_functions.yaml, their corresponding CPU/CUDA BackendIndex entries will be marked structured, and all others (except for potentially external backends) will not.
I ended up trying to deal with `structured` in this change since it's technically backend dependent (XLA can opt kernels into structured separately from in-tree ops), but that may have been too ambitious: it's technically not relevant until we actually add support for structured external kernels. If it's not clear that this is the right path for dealing with structured and we want to push that off, I'm fine with backing out the bits of this PR that make `structured` backend-dependent. I don't see anything *too* controversial related to structured in the change, but I tried to call out any such areas in the comments.
**Localizing the fact that external backends follow Dispatcher convention.**
Another thing that's sort of backend specific that I didn't totally address in this PR is the fact that in-tree backends follow the Native API while external backends follow the Dispatcher API. I painted over that in `native_functions.py` by adding a helper, `kernel_signature`, that takes in a native function and gives you the "correct" signature for the specified backend: NativeSignature for in-tree backends, and DispatcherSignature for out-of-tree backends. In order to make that fully usable though, we'll need `NativeSignature` and `DispatcherSignature` to have matching interfaces. I didn't bother with that in this PR, which is why `gen_external_aten_fallbacks.py` still has a bunch of direct references to the dispatcher API. I'm thinking of adding it in a later PR but wanted to see if anyone has other opinions.
Maybe `is_external()` shouldn't even be a property on the BackendMetadata, and anything the codegen does that requires asking for that information should just be better abstracted away.
**Thoughts on the `BackendIndex` / `BackendMetadata` breakdown.**
One thing that's annoying right now is that to query for various pieces of metadata, you call helper functions like `backend_index.structured(f)`, which queries that particular backend and tells you if that specific NativeFunctionGroup is structured for that backend. It has to return an `Optional[bool]` though, since you have to handle the case where that operator doesn't have a kernel for that backend at all. So users of those helpers end up with a bunch of optionals that they need to unpack, even if they know at some point that the result isn't None. I think it would be easier instead to just store the NativeFunction object as a field directly on the BackendMetadata. Curious if there are any other opinions on a better way to model it though.
Test Plan: Imported from OSS
Reviewed By: navahgar
Differential Revision: D28474362
Pulled By: bdhirsh
fbshipit-source-id: 41a00821acf172467d764cb41e771e096542f661
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56597
3 small changes, all centered around error messaging.
1) Improved error messages when `gen_backend_stubs.py` receives invalid yaml
2) Added error message tests. I wasn't sure if there was a canonical way to do this, so I just wrote a test that takes in a list of (yaml input, expected error message) pairs and runs the codegen pipeline on each of them (see the sketch after this list).
3) I also removed the LineLoader from the yaml parsing bit that reads in the external backend yaml file. Two reasons that I took it out:
- The main reason we use it with native_functions.yaml is to easily pinpoint problems with new ops as they're added that the codegen can pick up. 99% of these problems have to do with schema, which is irrelevant to the external yaml since it pulls the schema from native_functions.
- Not all operators have to appear in the external yaml. We could do something like "line: -1", but that's kind of weird.
If you think the line numbers would actually be of more use than I'm thinking of in the external yaml though, let me know!
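A sketch of that test style, assuming the codegen can be driven from Python; the validator and error messages below are made-up stand-ins, not the real `gen_backend_stubs.py` behavior:
```
import yaml  # PyYAML, which the codegen already uses to parse its inputs

# Each case pairs an external-backend yaml snippet with a fragment of the
# error message the codegen should produce.
ERROR_CASES = [
    ("backend: XLA\ncpp_namespace: torch_xla\nsupported:\n- abs\nbad_key: true",
     "unexpected keys"),
    ("cpp_namespace: torch_xla\nsupported:\n- abs",
     'value for "backend"'),
]

def run_gen_backend_stubs(yaml_text: str) -> None:
    # Stand-in validator; the real gen_backend_stubs.py performs many more checks.
    parsed = yaml.safe_load(yaml_text)
    extra = set(parsed) - {"backend", "cpp_namespace", "supported"}
    if extra:
        raise AssertionError(f"backend yaml contains unexpected keys: {sorted(extra)}")
    if "backend" not in parsed:
        raise AssertionError('You must provide a value for "backend"')

def test_error_messages() -> None:
    for yaml_text, expected in ERROR_CASES:
        try:
            run_gen_backend_stubs(yaml_text)
        except AssertionError as e:
            assert expected in str(e), f"wrong error for:\n{yaml_text}\ngot: {e}"
        else:
            raise AssertionError(f"expected this yaml to be rejected:\n{yaml_text}")

test_error_messages()
```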
Test Plan: Imported from OSS
Reviewed By: navahgar
Differential Revision: D28474363
Pulled By: bdhirsh
fbshipit-source-id: 8b5ec804b388dbbc0350a20c053da657fad0474f
| Linux (aarch64) CPU | [py36](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py36) | [py37](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37) | [py38](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38) |
See also the [ci.pytorch.org HUD](https://hud.pytorch.org/build2/pytorch-master).
## More About PyTorch
Sometimes there are regressions in new versions of Visual Studio, so it's best to use the same Visual Studio version [16.8.5](https://github.com/pytorch/pytorch/blob/master/.circleci/scripts/vs_install.ps1) as PyTorch CI's.
You can use Visual Studio Enterprise, Professional or Community, though PyTorch CI uses Visual Studio BuildTools.
If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
Build with CPU
It's fairly easy to build with CPU.
Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/master/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
Build with CUDA
You should increase shared memory size either with `--ipc=host` or `--shm-size` command line options to `nvidia-docker run`.
**NOTE:** Must be built with a docker version > 18.06
The `Dockerfile` is supplied to build images with CUDA 11.1 support and cuDNN v8.
You can pass the `PYTHON_VERSION=x.y` make variable to specify which Python version is to be used by Miniconda, or leave it unset to use the default.
* Add `release/{MAJOR}.{MINOR}` to list of branches in [`browser-extension.json`](https://github.com/pytorch/pytorch/blob/fb-config/browser-extension.json) for FaceHub integrated setups
> TODO: Create release branch in [`pytorch/builder`](https://github.com/pytorch/builder) repo and pin release CI to use that branch rather than HEAD of builder repo.
These are examples of changes that should be made to the *default* branch after a release branch is cut
* Nightly versions should be updated in all version files to the next MINOR release (i.e. 0.9.0 -> 0.10.0) in the default branch:
We also have to add all transitive dependencies of our aars.
As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.8.0'` and `'com.facebook.fbjni:fbjni-java-only:0.0.3'`, we need to add them.
(In case of using maven dependencies they are added automatically from `pom.xml`).
You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly.
#pragma omp parallel for if ((end - begin) >= grain_size)
#pragma omp parallel for
for (int64_t id = 0; id < num_results; id++) {
  int64_t i = begin + id * grain_size;
  try {