2021-04-21 nightly release (1e03a2505f9cee92587bf45fbbbbfedede5cb9ec)

chronos_secgrp_pytorch_oss_ci_oncall
2021-04-21 00:01:48 -07:00
parent 24ee5bc666
commit aca2009f6b
194 changed files with 4365 additions and 2228 deletions

View File

@ -125,7 +125,7 @@ def FalsePred(_):
def TruePred(_):
return True
_VC2019 = VcSpec(2019, ["14", "28", "29333"], hide_version=True)
_VC2019 = VcSpec(2019)
WORKFLOW_DATA = [
# VS2019 CUDA-10.1

View File

@ -6786,7 +6786,7 @@ workflows:
python_version: "3.6"
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6799,7 +6799,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6812,7 +6812,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6821,7 +6821,7 @@ workflows:
python_version: "3.6"
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6840,7 +6840,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6859,7 +6859,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6868,7 +6868,7 @@ workflows:
python_version: "3.6"
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6886,7 +6886,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6904,7 +6904,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6922,7 +6922,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- update_s3_htmls:
context: org-member

.github/scripts/generate_linux_ci_workflows.py (vendored executable file, 164 lines added)
View File

@ -0,0 +1,164 @@
#!/usr/bin/env python
from pathlib import Path
import jinja2
DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
GITHUB_DIR = Path(__file__).parent.parent
CPU_TEST_RUNNER = "linux.2xlarge"
CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
class PyTorchLinuxWorkflow:
def __init__(self, build_environment: str, docker_image_base: str):
self.build_environment = build_environment
self.docker_image_base = docker_image_base
self.test_runner_type = CPU_TEST_RUNNER
if "cuda" in build_environment:
self.test_runner_type = CUDA_TEST_RUNNER
def generate_workflow_file(
self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment
) -> Path:
output_file_path = GITHUB_DIR.joinpath(
f"workflows/{self.build_environment}.yml"
)
with open(output_file_path, "w") as output_file:
output_file.write(
workflow_template.render(
build_environment=self.build_environment,
docker_image_base=self.docker_image_base,
test_runner_type=self.test_runner_type
)
)
output_file.write('\n')
return output_file_path
WORKFLOWS = [
PyTorchLinuxWorkflow(
build_environment="pytorch-linux-xenial-py3.6-gcc5.4",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3.6-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-asan",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang7-onnx",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-py3.6-clang9-noarch",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-xla-linux-bionic-py3.6-clang9",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-rocm3.9-py3.6",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
]
if __name__ == "__main__":
jinja_env = jinja2.Environment(
variable_start_string="!{{",
loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))),
)
workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in")
for workflow in WORKFLOWS:
print(
workflow.generate_workflow_file(
workflow_template=workflow_template,
jinja_env=jinja_env
)
)
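A minimal sketch (not part of the commit) of why the generator configures Jinja with `variable_start_string="!{{"`: generator-time placeholders are written as `!{{ ... }}` in the template, so GitHub Actions' own `${{ ... }}` expressions pass through the render untouched.

```python
# Hypothetical standalone example; only the "!{{" setting is taken from the script above.
import jinja2

env = jinja2.Environment(variable_start_string="!{{")
template = env.from_string(
    "env:\n"
    "  BUILD_ENVIRONMENT: !{{ build_environment }}\n"
    "  DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}\n"
)
print(template.render(build_environment="pytorch-linux-xenial-py3.6-gcc5.4"))
# env:
#   BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
#   DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
```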

.github/scripts/install_nvidia_utils_linux.sh (vendored executable file, 43 lines added)
View File

@ -0,0 +1,43 @@
#!/usr/bin/env bash
set -eou pipefail
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
install_nvidia_docker2_amzn2() {
(
set -x
# Needed for yum-config-manager
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
sudo yum install -y nvidia-docker2
sudo systemctl restart docker
)
}
install_nvidia_driver() {
(
set -x
sudo yum groupinstall -y "Development Tools"
curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
nvidia-smi
)
}
# Install container toolkit based on distribution
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_docker2_amzn2
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver

.github/scripts/report_git_status.sh (vendored executable file, 5 lines added)
View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
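The final `[ -z "$CHANGES" ]` is what gives this helper a non-zero exit status when the working tree is dirty, which is how the "Assert that regenerating ... didn't change them" steps below fail. A rough Python equivalent, for illustration only:

```python
# Illustrative Python rendering of report_git_status.sh (not part of the commit).
import subprocess
import sys

changes = subprocess.run(
    ["git", "status", "--porcelain"], capture_output=True, text=True, check=True
).stdout
print(changes)
subprocess.run(["git", "diff"], check=False)
# Fail (non-zero exit) when there are pending changes, succeed when clean.
sys.exit(1 if changes.strip() else 0)
```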

View File

@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (!{{ build_environment }})
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
DOCKER_IMAGE_BASE: !{{ docker_image_base }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Create test binary build directories
run: |
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
(cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: !{{ test_runner_type }}
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
(cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af
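The `calculate-docker-image` job above keys the image tag off `git rev-parse HEAD:.circleci/docker`, i.e. the Git tree hash of the `.circleci/docker` directory, so the tag only changes when the Docker build context changes. A small sketch of the same calculation (illustrative, not part of the commit):

```python
# Assumes it runs inside a PyTorch checkout; mirrors the "Calculate docker image tag" step.
import subprocess

DOCKER_IMAGE_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"

docker_tag = subprocess.check_output(
    ["git", "rev-parse", "HEAD:.circleci/docker"], text=True
).strip()
print(f"{DOCKER_IMAGE_BASE}:{docker_tag}")
```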

View File

@ -25,7 +25,7 @@ jobs:
ISSUE_NUMBER="${PR_NUMBER}"
else
TITLE="${ISSUE_TITLE}"
ISSUE_NUMBER="${ISSUE_NUMBER}"
# ISSUE_NUMBER is already set
fi
echo ::set-output name=TITLE::"${TITLE}"
echo ::set-output name=ISSUE_NUMBER::"${ISSUE_NUMBER}"

View File

@ -34,10 +34,10 @@ jobs:
- name: Extract scripts from GitHub Actions workflows
run: tools/extract_scripts.py --out=.extracted_scripts
- name: ShellCheck
# https://github.com/koalaman/shellcheck/tree/v0.7.1#installing-a-pre-compiled-binary
# https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
run: |
set -x
scversion="v0.7.1"
scversion="v0.7.2"
wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
rm -r "shellcheck-${scversion}"
@ -45,19 +45,23 @@ jobs:
tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
- name: Ensure correct trailing newlines
run: |
(! git grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
(! git --no-pager grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
- name: Ensure no trailing spaces
run: |
(! git grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
- name: Ensure no tabs
run: |
(! git grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
(! git --no-pager grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
- name: Ensure no non-breaking spaces
run: |
(! git grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
(! git --no-pager grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
- name: Ensure canonical include
run: |
(! git grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
(! git --no-pager grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
- name: Ensure no unqualified noqa
run: |
# shellcheck disable=SC2016
(! git --no-pager grep -InP '# noqa(?!: [A-Z]+\d{3})' -- '**.py' ':(exclude)caffe2' || (echo 'The above lines have unqualified `noqa`; please convert them to `noqa: XXXX`'; false))
# note that this next step depends on a clean checkout;
# if you run it locally then it will likely complain
# about all the generated files in torch/test
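The new "Ensure no unqualified noqa" step above relies on a negative lookahead: a bare `# noqa` is flagged, while a qualified `# noqa: XXX123` is allowed. A quick illustration of that regex (not part of the commit):

```python
# Same pattern as the grep -P expression in the lint step above.
import re

pattern = re.compile(r"# noqa(?!: [A-Z]+\d{3})")
print(bool(pattern.search("x = compute()  # noqa")))       # True  -> flagged
print(bool(pattern.search("import os  # noqa: F401")))     # False -> allowed
```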
@ -75,7 +79,7 @@ jobs:
python torch/testing/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
- name: Ensure no direct cub include
run: |
(! git grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
(! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
python2-setup-compat:
runs-on: ubuntu-18.04
@ -91,6 +95,23 @@ jobs:
run: |
python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."
templates:
runs-on: ubuntu-18.04
steps:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.x
architecture: x64
- name: Install Jinja2
run: pip install Jinja2
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Regenerate workflows
run: .github/scripts/generate_linux_ci_workflows.py
- name: Assert that regenerating the workflows didn't change them
run: .github/scripts/report_git_status.sh
toc:
runs-on: ubuntu-18.04
# https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
@ -107,16 +128,11 @@ jobs:
run: |
set -eux
export PATH=~/.npm-global/bin:"$PATH"
for FILE in {CONTRIBUTING,README}.md; do
for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
markdown-toc --bullets='-' -i "$FILE"
done
- name: Assert that regenerating the ToCs didn't change them
run: |
set -eux
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
run: .github/scripts/report_git_status.sh
flake8-py3:
runs-on: ubuntu-18.04
@ -137,21 +153,23 @@ jobs:
mkdir flake8-output
cd flake8-output
echo "$HEAD_SHA" > commit-sha.txt
- name: Run flake8
- name: Install dependencies
run: |
set -eux
pip install typing-extensions # for tools/translate_annotations.py
pip install -r requirements-flake8.txt
flake8 --version
- name: Run flake8
run: |
set -eux
flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt
cp flake8-output.txt flake8-output/annotations.json
- name: Translate annotations
if: github.event_name == 'pull_request'
env:
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
tools/translate_annotations.py \
--file=flake8-output.txt \
--file="${GITHUB_WORKSPACE}"/flake8-output.txt \
--regex='^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)' \
--commit="$HEAD_SHA" \
> flake8-output/annotations.json
@ -202,10 +220,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y clang-tidy-11
sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
- name: Generate build files
run: |
set -eux
git remote add upstream https://github.com/pytorch/pytorch
@ -229,6 +244,12 @@ jobs:
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--nn-path aten/src
fi
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
set -eux
# Run Clang-Tidy
# The negative filters below are to exclude files that include onnx_pb.h or
@ -283,13 +304,16 @@ jobs:
architecture: x64
- name: Fetch PyTorch
uses: actions/checkout@v2
- name: Run cmakelint
- name: Install dependencies
run: |
set -eux
pip install cmakelint
cmakelint --version
- name: Run cmakelint
run: |
set -eux
git ls-files -z -- bootstrap '*.cmake' '*.cmake.in' '*CMakeLists.txt' | \
grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/)' | \
grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/|cmake/Caffe2Config.cmake.in|aten/src/ATen/ATenConfig.cmake.in|cmake/Caffe2ConfigVersion.cmake.in|cmake/TorchConfig.cmake.in|cmake/TorchConfigVersion.cmake.in|cmake/cmake_uninstall.cmake.in)' | \
xargs -0 cmakelint --config=.cmakelintrc --spaces=2 --quiet
mypy:

View File

@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Create test binary build directories
run: |
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
(cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: linux.2xlarge
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
(cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af
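Both the template and this generated workflow cap build/test parallelism with the same shell arithmetic: `MAX_JOBS` is the smaller of `nproc - 1` (leave a core for sccache) and a hard limit of 8, since the 16-vCPU `linux.2xlarge` runners OOM otherwise. The same computation in Python, for illustration only:

```python
# Hypothetical helper mirroring the MAX_JOBS arithmetic in the build/test steps.
import os

def max_jobs(memory_limit_max_jobs: int = 8) -> int:
    sccache_max_jobs = (os.cpu_count() or 2) - 1
    return min(sccache_max_jobs, memory_limit_max_jobs)

print(max_jobs())  # 8 on a 16-vCPU linux.2xlarge runner
```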

.gitignore (vendored, 9 lines added)
View File

@ -292,3 +292,12 @@ bazel-*
# direnv, posh-direnv
.envrc
.psenvrc
# generated shellcheck directories
.shellcheck_generated*/
# zip archives
*.zip
# core dump files
core.*

View File

@ -59,6 +59,17 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
export BUILD_CAFFE2=OFF
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# TODO: Don't run this...
pip_install -r requirements.txt || true
@ -234,7 +245,7 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$CUSTOM_OP_BUILD"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
@ -246,7 +257,7 @@ else
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$JIT_HOOK_BUILD"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
@ -257,7 +268,7 @@ else
CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
python --version
mkdir "$CUSTOM_BACKEND_BUILD"
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1

View File

@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then
# Save sccache logs to file
sccache --stop-server || true
rm ~/sccache_error.log || true
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
# sccache --start-server seems to hang forever on self hosted runners for GHA
# so let's just go ahead and skip the --start-server altogether since it seems
# as though sccache still gets used even when the sccache server isn't started
# explicitly
echo "Skipping sccache server initialization, setting environment variables"
export SCCACHE_IDLE_TIMEOUT=1200
export SCCACHE_ERROR_LOG=~/sccache_error.log
export RUST_LOG=sccache::server=error
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
else
# increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:

View File

@ -26,7 +26,7 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
retry bash "${WORKSPACE_DIR}"/miniconda3.sh -b -p "${WORKSPACE_DIR}"/miniconda3
fi
export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
# shellcheck disable=SC1090
# shellcheck disable=SC1091
source "${WORKSPACE_DIR}"/miniconda3/bin/activate
retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses pip
# The torch.hub tests make requests to GitHub.

View File

@ -51,7 +51,11 @@ test_python_all() {
export GLOO_SOCKET_IFNAME=lo0
echo "Ninja version: $(ninja --version)"
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi

View File

@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then
export ATEN_CPU_CAPABILITY=avx
fi
if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from GitHub Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi

View File

@ -42,12 +42,16 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from GitHub Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM="${TMP_DIR}/determine_from"
file_diff_from_base "$DETERMINE_FROM"
fi
if [[ "${CIRCLE_JOB}" == *11* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi

View File

@ -10,6 +10,7 @@
- [Unit testing](#unit-testing)
- [Python Unit Testing](#python-unit-testing)
- [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest)
- [Local linting](#local-linting)
- [Running `mypy`](#running-mypy)
- [C++ Unit Testing](#c-unit-testing)
- [Writing documentation](#writing-documentation)
@ -357,13 +358,44 @@ The above is an example of testing a change to all Loss functions: this
command runs tests such as `TestNN.test_BCELoss` and
`TestNN.test_MSELoss` and can be useful to save keystrokes.
### Local linting
You can run the same linting steps that are used in CI locally via `make`:
```bash
make lint -j 6 # run lint (using 6 parallel jobs)
```
These jobs may require extra dependencies that aren't dependencies of PyTorch
itself, so you can install them via this command, which you should only have to
run once:
```bash
make setup_lint
```
To run a specific linting step, use one of these targets or see the
[`Makefile`](Makefile) for a complete list of options.
```bash
# Check for tabs, trailing newlines, etc.
make quick_checks
make flake8
make mypy
make cmakelint
```
### Running `mypy`
`mypy` is an optional static type checker for Python. We have multiple `mypy`
configs for the PyTorch codebase, so you can run them all using this command:
```bash
for CONFIG in mypy*.ini; do mypy --config="$CONFIG"; done
make mypy
```
See [Guide for adding type annotations to

View File

@ -1,6 +1,7 @@
# PyTorch Glossary
- [PyTorch Glossary](#pytorch-glossary)
<!-- toc -->
- [Operation and Kernel](#operation-and-kernel)
- [ATen](#aten)
- [Operation](#operation)
@ -19,6 +20,8 @@
- [Tracing](#tracing)
- [Scripting](#scripting)
<!-- tocstop -->
# Operation and Kernel
## ATen

View File

@ -14,8 +14,63 @@ ios:
clean: # This will remove ALL build folders.
@rm -r build*/
@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
linecount:
@cloc --read-lang-def=caffe.cloc caffe2 || \
echo "Cloc is not available on the machine. You can install cloc with " && \
echo " sudo apt-get install cloc"
SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha
shellcheck-gha:
@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER)
tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER)
generate-gha-workflows:
./.github/scripts/generate_linux_ci_workflows.py
$(MAKE) shellcheck-gha
setup_lint:
python tools/actions_local_runner.py --file .github/workflows/lint.yml \
--job 'flake8-py3' --step 'Install dependencies'
python tools/actions_local_runner.py --file .github/workflows/lint.yml \
--job 'cmakelint' --step 'Install dependencies'
pip install jinja2
quick_checks:
# TODO: This is broken when 'git config submodule.recurse' is 'true'
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'quick-checks' \
--step 'Ensure no trailing spaces' \
--step 'Ensure no tabs' \
--step 'Ensure no non-breaking spaces' \
--step 'Ensure canonical include' \
--step 'Ensure no unqualified noqa' \
--step 'Ensure no direct cub include' \
--step 'Ensure correct trailing newlines'
flake8:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'flake8-py3' \
--step 'Run flake8'
mypy:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'mypy' \
--step 'Run mypy'
cmakelint:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'cmakelint' \
--step 'Run cmakelint'
clang_tidy:
echo "clang-tidy local lint is not yet implemented"
exit 1
lint: flake8 mypy quick_checks cmakelint generate-gha-workflows

View File

@ -201,8 +201,7 @@ After that, you can use libtorch C++ API from your native code.
namespace pytorch_testapp_jni {
namespace {
struct JITCallGuard {
torch::autograd::AutoGradMode no_autograd_guard{false};
torch::AutoNonVariableTypeMode non_var_guard{true};
c10::InferenceMode guard;
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
};
}

View File

@ -26,14 +26,8 @@ namespace pytorch_jni {
namespace {
struct JITCallGuard {
// AutoGrad is disabled for mobile by default.
torch::autograd::AutoGradMode no_autograd_guard{false};
// VariableType dispatch is not included in default mobile build. We need set
// this guard globally to avoid dispatch error (only for dynamic dispatch).
// Thanks to the unification of Variable class and Tensor class it's no longer
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
// always set NonVariableTypeMode for inference only use case.
torch::AutoNonVariableTypeMode non_var_guard{true};
// Inference only workload.
c10::InferenceMode guard;
// Disable graph optimizer to ensure list of unused ops are not changed for
// custom mobile build.
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};

View File

@ -17,14 +17,8 @@ namespace pytorch_jni {
namespace {
struct LiteJITCallGuard {
// VariableType dispatch is not included in default mobile build. We need set
// this guard globally to avoid dispatch error (only for dynamic dispatch).
// Thanks to the unification of Variable class and Tensor class it's no longer
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
// always set NonVariableTypeMode for inference only use case.
// TODO: avoid having to set this guard for custom mobile build with mobile
// interpreter.
torch::AutoNonVariableTypeMode non_var_guard{true};
// Inference only workload.
c10::InferenceMode guard;
};
} // namespace

View File

@ -24,8 +24,7 @@ void log(const char* m, T t) {
}
struct JITCallGuard {
torch::autograd::AutoGradMode no_autograd_guard{false};
torch::AutoNonVariableTypeMode non_var_guard{true};
c10::InferenceMode guard;
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
};
} // namespace
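These JITCallGuard changes all swap the older `AutoGradMode(false)` plus `AutoNonVariableTypeMode` pair for a single `c10::InferenceMode` guard. For reference, recent PyTorch releases expose the same inference-only guard to Python as `torch.inference_mode()`; a small usage sketch (not part of this commit):

```python
# Illustrative only; assumes a PyTorch build that ships torch.inference_mode().
import torch

model = torch.nn.Linear(4, 2)
with torch.inference_mode():           # no autograd tracking for this block
    out = model(torch.randn(1, 4))
print(out.requires_grad)               # False
```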

View File

@ -40,6 +40,10 @@ TORCH_LIBRARY_IMPL(_, AutogradCPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradXPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradCUDA, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}

View File

@ -202,6 +202,17 @@ public:
}
return mask;
}
Vec256<T> isnan() const {
Vec256<T> vec;
for (int64_t i = 0; i != size(); i++) {
if (_isnan(values[i])) {
std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));
} else {
std::memset(static_cast<void*>(vec.values + i), 0, sizeof(T));
}
}
return vec;
}
Vec256<T> map(T (*f)(T)) const {
Vec256<T> ret;
for (int64_t i = 0; i != size(); i++) {

View File

@ -96,6 +96,9 @@ public:
__m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ);
return _mm256_movemask_pd(cmp);
}
Vec256<double> isnan() const {
return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
}
Vec256<double> map(double (*f)(double)) const {
__at_align32__ double tmp[size()];
store(tmp);

View File

@ -103,6 +103,9 @@ public:
__m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
return _mm256_movemask_ps(cmp);
}
Vec256<float> isnan() const {
return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
}
Vec256<float> map(float (*f)(float)) const {
__at_align32__ float tmp[size()];
store(tmp);
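The new `isnan()` members all build a per-lane mask that is all-ones where the element is NaN and all-zeros elsewhere; the AVX versions get this from `_CMP_UNORD_Q`, an "unordered" compare that is true iff at least one operand is NaN (comparing against 0.0, which is never NaN, therefore flags exactly the NaN lanes). A NumPy sketch of the intended mask semantics, purely illustrative:

```python
# Not part of the commit; shows the lane mask the vectorized isnan() produces.
import numpy as np

x = np.array([1.0, np.nan, -0.0, np.inf], dtype=np.float32)
mask = np.isnan(x)                                    # [False, True, False, False]
lanes = np.where(mask, np.uint32(0xFFFFFFFF), np.uint32(0))
print(mask, [hex(v) for v in lanes])                  # ['0x0', '0xffffffff', '0x0', '0x0']
```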

View File

@ -283,6 +283,19 @@ public:
}
return mask;
}
Vec256<float> isnan() const {
__at_align32__ float tmp[size()];
__at_align32__ float res[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
if (_isnan(tmp[i])) {
std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
} else {
std::memset(static_cast<void*>(&res[i]), 0, sizeof(float));
}
}
return loadu(res);
};
Vec256<float> map(float (*f)(float)) const {
__at_align32__ float tmp[size()];
store(tmp);

View File

@ -5,6 +5,55 @@
namespace at {
namespace meta {
TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_size) {
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
"but input has sizes ",
input.sizes(),
" with dimension ",
i,
" being "
"empty");
}
TORCH_CHECK(
(input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(
output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
if (input.ndimension() == 4) {
sizeB = input.size(0);
dimH++;
}
sizeD = input.size(dimH - 1);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3) {
set_output(0, {sizeD, osizeH, osizeW}, input.options());
/* indices will contain i,j locations for each output point */
set_output(1, {sizeD, osizeH, osizeW}, input.options().dtype(kLong));
} else {
set_output(0, {sizeB, sizeD, osizeH, osizeW}, input.options());
/* indices will contain i,j locations for each output point */
set_output(1, {sizeB, sizeD, osizeH, osizeW}, input.options().dtype(kLong));
}
}
} // namespace meta
namespace native {
namespace {
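The new `TORCH_META_FUNC(adaptive_max_pool2d)` above moves the shape checks and output allocation into a meta function: for a 3D input it sizes both outputs to `(C, oH, oW)`, for a 4D input to `(N, C, oH, oW)`, with the indices tensor allocated as `kLong`. A usage sketch against the public Python API (illustrative; assumes a build containing this kernel):

```python
# The indices output records the argmax locations, hence the int64 dtype.
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 9, 9)                                   # (N, C, H, W)
out, idx = F.adaptive_max_pool2d(x, (4, 4), return_indices=True)
print(out.shape, idx.shape, idx.dtype)                        # [2, 3, 4, 4] twice, torch.int64
```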
@ -115,102 +164,6 @@ static void adaptive_max_pool2d_out_frame(
});
}
void adaptive_max_pool2d_out_cpu_template(
Tensor& output,
Tensor& indices,
const Tensor& input,
IntArrayRef output_size)
{
int dimW = 2;
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
int64_t istrideD = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
int64_t istrideB = 0;
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
if (input.ndimension() == 4)
{
istrideB = input.stride(0);
sizeB = input.size(0);
dimW++;
dimH++;
}
/* sizes */
sizeD = input.size(dimH-1);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimH-1);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3)
{
output.resize_({sizeD, osizeH, osizeW});
/* indices will contain i,j locations for each output point */
indices.resize_({sizeD, osizeH, osizeW});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_single_out_frame<scalar_t>(input_data, output_data,
indices_data,
sizeD,
isizeH, isizeW,
osizeH, osizeW,
istrideD,
istrideH, istrideW);
}
);
}
else
{
output.resize_({sizeB, sizeD, osizeH, osizeW});
/* indices will contain i,j locations for each output point */
indices.resize_({sizeB, sizeD, osizeH, osizeW});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_out_frame<scalar_t>(input_data, output_data,
indices_data,
sizeB, sizeD,
isizeH, isizeW,
osizeH, osizeW,
istrideB, istrideD,
istrideH, istrideW);
}
);
}
}
template <typename scalar_t>
static void adaptive_max_pool2d_backward_single_out_frame(
scalar_t *gradInput_p,
@ -346,31 +299,83 @@ Tensor& adaptive_max_pool2d_backward_out_cpu_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cpu(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool2d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
int dimW = 2;
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
std::tuple<Tensor, Tensor> adaptive_max_pool2d_cpu(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool2d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
int64_t istrideD = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
int64_t istrideB = 0;
if (input.ndimension() == 4) {
istrideB = input.stride(0);
sizeB = input.size(0);
dimW++;
dimH++;
}
/* sizes */
sizeD = input.size(dimH - 1);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimH - 1);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3) {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_single_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeB,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW,
istrideB,
istrideD,
istrideH,
istrideW);
});
}
}
Tensor& adaptive_max_pool2d_backward_out_cpu(const Tensor& gradOutput_,

View File

@ -5,6 +5,57 @@
namespace at {
namespace meta {
TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_size) {
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool3d: expected input to have non-empty spatial dimensions, "
"but input has sizes ",
input.sizes(),
" with dimension ",
i,
" being "
"empty");
}
TORCH_CHECK(
(input.ndimension() == 4 || input.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
TORCH_CHECK(
output_size.size() == 3,
"adaptive_max_pool3d: internal error: output_size.size() must be 3");
int dimD = 0;
int64_t sizeB = 1;
int64_t sizeD = 0;
if (input.ndimension() == 5) {
sizeB = input.size(0);
dimD++;
}
/* sizes */
sizeD = input.size(dimD);
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
/* resize output */
if (input.ndimension() == 4) {
set_output(0, {sizeD, osizeT, osizeH, osizeW}, input.options());
/* indices will contain max input locations for each output point */
set_output(1, {sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
} else {
set_output(0, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options());
/* indices will contain max input locations for each output point */
set_output(1, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
}
}
} // namespace meta
namespace native {
namespace {
@ -393,31 +444,97 @@ Tensor& adaptive_max_pool3d_backward_out_cpu_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cpu(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool3d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
int dimD = 0;
int dimT = 1;
int dimH = 2;
int dimW = 3;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeT = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
std::tuple<Tensor, Tensor> adaptive_max_pool3d_cpu(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool3d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
int64_t istrideB = 0;
int64_t istrideD = 0;
int64_t istrideT = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
if (input.ndimension() == 5) {
istrideB = input.stride(0);
sizeB = input.size(0);
dimD++;
dimT++;
dimH++;
dimW++;
}
/* sizes */
sizeD = input.size(dimD);
isizeT = input.size(dimT);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimD);
istrideT = input.stride(dimT);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
if (input.ndimension() == 4) {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool3d_single_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeD,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideD,
istrideT,
istrideH,
istrideW);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool3d_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeB,
sizeD,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideB,
istrideD,
istrideT,
istrideH,
istrideW);
});
}
}
Tensor& adaptive_max_pool3d_backward_out_cpu(const Tensor& gradOutput_,

View File

@ -918,8 +918,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.solve supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
bool is_batched_column_major = false;
if (vector_case) {
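The inline expression removed here is factored out into a `linalg_solve_is_vector_rhs` helper; the check itself is unchanged. Roughly, in Python-like terms (illustrative, not the actual helper):

```python
# other is a vector RHS when it is 1-D, or when it is a batch of vectors whose
# shape matches input.shape[:-1] (one dimension fewer than the batched matrices).
def is_vector_rhs(input, other) -> bool:
    expected_batched_rhs_shape = tuple(input.shape[:-1])  # input.shape[:-1]
    return other.dim() == 1 or (
        input.dim() - 1 == other.dim()
        and tuple(other.shape) == expected_batched_rhs_shape
    )
```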
@ -929,7 +928,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
}
// if 'other' is a batch of 2D tensors, then 'input' can be non-batched and will be broadcasted
auto expected_shape = expected_batched_rhs_shape;
auto expected_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
if (!vector_case && other.dim() > 2) {
expected_shape = other.sizes();
}
@ -1020,8 +1019,7 @@ Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& resul
// Now check LAPACK/MAGMA error codes
// batchCheckErrors(Tensor, char*) calls 'infos = infos.to(kCPU)'
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
if (vector_case ? result.dim() > 1 : result.dim() > 2) {
batchCheckErrors(infos, "linalg_solve");
} else {
@ -1606,9 +1604,8 @@ std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tens
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template<typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
std::vector<int64_t>& infos) {
template <typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n) {
#ifndef USE_LAPACK
AT_ERROR("qr: LAPACK library not found in compilation");
#else
@ -1627,6 +1624,7 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
int lwork = -1;
scalar_t wkopt;
lapackGeqrf<scalar_t>(m, n, self_data, m, tau_data, &wkopt, lwork, &info);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, self.options());
@ -1636,10 +1634,10 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
// now compute the actual R and TAU
lapackGeqrf<scalar_t>(m, n, self_working_ptr, m, tau_working_ptr, work.data_ptr<scalar_t>(), lwork, &info);
infos[i] = info;
if (info != 0) {
return;
}
// info from lapackGeqrf only reports if the i-th parameter is wrong
// so we don't need to check it all the time
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
}
#endif
}
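For context, geqrf is the Householder step behind QR; a quick sketch of the reduced factorization it feeds, assuming torch.linalg.qr is available:
import torch

a = torch.randn(4, 6, 5, dtype=torch.float64)     # batch of 6x5 matrices
q, r = torch.linalg.qr(a, mode='reduced')         # CPU path goes through geqrf/orgqr
print(q.shape, r.shape)                           # torch.Size([4, 6, 5]) torch.Size([4, 5, 5])
print(torch.allclose(q @ r, a))                   # expected: True up to rounding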
@ -1647,7 +1645,6 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string mode) {
bool compute_q, reduced;
std::tie(compute_q, reduced) = _parse_qr_mode(mode);
std::vector<int64_t> infos(batchCount(self), 0);
int64_t m = self.size(-2), n = self.size(-1);
// Setup inputs for apply_geqrf
@ -1682,13 +1679,8 @@ std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string
q_working_copy.narrow(-1, 0, n).copy_(self);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cpu", [&]{
apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n, infos);
apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "qr_cpu");
} else {
singleCheckErrors(infos[0], "qr_cpu");
}
R = q_working_copy.slice(-2, 0, n_columns_q).slice(-1, 0, n).triu();
if (!compute_q) {
@ -2977,204 +2969,378 @@ Tensor& _lstsq_helper_cpu(
#endif
}
std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
const Tensor& self, const Tensor& b,
c10::optional<double> cond,
c10::optional<std::string> driver) {
TORCH_CHECK(
self.device().type() == b.device().type(),
"torch.linalg.lstsq: input tensors should be on the same device"
);
TORCH_CHECK(
self.scalar_type() == b.scalar_type(),
"torch.linalg.lstsq: input tensors should be of the same dtype"
);
TORCH_CHECK(
self.dim() >= 2,
"torch.linalg.lstsq: input `self` Tensor should be at least 2D"
);
TORCH_CHECK(
b.dim() >= 1,
"torch.linalg.lstsq: input `b` Tensor should be at least 1D"
);
auto dim_diff = self.dim() - b.dim();
TORCH_CHECK(
0 <= dim_diff && dim_diff <= 1,
"torch.linalg.lstsq: self.dim() must be greater or equal to b.dim() and "
"(self.dim() - b.dim()) <= 1"
);
Tensor b_2d = dim_diff ? b.unsqueeze(-1) : b;
TORCH_CHECK(
self.size(-2) == b_2d.size(-2),
dim_diff ? "torch.linalg.lstsq: self.size(-2) should match b.size(-1)" :
"torch.linalg.lstsq: self.size(-2) should match b.size(-2)"
);
/*
Solves a least squares problem, that is, minimizes the squared Frobenius norm of |B - A X|.
// if `driver` is empty, we use `driver_opt` to be set to
// c10::nullopt if working with CUDA tensors,
Input args:
* 'input' - Tensor containing batches of m-by-n matrix A.
* 'other' - Tensor containing batches of max(m, n)-by-nrhs matrix B.
* 'cond' - relative tolerance for determining rank of A.
* 'driver' - the name of the LAPACK driver that is used to compute the solution.
Output args (modified in-place):
* 'solution' - Tensor to store the solution matrix X.
* 'residuals' - Tensor to store values of the residual sum of squares for each column of the solution.
* 'rank' - Tensor to store the rank of A.
* 'singular_values' - Tensor to store the singular values of A.
* 'infos' - Tensor to store error codes of linear algebra math library.
For further details, please see the LAPACK documentation for GELS/GELSY/GELSS/GELSD routines.
*/
static void linalg_lstsq_out_info(
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values,
Tensor& infos,
const Tensor& input,
const Tensor& other,
double rcond,
std::string& driver) {
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level of
// the hierarchy of calls
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(other.dim() >= 1);
auto dim_diff = input.dim() - other.dim();
TORCH_INTERNAL_ASSERT(0 <= dim_diff && dim_diff <= 1);
TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == other.device());
TORCH_INTERNAL_ASSERT(solution.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(solution.device() == input.device());
TORCH_INTERNAL_ASSERT(residuals.device() == input.device());
TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong);
TORCH_INTERNAL_ASSERT(rank.device() == input.device());
auto real_dtype = toValueType(input.scalar_type());
TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype);
TORCH_INTERNAL_ASSERT(singular_values.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// we need to unsqueeze 'other' because 2-dimensional tensors are expected in the implementation
Tensor other_2d = vector_case ? other.unsqueeze(-1) : other;
TORCH_INTERNAL_ASSERT(input.size(-2) == other_2d.size(-2));
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions to store raw residuals
// so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
auto m = input.size(-2);
auto n = input.size(-1);
auto nrhs = other.size(-1);
expected_solution_shape.push_back(std::max(m, n));
if (!vector_case) {
expected_solution_shape.push_back(nrhs);
}
// if 'solution' has no elements we can modify it
if (solution.numel() == 0) {
if (vector_case) {
solution.resize_(expected_solution_shape, MemoryFormat::Contiguous);
} else {
auto shape_transposed = expected_solution_shape;
std::swap(shape_transposed.end()[-1], shape_transposed.end()[-2]);
solution.resize_(shape_transposed, MemoryFormat::Contiguous);
solution.transpose_(-2, -1);
}
}
// if 'solution' is non-empty it must have the expected shape
TORCH_INTERNAL_ASSERT(solution.sizes().equals(expected_solution_shape));
// 'solution' must be in batched column major order (Fortran contiguous) for 2D inputs
// or C contiguous for 1D input
if (vector_case) {
TORCH_INTERNAL_ASSERT(solution.is_contiguous());
} else {
TORCH_INTERNAL_ASSERT(solution.transpose(-2, -1).is_contiguous());
}
// for 1-dimensional 'other', we need to unsqueeze the 'solution' before passing to "apply_solve"
if (vector_case) {
solution = solution.unsqueeze_(-1);
}
// _linalg_lstsq_helper_ performs calculations in-place and 'solution' must be a copy of other_2d
solution.narrow(-2, 0, other_2d.size(-2)).copy_(other_2d);
// if 'rank' is empty we might resize it
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
if (rank.numel() == 0 && driver != "gels") { // gels driver doesn't set 'rank'
rank.resize_(input_batch_shape, MemoryFormat::Contiguous);
}
// if 'rank' is non-empty it must have the expected shape and be contiguous
if (driver != "gels") {
TORCH_INTERNAL_ASSERT(rank.sizes().equals(input_batch_shape));
TORCH_INTERNAL_ASSERT(rank.is_contiguous());
}
// if 'singular_values' is empty we might resize it
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(m, n));
if (singular_values.numel() == 0 && (driver == "gelsd" || driver == "gelss")) {
singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
}
// if 'singular_values' is non-empty it must have the expected shape and be contiguous
if (driver == "gelsd" || driver == "gelss") {
TORCH_INTERNAL_ASSERT(singular_values.sizes().equals(singular_values_shape));
TORCH_INTERNAL_ASSERT(singular_values.is_contiguous());
}
// 'input' is modified in-place so we need a column-major copy
auto input_working_copy = copyBatchedColumnMajor(input);
// now the actual call that computes the result in-place (apply_lstsq)
at::_lstsq_helper_(solution, rank, singular_values, infos, input_working_copy, rcond, driver);
if (m > n && driver != "gelsy") {
// LAPACK stores residuals data for postprocessing in rows n:m (the m - n trailing rows)
auto raw_residuals = solution.narrow(/*dim=*/-2, /*start=*/n, /*length*/m - n);
if (raw_residuals.is_complex()) {
raw_residuals.mul_(raw_residuals.conj());
raw_residuals = at::real(raw_residuals);
} else {
raw_residuals.pow_(2);
}
at::sum_out(residuals, raw_residuals, /*dim=*/-2, /*keepdim=*/false, /*dtype*/real_dtype);
}
solution = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n);
if (m == 0) {
solution.zero_();
}
// for 1-dimensional 'other', we need to squeeze the solution after "apply_lstsq"
if (vector_case) {
solution = solution.squeeze_(-1);
}
}
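A Python-level sketch of this contract, assuming torch.linalg.lstsq is exposed: for an overdetermined, full-rank system solved with a driver other than 'gelsy', the residuals computed by the postprocessing above are the squared column norms of B - A X.
import torch

a = torch.randn(5, 3, dtype=torch.float64)             # m > n, full rank with probability 1
b = torch.randn(5, 2, dtype=torch.float64)
res = torch.linalg.lstsq(a, b, driver='gelsd')
x = res.solution                                        # shape (3, 2), i.e. (*, n, nrhs)
print(torch.allclose(res.residuals, (b - a @ x).pow(2).sum(dim=0)))   # expected: True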
static std::string get_default_lstsq_driver(c10::optional<std::string> driver, const Tensor& input) {
// if `driver` is empty, we set driver_str to "gels" when working with CUDA tensors,
// otherwise to the "gelsy" driver.
// CUDA tensors are treated specially because MAGMA
// supports only the 'gels' driver.
c10::optional<std::string> driver_opt = driver;
std::string driver_str;
// check whether the user provided name is a valid driver name
if (driver.has_value()) {
auto driver_str = driver.value();
driver_str = driver.value();
// convert `driver_str` to lower case inplace.
std::transform(driver_str.begin(), driver_str.end(), driver_str.begin(),
[](unsigned char c) { return std::tolower(c); });
static std::unordered_set<std::string> allowed_drivers = {
"gels", "gelsy", "gelsd", "gelss"
};
if (at::kCPU == self.device().type()) {
if (input.device() == at::kCPU) {
TORCH_CHECK(
allowed_drivers.find(driver_str) != allowed_drivers.end(),
"torch.linalg.lstsq: parameter `driver` should be one of "
"(gels, gelsy, gelsd, gelss)"
);
}
//else if (at::kCUDA == self.device().type()) {
else {
} else { // else if (input.is_cuda())
TORCH_CHECK(
driver_str == "gels",
"torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
);
}
} else {
// if driver name is not provided, set to default 'gelsy' if on CPU,
// or to `gels` if on CUDA.
driver_str = input.is_cuda() ? "gels" : "gelsy";
}
// if driver name is not provided, set to default 'gelsy' if on CPU,
// or to `gels` if on CUDA.
else {
driver_opt = (at::kCPU == self.device().type())
? c10::optional<std::string>("gelsy")
: c10::optional<std::string>("gels");
return driver_str;
}
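A sketch of the driver policy, assuming torch.linalg.lstsq: on CPU the default is 'gelsy' and any of gels/gelsy/gelsd/gelss may be requested, while on CUDA only 'gels' is accepted.
import torch

a = torch.randn(6, 4, dtype=torch.float64)
b = torch.randn(6, 3, dtype=torch.float64)
torch.linalg.lstsq(a, b)                           # CPU default driver: 'gelsy'
torch.linalg.lstsq(a, b, driver='gelss')           # any of the four LAPACK drivers works on CPU
if torch.cuda.is_available():
    torch.linalg.lstsq(a.cuda(), b.cuda(), driver='gels')   # the only driver accepted on CUDA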
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> linalg_lstsq_out(
const Tensor& input,
const Tensor& other,
c10::optional<double> rcond,
c10::optional<std::string> driver,
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values) {
TORCH_CHECK(input.dim() >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions.");
TORCH_CHECK(other.dim() >= 1, "torch.linalg.lstsq: other must have at least 1 dimension.");
TORCH_CHECK(
input.scalar_type() == other.scalar_type(),
"torch.linalg.lstsq: Expected input and other to have the same dtype, but got input's dtype ",
input.scalar_type(),
" and other's dtype ",
other.scalar_type());
auto dim_diff = input.dim() - other.dim();
TORCH_CHECK(
0 <= dim_diff && dim_diff <= 1,
"torch.linalg.lstsq: input.dim() must be greater or equal to other.dim() and (input.dim() - other.dim()) <= 1");
Tensor other_2d = dim_diff ? other.unsqueeze(-1) : other;
TORCH_CHECK(
input.size(-2) == other_2d.size(-2),
dim_diff ? "torch.linalg.lstsq: input.size(-2) should match other.size(-1)"
: "torch.linalg.lstsq: input.size(-2) should match other.size(-2)");
checkSameDevice("torch.linalg.lstsq", other, input, "other");
checkSameDevice("torch.linalg.lstsq", solution, input, "solution");
checkSameDevice("torch.linalg.lstsq", residuals, input, "residuals");
checkSameDevice("torch.linalg.lstsq", rank, input, "rank");
checkSameDevice("torch.linalg.lstsq", singular_values, input, "singular_values");
// 'solution' is expected to have same dtype as input
checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution");
// 'residuals' is expected to have real float dtype
ScalarType real_dtype = c10::toValueType(input.scalar_type());
checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution");
// 'rank' is expected to have integer dtype
// actual LAPACK calls use int32_t type for rank, but we promote it to int64_t
// to be consistent with torch.linalg.matrix_rank output dtype
ScalarType rank_expected_type = ScalarType::Long;
checkLinalgCompatibleDtype("torch.linalg.lstsq", rank.scalar_type(), rank_expected_type, "rank");
// 'singular_values' is expected to have real float dtype
checkLinalgCompatibleDtype("torch.linalg.lstsq", singular_values.scalar_type(), real_dtype, "singular_values");
std::string driver_name = get_default_lstsq_driver(driver, input);
// set default rcond value
// TODO: Change this to match non-legacy NumPy behaviour
double rcond_value = rcond.has_value() && (rcond.value() > 0)
? rcond.value()
: _get_epsilon(c10::toValueType(input.scalar_type()));
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));
// now check whether the provided output tensors can be used directly
// Two types of 'other' tensors are supported:
// - 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.lstsq supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
// auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
// bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// provided output tensor can be used directly if:
// 1. the shape matches the expected shape
// 2. the dtype matches the expected dtype
// 3. the tensor is contiguous
// Checks for the 'solution' tensor
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
expected_solution_shape.push_back(std::max(input.size(-1), input.size(-2)));
if (!vector_case && other.dim() > 2) {
expected_solution_shape.push_back(other.size(-1));
}
// CUDA has only `gels` driver now which ONLY works with overdetermined systems
if (at::kCUDA == self.device().type()) {
TORCH_CHECK(
self.size(-2) >= self.size(-1),
"torch.linalg.lstsq: only overdetermined systems (m >= n) are allowed on CUDA"
);
bool solution_equal_expected_shape = solution.sizes().equals(expected_solution_shape);
bool solution_input_same_type = (solution.scalar_type() == input.scalar_type());
bool is_solution_batched_column_major = false;
if (vector_case) {
is_solution_batched_column_major = solution.is_contiguous();
} else if (!vector_case && solution.dim() >= 2) {
is_solution_batched_column_major = solution.transpose(-2, -1).is_contiguous();
}
// LAPACK/MAGMA requires inputs to be in column-major order.
auto self_working_copy = copyBatchedColumnMajor(self);
// 'residuals' is not checked here because at::sum_out(residuals, ...) does that
// Tensor b must be of size (..., max(m, n), nrhs)
// and in the column-major order.
// We allow the batch dims of `self` to broadcast over the batch
// dims of `b` so that it is possible to solve multiple systems
// with the same lhs (encoded by `self`) / rhs (encoded by `b`).
// `b_working_copy` is modified in-place and the combination of
// batch broadcasting plus LAPACK/MAGMA requirements impose the following
// restrictions on sizes/strides of `b`:
// 1. b.size = (broadcasted_batch_size(self, b), max(m, n), nrhs).
// 2. b.stride should correspond to an almost contiguous Tensor in the column-major-order,
// i.e. b.stride = b.transpose(-2, -1).contiguous().transpose(-2, -1).strides()
auto m = self.size(-2);
auto n = self.size(-1);
auto b_working_copy = copyBatchedColumnMajor(b_2d,
/*nrows=*/std::max(m, n),
/*desired_batch_sizes=*/broadcast_batch_size(self, b_2d, self.dim() - 2));
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
double rcond = cond.has_value() && (cond.value() > 0)
? cond.value()
: _get_epsilon(c10::toValueType(self.scalar_type()));
auto batch_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend() - 2);
Tensor rank = at::empty({0}, self.options().dtype(at::kLong));
if (driver_opt.value() != "gels") {
rank.resize_(batch_shape, MemoryFormat::Contiguous);
// Checks for the 'rank' tensor
// rank is a scalar value for each matrix in the batch so
// rank's expected shape is equal to input.shape[0:input.ndim-2]
bool rank_equal_expected_shape = true;
bool rank_equal_expected_type = true;
bool rank_is_contiguous = true;
if (driver_name != "gels") { // gels driver doesn't set 'rank'
rank_equal_expected_shape = rank.sizes().equals(input_batch_shape);
rank_equal_expected_type = (rank.scalar_type() == at::kLong);
rank_is_contiguous = rank.is_contiguous();
}
auto singular_values_shape = batch_shape.vec();
singular_values_shape.push_back(std::min(m, n));
auto real_dtype = c10::toValueType(self.scalar_type());
Tensor singular_values = at::empty({0}, self.options().dtype(real_dtype));
if (driver_opt.value() == "gelsd" || driver_opt.value() == "gelss") {
singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
// Checks for the 'singular_values' tensor
// singular values are computed only with "gelsd" and "gelss" drivers currently
bool singular_values_equal_expected_shape = true;
bool singular_values_equal_expected_type = true;
bool singular_values_is_contiguous = true;
if (driver_name == "gelsd" || driver_name == "gelss") {
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(input.size(-1), input.size(-2)));
singular_values_equal_expected_shape = singular_values.sizes().equals(singular_values_shape);
singular_values_equal_expected_type = (singular_values.scalar_type() == real_dtype);
singular_values_is_contiguous = singular_values.is_contiguous();
}
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt).device(kCPU));
// if solution is not empty and not in batched column major format
bool copy_needed = (solution.numel() != 0 && !is_solution_batched_column_major);
copy_needed |= !solution_input_same_type; // or solution does not have the same dtype as input
copy_needed |= (solution.numel() != 0 && !solution_equal_expected_shape); // or solution does not have the expected shape
Tensor x, residuals;
copy_needed |= !rank_equal_expected_type;
copy_needed |= (rank.numel() != 0 && !rank_equal_expected_shape);
copy_needed |= (rank.numel() != 0 && !rank_is_contiguous);
// path if neither `self` nor `b` is empty
if (self.numel() && b.numel()) {
x = at::_lstsq_helper_(b_working_copy, rank, singular_values, infos, self_working_copy, rcond, driver_opt.value());
if (m > n && driver_opt.value() != "gelsy") {
residuals = x.narrow(-2, n, std::max(m, n) - n).abs().pow_(2).sum(-2);
}
x = x.narrow(-2, 0, n);
}
// if either `self` or `b` is empty, return an empty tensor or,
// if non-zero sizes, return a tensor of zeros.
else {
x = b_working_copy.zero_().narrow(-2, 0, n);
copy_needed |= !singular_values_equal_expected_type;
copy_needed |= (singular_values.numel() != 0 && !singular_values_equal_expected_shape);
copy_needed |= (singular_values.numel() != 0 && !singular_values_is_contiguous);
if (copy_needed) { // we have to allocate temporary tensors
Tensor solution_tmp = at::empty({0}, input.options());
Tensor residuals_tmp = at::empty({0}, input.options().dtype(real_dtype));
Tensor rank_tmp = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values_tmp = at::empty({0}, input.options().dtype(real_dtype));
linalg_lstsq_out_info(solution_tmp, residuals_tmp, rank_tmp, singular_values_tmp, infos, input, other, rcond_value, driver_name);
at::native::resize_output(solution, solution_tmp.sizes());
solution.copy_(solution_tmp);
at::native::resize_output(residuals, residuals_tmp.sizes());
residuals.copy_(residuals_tmp);
at::native::resize_output(rank, rank_tmp.sizes());
rank.copy_(rank_tmp);
at::native::resize_output(singular_values, singular_values_tmp.sizes());
singular_values.copy_(singular_values_tmp);
} else {
// else use the provided output storage directly
linalg_lstsq_out_info(solution, residuals, rank, singular_values, infos, input, other, rcond_value, driver_name);
}
auto return_empty_if_undefined = [&self](Tensor& t,
c10::optional<at::ScalarType> dtype = c10::nullopt,
c10::optional<std::vector<int64_t>> shape = c10::nullopt) {
if (t.defined()) {
return t;
}
else {
auto output_dtype = dtype.has_value() ? dtype.value() : self.scalar_type();
if (shape.has_value()) {
return at::empty(shape.value(), self.options().dtype(output_dtype));
}
else {
return at::empty({0}, self.options().dtype(output_dtype));
}
}
};
// Some outputs stay undefined for some values of driver.
// Instead of returning undefined tensors which get exposed as
// Nones in the Python interface, we return empty tensors.
// This way we follow the convention of output types in the
// torch.linalg namespace.
// NOTE: we run drivers only if both inputs are non-empty!
// Hence the code below explicitly handles each and every output
// if `self` is empty.
// NumPy and SciPy always return ranks for empty matrices,
// even for drivers which are not rank-revealing.
if (self.numel()) {
rank = return_empty_if_undefined(rank, at::kLong);
}
else {
rank = at::zeros(batch_shape, self.options().dtype(at::kLong));
}
// undefined residuals could only be an empty Tensor of shape (0)
residuals = return_empty_if_undefined(residuals);
if (!self.numel()
&& (driver_opt.value() == "gelss" || driver_opt.value() == "gelsd")) {
// when `self` is empty, return singular_values of shape
// (*self.shape[:-2], 0) only if driver is in ('gelss', 'gelsd')
auto singular_values_empty_shape = batch_shape.vec();
singular_values_empty_shape.push_back(0);
singular_values = return_empty_if_undefined(
singular_values,
at::toValueType(self.scalar_type()),
singular_values_empty_shape);
}
else {
// otherwise return an empty tensor of shape (0)
singular_values = return_empty_if_undefined(
singular_values,
at::toValueType(self.scalar_type()));
}
if (self.dim() > 2) {
if (infos.numel() > 1) {
batchCheckErrors(infos, "torch.linalg.lstsq");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.lstsq");
singleCheckErrors(infos.item<int64_t>(), "torch.linalg.lstsq");
}
return std::make_tuple(x, residuals, rank, singular_values);
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(solution, residuals, rank, singular_values);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
const Tensor& input, const Tensor& other,
c10::optional<double> rcond,
c10::optional<std::string> driver) {
Tensor solution = at::empty({0}, input.options());
Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
Tensor rank = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
std::tie(solution, residuals, rank, singular_values) =
at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values);
return std::make_tuple(solution, residuals, rank, singular_values);
}
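A sketch of how rcond interacts with rank for the rank-revealing drivers, assuming torch.linalg.lstsq with 'gelsd': when no positive rcond is given, the machine epsilon of the value type is used as the relative cutoff below which singular values are dropped.
import torch

c = torch.randn(6, 2, dtype=torch.float64)
a = torch.cat([c, c], dim=-1)                      # 6x4 matrix of rank 2
b = torch.randn(6, 3, dtype=torch.float64)
res = torch.linalg.lstsq(a, b, rcond=1e-8, driver='gelsd')
print(res.rank)               # expected tensor(2): singular values below rcond * s_max are treated as zero
print(res.singular_values)    # min(m, n) = 4 values; the last two are numerically ~0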
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,9 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/Pool.h>
#include <tuple>
namespace at {
@ -11,117 +9,10 @@ namespace native {
namespace {
template <typename scalar_t>
static void max_pool2d_with_indices_single_out_frame(
scalar_t *input_p,
scalar_t *output_p,
int64_t *ind_p,
int64_t nslices,
int64_t iwidth,
int64_t iheight,
int64_t owidth,
int64_t oheight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH
)
{
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
/* loop over output */
int64_t i, j;
scalar_t *ip = input_p + k*iwidth*iheight;
for(i = 0; i < oheight; i++)
{
for(j = 0; j < owidth; j++)
{
int64_t hstart = i * dH - padH;
int64_t wstart = j * dW - padW;
int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight);
int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth);
while(hstart < 0)
hstart += dilationH;
while(wstart < 0)
wstart += dilationW;
/* local pointers */
scalar_t *op = output_p + k*owidth*oheight + i*owidth + j;
int64_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
int64_t maxindex = hstart*iwidth + wstart;
scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
for(int64_t y = hstart; y < hend; y += dilationH)
{
for(int64_t x = wstart; x < wend; x += dilationW)
{
int64_t tcntr = y*iwidth + x;
scalar_t val = *(ip + tcntr);
if ((val > maxval) || std::isnan(val))
{
maxval = val;
maxindex = tcntr;
}
}
}
/* set output to local max */
*op = maxval;
/* store location of max */
*indp = maxindex;
}
}
}
});
}
template <typename scalar_t>
static void max_pool2d_with_indices_out_frame(
scalar_t *input_data,
scalar_t *output_data,
int64_t *indices_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH)
{
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
for (auto p = start; p < end; p++) {
max_pool2d_with_indices_single_out_frame(
input_data+p*nInputPlane*inputWidth*inputHeight,
output_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH);
}
});
}
void max_pool2d_with_indices_out_cpu_template(
Tensor& output,
Tensor& indices,
const Tensor& input_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
@ -152,152 +43,50 @@ void max_pool2d_with_indices_out_cpu_template(
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(input.dtype() == output.dtype(),
"expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());
/* sizes */
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3);
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);
const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, ceil_mode);
const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, ceil_mode);
pool2d_shape_check(
input_,
input,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth, input_.suggest_memory_format());
outputHeight, outputWidth, input.suggest_memory_format());
/* get contiguous input */
Tensor input = input_.contiguous();
/* resize output */
if (input.ndimension() == 3)
{
/* resize output and indices */
if (input.ndimension() == 3) {
output.resize_({nInputPlane, outputHeight, outputWidth});
/* indices will contain the locations for each output point */
indices.resize_({nInputPlane, outputHeight, outputWidth});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_cpu",
[&] {
/* get raw pointers */
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_single_out_frame(
input_data, output_data,
indices_data,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH);
}
);
}
else
{
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
} else {
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
/* indices will contain the locations for each output point */
indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_cpu",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_out_frame(
input_data,
output_data,
indices_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH); }
);
indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
}
}
template <typename scalar_t>
static void max_pool2d_with_indices_backward_single_out_frame(
scalar_t *gradInput_p,
scalar_t *gradOutput_p,
int64_t *ind_p,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int dW,
int dH)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
int64_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
/* calculate max points */
int64_t i, j;
for(i = 0; i < outputHeight; i++)
{
for(j = 0; j < outputWidth; j++)
{
/* retrieve position of max */
int64_t maxp = ind_p_k[i*outputWidth + j];
if (maxp != -1) {
/* update gradient */
gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
}
}
}
}
});
}
template <typename scalar_t>
static void max_pool2d_with_indices_backward_out_frame(
scalar_t *gradInput_data,
scalar_t *gradOutput_data,
int64_t *indices_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int dW,
int dH)
{
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
for (auto p = start; p < end; p++) {
max_pool2d_with_indices_backward_single_out_frame<scalar_t>(
gradInput_data+p*nInputPlane*inputWidth*inputHeight,
gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
});
max_pool2d_kernel(
kCPU, output, indices, input,
kW, kH,
dW, dH,
padW, padH,
dilationW, dilationH);
}
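For ceil_mode=false the output size computed by pooling_output_shape reduces to floor((in + 2*pad - dilation*(k - 1) - 1) / stride) + 1; a quick check from Python, assuming torch.nn.functional.max_pool2d:
import math
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 32, 32)
kH = kW = 3; dH = dW = 2; padH = padW = 1; dilH = dilW = 1
out, idx = F.max_pool2d(x, kernel_size=(kH, kW), stride=(dH, dW),
                        padding=(padH, padW), dilation=(dilH, dilW),
                        return_indices=True)
oH = math.floor((32 + 2 * padH - dilH * (kH - 1) - 1) / dH) + 1
print(out.shape, idx.shape, oH)    # torch.Size([1, 3, 16, 16]) twice, and 16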
Tensor& max_pool2d_with_indices_backward_out_cpu_template(
Tensor& gradInput,
const Tensor& gradOutput_,
const Tensor& gradOutput,
const Tensor& input,
const Tensor& indices,
IntArrayRef kernel_size,
@ -333,11 +122,13 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
/* get contiguous gradOutput */
const Tensor gradOutput = gradOutput_.contiguous();
TORCH_CHECK(input.dtype() == gradOutput.dtype(),
"expected dtype ", input.dtype(), " for `gradOutput` but got dtype ", gradOutput.dtype());
TORCH_CHECK(input.dtype() == gradInput.dtype(),
"expected dtype ", input.dtype(), " for `gradInput` but got dtype ", gradInput.dtype());
/* resize */
gradInput.resize_as_(input);
gradInput.resize_(input.sizes(), input.suggest_memory_format());
gradInput.zero_();
/* sizes */
@ -354,7 +145,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
max_pool2d_backward_shape_check(
input,
gradOutput_,
gradOutput,
indices,
nbatch,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
@ -363,48 +154,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
outputHeight_for_shape_check, outputWidth_for_shape_check,
input.suggest_memory_format());
/* backprop */
if (input.ndimension() == 3)
{
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_backward",
[&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_backward_single_out_frame(
gradInput_data, gradOutput_data,
indices_data,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
);
}
else
{
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_backward",
[&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_backward_out_frame<scalar_t>(
gradInput_data, gradOutput_data,
indices_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
);
}
max_pool2d_backward_kernel(kCPU, gradInput, gradOutput, indices);
return gradInput;
}
@ -461,7 +211,8 @@ std::tuple<Tensor, Tensor> max_pool2d_with_indices_cpu(
return std::tuple<Tensor, Tensor>(output, indices);
}
Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
Tensor& max_pool2d_with_indices_backward_out_cpu(
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -473,7 +224,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
{
max_pool2d_with_indices_backward_out_cpu_template(
gradInput,
gradOutput_,
gradOutput,
input,
indices,
kernel_size,
@ -485,7 +236,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
}
Tensor max_pool2d_with_indices_backward_cpu(
const Tensor& gradOutput_,
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -494,10 +245,10 @@ Tensor max_pool2d_with_indices_backward_cpu(
bool ceil_mode,
const Tensor& indices)
{
auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto gradInput = at::empty({0}, input.options());
max_pool2d_with_indices_backward_out_cpu_template(
gradInput,
gradOutput_,
gradOutput,
input,
indices,
kernel_size,
@ -508,5 +259,8 @@ Tensor max_pool2d_with_indices_backward_cpu(
return gradInput;
}
DEFINE_DISPATCH(max_pool2d_kernel);
DEFINE_DISPATCH(max_pool2d_backward_kernel);
} // at::native
} // at

View File

@ -514,4 +514,20 @@ static inline void checkLinalgCompatibleDtype(const std::string& fn_name, Scalar
out_name, " with dtype ", out_type);
}
/*
Two types of 'other' tensors are supported when solving
a system of linear equations matmul(input, x) = other:
* 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
* 2-dimensional (2D) tensor or batch of 2D tensors (matrix case).
The original torch.solve supported only the matrix case, while NumPy works for both cases.
For the batched input we need to be able to distinguish them.
Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m).
This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389
*/
static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) {
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
return vector_case;
}
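A concrete illustration of this rule, assuming torch.linalg.solve: with input of shape (2, 3, 3), a right-hand side of shape (2, 3) is the vector case and (2, 3, 1) is the matrix case, mirroring NumPy.
import torch

a = torch.randn(2, 3, 3, dtype=torch.float64)
b_vec = torch.randn(2, 3, dtype=torch.float64)        # other.shape == input.shape[:-1] -> vector case
b_mat = torch.randn(2, 3, 1, dtype=torch.float64)     # batch of 3x1 matrices -> matrix case
print(torch.linalg.solve(a, b_vec).shape)             # torch.Size([2, 3])
print(torch.linalg.solve(a, b_mat).shape)             # torch.Size([2, 3, 1])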
}} // namespace at::native

View File

@ -1,14 +1,20 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <tuple>
#include <ATen/native/DispatchStub.h>
#pragma once
namespace at {
namespace native {
using max_pool2d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input,
int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH);
using max_pool2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
namespace {
template <typename dest_t, typename src_t>

View File

@ -0,0 +1,359 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>
namespace at { namespace native {
namespace {
template <typename scalar_t>
void cpu_max_pool(
Tensor& output_,
Tensor indices_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
auto input = input_.contiguous();
auto output = output_.contiguous();
auto indices = indices_.contiguous();
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t numel = output.numel();
int64_t ndim = input.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
int64_t input_height = input.size(-2);
int64_t input_width = input.size(-1);
int64_t output_height = output.size(-2);
int64_t output_width = output.size(-1);
// parallel on dim N, C, H, W
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, c, channels, oh, output_height, ow, output_width);
for (int64_t i = begin; i < end; i++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
while(ih0 < 0) { ih0 += dilationH; }
while(iw0 < 0) { iw0 += dilationW; }
// local pointers
scalar_t* input_ptr = input_data + c * input_height * input_width;
// compute local max
int64_t maxindex = ih0 * input_width + iw0;
scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
int64_t index = ih * input_width + iw;
scalar_t val = input_ptr[index];
if ((val > maxval) || std::isnan(val)) {
maxval = val;
maxindex = index;
}
}
}
// set output to local max and store location of max
output_data[i] = maxval;
indices_data[i] = maxindex;
// move on to next output index
data_index_step(c, channels, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous()) {
output_.copy_(output);
}
if (!indices_.is_contiguous()) {
indices_.copy_(indices);
}
}
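The flat output index i above is decomposed into (c, oh, ow) and advanced like a nested counter; a hedged Python sketch that only mirrors the arithmetic of data_index_init/data_index_step (the real helpers live in ATen/native/cpu/utils.h and take the counters by reference):
def flat_index_to_counters(begin, *dims):
    # dims = (channels, output_height, output_width); mimics what data_index_init computes
    counters = []
    for d in reversed(dims):
        counters.append(begin % d)
        begin //= d
    return list(reversed(counters))

print(flat_index_to_counters(5, 2, 3, 4))   # flat index 5 in a (2, 3, 4) output -> [0, 1, 1]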
template <typename scalar_t>
void cpu_max_pool_channels_last(
Tensor& output_,
Tensor indices_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
TORCH_CHECK(input_.ndimension() == 4,
"max pooling with channels last format supports tensors with 4 dims");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto input = input_.contiguous(memory_format);
auto output = output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = input.size(0);
int64_t channels = input.size(1);
int64_t input_height = input.size(2);
int64_t input_width = input.size(3);
int64_t output_height = output.size(2);
int64_t output_width = output.size(3);
using Vec = vec256::Vec256<scalar_t>;
using integer_t = vec256::int_same_size_t<scalar_t>;
using iVec = vec256::Vec256<integer_t>;
// for the convenience of vectorization, use an integer type of the same size as scalar_t,
// e.g. int32_t for float, int64_t for double;
// need to make sure the flattened index doesn't overflow
TORCH_CHECK(input_height <= std::ceil((double)std::numeric_limits<integer_t>::max() / (double)input_width));
// parallel on dim N, H, W
at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);
int64_t size = channels;
int64_t len = size - (size % Vec::size());
// temp buffer holding index with integer_t
std::unique_ptr<integer_t []> index_buffer(new integer_t[len]);
for (int64_t i = begin; i < end; i++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
while(ih0 < 0) { ih0 += dilationH; }
while(iw0 < 0) { iw0 += dilationW; }
scalar_t* out = output_data + i * channels;
int64_t* ind = indices_data + i * channels;
// Pass I: init out lane
iVec index0_vec = iVec(ih0 * input_width + iw0);
Vec out_vec = Vec(-std::numeric_limits<scalar_t>::infinity());
int64_t d1 = 0;
for (; d1 < len; d1 += Vec::size()) {
index0_vec.store(index_buffer.get() + d1);
out_vec.store(out + d1);
}
for (; d1 < size; d1++) {
ind[d1] = ih0 * input_width + iw0;
out[d1] = -std::numeric_limits<scalar_t>::infinity();
}
// Pass II: compute local max
for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
scalar_t* in = input_data + n * input_height * input_width * channels +
ih * input_width * channels + iw * channels;
int64_t d2 = 0;
for (; d2 < len; d2 += Vec::size()) {
iVec index_vec = iVec(ih * input_width + iw);
Vec val_vec = Vec::loadu(in + d2);
iVec maxindex_vec = iVec::loadu(index_buffer.get() + d2);
Vec maxval_vec = Vec::loadu(out + d2);
// true = all ones, false = all zeros
Vec mask = (val_vec > maxval_vec) | val_vec.isnan();
iVec imask = vec256::cast<integer_t>(mask);
Vec out_vec = Vec::blendv(maxval_vec, val_vec, mask);
iVec ind_vec = iVec::blendv(maxindex_vec, index_vec, imask);
out_vec.store(out + d2);
ind_vec.store(index_buffer.get() + d2);
}
for (; d2 < size; d2++) {
int64_t index = ih * input_width + iw;
scalar_t val = in[d2];
int64_t maxindex = ind[d2];
scalar_t maxval = out[d2];
bool mask = (val > maxval) || std::isnan(val);
out[d2] = mask ? val : maxval;
ind[d2] = mask ? index : maxindex;
}
}
}
// convert index data type
vec256::convert<integer_t, int64_t>(index_buffer.get(), ind, len);
// move on to next output index
data_index_step(n, nbatch, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous(memory_format)) {
output_.copy_(output);
}
if (!indices_.is_contiguous(memory_format)) {
indices_.copy_(indices);
}
}
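The channels-last path vectorizes across C but must agree elementwise with the contiguous path; a sketch of that check, assuming max_pool2d with return_indices dispatches to these kernels:
import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 16, 16)
x_cl = x.contiguous(memory_format=torch.channels_last)       # NHWC strides, same logical values
out_a, idx_a = F.max_pool2d(x, 3, stride=2, padding=1, return_indices=True)
out_b, idx_b = F.max_pool2d(x_cl, 3, stride=2, padding=1, return_indices=True)
print(torch.equal(out_a, out_b), torch.equal(idx_a, idx_b))   # expected: True True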
template <typename scalar_t>
void cpu_max_pool_backward(
Tensor& grad_input_,
const Tensor& grad_output_,
const Tensor& indices_) {
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
auto grad_input = grad_input_.contiguous();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto grad_input_data = grad_input.data_ptr<scalar_t>();
int64_t ndim = grad_output.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
int64_t input_height = grad_input.size(-2);
int64_t input_width = grad_input.size(-1);
int64_t output_height = grad_output.size(-2);
int64_t output_width = grad_output.size(-1);
// parallel on dim of N, C
at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) {
for (int64_t c = begin; c < end; c++) {
scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width;
scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width;
int64_t * indices_ptr = indices_data + c * output_height * output_width;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
// retrieve position of max
int64_t index = oh * output_width + ow;
int64_t maxindex = indices_ptr[index];
if (maxindex != -1) {
// update gradient
grad_input_ptr[maxindex] += grad_output_ptr[index];
}
}
}
}
});
if (!grad_input_.is_contiguous()) {
grad_input_.copy_(grad_input);
}
}
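Backward scatters each output gradient into the input slot recorded in indices, accumulating when several outputs picked the same maximum; a hedged sketch of the same bookkeeping for a single 4x4 plane using plain tensor ops:
import torch

grad_output = torch.tensor([[1.0, 2.0], [3.0, 4.0]])   # 2x2 pooled gradient for one (n, c) slice
indices = torch.tensor([[0, 3], [8, 15]])              # flat positions of the maxima in the 4x4 input
grad_input = torch.zeros(16)
grad_input.index_add_(0, indices.flatten(), grad_output.flatten())
print(grad_input.view(4, 4))                            # gradients land only at the recorded maxima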
template <typename scalar_t>
void cpu_max_pool_backward_channels_last(
Tensor& grad_input_,
const Tensor& grad_output_,
const Tensor& indices_) {
TORCH_CHECK(grad_output_.ndimension() == 4,
"max pooling backward with channels last format supports tensors with 4 dims.");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto grad_input = grad_input_.contiguous(memory_format);
auto grad_output = grad_output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
auto grad_input_data = grad_input.data_ptr<scalar_t>();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = grad_input.size(0);
int64_t channels = grad_input.size(1);
int64_t input_height = grad_input.size(2);
int64_t input_width = grad_input.size(3);
int64_t output_height = grad_output.size(2);
int64_t output_width = grad_output.size(3);
// parallel on dim N
at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) {
for (int64_t n = begin; n < end; n++) {
scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels;
scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels;
int64_t* indices_ptr = indices_data + n * output_height * output_width * channels;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels;
int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels;
// TODO: gcc vectorization
for (int64_t c = 0; c < channels; c++) {
int64_t maxindex = ind[c];
if (maxindex != -1) {
grad_input_ptr[maxindex * channels + c] += gout[c];
}
}
}
}
}
});
if (!grad_input_.is_contiguous(memory_format)) {
grad_input_.copy_(grad_input);
}
}
void max_pool2d_kernel_impl(
Tensor& output,
Tensor& indices,
const Tensor& input,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
switch (input.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d", [&] {
cpu_max_pool<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d_channels_last", [&] {
cpu_max_pool_channels_last<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_pool2d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
switch (grad_output.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward", [&] {
cpu_max_pool_backward<scalar_t>(grad_input, grad_output, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward_channels_last", [&] {
cpu_max_pool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
} // anonymous namespace
REGISTER_DISPATCH(max_pool2d_kernel, &max_pool2d_kernel_impl);
REGISTER_DISPATCH(max_pool2d_backward_kernel, &max_pool2d_backward_kernel_impl);
}} // at::native

View File

@ -190,109 +190,6 @@ __global__ void atomicadaptivemaxgradinput(
}
}
// 4d tensor B x D x H x W
void adaptive_max_pool2d_out_cuda_template(
Tensor& output,
Tensor& indices,
const Tensor& input,
IntArrayRef output_size)
{
TensorArg output_arg{ output, "output", 1 };
TensorArg indices_arg{ indices, "indices", 2 };
TensorArg input_arg{ input, "input", 3 };
checkAllSameGPU("adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_max_pool2d_cuda(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
if (input.ndimension() == 3) {
int64_t sizeD = input.size(0);
int64_t isizeH = input.size(1);
int64_t isizeW = input.size(2);
int64_t istrideD = input.stride(0);
int64_t istrideH = input.stride(1);
int64_t istrideW = input.stride(2);
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
output.resize_({sizeD, osizeH, osizeW});
indices.resize_({sizeD, osizeH, osizeW});
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
input_data, output_data,
indices_data,
isizeH, isizeW, osizeH, osizeW,
istrideD, istrideH, istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
);
} else {
Tensor input_ = input.contiguous();
int64_t sizeB = input_.size(0);
int64_t sizeD = input_.size(1);
int64_t isizeH = input_.size(2);
int64_t isizeW = input_.size(3);
int64_t istrideD = input_.stride(1);
int64_t istrideH = input_.stride(2);
int64_t istrideW = input_.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input_.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
output.resize_({sizeB, sizeD, osizeH, osizeW});
indices.resize_({sizeB, sizeD, osizeH, osizeW});
scalar_t *input_data = input_.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeB*sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
input_data, output_data,
indices_data,
isizeH, isizeW, osizeH, osizeW,
istrideD, istrideH, istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
);
}
}
void adaptive_max_pool2d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput_,
@ -409,31 +306,108 @@ void adaptive_max_pool2d_backward_out_cuda_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cuda(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool2d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
// 4d tensor B x D x H x W
std::tuple<Tensor, Tensor> adaptive_max_pool2d_cuda(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool2d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cuda)
(const Tensor& input,
IntArrayRef output_size,
const Tensor& output,
const Tensor& indices) {
TensorArg output_arg{output, "output", 1};
TensorArg indices_arg{indices, "indices", 2};
TensorArg input_arg{input, "input", 3};
checkAllSameGPU(
"adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
if (input.ndimension() == 3) {
int64_t sizeD = input.size(0);
int64_t isizeH = input.size(1);
int64_t isizeW = input.size(2);
int64_t istrideD = input.stride(0);
int64_t istrideH = input.stride(1);
int64_t istrideW = input.stride(2);
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "adaptive_max_pool2d_cuda", [&] {
scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool<<<
blocks,
threads,
0,
at::cuda::getCurrentCUDAStream()>>>(
input_data,
output_data,
indices_data,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
} else {
Tensor input_ = input.contiguous();
int64_t sizeB = input_.size(0);
int64_t sizeD = input_.size(1);
int64_t isizeH = input_.size(2);
int64_t isizeW = input_.size(3);
int64_t istrideD = input_.stride(1);
int64_t istrideH = input_.stride(2);
int64_t istrideW = input_.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf,
kBFloat16,
input_.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
scalar_t* input_data = input_.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeB * sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool<<<
blocks,
threads,
0,
at::cuda::getCurrentCUDAStream()>>>(
input_data,
output_data,
indices_data,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
}
}
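Adaptive pooling reads, for output cell (oh, ow), input rows floor(oh * isizeH / osizeH) through ceil((oh + 1) * isizeH / osizeH) - 1, and likewise for columns; the CPU path computes the same windows, so a worked example (assuming torch.nn.functional.adaptive_max_pool2d) can be checked without a GPU:
import torch
import torch.nn.functional as F

x = torch.arange(25, dtype=torch.float32).view(1, 1, 5, 5)
out, idx = F.adaptive_max_pool2d(x, output_size=(2, 2), return_indices=True)
# each 2x2 output cell covers a 3x3 corner of the 5x5 input (rows/cols 0-2 or 2-4)
print(out)   # expected values: [[12., 14.], [22., 24.]]
print(idx)   # expected flat indices into the 5x5 plane: [[12, 14], [22, 24]]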
Tensor& adaptive_max_pool2d_backward_out_cuda(const Tensor& gradOutput_,

View File

@ -291,90 +291,6 @@ void atomicadaptivemaxgradinput_loop(
}
}
// 5d tensor B x D x T x H x W
void adaptive_max_pool3d_out_cuda_template(
Tensor& output,
Tensor& indices,
const Tensor& input_,
IntArrayRef output_size)
{
TensorArg output_arg{ output, "output", 1 };
TensorArg indices_arg{ indices, "indices", 2 };
TensorArg input_arg{ input_, "input_", 3 };
checkAllSameGPU("adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
for (int64_t i = 0; i < input_.ndimension(); i++) {
TORCH_CHECK(input_.size(i) > 0,
"adaptive_max_pool3d_cuda(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input_.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input_.ndimension() == 4 || input_.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 3,
"adaptive_max_pool3d: internal error: output_size.size() must be 3");
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t istrideD, istrideT, istrideH, istrideW;
int64_t totalZ;
const Tensor& input = input_.ndimension() == 4 ? input_ : input_.contiguous();
if (input.ndimension() == 4) {
sizeD = input.size(0);
isizeT = input.size(1);
isizeH = input.size(2);
isizeW = input.size(3);
istrideD = input.stride(0);
istrideT = input.stride(1);
istrideH = input.stride(2);
istrideW = input.stride(3);
output.resize_({sizeD, osizeT, osizeH, osizeW});
indices.resize_({sizeD, osizeT, osizeH, osizeW});
totalZ = sizeD * osizeT;
} else {
int64_t sizeB = input.size(0);
sizeD = input.size(1);
isizeT = input.size(2);
isizeH = input.size(3);
isizeW = input.size(4);
istrideD = input.stride(1);
istrideT = input.stride(2);
istrideH = input.stride(3);
istrideW = input.stride(4);
output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
indices.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
totalZ = sizeB * sizeD * osizeT;
}
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
"adaptive_max_pool3d_cuda",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
adaptivemaxpool_loop(
input_data, output_data, indices_data, totalZ, isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW);
}
);
}
void adaptive_max_pool3d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput_,
@ -460,31 +376,79 @@ void adaptive_max_pool3d_backward_out_cuda_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cuda(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool3d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
// 5d tensor B x D x T x H x W
std::tuple<Tensor, Tensor> adaptive_max_pool3d_cuda(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool3d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cuda)
(const Tensor& input,
IntArrayRef output_size,
const Tensor& output,
const Tensor& indices) {
TensorArg output_arg{output, "output", 1};
TensorArg indices_arg{indices, "indices", 2};
TensorArg input_arg{input, "input", 3};
checkAllSameGPU(
"adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t istrideD, istrideT, istrideH, istrideW;
int64_t totalZ;
const Tensor& input_ = input.ndimension() == 4 ? input : input.contiguous();
if (input_.ndimension() == 4) {
sizeD = input_.size(0);
isizeT = input_.size(1);
isizeH = input_.size(2);
isizeW = input_.size(3);
istrideD = input_.stride(0);
istrideT = input_.stride(1);
istrideH = input_.stride(2);
istrideW = input_.stride(3);
totalZ = sizeD * osizeT;
} else {
int64_t sizeB = input_.size(0);
sizeD = input_.size(1);
isizeT = input_.size(2);
isizeH = input_.size(3);
isizeW = input_.size(4);
istrideD = input_.stride(1);
istrideT = input_.stride(2);
istrideH = input_.stride(3);
istrideW = input_.stride(4);
totalZ = sizeB * sizeD * osizeT;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input_.scalar_type(), "adaptive_max_pool3d_cuda", [&] {
scalar_t* input_data = input_.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
adaptivemaxpool_loop(
input_data,
output_data,
indices_data,
totalZ,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideD,
istrideT,
istrideH,
istrideW);
});
}
Tensor& adaptive_max_pool3d_backward_out_cuda(const Tensor& gradOutput_,

View File

@ -1192,7 +1192,41 @@ void magmaGels<c10::complex<double>>(
reinterpret_cast<magmaDoubleComplex*>(hwork), lwork, info);
AT_CUDA_CHECK(cudaGetLastError());
}
#endif
namespace {
/*
MAGMA can return errors both as a return value and in the info argument.
The return value and info should always be identical.
In general, the meaning is as given in this table.
Predefined error codes are large negative numbers. Using the symbolic
constants below is preferred, but the numeric values can be found in
include/magma_types.h.
Info | Description
----------- | -----------
info = 0 (MAGMA_SUCCESS) | Successful exit
info < 0, but small | For info = -i, the i-th argument had an illegal value
info > 0 | Function-specific error such as singular matrix
MAGMA_ERR_DEVICE_ALLOC | Could not allocate GPU device memory
MAGMA_ERR_HOST_ALLOC | Could not allocate CPU host memory
MAGMA_ERR_ILLEGAL_VALUE | An argument had an illegal value (deprecated; instead it should return -i to say the i-th argument was bad)
MAGMA_ERR_INVALID_PTR | Can't free pointer
MAGMA_ERR_NOT_IMPLEMENTED | Function or option not implemented
MAGMA_ERR_NOT_SUPPORTED | Function or option not supported on the current architecture
*/
void checkMagmaInternalError(magma_int_t info, const std::string& magma_function_name) {
// if info > 0 the error is function-specific, do nothing in this case
TORCH_CHECK(info >= 0,
"MAGMA error: ",
magma_strerror(info),
", info = ", info,
", when calling ", magma_function_name);
}
} // anonymous namespace
#endif // USE_MAGMA
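A minimal sketch of how the helper above is meant to be consumed, following the Info table in the comment (the wrapper function below and the literal info values are illustrative only; real callers such as apply_qr later in this file pass the info written by a MAGMA routine):
#ifdef USE_MAGMA
// Illustrative only: exercises the three cases from the Info table above.
void checkMagmaInternalErrorSketch() {
  checkMagmaInternalError(0, "geqrf");     // info == 0 (MAGMA_SUCCESS): check passes
  checkMagmaInternalError(5, "geqrf");     // info > 0: function-specific condition (e.g. singular matrix); passes, caller decides how to handle it
  // checkMagmaInternalError(-2, "geqrf"); // info < 0: the 2nd argument was illegal -> TORCH_CHECK throws with magma_strerror(info)
}
#endif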
#define ALLOCATE_ARRAY(name, type, size) \
auto storage_##name = pin_memory<type>(size); \
@ -1968,7 +2002,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau, int64_t n_columns)
template <typename scalar_t>
static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns,
bool compute_q, std::vector<int64_t>& infos) {
bool compute_q) {
#ifndef USE_MAGMA
AT_ERROR("qr: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
@ -1996,10 +2030,7 @@ AT_ERROR("qr: MAGMA library not found in "
for (int64_t i = 0; i < batch_size; i++) {
scalar_t* r_working_ptr = &r_data[i * r_matrix_stride];
magmaGeqrf<scalar_t>(m, n, r_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/true);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "geqrf");
}
if (!compute_q) {
// this is for mode='r'
@ -2017,15 +2048,10 @@ AT_ERROR("qr: MAGMA library not found in "
for (int64_t i = 0; i < batch_size; i++) {
scalar_t* q_working_ptr = &q_data[i * q_matrix_stride];
magmaGeqrf<scalar_t>(m, n, q_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/false);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "geqrf");
magmaOrgqr<scalar_t>(m, n_columns, k, q_working_ptr, m, tau_data, work_data, nb, &info);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "orgqr");
}
#endif
}
@ -2033,7 +2059,6 @@ AT_ERROR("qr: MAGMA library not found in "
std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string mode) {
bool compute_q, reduced;
std::tie(compute_q, reduced) = _parse_qr_mode(mode);
std::vector<int64_t> infos(batchCount(self), 0);
// Setup input geometry and inputs for apply_qr
std::vector<int64_t> q_sizes, q_strides;
@ -2066,13 +2091,8 @@ std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string
int64_t n = r_working_copy.size(-1);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cuda", [&]{
apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q, infos);
apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "qr_cuda");
} else {
singleCheckErrors(infos[0], "qr_cuda");
}
if (compute_q) {
q_working_copy = q_working_copy.narrow(-1, 0, n_columns_q);
@ -2647,6 +2667,11 @@ TORCH_CHECK(false, "torch.linalg.lstsq: MAGMA library not found in "
auto trans = MagmaNoTrans;
auto m = magma_int_cast(a.size(-2), "m");
auto n = magma_int_cast(a.size(-1), "n");
TORCH_CHECK(
m >= n,
"torch.linalg.lstsq: only overdetermined systems (input.size(-2) >= input.size(-1)) are allowed on CUDA");
auto nrhs = magma_int_cast(b.size(-1), "nrhs");
auto ldda = std::max<magma_int_t>(1, m);
auto lddb = std::max<magma_int_t>(1, std::max(m, n));

View File

@ -669,7 +669,7 @@ cunn_SoftMaxBackward(scalar_t *gradInput, outscalar_t *output, outscalar_t *grad
const int grad_output_shift = ((uint64_t)gradOutput) % ALIGN_BYTES / sizeof(outscalar_t);
accscalar_t threadSum = ilpReduce<AddFloat, ILP, outscalar_t, accscalar_t>(
shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
grad_output_shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
accscalar_t sum_k = blockReduce<Add, accscalar_t>(
sdata, threadSum, Add<accscalar_t>(), accscalar_t(0));

View File

@ -99,7 +99,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
indices.resize_as_(self);
indices.zero_();
}
return {values, indices};
return std::forward_as_tuple(values, indices);
}
Tensor self_;
@ -153,7 +153,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
}
if (numel == 0) {
return {values, indices};
return std::forward_as_tuple(values, indices);
}
int64_t numel_or_intmax = std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
@ -206,7 +206,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
if (indices_tmp.defined()) {
indices.copy_(indices_tmp);
}
return {values, indices};
return std::forward_as_tuple(values, indices);
}
std::tuple<Tensor &,Tensor &> sort_out_cuda(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices) {

View File

@ -590,7 +590,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
// we compute the "linear index" of the output element,
// and convert it to the equivalent "linear index" of the input element.
ushort offset = 4 * s2 + idx;
ushort linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
int64_t linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
if(linear_idx >= numel1){
value[idx] = 0;
continue;
@ -615,6 +615,98 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
}
}
constant bool transpose_in_is_arr = (ushort_arg_3 > 1 || ushort_arg_4 > 4);
constant bool transpose_in_is_tex = !transpose_in_is_arr;
constant bool transpose_out_is_arr = (ushort_arg_5 > 1 || ushort_arg_6 > 4);
constant bool transpose_out_is_tex = !transpose_out_is_arr;
kernel void transpose(texture2d_array<half, access::read>in_arr[[texture(0),function_constant(transpose_in_is_arr)]],
texture2d<half, access::read> in_tex[[texture(0), function_constant(transpose_in_is_tex)]],
texture2d_array<half, access::write>out_arr[[texture(1),function_constant(transpose_out_is_arr)]],
texture2d<half, access::write> out_tex[[texture(1), function_constant(transpose_out_is_tex)]],
constant ushort* inSizeBuffer [[buffer(0)]],
constant ushort* outSizeBuffer [[buffer(1)]],
device ushort* indexBuffer [[buffer(2)]],
ushort3 gid[[thread_position_in_grid]]) {
const ushort dim0 = ushort_arg_0;
const ushort dim1 = ushort_arg_1;
const ushort dim = ushort_arg_2;
const ushort N1 = ushort_arg_3;
const ushort C1 = ushort_arg_4;
const ushort N2 = ushort_arg_5;
const ushort C2 = ushort_arg_6;
ushort W1,W2,H1,H2;
if(transpose_in_is_arr) {
W1 = in_arr.get_width();
H1 = in_arr.get_height();
} else {
W1 = in_tex.get_width();
H1 = in_tex.get_height();
}
if(transpose_out_is_arr) {
W2 = out_arr.get_width();
H2 = out_arr.get_height();
} else {
W2 = out_tex.get_width();
H2 = out_tex.get_height();
}
if (gid.x >= W2 || gid.y >= H2) {
return;
}
const int numel = H2 * W2 * C2 * N2;
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2;
const ushort s2 = gid.z - n2 * slices2;
half4 value;
for (int idx = 0; idx < 4; ++idx){
ushort offset = 4 * s2 + idx;
int64_t linear_idx2 = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
if(linear_idx2 >= numel) {
value[idx] = 0;
continue;
}
ushort d2 = 0;
for(int j = dim-1; j>=0; --j){
d2 = outSizeBuffer[j];
indexBuffer[j] = linear_idx2 % d2;
linear_idx2 /= d2;
}
// swap dims
ushort tmp = indexBuffer[dim0];
indexBuffer[dim0] = indexBuffer[dim1];
indexBuffer[dim1] = tmp;
int64_t linear_idx1 = 0;
ushort m = 1;
ushort d1 = 0;
for(int k = dim-1; k>=0; --k) {
d1 = indexBuffer[k];
linear_idx1 += d1 * m;
m *= inSizeBuffer[k];
}
auto x1 = linear_idx1 % W1;
auto y1 = ((int)(linear_idx1/W1)) % H1;
auto c1 = ((int)(linear_idx1/W1/H1) % C1);
auto n1 = ((int)(linear_idx1/W1/H1/C1) % N1);
auto z1 = (int)c1 / 4 + n1 * slices1;
auto pos = c1 % 4;
if(transpose_in_is_arr) {
value[idx] = in_arr.read(ushort2(x1, y1), z1)[pos];
} else {
value[idx] = in_tex.read(ushort2(x1, y1))[pos];
}
}
if(transpose_out_is_arr) {
out_arr.write(value, gid.xy, gid.z);
} else {
out_tex.write(value, gid.xy);
}
}
)PT_METAL_SHADERS";
#endif /* MPSCNNShaders_h */
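The custom transpose shader above does its bookkeeping with plain row-major index arithmetic: an output "linear index" is decomposed into per-dimension coordinates using the output sizes, the coordinates at dim0/dim1 are swapped back, and the result is re-linearized against the input sizes. The host-side C++ sketch below reproduces the same mapping for reference (the function name and signature are illustrative, not part of this commit).
#include <cstdint>
#include <utility>
#include <vector>
// Maps a linear index of the transposed (output) tensor to the linear index of
// the corresponding element in the original input tensor, mirroring the loop
// structure of the "transpose" Metal kernel above.
int64_t transposedSourceIndex(
    int64_t linear_idx2,                   // linear index into the transposed tensor
    const std::vector<int64_t>& outSizes,  // output sizes = input sizes with dim0/dim1 swapped
    const std::vector<int64_t>& inSizes,   // original input sizes
    int dim0,
    int dim1) {
  const int dim = static_cast<int>(outSizes.size());
  std::vector<int64_t> index(dim, 0);
  for (int j = dim - 1; j >= 0; --j) {     // linear index -> per-dim coordinates
    index[j] = linear_idx2 % outSizes[j];
    linear_idx2 /= outSizes[j];
  }
  std::swap(index[dim0], index[dim1]);     // undo the dimension swap
  int64_t linear_idx1 = 0;
  int64_t stride = 1;
  for (int k = dim - 1; k >= 0; --k) {     // coordinates -> linear index in the input
    linear_idx1 += index[k] * stride;
    stride *= inSizes[k];
  }
  return linear_idx1;
}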

View File

@ -1,15 +1,22 @@
#include <ATen/Tensor.h>
#include <ATen/native/metal/mpscnn/MPSCNNContext.h>
#include <ATen/native/metal/MetalCommandBuffer.h>
#include <ATen/native/metal/MetalTensorImpl.h>
#include <ATen/native/metal/MetalTensorImplStorage.h>
#include <vector>
#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
typedef float16_t fp16_t;
#else
typedef uint16_t fp16_t;
#endif
namespace at {
namespace native {
namespace metal {
std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src);
std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src);
std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src);
std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src);
std::vector<float> NCHWToNC4(
const float* src,
@ -67,6 +74,15 @@ static inline MetalCommandBuffer* getCommandBufferFromTensor(
return cmdBuffer;
}
template<typename T>
id<MTLBuffer>makeMTLBuffer(const std::vector<T>& src) {
id<MTLBuffer> buffer = [[MPSCNNContext sharedInstance].device
newBufferWithLength:src.size() * sizeof(T)
options:MTLResourceOptionCPUCacheModeWriteCombined];
memcpy(buffer.contents, src.data(), src.size() * sizeof(T));
return buffer;
}
} // namespace metal
} // namespace native
} // namespace at

View File

@ -1,28 +1,27 @@
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <Accelerate/Accelerate.h>
namespace at {
namespace native {
namespace metal {
std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src) {
unsigned long count = src.size();
std::vector<uint16_t> output(count, 0);
vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
vImage_Buffer float16{
(void*)output.data(), 1, count, count * sizeof(uint16_t)};
if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
kvImageNoError) {
TORCH_CHECK(false);
}
std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src) {
unsigned long count = src.size();
std::vector<fp16_t> output(count, 0);
vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
vImage_Buffer float16{(void*)output.data(), 1, count, count * sizeof(fp16_t)};
if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
kvImageNoError) {
TORCH_CHECK(false);
}
return output;
}
std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src) {
std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src) {
unsigned long count = src.size();
std::vector<float> output(count, 0);
vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(uint16_t)};
vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(fp16_t)};
vImage_Buffer float32{(void*)output.data(), 1, count, count * sizeof(float)};
if (vImageConvert_Planar16FtoPlanarF(&float16, &float32, 0) !=
kvImageNoError) {

View File

@ -1,3 +1,4 @@
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNClampOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
@ -35,9 +36,9 @@
[encoder setTexture:[_X texture] atIndex:0];
[encoder setTexture:[_Y texture] atIndex:1];
id<MTLBuffer> clampBuffer = [[MPSCNNContext sharedInstance].device
newBufferWithLength:2 * sizeof(fp16)
newBufferWithLength:2 * sizeof(fp16_t)
options:MTLResourceOptionCPUCacheModeWriteCombined];
fp16* clampBufferPtr = (fp16*)[clampBuffer contents];
fp16_t* clampBufferPtr = (fp16_t*)[clampBuffer contents];
clampBufferPtr[0] = _min.floatValue;
clampBufferPtr[1] = _max.floatValue;
[encoder setBuffer:clampBuffer offset:0 atIndex:0];

View File

@ -1,12 +1,6 @@
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
typedef float16_t fp16;
#else
typedef uint16_t fp16;
#endif
@protocol MPSCNNOp<NSObject>
@property(nonatomic, strong) MPSCNNKernel* kernel;

View File

@ -1,6 +1,7 @@
#import <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/MetalUtils.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
@ -10,7 +11,7 @@ namespace metal {
MPSImage* createStaticImage(const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
const uint16_t* src,
const fp16_t* src,
const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
const float* src,
@ -35,7 +36,7 @@ MPSTemporaryImage* createTemporaryImage(
void copyToHost(float* dst, MPSImage* image);
std::vector<uint16_t> staticImageToFp16Array(MPSImage* image);
std::vector<fp16_t> staticImageToFp16Array(MPSImage* image);
at::Tensor staticImageToTensor(MPSImage* image);
static inline MPSImage* imageFromTensor(const Tensor& tensor) {

View File

@ -25,7 +25,7 @@ MPSImage* createStaticImage(const std::vector<int64_t>& sizes) {
}
MPSImage* createStaticImage(
const uint16_t* src,
const fp16_t* src,
const std::vector<int64_t>& sizes) {
int64_t N = sizes[0];
int64_t C = sizes[1];
@ -45,7 +45,7 @@ MPSImage* createStaticImage(
int64_t slices = (C + 3) / 4 * N;
int64_t numComponents = image.featureChannels < 3 ? image.featureChannels : 4;
int64_t bytesPerRow = W * numComponents * sizeof(uint16_t);
int64_t bytesPerRow = W * numComponents * sizeof(fp16_t);
uint8_t* ptr = (uint8_t*)src;
for (int i = 0; i < slices; ++i) {
[image.texture replaceRegion:MTLRegionMake2D(0, 0, W, H)
@ -256,7 +256,7 @@ void copyToHost(float* dst, MPSImage* image) {
memcpy(dst, buffer.contents, buffer.length);
}
std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
std::vector<fp16_t> staticImageToFp16Array(MPSImage* image) {
if (image.pixelFormat == MTLPixelFormatR16Float ||
image.pixelFormat == MTLPixelFormatRG16Float ||
image.pixelFormat == MTLPixelFormatRGBA16Float) {
@ -265,8 +265,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
int64_t numComponents =
image.featureChannels < 3 ? image.featureChannels : 4;
int64_t count = image.width * image.height * image.numberOfImages * C;
std::vector<uint16_t> output(count, 0);
int64_t bytesPerRow = image.width * numComponents * sizeof(uint16_t);
std::vector<fp16_t> output(count, 0);
int64_t bytesPerRow = image.width * numComponents * sizeof(fp16_t);
uint8_t* buffer = (uint8_t*)output.data();
for (int i = 0; i < slices * image.numberOfImages; ++i) {
[image.texture getBytes:buffer
@ -285,8 +285,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
at::Tensor staticImageToTensor(MPSImage* image) {
auto outputSize = [image sizes];
std::vector<uint16_t> fp16 = staticImageToFp16Array(image);
auto fp32 = metal::Fp16ToFp32(fp16);
std::vector<fp16_t> fp16Array = staticImageToFp16Array(image);
auto fp32 = metal::Fp16ToFp32(fp16Array);
std::vector<float> fp32_nchw = metal::NC4ToNCHW(fp32.data(), outputSize);
auto tensor = at::empty(outputSize);
int64_t size_bytes = c10::multiply_integers(outputSize) * sizeof(float);

View File

@ -24,6 +24,9 @@ bool test_div();
bool test_div_broadcast();
bool test_div_broadcast2();
bool test_t();
bool test_transpose();
bool test_transpose2();
bool test_transpose3();
bool test_view();
bool test_view2();
bool test_view3();
@ -43,5 +46,8 @@ bool test_upsampling_nearest2d_vec();
bool test_adaptive_avg_pool2d();
bool test_hardtanh_();
bool test_reshape();
bool test_mean_dim();
bool test_mean_dim2();
bool test_mean_dim3();
#endif

View File

@ -4,7 +4,6 @@
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#import <ATen/native/metal/mpscnn/tests/MPSCNNTests.h>
#import <ATen/native/metal/ops/MetalConvolution.h>
#import <ATen/native/metal/ops/MetalTranspose.h>
#import <Foundation/Foundation.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
@ -490,7 +489,7 @@ bool test_t() {
auto X1 = at::rand({H, W}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::t(X1).contiguous();
auto X2 = X1.metal();
auto Y2 = at::native::metal::t(X2).cpu();
auto Y2 = at::t(X2).cpu();
return almostEqual(Y1, Y2);
});
if (!b) {
@ -500,6 +499,39 @@ bool test_t() {
return result;
}
bool test_transpose() {
__block std::vector<int64_t> size {1, 2, 2, 5};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 3).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 1, 3).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_transpose2() {
__block std::vector<int64_t> size {1, 2, 58, 28, 28};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 2).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 1, 2).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_transpose3() {
__block std::vector<int64_t> size {4, 5, 6};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 2, 0).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 2, 0).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_view() {
// array -> array
__block std::vector<int64_t> size{1, 10, 2, 2};
@ -775,3 +807,38 @@ bool test_hardtanh_() {
return true;
#endif
}
bool test_mean_dim() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {2,3}, true);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {2,3}, true).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_mean_dim2() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {1,3}, false);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {1,3}, false).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_mean_dim3() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {0,1,2,3});
PRINT_TENSOR("Y1", Y1);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {0,1,2,3}).cpu();
PRINT_TENSOR("Y2", Y2);
return almostEqual(Y1, Y2);
});
}

View File

@ -0,0 +1,84 @@
#include <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#include <ATen/ATen.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <torch/library.h>
namespace at {
namespace native {
namespace metal {
API_AVAILABLE(ios(11.3), macos(10.13))
static inline MPSNNReduceUnary* kernelForReducedDim(int dim) {
id<MTLDevice> device = [MPSCNNContext sharedInstance].device;
if (dim == 3) {
return [[MPSNNReduceRowMean alloc] initWithDevice:device];
} else if (dim == 2) {
return [[MPSNNReduceColumnMean alloc] initWithDevice:device];
} else if (dim == 1) {
return [[MPSNNReduceFeatureChannelsMean alloc] initWithDevice:device];
}
return nil;
}
Tensor wrapper_mean_dim(
const Tensor& input,
IntArrayRef dims,
bool keepdim,
c10::optional<ScalarType> dtype) {
if (@available(iOS 11.3, *)) {
MPSImage* X = imageFromTensor(input);
auto textureSize = input.sizes().vec();
TORCH_CHECK(textureSize.size() == 4);
// TODO: [T87340633] Support reducing the batch dimension
TORCH_CHECK(textureSize[0] == 1);
auto mask = make_dim_mask(dims, input.dim());
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
MPSImage* Y = nil;
for (int dim : dims) {
textureSize[dim] = 1;
MPSNNReduceUnary* kernel = kernelForReducedDim(dim);
if (kernel) {
Y = createTemporaryImage(commandBuffer, textureSize);
[kernel encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
X = Y;
}
}
MetalTensorImplStorage mt{textureSize};
mt.texture()->setCommandBuffer(commandBuffer);
mt.texture()->copyFromTexture(Y);
auto shape = DimVector(input.sizes());
for (int dim = shape.size() - 1; dim >= 0; dim--) {
if (mask[dim]) {
if (keepdim) {
shape[dim] = 1;
} else {
shape.erase(shape.begin() + dim);
}
}
}
auto output = makeTensor(std::move(mt), input.options()).view(shape);
return output;
} else {
// TODO: [T87350528] Fallback to shader kernels for 10.0 users
TORCH_CHECK(
false, "MPSNNReduceUnary is only available on iOS 11.3 and above");
}
}
TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("mean.dim", TORCH_FN(wrapper_mean_dim));
};
}
}
}
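A minimal usage sketch, mirroring the test_mean_dim cases added earlier in this commit (tensor sizes and variable names are illustrative): the mean.dim registration above lets at::mean run directly on a 4-D Metal tensor with N == 1, reducing H/W via MPSNNReduceRowMean/MPSNNReduceColumnMean and C via MPSNNReduceFeatureChannelsMean.
auto X1 = at::rand({1, 5, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {2, 3}, /*keepdim=*/true);                 // CPU reference
auto Y2 = at::mean(X1.metal(), {2, 3}, /*keepdim=*/true).cpu();   // dispatches to wrapper_mean_dim above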

View File

@ -1,17 +0,0 @@
#ifndef MetalCopy_h
#define MetalCopy_h
#include <ATen/Tensor.h>
namespace at {
namespace native {
namespace metal {
// TODO: Remove the header once we are able to call it through dispatcher
Tensor t(const Tensor& input);
} // namespace metal
} // namespace native
} // namespace at
#endif

View File

@ -3,37 +3,94 @@
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#include <ATen/ATen.h>
#include <torch/library.h>
namespace at {
namespace native {
namespace metal {
Tensor transpose(const Tensor& input, int64_t dim0, int64_t dim1) {
TORCH_CHECK(input.is_metal());
auto ndims = input.dim();
dim0 = maybe_wrap_dim(dim0, ndims);
dim1 = maybe_wrap_dim(dim1, ndims);
if (dim0 == dim1) {
return input;
}
auto outputSizes = input.sizes().vec();
std::swap(outputSizes[dim0], outputSizes[dim1]);
MPSImage* X = imageFromTensor(input);
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
if (input.dim() == 2) {
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
initWithDevice:[MPSCNNContext sharedInstance].device];
[transpose encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
auto output = makeTensor(std::move(mt), input.options());
return output;
} else {
id<MTLBuffer> sizeBuf1 = makeMTLBuffer<ushort>(
std::vector<ushort>{input.sizes().begin(), input.sizes().end()});
id<MTLBuffer> sizeBuf2 = makeMTLBuffer<ushort>(
std::vector<ushort>{outputSizes.begin(), outputSizes.end()});
id<MTLBuffer> indexBuf = makeMTLBuffer(std::vector<ushort>(input.dim(), 1));
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state =
[[MPSCNNContext sharedInstance] specializedPipelineState:@"transpose"
Constants:@[
@(dim0),
@(dim1),
@(input.dim()),
@(X.numberOfImages),
@(X.featureChannels),
@(Y.numberOfImages),
@(Y.featureChannels),
]];
[encoder setComputePipelineState:state];
[encoder setTexture:[X texture] atIndex:0];
[encoder setTexture:[Y texture] atIndex:1];
[encoder setBuffer:sizeBuf1 offset:0 atIndex:0];
[encoder setBuffer:sizeBuf2 offset:0 atIndex:1];
[encoder setBuffer:indexBuf offset:0 atIndex:2];
const auto& launchParams =
mpscnn::spatialPointwiseKernelLaunchParams(state, Y);
[encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
[encoder endEncoding];
[X markRead];
[Y markRead];
auto output = makeTensor(std::move(mt), input.options());
return output;
}
}
Tensor t(const Tensor& input) {
TORCH_CHECK(input.is_metal());
TORCH_CHECK(input.is_metal());
TORCH_CHECK(input.dim() == 2);
auto strides = input.strides().vec();
auto sizes = input.sizes().vec();
MPSImage* X = imageFromTensor(input);
TORCH_CHECK(X.numberOfImages == 1);
TORCH_CHECK(X.featureChannels == 1);
MetalTensorImplStorage mt({sizes[1], sizes[0]});
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
mt.texture()->allocateTemporaryTextureStorage(
{1, 1, sizes[1], sizes[0]}, commandBuffer);
MPSImage* Y = mt.texture()->image();
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
initWithDevice:[MPSCNNContext sharedInstance].device];
[transpose encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
auto output = makeTensor(std::move(mt), input.options());
return output;
return metal::transpose(input, 0, input.dim() < 2 ? 0 : 1);
}
TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("t", TORCH_FN(t));
m.impl("transpose.int", TORCH_FN(transpose));
};
}
}
}
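A minimal usage sketch, mirroring test_transpose added earlier in this commit (sizes and names are illustrative): the transpose.int registration above routes 2-D inputs through the MPSImageTranspose path and higher-rank tensors through the custom transpose shader.
auto X1 = at::rand({1, 2, 2, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 3).contiguous();        // CPU reference
auto Y2 = at::transpose(X1.metal(), 1, 3).cpu();       // dispatches to metal::transpose above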

View File

@ -5034,9 +5034,6 @@
structured_delegate: digamma.out
variants: method
- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method
- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
variants: method
dispatch:
@ -5804,6 +5801,11 @@
dispatch:
CompositeExplicitAutograd: polygamma
- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method
dispatch:
CompositeExplicitAutograd: polygamma_
- func: erfinv(Tensor self) -> Tensor
structured_delegate: erfinv.out
variants: method, function
@ -7439,6 +7441,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool2d_out_cpu
CUDA: adaptive_max_pool2d_out_cuda
@ -7446,9 +7449,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool2d_cpu
CUDA: adaptive_max_pool2d_cuda
structured_delegate: adaptive_max_pool2d.out
- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -7465,6 +7466,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool3d_out_cpu
CUDA: adaptive_max_pool3d_out_cuda
@ -7472,9 +7474,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool3d_cpu
CUDA: adaptive_max_pool3d_cuda
structured_delegate: adaptive_max_pool3d.out
- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -8637,6 +8637,12 @@
dispatch:
CompositeExplicitAutograd: linalg_lstsq
- func: linalg_lstsq.out(Tensor self, Tensor b, float? cond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
python_module: linalg
variants: function
dispatch:
CPU, CUDA: linalg_lstsq_out
- func: _lstsq_helper_(Tensor(a!) self, Tensor(b!) rank, Tensor(c!) singular_values, Tensor(d!) infos, Tensor a, float cond, str driver_name) -> Tensor(a!)
variants: function
dispatch:

View File

@ -12,6 +12,8 @@ namespace {
template <typename T>
class MinMax : public ::testing::Test {};
template <typename T>
class Nan : public ::testing::Test {};
template <typename T>
class Interleave : public ::testing::Test {};
template <typename T>
class SignManipulation : public ::testing::Test {};
@ -67,6 +69,7 @@ namespace {
TYPED_TEST_CASE(Comparison, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Bitwise, FloatIntTestedTypes);
TYPED_TEST_CASE(MinMax, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Nan, RealFloatTestedTypes);
TYPED_TEST_CASE(Interleave, RealFloatIntTestedTypes);
TYPED_TEST_CASE(SignManipulation, FloatIntTestedTypes);
TYPED_TEST_CASE(Rounding, RealFloatTestedTypes);
@ -435,9 +438,29 @@ namespace {
[](const vec& v) { return v.erfinv(); },
createDefaultUnaryTestCase<vec>(TestSeed(), false, true));
}
TYPED_TEST(Nan, IsNan) {
using vec = TypeParam;
using VT = ValueType<TypeParam>;
CACHE_ALIGN VT test_vals[vec::size()];
CACHE_ALIGN VT expected_vals[vec::size()];
auto vals = 1 << (vec::size());
for (int val = 0; val < vals; ++val) {
for (int i = 0; i < vec::size(); ++i) {
if (val & (1 << i)) {
test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
// All bits are set to 1 if true, otherwise 0.
// same rule as at::Vec256<T>::binary_pred.
std::memset(static_cast<void*>(&expected_vals[i]), 0xFF, sizeof(VT));
} else {
test_vals[i] = (VT)0.123;
std::memset(static_cast<void*>(&expected_vals[i]), 0, sizeof(VT));
}
}
vec actual = vec::loadu(test_vals).isnan();
vec expected = vec::loadu(expected_vals);
AssertVec256<vec>(NAME_INFO(isnan), expected, actual).check();
}
}
TYPED_TEST(LGamma, LGamma) {
using vec = TypeParam;
using UVT = UvalueType<vec>;

View File

@ -32,8 +32,7 @@ static void FusedOverhead(benchmark::State& state) {
}
static void UnfusedOverhead(benchmark::State& state) {
torch::NoGradGuard ng;
torch::AutoNonVariableTypeMode nv;
c10::InferenceMode guard;
overrideCanFuseOnCPU(false);
Module m("m");

View File

@ -244,7 +244,7 @@ if __name__ == '__main__':
vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
if args.print_json:
print_stderr = lambda *args, **kwargs: None # noqa
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr(args)
bench_args = copy.deepcopy(vars(args))

View File

@ -239,7 +239,6 @@ def varlen_lstm_factory(cell, script):
def dynamic_rnn(sequences: List[Tensor], hiddens: Tuple[Tensor, Tensor], wih: Tensor,
whh: Tensor, bih: Tensor, bhh: Tensor
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
# noqa
hx, cx = hiddens
hxs = hx.unbind(1)
cxs = cx.unbind(1)

View File

@ -1,11 +1,11 @@
import operator_benchmark as op_bench
from pt import ( # noqa
add_test, as_strided_test, batchnorm_test, binary_test, cat_test, # noqa
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test, # noqa
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test, # noqa
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test, # noqa
groupnorm_test, interpolate_test, instancenorm_test, remainder_test, softmax_test, # noqa
split_test, sum_test, tensor_to_test # noqa
from pt import ( # noqa: F401
add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
split_test, sum_test, tensor_to_test
)
if __name__ == "__main__":

View File

@ -1,5 +1,5 @@
import operator_benchmark as op_bench
from pt import ( # noqa
from pt import ( # noqa: F401
qactivation_test,
qarithmetic_test,
qbatchnorm_test,

View File

@ -1,9 +1,9 @@
import operator_benchmark as op_bench
from pt import ( # noqa
unary_test, # noqa
from pt import ( # noqa: F401
unary_test,
)
import benchmark_all_other_test # noqa
import benchmark_all_quantized_test # noqa
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -7,7 +7,7 @@ import copy
import ast
# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa
import torch.utils.cpp_extension as cpp_extension # noqa: F401
import benchmark_utils
from collections import namedtuple

View File

@ -1,7 +1,7 @@
import time
import json
import torch
import cpp_extension # noqa
import cpp_extension # noqa: F401
"""PyTorch performance microbenchmarks.

View File

@ -1,6 +1,6 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import numpy

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core, dyndep
dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op")

View File

@ -1,7 +1,7 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
import random
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
@ -17,9 +17,9 @@ cross_product_configs = {
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attrs=[
[(1, 1, 1), 2, 0], # noqa
[(512, 512, 2), 2, 1], # noqa
[(128, 1024, 2), 2, 1], # noqa
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
@ -29,14 +29,14 @@ cat_configs_short = op_bench.config_list(
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'add_axis'],
attrs=[
[(1, 40), 5, 1, 1], # noqa
[[(1, 160), (1, 14)], -1, 1, 0], # noqa
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0], # noqa
[[(1, 580), (1, 174)], -1, 1, 0], # noqa
[(20, 40), 5, 1, 1], # noqa
[[(20, 160), (20, 14)], -1, 1, 0], # noqa
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0], # noqa
[[(20, 580), (20, 174)], -1, 1, 0], # noqa
[(1, 40), 5, 1, 1],
[[(1, 160), (1, 14)], -1, 1, 0],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0],
[[(1, 580), (1, 174)], -1, 1, 0],
[(20, 40), 5, 1, 1],
[[(20, 160), (20, 14)], -1, 1, 0],
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0],
[[(20, 580), (20, 174)], -1, 1, 0],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
@ -45,22 +45,22 @@ cat_configs_static_runtime = op_bench.config_list(
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
[(2**10, 2**10, 2), 2, 2], # noqa
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
],
cross_product_configs=cross_product_configs,
@ -71,9 +71,9 @@ cat_configs_long = op_bench.config_list(
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'dtype'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],

View File

@ -1,7 +1,7 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
"""Microbenchmarks for MatMul operator"""

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,5 +1,5 @@
# TODO (mingzhe09088): get rid of noqa
import benchmark_runner # noqa
from benchmark_pytorch import TorchBenchmarkBase # noqa
from benchmark_test_generator import * # noqa
from benchmark_utils import * # noqa
import benchmark_runner # noqa: F401
from benchmark_pytorch import TorchBenchmarkBase # noqa: F401
from benchmark_test_generator import * # noqa: F401,F403
from benchmark_utils import * # noqa: F401,F403

View File

@ -14,9 +14,9 @@ cross_product_configs = {
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(1, 1, 1), 2, 0], # noqa
[(512, 512, 2), 2, 1], # noqa
[(128, 1024, 2), 2, 1], # noqa
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
@ -26,12 +26,12 @@ cat_configs_short = op_bench.config_list(
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[[(1, 160), (1, 14)], -1, 1], # noqa
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1], # noqa
[[(1, 580), (1, 174)], -1, 1], # noqa
[[(20, 160), (20, 14)], -1, 1], # noqa
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1], # noqa
[[(20, 580), (20, 174)], -1, 1], # noqa
[[(1, 160), (1, 14)], -1, 1],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1],
[[(1, 580), (1, 174)], -1, 1],
[[(20, 160), (20, 14)], -1, 1],
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1],
[[(20, 580), (20, 174)], -1, 1],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
@ -40,22 +40,22 @@ cat_configs_static_runtime = op_bench.config_list(
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
[(2**10, 2**10, 2), 2, 2], # noqa
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
],
cross_product_configs=cross_product_configs,
@ -66,9 +66,9 @@ cat_configs_long = op_bench.config_list(
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],

View File

@ -8,17 +8,17 @@ r"""Microbenchmarks for the quantized activations."""
qactivation_long_configs = op_bench.cross_product_configs(
dims=(
# VGG-16 relu's with original shape: (-1, 3, 224, 224)
( 64, 224, 224), # ReLU-1 # noqa
(128, 112, 112), # ReLU-6 # noqa
(256, 56, 56), # ReLU-11 # noqa
(512, 28, 28), # ReLU-18 # noqa
(512, 14, 14), # ReLU-25 # noqa
( 64, 224, 224), # ReLU-1 # noqa: E201
(128, 112, 112), # ReLU-6
(256, 56, 56), # ReLU-11 # noqa: E241
(512, 28, 28), # ReLU-18 # noqa: E241
(512, 14, 14), # ReLU-25 # noqa: E241
# Batch = 16
(16, 64, 224, 224), # ReLU-1 # noqa
(16, 128, 112, 112), # ReLU-6 # noqa
(16, 256, 56, 56), # ReLU-11 # noqa
(16, 512, 28, 28), # ReLU-18 # noqa
(16, 512, 14, 14), # ReLU-25 # noqa
(16, 64, 224, 224), # ReLU-1 # noqa: E241
(16, 128, 112, 112), # ReLU-6
(16, 256, 56, 56), # ReLU-11 # noqa: E241
(16, 512, 28, 28), # ReLU-18 # noqa: E241
(16, 512, 14, 14), # ReLU-25 # noqa: E241
),
contig=(False, True),
inplace=(False, True),

View File

@ -43,7 +43,7 @@ qobserver_per_tensor_configs_short = op_bench.config_list(
cross_product_configs={
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
},
**qobserver_short_configs_dict, # noqa
**qobserver_short_configs_dict,
)
qobserver_per_tensor_configs_long = op_bench.cross_product_configs(
@ -67,7 +67,7 @@ q_hist_observer_per_tensor_configs_short = op_bench.config_list(
cross_product_configs={
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
},
**q_hist_observer_short_configs_dict, # noqa
**q_hist_observer_short_configs_dict,
)
q_hist_observer_per_tensor_configs_long = op_bench.cross_product_configs(

View File

@ -6,11 +6,11 @@ import operator_benchmark as op_bench
qpool2d_long_configs = op_bench.config_list(
attrs=(
# C H W k s p
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa: E201,E241
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa: E201,E241
# VGG16 pools with original input shape: (-1, 3, 224, 224)
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa: E201
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa: E241
),
attr_names=('C', 'H', 'W', # Input layout
'k', 's', 'p'), # Pooling parameters
@ -23,7 +23,7 @@ qpool2d_long_configs = op_bench.config_list(
)
qpool2d_short_configs = op_bench.config_list(
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy # noqa
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy
attr_names=('C', 'H', 'W', # Input layout
'k', 's', 'p'), # Pooling parameters
cross_product_configs={
@ -37,15 +37,15 @@ qpool2d_short_configs = op_bench.config_list(
qadaptive_avgpool2d_long_configs = op_bench.cross_product_configs(
input_size=(
# VGG16 pools with original input shape: (-1, 3, 224, 224)
(112, 112), # MaxPool2d-9 # noqa
(112, 112), # MaxPool2d-9
),
output_size=(
(448, 448),
# VGG16 pools with original input shape: (-1, 3, 224, 224)
(224, 224), # MaxPool2d-4 # noqa
(112, 112), # MaxPool2d-9 # noqa
( 56, 56), # MaxPool2d-16 # noqa
( 14, 14), # MaxPool2d-30 # noqa
(224, 224), # MaxPool2d-4
(112, 112), # MaxPool2d-9
( 56, 56), # MaxPool2d-16 # noqa: E201,E241
( 14, 14), # MaxPool2d-30 # noqa: E201,E241
),
N=(1, 4),
C=(1, 3, 64, 128),

View File

@ -10,8 +10,8 @@ from typing import List
stack_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(20, 40), 5], # noqa
[(1, 40), 5], # noqa
[(20, 40), 5],
[(1, 40), 5],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -23,9 +23,9 @@ stack_configs_static_runtime = op_bench.config_list(
stack_configs_short = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(1, 1, 1), 2], # noqa
[(512, 512, 2), 2], # noqa
[(128, 1024, 2), 2], # noqa
[(1, 1, 1), 2], # noqa: E241
[(512, 512, 2), 2], # noqa: E241
[(128, 1024, 2), 2], # noqa: E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -37,9 +37,9 @@ stack_configs_short = op_bench.config_list(
stack_configs_long = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(2**10, 2**10, 2), 2], # noqa
[(2**10+1, 2**10-1, 2), 2], # noqa
[(2**10, 2**10, 2), 2], # noqa
[(2**10, 2**10, 2), 2], # noqa: E241
[(2**10+1, 2**10-1, 2), 2], # noqa: E226,E241
[(2**10, 2**10, 2), 2], # noqa: E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -52,9 +52,9 @@ stack_configs_long = op_bench.config_list(
stack_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa: E226,E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],

View File

@ -1,6 +1,6 @@
import unittest
import cpp_extension # noqa
import cpp_extension # noqa: F401
import torch

View File

@ -86,7 +86,7 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::QuantizedCPU;
} else if (t == DispatchKey::QuantizedCUDA) {
return Backend::QuantizedCUDA;
} else if (t == DispatchKey::XPU) {
} else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) {
return Backend::XPU;
} else if (t == DispatchKey::SparseXPU) {
return Backend::SparseXPU;

View File

@ -146,6 +146,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKey::AutogradCPU;
case DispatchKey::XPU:
return DispatchKey::AutogradXPU;
case DispatchKey::CUDA:
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:

View File

@ -19,6 +19,10 @@
#include <execinfo.h>
#endif
#ifdef FBCODE_CAFFE2
#include <common/process/StackTrace.h>
#endif
namespace c10 {
#if SUPPORTS_BACKTRACE
@ -167,7 +171,14 @@ std::string get_backtrace(
size_t frames_to_skip,
size_t maximum_number_of_frames,
bool skip_python_frames) {
#if SUPPORTS_BACKTRACE
#ifdef FBCODE_CAFFE2
// For some reason, the stacktrace implementation in fbcode is
// better than ours, see https://github.com/pytorch/pytorch/issues/56399
// When it's available, just use that.
facebook::process::StackTrace st;
return st.toString();
#elif SUPPORTS_BACKTRACE
// We always skip this frame (backtrace).
frames_to_skip += 1;

View File

@ -209,7 +209,6 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
float weight_sum = 0.0;
if (!label_prob_mode_) {
const int* label_data = T.data<int>();
const float* Xdata = X.data<float>();
for (int i = 0; i < N; ++i) {
CAFFE_ENFORCE(

View File

@ -26,6 +26,15 @@ C10_EXPORT std::string DeviceTypeName(const int32_t& d) {
return at::DeviceTypeName(static_cast<at::DeviceType>(d));
}
void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream, int bytes_limit, int warning_threshold) {
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
stream.SetTotalBytesLimit(bytes_limit);
#else
stream.SetTotalBytesLimit(bytes_limit, warning_threshold);
#endif
}
C10_EXPORT int DeviceId(const DeviceOption& option) {
switch (option.device_type()) {
case PROTO_CPU:
@ -136,7 +145,7 @@ C10_EXPORT bool ParseProtoFromLargeString(
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -149,7 +158,7 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
// Total bytes hard limit / warning limit are set to 2GB and 512MB
// respectively.
::google::protobuf::io::CodedInputStream coded_stream(&stream);
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -200,7 +209,7 @@ C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) {
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -244,7 +253,13 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
std::unique_ptr<CodedInputStream> coded_input(
new CodedInputStream(raw_input.get()));
// A hack to manually allow using very large protocol buffers.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
coded_input->SetTotalBytesLimit(2147483647);
#else
// Total bytes hard limit / warning limit are set to 2GB and 512MB respectively.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#endif
bool success = proto->ParseFromCodedStream(coded_input.get());
coded_input.reset();
raw_input.reset();

View File

@ -1,5 +1,6 @@
#include "caffe2/utils/signal_handler.h"
#include "caffe2/core/logging.h"
#include <c10/util/Backtrace.h>
#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
@ -149,63 +150,11 @@ const char* getSignalName(int signum) {
return nullptr;
}
_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
auto& pcs = *reinterpret_cast<std::vector<uintptr_t>*>(userInfo);
pcs.push_back(_Unwind_GetIP(context));
return _URC_NO_REASON;
}
std::vector<uintptr_t> getBacktrace() {
std::vector<uintptr_t> pcs;
_Unwind_Backtrace(unwinder, &pcs);
return pcs;
}
void printBlobSizes() {
::caffe2::Workspace::ForEach(
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
}
void printStacktrace() {
std::vector<uintptr_t> pcs = getBacktrace();
Dl_info info;
size_t i = 0;
for (uintptr_t pcAddr : pcs) {
const void* pc = reinterpret_cast<const void*>(pcAddr);
const char* path = nullptr;
const char* name = "???";
char* demangled = nullptr;
int offset = -1;
std::cerr << "[" << i << "] ";
if (dladdr(pc, &info)) {
path = info.dli_fname;
name = info.dli_sname ?: "???";
offset = reinterpret_cast<uintptr_t>(pc) -
reinterpret_cast<uintptr_t>(info.dli_saddr);
int status;
demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
if (status == 0) {
name = demangled;
}
}
std::cerr << name;
if (offset >= 0) {
std::cerr << "+" << reinterpret_cast<void*>(offset);
}
std::cerr << "(" << pc << ")";
if (path) {
std::cerr << " in " << path;
}
std::cerr << std::endl;
if (demangled) {
free(demangled);
}
i += 1;
}
}
void callPreviousSignalHandler(
struct sigaction* action,
int signum,
@ -229,7 +178,7 @@ void stacktraceSignalHandler(bool needsLock) {
pid_t tid = syscall(SYS_gettid);
std::cerr << fatalSignalName << "(" << fatalSignum << "), PID: " << ::getpid()
<< ", Thread " << tid << ": " << std::endl;
printStacktrace();
std::cerr << c10::get_backtrace();
std::cerr << std::endl;
if (needsLock) {
pthread_mutex_unlock(&writingMutex);

View File

@ -18,7 +18,7 @@ Features described in this documentation are classified by release status:
breaking changes can happen and notice will be given one release ahead
of time).
*Beta:* Features are tagged as Beta because the API may change based on
*Beta:* These features are tagged as Beta because the API may change based on
user feedback, because the performance needs to improve, or because
coverage across operators is not yet complete. For Beta features, we are
committing to seeing the feature through to the Stable classification.

View File

@ -19,7 +19,6 @@ TorchScript
.. toctree::
:maxdepth: 1
:caption: Language Reference
jit_language_reference_v2

View File

@ -39,7 +39,7 @@ files =
benchmarks/instruction_counts,
tools/autograd/*.py,
tools/clang_tidy.py,
tools/codegen/*.py,
tools/codegen,
tools/extract_scripts.py,
tools/mypy_wrapper.py,
tools/print_test_stats.py,

View File

@ -697,7 +697,7 @@ class TestBenchmarkUtils(TestCase):
2000 /usr/include/c++/8/bits/atomic_base.h:at::Tensor at::detail::make_tensor ... t_null_type<c10::StorageImpl> >&&, c10::DispatchKey&&, caffe2::TypeMeta&)
2000 /usr/include/c++/8/array:at::Tensor& c10::Dispatcher::callWithDispatchKe ... , c10::Scalar)> const&, c10::DispatchKey, at::Tensor&, c10::Scalar) const
Total: 8869966""" # noqa
Total: 8869966""" # noqa: B950
)
self.regularizeAndAssertExpectedInline(
@ -935,7 +935,7 @@ class TestBenchmarkUtils(TestCase):
compute_optimized | \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m | 4.0 | 11 | \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m | 2100
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m
Times are in microseconds (us).""" # noqa
Times are in microseconds (us).""" # noqa: B950
)
compare.colorize(rowwise=True)
@ -949,7 +949,7 @@ class TestBenchmarkUtils(TestCase):
compute_optimized | \x1b[92m\x1b[1m 3 \x1b[0m\x1b[0m | 4.0 | \x1b[2m\x1b[91m 11 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 1700 \x1b[0m\x1b[0m
Times are in microseconds (us).""" # noqa
Times are in microseconds (us).""" # noqa: B950
)
def print_new_expected(s: str) -> None:

View File

@ -61,13 +61,14 @@ TEST(GradModeTest, TestRequiresGradViewOpExiting) {
if (requires_grad) {
ASSERT_THROWS_WITH(view_out.mul_(2), // go through kernels: VariableType, InplaceOrView, CPU
"A view was created in no_grad mode and is being modified inplace")
"a leaf Variable that requires grad is being used in an in-place operation")
} else {
view_out.mul_(2);
}
tmp = view_out.view({2, 3});
ASSERT_EQ(tmp.requires_grad(), requires_grad);
assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
// TODO: update when above error is fixed
// assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
}
}
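A short libtorch sketch of the scenario this test exercises: a view created under NoGradGuard and then modified in place after grad mode is restored. The exact error text depends on the PyTorch build, which is what the changed expectation above reflects.

#include <torch/torch.h>
#include <iostream>

int main() {
  auto base = torch::ones({2, 3}, torch::requires_grad());
  torch::Tensor view_out;
  {
    torch::NoGradGuard no_grad;   // gradients disabled in this scope
    view_out = base.view({6});    // view created in no_grad mode
  }
  try {
    view_out.mul_(2);             // in-place update of that view
  } catch (const std::exception& e) {
    std::cerr << e.what() << std::endl;  // message varies across versions
  }
  return 0;
}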

View File

@ -424,7 +424,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));
tensor = torch::tensor({1.5f, 2.25f, 3.125f});
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);
@ -433,7 +432,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125f));
tensor = torch::tensor(at::ArrayRef<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.dtype(), default_dtype);
ASSERT_TRUE(almost_equal(tensor[0], 1.5));
@ -441,7 +439,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));
tensor = torch::tensor(std::vector<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);

View File

@ -46,6 +46,7 @@ TEST(TensorpipeSerialize, Base) {
tensorpipe::Descriptor::Tensor t;
t.length = tpTensor.length;
t.sourceDevice = tpTensor.buffer.device();
t.targetDevice = tpTensor.targetDevice;
t.metadata = tpTensor.metadata;
recvingTpDescriptor.tensors.push_back(std::move(t));
}

View File

@ -4324,7 +4324,8 @@ TEST(LoopNest, fuseLoopsSimple) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
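All of the fuseLoops hunks in this file follow the same pattern: the old API returned the fused For* and threw on invalid input, while the new one reports success through a bool and writes the result into an out-parameter. A hedged sketch of the new calling convention, reusing the forJ/forK loops built in the test above (headers and loop construction are assumed, not a complete program):

// Thin wrapper showing the new bool-plus-out-parameter convention.
using namespace torch::jit::tensorexpr;

static For* fuseOrNull(For* forJ, For* forK) {
  For* fused_loop = nullptr;
  if (!LoopNest::fuseLoops({forJ, forK}, &fused_loop)) {
    // Rejected: different start/stop bounds, non-contiguous loops, loops
    // with different parents, or a fusion that would create a loop-carried
    // dependence. The original loops are left in place.
    return nullptr;
  }
  // fused_loop is the single loop that now replaces forJ and forK.
  return fused_loop;
}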
@ -4364,7 +4365,8 @@ TEST(LoopNest, fuseLoopsMultiple) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
auto par = Block::make({forI, forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forI, forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4421,7 +4423,8 @@ TEST(LoopNest, fuseLoopsNested) {
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
auto par = Block::make({forM, forN});
auto fused_loop = LoopNest::fuseLoops({forM, forN});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4481,7 +4484,8 @@ TEST(LoopNest, fuseLoopsNested2D) {
50,
Store::make(b_buf, {m, n}, Add::make(m, Mul::make(n, 100)))));
auto par = Block::make({forI, forM});
auto fused_loop = LoopNest::fuseLoops({forI, forM});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4522,7 +4526,8 @@ TEST(LoopNest, fuseLoopsNested2DInner) {
auto forN = For::make(
n, 0, 100, Store::make(b_buf, {i, n}, Add::make(i, Mul::make(n, 100))));
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
auto fused_loop = LoopNest::fuseLoops({forJ, forN});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
std::ostringstream oss;
oss << *forI;
@ -4557,8 +4562,8 @@ TEST(LoopNest, fuseLoopsDifferentStopBounds) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 50, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Loops with different stop bounds");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsDifferentStartBounds) {
@ -4578,8 +4583,8 @@ TEST(LoopNest, fuseLoopsDifferentStartBounds) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Loops with different start bounds");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsNotContiguous) {
@ -4601,8 +4606,8 @@ TEST(LoopNest, fuseLoopsNotContiguous) {
auto initB = Store::make(b_buf, {0}, 0);
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, initB, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Only contiguous loops can be fused");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsWithDifferentParents) {
@ -4628,8 +4633,8 @@ TEST(LoopNest, fuseLoopsWithDifferentParents) {
auto initB = Store::make(b_buf, {0}, 0);
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forI, initB, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "loops with different parents");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsWithVariableBounds) {
@ -4650,7 +4655,8 @@ TEST(LoopNest, fuseLoopsWithVariableBounds) {
auto forJ = For::make(j, 0, N, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4686,7 +4692,8 @@ TEST(LoopNest, fuseLoopsWithExprBounds) {
auto forJ = For::make(j, 0, M + N, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, M + N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4722,7 +4729,8 @@ TEST(LoopNest, fuseLoopsWithDifferentExprBounds) {
auto forJ = For::make(j, M, N * 2, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, M, N + N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4757,7 +4765,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) {
For::make(k, 10, 100, Store::make(a_buf, {k + 100}, Mul::make(30, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4803,7 +4812,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) {
auto forM = For::make(m, 0, 20, forN);
auto par = Block::make({forI, forM});
auto fused_loop = LoopNest::fuseLoops({forI, forM});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4839,9 +4849,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies1) {
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k - 1}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
@ -4861,9 +4870,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k + 50}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
@ -4905,9 +4913,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
auto par = Block::make({forM, forN});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forM, forN}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
@ -4948,9 +4955,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
50,
Store::make(a_buf, {m + 1, n}, Add::make(m, Mul::make(n, 100)))));
auto par = Block::make({forI, forM});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forI, forM}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
@ -4977,9 +4983,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
100,
Store::make(a_buf, {i, n + 1}, Add::make(i, Mul::make(n, 100))));
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forN}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
@ -5004,9 +5009,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
Store::make(
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
@ -5031,9 +5035,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto par = Block::make({forK, forJ});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forK, forJ}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forK, forJ}, &fused_loop));
}
TEST(LoopNest, areLoopsPerfectlyNested) {

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
def main():
print("Success, smoke test")
if __name__ == "__main__":
main()

View File

@ -32,7 +32,7 @@ def test_skippable_repr():
def forward(self, x):
yield stash("hello", x)
return self.conv(x) # noqa
return self.conv(x) # noqa: B901
m = Hello()
assert (

View File

@ -30,7 +30,7 @@ def test_1to3(balance, checkpoint, setup_rpc):
def forward(self, input):
yield stash("1to3", input)
output = self.conv(input)
return output # noqa
return output # noqa: B901
class Layer2(nn.Module):
def __init__(self):
@ -73,7 +73,7 @@ def test_none_skip(setup_rpc):
class Stash(nn.Module):
def forward(self, input):
yield stash("none", None)
return input # noqa
return input # noqa: B901
@skippable(pop=["none"])
class Pop(nn.Module):

View File

@ -19,7 +19,7 @@ class Pass(nn.Module):
class StashFoo(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["foo"])
@ -33,7 +33,7 @@ class PopFoo(nn.Module):
class StashBar(nn.Module):
def forward(self, input):
yield stash("bar", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["bar"])

View File

@ -17,7 +17,7 @@ from torch.distributed.pipeline.sync.skip.tracker import current_skip_tracker
class Stash(nn.Module):
def forward(self, input):
yield stash("skip", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["skip"])

View File

@ -24,7 +24,7 @@ def test_stash(skip_tracker):
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
@ -41,13 +41,13 @@ def test_pop():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
return foo
l1 = Stash()
l2 = Pop()
@ -83,7 +83,7 @@ def test_stash_not_declared():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
@ -96,13 +96,13 @@ def test_pop_not_declared():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable()
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
return foo
l1 = Stash()
l2 = Pop()
@ -130,7 +130,7 @@ def test_stash_none():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", None)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
l1(torch.tensor(42))

View File

@ -40,7 +40,7 @@ def test_default_skip_tracker_by_data_parallel():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable(pop=["foo"])
class Pop(nn.Module):

View File

@ -12,7 +12,7 @@ def test_clock_cycles():
assert list(_clock_cycles(1, 3)) == [[(0, 0)], [(0, 1)], [(0, 2)]]
assert list(_clock_cycles(3, 1)) == [[(0, 0)], [(1, 0)], [(2, 0)]]
assert list(_clock_cycles(3, 3)) == [ # noqa
assert list(_clock_cycles(3, 3)) == [
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1), (0, 2)],
@ -20,7 +20,7 @@ def test_clock_cycles():
[(2, 2)],
]
assert list(_clock_cycles(4, 2)) == [ # noqa
assert list(_clock_cycles(4, 2)) == [
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1)],

View File

@ -49,6 +49,7 @@ from torch.testing._internal.common_distributed import (
create_device,
with_dist_debug_levels,
with_nccl_blocking_wait,
create_tcp_store,
)
from torch.testing._internal.common_utils import (
TestCase,
@ -299,27 +300,9 @@ class PrefixFileStoreTest(TestCase, StoreTestBase):
def _create_store(self):
return c10d.PrefixStore(self.prefix, self.filestore)
def create_tcp_store(addr, world_size=1, wait_for_workers=True):
"""
Creates a TCP store. Retries if the chosen port is already in use.
"""
ports = []
for _ in range(10):
try:
port = common.find_free_port()
ports.append(port)
return c10d.TCPStore(addr, port, world_size, True, wait_for_workers=wait_for_workers)
except RuntimeError as error:
if str(error) == "Address already in use":
continue
raise
raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports))
class TCPStoreTest(TestCase, StoreTestBase):
def _create_store(self):
store = create_tcp_store("localhost")
store = create_tcp_store()
store.set_timeout(timedelta(seconds=300))
return store
@ -329,7 +312,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
else:
err_msg_reg = "^Address already in use$"
with self.assertRaisesRegex(RuntimeError, err_msg_reg):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()
# Use noqa to silence flake8.
@ -418,7 +401,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
class PrefixTCPStoreTest(TestCase, StoreTestBase):
def setUp(self):
super(PrefixTCPStoreTest, self).setUp()
self.tcpstore = create_tcp_store("localhost")
self.tcpstore = create_tcp_store()
self.prefix = "test_prefix"
self.tcpstore.set_timeout(timedelta(seconds=300))
@ -652,7 +635,7 @@ class RendezvousFileTest(TestCase):
@skip_if_win32()
class RendezvousTCPTest(TestCase):
def create_tcp_url(self):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()
url = "tcp://%s:%d?world_size=%d" % (addr, port, 1)
return url

Some files were not shown because too many files have changed in this diff.