Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 12:15:03 +08:00)

Compare commits: annotation...v1.9.1-rc1 (68 commits)
| SHA1 |
|---|
| e2cb357367 |
| eccdfffa95 |
| 3c81d48e29 |
| 61e9e88c30 |
| 04dd41d7cb |
| 9cc96ad811 |
| 7a9f23c030 |
| 3a2876d98e |
| 81efb9fd1a |
| 568bc668e5 |
| 088ca37c5c |
| 544c8bd791 |
| 5057e20247 |
| 81ddd719b4 |
| dce3a4047f |
| 4d7401c2f7 |
| e7cd59c7a0 |
| 1a7c23c4c0 |
| d69c22dd61 |
| 4ad4f6db7f |
| 90e67738b1 |
| 43c581aa62 |
| bc446f6a54 |
| abe996a7fb |
| 795df76568 |
| 3b9cd08901 |
| 226c274f70 |
| ce24cab257 |
| d98d113810 |
| 17a44c2bb5 |
| 26e6fa380e |
| bf16699cc8 |
| 6d4fe05502 |
| b046542f8a |
| 5d57b9392c |
| f6a9351776 |
| 3071601491 |
| d417a094f3 |
| 1fdbbc96ae |
| e761f16ad5 |
| 0544a765d3 |
| 1ea8ae5d93 |
| 97ca7303b0 |
| ac94547143 |
| e2027acebe |
| 0896c6b1f0 |
| 43f6675363 |
| 450f5c6f4d |
| 310e528a0d |
| e4161d0b2b |
| 016dc8cb68 |
| a3ea5cee52 |
| dfc58f4faa |
| b5e2635281 |
| 9dfd2e7b56 |
| f0bdbb4ce1 |
| bc4471c8c9 |
| 317fd72526 |
| b8d36033f0 |
| 47507259b9 |
| e77e8d52da |
| b9fb6d1c7e |
| 1ea310bc8e |
| 8e6b8d8d46 |
| 87c46a5e32 |
| 5092364d78 |
| 085a3bcb77 |
| 5f0bbb38ec |
| @ -97,13 +97,13 @@ WORKFLOW_DATA = [ | ||||
|         is_master_only=False, | ||||
|         is_pr_only=True), | ||||
|     AndroidGradleJob( | ||||
|         "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single_lite_interpreter", | ||||
|         "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", | ||||
|         "pytorch_android_gradle_custom_build_single", | ||||
|         [DOCKER_REQUIREMENT_NDK], | ||||
|         is_master_only=False, | ||||
|         is_pr_only=True, | ||||
|         extra_props=tuple({ | ||||
|             "lite_interpreter": miniutils.quote(str(int(True))) | ||||
|             "lite_interpreter": miniutils.quote(str(int(False))) | ||||
|         }.items())), | ||||
|     AndroidGradleJob( | ||||
|         "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build", | ||||
|  | ||||
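The hunk above flips the PR-only custom-build-single Android job so that the lite-interpreter build is the default and the renamed variant builds the full JIT, passing `lite_interpreter` to the template as a quoted "0"/"1" string. A minimal sketch of that conversion, assuming `miniutils.quote` simply wraps its argument in double quotes (a hypothetical stand-in, not the real helper):

```python
# Sketch only: quote() below is an assumed stand-in for miniutils.quote.
def quote(s: str) -> str:
    return f'"{s}"'

def lite_interpreter_prop(enabled: bool) -> dict:
    # str(int(True)) -> "1", str(int(False)) -> "0", then quoted for the YAML template
    return {"lite_interpreter": quote(str(int(enabled)))}

print(lite_interpreter_prop(True))   # {'lite_interpreter': '"1"'}
print(lite_interpreter_prop(False))  # {'lite_interpreter': '"0"'}
```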
| @ -61,14 +61,20 @@ class IOSJob: | ||||
|  | ||||
|  | ||||
| WORKFLOW_DATA = [ | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("x86_64"), is_org_member_context=False), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("x86_64", "lite_interpreter"), is_org_member_context=False, extra_props={ | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("x86_64"), is_org_member_context=False, extra_props={ | ||||
|         "lite_interpreter": miniutils.quote(str(int(True)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64")), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "metal"), extra_props={"use_metal": miniutils.quote(str(int(True)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "lite_interpreter"), extra_props={ | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("x86_64", "full_jit"), is_org_member_context=False, extra_props={ | ||||
|         "lite_interpreter": miniutils.quote(str(int(False)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64"), extra_props={ | ||||
|         "lite_interpreter": miniutils.quote(str(int(True)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "metal"), extra_props={ | ||||
|         "use_metal": miniutils.quote(str(int(True))), | ||||
|         "lite_interpreter": miniutils.quote(str(int(True)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "full_jit"), extra_props={ | ||||
|         "lite_interpreter": miniutils.quote(str(int(False)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "custom"), extra_props={ | ||||
|         "op_list": "mobilenetv2.yaml", | ||||
|         "lite_interpreter": miniutils.quote(str(int(True)))}), | ||||
|     IOSJob(XCODE_VERSION, ArchVariant("arm64", "custom"), extra_props={"op_list": "mobilenetv2.yaml"}), | ||||
| ] | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -297,7 +297,7 @@ pytorch_android_params: &pytorch_android_params | ||||
|       default: "" | ||||
|     lite_interpreter: | ||||
|       type: string | ||||
|       default: "0" | ||||
|       default: "1" | ||||
|   environment: | ||||
|     BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single | ||||
|     DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" | ||||
| @ -324,7 +324,7 @@ pytorch_ios_params: &pytorch_ios_params | ||||
|       default: "0" | ||||
|     lite_interpreter: | ||||
|       type: string | ||||
|       default: "0" | ||||
|       default: "1" | ||||
|   environment: | ||||
|     BUILD_ENVIRONMENT: << parameters.build_environment >> | ||||
|     IOS_ARCH: << parameters.ios_arch >> | ||||
| @ -1759,8 +1759,8 @@ jobs: | ||||
|           no_output_timeout: "30m" | ||||
|           command: | | ||||
|             set -e | ||||
|             if [ ${BUILD_LITE_INTERPRETER} == 1 ]; then | ||||
|               echo "Run Build Test is not for BUILD_LITE_INTERPRETER, skipping." | ||||
|             if [ ${BUILD_LITE_INTERPRETER} == 0 ]; then | ||||
|               echo "Run Build Test is not for full jit, skipping." | ||||
|               exit 0 | ||||
|             fi | ||||
|             PROJ_ROOT=/Users/distiller/project | ||||
| @ -1788,8 +1788,8 @@ jobs: | ||||
|             if [ ${IOS_PLATFORM} != "SIMULATOR" ]; then | ||||
|               echo "not SIMULATOR build, skip it." | ||||
|               exit 0 | ||||
|             elif [ ${BUILD_LITE_INTERPRETER} == 1 ]; then | ||||
|               echo "Run Simulator Tests is not for BUILD_LITE_INTERPRETER, skipping." | ||||
|             elif [ ${BUILD_LITE_INTERPRETER} == 0 ]; then | ||||
|               echo "Run Simulator Tests is not for full jit, skipping." | ||||
|               exit 0 | ||||
|             fi | ||||
|             WORKSPACE=/Users/distiller/workspace | ||||
| @ -6534,8 +6534,8 @@ workflows: | ||||
|               only: | ||||
|                 - /gh\/.*\/head/ | ||||
|                 - /pull\/.*/ | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single_lite_interpreter | ||||
|           lite_interpreter: "0" | ||||
|           name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit | ||||
|           requires: | ||||
|             - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c | ||||
|       - pytorch_android_gradle_build: | ||||
| @ -6555,38 +6555,42 @@ workflows: | ||||
|           build_environment: pytorch-ios-12.0.0-x86_64_build | ||||
|           ios_arch: x86_64 | ||||
|           ios_platform: SIMULATOR | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_x86_64_build | ||||
|       - pytorch_ios_build: | ||||
|           build_environment: pytorch-ios-12.0.0-x86_64_lite_interpreter_build | ||||
|           build_environment: pytorch-ios-12.0.0-x86_64_full_jit_build | ||||
|           ios_arch: x86_64 | ||||
|           ios_platform: SIMULATOR | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_x86_64_lite_interpreter_build | ||||
|           lite_interpreter: "0" | ||||
|           name: pytorch_ios_12_0_0_x86_64_full_jit_build | ||||
|       - pytorch_ios_build: | ||||
|           build_environment: pytorch-ios-12.0.0-arm64_build | ||||
|           context: org-member | ||||
|           ios_arch: arm64 | ||||
|           ios_platform: OS | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_arm64_build | ||||
|       - pytorch_ios_build: | ||||
|           build_environment: pytorch-ios-12.0.0-arm64_metal_build | ||||
|           context: org-member | ||||
|           ios_arch: arm64 | ||||
|           ios_platform: OS | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_arm64_metal_build | ||||
|           use_metal: "1" | ||||
|       - pytorch_ios_build: | ||||
|           build_environment: pytorch-ios-12.0.0-arm64_lite_interpreter_build | ||||
|           build_environment: pytorch-ios-12.0.0-arm64_full_jit_build | ||||
|           context: org-member | ||||
|           ios_arch: arm64 | ||||
|           ios_platform: OS | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_arm64_lite_interpreter_build | ||||
|           lite_interpreter: "0" | ||||
|           name: pytorch_ios_12_0_0_arm64_full_jit_build | ||||
|       - pytorch_ios_build: | ||||
|           build_environment: pytorch-ios-12.0.0-arm64_custom_build | ||||
|           context: org-member | ||||
|           ios_arch: arm64 | ||||
|           ios_platform: OS | ||||
|           lite_interpreter: "1" | ||||
|           name: pytorch_ios_12_0_0_arm64_custom_build | ||||
|           op_list: mobilenetv2.yaml | ||||
|       - pytorch_linux_build: | ||||
|  | ||||
| @ -88,6 +88,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-py3.6-gcc7.2) | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
| @ -100,6 +101,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7) | ||||
|     CUDA_VERSION=10.0 | ||||
| @ -109,6 +111,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7) | ||||
|     CUDA_VERSION=10.1 | ||||
| @ -119,6 +122,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7) | ||||
|     CUDA_VERSION=10.2 | ||||
| @ -129,6 +133,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7) | ||||
|     CUDA_VERSION=11.1 | ||||
| @ -139,6 +144,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) | ||||
|     CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names | ||||
| @ -149,6 +155,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-py3-clang5-asan) | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
| @ -156,6 +163,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-py3-clang7-onnx) | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
| @ -163,6 +171,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-xenial-py3-clang5-android-ndk-r19c) | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
| @ -181,6 +190,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-bionic-py3.6-clang9) | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
| @ -188,6 +198,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     VULKAN_SDK_VERSION=1.2.162.1 | ||||
|     SWIFTSHADER=yes | ||||
|     ;; | ||||
| @ -198,6 +209,7 @@ case "$image" in | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9) | ||||
|     CUDA_VERSION=10.2 | ||||
| @ -207,6 +219,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9) | ||||
|     CUDA_VERSION=10.2 | ||||
| @ -216,6 +229,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7) | ||||
|     CUDA_VERSION=10.2 | ||||
| @ -225,6 +239,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ;; | ||||
|   pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9) | ||||
|     CUDA_VERSION=11.0 | ||||
| @ -234,6 +249,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ROCM_VERSION=3.9 | ||||
|     ;; | ||||
|   pytorch-linux-bionic-rocm4.0.1-py3.6) | ||||
| @ -242,6 +258,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ROCM_VERSION=4.0.1 | ||||
|     ;; | ||||
|   pytorch-linux-bionic-rocm4.1-py3.6) | ||||
| @ -250,6 +267,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ROCM_VERSION=4.1 | ||||
|     ;; | ||||
|   pytorch-linux-bionic-rocm4.2-py3.6) | ||||
| @ -258,6 +276,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     ROCM_VERSION=4.2 | ||||
|     ;; | ||||
|   *) | ||||
| @ -265,6 +284,7 @@ case "$image" in | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     BREAKPAD=yes | ||||
|     echo "image '$image' did not match an existing build configuration" | ||||
|     if [[ "$image" == *py* ]]; then | ||||
|       extract_version_from_image_name py ANACONDA_PYTHON_VERSION | ||||
|  | ||||
| @ -2,12 +2,18 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| git clone https://github.com/google/breakpad.git | ||||
| cd breakpad | ||||
| git clone https://github.com/malfet/breakpad.git -b pytorch/release-1.9 | ||||
| pushd breakpad | ||||
|  | ||||
| git clone https://chromium.googlesource.com/linux-syscall-support src/third_party/lss | ||||
| pushd src/third_party/lss | ||||
| # same as with breakpad, there are no real releases for this repo so use a | ||||
| # commit as the pin | ||||
| git checkout e1e7b0ad8ee99a875b272c8e33e308472e897660 | ||||
| popd | ||||
|  | ||||
| ./configure | ||||
| make | ||||
| make install | ||||
| cd .. | ||||
| popd | ||||
| rm -rf breakpad | ||||
|  | ||||
| @ -61,6 +61,10 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi | ||||
| RUN rm install_vision.sh | ||||
| ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| ENV OPENSSL_ROOT_DIR /opt/openssl | ||||
| RUN bash ./install_openssl.sh | ||||
|  | ||||
| # Install ccache/sccache (do this last, so we get priority in PATH) | ||||
| ADD ./common/install_cache.sh install_cache.sh | ||||
| ENV PATH /opt/cache/bin:$PATH | ||||
| @ -93,9 +97,5 @@ ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" | ||||
| # Install LLVM dev version (Defined in the pytorch/builder github repository) | ||||
| COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm | ||||
|  | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| ENV OPENSSL_ROOT_DIR /opt/openssl | ||||
| RUN bash ./install_openssl.sh | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
|  | ||||
| @ -113,6 +113,10 @@ ADD ./common/install_ninja.sh install_ninja.sh | ||||
| RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi | ||||
| RUN rm install_ninja.sh | ||||
|  | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| RUN bash ./install_openssl.sh | ||||
| ENV OPENSSL_ROOT_DIR /opt/openssl | ||||
|  | ||||
| # Install ccache/sccache (do this last, so we get priority in PATH) | ||||
| ADD ./common/install_cache.sh install_cache.sh | ||||
| ENV PATH /opt/cache/bin:$PATH | ||||
| @ -130,9 +134,5 @@ ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} | ||||
| # Install LLVM dev version (Defined in the pytorch/builder github repository) | ||||
| COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm | ||||
|  | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| RUN bash ./install_openssl.sh | ||||
| ENV OPENSSL_ROOT_DIR /opt/openssl | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
|  | ||||
| @ -62,6 +62,7 @@ popd | ||||
|  | ||||
| # Clone the Builder master repo | ||||
| retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" | ||||
| git checkout release/1.9 | ||||
| pushd "$BUILDER_ROOT" | ||||
| echo "Using builder from " | ||||
| git --no-pager log --max-count 1 | ||||
|  | ||||
| @ -9,10 +9,6 @@ python_nodot="\$(echo $DESIRED_PYTHON | tr -d m.u)" | ||||
|  | ||||
| # Set up Python | ||||
| if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
|   # There was a bug that was introduced in conda-package-handling >= 1.6.1 that makes archives | ||||
|   # above a certain size fail out when attempting to extract | ||||
|   # see: https://github.com/conda/conda-package-handling/issues/71 | ||||
|   conda install -y conda-package-handling=1.6.0 | ||||
|   retry conda create -qyn testenv python="$DESIRED_PYTHON" | ||||
|   source activate testenv >/dev/null | ||||
| elif [[ "$PACKAGE_TYPE" != libtorch ]]; then | ||||
| @ -38,6 +34,10 @@ if [[ "$DESIRED_CUDA" == "cu112" ]]; then | ||||
|   EXTRA_CONDA_FLAGS="-c=conda-forge" | ||||
| fi | ||||
|  | ||||
| # Move debug wheels out of the the package dir so they don't get installed | ||||
| mkdir -p /tmp/debug_final_pkgs | ||||
| mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move" | ||||
|  | ||||
| # Install the package | ||||
| # These network calls should not have 'retry's because they are installing | ||||
| # locally and aren't actually network calls | ||||
|  | ||||
| @ -68,6 +68,18 @@ if [[ -z "$DOCKER_IMAGE" ]]; then | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| USE_GOLD_LINKER="OFF" | ||||
| # GOLD linker can not be used if CUPTI is statically linked into PyTorch, see https://github.com/pytorch/pytorch/issues/57744 | ||||
| if [[ ${DESIRED_CUDA} == "cpu" ]]; then | ||||
|   USE_GOLD_LINKER="ON" | ||||
| fi | ||||
|  | ||||
| USE_WHOLE_CUDNN="OFF" | ||||
| # Link whole cuDNN for CUDA-11.1 to include fp16 fast kernels | ||||
| if [[  "$(uname)" == "Linux" && "${DESIRED_CUDA}" == "cu111" ]]; then | ||||
|   USE_WHOLE_CUDNN="ON" | ||||
| fi | ||||
|  | ||||
| # Default to nightly, since that's where this normally uploads to | ||||
| PIP_UPLOAD_FOLDER='nightly/' | ||||
| # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it | ||||
| @ -169,7 +181,9 @@ export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}" | ||||
| export CIRCLE_BRANCH="$CIRCLE_BRANCH" | ||||
| export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID" | ||||
|  | ||||
| export USE_GOLD_LINKER=1 | ||||
| export USE_GOLD_LINKER="${USE_GOLD_LINKER}" | ||||
| export USE_GLOO_WITH_OPENSSL="ON" | ||||
| export USE_WHOLE_CUDNN="${USE_WHOLE_CUDNN}" | ||||
| # =================== The above code will be executed inside Docker container =================== | ||||
| EOL | ||||
|  | ||||
|  | ||||
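The two new switches above gate linker and cuDNN options per build flavor: the gold linker is only enabled for CPU-only wheels because statically linked CUPTI breaks it (pytorch/pytorch#57744), and whole-archive cuDNN linking is only turned on for Linux CUDA 11.1 builds to include the fp16 fast kernels. A small Python sketch of the same selection logic, purely illustrative and not part of the CI scripts:

```python
import platform

def binary_build_flags(desired_cuda: str) -> dict:
    # Mirrors the bash above: gold linker only for CPU builds,
    # whole-archive cuDNN only for Linux cu111.
    return {
        "USE_GOLD_LINKER": "ON" if desired_cuda == "cpu" else "OFF",
        "USE_WHOLE_CUDNN": "ON"
        if platform.system() == "Linux" and desired_cuda == "cu111"
        else "OFF",
    }

print(binary_build_flags("cpu"))    # {'USE_GOLD_LINKER': 'ON', 'USE_WHOLE_CUDNN': 'OFF'}
print(binary_build_flags("cu111"))  # on Linux: {'USE_GOLD_LINKER': 'OFF', 'USE_WHOLE_CUDNN': 'ON'}
```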
| @ -50,43 +50,50 @@ else | ||||
| fi | ||||
|  | ||||
| add_to_env_file() { | ||||
|   local content | ||||
|   content=$1 | ||||
|   # BASH_ENV should be set by CircleCI | ||||
|   echo "${content}" >> "${BASH_ENV:-/tmp/env}" | ||||
|   local name=$1 | ||||
|   local value=$2 | ||||
|   case "$value" in | ||||
|     *\ *) | ||||
|       # BASH_ENV should be set by CircleCI | ||||
|       echo "${name}='${value}'" >> "${BASH_ENV:-/tmp/env}" | ||||
|       ;; | ||||
|     *) | ||||
|       echo "${name}=${value}" >> "${BASH_ENV:-/tmp/env}" | ||||
|       ;; | ||||
|   esac | ||||
| } | ||||
|  | ||||
| add_to_env_file "IN_CI=1" | ||||
| add_to_env_file "COMMIT_SOURCE=${CIRCLE_BRANCH:-}" | ||||
| add_to_env_file "BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" | ||||
| add_to_env_file "CIRCLE_PULL_REQUEST=${CIRCLE_PULL_REQUEST}" | ||||
| add_to_env_file IN_CI 1 | ||||
| add_to_env_file COMMIT_SOURCE "${CIRCLE_BRANCH:-}" | ||||
| add_to_env_file BUILD_ENVIRONMENT "${BUILD_ENVIRONMENT}" | ||||
| add_to_env_file CIRCLE_PULL_REQUEST "${CIRCLE_PULL_REQUEST}" | ||||
|  | ||||
|  | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then | ||||
|   add_to_env_file "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" | ||||
|   add_to_env_file SCCACHE_BUCKET ossci-compiler-cache-circleci-v2 | ||||
|  | ||||
|   SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) | ||||
|   MEMORY_LIMIT_MAX_JOBS=8  # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM | ||||
|   MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) | ||||
|   add_to_env_file "MAX_JOBS=${MAX_JOBS}" | ||||
|   add_to_env_file MAX_JOBS "${MAX_JOBS}" | ||||
|  | ||||
|   if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then | ||||
|     add_to_env_file "TORCH_CUDA_ARCH_LIST=5.2" | ||||
|     add_to_env_file TORCH_CUDA_ARCH_LIST 5.2 | ||||
|   fi | ||||
|  | ||||
|   if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then | ||||
|     # This IAM user allows write access to S3 bucket for sccache & bazels3cache | ||||
|     set +x | ||||
|     add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" | ||||
|     add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" | ||||
|     add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" | ||||
|     add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" | ||||
|     add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" | ||||
|     add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" | ||||
|     set -x | ||||
|   else | ||||
|     # This IAM user allows write access to S3 bucket for sccache | ||||
|     set +x | ||||
|     add_to_env_file "XLA_CLANG_CACHE_S3_BUCKET_NAME=${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" | ||||
|     add_to_env_file "AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" | ||||
|     add_to_env_file "AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" | ||||
|     add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" | ||||
|     add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" | ||||
|     add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" | ||||
|     set -x | ||||
|   fi | ||||
| fi | ||||
|  | ||||
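The refactored `add_to_env_file` now takes a name and a value instead of a preformatted `NAME=value` string, and single-quotes the value only when it contains spaces, so entries survive being sourced from the CI env file. A Python sketch of that quoting rule (illustrative only; the actual implementation is the bash function above):

```python
def env_file_line(name: str, value: str) -> str:
    # Values with spaces get single-quoted, others are written verbatim.
    if " " in value:
        return f"{name}='{value}'"
    return f"{name}={value}"

print(env_file_line("MAX_JOBS", "8"))                      # MAX_JOBS=8
print(env_file_line("COMMIT_SOURCE", "some branch name"))  # COMMIT_SOURCE='some branch name'
```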
| @ -32,7 +32,7 @@ pytorch_android_params: &pytorch_android_params | ||||
|       default: "" | ||||
|     lite_interpreter: | ||||
|       type: string | ||||
|       default: "0" | ||||
|       default: "1" | ||||
|   environment: | ||||
|     BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single | ||||
|     DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" | ||||
| @ -59,7 +59,7 @@ pytorch_ios_params: &pytorch_ios_params | ||||
|       default: "0" | ||||
|     lite_interpreter: | ||||
|       type: string | ||||
|       default: "0" | ||||
|       default: "1" | ||||
|   environment: | ||||
|     BUILD_ENVIRONMENT: << parameters.build_environment >> | ||||
|     IOS_ARCH: << parameters.ios_arch >> | ||||
|  | ||||
| @ -528,8 +528,8 @@ | ||||
|           no_output_timeout: "30m" | ||||
|           command: | | ||||
|             set -e | ||||
|             if [ ${BUILD_LITE_INTERPRETER} == 1 ]; then | ||||
|               echo "Run Build Test is not for BUILD_LITE_INTERPRETER, skipping." | ||||
|             if [ ${BUILD_LITE_INTERPRETER} == 0 ]; then | ||||
|               echo "Run Build Test is not for full jit, skipping." | ||||
|               exit 0 | ||||
|             fi | ||||
|             PROJ_ROOT=/Users/distiller/project | ||||
| @ -557,8 +557,8 @@ | ||||
|             if [ ${IOS_PLATFORM} != "SIMULATOR" ]; then | ||||
|               echo "not SIMULATOR build, skip it." | ||||
|               exit 0 | ||||
|             elif [ ${BUILD_LITE_INTERPRETER} == 1 ]; then | ||||
|               echo "Run Simulator Tests is not for BUILD_LITE_INTERPRETER, skipping." | ||||
|             elif [ ${BUILD_LITE_INTERPRETER} == 0 ]; then | ||||
|               echo "Run Simulator Tests is not for full jit, skipping." | ||||
|               exit 0 | ||||
|             fi | ||||
|             WORKSPACE=/Users/distiller/workspace | ||||
|  | ||||
| @ -200,7 +200,7 @@ fi | ||||
|  | ||||
| # Patch required to build xla | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then | ||||
|   git clone --recursive https://github.com/pytorch/xla.git | ||||
|   git clone --recursive -b r1.9 https://github.com/pytorch/xla.git | ||||
|   ./xla/scripts/apply_patches.sh | ||||
| fi | ||||
|  | ||||
|  | ||||
| @ -52,9 +52,9 @@ function get_exit_code() { | ||||
| function file_diff_from_base() { | ||||
|   # The fetch may fail on Docker hosts, this fetch is necessary for GHA | ||||
|   set +e | ||||
|   git fetch origin master --quiet | ||||
|   git fetch origin release/1.9 --quiet | ||||
|   set -e | ||||
|   git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" | ||||
|   git diff --name-only "$(git merge-base origin/release/1.9 HEAD)" > "$1" | ||||
| } | ||||
|  | ||||
| function get_bazel() { | ||||
|  | ||||
| @ -28,7 +28,13 @@ fi | ||||
| export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH" | ||||
| # shellcheck disable=SC1091 | ||||
| source "${WORKSPACE_DIR}"/miniconda3/bin/activate | ||||
| retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses pip | ||||
|  | ||||
| # NOTE: mkl 2021.3.0+ cmake requires sub-command PREPEND, may break the build | ||||
| retry conda install -y \ | ||||
|   mkl=2021.2.0 mkl-include=2021.2.0 \ | ||||
|   numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 \ | ||||
|   cmake cffi ninja typing_extensions dataclasses pip | ||||
|  | ||||
| # The torch.hub tests make requests to GitHub. | ||||
| # | ||||
| # The certifi package from conda-forge is new enough to make the | ||||
|  | ||||
| @ -368,7 +368,7 @@ test_backward_compatibility() { | ||||
|   python -m venv venv | ||||
|   # shellcheck disable=SC1091 | ||||
|   . venv/bin/activate | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html | ||||
|   pip show torch | ||||
|   python dump_all_function_schemas.py --filename nightly_schemas.txt | ||||
|   deactivate | ||||
|  | ||||
| @ -194,6 +194,9 @@ cmake_dependent_option( | ||||
| cmake_dependent_option( | ||||
|     USE_STATIC_CUDNN "Use cuDNN static libraries" OFF | ||||
|     "USE_CUDNN" OFF) | ||||
| cmake_dependent_option( | ||||
|   USE_WHOLE_CUDNN "Use whole-library linking for cuDNN" OFF | ||||
|     "USE_STATIC_CUDNN" OFF) | ||||
| option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) | ||||
| option(USE_KINETO "Use Kineto profiling library" ON) | ||||
| option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF) | ||||
| @ -266,6 +269,9 @@ option(USE_ZSTD "Use ZSTD" OFF) | ||||
| cmake_dependent_option( | ||||
|   USE_MKLDNN "Use MKLDNN. Only available on x86, x86_64, and AArch64." "${CPU_INTEL}" | ||||
|   "CPU_INTEL OR CPU_AARCH64" OFF) | ||||
| cmake_dependent_option( | ||||
|   USE_MKLDNN_ACL "Use Compute Library for the Arm architecture." OFF | ||||
|   "USE_MKLDNN AND CPU_AARCH64" OFF) | ||||
| set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) | ||||
| cmake_dependent_option( | ||||
|     USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF | ||||
|  | ||||
| @ -1,5 +1,5 @@ | ||||
| cmake_minimum_required(VERSION 3.4.1) | ||||
| option(BUILD_LITE_INTERPRETER "Master flag to build pytorch_jni_lite" OFF) | ||||
| option(BUILD_LITE_INTERPRETER "Master flag to build pytorch_jni_lite" ON) | ||||
| message( | ||||
|   STATUS | ||||
|   "BUILD_LITE_INTERPRETER (pytorch_jni_lite): ${BUILD_LITE_INTERPRETER}") | ||||
|  | ||||
| @ -17,8 +17,8 @@ android { | ||||
|         } | ||||
|         externalNativeBuild { | ||||
|             cmake { | ||||
|               if(System.env.BUILD_LITE_INTERPRETER == '1') { | ||||
|                 arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=ON" | ||||
|               if(System.env.BUILD_LITE_INTERPRETER == '0') { | ||||
|                 arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF" | ||||
|               } else { | ||||
|                 arguments "-DANDROID_STL=c++_shared" | ||||
|               } | ||||
| @ -37,12 +37,12 @@ android { | ||||
|     sourceSets { | ||||
|         main { | ||||
|             java { | ||||
|               if(System.env.BUILD_LITE_INTERPRETER == '1') { | ||||
|                 println 'Build pytorch_jni_lite' | ||||
|               } else { | ||||
|               if(System.env.BUILD_LITE_INTERPRETER == '0') { | ||||
|                 println 'Build pytorch_jni' | ||||
|                 exclude 'org/pytorch/LiteModuleLoader.java' | ||||
|                 exclude 'org/pytorch/LiteNativePeer.java' | ||||
|               } else { | ||||
|                 println 'Build pytorch_jni_lite' | ||||
|               } | ||||
|             } | ||||
|             jniLibs.srcDirs = ['src/main/jniLibs'] | ||||
|  | ||||
| @ -10,7 +10,7 @@ public final class PyTorchAndroid { | ||||
|     if (!NativeLoader.isInitialized()) { | ||||
|       NativeLoader.init(new SystemDelegate()); | ||||
|     } | ||||
|     NativeLoader.loadLibrary("pytorch_jni"); | ||||
|     NativeLoader.loadLibrary("pytorch_jni_lite"); | ||||
|     PyTorchCodegenLoader.loadNativeLibs(); | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -14,11 +14,11 @@ namespace at { | ||||
| //    OptionalDeviceGuard guard(device_of(tensor)); | ||||
|  | ||||
| /// Return the Device of a Tensor, if the Tensor is defined. | ||||
| inline optional<Device> device_of(const Tensor& t) { | ||||
| inline c10::optional<Device> device_of(const Tensor& t) { | ||||
|   if (t.defined()) { | ||||
|     return make_optional(t.device()); | ||||
|     return c10::make_optional(t.device()); | ||||
|   } else { | ||||
|     return nullopt; | ||||
|     return c10::nullopt; | ||||
|   } | ||||
| } | ||||
|  | ||||
| @ -29,11 +29,11 @@ inline optional<Device> device_of(const optional<Tensor>& t) { | ||||
| /// Return the Device of a TensorList, if the list is non-empty and | ||||
| /// the first Tensor is defined.  (This function implicitly assumes | ||||
| /// that all tensors in the list have the same device.) | ||||
| inline optional<Device> device_of(TensorList t) { | ||||
| inline c10::optional<Device> device_of(TensorList t) { | ||||
|   if (!t.empty()) { | ||||
|     return device_of(t.front()); | ||||
|   } else { | ||||
|     return nullopt; | ||||
|     return c10::nullopt; | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -372,6 +372,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { | ||||
|   KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32) | ||||
|   KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional<int64_t>), fp32) | ||||
|   KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, const Scalar&, int64_t, const Scalar&), fp32) | ||||
|   KERNEL(ADD_NS(grid_sampler), "grid_sampler", Tensor (const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) | ||||
|   // fp32_set_opt_dtype | ||||
|   KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional<ScalarType>), fp32_set_opt_dtype) | ||||
|   KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional<ScalarType>), fp32_set_opt_dtype) | ||||
|  | ||||
| @ -133,7 +133,6 @@ TORCH_API DimnameList get_names(const TensorImpl* impl); | ||||
| // tensor is constructed with names=None. | ||||
| TORCH_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl); | ||||
|  | ||||
|  | ||||
| } // namespace impl | ||||
|  | ||||
| } // namespace at | ||||
|  | ||||
| @ -495,6 +495,7 @@ _(aten, miopen_depthwise_convolution_backward_input) \ | ||||
| _(aten, miopen_depthwise_convolution_backward_weight) \ | ||||
| _(aten, miopen_rnn) \ | ||||
| _(aten, miopen_rnn_backward) \ | ||||
| _(aten, mish) \ | ||||
| _(aten, mkldnn_convolution) \ | ||||
| _(aten, mkldnn_convolution_backward) \ | ||||
| _(aten, mkldnn_convolution_backward_input) \ | ||||
|  | ||||
| @ -479,23 +479,12 @@ public: | ||||
|         vsqrtq_f32(values.val[1])); | ||||
|   } | ||||
|   Vec256<float> reciprocal() const { | ||||
|     float32x4_t r0 = vrecpeq_f32(values.val[0]); | ||||
|     float32x4_t r1 = vrecpeq_f32(values.val[1]); | ||||
|     // Run two more Netwon's method iterations to get more accurate results | ||||
|     r0 = vmulq_f32(vrecpsq_f32(values.val[0], r0), r0); | ||||
|     r0 = vmulq_f32(vrecpsq_f32(values.val[0], r0), r0); | ||||
|     r1 = vmulq_f32(vrecpsq_f32(values.val[1], r1), r1); | ||||
|     r1 = vmulq_f32(vrecpsq_f32(values.val[1], r1), r1); | ||||
|     auto r0 = vdivq_f32(vdupq_n_f32(1.0f), values.val[0]); | ||||
|     auto r1 = vdivq_f32(vdupq_n_f32(1.0f), values.val[1]); | ||||
|     return Vec256<float>(r0, r1); | ||||
|   } | ||||
|   Vec256<float> rsqrt() const { | ||||
|     float32x4_t r0 =  vrsqrteq_f32(values.val[0]); | ||||
|     float32x4_t r1 =  vrsqrteq_f32(values.val[1]); | ||||
|     r0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(values.val[0], r0), r0), r0); | ||||
|     r0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(values.val[0], r0), r0), r0); | ||||
|     r1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(values.val[1], r1), r1), r1); | ||||
|     r1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(values.val[1], r1), r1), r1); | ||||
|     return Vec256<float>(r0, r1); | ||||
|     return this->sqrt().reciprocal(); | ||||
|   } | ||||
|   Vec256<float> pow(const Vec256<float> &exp) const { | ||||
|     __at_align32__ float tmp[size()]; | ||||
|  | ||||
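For context on the NEON change above: `vrecpeq_f32`/`vrsqrteq_f32` only produce rough estimates, and `vrecpsq_f32(x, r)` computes `2 - x*r`, so each `r = r * (2 - x*r)` line was one Newton-Raphson refinement of `1/x`; the replacement simply divides exactly (and implements `rsqrt` as `sqrt().reciprocal()`). A small Python illustration of that refinement, with a made-up initial estimate standing in for the hardware one:

```python
def nr_reciprocal(x: float, r0: float, steps: int = 2) -> float:
    # Each step r <- r * (2 - x*r) roughly doubles the correct bits of 1/x.
    r = r0
    for _ in range(steps):
        r = r * (2.0 - x * r)
    return r

x = 3.0
approx = nr_reciprocal(x, r0=0.3, steps=2)  # crude initial estimate, two refinements
print(approx, 1.0 / x)                      # ~0.3333 vs exact 0.333...
```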
| @ -43,6 +43,8 @@ namespace at { | ||||
| namespace cuda { | ||||
| namespace detail { | ||||
|  | ||||
| std::function<void(void)> THCMagma_init; | ||||
|  | ||||
| // NB: deleter is dynamic, because we need it to live in a separate | ||||
| // compilation unit (alt is to have another method in hooks, but | ||||
| // let's not if we don't need to!) | ||||
| @ -51,9 +53,8 @@ std::unique_ptr<THCState, void (*)(THCState*)> CUDAHooks::initCUDA() const { | ||||
|   THCState* thc_state = THCState_alloc(); | ||||
|  | ||||
|   THCudaInit(thc_state); | ||||
| #ifdef USE_MAGMA | ||||
|   THCMagma_init(thc_state); | ||||
| #endif | ||||
|   if (THCMagma_init) | ||||
|     THCMagma_init(); | ||||
|   return std::unique_ptr<THCState, void (*)(THCState*)>( | ||||
|       thc_state, [](THCState* p) { | ||||
|         if (p) | ||||
|  | ||||
| @ -8,6 +8,9 @@ | ||||
|  | ||||
| namespace at { namespace cuda { namespace detail { | ||||
|  | ||||
| // Callback to initialize THC Magma, which is implemented in torch_cuda_cu | ||||
| TORCH_CUDA_CPP_API extern std::function<void()> THCMagma_init; | ||||
|  | ||||
| // The real implementation of CUDAHooksInterface | ||||
| struct CUDAHooks : public at::CUDAHooksInterface { | ||||
|   CUDAHooks(at::CUDAHooksArgs) {} | ||||
|  | ||||
| @ -54,6 +54,10 @@ TORCH_META_FUNC(silu) (const Tensor& self) { | ||||
|   build_unary_op(maybe_get_output(), self); | ||||
| } | ||||
|  | ||||
| TORCH_META_FUNC(mish) (const Tensor& self) { | ||||
|   build_unary_op(maybe_get_output(), self); | ||||
| } | ||||
|  | ||||
| TORCH_META_FUNC(softplus) ( | ||||
|   const Tensor& self, const Scalar& beta, const Scalar& threshold | ||||
| ) { | ||||
| @ -127,6 +131,10 @@ DEFINE_DISPATCH(leaky_relu_backward_stub); | ||||
| DEFINE_DISPATCH(silu_stub); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| DEFINE_DISPATCH(silu_backward_stub); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| DEFINE_DISPATCH(mish_stub); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| DEFINE_DISPATCH(mish_backward_stub); | ||||
|  | ||||
| TORCH_IMPL_FUNC(elu_out) ( | ||||
|   const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result | ||||
| @ -140,6 +148,12 @@ TORCH_IMPL_FUNC(silu_out) ( | ||||
|   silu_stub(device_type(), *this); | ||||
| } | ||||
|  | ||||
| TORCH_IMPL_FUNC(mish_out) ( | ||||
|   const Tensor& self, const Tensor& result | ||||
| ) { | ||||
|   mish_stub(device_type(), *this); | ||||
| } | ||||
|  | ||||
| TORCH_IMPL_FUNC(softplus_out) ( | ||||
|   const Tensor& self, const Scalar& beta, const Scalar& threshold, const Tensor& result | ||||
| ) { | ||||
| @ -306,6 +320,23 @@ Tensor math_silu_backward( | ||||
|   return grad_output * (input_sigmoid * (1 + input * (1 - input_sigmoid))); | ||||
| } | ||||
|  | ||||
| Tensor mish_backward( | ||||
|     const Tensor& grad_output, | ||||
|     const Tensor& input) { | ||||
|   Tensor grad_input = at::empty({0}, input.options()); | ||||
|   auto iter = TensorIterator::binary_op(grad_input, grad_output, input); | ||||
|   mish_backward_stub(iter.device_type(), iter); | ||||
|   return grad_input; | ||||
| } | ||||
|  | ||||
| Tensor math_mish_backward( | ||||
|     const Tensor& grad_output, | ||||
|     const Tensor& input) { | ||||
|   auto input_tanh_softplus = at::tanh(at::softplus(input)); | ||||
|   auto input_sigmoid = at::sigmoid(input); | ||||
|   return grad_output * (input_tanh_softplus + (input * input_sigmoid * (1 - input_tanh_softplus * input_tanh_softplus))); | ||||
| } | ||||
|  | ||||
| template <typename scalar_t> | ||||
| inline void _rrelu_with_noise_train( | ||||
|     Tensor& output, | ||||
|  | ||||
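As a plain-Python reference for the formulas used in `mish_backward`/`math_mish_backward` above (mish(x) = x * tanh(softplus(x)), with softplus(x) = log1p(exp(x))); this is only a numerical illustration, not the PyTorch implementation:

```python
import math

def mish(x: float) -> float:
    # mish(x) = x * tanh(softplus(x)), softplus(x) = log(1 + exp(x))
    return x * math.tanh(math.log1p(math.exp(x)))

def mish_grad(dy: float, x: float) -> float:
    # Matches math_mish_backward above:
    # d/dx mish(x) = tanh(softplus(x)) + x * sigmoid(x) * (1 - tanh(softplus(x))**2)
    s = 1.0 / (1.0 + math.exp(-x))
    t = math.tanh(math.log1p(math.exp(x)))
    return dy * (t + x * s * (1.0 - t * t))

print(mish(1.0))            # ~0.8651
print(mish_grad(1.0, 1.0))  # ~1.049
```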
| @ -54,6 +54,8 @@ DECLARE_DISPATCH(activation_fn, glu_stub); | ||||
| DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub); | ||||
| DECLARE_DISPATCH(structured_activation_fn, silu_stub); | ||||
| DECLARE_DISPATCH(activation_backward_fn, silu_backward_stub); | ||||
| DECLARE_DISPATCH(structured_activation_fn, mish_stub); | ||||
| DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub); | ||||
|  | ||||
| } // namespace native | ||||
|  | ||||
|  | ||||
| @ -66,14 +66,15 @@ bool ceil_mode) { | ||||
|     outputHeight, outputWidth, memory_format); | ||||
|  | ||||
|   /* resize output and indices */ | ||||
|   DimnameList maybe_names = input.has_names() ? input.names() : DimnameList{}; | ||||
|   if (input.ndimension() == 3) { | ||||
|     set_output(0, {nInputPlane, outputHeight, outputWidth}, {}, input.options().memory_format(memory_format), input.names()); | ||||
|     set_output(0, {nInputPlane, outputHeight, outputWidth}, {}, input.options().memory_format(memory_format), maybe_names); | ||||
|     /* indices will contain the locations for each output point */ | ||||
|     set_output(1, {nInputPlane, outputHeight, outputWidth}, {}, input.options().dtype(kLong), input.names()); | ||||
|     set_output(1, {nInputPlane, outputHeight, outputWidth}, {}, input.options().dtype(kLong), maybe_names); | ||||
|   } else { | ||||
|     set_output(0, {nbatch, nInputPlane, outputHeight, outputWidth}, {}, input.options().memory_format(memory_format), input.names()); | ||||
|     set_output(0, {nbatch, nInputPlane, outputHeight, outputWidth}, {}, input.options().memory_format(memory_format), maybe_names); | ||||
|     /* indices will contain the locations for each output point */ | ||||
|     set_output(1, {nbatch, nInputPlane, outputHeight, outputWidth}, {}, input.options().dtype(kLong), input.names()); | ||||
|     set_output(1, {nbatch, nInputPlane, outputHeight, outputWidth}, {}, input.options().dtype(kLong), maybe_names); | ||||
|   } | ||||
| } | ||||
|  | ||||
| @ -152,7 +153,8 @@ const Tensor& indices) { | ||||
|     outputHeight_for_shape_check, outputWidth_for_shape_check, | ||||
|     memory_format); | ||||
|  | ||||
|   set_output(0, input.sizes(), {}, input.options().memory_format(memory_format), input.names()); | ||||
|   set_output(0, input.sizes(), {}, input.options().memory_format(memory_format), | ||||
|              input.has_names() ? input.names() : DimnameList{}); | ||||
| } | ||||
| } // namespace meta | ||||
|  | ||||
|  | ||||
| @ -30,6 +30,9 @@ namespace meta { | ||||
| TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { | ||||
|   TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); | ||||
|   TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); | ||||
|   TORCH_CHECK( | ||||
|       mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", | ||||
|       mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); | ||||
|  | ||||
|   auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); | ||||
|   set_output(0, IntArrayRef({mat1.sizes()[0], mat2.sizes()[1]}), {}, self.options(), names); | ||||
| @ -932,12 +935,6 @@ static void addmm_impl_cpu_( | ||||
|   auto m2_strides = m2.strides(); | ||||
|   auto m2_sizes = m2.sizes(); | ||||
|  | ||||
|   // keeping TORCH_CHECKs here because othe mm methods also utilize this impl. | ||||
|   // TODO move this to meta once all methods have migrated to structured kernel. | ||||
|   TORCH_CHECK( | ||||
|       m1_sizes[1] == m2_sizes[0], "mat1 and mat2 shapes cannot be multiplied (", | ||||
|       m1_sizes[0], "x", m1_sizes[1], " and ", m2_sizes[0], "x", m2_sizes[1], ")"); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       self_sizes[0] == m1_sizes[0] && self_sizes[1] == m2_sizes[1], | ||||
|       "input shape is incompatible with matrix multiplication (", | ||||
| @ -1111,6 +1108,9 @@ TORCH_IMPL_FUNC(addmm_out_cpu)(const Tensor& self, const Tensor& mat1, const Ten | ||||
| Tensor& mm_cpu_out(const Tensor & self, const Tensor & mat2, Tensor & result) { | ||||
|   TORCH_CHECK(self.dim() == 2, "self must be a matrix"); | ||||
|   TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); | ||||
|   TORCH_CHECK( | ||||
|       self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", | ||||
|       self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); | ||||
|   native::resize_(result, {self.sizes()[0], mat2.sizes()[1]}); | ||||
|   addmm_impl_cpu_(result, result, self, mat2, 0, 1); | ||||
|   auto names = at::namedinference::propagate_names_for_addmm(self, mat2, result); | ||||
|  | ||||
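The hunk moves the inner-dimension check into the structured meta function (and adds it to `mm_cpu_out`), so shape errors surface before any output allocation. A tiny sketch of the check and its error format, written in Python for illustration:

```python
def check_mm_shapes(mat1_shape, mat2_shape):
    # mat1 is (a, b), mat2 is (c, d); the inner dimensions b and c must agree.
    (a, b), (c, d) = mat1_shape, mat2_shape
    if b != c:
        raise ValueError(
            f"mat1 and mat2 shapes cannot be multiplied ({a}x{b} and {c}x{d})")

check_mm_shapes((2, 3), (3, 4))    # ok
# check_mm_shapes((2, 3), (4, 5))  # ValueError: ... (2x3 and 4x5)
```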
| @ -47,12 +47,6 @@ bool cudnn_is_acceptable(const Tensor& self) { | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| Tensor detach(const Tensor& self) { | ||||
|   // this just exists to give us a hook in VariableType and an entry in Declarations.yaml | ||||
|   //AT_ERROR("detach is not implemented for Tensor"); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor & detach_(Tensor & self) { | ||||
|   // this just exists to give us a hook in VariableType and an entry in Declarations.yaml | ||||
|   //AT_ERROR("detach_ is not implemented for Tensor"); | ||||
|  | ||||
| @ -2170,6 +2170,12 @@ Tensor alias(const Tensor& self) { | ||||
|     return alias_with_sizes_and_strides(self, self.sizes(), self.strides()); | ||||
| } | ||||
|  | ||||
| Tensor detach(const Tensor& self) { | ||||
|   // this just exists to give us a hook in VariableType and an entry in Declarations.yaml | ||||
|   //AT_ERROR("detach is not implemented for Tensor"); | ||||
|   return native::alias(self); | ||||
| } | ||||
|  | ||||
| Tensor unfold(const Tensor& self, int64_t dimension, int64_t size, int64_t step) { | ||||
|   // some special handling to deal with allow dimension == 0 when self.dim() == 0 | ||||
|   dimension = at::maybe_wrap_dim(dimension, self.dim(), /*wrap_scalar=*/true); | ||||
|  | ||||
| @ -632,6 +632,40 @@ void silu_backward_kernel(TensorIterator& iter) { | ||||
|       }); | ||||
| } | ||||
|  | ||||
| void mish_kernel(TensorIteratorBase& iter) { | ||||
|   AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_cpu", [&]() { | ||||
|         using Vec = Vec256<scalar_t>; | ||||
|         cpu_kernel_vec( | ||||
|             iter, | ||||
|             [](scalar_t x) -> scalar_t{ | ||||
|               return static_cast<scalar_t>(x * std::tanh(std::log1p(std::exp(x)))); | ||||
|             }, | ||||
|             [](Vec x_vec) -> Vec { | ||||
|               return x_vec * x_vec.exp().log1p().tanh(); | ||||
|             }); | ||||
|       }); | ||||
| } | ||||
|  | ||||
| void mish_backward_kernel(TensorIterator& iter) { | ||||
|   AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_backward_cpu", [&]() { | ||||
|         using Vec = Vec256<scalar_t>; | ||||
|         const Vec kOneVec(scalar_t(1)); | ||||
|         cpu_kernel_vec( | ||||
|             iter, | ||||
|             [](scalar_t dy, scalar_t x) -> scalar_t { | ||||
|               const scalar_t sigmoid = | ||||
|                   scalar_t(1) / (scalar_t(1) + std::exp(-x)); | ||||
|               const scalar_t tanh_softplus = std::tanh(std::log1p(std::exp(x))); | ||||
|               return dy * (tanh_softplus + x * sigmoid * (scalar_t(1) - tanh_softplus * tanh_softplus)); | ||||
|             }, | ||||
|             [kOneVec](Vec dy_vec, Vec x_vec) -> Vec { | ||||
|               const Vec sigmoid = kOneVec / (kOneVec + x_vec.neg().exp()); | ||||
|               const Vec tanh_softplus = x_vec.exp().log1p().tanh(); | ||||
|               return dy_vec * (tanh_softplus + x_vec * sigmoid * (kOneVec - tanh_softplus * tanh_softplus)); | ||||
|             }); | ||||
|       }); | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| @ -680,6 +714,10 @@ REGISTER_DISPATCH(glu_backward_stub, &glu_backward_kernel); | ||||
| REGISTER_DISPATCH(silu_stub, &silu_kernel); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| REGISTER_DISPATCH(silu_backward_stub, &silu_backward_kernel); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| REGISTER_DISPATCH(mish_stub, &mish_kernel); | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| REGISTER_DISPATCH(mish_backward_stub, &mish_backward_kernel); | ||||
|  | ||||
| } // namespace native | ||||
| } // namespace at | ||||
|  | ||||
| @ -496,6 +496,45 @@ void silu_backward_kernel(TensorIterator& iter) { | ||||
|       }); | ||||
| } | ||||
|  | ||||
| void mish_kernel(TensorIteratorBase& iter) { | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND2( | ||||
|       at::ScalarType::Half, | ||||
|       at::ScalarType::BFloat16, | ||||
|       iter.dtype(), | ||||
|       "mish_cuda", | ||||
|       [&]() { | ||||
|         gpu_kernel( | ||||
|             iter, | ||||
|             [] GPU_LAMBDA(scalar_t x) -> scalar_t { | ||||
|           using T_ACC = acc_type<scalar_t, true>; | ||||
|           const T_ACC x_acc = static_cast<T_ACC>(x); | ||||
|           return x_acc * c10::cuda::compat::tanh(c10::cuda::compat::log1p(c10::cuda::compat::exp(x_acc))); | ||||
|       }); | ||||
|       }); | ||||
| } | ||||
|  | ||||
| void mish_backward_kernel(TensorIterator& iter) { | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND2( | ||||
|       at::ScalarType::Half, | ||||
|       at::ScalarType::BFloat16, | ||||
|       iter.dtype(), | ||||
|       "mish_backward_cuda", | ||||
|       [&]() { | ||||
|         gpu_kernel( | ||||
|             iter, | ||||
|             [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { | ||||
|           using T_ACC = acc_type<scalar_t, true>; | ||||
|           const T_ACC dy_acc = static_cast<T_ACC>(dy); | ||||
|           const T_ACC x_acc = static_cast<T_ACC>(x); | ||||
|           const T_ACC s_acc = | ||||
|               T_ACC(1) / (T_ACC(1) + c10::cuda::compat::exp(-x_acc)); | ||||
|           const T_ACC t_acc = | ||||
|               c10::cuda::compat::tanh(c10::cuda::compat::log1p(c10::cuda::compat::exp(x_acc))); | ||||
|           return dy_acc * (t_acc + x_acc * s_acc * (T_ACC(1) - t_acc * t_acc)); | ||||
|       }); | ||||
|       }); | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| Tensor gelu_cuda(const Tensor& self) { | ||||
| @ -540,6 +579,8 @@ REGISTER_DISPATCH(softplus_stub, &softplus_kernel); | ||||
| REGISTER_DISPATCH(softplus_backward_stub, &softplus_backward_kernel); | ||||
| REGISTER_DISPATCH(silu_stub, &silu_kernel); | ||||
| REGISTER_DISPATCH(silu_backward_stub, &silu_backward_kernel); | ||||
| REGISTER_DISPATCH(mish_stub, &mish_kernel); | ||||
| REGISTER_DISPATCH(mish_backward_stub, &mish_backward_kernel); | ||||
| REGISTER_DISPATCH(threshold_stub, &threshold_kernel_cuda); | ||||
|  | ||||
| } // namespace native | ||||
|  | ||||
| @ -75,7 +75,11 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma | ||||
|   // Make sure to keep addmm_cuda below in sync with this code; it | ||||
|   // preflights a check to try to avoid actually needing to call | ||||
|   // expand(). | ||||
|   TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); | ||||
|   TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); | ||||
|   TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); | ||||
|   TORCH_CHECK( | ||||
|       mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", | ||||
|       mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); | ||||
|  | ||||
|   TensorArg args[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}}; | ||||
|   checkAllSameGPU("addmm", args); | ||||
|  | ||||
| @ -24,20 +24,27 @@ struct TensorDims { | ||||
|   index_t sizes[MAX_DIMS]; | ||||
| }; | ||||
|  | ||||
| template<typename index_t> | ||||
| __global__ void write_indices(int64_t * inp, TensorDims<index_t> dims, int ndim, index_t n){ | ||||
|     CUDA_KERNEL_LOOP(index, n) { // this assumed int (not int64_t) index | ||||
|       index_t div = 1; | ||||
|       int64_t idx_flat = inp[index]; | ||||
|       for (int dim = ndim-1; dim >= 0; dim--){ | ||||
|         auto dim_size = dims.sizes[dim]; | ||||
|         inp[index + dim*n] = (idx_flat/div) % dim_size; | ||||
|         div *= dim_size; | ||||
|       } | ||||
| template <typename index_t> | ||||
| __global__ void write_indices( | ||||
|     int64_t* inp, | ||||
|     TensorDims<index_t> dims, | ||||
|     int ndim, | ||||
|     index_t n) { | ||||
|   auto index = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|   if (index < n) { | ||||
|     index_t div = 1; | ||||
|     int64_t idx_flat = inp[index]; | ||||
| #pragma unroll | ||||
|     for (int dim = MAX_DIMS; dim >= 0; dim--) { | ||||
|       if (dim > ndim - 1) | ||||
|         continue; | ||||
|       auto dim_size = dims.sizes[dim]; | ||||
|       inp[index + dim * n] = (idx_flat / div) % dim_size; | ||||
|       div *= dim_size; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| } //anonymous namespace | ||||
|  | ||||
| template<typename scalar_t> | ||||
|  | ||||
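The rewritten `write_indices` kernel performs the same flat-index to per-dimension decomposition as before, just with an explicit bounds check and a loop unrolled over `MAX_DIMS` instead of `CUDA_KERNEL_LOOP`, whose loop index was a 32-bit `int` (as the removed comment notes). A plain-Python sketch of the decomposition each thread performs:

```python
def unravel(flat_index: int, sizes: list) -> list:
    # Walk dimensions from innermost to outermost, peeling off each coordinate
    # with div/mod, exactly as the kernel writes inp[index + dim * n].
    coords = [0] * len(sizes)
    div = 1
    for dim in range(len(sizes) - 1, -1, -1):
        coords[dim] = (flat_index // div) % sizes[dim]
        div *= sizes[dim]
    return coords

print(unravel(173, [6, 8, 10]))  # [2, 1, 3]
```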
| @ -47,9 +47,9 @@ template <int N> struct alignas(N) OpaqueType { char data[N]; }; | ||||
|  | ||||
| Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& result) { | ||||
|   TORCH_CHECK(n >= 0, "n must be non-negative, got", n); | ||||
|   TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'"); | ||||
|   TORCH_CHECK(n <= std::numeric_limits<int>::max(), | ||||
|     "randperm of tensors larger than INT_MAX is not supported yet in pytorch"); | ||||
|  | ||||
|   check_supported_max_int_with_precision(n, result); | ||||
|  | ||||
|   result.resize_({n}); | ||||
| @ -73,13 +73,15 @@ Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& | ||||
|   const double log_threshold_12 = std::log(0.9) * 12; | ||||
|   double nd = static_cast<double>(n); | ||||
|  | ||||
|   constexpr bool is_reduced_bits = true; | ||||
|   int bits = std::min(64, | ||||
|     static_cast<int>(std::ceil(std::log2(nd - (6 * nd * nd + 1) / log_threshold_12)))); | ||||
|  | ||||
|   if (n == 0) { | ||||
|     return result; | ||||
|   } else if (bits <= 32) { | ||||
|     // For asserting device type match of the generator and result, | ||||
|     // we deligate that to the 'random_' function below. | ||||
|  | ||||
|     auto keys = at::empty(result.sizes(), opt.dtype(kInt)).random_( | ||||
|       std::numeric_limits<int>::min(), std::numeric_limits<int>::max(), generator); | ||||
|     auto keys_tmp = at::empty_like(keys); | ||||
|  | ||||
							
								
								
									
aten/src/ATen/native/cuda/SpectralOps.cpp (new file, 527 lines)
							| @ -0,0 +1,527 @@ | ||||
| #include <ATen/ATen.h> | ||||
| #include <ATen/cuda/CUDAContext.h> | ||||
| #include <ATen/Config.h> | ||||
| #include <ATen/Dispatch.h> | ||||
| #include <ATen/Utils.h> | ||||
| #include <ATen/NativeFunctions.h> | ||||
| #include <ATen/cuda/detail/KernelUtils.h> | ||||
| #include <ATen/cuda/detail/OffsetCalculator.cuh> | ||||
| #include <ATen/detail/CUDAHooksInterface.h> | ||||
| #include <ATen/native/Resize.h> | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| #include <ATen/native/SpectralOpsUtils.h> | ||||
| #include <ATen/native/cuda/CuFFTUtils.h> | ||||
| #include <ATen/native/cuda/CuFFTPlanCache.h> | ||||
|  | ||||
| #include <cufft.h> | ||||
| #include <cufftXt.h> | ||||
|  | ||||
| #include <cmath> | ||||
| #include <vector> | ||||
|  | ||||
|  | ||||
| namespace at { namespace native { | ||||
|  | ||||
| using namespace at::native::detail; | ||||
|  | ||||
| // Execute a pre-planned transform | ||||
| static void exec_cufft_plan( | ||||
|     const CuFFTConfig &config, void* in_data, void* out_data, bool forward) { | ||||
|   auto& plan = config.plan(); | ||||
| #ifdef __HIP_PLATFORM_HCC__ | ||||
|   auto value_type = config.data_type(); | ||||
|   if (value_type == kFloat) { | ||||
|     switch (config.transform_type()) { | ||||
|       case CuFFTTransformType::C2C: { | ||||
|         CUFFT_CHECK(hipfftExecC2C(plan, static_cast<hipfftComplex*>(in_data), | ||||
|                                   static_cast<hipfftComplex*>(out_data), | ||||
|                                   forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::R2C: { | ||||
|         CUFFT_CHECK(hipfftExecR2C(plan, static_cast<hipfftReal*>(in_data), | ||||
|                                   static_cast<hipfftComplex*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::C2R: { | ||||
|         CUFFT_CHECK(hipfftExecC2R(plan, static_cast<hipfftComplex*>(in_data), | ||||
|                                   static_cast<hipfftReal*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|   } else if (value_type == kDouble) { | ||||
|     switch (config.transform_type()) { | ||||
|       case CuFFTTransformType::C2C: { | ||||
|         CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast<hipfftDoubleComplex*>(in_data), | ||||
|                                   static_cast<hipfftDoubleComplex*>(out_data), | ||||
|                                   forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::R2C: { | ||||
|         CUFFT_CHECK(hipfftExecD2Z(plan, static_cast<hipfftDoubleReal*>(in_data), | ||||
|                                   static_cast<hipfftDoubleComplex*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::C2R: { | ||||
|         CUFFT_CHECK(hipfftExecZ2D(plan, static_cast<hipfftDoubleComplex*>(in_data), | ||||
|                                   static_cast<hipfftDoubleReal*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   TORCH_CHECK(false, "hipFFT doesn't support transforms on type: ", value_type); | ||||
| #else | ||||
|   CUFFT_CHECK(cufftXtExec(plan, in_data, out_data, | ||||
|                           forward ? CUFFT_FORWARD : CUFFT_INVERSE)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| // NOTE [ cuFFT Embedded Strides ] | ||||
| // | ||||
| // cuFFT supports a subset of arbitrary strides via their "advanced data layout" | ||||
| // option (http://docs.nvidia.com/cuda/cufft/index.html#advanced-data-layout). | ||||
| // Specifically, these are tensors that can be viewed as subtensors resulting | ||||
| // from slicing a larger contiguous tensor. For such input tensors, let the | ||||
| // sizes of the enclosing tensor be `inembed`; in the 3d case we then have: | ||||
| // | ||||
| //     input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z)] | ||||
| // | ||||
| // Above is the simplified formula ignoring the batch dimension. In fact, the | ||||
| // last dimension of the enclosing tensor doesn't have to be contiguous, i.e., | ||||
| // its stride can be greater than 1. One can then set that base stride on the | ||||
| // enclosing tensor with `istride`, giving | ||||
| // | ||||
| //     input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z) * istride] | ||||
| // | ||||
| // For example, consider | ||||
| // | ||||
| //     enclosing = torch.zeros(6, 8, 10)  # contiguous | ||||
| //     input = enclosing[:4, 2:6, 6:] | ||||
| //     input.size()                       # [ 4,  4,  4] | ||||
| //     input.stride()                     # [80, 10,  1] | ||||
| //     # inembed = [6, 8, 10] | ||||
| //     input[2, 1, 3] = input[((2 * 8) + 1) * 10 + 3]   # using above formula | ||||
| //                    = input[173] | ||||
| //                    = input[2 * 80 + 1 * 10 + 1 * 3]  # using strides directly | ||||
| // | ||||
| // Generally, the embedded strides can be computed as | ||||
| // | ||||
| //     embed[i] = stride[i - 1] / stride[i]. | ||||
| // | ||||
| // Note that the value of embed[0] isn't used to compute indices and doesn't | ||||
| // matter. | ||||
| // | ||||
| // Contrary to advanced data layout, simple layout means that *embeds have | ||||
| // unit strides. In particular, unit stride means that the input and output | ||||
| // tensors are contiguous, and that the stride at the innermost signal | ||||
| // dimension is 1 w.r.t. the corresponding data type. | ||||
|  | ||||
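The note above can be checked directly from Python. The following sketch is not part of this diff; `compute_embed` is a hypothetical helper that just applies the embed[i] = stride[i - 1] / stride[i] relation to the sliced-tensor example from the comment.

    import torch

    def compute_embed(t):
        # embed[i] = stride[i - 1] / stride[i]; embed[0] is never used by the
        # indexing formula, so report the size there instead.
        s = t.stride()
        return [t.size(0) if i == 0 else s[i - 1] // s[i] for i in range(t.dim())]

    enclosing = torch.zeros(6, 8, 10)   # contiguous
    inp = enclosing[:4, 2:6, 6:]        # sizes [4, 4, 4], strides [80, 10, 1]
    print(inp.size(), inp.stride())     # torch.Size([4, 4, 4]) (80, 10, 1)
    print(compute_embed(inp))           # [4, 8, 10]: embed[1:] matches inembed[1:]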
| #pragma push | ||||
| #pragma diag_suppress 177   // Function was declared but never referenced | ||||
| static inline Tensor _run_cufft( | ||||
|     const CuFFTConfig &config, Tensor& input, int64_t signal_ndim, | ||||
|     bool complex_input, bool complex_output, bool inverse, | ||||
|     IntArrayRef checked_signal_sizes, fft_norm_mode norm, bool onesided, | ||||
|     IntArrayRef output_sizes, bool input_was_cloned | ||||
| ) { | ||||
|   if (config.should_clone_input() && !input_was_cloned) { | ||||
|     input = input.clone(at::MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   auto& plan = config.plan(); | ||||
|   auto& ctx = at::globalContext(); | ||||
|  | ||||
|   // set output | ||||
|   auto output = at::empty(output_sizes, input.options()); | ||||
|  | ||||
|   // set to current stream | ||||
|   CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); | ||||
|  | ||||
|   auto ws = at::empty({ config.workspace_size() }, at::device(at::kCUDA).dtype(at::kByte)); | ||||
|   CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); | ||||
|  | ||||
|   // run | ||||
|   exec_cufft_plan(config, input.data_ptr(), output.data_ptr(), !inverse); | ||||
|  | ||||
|   // rescale if requested | ||||
|   auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1]; | ||||
|   if (norm != fft_norm_mode::none) { | ||||
|     auto signal_numel = c10::multiply_integers(checked_signal_sizes); | ||||
|     double scale_denom; | ||||
|     if (norm == fft_norm_mode::by_root_n) { | ||||
|       scale_denom = std::sqrt(static_cast<double>(signal_numel)); | ||||
|     } else { | ||||
|       scale_denom = static_cast<double>(signal_numel); | ||||
|     } | ||||
|     if (!complex_input && complex_output && !onesided) { | ||||
|       auto end_data_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); | ||||
|       output.narrow(signal_ndim, 0, end_data_slice).div_(scale_denom); | ||||
|     } else { | ||||
|       output.div_(scale_denom); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // if needed, fill out the other half using conjugate symmetry | ||||
|   if (!complex_input && complex_output && !onesided) { | ||||
|     DimVector signal_dims(signal_ndim); | ||||
|     std::iota(signal_dims.begin(), signal_dims.end(), 1); | ||||
|     auto out_as_complex = at::view_as_complex(output); | ||||
|     at::native::_fft_fill_with_conjugate_symmetry_(out_as_complex, signal_dims); | ||||
|   } | ||||
|   return output; | ||||
| } | ||||
| #pragma pop | ||||
|  | ||||
| // The cuFFT plan cache | ||||
| // unique_ptr for nullability and to avoid reference invalidation on vector resize | ||||
| static std::vector<std::unique_ptr<CuFFTParamsLRUCache>> plan_caches; | ||||
| static std::mutex plan_caches_mutex; | ||||
|  | ||||
| static inline | ||||
| CuFFTParamsLRUCache &cufft_get_plan_cache(int64_t device_index) { | ||||
|   std::lock_guard<std::mutex> guard(plan_caches_mutex); | ||||
|  | ||||
|   AT_ASSERT(device_index >= 0); | ||||
|  | ||||
|   if (device_index >= plan_caches.size()) { | ||||
|     plan_caches.resize(device_index + 1); | ||||
|   } | ||||
|  | ||||
|   if (!plan_caches[device_index]) { | ||||
|     plan_caches[device_index] = std::make_unique<CuFFTParamsLRUCache>(); | ||||
|   } | ||||
|  | ||||
|   return *plan_caches[device_index]; | ||||
| } | ||||
|  | ||||
|  | ||||
| namespace detail { | ||||
|  | ||||
| int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_get_plan_cache_max_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), ", but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).max_size(); | ||||
| } | ||||
|  | ||||
| void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_set_plan_cache_max_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), ", but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).resize(max_size); | ||||
| } | ||||
|  | ||||
| int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_get_plan_cache_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), ", but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).size(); | ||||
| } | ||||
|  | ||||
| void cufft_clear_plan_cache_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_clear_plan_cache: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), ", but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).clear(); | ||||
| } | ||||
|  | ||||
| } // namespace at::native::detail | ||||
|  | ||||
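These `*_impl` functions are the per-device plan-cache controls that the Python-side `torch.backends.cuda.cufft_plan_cache` object ultimately routes to. A minimal sketch of exercising them, assuming a CUDA build with at least one visible GPU:

    import torch

    if torch.cuda.is_available():
        cache = torch.backends.cuda.cufft_plan_cache[0]    # plan cache for device 0
        print(cache.max_size, cache.size)                  # capacity and current entries
        torch.fft.rfft(torch.randn(1024, device="cuda"))   # creates (and caches) a plan
        print(cache.size)                                  # typically at least 1 now
        cache.max_size = 16                                # resize the LRU cache
        cache.clear()                                      # drop all cached plans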
| namespace { | ||||
| constexpr int64_t cufft_max_ndim = 3; | ||||
|  | ||||
| // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) | ||||
| static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, | ||||
|                          IntArrayRef dim, bool forward) { | ||||
|   const auto ndim = self.dim(); | ||||
|   const int64_t signal_ndim = dim.size(); | ||||
|   const auto batch_dims = ndim - signal_ndim; | ||||
|  | ||||
|   // Permute dimensions so batch dimensions come first, and in stride order | ||||
|   // This maximizes data locality when collapsing to a single batch dimension | ||||
|   DimVector dim_permute(ndim); | ||||
|   std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0}); | ||||
|  | ||||
|   c10::SmallVector<bool, kDimVectorStaticSize> is_transformed_dim(ndim); | ||||
|   for (const auto& d : dim) { | ||||
|     is_transformed_dim[d] = true; | ||||
|   } | ||||
|   auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(), | ||||
|                                   [&](int64_t d) {return !is_transformed_dim[d]; }); | ||||
|   auto self_strides = self.strides(); | ||||
|   std::sort(dim_permute.begin(), batch_end, | ||||
|             [&](int64_t a, int64_t b) { return self_strides[a] > self_strides[b]; }); | ||||
|   std::copy(dim.cbegin(), dim.cend(), batch_end); | ||||
|   auto input = self.permute(dim_permute); | ||||
|  | ||||
|   // Collapse batch dimensions into a single dimension | ||||
|   DimVector batched_sizes(signal_ndim + 1); | ||||
|   batched_sizes[0] = -1; | ||||
|   std::copy(input.sizes().cbegin() + batch_dims, input.sizes().cend(), batched_sizes.begin() + 1); | ||||
|   input = input.reshape(batched_sizes); | ||||
|  | ||||
|   const auto batch_size = input.sizes()[0]; | ||||
|   DimVector signal_size(signal_ndim + 1); | ||||
|   signal_size[0] = batch_size; | ||||
|   for (int64_t i = 0; i < signal_ndim; ++i) { | ||||
|     auto in_size = input.sizes()[i + 1]; | ||||
|     auto out_size = out_sizes[dim[i]]; | ||||
|     signal_size[i + 1] = std::max(in_size, out_size); | ||||
|     TORCH_INTERNAL_ASSERT(in_size == signal_size[i + 1] || | ||||
|                           in_size == (signal_size[i + 1] / 2) + 1); | ||||
|     TORCH_INTERNAL_ASSERT(out_size == signal_size[i + 1] || | ||||
|                           out_size == (signal_size[i + 1] / 2) + 1); | ||||
|   } | ||||
|  | ||||
|   batched_sizes[0] = batch_size; | ||||
|   DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); | ||||
|   for (size_t i = 0; i < dim.size(); ++i) { | ||||
|     batched_out_sizes[i + 1] = out_sizes[dim[i]]; | ||||
|   } | ||||
|   out.resize_(batched_out_sizes, MemoryFormat::Contiguous); | ||||
|  | ||||
|   // Create the transform plan (either from cache or locally) | ||||
|   const auto value_type = c10::toValueType(input.scalar_type()); | ||||
|   auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); | ||||
|   CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); | ||||
|   CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); | ||||
|   std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock); | ||||
|   c10::optional<CuFFTConfig> uncached_plan; | ||||
|   const CuFFTConfig * config = nullptr; | ||||
|  | ||||
|   if (plan_cache.max_size() > 0) { | ||||
|     guard.lock(); | ||||
|     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock | ||||
|       config = &plan_cache.lookup(Params); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   if (config == nullptr) { | ||||
|     uncached_plan.emplace(Params); | ||||
|     config = &uncached_plan.value(); | ||||
|   } | ||||
|  | ||||
|   auto & plan = config->plan(); | ||||
|  | ||||
|   if (config->should_clone_input()) { | ||||
|     input = input.clone(MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   // prepare cufft for execution | ||||
|   CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); | ||||
|   auto workspace = at::empty({ config->workspace_size() }, at::device(at::kCUDA).dtype(at::kByte)); | ||||
|   CUFFT_CHECK(cufftSetWorkArea(plan, workspace.data_ptr())); | ||||
|  | ||||
|   // execute transform plan | ||||
|   exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward); | ||||
|  | ||||
|   // Inplace reshaping to original batch shape and inverting the dimension permutation | ||||
|   DimVector out_strides(ndim); | ||||
|   int64_t batch_numel = 1; | ||||
|   for (int64_t i = batch_dims - 1; i >= 0; --i) { | ||||
|     out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; | ||||
|     batch_numel *= out_sizes[dim_permute[i]]; | ||||
|   } | ||||
|   for (int64_t i = batch_dims; i < ndim; ++i) { | ||||
|     out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; | ||||
|   } | ||||
|   return out.as_strided_(out_sizes, out_strides, out.storage_offset()); | ||||
| } | ||||
|  | ||||
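The permutation bookkeeping at the top of `_exec_fft` (batch dimensions first, ordered by decreasing stride, transformed dimensions last, then collapsed into a single batch dimension) can be mirrored in a few lines of Python. This is only a rough sketch of the ordering rule, with a hypothetical `plan_permutation` helper, not the actual implementation:

    def plan_permutation(strides, dim):
        # Non-transformed (batch) dims first, in order of decreasing stride,
        # followed by the transformed dims in their given order.
        transformed = set(dim)
        batch = [d for d in range(len(strides)) if d not in transformed]
        batch.sort(key=lambda d: strides[d], reverse=True)
        return batch + list(dim)

    # e.g. a (B, C, H, W) tensor transformed over dims (2, 3):
    print(plan_permutation((4096, 1024, 32, 1), dim=(2, 3)))   # [0, 1, 2, 3]
    print(plan_permutation((1, 8, 4096, 64), dim=(2, 3)))      # [1, 0, 2, 3]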
| // Calculates the normalization constant; the _fft_apply_normalization helpers | ||||
| // below apply it to self. sizes are the sizes of a twosided tensor and dims | ||||
| // are all of the transformed dims. | ||||
| double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto norm = static_cast<fft_norm_mode>(normalization); | ||||
|   if (norm == fft_norm_mode::none) { | ||||
|     return 1.0; | ||||
|   } | ||||
|  | ||||
|   int64_t signal_numel = 1; | ||||
|   for (auto dim : dims) { | ||||
|     signal_numel *= sizes[dim]; | ||||
|   } | ||||
|   const double scale_denom = (norm == fft_norm_mode::by_root_n) ? | ||||
|     std::sqrt(signal_numel) : static_cast<double>(signal_numel); | ||||
|   return 1.0 / scale_denom; | ||||
| } | ||||
|  | ||||
| const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto scale = _fft_normalization_scale(normalization, sizes, dims); | ||||
|   return (scale == 1.0) ? self : self.mul_(scale); | ||||
| } | ||||
|  | ||||
| Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto scale = _fft_normalization_scale(normalization, sizes, dims); | ||||
|   return at::mul_out(out, self, c10::scalar_to_tensor(scale)); | ||||
| } | ||||
|  | ||||
| }  // namespace (anonymous) | ||||
|  | ||||
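The `fft_norm_mode` values handled by `_fft_normalization_scale` correspond to the public `norm=` argument of `torch.fft`: `none` is "backward", `by_root_n` is "ortho", and `by_n` is "forward" for a forward transform. A quick numerical check using only the standard Python API:

    import torch

    x = torch.randn(128)
    n = x.numel()
    base = torch.fft.fft(x, norm="backward")   # no scaling on the forward transform
    assert torch.allclose(torch.fft.fft(x, norm="ortho"), base / n ** 0.5)
    assert torch.allclose(torch.fft.fft(x, norm="forward"), base / n)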
| // n-dimensional real to complex FFT | ||||
| Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { | ||||
|   TORCH_CHECK(self.is_floating_point()); | ||||
|   auto input_sizes = self.sizes(); | ||||
|   DimVector onesided_sizes(input_sizes.begin(), input_sizes.end()); | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; | ||||
|   onesided_sizes[last_dim] = last_dim_halfsize; | ||||
|   IntArrayRef out_sizes = onesided ? onesided_sizes : input_sizes; | ||||
|  | ||||
|   const auto out_options = self.options().dtype(c10::toComplexType(self.scalar_type())); | ||||
|   auto output = at::empty(out_sizes, out_options); | ||||
|  | ||||
|   // CuFFT requires real input to be over-aligned, as if it were complex | ||||
|   const auto complex_size = 2 * self.element_size(); | ||||
|   const bool complex_aligned = ( | ||||
|       reinterpret_cast<std::uintptr_t>(self.data_ptr()) % complex_size == 0); | ||||
|   auto working_tensor = self; | ||||
|   if (!complex_aligned) { | ||||
|     working_tensor = self.movedim(last_dim, -1) | ||||
|                          .clone(MemoryFormat::Contiguous) | ||||
|                          .movedim(-1, last_dim); | ||||
|   } | ||||
|  | ||||
|   // First do the R2C transform on the last dimension | ||||
|   { | ||||
|     auto target_sizes = dim.size() == 1 ? out_sizes : onesided_sizes; | ||||
|     _exec_fft(output, working_tensor, target_sizes, last_dim, /*forward=*/true); | ||||
|     if (dim.size() > 1) { | ||||
|       working_tensor = at::empty(out_sizes, out_options); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Then any remaining C2C transforms | ||||
|   DimVector sorted_dims(dim.begin(), dim.end() - 1); | ||||
|   while (!sorted_dims.empty()) { | ||||
|     std::swap(output, working_tensor); | ||||
|  | ||||
|     // Resort dimensions every time as _exec_fft re-strides the output | ||||
|     auto strides = working_tensor.strides(); | ||||
|     std::sort(sorted_dims.begin(), sorted_dims.end(), | ||||
|               [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); | ||||
|  | ||||
|     const auto max_dims = std::min(static_cast<size_t>(cufft_max_ndim), sorted_dims.size()); | ||||
|     auto last_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); | ||||
|  | ||||
|     // Intermediate results are always onesided | ||||
|     _exec_fft(output, working_tensor, onesided_sizes, last_dims, /*forward=*/true); | ||||
|     sorted_dims.resize(sorted_dims.size() - max_dims); | ||||
|   } | ||||
|  | ||||
|   // Only need to normalize the onesided slice since data in the other half is overwritten | ||||
|   auto out_slice = output.slice(last_dim, 0, last_dim_halfsize); | ||||
|   _fft_apply_normalization(out_slice, normalization, input_sizes, dim); | ||||
|  | ||||
|   if (!onesided) { | ||||
|     if (output.sizes()[last_dim] != out_sizes[last_dim]) { | ||||
|       working_tensor.resize_(out_sizes, MemoryFormat::Contiguous); | ||||
|       working_tensor.slice(last_dim, 0, last_dim_halfsize).copy_(output); | ||||
|       output = std::move(working_tensor); | ||||
|     } | ||||
|     at::native::_fft_fill_with_conjugate_symmetry_(output, dim); | ||||
|   } | ||||
|   return output; | ||||
| } | ||||
|  | ||||
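The onesided/twosided handling above is what `torch.fft.rfft` and `torch.fft.fft` expose. A small sketch checking that the onesided R2C output is the first n // 2 + 1 bins of the full transform, with the remaining bins implied by conjugate symmetry (which the twosided path fills in explicitly):

    import torch

    x = torch.randn(10)
    half = torch.fft.rfft(x)    # onesided: 10 // 2 + 1 = 6 bins
    full = torch.fft.fft(x)     # twosided: 10 bins
    assert half.shape[-1] == x.shape[-1] // 2 + 1
    assert torch.allclose(half, full[:6], atol=1e-6)
    # X[n - k] == conj(X[k]) supplies the other half
    assert torch.allclose(full[6:], full[1:5].flip(-1).conj(), atol=1e-6)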
| Tensor& _fft_r2c_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, bool onesided, Tensor& out) { | ||||
|   auto result = _fft_r2c_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), /*onesided=*/true); | ||||
|   if (onesided) { | ||||
|     return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); | ||||
|   } | ||||
|  | ||||
|   resize_output(out, self.sizes()); | ||||
|  | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = result.sizes()[last_dim]; | ||||
|   auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); | ||||
|   _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); | ||||
|   at::native::_fft_fill_with_conjugate_symmetry_(out, dim); | ||||
|   return out; | ||||
| } | ||||
|  | ||||
| // n-dimensional complex to real IFFT | ||||
| Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { | ||||
|   TORCH_CHECK(self.is_complex()); | ||||
|   auto in_sizes = self.sizes(); | ||||
|   DimVector out_sizes(in_sizes.begin(), in_sizes.end()); | ||||
|   out_sizes[dim.back()] = lastdim; | ||||
|  | ||||
|   // First complete any C2C transforms | ||||
|   Tensor temp; | ||||
|   if (dim.size() > 1) { | ||||
|     temp = _fft_c2c_cufft( | ||||
|         self, dim.slice(0, dim.size() - 1), | ||||
|         static_cast<int64_t>(fft_norm_mode::none), /*forward=*/false); | ||||
|   } else { | ||||
|     // Complex to real FFTs may overwrite the input buffer, so must always clone (gh-34551) | ||||
|     temp = self.clone(MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   // Finally, do a 1D C2R transform | ||||
|   // TODO: could transform up to 2 other dims in the same cuFFT operation | ||||
|   auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); | ||||
|   _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); | ||||
|   return _fft_apply_normalization(output, normalization, out_sizes, dim); | ||||
| } | ||||
|  | ||||
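The clone in the single-dimension branch above exists because cuFFT C2R transforms may overwrite their input buffer (gh-34551); from Python, the visible guarantee is simply that `torch.fft.irfft` leaves its argument untouched. Sketch:

    import torch

    x = torch.randn(9)
    X = torch.fft.rfft(x)            # onesided spectrum: 9 // 2 + 1 = 5 bins
    X_before = X.clone()
    y = torch.fft.irfft(X, n=9)      # C2R back to a length-9 real signal
    assert torch.equal(X, X_before)  # the input spectrum is not modified
    assert torch.allclose(y, x, atol=1e-6)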
| Tensor& _fft_c2r_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, int64_t lastdim, Tensor& out) { | ||||
|   auto result = _fft_c2r_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), lastdim); | ||||
|   return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); | ||||
| } | ||||
|  | ||||
| // n-dimensional complex to complex FFT/IFFT | ||||
| Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { | ||||
|   TORCH_CHECK(self.is_complex()); | ||||
|   if (dim.empty()) { | ||||
|     return self.clone(); | ||||
|   } | ||||
|  | ||||
|   auto out_sizes = self.sizes(); | ||||
|   auto output = at::empty(out_sizes, self.options()); | ||||
|  | ||||
|   // Perform any number of C2C transforms | ||||
|   DimVector sorted_dims(dim.begin(), dim.end()); | ||||
|   auto self_strides = self.strides(); | ||||
|   auto working_tensor = self; | ||||
|   while (true) { | ||||
|     // Sort dimensions every time as _exec_fft re-strides the output | ||||
|     auto strides = working_tensor.strides(); | ||||
|     std::sort(sorted_dims.begin(), sorted_dims.end(), | ||||
|               [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); | ||||
|  | ||||
|     const auto max_dims = std::min(static_cast<size_t>(cufft_max_ndim), sorted_dims.size()); | ||||
|     auto first_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); | ||||
|  | ||||
|     _exec_fft(output, working_tensor, out_sizes, first_dims, forward); | ||||
|     sorted_dims.resize(sorted_dims.size() - max_dims); | ||||
|  | ||||
|     if (sorted_dims.empty()) { | ||||
|       break; | ||||
|     } | ||||
|  | ||||
|     if (working_tensor.is_same(self)) { | ||||
|       working_tensor = std::move(output); | ||||
|       output = at::empty(out_sizes, self.options()); | ||||
|     } else { | ||||
|       std::swap(output, working_tensor); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return _fft_apply_normalization(output, normalization, out_sizes, dim); | ||||
| } | ||||
|  | ||||
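Because a single cuFFT plan covers at most `cufft_max_ndim` (3) signal dimensions, the loop above chains multiple plan executions for higher-dimensional transforms; this is invisible to callers of `torch.fft.fftn`. A round-trip sketch (the chained-execution path is only taken for CUDA tensors):

    import torch

    dev = "cuda" if torch.cuda.is_available() else "cpu"
    x = torch.randn(4, 4, 4, 4, dtype=torch.complex64, device=dev)
    X = torch.fft.fftn(x, dim=(0, 1, 2, 3))   # 4 transformed dims > cufft_max_ndim
    assert torch.allclose(torch.fft.ifftn(X, dim=(0, 1, 2, 3)), x, atol=1e-5)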
| Tensor& _fft_c2c_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, bool forward, Tensor& out) { | ||||
|   auto result = _fft_c2c_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), forward); | ||||
|   return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); | ||||
| } | ||||
|  | ||||
|  | ||||
| }} // at::native | ||||
| @ -16,10 +16,6 @@ | ||||
| #include <THC/THCTensorSort.cuh> | ||||
| #include <THC/THCThrustAllocator.cuh> | ||||
|  | ||||
| #include <thrust/execution_policy.h> | ||||
| #include <thrust/unique.h> | ||||
| #include <cufft.h> | ||||
| #include <cufftXt.h> | ||||
|  | ||||
| #include <cmath> | ||||
| #include <vector> | ||||
| @ -83,6 +79,7 @@ struct HermitianSymmetryOffsetCalculator { | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| // out[:] = conj(in[:]) where in and out ordering is generalized by offset calculators | ||||
| template <typename scalar_t, typename inp_calc_t, typename out_calc_t> | ||||
| C10_LAUNCH_BOUNDS_1(cuda::detail::CUDA_NUM_THREADS) | ||||
| @ -135,504 +132,4 @@ void _fft_fill_with_conjugate_symmetry_cuda_( | ||||
|  | ||||
| REGISTER_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cuda_); | ||||
|  | ||||
| // Execute a pre-planned tranform | ||||
| static void exec_cufft_plan( | ||||
|     const CuFFTConfig &config, void* in_data, void* out_data, bool forward) { | ||||
|   auto& plan = config.plan(); | ||||
| #ifdef __HIP_PLATFORM_HCC__ | ||||
|   auto value_type = config.data_type(); | ||||
|   if (value_type == kFloat) { | ||||
|     switch (config.transform_type()) { | ||||
|       case CuFFTTransformType::C2C: { | ||||
|         CUFFT_CHECK(hipfftExecC2C(plan, static_cast<hipfftComplex*>(in_data), | ||||
|                                   static_cast<hipfftComplex*>(out_data), | ||||
|                                   forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::R2C: { | ||||
|         CUFFT_CHECK(hipfftExecR2C(plan, static_cast<hipfftReal*>(in_data), | ||||
|                                   static_cast<hipfftComplex*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::C2R: { | ||||
|         CUFFT_CHECK(hipfftExecC2R(plan, static_cast<hipfftComplex*>(in_data), | ||||
|                                   static_cast<hipfftReal*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|   } else if (value_type == kDouble) { | ||||
|     switch (config.transform_type()) { | ||||
|       case CuFFTTransformType::C2C: { | ||||
|         CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast<hipfftDoubleComplex*>(in_data), | ||||
|                                   static_cast<hipfftDoubleComplex*>(out_data), | ||||
|                                   forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::R2C: { | ||||
|         CUFFT_CHECK(hipfftExecD2Z(plan, static_cast<hipfftDoubleReal*>(in_data), | ||||
|                                   static_cast<hipfftDoubleComplex*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|       case CuFFTTransformType::C2R: { | ||||
|         CUFFT_CHECK(hipfftExecZ2D(plan, static_cast<hipfftDoubleComplex*>(in_data), | ||||
|                                   static_cast<hipfftDoubleReal*>(out_data))); | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   TORCH_CHECK(false, "hipFFT doesn't support transforms on type: ", value_type); | ||||
| #else | ||||
|   CUFFT_CHECK(cufftXtExec(plan, in_data, out_data, | ||||
|                           forward ? CUFFT_FORWARD : CUFFT_INVERSE)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| // NOTE [ cuFFT Embedded Strides ] | ||||
| // | ||||
| // cuFFT supports a subset of arbitrary strides via their "advanced data layout" | ||||
| // option (http://docs.nvidia.com/cuda/cufft/index.html#advanced-data-layout). | ||||
| // Specifically, these are tensors that can be viewed as subtensors resulted | ||||
| // from slicing a larger contiguous tensors. For such input tensors, let the | ||||
| // sizes of the enclosing tensor be `inembed`, and we can have in 3d case: | ||||
| // | ||||
| //     input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z)] | ||||
| // | ||||
| // Above is the simplified formula ignoring the batch dimension. In fact, the | ||||
| // last dimension of the enclosing tensor doesn't have to be contiguous, i.e., | ||||
| // it can be greater than 1. Then one can set the base stride for the enclosing | ||||
| // tensor with `istride`. Then we have | ||||
| // | ||||
| //     input[x, y, z] = input[((x * inembed[1] + y) * inembed[2] + z) * istride] | ||||
| // | ||||
| // For example, consider | ||||
| // | ||||
| //     enclosing = torch.zeros(6, 8, 10)  # contiguous | ||||
| //     input = enclosing[:4, 2:6, 6:] | ||||
| //     input.size()                       # [ 4,  4,  4] | ||||
| //     input.stride()                     # [80, 10,  1] | ||||
| //     # inembed = [6, 8, 10] | ||||
| //     input[2, 1, 3] = input[((2 * 8) + 1) * 10 + 3]   # using above formula | ||||
| //                    = input[173] | ||||
| //                    = input[2 * 80 + 1 * 10 + 1 * 3]  # using strides directly | ||||
| // | ||||
| // Generally, the embedded strides can be computed as | ||||
| // | ||||
| //     embed[i] = stride[i - 1] / stride[i]. | ||||
| // | ||||
| // Note that the value of embed[0] isn't used to compute indices and doesn't | ||||
| // matter. | ||||
| // | ||||
| // Contrary to advanced data layout, simple layout means that *embeds have | ||||
| // unit-strides. In particular, unit-stride refers to that the input and output | ||||
| // tensors being contiguous, and that the strides at the innermost signal | ||||
| // dimension being unit (1) w.r.t. the corresponding data type. | ||||
|  | ||||
| #pragma push | ||||
| #pragma diag_suppress 177   // Function was declared but never referenced | ||||
| static inline Tensor _run_cufft( | ||||
|     const CuFFTConfig &config, Tensor& input, int64_t signal_ndim, | ||||
|     bool complex_input, bool complex_output, bool inverse, | ||||
|     IntArrayRef checked_signal_sizes, fft_norm_mode norm, bool onesided, | ||||
|     IntArrayRef output_sizes, bool input_was_cloned | ||||
| ) { | ||||
|   if (config.should_clone_input() && !input_was_cloned) { | ||||
|     input = input.clone(at::MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   auto& plan = config.plan(); | ||||
|   auto& ctx = at::globalContext(); | ||||
|  | ||||
|   // set output | ||||
|   auto output = at::empty(output_sizes, input.options()); | ||||
|  | ||||
|   // set to current stream | ||||
|   CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); | ||||
|  | ||||
|   auto ws = at::empty({ config.workspace_size() }, at::device(at::kCUDA).dtype(at::kByte)); | ||||
|   CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); | ||||
|  | ||||
|   // run | ||||
|   exec_cufft_plan(config, input.data_ptr(), output.data_ptr(), !inverse); | ||||
|  | ||||
|   // rescale if requested | ||||
|   auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1]; | ||||
|   if (norm != fft_norm_mode::none) { | ||||
|     auto signal_numel = c10::multiply_integers(checked_signal_sizes); | ||||
|     double scale_denom; | ||||
|     if (norm == fft_norm_mode::by_root_n) { | ||||
|       scale_denom = std::sqrt(static_cast<double>(signal_numel)); | ||||
|     } else { | ||||
|       scale_denom = static_cast<double>(signal_numel); | ||||
|     } | ||||
|     if (!complex_input && complex_output && !onesided) { | ||||
|       auto end_data_slice = infer_ft_real_to_complex_onesided_size(size_last_signal_dim); | ||||
|       output.narrow(signal_ndim, 0, end_data_slice).div_(scale_denom); | ||||
|     } else { | ||||
|       output.div_(scale_denom); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // if needed, fill out the other half using conjugate symmetry | ||||
|   if (!complex_input && complex_output && !onesided) { | ||||
|     DimVector signal_dims(signal_ndim); | ||||
|     std::iota(signal_dims.begin(), signal_dims.end(), 1); | ||||
|     auto out_as_complex = at::view_as_complex(output); | ||||
|     at::native::_fft_fill_with_conjugate_symmetry_(out_as_complex, signal_dims); | ||||
|   } | ||||
|   return output; | ||||
| } | ||||
| #pragma pop | ||||
|  | ||||
| // The cuFFT plan cache | ||||
| // unique_ptr for nullability and to avoid reference invalidation on vector resize | ||||
| static std::vector<std::unique_ptr<CuFFTParamsLRUCache>> plan_caches; | ||||
| static std::mutex plan_caches_mutex; | ||||
|  | ||||
| static inline | ||||
| CuFFTParamsLRUCache &cufft_get_plan_cache(int64_t device_index) { | ||||
|   std::lock_guard<std::mutex> guard(plan_caches_mutex); | ||||
|  | ||||
|   AT_ASSERT(device_index >= 0); | ||||
|  | ||||
|   if (device_index >= plan_caches.size()) { | ||||
|     plan_caches.resize(device_index + 1); | ||||
|   } | ||||
|  | ||||
|   if (!plan_caches[device_index]) { | ||||
|     plan_caches[device_index] = std::make_unique<CuFFTParamsLRUCache>(); | ||||
|   } | ||||
|  | ||||
|   return *plan_caches[device_index]; | ||||
| } | ||||
|  | ||||
|  | ||||
| namespace detail { | ||||
|  | ||||
| int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_get_plan_cache_max_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).max_size(); | ||||
| } | ||||
|  | ||||
| void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_set_plan_cache_max_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).resize(max_size); | ||||
| } | ||||
|  | ||||
| int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_get_plan_cache_size: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).size(); | ||||
| } | ||||
|  | ||||
| void cufft_clear_plan_cache_impl(int64_t device_index) { | ||||
|   TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), | ||||
|     "cufft_clear_plan_cache: expected 0 <= device_index < ", | ||||
|     at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", | ||||
|     device_index); | ||||
|   return cufft_get_plan_cache(device_index).clear(); | ||||
| } | ||||
|  | ||||
| } // namespace at::native::detail | ||||
|  | ||||
| namespace { | ||||
| constexpr int64_t cufft_max_ndim = 3; | ||||
|  | ||||
| // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) | ||||
| static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, | ||||
|                          IntArrayRef dim, bool forward) { | ||||
|   const auto ndim = self.dim(); | ||||
|   const int64_t signal_ndim = dim.size(); | ||||
|   const auto batch_dims = ndim - signal_ndim; | ||||
|  | ||||
|   // Permute dimensions so batch dimensions come first, and in stride order | ||||
|   // This maximizes data locality when collapsing to a single batch dimension | ||||
|   DimVector dim_permute(ndim); | ||||
|   std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0}); | ||||
|  | ||||
|   c10::SmallVector<bool, kDimVectorStaticSize> is_transformed_dim(ndim); | ||||
|   for (const auto& d : dim) { | ||||
|     is_transformed_dim[d] = true; | ||||
|   } | ||||
|   auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(), | ||||
|                                   [&](int64_t d) {return !is_transformed_dim[d]; }); | ||||
|   auto self_strides = self.strides(); | ||||
|   std::sort(dim_permute.begin(), batch_end, | ||||
|             [&](int64_t a, int64_t b) { return self_strides[a] > self_strides[b]; }); | ||||
|   std::copy(dim.cbegin(), dim.cend(), batch_end); | ||||
|   auto input = self.permute(dim_permute); | ||||
|  | ||||
|   // Collapse batch dimensions into a single dimension | ||||
|   DimVector batched_sizes(signal_ndim + 1); | ||||
|   batched_sizes[0] = -1; | ||||
|   std::copy(input.sizes().cbegin() + batch_dims, input.sizes().cend(), batched_sizes.begin() + 1); | ||||
|   input = input.reshape(batched_sizes); | ||||
|  | ||||
|   const auto batch_size = input.sizes()[0]; | ||||
|   DimVector signal_size(signal_ndim + 1); | ||||
|   signal_size[0] = batch_size; | ||||
|   for (int64_t i = 0; i < signal_ndim; ++i) { | ||||
|     auto in_size = input.sizes()[i + 1]; | ||||
|     auto out_size = out_sizes[dim[i]]; | ||||
|     signal_size[i + 1] = std::max(in_size, out_size); | ||||
|     TORCH_INTERNAL_ASSERT(in_size == signal_size[i + 1] || | ||||
|                           in_size == (signal_size[i + 1] / 2) + 1); | ||||
|     TORCH_INTERNAL_ASSERT(out_size == signal_size[i + 1] || | ||||
|                           out_size == (signal_size[i + 1] / 2) + 1); | ||||
|   } | ||||
|  | ||||
|   batched_sizes[0] = batch_size; | ||||
|   DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); | ||||
|   for (size_t i = 0; i < dim.size(); ++i) { | ||||
|     batched_out_sizes[i + 1] = out_sizes[dim[i]]; | ||||
|   } | ||||
|   out.resize_(batched_out_sizes, MemoryFormat::Contiguous); | ||||
|  | ||||
|   // Create the transform plan (either from cache or locally) | ||||
|   const auto value_type = c10::toValueType(input.scalar_type()); | ||||
|   auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); | ||||
|   CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); | ||||
|   CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); | ||||
|   std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock); | ||||
|   c10::optional<CuFFTConfig> uncached_plan; | ||||
|   const CuFFTConfig * config = nullptr; | ||||
|  | ||||
|   if (plan_cache.max_size() > 0) { | ||||
|     guard.lock(); | ||||
|     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock | ||||
|       config = &plan_cache.lookup(Params); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   if (config == nullptr) { | ||||
|     uncached_plan.emplace(Params); | ||||
|     config = &uncached_plan.value(); | ||||
|   } | ||||
|  | ||||
|   auto & plan = config->plan(); | ||||
|  | ||||
|   if (config->should_clone_input()) { | ||||
|     input = input.clone(MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   // prepare cufft for execution | ||||
|   CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); | ||||
|   auto workspace = at::empty({ config->workspace_size() }, at::device(at::kCUDA).dtype(at::kByte)); | ||||
|   CUFFT_CHECK(cufftSetWorkArea(plan, workspace.data_ptr())); | ||||
|  | ||||
|   // execute transform plan | ||||
|   exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward); | ||||
|  | ||||
|   // Inplace reshaping to original batch shape and inverting the dimension permutation | ||||
|   DimVector out_strides(ndim); | ||||
|   int64_t batch_numel = 1; | ||||
|   for (int64_t i = batch_dims - 1; i >= 0; --i) { | ||||
|     out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; | ||||
|     batch_numel *= out_sizes[dim_permute[i]]; | ||||
|   } | ||||
|   for (int64_t i = batch_dims; i < ndim; ++i) { | ||||
|     out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; | ||||
|   } | ||||
|   return out.as_strided_(out_sizes, out_strides, out.storage_offset()); | ||||
| } | ||||
|  | ||||
| // Calculates the normalization constant and applies it in-place to self | ||||
| // sizes is the sizes of a twosided tensor and dims are all transformed dims | ||||
| double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto norm = static_cast<fft_norm_mode>(normalization); | ||||
|   if (norm == fft_norm_mode::none) { | ||||
|     return 1.0; | ||||
|   } | ||||
|  | ||||
|   int64_t signal_numel = 1; | ||||
|   for (auto dim : dims) { | ||||
|     signal_numel *= sizes[dim]; | ||||
|   } | ||||
|   const double scale_denom = (norm == fft_norm_mode::by_root_n) ? | ||||
|     std::sqrt(signal_numel) : static_cast<double>(signal_numel); | ||||
|   return 1.0 / scale_denom; | ||||
| } | ||||
|  | ||||
| const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto scale = _fft_normalization_scale(normalization, sizes, dims); | ||||
|   return (scale == 1.0) ? self : self.mul_(scale); | ||||
| } | ||||
|  | ||||
| Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { | ||||
|   auto scale = _fft_normalization_scale(normalization, sizes, dims); | ||||
|   return at::mul_out(out, self, c10::scalar_to_tensor(scale)); | ||||
| } | ||||
|  | ||||
| }  // namespace (anonymous) | ||||
|  | ||||
| // n-dimensional real to complex FFT | ||||
| Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { | ||||
|   TORCH_CHECK(self.is_floating_point()); | ||||
|   auto input_sizes = self.sizes(); | ||||
|   DimVector onesided_sizes(input_sizes.begin(), input_sizes.end()); | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; | ||||
|   onesided_sizes[last_dim] = last_dim_halfsize; | ||||
|   IntArrayRef out_sizes = onesided ? onesided_sizes : input_sizes; | ||||
|  | ||||
|   const auto out_options = self.options().dtype(c10::toComplexType(self.scalar_type())); | ||||
|   auto output = at::empty(out_sizes, out_options); | ||||
|  | ||||
|   // CuFFT requires real input to be over-aligned, as if it were complex | ||||
|   const auto complex_size = 2 * self.element_size(); | ||||
|   const bool complex_aligned = ( | ||||
|       reinterpret_cast<std::uintptr_t>(self.data_ptr()) % complex_size == 0); | ||||
|   auto working_tensor = self; | ||||
|   if (!complex_aligned) { | ||||
|     working_tensor = self.movedim(last_dim, -1) | ||||
|                          .clone(MemoryFormat::Contiguous) | ||||
|                          .movedim(-1, last_dim); | ||||
|   } | ||||
|  | ||||
|   // First do the R2C transform on the last dimension | ||||
|   { | ||||
|     auto target_sizes = dim.size() == 1 ? out_sizes : onesided_sizes; | ||||
|     _exec_fft(output, working_tensor, target_sizes, last_dim, /*forward=*/true); | ||||
|     if (dim.size() > 1) { | ||||
|       working_tensor = at::empty(out_sizes, out_options); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Then any remaining C2C transforms | ||||
|   DimVector sorted_dims(dim.begin(), dim.end() - 1); | ||||
|   while (!sorted_dims.empty()) { | ||||
|     std::swap(output, working_tensor); | ||||
|  | ||||
|     // Resort dimensions every time as _exec_fft re-strides the output | ||||
|     auto strides = working_tensor.strides(); | ||||
|     std::sort(sorted_dims.begin(), sorted_dims.end(), | ||||
|               [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); | ||||
|  | ||||
|     const auto max_dims = std::min(static_cast<size_t>(cufft_max_ndim), sorted_dims.size()); | ||||
|     auto last_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); | ||||
|  | ||||
|     // Intermediate results are always onesided | ||||
|     _exec_fft(output, working_tensor, onesided_sizes, last_dims, /*forward=*/true); | ||||
|     sorted_dims.resize(sorted_dims.size() - max_dims); | ||||
|   } | ||||
|  | ||||
|   // Only need to normalize the onesided slice since data in the other half is overwritten | ||||
|   auto out_slice = output.slice(last_dim, 0, last_dim_halfsize); | ||||
|   _fft_apply_normalization(out_slice, normalization, input_sizes, dim); | ||||
|  | ||||
|   if (!onesided) { | ||||
|     if (output.sizes()[last_dim] != out_sizes[last_dim]) { | ||||
|       working_tensor.resize_(out_sizes, MemoryFormat::Contiguous); | ||||
|       working_tensor.slice(last_dim, 0, last_dim_halfsize).copy_(output); | ||||
|       output = std::move(working_tensor); | ||||
|     } | ||||
|     at::native::_fft_fill_with_conjugate_symmetry_(output, dim); | ||||
|   } | ||||
|   return output; | ||||
| } | ||||
|  | ||||
| Tensor& _fft_r2c_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, bool onesided, Tensor& out) { | ||||
|   auto result = _fft_r2c_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), /*onesided=*/true); | ||||
|   if (onesided) { | ||||
|     return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); | ||||
|   } | ||||
|  | ||||
|   resize_output(out, self.sizes()); | ||||
|  | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = result.sizes()[last_dim]; | ||||
|   auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); | ||||
|   _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); | ||||
|   at::native::_fft_fill_with_conjugate_symmetry_(out, dim); | ||||
|   return out; | ||||
| } | ||||
|  | ||||
| // n-dimensional complex to real IFFT | ||||
| Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { | ||||
|   TORCH_CHECK(self.is_complex()); | ||||
|   auto in_sizes = self.sizes(); | ||||
|   DimVector out_sizes(in_sizes.begin(), in_sizes.end()); | ||||
|   out_sizes[dim.back()] = lastdim; | ||||
|  | ||||
|   // First complete any C2C transforms | ||||
|   Tensor temp; | ||||
|   if (dim.size() > 1) { | ||||
|     temp = _fft_c2c_cufft( | ||||
|         self, dim.slice(0, dim.size() - 1), | ||||
|         static_cast<int64_t>(fft_norm_mode::none), /*forward=*/false); | ||||
|   } else { | ||||
|     // Complex to real FFTs may overwrite the input buffer, so must always clone (gh-34551) | ||||
|     temp = self.clone(MemoryFormat::Contiguous); | ||||
|   } | ||||
|  | ||||
|   // Finally, do a 1D C2R transform | ||||
|   // TODO: could transform up to 2 other dims in the same cuFFT operation | ||||
|   auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); | ||||
|   _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); | ||||
|   return _fft_apply_normalization(output, normalization, out_sizes, dim); | ||||
| } | ||||
|  | ||||
| Tensor& _fft_c2r_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, int64_t lastdim, Tensor& out) { | ||||
|   auto result = _fft_c2r_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), lastdim); | ||||
|   return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); | ||||
| } | ||||
|  | ||||
| // n-dimensional complex to complex FFT/IFFT | ||||
| Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { | ||||
|   TORCH_CHECK(self.is_complex()); | ||||
|   if (dim.empty()) { | ||||
|     return self.clone(); | ||||
|   } | ||||
|  | ||||
|   auto out_sizes = self.sizes(); | ||||
|   auto output = at::empty(out_sizes, self.options()); | ||||
|  | ||||
|   // Perform any number of C2C transforms | ||||
|   DimVector sorted_dims(dim.begin(), dim.end()); | ||||
|   auto self_strides = self.strides(); | ||||
|   auto working_tensor = self; | ||||
|   while (true) { | ||||
|     // Sort dimensions every time as _exec_fft re-strides the output | ||||
|     auto strides = working_tensor.strides(); | ||||
|     std::sort(sorted_dims.begin(), sorted_dims.end(), | ||||
|               [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); | ||||
|  | ||||
|     const auto max_dims = std::min(static_cast<size_t>(cufft_max_ndim), sorted_dims.size()); | ||||
|     auto first_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); | ||||
|  | ||||
|     _exec_fft(output, working_tensor, out_sizes, first_dims, forward); | ||||
|     sorted_dims.resize(sorted_dims.size() - max_dims); | ||||
|  | ||||
|     if (sorted_dims.empty()) { | ||||
|       break; | ||||
|     } | ||||
|  | ||||
|     if (working_tensor.is_same(self)) { | ||||
|       working_tensor = std::move(output); | ||||
|       output = at::empty(out_sizes, self.options()); | ||||
|     } else { | ||||
|       std::swap(output, working_tensor); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return _fft_apply_normalization(output, normalization, out_sizes, dim); | ||||
| } | ||||
|  | ||||
| Tensor& _fft_c2c_cufft_out(const Tensor& self, IntArrayRef dim, | ||||
|                            int64_t normalization, bool forward, Tensor& out) { | ||||
|   auto result = _fft_c2c_cufft(self, dim, static_cast<int64_t>(fft_norm_mode::none), forward); | ||||
|   return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); | ||||
| } | ||||
|  | ||||
|  | ||||
| }} // at::native | ||||
|  | ||||
| @ -141,8 +141,9 @@ std::tuple<Tensor, Tensor, Tensor> unique_cuda_template( | ||||
|       num_inp, | ||||
|       " is too big to be handled by cub"); | ||||
|   Tensor sorted; | ||||
|   Tensor self_c = self.contiguous(); | ||||
|   if (consecutive) { | ||||
|     sorted = self; | ||||
|     sorted = self_c; | ||||
|   } else { | ||||
|     sorted = at::empty({num_inp}, self.options()); | ||||
|   } | ||||
| @ -151,14 +152,14 @@ std::tuple<Tensor, Tensor, Tensor> unique_cuda_template( | ||||
|   Tensor sorted_indices; | ||||
|   if (!return_inverse) { | ||||
|     if (!consecutive) { | ||||
|       cuda::cub::sort_keys(self.data_ptr<scalar_t>(), sorted_data, num_inp); | ||||
|       cuda::cub::sort_keys(self_c.data_ptr<scalar_t>(), sorted_data, num_inp); | ||||
|     } | ||||
|   } else { | ||||
|     if (!consecutive) { | ||||
|       Tensor range = at::arange(0, num_inp, options); | ||||
|       sorted_indices = at::empty({num_inp}, options); | ||||
|       cuda::cub::sort_pairs( | ||||
|           self.data_ptr<scalar_t>(), | ||||
|           self_c.data_ptr<scalar_t>(), | ||||
|           sorted_data, | ||||
|           range.data_ptr<int64_t>(), | ||||
|           sorted_indices.data_ptr<int64_t>(), | ||||
|  | ||||
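The change above makes `unique_cuda_template` sort a contiguous copy (`self_c`) rather than the raw input, so the cub-based sort reads elements in logical order even for non-contiguous CUDA tensors. A small usage sketch (runs on CPU too; the non-contiguous input is the interesting part):

    import torch

    dev = "cuda" if torch.cuda.is_available() else "cpu"
    a = torch.tensor([[3, 1, 2], [1, 3, 2]], device=dev)
    col = a[:, 1]                        # non-contiguous view with stride (3,)
    assert not col.is_contiguous()
    assert torch.equal(torch.unique(col), torch.unique(col.contiguous()))
    print(torch.unique(col))             # tensor([1, 3]) (plus device on CUDA)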
| @ -2148,6 +2148,7 @@ | ||||
| - func: _cufft_clear_plan_cache(int device_index) -> () | ||||
|  | ||||
| - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CPU, CUDA: index | ||||
| @ -2172,6 +2173,7 @@ | ||||
|   variants: function, method | ||||
|  | ||||
| - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) | ||||
|   device_check: NoCheck   # delegate to _index_put_impl_, which leverages TensorIterator | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: index_put_ | ||||
| @ -2182,6 +2184,7 @@ | ||||
|   # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Scalar v) | ||||
|  | ||||
| - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor | ||||
|   device_check: NoCheck   # delegate to _index_put_impl_ after clone, which leverages TensorIterator | ||||
|   variants: function, method | ||||
|  | ||||
| - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) | ||||
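The `device_check: NoCheck` annotations added above only relax the up-front device check for the indexing ops, deferring device handling to the TensorIterator-based `_index_put_impl_`; the Python-level behaviour is unchanged. Plain usage for reference:

    import torch

    x = torch.zeros(5)
    idx = torch.tensor([0, 2, 2])
    x.index_put_((idx,), torch.ones(3), accumulate=True)
    print(x)        # tensor([1., 0., 2., 0., 0.])
    print(x[idx])   # index.Tensor: tensor([1., 2., 2.])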
| @ -3515,6 +3518,31 @@ | ||||
|     CPU, CUDA: silu_backward | ||||
|     CompositeImplicitAutograd: math_silu_backward | ||||
|  | ||||
| - func: mish(Tensor self) -> Tensor | ||||
|   structured_delegate: mish.out | ||||
|   python_module: nn | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: mish | ||||
|  | ||||
| - func: mish_(Tensor(a!) self) -> Tensor(a!) | ||||
|   structured_delegate: mish.out | ||||
|   python_module: nn | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: mish_ | ||||
|  | ||||
| - func: mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   structured_inherits: TensorIteratorBase | ||||
|   python_module: nn | ||||
|   dispatch: | ||||
|     CPU, CUDA: mish_out | ||||
|  | ||||
| - func: mish_backward(Tensor grad_output, Tensor self) -> Tensor | ||||
|   python_module: nn | ||||
|   dispatch: | ||||
|     CPU, CUDA: mish_backward | ||||
|     CompositeImplicitAutograd: math_mish_backward | ||||
|  | ||||
| - func: sigmoid(Tensor self) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured_delegate: sigmoid.out | ||||
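The new `mish` entries above register a structured elementwise kernel plus a composite backward. As a sanity check, Mish is x * tanh(softplus(x)); `torch.nn.functional.mish` is the Python entry point for these kernels:

    import torch
    import torch.nn.functional as F

    x = torch.randn(8, requires_grad=True)
    out = F.mish(x)
    assert torch.allclose(out, x * torch.tanh(F.softplus(x)), atol=1e-6)
    out.sum().backward()   # gradient of mish w.r.t. x
    print(x.grad)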
| @ -4780,9 +4808,9 @@ | ||||
| # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given | ||||
| # the default would never make sense. | ||||
|  | ||||
| - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor | ||||
| - func: _sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor | ||||
|  | ||||
| - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor | ||||
| - func: _sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor | ||||
|  | ||||
| - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor | ||||
|  | ||||
|  | ||||
| @ -36,7 +36,7 @@ SparseCsrTensor new_csr_tensor(const TensorOptions& options) { | ||||
| // TODO: This constructor should probably use an ATen abstract method in order | ||||
| // to make autograd dispatch available for the CSR constructor. See the relevant | ||||
| // note in native_functions.yaml. | ||||
| Tensor sparse_csr_tensor( | ||||
| Tensor _sparse_csr_tensor( | ||||
|     const Tensor& crow_indices, | ||||
|     const Tensor& col_indices, | ||||
|     const Tensor& values, | ||||
| @ -86,7 +86,7 @@ Tensor sparse_csr_tensor( | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor sparse_csr_tensor( | ||||
| Tensor _sparse_csr_tensor( | ||||
|     const Tensor& crow_indices, | ||||
|     const Tensor& col_indices, | ||||
|     const Tensor& values, | ||||
| @ -125,7 +125,7 @@ Tensor sparse_csr_tensor( | ||||
|     size[1] = 0; | ||||
|   } | ||||
|  | ||||
|   return at::sparse_csr_tensor( | ||||
|   return at::_sparse_csr_tensor( | ||||
|       crow_indices, col_indices, values, size, options); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -110,6 +110,15 @@ TEST(MathKernelTest, SiluBackward) { | ||||
|   ASSERT_ALLCLOSE_TOLERANCES(out, math_out, 1e-4, 1e-6); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST(MathKernelTest, MishBackward) { | ||||
|   const auto input = rand({20, 10}); | ||||
|   const auto grad_output = rand({20, 10}); | ||||
|   auto out = at::native::mish_backward(grad_output, input); | ||||
|   auto math_out = at::native::math_mish_backward(grad_output, input); | ||||
|   ASSERT_ALLCLOSE_TOLERANCES(out, math_out, 1e-4, 1e-6); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST(MathKernelTest, NarrowCopy)  { | ||||
|   auto x = rand({5, 8, 7}); | ||||
|  | ||||
| @ -47,8 +47,6 @@ TORCH_CUDA_CPP_API int THCState_getPeerToPeerAccess(THCState* state, int dev, in | ||||
|  | ||||
| TORCH_CUDA_CPP_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state); | ||||
|  | ||||
| TORCH_CUDA_CPP_API void THCMagma_init(THCState *state); | ||||
|  | ||||
| /* For the current device and stream, returns the allocated scratch space */ | ||||
| TORCH_CUDA_CPP_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state); | ||||
|  | ||||
|  | ||||
| @ -6,6 +6,7 @@ | ||||
| #include <THC/THCStorage.hpp> | ||||
| #include <algorithm> | ||||
| #include <ATen/native/cuda/MiscUtils.h> | ||||
| #include <ATen/cuda/detail/CUDAHooks.h> | ||||
|  | ||||
| #ifdef USE_MAGMA | ||||
| #include <magma_v2.h> | ||||
| @ -17,12 +18,19 @@ | ||||
|  | ||||
| #define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)" | ||||
|  | ||||
| void THCMagma_init(THCState *state) | ||||
| { | ||||
| namespace { | ||||
| void _THCMagma_init() { | ||||
| #ifdef USE_MAGMA | ||||
|   magma_init(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| struct Initializer { | ||||
|   Initializer() { | ||||
|     ::at::cuda::detail::THCMagma_init = _THCMagma_init; | ||||
|   }; | ||||
| } initializer; | ||||
| } // anonymous namespace | ||||
|  | ||||
| #include <THC/generic/THCTensorMathMagma.cu> | ||||
| #include <THC/THCGenerateAllTypes.h> | ||||
|  | ||||
| @ -27,7 +27,7 @@ def gen_sparse_csr(shape, nnz): | ||||
|         dense[f] = fill_value | ||||
|     dense = torch.from_numpy(dense.reshape(shape)) | ||||
|  | ||||
|     return dense.to_sparse_csr() | ||||
|     return dense._to_sparse_csr() | ||||
|  | ||||
| def gen_sparse_coo(shape, nnz): | ||||
|     dense = np.random.randn(*shape) | ||||
| @ -51,4 +51,4 @@ def gen_sparse_coo_and_csr(shape, nnz): | ||||
|         dense[f] = 0 | ||||
|  | ||||
|     dense = torch.from_numpy(dense.reshape(shape)) | ||||
|     return dense.to_sparse(), dense.to_sparse_csr() | ||||
|     return dense.to_sparse(), dense._to_sparse_csr() | ||||
|  | ||||
| @ -33,17 +33,21 @@ class C10_CUDA_API CUDAError : public c10::Error { | ||||
|     }                                                            \ | ||||
|   } while (0) | ||||
| #else | ||||
| #define C10_CUDA_CHECK(EXPR)                                              \ | ||||
|   do {                                                                    \ | ||||
|     cudaError_t __err = EXPR;                                             \ | ||||
|     if (__err != cudaSuccess) {                                           \ | ||||
|       auto error_unused C10_UNUSED = cudaGetLastError();                  \ | ||||
|       auto _cuda_check_prefix = c10::cuda::get_cuda_check_prefix();       \ | ||||
|       throw c10::CUDAError(                                               \ | ||||
|           {__func__, __FILE__, static_cast<uint32_t>(__LINE__)},          \ | ||||
|           TORCH_CHECK_MSG(                                                \ | ||||
|               false, "", _cuda_check_prefix, cudaGetErrorString(__err))); \ | ||||
|     }                                                                     \ | ||||
| #define C10_CUDA_CHECK(EXPR)                                        \ | ||||
|   do {                                                              \ | ||||
|     cudaError_t __err = EXPR;                                       \ | ||||
|     if (__err != cudaSuccess) {                                     \ | ||||
|       auto error_unused C10_UNUSED = cudaGetLastError();            \ | ||||
|       auto _cuda_check_suffix = c10::cuda::get_cuda_check_suffix(); \ | ||||
|       throw c10::CUDAError(                                         \ | ||||
|           {__func__, __FILE__, static_cast<uint32_t>(__LINE__)},    \ | ||||
|           TORCH_CHECK_MSG(                                          \ | ||||
|               false,                                                \ | ||||
|               "",                                                   \ | ||||
|               "CUDA error: ",                                       \ | ||||
|               cudaGetErrorString(__err),                            \ | ||||
|               _cuda_check_suffix));                                 \ | ||||
|     }                                                               \ | ||||
|   } while (0) | ||||
| #endif | ||||
|  | ||||
|  | ||||
| @ -141,17 +141,16 @@ void device_synchronize() { | ||||
|   C10_CUDA_CHECK(cudaDeviceSynchronize()); | ||||
| } | ||||
|  | ||||
| const char* get_cuda_check_prefix() noexcept { | ||||
| const char* get_cuda_check_suffix() noexcept { | ||||
|   static char* device_blocking_flag = getenv("CUDA_LAUNCH_BLOCKING"); | ||||
|   static bool blocking_enabled = | ||||
|       (device_blocking_flag && atoi(device_blocking_flag)); | ||||
|   if (blocking_enabled) { | ||||
|     return "CUDA error: "; | ||||
|     return ""; | ||||
|   } else { | ||||
|     return "CUDA kernel errors might be " | ||||
|            "asynchronously reported at some other API call,so the " | ||||
|            "stacktrace below might be incorrect. For debugging " | ||||
|            "consider passing CUDA_LAUNCH_BLOCKING=1. CUDA error: "; | ||||
|     return "\nCUDA kernel errors might be asynchronously reported at some" | ||||
|            " other API call, so the stacktrace below might be incorrect." | ||||
|            "\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1."; | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -30,7 +30,7 @@ C10_CUDA_API void set_device(DeviceIndex device); | ||||
|  | ||||
| C10_CUDA_API void device_synchronize(); | ||||
|  | ||||
| C10_CUDA_API const char* get_cuda_check_prefix() noexcept; | ||||
| C10_CUDA_API const char* get_cuda_check_suffix() noexcept; | ||||
|  | ||||
| } // namespace cuda | ||||
| } // namespace c10 | ||||
|  | ||||
| @ -176,7 +176,7 @@ endif() | ||||
| if(BUILD_SPLIT_CUDA) | ||||
|   # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp | ||||
|   foreach(tmp ${Caffe2_GPU_SRCS}) | ||||
|     if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|CUDAHooks|detail/|TensorShapeCUDA).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") | ||||
|     if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") | ||||
|       # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files | ||||
|       list(APPEND Caffe2_GPU_SRCS_CU ${tmp}) | ||||
|     else() | ||||
| @ -738,6 +738,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) | ||||
|       ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/step_lr.cpp | ||||
|       ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp | ||||
|       ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp | ||||
|       ${TORCH_SRC_DIR}/csrc/utils/crash_handler.cpp | ||||
|     ) | ||||
|   endif() | ||||
|  | ||||
| @ -1020,9 +1021,10 @@ endif() | ||||
| if(LINUX) | ||||
|   find_library(BREAKPAD_LIB breakpad_client) | ||||
|   find_path(BREAKPAD_INCLUDE_DIR breakpad) | ||||
|   if(BREAKPAD_LIB_FOUND AND BREAKPAD_INCLUDE_DIR_FOUND) | ||||
|     target_link_libraries(torch_cpu PRIVATE ${BREAKPAD_LIB}) | ||||
|     add_compile_definitions(ADD_BREAKPAD_SIGNAL_HANDLER) | ||||
|   if(BREAKPAD_LIB AND BREAKPAD_INCLUDE_DIR) | ||||
|     message(STATUS "found breakpad library") | ||||
|     target_link_libraries(torch_cpu PUBLIC ${BREAKPAD_LIB}) | ||||
|     target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER) | ||||
|     target_include_directories(torch_cpu PUBLIC ${BREAKPAD_INCLUDE_DIR}/breakpad) | ||||
|   else() | ||||
|     message(STATUS "breakpad library not found") | ||||
| @ -1417,8 +1419,9 @@ target_link_libraries(torch PUBLIC torch_cpu_library) | ||||
| if(USE_CUDA) | ||||
|   target_link_libraries(torch PUBLIC torch_cuda_library) | ||||
|   if(BUILD_SPLIT_CUDA) | ||||
|     target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library) | ||||
|     # NS: Library order is important here to prevent cudnn double linking | ||||
|     target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library) | ||||
|     target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library) | ||||
|   endif() | ||||
| elseif(USE_ROCM) | ||||
|   target_link_libraries(torch PUBLIC torch_hip_library) | ||||
| @ -1465,6 +1468,10 @@ if(BUILD_SPLIT_CUDA) | ||||
|   target_link_libraries( | ||||
|       torch_cuda_cpp PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) | ||||
|   target_link_libraries(torch_cuda_cu PRIVATE torch_cuda_cpp) | ||||
|   if(USE_CUDNN) | ||||
|     target_link_libraries( | ||||
|         torch_cuda_cpp PRIVATE  caffe2::cudnn-private) | ||||
|   endif() | ||||
|  | ||||
|   # These public dependencies must go after the previous dependencies, as the | ||||
|   # order of the libraries in the linker call matters here when statically | ||||
| @ -1481,6 +1488,10 @@ elseif(USE_CUDA) | ||||
|       torch_cuda PRIVATE ${Caffe2_GPU_INCLUDE}) | ||||
|   target_link_libraries( | ||||
|       torch_cuda PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) | ||||
|   if(USE_CUDNN) | ||||
|     target_link_libraries( | ||||
|         torch_cuda PRIVATE  caffe2::cudnn-private) | ||||
|   endif() | ||||
|  | ||||
|   # These public dependencies must go after the previous dependencies, as the | ||||
|   # order of the libraries in the linker call matters here when statically | ||||
|  | ||||
| @ -65,9 +65,13 @@ constexpr uint64_t kProducedFileFormatVersion = 0x3L; | ||||
| //  0x1L: Initial version | ||||
| //  0x2L: (Comment missing) | ||||
| //  0x3L: (Comment missing) | ||||
| //  0x4L: (Comment missing) | ||||
| //  0x4L: (update) Added schema to function tuple. Forward-compatible change. | ||||
| constexpr uint64_t kProducedBytecodeVersion = 0x4L; | ||||
| //  0x5L: (update) The bytecode now shares constant tensor files with TorchScript and only serializes | ||||
| //  extra tensors that are not in the TorchScript constant table. Also updates the tensor storage schema | ||||
| //  to the unified format: the root key of a tensor's storage changes from {index} to | ||||
| //  {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage` | ||||
| //  Forward-compatible change. | ||||
| constexpr uint64_t kProducedBytecodeVersion = 0x5L; | ||||
|  | ||||
| static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, | ||||
|     "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); | ||||
|  | ||||
| @ -87,7 +87,6 @@ PThreadPool* pthreadpool() { | ||||
|       auto num_threads = leaked->get_thread_count(); | ||||
|       // NOLINTNEXTLINE(modernize-make-unique) | ||||
|       threadpool.reset(new PThreadPool(num_threads)); | ||||
|       TORCH_WARN("Leaking Caffe2 thread-pool after fork."); | ||||
|     } | ||||
|   } | ||||
|   return threadpool.get(); | ||||
|  | ||||
| @ -1165,7 +1165,7 @@ if(USE_CUDA) | ||||
|       caffe2_update_option(USE_NVRTC OFF) | ||||
|     endif() | ||||
|     if(CAFFE2_USE_CUDNN) | ||||
|       list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) | ||||
|       list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn-public) | ||||
|     else() | ||||
|       caffe2_update_option(USE_CUDNN OFF) | ||||
|     endif() | ||||
| @ -1216,8 +1216,8 @@ if(USE_ROCM) | ||||
|     list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) | ||||
|     list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) | ||||
|     list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) | ||||
|     list(APPEND HIP_CXX_FLAGS -DROCM_VERSION=${ROCM_VERSION_DEV_INT}) | ||||
|     list(APPEND HIP_CXX_FLAGS -std=c++14) | ||||
|     add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT}) | ||||
|  | ||||
|     if(CMAKE_BUILD_TYPE MATCHES Debug) | ||||
|        list(APPEND HIP_CXX_FLAGS -g2) | ||||
|  | ||||
| @ -67,7 +67,7 @@ MACRO(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list) | ||||
|       else ( APPLE ) | ||||
|         find_library(${_prefix}_${_library}_LIBRARY | ||||
|           NAMES ${_library} | ||||
|           PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /opt/OpenBLAS/lib /usr/lib/aarch64-linux-gnu | ||||
|           PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /opt/OpenBLAS/lib /usr/lib/aarch64-linux-gnu ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES} | ||||
|           ENV LD_LIBRARY_PATH ) | ||||
|       endif( APPLE ) | ||||
|       mark_as_advanced(${_prefix}_${_library}_LIBRARY) | ||||
| @ -178,6 +178,19 @@ if((NOT BLAS_LIBRARIES) | ||||
|   endif(BLAS_LIBRARIES) | ||||
| endif() | ||||
|  | ||||
| if((NOT BLAS_LIBRARIES) | ||||
|     AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) | ||||
|   check_fortran_libraries( | ||||
|   BLAS_LIBRARIES | ||||
|   BLAS | ||||
|   sgemm | ||||
|   "" | ||||
|   "openblas;pthread;m;gomp") | ||||
|   if(BLAS_LIBRARIES) | ||||
|     set(BLAS_INFO "open") | ||||
|   endif(BLAS_LIBRARIES) | ||||
| endif() | ||||
|  | ||||
| if((NOT BLAS_LIBRARIES) AND (WIN32) | ||||
|     AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) | ||||
|   check_fortran_libraries( | ||||
|  | ||||
| @ -128,13 +128,7 @@ if(BLAS_FOUND) | ||||
|       if(NOT LAPACK_CGESDD_WORKS) | ||||
|         find_library(GFORTRAN_LIBRARY | ||||
|           NAMES libgfortran.a gfortran | ||||
|           PATHS /usr/lib/gcc/aarch64-linux-gnu/9/ | ||||
|                 /usr/lib/gcc/x86_64-redhat-linux/9/ | ||||
|                 /usr/lib/gcc/aarch64-linux-gnu/8/ | ||||
|                 /usr/lib/gcc/x86_64-redhat-linux/8/ | ||||
|                 /usr/lib/gcc/aarch64-linux-gnu/7/ | ||||
|                 /usr/lib/gcc/x86_64-redhat-linux/7/ | ||||
|                 ) | ||||
|           PATHS ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}) | ||||
|        list(APPEND CMAKE_REQUIRED_LIBRARIES "${GFORTRAN_LIBRARY}") | ||||
|        unset(LAPACK_CGESDD_WORKS CACHE) | ||||
|        check_function_exists("cgesdd_" LAPACK_CGESDD_WORKS) | ||||
|  | ||||
| @ -90,8 +90,12 @@ function(caffe2_print_configuration_summary) | ||||
|     get_target_property(__tmp caffe2::curand IMPORTED_LOCATION) | ||||
|     message(STATUS "    curand library      : ${__tmp}") | ||||
|     if(${USE_CUDNN}) | ||||
|       get_target_property(__tmp caffe2::cudnn IMPORTED_LOCATION) | ||||
|       get_target_property(__tmp caffe2::cudnn-public INTERFACE_LINK_LIBRARIES) | ||||
|       message(STATUS "    cuDNN library       : ${__tmp}") | ||||
|       if(${CUDNN_STATIC}) | ||||
|         get_target_property(__tmp caffe2::cudnn-private INTERFACE_LINK_LIBRARIES) | ||||
|         message(STATUS "    cuDNN static library: ${__tmp}") | ||||
|       endif() | ||||
|     endif() | ||||
|     get_target_property(__tmp caffe2::nvrtc IMPORTED_LOCATION) | ||||
|     message(STATUS "    nvrtc               : ${__tmp}") | ||||
| @ -130,6 +134,7 @@ function(caffe2_print_configuration_summary) | ||||
|   message(STATUS "  USE_MKL               : ${CAFFE2_USE_MKL}") | ||||
|   message(STATUS "  USE_MKLDNN            : ${USE_MKLDNN}") | ||||
|   if(${CAFFE2_USE_MKLDNN}) | ||||
|     message(STATUS "  USE_MKLDNN_ACL        : ${USE_MKLDNN_ACL}") | ||||
|     message(STATUS "  USE_MKLDNN_CBLAS      : ${USE_MKLDNN_CBLAS}") | ||||
|   endif() | ||||
|   message(STATUS "  USE_NCCL              : ${USE_NCCL}") | ||||
|  | ||||
cmake/public/ComputeLibrary.cmake (new file, 34 lines)
							| @ -0,0 +1,34 @@ | ||||
| # Build with Compute Library backend for the Arm architecture | ||||
| # Note: Compute Library is available from: https://github.com/ARM-software/ComputeLibrary | ||||
| #   and must be built separately. The location of the Compute Library build | ||||
| #   must be set with the env var ACL_ROOT_DIR. This path will be checked later | ||||
| #   as part of FindACL.cmake in oneDNN. | ||||
|  | ||||
| if(NOT USE_MKLDNN_ACL) | ||||
|     RETURN() | ||||
| endif() | ||||
|  | ||||
| set(DNNL_AARCH64_USE_ACL ON CACHE BOOL "" FORCE) | ||||
|  | ||||
| # Check the Compute Library version number. | ||||
| # Note: oneDNN / MKL-DNN v2.2 onwards will check the Compute Library version | ||||
| #   the version check here can be removed once PyTorch transitions to v2.2. | ||||
| set(ACL_MINIMUM_VERSION "21.02") | ||||
|  | ||||
| file(GLOB_RECURSE ACL_VERSION_FILE $ENV{ACL_ROOT_DIR}/*/arm_compute_version.embed) | ||||
|  | ||||
| if("${ACL_VERSION_FILE}" STREQUAL "") | ||||
|   message(WARNING "Build may fail: Could not determine ACL version (minimum required is ${ACL_MINIMUM_VERSION})") | ||||
| else() | ||||
|   file(READ ${ACL_VERSION_FILE} ACL_VERSION_STRING) | ||||
|   string(REGEX MATCH "v([0-9]+\\.[0-9]+)" ACL_VERSION ${ACL_VERSION_STRING}) | ||||
|   set(ACL_VERSION "${CMAKE_MATCH_1}") | ||||
|  | ||||
|   if(${ACL_VERSION} VERSION_EQUAL "0.0") | ||||
|     # Unreleased ACL versions come with version string "v0.0-unreleased", and may not be compatible with oneDNN. | ||||
|     # It is recommended to use the latest release of ACL. | ||||
|     message(WARNING "Build may fail: Using unreleased ACL version (minimum required is ${ACL_MINIMUM_VERSION})") | ||||
|   elseif(${ACL_VERSION} VERSION_LESS ${ACL_MINIMUM_VERSION}) | ||||
|     message(FATAL_ERROR "Detected ACL version ${ACL_VERSION}, but minimum required is ${ACL_MINIMUM_VERSION}") | ||||
|   endif() | ||||
| endif() | ||||
| @ -272,20 +272,66 @@ else() | ||||
|       ${LIBNVTOOLSEXT}) | ||||
| endif() | ||||
|  | ||||
| # cudnn | ||||
| # static linking is handled by USE_STATIC_CUDNN environment variable | ||||
| if(CAFFE2_USE_CUDNN) | ||||
|   add_library(caffe2::cudnn UNKNOWN IMPORTED) | ||||
|   set_property( | ||||
|       TARGET caffe2::cudnn PROPERTY IMPORTED_LOCATION | ||||
|       ${CUDNN_LIBRARY_PATH}) | ||||
|   set_property( | ||||
|       TARGET caffe2::cudnn PROPERTY INTERFACE_INCLUDE_DIRECTORIES | ||||
|       ${CUDNN_INCLUDE_PATH}) | ||||
|   if(CUDNN_STATIC AND NOT WIN32) | ||||
| # cublas. CUDA_CUBLAS_LIBRARIES is actually a list, so we will make an | ||||
| # interface library similar to cudart. | ||||
| add_library(caffe2::cublas INTERFACE IMPORTED) | ||||
| if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32) | ||||
|     set_property( | ||||
|         TARGET caffe2::cudnn PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) | ||||
|         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a") | ||||
|     if(CUDA_VERSION VERSION_GREATER_EQUAL 10.1) | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a") | ||||
|       # Add explicit dependency to cudart_static to fix | ||||
|       # libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent' | ||||
|       # error adding symbols: DSO missing from command line | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_cudart_static_LIBRARY}" rt dl) | ||||
|     endif() | ||||
| else() | ||||
|     set_property( | ||||
|         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         ${CUDA_CUBLAS_LIBRARIES}) | ||||
| endif() | ||||
| set_property( | ||||
|     TARGET caffe2::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES | ||||
|     ${CUDA_INCLUDE_DIRS}) | ||||
|  | ||||
| # cudnn public and private interfaces | ||||
| # static linking is handled by USE_STATIC_CUDNN environment variable | ||||
| # If the library is linked dynamically, then the private interface is a no-op | ||||
| # If library is linked statically: | ||||
| #  - public interface would only reference headers | ||||
| #  - private interface will contain the actual link instructions | ||||
| if(CAFFE2_USE_CUDNN) | ||||
|   add_library(caffe2::cudnn-public INTERFACE IMPORTED) | ||||
|   set_property( | ||||
|     TARGET caffe2::cudnn-public PROPERTY INTERFACE_INCLUDE_DIRECTORIES | ||||
|     ${CUDNN_INCLUDE_PATH}) | ||||
|   add_library(caffe2::cudnn-private INTERFACE IMPORTED) | ||||
|   set_property( | ||||
|     TARGET caffe2::cudnn-private PROPERTY INTERFACE_INCLUDE_DIRECTORIES | ||||
|     ${CUDNN_INCLUDE_PATH}) | ||||
|   if(CUDNN_STATIC AND NOT WIN32) | ||||
|     if(USE_WHOLE_CUDNN) | ||||
|       set_property( | ||||
|         TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "-Wl,--whole-archive,\"${CUDNN_LIBRARY_PATH}\" -Wl,--no-whole-archive") | ||||
|     else() | ||||
|       set_property( | ||||
|         TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         ${CUDNN_LIBRARY_PATH}) | ||||
|     endif() | ||||
|     set_property( | ||||
|       TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|       "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) | ||||
|     # Add explicit dependency on cublas to cudnn | ||||
|     get_target_property(__tmp caffe2::cublas INTERFACE_LINK_LIBRARIES) | ||||
|     set_property( | ||||
|       TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|       "${__tmp}") | ||||
|     # Lines below use target_link_libraries because we support cmake 3.5+. | ||||
|     # For cmake 3.13+, target_link_options to set INTERFACE_LINK_OPTIONS would be better. | ||||
|     # https://cmake.org/cmake/help/v3.5/command/target_link_libraries.html warns | ||||
| @ -295,8 +341,12 @@ if(CAFFE2_USE_CUDNN) | ||||
|     #  link items that will not propagate to dependents." | ||||
|     # Propagating to a dependent (torch_cuda) is exactly what we want here, so we are | ||||
|     # flouting the warning, but I can't think of a better (3.5+ compatible) way. | ||||
|     target_link_libraries(caffe2::cudnn INTERFACE | ||||
|     target_link_libraries(caffe2::cudnn-private INTERFACE | ||||
|         "-Wl,--exclude-libs,libcudnn_static.a") | ||||
|   else() | ||||
|   set_property( | ||||
|     TARGET caffe2::cudnn-public PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|     ${CUDNN_LIBRARY_PATH}) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| @ -346,33 +396,6 @@ if(CAFFE2_USE_TENSORRT) | ||||
|       ${TENSORRT_INCLUDE_DIR}) | ||||
| endif() | ||||
|  | ||||
| # cublas. CUDA_CUBLAS_LIBRARIES is actually a list, so we will make an | ||||
| # interface library similar to cudart. | ||||
| add_library(caffe2::cublas INTERFACE IMPORTED) | ||||
| if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32) | ||||
|     set_property( | ||||
|         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a") | ||||
|     if(CUDA_VERSION VERSION_GREATER_EQUAL 10.1) | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a") | ||||
|       # Add explicit dependency to cudart_static to fix | ||||
|       # libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent' | ||||
|       # error adding symbols: DSO missing from command line | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_cudart_static_LIBRARY}" rt dl) | ||||
|     endif() | ||||
| else() | ||||
|     set_property( | ||||
|         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         ${CUDA_CUBLAS_LIBRARIES}) | ||||
| endif() | ||||
| set_property( | ||||
|     TARGET caffe2::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES | ||||
|     ${CUDA_INCLUDE_DIRS}) | ||||
|  | ||||
| # nvrtc | ||||
| add_library(caffe2::nvrtc UNKNOWN IMPORTED) | ||||
| set_property( | ||||
|  | ||||
| @ -1,5 +1,9 @@ | ||||
| set(MKLDNN_USE_NATIVE_ARCH ${USE_NATIVE_ARCH}) | ||||
|  | ||||
| if(CPU_AARCH64) | ||||
|   include(${CMAKE_CURRENT_LIST_DIR}/ComputeLibrary.cmake) | ||||
| endif() | ||||
|  | ||||
| find_package(MKLDNN QUIET) | ||||
|  | ||||
| if(NOT TARGET caffe2::mkldnn) | ||||
|  | ||||
| @ -30,8 +30,6 @@ Inside an ``InferenceMode`` block, we make the following performance guarantees: | ||||
| - Inplace operations on inference tensors are guaranteed not to do a version bump. | ||||
|  | ||||
| For more implementation details of ``InferenceMode`` please see the `RFC-0011-InferenceMode <https://github.com/pytorch/rfcs/pull/17>`_. | ||||
| Currently this guard is only available in C++ frontend, adding python frontend support | ||||
| is tracked in #56608. | ||||
|  | ||||
| Migration guide from ``AutoNonVariableTypeMode`` | ||||
| ------------------------------------------------ | ||||
|  | ||||
| @ -142,6 +142,7 @@ Ops that can autocast to ``float32`` | ||||
| ``exp``, | ||||
| ``expm1``, | ||||
| ``gelu``, | ||||
| ``grid_sample``, | ||||
| ``group_norm``, | ||||
| ``hinge_embedding_loss``, | ||||
| ``kl_div``, | ||||
|  | ||||
| @ -50,6 +50,10 @@ you can use it as ``functional.jacobian(lambda x: f(x, constant, flag=flag), inp | ||||
| Locally disabling gradient computation | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| See :ref:`locally-disable-grad-doc` for more information on the differences | ||||
| between no-grad and inference mode as well as other related mechanisms that | ||||
| may be confused with the two. | ||||
|  | ||||
| .. autosummary:: | ||||
|     :toctree: generated | ||||
|     :nosignatures: | ||||
|  | ||||
| @ -180,6 +180,8 @@ joined. | ||||
|  | ||||
| .. autofunction:: is_nccl_available | ||||
|  | ||||
| .. autofunction:: is_torchelastic_launched | ||||
|  | ||||
| -------------------------------------------------------------------------------- | ||||
|  | ||||
| Currently three initialization methods are supported: | ||||
|  | ||||
| @ -1,3 +1,5 @@ | ||||
| .. _elastic_errors-api: | ||||
|  | ||||
| Error Propagation | ||||
| ================== | ||||
|  | ||||
|  | ||||
| @ -1,9 +1,6 @@ | ||||
| .. _launcher-api: | ||||
|  | ||||
| Elastic Launch | ||||
| ============================ | ||||
|  | ||||
| torch.distributed.run | ||||
| ---------------------- | ||||
| torch.distributed.run (Elastic Launch) | ||||
| ====================================== | ||||
|  | ||||
| .. automodule:: torch.distributed.run | ||||
|  | ||||
| @ -1,3 +1,5 @@ | ||||
| .. _elastic_train_script: | ||||
|  | ||||
| Train script | ||||
| ------------- | ||||
|  | ||||
| @ -7,18 +9,20 @@ working with ``torch.distributed.run`` with these differences: | ||||
| 1. No need to manually pass ``RANK``, ``WORLD_SIZE``, | ||||
|    ``MASTER_ADDR``, and ``MASTER_PORT``. | ||||
|  | ||||
| 2. ``rdzv_backend`` and ``rdzv_endpoint`` must be provided. For most users | ||||
|    this will be set to ``c10d`` (see `rendezvous <rendezvous.html>`_). | ||||
| 2. ``rdzv_backend`` and ``rdzv_endpoint`` can be provided. For most users | ||||
|    this will be set to ``c10d`` (see `rendezvous <rendezvous.html>`_). The default | ||||
|    ``rdzv_backend`` creates a non-elastic rendezvous where ``rdzv_endpoint`` holds | ||||
|    the master address. | ||||
|  | ||||
| 3. Make sure you have a ``load_checkpoint(path)`` and | ||||
|    ``save_checkpoint(path)`` logic in your script. When workers fail | ||||
|    we restart all the workers with the same program arguments so you will | ||||
|    lose progress up to the most recent checkpoint | ||||
|    (see `elastic launch <distributed.html>`_). | ||||
|    ``save_checkpoint(path)`` logic in your script. When any number of | ||||
|    workers fail we restart all the workers with the same program | ||||
|    arguments so you will lose progress up to the most recent checkpoint | ||||
|    (see `elastic launch <run.html>`_). | ||||
|  | ||||
| 4. ``use_env`` flag has been removed. If you were reading the local rank by parsing | ||||
|    the ``--local_rank`` option, you need to get the local rank from the | ||||
|    environment variable ``LOCAL_RANK`` (e.g. ``os.environ["LOCAL_RANK"]``). | ||||
|    environment variable ``LOCAL_RANK`` (e.g. ``int(os.environ["LOCAL_RANK"])``). | ||||
|  | ||||
| Below is an expository example of a training script that checkpoints on each | ||||
| epoch, hence the worst-case progress lost on failure is one full epoch worth | ||||
| @ -31,7 +35,7 @@ of training. | ||||
|        state = load_checkpoint(args.checkpoint_path) | ||||
|        initialize(state) | ||||
|  | ||||
|        # torch.distributed.run ensure that this will work | ||||
|        # torch.distributed.run ensures that this will work | ||||
|        # by exporting all the env vars needed to initialize the process group | ||||
|        torch.distributed.init_process_group(backend=args.backend) | ||||
|  | ||||
|  | ||||
| @ -89,6 +89,7 @@ Non-linear activation functions | ||||
|     sigmoid | ||||
|     hardsigmoid | ||||
|     silu | ||||
|     mish | ||||
|     batch_norm | ||||
|     group_norm | ||||
|     instance_norm | ||||
|  | ||||
| @ -22,6 +22,7 @@ These are the basic building blocks for graphs: | ||||
|  | ||||
|     ~parameter.Parameter | ||||
|     ~parameter.UninitializedParameter | ||||
|     ~parameter.UninitializedBuffer | ||||
|  | ||||
| Containers | ||||
| ---------------------------------- | ||||
| @ -145,6 +146,7 @@ Non-linear Activations (weighted sum, nonlinearity) | ||||
|     nn.GELU | ||||
|     nn.Sigmoid | ||||
|     nn.SiLU | ||||
|     nn.Mish | ||||
|     nn.Softplus | ||||
|     nn.Softshrink | ||||
|     nn.Softsign | ||||
|  | ||||
| @ -8,56 +8,6 @@ operations. It's not strictly necessary to understand all this, but we recommend | ||||
| getting familiar with it, as it will help you write more efficient, cleaner | ||||
| programs, and can aid you in debugging. | ||||
|  | ||||
| .. _excluding-subgraphs: | ||||
|  | ||||
| Excluding subgraphs from backward | ||||
| --------------------------------- | ||||
|  | ||||
| Every Tensor has a flag: :attr:`requires_grad` that allows for fine grained | ||||
| exclusion of subgraphs from gradient computation and can increase efficiency. | ||||
|  | ||||
| .. _excluding-requires_grad: | ||||
|  | ||||
| ``requires_grad`` | ||||
| ^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| If there's a single input to an operation that requires gradient, its output | ||||
| will also require gradient. Conversely, only if all inputs don't require | ||||
| gradient, the output also won't require it. Backward computation is never | ||||
| performed in the subgraphs, where all Tensors didn't require gradients. | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     >>> x = torch.randn(5, 5)  # requires_grad=False by default | ||||
|     >>> y = torch.randn(5, 5)  # requires_grad=False by default | ||||
|     >>> z = torch.randn((5, 5), requires_grad=True) | ||||
|     >>> a = x + y | ||||
|     >>> a.requires_grad | ||||
|     False | ||||
|     >>> b = a + z | ||||
|     >>> b.requires_grad | ||||
|     True | ||||
|  | ||||
| This is especially useful when you want to freeze part of your model, or you | ||||
| know in advance that you're not going to use gradients w.r.t. some parameters. | ||||
| For example if you want to finetune a pretrained CNN, it's enough to switch the | ||||
| :attr:`requires_grad` flags in the frozen base, and no intermediate buffers will | ||||
| be saved, until the computation gets to the last layer, where the affine | ||||
| transform will use weights that require gradient, and the output of the network | ||||
| will also require them. | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     model = torchvision.models.resnet18(pretrained=True) | ||||
|     for param in model.parameters(): | ||||
|         param.requires_grad = False | ||||
|     # Replace the last fully-connected layer | ||||
|     # Parameters of newly constructed modules have requires_grad=True by default | ||||
|     model.fc = nn.Linear(512, 100) | ||||
|  | ||||
|     # Optimize only the classifier | ||||
|     optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9) | ||||
|  | ||||
| .. _how-autograd-encodes-history: | ||||
|  | ||||
| How autograd encodes the history | ||||
| @ -86,6 +36,157 @@ flow statements, that can change the overall shape and size of the graph at | ||||
| every iteration. You don't have to encode all possible paths before you | ||||
| launch the training - what you run is what you differentiate. | ||||
|  | ||||
| .. _locally-disable-grad-doc: | ||||
|  | ||||
| Locally disabling gradient computation | ||||
| -------------------------------------- | ||||
|  | ||||
| There are several mechanisms available from Python to locally disable gradient | ||||
| computation: | ||||
|  | ||||
| To disable gradients across entire blocks of code, there are context managers | ||||
| like no-grad mode and inference mode. | ||||
| For more fine-grained exclusion of subgraphs from gradient computation, | ||||
| there is setting the ``requires_grad`` field of a tensor. | ||||
|  | ||||
| Below, in addition to discussing the mechanisms above, we also describe | ||||
| evaluation mode (:meth:`nn.Module.eval()`), a method that is not actually used | ||||
| to disable gradient computation but, because of its name, is often mixed up with the three. | ||||
|  | ||||
| Setting ``requires_grad`` | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| :attr:`requires_grad` is a flag that allows for fine-grained exclusion of | ||||
| subgraphs from gradient computation. It takes effect in both the forward | ||||
| and backward passes: | ||||
|  | ||||
| During the forward pass, an operation is only recorded in the backward graph if | ||||
| at least one of its input tensors requires grad. | ||||
| During the backward pass (``.backward()``), only leaf tensors with | ||||
| ``requires_grad=True`` will have gradients accumulated into their ``.grad`` | ||||
| fields. | ||||
|  | ||||
| It is important to note that even though every tensor has this flag, | ||||
| *setting* it only makes sense for leaf tensors (tensors that do not have a | ||||
| ``grad_fn``, e.g., a ``nn.Module``'s parameters). | ||||
| Non-leaf tensors (tensors that do have ``grad_fn``) are tensors that have a | ||||
| backward graph associated with them. Thus their gradients will be needed | ||||
| as an intermediary result to compute the gradient for a leaf tensor that | ||||
| requires grad. From this definition, it is clear that all non-leaf tensors | ||||
| will automatically have ``requires_grad=True``. | ||||
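|  | ||||
| For example (a minimal sketch), a non-leaf tensor produced from a leaf tensor that | ||||
| requires grad automatically requires grad as well: | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     >>> w = torch.randn(3, requires_grad=True)  # leaf tensor: no grad_fn | ||||
|     >>> y = w * 2                                # non-leaf tensor: has a grad_fn | ||||
|     >>> y.grad_fn is not None | ||||
|     True | ||||
|     >>> y.requires_grad | ||||
|     True | ||||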
|  | ||||
| Setting ``requires_grad`` should be the main way you control which parts | ||||
| of the model are part of the gradient computation, for example, if you need to | ||||
| freeze parts of your pretrained model during model fine-tuning. | ||||
|  | ||||
| To freeze parts of your model, simply apply ``.requires_grad_(False)`` to | ||||
| the parameters that you don't want updated. And as described above, | ||||
| since computations that use these parameters as inputs would not be recorded in | ||||
| the forward pass, they won't have their ``.grad`` fields updated in the backward | ||||
| pass because they won't be part of the backward graph in the first place, as | ||||
| desired. | ||||
|  | ||||
| Because this is such a common pattern, ``requires_grad`` can also be set at | ||||
| the module level with :meth:`nn.Module.requires_grad_()`. | ||||
| When applied to a module, ``.requires_grad_()`` takes effect on all | ||||
| of the module's parameters (which have ``requires_grad=True`` by default). | ||||
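|  | ||||
| For instance, a minimal sketch of the fine-tuning pattern described above (the | ||||
| torchvision ``resnet18`` model and the SGD optimizer are purely illustrative choices, | ||||
| and ``torchvision``, ``nn`` and ``optim`` are assumed to be imported): | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     model = torchvision.models.resnet18(pretrained=True) | ||||
|     # Freeze the pretrained base so it is excluded from the backward graph | ||||
|     model.requires_grad_(False) | ||||
|     # Replace the classifier; parameters of newly constructed modules require grad by default | ||||
|     model.fc = nn.Linear(512, 100) | ||||
|     # Only the classifier's parameters will be updated | ||||
|     optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9) | ||||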
|  | ||||
| Grad Modes | ||||
| ^^^^^^^^^^ | ||||
|  | ||||
| Apart from setting ``requires_grad``, there are also three modes that can be | ||||
| selected from Python and that affect how computations in PyTorch are | ||||
| processed by autograd internally: default mode (grad mode), no-grad mode, | ||||
| and inference mode, all of which can be toggled via context managers and | ||||
| decorators. | ||||
|  | ||||
| Default Mode (Grad Mode) | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| The "default mode" is actually the mode we are implicitly in when no other modes like | ||||
| no-grad and inference mode are enabled. In contrast with | ||||
| "no-grad mode", the default mode is also sometimes called "grad mode". | ||||
|  | ||||
| The most important thing to know about the default mode is that it is the only | ||||
| mode in which ``requires_grad`` takes effect. ``requires_grad`` is always overridden | ||||
| to be ``False`` in the other two modes. | ||||
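|  | ||||
| For example, a small sketch showing that ``requires_grad`` is honored in default mode | ||||
| but overridden in no-grad mode: | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     >>> x = torch.ones(2, requires_grad=True) | ||||
|     >>> (x * 2).requires_grad       # default (grad) mode | ||||
|     True | ||||
|     >>> with torch.no_grad(): | ||||
|     ...     print((x * 2).requires_grad)  # overridden to False | ||||
|     False | ||||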
|  | ||||
| No-grad Mode | ||||
| ^^^^^^^^^^^^ | ||||
|  | ||||
| Computations in no-grad mode behave as if none of the inputs require grad. | ||||
| In other words, computations in no-grad mode are never recorded in the backward graph | ||||
| even if there are inputs that have ``requires_grad=True``. | ||||
|  | ||||
| Enable no-grad mode when you need to perform operations that should not be | ||||
| recorded by autograd, but you’d still like to use the outputs of these | ||||
| computations in grad mode later. This context manager makes it convenient to | ||||
| disable gradients for a block of code or function without | ||||
| having to temporarily set tensors to have ``requires_grad=False``, and then | ||||
| back to ``True``. | ||||
|  | ||||
| For example, no-grad mode might be useful when writing an optimizer: when | ||||
| performing the training update you’d like to update parameters | ||||
| in-place without the update being recorded by autograd. | ||||
| You also intend to use the updated parameters for computations in | ||||
| grad mode in the next forward pass. | ||||
|  | ||||
| The implementations in :ref:`nn-init-doc` also | ||||
| rely on no-grad mode when initializing the parameters so as to avoid | ||||
| autograd tracking when updating the initialized parameters in-place. | ||||
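|  | ||||
| As a rough illustration of the optimizer-style update mentioned above (the parameter, | ||||
| gradient, and learning rate below are placeholders): | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     p = torch.randn(3, requires_grad=True)   # a parameter | ||||
|     g = torch.randn(3)                        # stand-in for a gradient from .backward() | ||||
|     with torch.no_grad(): | ||||
|         # the in-place update is not recorded by autograd, | ||||
|         # and p can still be used in grad mode afterwards | ||||
|         p -= 0.1 * g | ||||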
|  | ||||
| Inference Mode | ||||
| ^^^^^^^^^^^^^^ | ||||
|  | ||||
| Inference mode is the extreme version of no-grad mode. Just like in no-grad | ||||
| mode, computations in inference mode are not recorded in the backward graph, but | ||||
| enabling inference mode will allow PyTorch to speed up your model even more. | ||||
| This better runtime comes with a drawback: tensors created in inference mode | ||||
| cannot be used in computations recorded by autograd after | ||||
| exiting inference mode. | ||||
|  | ||||
| Enable inference mode when you are performing computations that don’t need | ||||
| to be recorded in the backward graph, AND you don’t plan on using the tensors | ||||
| created in inference mode in any computation that is to be recorded by autograd later. | ||||
|  | ||||
| It is recommended that you try out inference mode in the parts of your code | ||||
| that do not require autograd tracking (e.g., data processing and model evaluation). | ||||
| If it works out of the box | ||||
| for your use case it’s a free performance win. If you run into errors after | ||||
| enabling inference mode, check that you are not using tensors created in | ||||
| inference mode in computations that are recorded by autograd after exiting inference | ||||
| mode. If you cannot avoid such use in your case, you can always switch back | ||||
| to no-grad mode. | ||||
|  | ||||
| For details on inference mode please see | ||||
| `Inference Mode <https://pytorch.org/cppdocs/notes/inference_mode.html>`_. | ||||
|  | ||||
| For implementation details of inference mode see | ||||
| `RFC-0011-InferenceMode <https://github.com/pytorch/rfcs/pull/17>`_. | ||||
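|  | ||||
| A minimal sketch of using inference mode as a context manager: | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     import torch | ||||
|  | ||||
|     x = torch.ones(2, 2) | ||||
|     with torch.inference_mode(): | ||||
|         y = x * 2          # not recorded in the backward graph | ||||
|     # y is an inference tensor; for example, y.requires_grad_(True) | ||||
|     # would raise an error after exiting inference mode | ||||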
|  | ||||
| Evaluation Mode (``nn.Module.eval()``) | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| Evaluation mode is not actually a mechanism to locally disable gradient computation. | ||||
| It is included here anyway because it is sometimes mistaken for such a mechanism. | ||||
|  | ||||
| Functionally, ``module.eval()`` (or equivalently ``module.train()``) is completely | ||||
| orthogonal to no-grad mode and inference mode. How ``model.eval()`` affects | ||||
| your model depends entirely on the specific modules used in your model and | ||||
| whether they define any training-mode specific behavior. | ||||
|  | ||||
| You are responsible for calling ``model.eval()`` and ``model.train()`` if your | ||||
| model relies on modules such as :class:`torch.nn.Dropout` and | ||||
| :class:`torch.nn.BatchNorm2d` that may behave | ||||
| differently depending on training mode, for example, to avoid updating your | ||||
| BatchNorm running statistics on validation data. | ||||
|  | ||||
| It is recommended that you always use ``model.train()`` when | ||||
| training and ``model.eval()`` when evaluating your model (validation/testing) even | ||||
| if you aren’t sure your model has training-mode specific behavior, because a | ||||
| module you are using might be updated to behave differently in training and | ||||
| eval modes. | ||||
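|  | ||||
| For example, a small sketch (using ``nn.Dropout`` and ``nn.BatchNorm1d`` purely as | ||||
| illustrations of training-mode specific behavior): | ||||
|  | ||||
| .. code:: | ||||
|  | ||||
|     import torch.nn as nn | ||||
|  | ||||
|     model = nn.Sequential(nn.Linear(10, 10), nn.Dropout(p=0.5), nn.BatchNorm1d(10)) | ||||
|     model.train()  # dropout active; BatchNorm updates its running statistics | ||||
|     model.eval()   # dropout disabled; BatchNorm uses its stored running statistics | ||||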
|  | ||||
| In-place operations with autograd | ||||
| --------------------------------- | ||||
|  | ||||
|  | ||||
| @ -2,10 +2,787 @@ | ||||
|  | ||||
| torch.package | ||||
| ============= | ||||
| ``torch.package`` adds support for creating hermetic packages containing arbitrary | ||||
| PyTorch code. These packages can be saved, shared, used to load and execute models | ||||
| at a later date or on a different machine, and can even be deployed to production using | ||||
| ``torch::deploy``. | ||||
|  | ||||
| This document contains tutorials, how-to guides, explanations, and an API reference that | ||||
| will help you learn more about ``torch.package`` and how to use it. | ||||
|  | ||||
|  | ||||
| .. warning:: | ||||
|  | ||||
|     This module is experimental and has not yet been publicly released. | ||||
|     This module depends on the ``pickle`` module, which is not secure. Only unpackage data you trust. | ||||
|  | ||||
|     It is possible to construct malicious pickle data which will **execute arbitrary code during unpickling**. | ||||
|     Never unpackage data that could have come from an untrusted source, or that could have been tampered with. | ||||
|  | ||||
|     For more information, review the `documentation <https://docs.python.org/3/library/pickle.html>`_ for the ``pickle`` module. | ||||
|  | ||||
|  | ||||
| .. contents:: :local: | ||||
|     :depth: 2 | ||||
|  | ||||
|  | ||||
| Tutorials | ||||
| --------- | ||||
| Packaging your first model | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| A tutorial that guides you through packaging and unpackaging a simple model is available | ||||
| `on Colab <https://colab.research.google.com/drive/1lFZkLyViGfXxB-m3jqlyTQuYToo3XLo->`_. | ||||
| After completing this exercise, you will be familiar with the basic API for creating and using | ||||
| Torch packages. | ||||
|  | ||||
| How do I... | ||||
| ----------- | ||||
| See what is inside a package? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| Treat the package like a ZIP archive | ||||
| """""""""""""""""""""""""""""""""""" | ||||
| The container format for a ``torch.package`` is ZIP, so any tools that work with standard ZIP files should | ||||
| work for exploring the contents. Some common ways to interact with ZIP files: | ||||
|  | ||||
| * ``unzip my_package.pt`` will unzip the ``torch.package`` archive to disk, where you can freely inspect its contents. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     $ unzip my_package.pt && tree my_package | ||||
|     my_package | ||||
|     ├── .data | ||||
|     │   ├── 94304870911616.storage | ||||
|     │   ├── 94304900784016.storage | ||||
|     │   ├── extern_modules | ||||
|     │   └── version | ||||
|     ├── models | ||||
|     │   └── model_1.pkl | ||||
|     └── torchvision | ||||
|         └── models | ||||
|             ├── resnet.py | ||||
|             └── utils.py | ||||
|     ~ cd my_package && cat torchvision/models/resnet.py | ||||
|     ... | ||||
|  | ||||
|  | ||||
| * The Python ``zipfile`` module provides a standard way to read and write ZIP archive contents. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     from zipfile import ZipFile | ||||
|     with ZipFile("my_package.pt") as myzip: | ||||
|         file_bytes = myzip.read("torchvision/models/resnet.py") | ||||
|         # edit file_bytes in some way | ||||
|         myzip.writestr("torchvision/models/resnet.py", new_file_bytes) | ||||
|  | ||||
|  | ||||
| * vim has the ability to natively read ZIP archives. You can even edit files and ``:write`` them back into the archive! | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # add this to your .vimrc to treat `*.pt` files as zip files | ||||
|     au BufReadCmd *.pt call zip#Browse(expand("<amatch>")) | ||||
|  | ||||
|     ~ vi my_package.pt | ||||
|  | ||||
|  | ||||
| Use the ``file_structure()`` API | ||||
| """""""""""""""""""""""""""""""" | ||||
| :class:`PackageImporter` and :class:`PackageExporter` provide a ``file_structure()`` method, which will return a printable | ||||
| and queryable ``Folder`` object. The ``Folder`` object is a simple directory structure that you can use to explore the | ||||
| current contents of a ``torch.package``. | ||||
|  | ||||
| The ``Folder`` object itself is directly printable and will print out a file tree representation. To filter what is returned, | ||||
| use the glob-style ``include`` and ``exclude`` filtering arguments. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     with PackageExporter('my_package.pt', verbose=False) as pe: | ||||
|         pe.save_pickle('models', 'model_1.pkl', mod) | ||||
|         # can limit printed items with include/exclude args | ||||
|         print(pe.file_structure(include=["**/utils.py", "**/*.pkl"], exclude="**/*.storages")) | ||||
|  | ||||
|     importer = PackageImporter('my_package.pt') | ||||
|     print(importer.file_structure()) # will print out all files | ||||
|  | ||||
|  | ||||
| Output: | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # filtered with glob pattern: | ||||
|     #    include=["**/utils.py", "**/*.pkl"], exclude="**/*.storages" | ||||
|     ─── my_package.pt | ||||
|         ├── models | ||||
|         │   └── model_1.pkl | ||||
|         └── torchvision | ||||
|             └── models | ||||
|                 └── utils.py | ||||
|  | ||||
|     # all files | ||||
|     ─── my_package.pt | ||||
|         ├── .data | ||||
|         │   ├── 94304870911616.storage | ||||
|         │   ├── 94304900784016.storage | ||||
|         │   ├── extern_modules | ||||
|         │   └── version | ||||
|         ├── models | ||||
|         │   └── model_1.pkl | ||||
|         └── torchvision | ||||
|             └── models | ||||
|                 ├── resnet.py | ||||
|                 └── utils.py | ||||
|  | ||||
|  | ||||
| You can also query ``Folder`` objects with the ``has_file()`` method. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     exporter_file_structure = exporter.file_structure() | ||||
|     found: bool = exporter_file_structure.has_file("package_a/subpackage.py") | ||||
|  | ||||
|  | ||||
| Include arbitrary resources with my package and access them later? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| :class:`PackageExporter` exposes three methods, ``save_pickle``, ``save_text`` and ``save_binary`` that allow you to save | ||||
| Python objects, text, and binary data to a package. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     with torch.PackageExporter("package.pt") as exporter: | ||||
|         # Pickles the object and saves to `my_resources/tens.pkl` in the archive. | ||||
|         exporter.save_pickle("my_resources", "tensor.pkl", torch.randn(4)) | ||||
|         exporter.save_text("config_stuff", "words.txt", "a sample string") | ||||
|         exporter.save_binary("raw_data", "binary", my_bytes) | ||||
|  | ||||
|  | ||||
| :class:`PackageImporter` exposes complementary methods named ``load_pickle``, ``load_text`` and ``load_binary`` that allow you to load | ||||
| Python objects, text and binary data from a package. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     importer = torch.PackageImporter("package.pt") | ||||
|     my_tensor = importer.load_pickle("my_resources", "tensor.pkl") | ||||
|     text = importer.load_text("config_stuff", "words.txt") | ||||
|     binary = importer.load_binary("raw_data", "binary") | ||||
|  | ||||
|  | ||||
| Customize how a class is packaged? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| ``torch.package`` allows for the customization of how classes are packaged. This behavior is accessed through defining the method | ||||
| ``__reduce_package__`` on a class and by defining a corresponding de-packaging function. This is similar to defining ``__reduce__`` for | ||||
| Python’s normal pickling process. | ||||
|  | ||||
| Steps: | ||||
|  | ||||
| 1. Define the method ``__reduce_package__(self, exporter: PackageExporter)`` on the target class. This method should do the work to save the class instance inside of the package, and should return a tuple of the corresponding de-packaging function with the arguments needed to invoke the de-packaging function. This method is called by the ``PackageExporter`` when it encounters an instance of the target class. | ||||
| 2. Define a de-packaging function for the class. This de-packaging function should do the work to reconstruct and return an instance of the class. The function signature’s first parameter should be a ``PackageImporter`` instance, and the rest of the parameters are user defined. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # foo.py [Example of customizing how class Foo is packaged] | ||||
|     from torch.package import PackageExporter, PackageImporter | ||||
|     import time | ||||
|  | ||||
|  | ||||
|     class Foo: | ||||
|         def __init__(self, my_string: str): | ||||
|             super().__init__() | ||||
|             self.my_string = my_string | ||||
|             self.time_imported = 0 | ||||
|             self.time_exported = 0 | ||||
|  | ||||
|         def __reduce_package__(self, exporter: PackageExporter): | ||||
|             """ | ||||
|             Called by ``torch.package.PackageExporter``'s Pickler's ``persistent_id`` when | ||||
|             saving an instance of this object. This method should do the work to save this | ||||
|             object inside of the ``torch.package`` archive. | ||||
|  | ||||
|             Returns function w/ arguments to load the object from a | ||||
|             ``torch.package.PackageImporter``'s Pickler's ``persistent_load`` function. | ||||
|             """ | ||||
|  | ||||
|             # use this pattern to ensure no naming conflicts with normal dependencies, | ||||
|             # anything saved under this module name shouldn't conflict with other | ||||
|             # items in the package | ||||
|             generated_module_name = f"foo-generated._{exporter.get_unique_id()}" | ||||
|             exporter.save_text( | ||||
|                 generated_module_name, | ||||
|                 "foo.txt", | ||||
|                 self.my_string + ", with exporter modification!", | ||||
|             ) | ||||
|             time_exported = time.clock_gettime(1) | ||||
|  | ||||
|             # returns de-packaging function w/ arguments to invoke with | ||||
|             return (unpackage_foo, (generated_module_name, time_exported,)) | ||||
|  | ||||
|  | ||||
|     def unpackage_foo( | ||||
|         importer: PackageImporter, generated_module_name: str, time_exported: float | ||||
|     ) -> Foo: | ||||
|         """ | ||||
|         Called by ``torch.package.PackageImporter``'s Pickler's ``persistent_load`` function | ||||
|         when depickling a Foo object. | ||||
|         Performs work of loading and returning a Foo instance from a ``torch.package`` archive. | ||||
|         """ | ||||
|         time_imported = time.clock_gettime(1) | ||||
|         foo = Foo(importer.load_text(generated_module_name, "foo.txt")) | ||||
|         foo.time_imported = time_imported | ||||
|         foo.time_exported = time_exported | ||||
|         return foo | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # example of saving instances of class Foo | ||||
|  | ||||
|     import torch | ||||
|     from torch.package import PackageImporter, PackageExporter | ||||
|     import foo | ||||
|  | ||||
|     foo_1 = foo.Foo("foo_1 initial string") | ||||
|     foo_2 = foo.Foo("foo_2 initial string") | ||||
|     with PackageExporter('foo_package.pt', verbose=False) as pe: | ||||
|         # save as normal, no extra work necessary | ||||
|         pe.save_pickle('foo_collection', 'foo1.pkl', foo_1) | ||||
|         pe.save_pickle('foo_collection', 'foo2.pkl', foo_2) | ||||
|         print(pe.file_structure()) | ||||
|  | ||||
|     pi = PackageImporter('foo_package.pt') | ||||
|     imported_foo = pi.load_pickle('foo_collection', 'foo1.pkl') | ||||
|     print(f"foo_1 string: '{imported_foo.my_string}'") | ||||
|     print(f"foo_1 export time: {imported_foo.time_exported}") | ||||
|     print(f"foo_1 import time: {imported_foo.time_imported}") | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # output of running above script | ||||
|     ─── foo_package | ||||
|         ├── foo-generated | ||||
|         │   ├── _0 | ||||
|         │   │   └── foo.txt | ||||
|         │   └── _1 | ||||
|         │       └── foo.txt | ||||
|         ├── foo_collection | ||||
|         │   ├── foo1.pkl | ||||
|         │   └── foo2.pkl | ||||
|         └── foo.py | ||||
|  | ||||
|     foo_1 string: 'foo_1 initial string, with exporter modification!' | ||||
|     foo_1 export time: 9857706.650140837 | ||||
|     foo_1 import time: 9857706.652698385 | ||||
|  | ||||
|  | ||||
| Test in my source code whether or not it is executing inside a package? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| A :class:`PackageImporter` will add the attribute ``__torch_package__`` to every module that it initializes. Your code can check for the | ||||
| presence of this attribute to determine whether it is executing in a packaged context or not. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # In foo/bar.py: | ||||
|  | ||||
|     if "__torch_package__" in dir():  # true if the code is being loaded from a package | ||||
|         def is_in_package(): | ||||
|             return True | ||||
|  | ||||
|         UserException = Exception | ||||
|     else: | ||||
|         def is_in_package(): | ||||
|             return False | ||||
|  | ||||
|         UserException = UnpackageableException | ||||
|  | ||||
|  | ||||
| Now, the code will behave differently depending on whether it’s imported normally through your Python environment or imported from a | ||||
| ``torch.package``. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     from foo.bar import is_in_package | ||||
|  | ||||
|     print(is_in_package())  # False | ||||
|  | ||||
|     loaded_module = PackageImporter(my_package).import_module("foo.bar") | ||||
|     loaded_module.is_in_package()  # True | ||||
|  | ||||
|  | ||||
| **Warning**: in general, it’s bad practice to have code that behaves differently depending on whether it’s packaged or not. This can lead to | ||||
| hard-to-debug issues that are sensitive to how you imported your code. If your package is intended to be heavily used, consider restructuring | ||||
| your code so that it behaves the same way no matter how it was loaded. | ||||
|  | ||||
|  | ||||
| Patch code into a package? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| :class:`PackageExporter` offers a ``save_source_string()`` method that allows one to save arbitrary Python source code to a module of your choosing. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     with PackageExporter(f) as exporter: | ||||
|         # Save the my_module.foo available in your current Python environment. | ||||
|         exporter.save_module("my_module.foo") | ||||
|  | ||||
|         # This saves the provided string to my_module/foo.py in the package archive. | ||||
|         # It will override the my_module.foo that was previously saved. | ||||
|         exporter.save_source_string("my_module.foo", textwrap.dedent( | ||||
|             """\ | ||||
|             def my_function(): | ||||
|                 print('hello world') | ||||
|             """ | ||||
|         )) | ||||
|  | ||||
|         # If you want to treat my_module.bar as a package | ||||
|         # (e.g. save to `my_module/bar/__init__.py` instead of `my_module/bar.py`) | ||||
|         # pass is_package=True, | ||||
|         exporter.save_source_string("my_module.bar", | ||||
|                                     "def foo(): print('hello')\n", | ||||
|                                     is_package=True) | ||||
|  | ||||
|     importer = PackageImporter(f) | ||||
|     importer.import_module("my_module.foo").my_function()  # prints 'hello world' | ||||
|  | ||||
|  | ||||
| Access package contents from packaged code? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| :class:`PackageImporter` implements the | ||||
| `importlib.resources <https://docs.python.org/3/library/importlib.html#module-importlib.resources>`_ | ||||
| API for accessing resources from inside a package. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     with PackageExporter(f) as exporter: | ||||
|         # saves text to one/a.txt in the archive | ||||
|         exporter.save_text("my_resource", "a.txt", "hello world!") | ||||
|         # saves the tensor to my_pickle/obj.pkl | ||||
|         exporter.save_pickle("my_pickle", "obj.pkl", torch.ones(2, 2)) | ||||
|  | ||||
|         # see below for module contents | ||||
|         exporter.save_module("foo") | ||||
|         exporter.save_module("bar") | ||||
|  | ||||
|  | ||||
| The ``importlib.resources`` API allows access to resources from within packaged code. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # foo.py: | ||||
|     import importlib.resources | ||||
|     import my_resource | ||||
|  | ||||
|     # returns "hello world!" | ||||
|     def get_my_resource(): | ||||
|         return importlib.resources.read_text(my_resource, "a.txt") | ||||
|  | ||||
|  | ||||
| Using ``importlib.resources`` is the recommended way to access package contents from within packaged code, since it complies | ||||
| with the Python standard. However, it is also possible to access the parent :class:`PackageImporter` instance itself from within | ||||
| packaged code. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # bar.py: | ||||
|     import torch_package_importer # this is the PackageImporter that imported this module. | ||||
|  | ||||
|     # Prints "hello world!", equivalient to importlib.resources.read_text | ||||
|     def get_my_resource(): | ||||
|         return torch_package_importer.load_text("my_resource", "a.txt") | ||||
|  | ||||
    # You can also do things that the importlib.resources API does not support, like loading
|     # a pickled object from the package. | ||||
|     def get_my_pickle(): | ||||
|         return torch_package_importer.load_pickle("my_pickle", "obj.pkl") | ||||
|  | ||||
|  | ||||
| Distinguish between packaged code and non-packaged code? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| To tell if an object’s code is from a ``torch.package``, use the ``torch.package.is_from_package()`` function. | ||||
| Note: if an object is from a package but its definition is from a module marked ``extern`` or from ``stdlib``, | ||||
| this check will return ``False``. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     importer = PackageImporter(f) | ||||
|     mod = importer.import_module('foo') | ||||
|     obj = importer.load_pickle('model', 'model.pkl') | ||||
|     txt = importer.load_text('text', 'my_test.txt') | ||||
|  | ||||
|     assert is_from_package(mod) | ||||
|     assert is_from_package(obj) | ||||
|     assert not is_from_package(txt) # str is from stdlib, so this will return False | ||||
|  | ||||
|  | ||||
| Re-export an imported object? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| To re-export an object that was previously imported by a :class:`PackageImporter`, you must make the new :class:`PackageExporter` | ||||
| aware of the original :class:`PackageImporter` so that it can find source code for your object’s dependencies. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     importer = PackageImporter(f) | ||||
|     obj = importer.load_pickle("model", "model.pkl") | ||||
|  | ||||
|     # re-export obj in a new package | ||||
|     with PackageExporter(f2, importer=(importer, sys_importer)) as exporter: | ||||
|         exporter.save_pickle("model", "model.pkl", obj) | ||||
|  | ||||
|  | ||||
| Package a TorchScript module? | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| To package a TorchScript model, use the same ``save_pickle`` and ``load_pickle`` APIs as you would with any other object. | ||||
| Saving TorchScript objects that are attributes or submodules is supported as well with no extra work. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     # save TorchScript just like any other object | ||||
|     with PackageExporter(file_name, verbose=True) as e: | ||||
|         e.save_pickle("res", "script_model.pkl", scripted_model) | ||||
|         e.save_pickle("res", "mixed_model.pkl", python_model_with_scripted_submodule) | ||||
|     # load as normal | ||||
|     importer = PackageImporter(file_name) | ||||
|     loaded_script = importer.load_pickle("res", "script_model.pkl") | ||||
    loaded_mixed = importer.load_pickle("res", "mixed_model.pkl")
|  | ||||
|  | ||||
| Explanation | ||||
| ----------- | ||||
| ``torch.package`` Format Overview | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| A ``torch.package`` file is a ZIP archive which conventionally uses the ``.pt`` extension. Inside the ZIP archive, there are two kinds of files: | ||||
|  | ||||
* Framework files, which are placed in the ``.data/`` directory.
* User files, which are everything else.
|  | ||||
| As an example, this is what a fully packaged ResNet model from ``torchvision`` looks like: | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     resnet | ||||
|     ├── .data  # All framework-specific data is stored here. | ||||
|     │   │      # It's named to avoid conflicts with user-serialized code. | ||||
|     │   ├── 94286146172688.storage  # tensor data | ||||
|     │   ├── 94286146172784.storage | ||||
|     │   ├── extern_modules  # text file with names of extern modules (e.g. 'torch') | ||||
|     │   ├── version         # version metadata | ||||
|     │   ├── ... | ||||
|     ├── model  # the pickled model | ||||
|     │   └── model.pkl | ||||
|     └── torchvision  # all code dependencies are captured as source files | ||||
|         └── models | ||||
|             ├── resnet.py | ||||
|             └── utils.py | ||||
|  | ||||
|  | ||||
| Framework files | ||||
| """"""""""""""" | ||||
| The ``.data/`` directory is owned by torch.package, and its contents are considered to be a private implementation detail. | ||||
| The ``torch.package`` format makes no guarantees about the contents of ``.data/``, but any changes made will be backward compatible | ||||
(that is, newer versions of PyTorch will always be able to load older ``torch.package`` archives).
|  | ||||
| Currently, the ``.data/`` directory contains the following items: | ||||
|  | ||||
* ``version``: a version number for the serialized format, so that the ``torch.package`` import infrastructure knows how to load this package.
* ``extern_modules``: a list of modules that are considered ``extern`` by the :class:`PackageImporter`. ``extern`` modules will be imported using the loading environment’s system importer.
| * ``*.storage``: serialized tensor data. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     .data | ||||
|     ├── 94286146172688.storage | ||||
|     ├── 94286146172784.storage | ||||
|     ├── extern_modules | ||||
|     ├── version | ||||
|     ├── ... | ||||
|  | ||||
|  | ||||
| User files | ||||
| """""""""" | ||||
| All other files in the archive were put there by a user. The layout is identical to a Python | ||||
`regular package <https://docs.python.org/3/reference/import.html#regular-packages>`_. For a deeper dive into how Python packaging works,
| please consult `this essay <https://www.python.org/doc/essays/packages/>`_ (it’s slightly out of date, so double-check implementation details | ||||
| with the `Python reference documentation <https://docs.python.org/3/library/importlib.html>`_). | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     <package root> | ||||
|     ├── model  # the pickled model | ||||
|     │   └── model.pkl | ||||
|     ├── another_package | ||||
|     │   ├── __init__.py | ||||
    │   ├── foo.txt         # a resource file, see importlib.resources
|     │   └── ... | ||||
|     └── torchvision | ||||
|         └── models | ||||
|             ├── resnet.py   # torchvision.models.resnet | ||||
|             └── utils.py    # torchvision.models.utils | ||||
|  | ||||
|  | ||||
| How ``torch.package`` finds your code's dependencies | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| Analyzing an object's dependencies | ||||
| """""""""""""""""""""""""""""""""" | ||||
| When you issue a ``save_pickle(obj, ...)`` call, :class:`PackageExporter` will pickle the object normally. Then, it uses the | ||||
| ``pickletools`` standard library module to parse the pickle bytecode. | ||||
|  | ||||
| In a pickle, an object is saved along with a ``GLOBAL`` opcode that describes where to find the implementation of the object’s type, like: | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
    GLOBAL 'torchvision.models.resnet ResNet'
|  | ||||
|  | ||||
| The dependency resolver will gather up all ``GLOBAL`` ops and mark them as dependencies of your pickled object. | ||||
| For more information about pickling and the pickle format, please consult `the Python docs <https://docs.python.org/3/library/pickle.html>`_. | ||||
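
To see the kind of information the resolver works from, you can scan pickle bytecode yourself with the standard library. The sketch below is
only an illustration of the idea (it is not ``torch.package``'s actual resolver) and assumes ``torchvision`` is installed.


::

    import pickle
    import pickletools

    import torchvision

    model = torchvision.models.resnet18()

    # Protocol 2 emits the text-based GLOBAL opcode shown above; newer
    # protocols encode the same information with STACK_GLOBAL.
    data = pickle.dumps(model, protocol=2)

    dependencies = set()
    for opcode, arg, _pos in pickletools.genops(data):
        if opcode.name == "GLOBAL":
            # arg looks like "torchvision.models.resnet ResNet"
            module_name, _, _attr = arg.partition(" ")
            dependencies.add(module_name)

    print(sorted(dependencies))  # includes 'torchvision.models.resnet', among others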
|  | ||||
| Analyzing a module's dependencies | ||||
| """"""""""""""""""""""""""""""""" | ||||
When a Python module is identified as a dependency, ``torch.package`` walks the module’s Python AST representation and looks for import statements with
full support for the standard forms: ``from x import y``, ``import z``, ``from w import v as u``, etc. When one of these import statements is
encountered, ``torch.package`` registers the imported modules as dependencies that are then themselves parsed in the same way.
|  | ||||
| **Note**: AST parsing has limited support for the ``__import__(...)`` syntax and does not support ``importlib.import_module`` calls. In general, you should | ||||
| not expect dynamic imports to be detected by ``torch.package``. | ||||
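
A rough approximation of this import scan, written against the standard library ``ast`` module, is sketched below. It is a simplified
illustration of the approach, not the actual implementation.


::

    import ast
    import textwrap

    source = textwrap.dedent("""\
        from torch import nn
        import torchvision.models as models
        from .utils import helper
        """)

    found = set()
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Import):
            found.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            # node.module is None for bare relative imports like `from . import x`
            found.add("." * node.level + (node.module or ""))

    print(found)  # {'torch', 'torchvision.models', '.utils'}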
|  | ||||
|  | ||||
| Dependency Management | ||||
| ^^^^^^^^^^^^^^^^^^^^^ | ||||
| ``torch.package`` automatically finds the Python modules that your code and objects depend on. This process is called dependency resolution. | ||||
| For each module that the dependency resolver finds, you must specify an *action* to take. | ||||
|  | ||||
| The allowed actions are: | ||||
|  | ||||
| * ``intern``: put this module into the package. | ||||
| * ``extern``: declare this module as an external dependency of the package. | ||||
| * ``mock``: stub out this module. | ||||
| * ``deny``: depending on this module will raise an error during package export. | ||||
|  | ||||
| Finally, there is one more important action that is not technically part of ``torch.package``: | ||||
|  | ||||
| * Refactoring: remove or change the dependencies in your code. | ||||
|  | ||||
Note that actions are only defined on entire Python modules. There is no way to package “just” a function or class from a module and leave the rest out.
| This is by design. Python does not offer clean boundaries between objects defined in a module. The only defined unit of dependency organization is a | ||||
| module, so that’s what ``torch.package`` uses. | ||||
|  | ||||
| Actions are applied to modules using patterns. Patterns can either be module names (``"foo.bar"``) or globs (like ``"foo.**"``). You associate a pattern | ||||
with an action using methods on :class:`PackageExporter`, e.g.
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     my_exporter.intern("torchvision.**") | ||||
|     my_exporter.extern("numpy") | ||||
|  | ||||
|  | ||||
| If a module matches a pattern, the corresponding action is applied to it. For a given module, patterns will be checked in the order that they were defined, | ||||
| and the first action will be taken. | ||||
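
Putting this together, a typical export interns first-party code, externs heavyweight third-party libraries, and mocks anything known to be
unused at load time. The module names in the sketch below are placeholders; substitute your own.


::

    with PackageExporter(f) as exporter:
        # The first matching pattern wins, so list more specific patterns first.
        exporter.intern("my_project.**")                # package our own source
        exporter.extern(["numpy.**", "scipy.**"])       # resolve from the loading environment
        exporter.mock("my_plotting_helpers.**")         # stub out code unused at load time
        exporter.save_pickle("model", "model.pkl", model)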
|  | ||||
|  | ||||
| ``intern`` | ||||
| """""""""" | ||||
| If a module is ``intern``-ed, it will be placed into the package. | ||||
|  | ||||
Use this action for your model code, or any related code you want to package. For example, if you are trying to package a ResNet from ``torchvision``,
you will need to ``intern`` the ``torchvision.models.resnet`` module.
|  | ||||
On package import, when your packaged code tries to import an ``intern``-ed module, :class:`PackageImporter` will look inside your package for that module.
| If it can’t find that module, an error will be raised. This ensures that each :class:`PackageImporter` is isolated from the loading environment—even | ||||
| if you have ``my_interned_module`` available in both your package and the loading environment, :class:`PackageImporter` will only use the version in your | ||||
| package. | ||||
|  | ||||
| **Note**: Only Python source modules can be ``intern``-ed. Other kinds of modules, like C extension modules and bytecode modules, will raise an error if | ||||
| you attempt to ``intern`` them. These kinds of modules need to be ``mock``-ed or ``extern``-ed. | ||||
|  | ||||
|  | ||||
| ``extern`` | ||||
| """""""""" | ||||
| If a module is ``extern``-ed, it will not be packaged. Instead, it will be added to a list of external dependencies for this package. You can find this | ||||
| list on ``package_exporter.extern_modules``. | ||||
|  | ||||
On package import, when packaged code tries to import an ``extern``-ed module, :class:`PackageImporter` will use the default Python importer to find
| that module, as if you did ``importlib.import_module("my_externed_module")``. If it can’t find that module, an error will be raised. | ||||
|  | ||||
| In this way, you can depend on third-party libraries like ``numpy`` and ``scipy`` from within your package without having to package them too. | ||||
|  | ||||
| **Warning**: If any external library changes in a backwards-incompatible way, your package may fail to load. If you need long-term reproducibility | ||||
| for your package, try to limit your use of ``extern``. | ||||
|  | ||||
|  | ||||
| ``mock`` | ||||
| """""""" | ||||
| If a module is ``mock``-ed, it will not be packaged. Instead a stub module will be packaged in its place. The stub module will allow you to retrieve | ||||
| objects from it (so that ``from my_mocked_module import foo`` will not error), but any use of that object will raise a ``NotImplementedError``. | ||||
|  | ||||
``mock`` should be used for code that you “know” will not be needed in the loaded package, but that you still want to be available for use in non-packaged
contexts. Examples include initialization/configuration code, or code used only for debugging or training.
|  | ||||
| **Warning**: In general, ``mock`` should be used as a last resort. It introduces behavioral differences between packaged code and non-packaged code, | ||||
| which may lead to later confusion. Prefer instead to refactor your code to remove unwanted dependencies. | ||||
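
As a sketch of how a mocked dependency behaves (here ``matplotlib`` stands in for any library your model code imports but never uses at load time):


::

    with PackageExporter(f) as exporter:
        exporter.intern("my_model.**")
        exporter.mock("matplotlib.**")  # a stub module is packaged in its place
        exporter.save_pickle("model", "model.pkl", model)

    importer = PackageImporter(f)
    loaded = importer.load_pickle("model", "model.pkl")
    # Inference works as long as the stub is never touched; any packaged code that
    # actually calls into matplotlib would raise NotImplementedError.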
|  | ||||
|  | ||||
| Refactoring | ||||
| """"""""""" | ||||
| The best way to manage dependencies is to not have dependencies at all! Often, code can be refactored to remove unnecessary dependencies. Here are some | ||||
| guidelines for writing code with clean dependencies (which are also generally good practices!): | ||||
|  | ||||
**Include only what you use**. Do not leave unused imports in your code. The dependency resolver is not smart enough to tell that they are indeed unused,
| and will try to process them. | ||||
|  | ||||
**Qualify your imports**. For example, instead of writing ``import foo`` and later using ``foo.bar.baz``, prefer to write ``from foo.bar import baz``. This more
| precisely specifies your real dependency (``foo.bar``) and lets the dependency resolver know you don’t need all of ``foo``. | ||||
|  | ||||
| **Split up large files with unrelated functionality into smaller ones**. If your ``utils`` module contains a hodge-podge of unrelated functionality, any module | ||||
| that depends on ``utils`` will need to pull in lots of unrelated dependencies, even if you only needed a small part of it. Prefer instead to define | ||||
| single-purpose modules that can be packaged independently of one another. | ||||
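
As a concrete illustration of the second guideline:


::

    # Before: the dependency resolver records a dependency on all of `foo`.
    import foo

    def run(x):
        return foo.bar.baz(x)

    # After: only `foo.bar` is recorded as a dependency.
    from foo.bar import baz

    def run(x):
        return baz(x)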
|  | ||||
|  | ||||
| Patterns | ||||
| """""""" | ||||
Patterns allow you to specify groups of modules with a convenient syntax. The syntax and behavior of patterns follows that of the Bazel/Buck
| `glob() <https://docs.bazel.build/versions/master/be/functions.html#glob>`_. | ||||
|  | ||||
| A module that we are trying to match against a pattern is called a candidate. A candidate is composed of a list of segments separated by a | ||||
| separator string, e.g. ``foo.bar.baz``. | ||||
|  | ||||
| A pattern contains one or more segments. Segments can be: | ||||
|  | ||||
| * A literal string (e.g. ``foo``), which matches exactly. | ||||
* A string containing a wildcard (e.g. ``torch*``, or ``foo*baz*``). The wildcard matches any string, including the empty string.
| * A double wildcard (``**``). This matches against zero or more complete segments. | ||||
|  | ||||
| Examples: | ||||
|  | ||||
| * ``torch.**``: matches ``torch`` and all its submodules, e.g. ``torch.nn`` and ``torch.nn.functional``. | ||||
* ``torch.*``: matches ``torch.nn`` or ``torch.functional``, but not ``torch.nn.functional`` or ``torch``.
* ``torch*.**``: matches ``torch``, ``torchvision``, and all of their submodules.
|  | ||||
| When specifying actions, you can pass multiple patterns, e.g. | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     exporter.intern(["torchvision.models.**", "torchvision.utils.**"]) | ||||
|  | ||||
|  | ||||
| A module will match against this action if it matches any of the patterns. | ||||
|  | ||||
You can also specify patterns to exclude, e.g.
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     exporter.mock("**", exclude=["torchvision.**"]) | ||||
|  | ||||
|  | ||||
| A module will not match against this action if it matches any of the exclude patterns. In this example, we are mocking all modules except | ||||
| ``torchvision`` and its submodules. | ||||
|  | ||||
| When a module could potentially match against multiple actions, the first action defined will be taken. | ||||
|  | ||||
|  | ||||
| ``torch.package`` sharp edges | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| Avoid global state in your modules | ||||
| """""""""""""""""""""""""""""""""" | ||||
| Python makes it really easy to bind objects and run code at module-level scope. This is generally fine—after all, functions and classes are bound to | ||||
| names this way. However, things become more complicated when you define an object at module scope with the intention of mutating it, introducing mutable | ||||
| global state. | ||||
|  | ||||
| Mutable global state is quite useful—it can reduce boilerplate, allow for open registration into tables, etc. But unless employed very carefully, it can | ||||
| cause complications when used with ``torch.package``. | ||||
|  | ||||
Every :class:`PackageImporter` creates an independent environment for its contents. This is nice because it means we can load multiple packages and ensure
| they are isolated from each other, but when modules are written in a way that assumes shared mutable global state, this behavior can create hard-to-debug | ||||
| errors. | ||||
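
For example, suppose a (hypothetical) ``registry`` module keeps a module-level table of handlers. The copy of that table inside a package is
completely separate from the copy in your loading environment:


::

    # registry.py, intern-ed into the package, contains:
    #
    #     HANDLERS = {}
    #
    #     def register(name, fn):
    #         HANDLERS[name] = fn

    import registry  # the copy in the loading environment
    registry.register("relu", lambda x: x.relu())

    importer = PackageImporter(f)
    packaged_registry = importer.import_module("registry")
    print(packaged_registry.HANDLERS)  # {} -- the packaged copy has its own, empty table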
|  | ||||
| Types are not shared between packages and the loading environment | ||||
| """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" | ||||
| Any class that you import from a :class:`PackageImporter` will be a version of the class specific to that importer. For example: | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
|     from foo import MyClass | ||||
|  | ||||
|     my_class_instance = MyClass() | ||||
|  | ||||
|     with PackageExporter(f) as exporter: | ||||
|         exporter.save_module("foo") | ||||
|  | ||||
|     importer = PackageImporter(f) | ||||
|     imported_MyClass = importer.import_module("foo").MyClass | ||||
|  | ||||
|     assert isinstance(my_class_instance, MyClass)  # works | ||||
|     assert isinstance(my_class_instance, imported_MyClass)  # ERROR! | ||||
|  | ||||
|  | ||||
In this example, ``MyClass`` and ``imported_MyClass`` are *not the same type*. In this specific example, ``MyClass`` and ``imported_MyClass`` have exactly the
same implementation, so you might think it’s okay to consider them the same class. But consider the situation where ``imported_MyClass`` is coming from an
older package with an entirely different implementation of ``MyClass`` — in that case, it’s unsafe to consider them the same class.
|  | ||||
| Under the hood, each importer has a prefix that allows it to uniquely identify classes: | ||||
|  | ||||
|  | ||||
| :: | ||||
|  | ||||
    print(MyClass.__module__)  # prints "foo"
    print(imported_MyClass.__module__)  # prints "<torch_package_0>.foo"
|  | ||||
|  | ||||
That means you should not expect ``isinstance`` checks to work when one of the arguments is from a package and the other is not. If you need this
| functionality, consider the following options: | ||||
|  | ||||
* Use duck typing (just use the class instead of explicitly checking that it is of a given type).
| * Make the typing relationship an explicit part of the class contract. For example, you can add an attribute tag ``self.handler = "handle_me_this_way"`` and have client code check for the value of ``handler`` instead of checking the type directly. | ||||
|  | ||||
|  | ||||
| How ``torch.package`` keeps packages isolated from each other | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| Each :class:`PackageImporter` instance creates an independent, isolated environment for its modules and objects. Modules in a package can only import | ||||
| other packaged modules, or modules marked ``extern``. If you use multiple :class:`PackageImporter` instances to load a single package, you will get | ||||
| multiple independent environments that do not interact. | ||||
|  | ||||
| This is achieved by extending Python’s import infrastructure with a custom importer. :class:`PackageImporter` provides the same core API as the | ||||
| ``importlib`` importer; namely, it implements the ``import_module`` and ``__import__`` methods. | ||||
|  | ||||
| When you invoke :meth:`PackageImporter.import_module`, :class:`PackageImporter` will construct and return a new module, much as the system importer does. | ||||
| However, :class:`PackageImporter` patches the returned module to use ``self`` (i.e. that :class:`PackageImporter` instance) to fulfill future import | ||||
| requests by looking in the package rather than searching the user’s Python environment. | ||||
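
For example, loading the same archive twice yields two completely independent copies of every module (``foo`` below is a placeholder module name):


::

    importer_a = PackageImporter(f)
    importer_b = PackageImporter(f)

    foo_a = importer_a.import_module("foo")
    foo_b = importer_b.import_module("foo")

    assert foo_a is not foo_b                  # distinct module objects
    assert foo_a.MyClass is not foo_b.MyClass  # and distinct types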
|  | ||||
| Mangling | ||||
| """""""" | ||||
| To avoid confusion (“is this ``foo.bar`` object the one from my package, or the one from my Python environment?”), :class:`PackageImporter` mangles the | ||||
| ``__name__`` and ``__file__`` of all imported modules, by adding a *mangle prefix* to them. | ||||
|  | ||||
| For ``__name__``, a name like ``torchvision.models.resnet18`` becomes ``<torch_package_0>.torchvision.models.resnet18``. | ||||
|  | ||||
For ``__file__``, a name like ``torchvision/models/resnet18.py`` becomes ``<torch_package_0>.torchvision/models/resnet18.py``.
|  | ||||
| Name mangling helps avoid inadvertent punning of module names between different packages, and helps you debug by making stack traces and print | ||||
| statements more clearly show whether they are referring to packaged code or not. For developer-facing details about mangling, consult | ||||
| ``mangling.md`` in ``torch/package/``. | ||||
|  | ||||
|  | ||||
| API Reference | ||||
| ------------- | ||||
|  | ||||
| @ -37,6 +37,7 @@ functions = [ | ||||
|     'RReLU', | ||||
|     'SELU', | ||||
|     'SiLU', | ||||
|     'Mish', | ||||
|     'CELU', | ||||
|     'GELU', | ||||
|     'Sigmoid', | ||||
|  | ||||
| @ -397,7 +397,7 @@ and ``values``: | ||||
| Construction of CSR tensors | ||||
| --------------------------- | ||||
|  | ||||
| Sparse CSR matrices can be directly constructed by using the :func:`torch.sparse_csr_tensor` | ||||
| Sparse CSR matrices can be directly constructed by using the :func:`torch._sparse_csr_tensor` | ||||
| method. The user must supply the row and column indices and values tensors separately. | ||||
| The ``size`` argument is optional and will be deduced from the the ``crow_indices`` | ||||
| and ``col_indices`` if it is not present. | ||||
| @ -405,7 +405,7 @@ and ``col_indices`` if it is not present. | ||||
|     >>> crow_indices = torch.tensor([0, 2, 4]) | ||||
|     >>> col_indices = torch.tensor([0, 1, 0, 1]) | ||||
|     >>> values = torch.tensor([1, 2, 3, 4]) | ||||
|     >>> csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, dtype=torch.double) | ||||
|     >>> csr = torch._sparse_csr_tensor(crow_indices, col_indices, values, dtype=torch.double) | ||||
|     >>> csr | ||||
|     tensor(crow_indices=tensor([0, 2, 4]), | ||||
|           col_indices=tensor([0, 1, 0, 1]), | ||||
| @ -419,11 +419,11 @@ CSR Tensor Operations | ||||
| --------------------- | ||||
|  | ||||
| The simplest way of constructing a sparse CSR tensor from a strided or sparse COO | ||||
| tensor is to use :meth:`tensor.to_sparse_csr`. Any zeros in the (strided) tensor will | ||||
| tensor is to use :meth:`tensor._to_sparse_csr`. Any zeros in the (strided) tensor will | ||||
| be interpreted as missing values in the sparse tensor: | ||||
|  | ||||
|     >>> a = torch.tensor([[0, 0, 1, 0], [1, 2, 0, 0], [0, 0, 0, 0]], dtype = torch.float64) | ||||
|     >>> sp = a.to_sparse_csr() | ||||
|     >>> sp = a._to_sparse_csr() | ||||
|     >>> sp | ||||
|     tensor(crow_indices=tensor([0, 1, 3, 3]), | ||||
|           col_indices=tensor([2, 0, 1]), | ||||
| @ -496,7 +496,7 @@ The following Tensor methods are related to sparse tensors: | ||||
|     Tensor.sparse_dim | ||||
|     Tensor.sparse_mask | ||||
|     Tensor.to_sparse | ||||
|     Tensor.to_sparse_csr | ||||
|     Tensor._to_sparse_csr | ||||
|     Tensor.indices | ||||
|     Tensor.values | ||||
|  | ||||
| @ -581,7 +581,7 @@ Torch functions specific to sparse Tensors | ||||
|     :nosignatures: | ||||
|  | ||||
|     sparse_coo_tensor | ||||
|     sparse_csr_tensor | ||||
|     _sparse_csr_tensor | ||||
|     sparse.sum | ||||
|     sparse.addmm | ||||
|     sparse.mm | ||||
|  | ||||
| @ -253,7 +253,9 @@ Examples:: | ||||
|     no_grad | ||||
|     enable_grad | ||||
|     set_grad_enabled | ||||
|     is_grad_enabled | ||||
|     inference_mode | ||||
|     is_inference_mode_enabled | ||||
|  | ||||
| Math operations | ||||
| --------------- | ||||
| @ -575,5 +577,4 @@ Utilities | ||||
|     are_deterministic_algorithms_enabled | ||||
|     set_warn_always | ||||
|     is_warn_always_enabled | ||||
|     vmap | ||||
|     _assert | ||||
|  | ||||
| @ -1,14 +1,19 @@ | ||||
| #import "Benchmark.h" | ||||
| #include <string> | ||||
| #include <vector> | ||||
| #include "torch/script.h" | ||||
|  | ||||
| #include <torch/csrc/jit/mobile/function.h> | ||||
| #include <torch/csrc/jit/mobile/import.h> | ||||
| #include <torch/csrc/jit/mobile/interpreter.h> | ||||
| #include <torch/csrc/jit/mobile/module.h> | ||||
| #include <torch/csrc/jit/mobile/observer.h> | ||||
| #include "ATen/ATen.h" | ||||
| #include "caffe2/core/timer.h" | ||||
| #include "caffe2/utils/string_utils.h" | ||||
| #include "torch/csrc/autograd/grad_mode.h" | ||||
| #include "torch/csrc/jit/serialization/import.h" | ||||
| #include "torch/script.h" | ||||
|  | ||||
| static std::string model = "model.pt"; | ||||
| static std::string model = "model.ptl"; | ||||
| static std::string input_dims = "1,3,224,224"; | ||||
| static std::string input_type = "float"; | ||||
| static BOOL print_output = false; | ||||
| @ -18,9 +23,9 @@ static int iter = 10; | ||||
| @implementation Benchmark | ||||
|  | ||||
| + (BOOL)setup:(NSDictionary*)config { | ||||
|   NSString* modelPath = [[NSBundle mainBundle] pathForResource:@"model" ofType:@"pt"]; | ||||
|   NSString* modelPath = [[NSBundle mainBundle] pathForResource:@"model" ofType:@"ptl"]; | ||||
|   if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) { | ||||
|     NSLog(@"model.pt doesn't exist!"); | ||||
|     NSLog(@"model.ptl doesn't exist!"); | ||||
|     return NO; | ||||
|   } | ||||
|   model = std::string(modelPath.UTF8String); | ||||
| @ -66,10 +71,9 @@ static int iter = 10; | ||||
|   } | ||||
|  | ||||
|   c10::InferenceMode mode; | ||||
|   torch::jit::GraphOptimizerEnabledGuard opguard(false); | ||||
|   auto module = torch::jit::load(model); | ||||
|   auto module = torch::jit::_load_for_mobile(model); | ||||
|  | ||||
|   module.eval(); | ||||
| //  module.eval(); | ||||
|   if (print_output) { | ||||
|     std::cout << module.forward(inputs) << std::endl; | ||||
|   } | ||||
|  | ||||
| @ -1,13 +1,22 @@ | ||||
| #import <XCTest/XCTest.h> | ||||
|  | ||||
| #include <torch/script.h> | ||||
| #include <torch/csrc/jit/mobile/function.h> | ||||
| #include <torch/csrc/jit/mobile/import.h> | ||||
| #include <torch/csrc/jit/mobile/interpreter.h> | ||||
| #include <torch/csrc/jit/mobile/module.h> | ||||
| #include <torch/csrc/jit/mobile/observer.h> | ||||
| #include "ATen/ATen.h" | ||||
| #include "caffe2/core/timer.h" | ||||
| #include "caffe2/utils/string_utils.h" | ||||
| #include "torch/csrc/autograd/grad_mode.h" | ||||
|  | ||||
| @interface TestAppTests : XCTestCase | ||||
|  | ||||
| @end | ||||
|  | ||||
| @implementation TestAppTests { | ||||
|   torch::jit::Module _module; | ||||
|   torch::jit::mobile::Module _module; | ||||
| } | ||||
|  | ||||
| + (void)setUp { | ||||
| @ -17,14 +26,14 @@ | ||||
| - (void)setUp { | ||||
|   [super setUp]; | ||||
|   NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model" | ||||
|                                                                          ofType:@"pt"]; | ||||
|                                                                          ofType:@"ptl"]; | ||||
|   XCTAssertTrue([NSFileManager.defaultManager fileExistsAtPath:modelPath], | ||||
|                 @"model.pt doesn't exist!"); | ||||
|   _module = torch::jit::load(modelPath.UTF8String); | ||||
|                 @"model.ptl doesn't exist!"); | ||||
|   _module = torch::jit::_load_for_mobile(modelPath.UTF8String); | ||||
| } | ||||
|  | ||||
| - (void)testForward { | ||||
|   _module.eval(); | ||||
| //  _module.eval(); | ||||
|   c10::InferenceMode mode; | ||||
|   std::vector<c10::IValue> inputs; | ||||
|   inputs.push_back(torch::ones({1, 3, 224, 224}, at::ScalarType::Float)); | ||||
|  | ||||
| @ -38,7 +38,7 @@ targets.each do |target| | ||||
|     end | ||||
| end | ||||
| puts "Installing the testing model..." | ||||
| model_path = File.expand_path("./model.pt") | ||||
| model_path = File.expand_path("./model.ptl") | ||||
| if not File.exist?(model_path) | ||||
|    raise "model.pt can't be found!" | ||||
| end | ||||
|  | ||||
| @ -1,8 +1,10 @@ | ||||
| import torch | ||||
| import torchvision | ||||
| from torch.utils.mobile_optimizer import optimize_for_mobile | ||||
|  | ||||
| model = torchvision.models.mobilenet_v2(pretrained=True) | ||||
| model.eval() | ||||
| example = torch.rand(1, 3, 224, 224) | ||||
| traced_script_module = torch.jit.trace(model, example) | ||||
| traced_script_module.save("model.pt") | ||||
| traced_script_module = torch.jit.script(model, example) | ||||
| optimized_scripted_module = optimize_for_mobile(traced_script_module) | ||||
| exported_optimized_scripted_module = optimized_scripted_module._save_for_lite_interpreter("model.ptl") | ||||
|  | ||||
| @ -1,7 +1,7 @@ | ||||
| # Python dependencies required for development | ||||
| astunparse | ||||
| future | ||||
| numpy | ||||
| numpy<1.21 | ||||
| psutil | ||||
| pyyaml | ||||
| requests | ||||
|  | ||||
| @ -104,12 +104,13 @@ fi | ||||
| CMAKE_ARGS+=("-DBUILD_TEST=OFF") | ||||
| CMAKE_ARGS+=("-DBUILD_BINARY=OFF") | ||||
|  | ||||
| # If there exists env variable and it equals to 1, build lite interpreter. | ||||
| # cmd:  BUILD_LITE_INTERPRETER=1 ./scripts/build_android.sh | ||||
| if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON") | ||||
| else | ||||
| # If there exists env variable and it equals to 0, build full jit interpreter. | ||||
| # Default behavior is to build lite interpreter | ||||
| # cmd:  BUILD_LITE_INTERPRETER=0 ./scripts/build_android.sh | ||||
| if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=OFF") | ||||
| else | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON") | ||||
| fi | ||||
| CMAKE_ARGS+=("-DBUILD_MOBILE_BENCHMARK=$BUILD_MOBILE_BENCHMARK") | ||||
| CMAKE_ARGS+=("-DBUILD_MOBILE_TEST=$BUILD_MOBILE_TEST") | ||||
|  | ||||
| @ -78,10 +78,10 @@ if [ -n "${IOS_ARCH:-}" ]; then | ||||
|   CMAKE_ARGS+=("-DIOS_ARCH=${IOS_ARCH}") | ||||
| fi | ||||
|  | ||||
| if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON") | ||||
| else | ||||
| if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=OFF") | ||||
| else | ||||
|   CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON") | ||||
| fi | ||||
|  | ||||
| # Don't build binaries or tests (only the library) | ||||
|  | ||||
							
								
								
									
setup.py  (11 changed lines)
							| @ -43,6 +43,10 @@ | ||||
| #   USE_MKLDNN=0 | ||||
| #     disables use of MKLDNN | ||||
| # | ||||
| #   USE_MKLDNN_ACL | ||||
| #     enables use of Compute Library backend for MKLDNN on Arm; | ||||
| #     USE_MKLDNN must be explicitly enabled. | ||||
| # | ||||
| #   MKLDNN_CPU_RUNTIME | ||||
| #     MKL-DNN threading mode: TBB or OMP (default) | ||||
| # | ||||
| @ -156,6 +160,9 @@ | ||||
| #   NVTOOLSEXT_PATH (Windows only) | ||||
| #     specify where nvtoolsext is installed | ||||
| # | ||||
| #   ACL_ROOT_DIR | ||||
| #     specify where Compute Library is installed | ||||
| # | ||||
| #   LIBRARY_PATH | ||||
| #   LD_LIBRARY_PATH | ||||
| #     we will search for libraries in these paths | ||||
| @ -460,6 +467,10 @@ class build_ext(setuptools.command.build_ext.build_ext): | ||||
|             report('-- Not using CUDA') | ||||
|         if cmake_cache_vars['USE_MKLDNN']: | ||||
|             report('-- Using MKLDNN') | ||||
|             if cmake_cache_vars['USE_MKLDNN_ACL']: | ||||
|                 report('-- Using Compute Library for the Arm architecture with MKLDNN') | ||||
|             else: | ||||
|                 report('-- Not using Compute Library for the Arm architecture with MKLDNN') | ||||
|             if cmake_cache_vars['USE_MKLDNN_CBLAS']: | ||||
|                 report('-- Using CBLAS in MKLDNN') | ||||
|             else: | ||||
|  | ||||
| @ -89,6 +89,7 @@ allow_list = [ | ||||
|     ("aten::_amp_update_scale", datetime.date(2021, 6, 1)), | ||||
|     ("aten::randperm", datetime.date(9999, 1, 1)), | ||||
|     ("aten::linalg_vector_norm", datetime.date(2021, 5, 15)), | ||||
|     ("aten::sparse_csr_tensor", datetime.date(9999, 1, 1)), | ||||
| ] | ||||
|  | ||||
| def allow_listed(schema, allow_list): | ||||
|  | ||||
| @ -1749,6 +1749,15 @@ TEST_F(FunctionalTest, Softsign) { | ||||
|   ASSERT_TRUE(torch::allclose(y, y_exp)); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST_F(FunctionalTest, Mish) { | ||||
|   auto x = torch::randn(100) * 10; | ||||
|   auto y_exp = x * x.exp().log1p().tanh(); | ||||
|   auto y = F::mish(x); | ||||
|  | ||||
|   ASSERT_TRUE(torch::allclose(y, y_exp)); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST_F(FunctionalTest, Tanhshrink) { | ||||
|   auto x = torch::randn(100) * 10; | ||||
|  | ||||
| @ -2958,6 +2958,16 @@ TEST_F(ModulesTest, GELU) { | ||||
|   ASSERT_TRUE(torch::allclose(y, y_exp)); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST_F(ModulesTest, Mish) { | ||||
|   Mish model; | ||||
|   auto x = torch::randn(100) * 10; | ||||
|   auto y_exp = x * x.exp().log1p().tanh(); | ||||
|   auto y = model(x); | ||||
|  | ||||
|   ASSERT_TRUE(torch::allclose(y, y_exp)); | ||||
| } | ||||
|  | ||||
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) | ||||
| TEST_F(ModulesTest, Sigmoid) { | ||||
|   Sigmoid model; | ||||
|  | ||||
| @ -49,6 +49,7 @@ torch::nn::Hardshrink|Yes|No | ||||
| torch::nn::Hardtanh|Yes|No | ||||
| torch::nn::LeakyReLU|Yes|No | ||||
| torch::nn::LogSigmoid|Yes|No | ||||
| torch::nn::Mish|Yes|No | ||||
| torch::nn::MultiheadAttention|No|No | ||||
| torch::nn::PReLU|Yes|No | ||||
| torch::nn::ReLU|Yes|No | ||||
| @ -187,6 +188,7 @@ F::rrelu|Yes|No | ||||
| F::glu|Yes|No | ||||
| F::gelu|Yes|No | ||||
| F::silu|Yes|No | ||||
| F::mish|Yes|No | ||||
| F::logsigmoid|Yes|No | ||||
| F::hardshrink|Yes|No | ||||
| F::tanhshrink|Yes|No | ||||
|  | ||||
							
								
								
									
test/distributed/launcher/bin/test_script_init_method.py  (new executable file, 76 lines)
							| @ -0,0 +1,76 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| # Copyright (c) Facebook, Inc. and its affiliates. | ||||
| # All rights reserved. | ||||
| # | ||||
| # This source code is licensed under the BSD-style license found in the | ||||
| # LICENSE file in the root directory of this source tree. | ||||
|  | ||||
| import argparse | ||||
| import os | ||||
|  | ||||
| import torch | ||||
| import torch.distributed as dist | ||||
| import torch.nn.functional as F | ||||
|  | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = argparse.ArgumentParser(description="test script") | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--init_method", | ||||
|         type=str, | ||||
|         required=True, | ||||
|         help="init_method to pass to `dist.init_process_group()` (e.g. env://)", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--world_size", | ||||
|         type=int, | ||||
|         default=os.getenv("WORLD_SIZE", -1), | ||||
|         help="world_size to pass to `dist.init_process_group()`", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--rank", | ||||
|         type=int, | ||||
|         default=os.getenv("RANK", -1), | ||||
|         help="rank to pass to `dist.init_process_group()`", | ||||
|     ) | ||||
|  | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|  | ||||
|     dist.init_process_group( | ||||
|         backend="gloo", | ||||
|         init_method=args.init_method, | ||||
|         world_size=args.world_size, | ||||
|         rank=args.rank, | ||||
|     ) | ||||
|  | ||||
|     rank = dist.get_rank() | ||||
|     world_size = dist.get_world_size() | ||||
|  | ||||
|     # one hot (by rank) tensor of size world_size | ||||
|     # example: | ||||
|     # rank 0, world_size 4 => [1, 0, 0, 0] | ||||
|     # rank 1, world_size 4 => [0, 1, 0, 0] | ||||
|     # ... | ||||
|     t = F.one_hot(torch.tensor(rank), num_classes=world_size) | ||||
|  | ||||
|     # after all_reduce t = tensor.ones(size=world_size) | ||||
|     dist.all_reduce(t) | ||||
|  | ||||
|     # adding all elements in t should equal world_size | ||||
|     derived_world_size = torch.sum(t).item() | ||||
|     if derived_world_size != world_size: | ||||
|         raise RuntimeError( | ||||
|             f"Wrong world size derived. Expected: {world_size}, Got: {derived_world_size}" | ||||
|         ) | ||||
|  | ||||
|     print("Done") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
test/distributed/launcher/bin/test_script_is_torchelastic_launched.py  (new executable file, 42 lines)
							| @ -0,0 +1,42 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| # Copyright (c) Facebook, Inc. and its affiliates. | ||||
| # All rights reserved. | ||||
| # | ||||
| # This source code is licensed under the BSD-style license found in the | ||||
| # LICENSE file in the root directory of this source tree. | ||||
|  | ||||
| """ | ||||
| This is a test script that launches as part of the test cases in | ||||
| run_test.py, to validate the correctness of | ||||
| the method ``torch.distributed.is_torchelastic_launched()``. To do so, | ||||
| we run this script with and without torchelastic and validate that the | ||||
| boolean value written to the out_file is indeed what we expect (e.g. | ||||
| should be False when not launched with torchelastic, True when launched with) | ||||
| The script itself is not a test case hence no assertions are made in this script. | ||||
|  | ||||
| see: - test/distributed/launcher/run_test.py#test_is_torchelastic_launched() | ||||
|      - test/distributed/launcher/run_test.py#test_is_not_torchelastic_launched() | ||||
| """ | ||||
| import argparse | ||||
|  | ||||
| import torch.distributed as dist | ||||
|  | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = argparse.ArgumentParser(description="test script") | ||||
|     parser.add_argument( | ||||
|         "--out_file", | ||||
|         help="file to write indicating whether this script was launched with torchelastic", | ||||
|     ) | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|     with open(args.out_file, "w") as out: | ||||
|         out.write(f"{dist.is_torchelastic_launched()}") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
| @ -7,8 +7,10 @@ | ||||
| # LICENSE file in the root directory of this source tree. | ||||
| import multiprocessing as mp | ||||
| import os | ||||
| import runpy | ||||
| import shutil | ||||
| import subprocess | ||||
| import sys | ||||
| import tempfile | ||||
| import unittest | ||||
| import uuid | ||||
| @ -21,6 +23,7 @@ from torch.distributed.elastic.agent.server.api import RunResult, WorkerState | ||||
| from torch.distributed.elastic.multiprocessing.errors import ChildFailedError | ||||
| from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer | ||||
| from torch.distributed.elastic.utils import get_socket_with_port | ||||
| from torch.distributed.elastic.utils.distributed import get_free_port | ||||
| from torch.testing._internal.common_utils import ( | ||||
|     TEST_WITH_ASAN, | ||||
|     TEST_WITH_TSAN, | ||||
| @ -169,6 +172,35 @@ class ElasticLaunchTest(unittest.TestCase): | ||||
|             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) | ||||
|         ) | ||||
|  | ||||
|     @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") | ||||
|     def test_launch_user_script_default_nproc(self): | ||||
|         run_id = str(uuid.uuid4().int) | ||||
|         nnodes = 1 | ||||
|         world_size = 1 | ||||
|         args = [ | ||||
|             f"--nnodes={nnodes}", | ||||
|             "--rdzv_backend=etcd", | ||||
|             f"--rdzv_endpoint={self._etcd_endpoint}", | ||||
|             f"--rdzv_id={run_id}", | ||||
|             "--monitor_interval=1", | ||||
|             "--start_method=fork", | ||||
|             "--no_python", | ||||
|         ] | ||||
|  | ||||
|         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] | ||||
|  | ||||
|         with self.assertRaises(ValueError): | ||||
|             # --no_python cannot be used with --module | ||||
|             launch.main(args + ["--module"] + script_args) | ||||
|  | ||||
|         launch.main(args + script_args) | ||||
|  | ||||
|         # make sure all the workers ran | ||||
|         # each worker touches a file with its global rank as the name | ||||
|         self.assertSetEqual( | ||||
|             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) | ||||
|         ) | ||||
|  | ||||
|     @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") | ||||
|     def test_launch_with_env_vars(self): | ||||
|         run_id = str(uuid.uuid4().int) | ||||
| @ -447,3 +479,117 @@ class ElasticLaunchTest(unittest.TestCase): | ||||
|             param_mock.return_value = rdzv_handler_mock | ||||
|             launch.main(args) | ||||
|             rdzv_handler_mock.shutdown.assert_called_once() | ||||
|  | ||||
|     @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") | ||||
|     def test_is_torchelastic_launched(self): | ||||
|         # launch test script with torchelastic and validate that | ||||
|         # torch.distributed.is_torchelastic_launched() returns True | ||||
|  | ||||
|         out_file = f"{os.path.join(self.test_dir, 'out')}" | ||||
|  | ||||
|         launch.main( | ||||
|             [ | ||||
|                 "--run_path", | ||||
|                 "--nnodes=1", | ||||
|                 "--nproc_per_node=1", | ||||
|                 "--monitor_interval=1", | ||||
|                 path("bin/test_script_is_torchelastic_launched.py"), | ||||
|                 f"--out_file={out_file}", | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|         with open(out_file, "r") as fp: | ||||
|             is_torchelastic_launched = fp.readline() | ||||
|             self.assertEqual("True", is_torchelastic_launched) | ||||
|  | ||||
|     def test_is_not_torchelastic_launched(self): | ||||
|         # launch test script without torchelastic and validate that | ||||
|         # torch.distributed.is_torchelastic_launched() returns False | ||||
|  | ||||
|         out_file = f"{os.path.join(self.test_dir, 'out')}" | ||||
|  | ||||
|         # need to run the script with runpy in the same interpreter | ||||
|         # as the test because otherwise (depending on the environment) | ||||
|         # it will not find torch as a dependency | ||||
|         with patch.object( | ||||
|             sys, | ||||
|             "argv", | ||||
|             [ | ||||
|                 path("bin/test_script_is_torchelastic_launched.py"), | ||||
|                 f"--out_file={out_file}", | ||||
|             ], | ||||
|         ): | ||||
|             runpy.run_path(sys.argv[0], run_name="__main__") | ||||
|             with open(out_file, "r") as fp: | ||||
|                 is_torchelastic_launched = fp.readline() | ||||
|                 self.assertEqual("False", is_torchelastic_launched) | ||||
|  | ||||
|     def test_init_method_tcp(self): | ||||
|         port = get_free_port() | ||||
|         with patch.object( | ||||
|             sys, | ||||
|             "argv", | ||||
|             [ | ||||
|                 path("bin/test_script_init_method.py"), | ||||
|                 f"--init_method=tcp://localhost:{port}", | ||||
|                 "--rank=0", | ||||
|                 "--world_size=1", | ||||
|             ], | ||||
|         ): | ||||
|             runpy.run_path(sys.argv[0], run_name="__main__") | ||||
|             # nothing to validate, just make sure it runs | ||||
|  | ||||
|     @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") | ||||
|     def test_init_method_tcp_with_torchelastic(self): | ||||
|         port = get_free_port() | ||||
|         launch.main( | ||||
|             [ | ||||
|                 "--run_path", | ||||
|                 "--nnodes=1", | ||||
|                 "--nproc_per_node=4", | ||||
|                 "--master_addr=localhost", | ||||
|                 f"--master_port={port}", | ||||
|                 "--monitor_interval=1", | ||||
|                 path("bin/test_script_init_method.py"), | ||||
|                 f"--init_method=tcp://localhost:{port}", | ||||
|             ] | ||||
|         ) | ||||
|         # nothing to validate, just make sure it runs | ||||
|  | ||||
|     def test_init_method_env(self): | ||||
|         port = get_free_port() | ||||
|         with patch.dict( | ||||
|             os.environ, | ||||
|             { | ||||
|                 "RANK": "0", | ||||
|                 "WORLD_SIZE": "1", | ||||
|                 "MASTER_ADDR": "localhost", | ||||
|                 "MASTER_PORT": str(port), | ||||
|             }, | ||||
|         ), patch.object( | ||||
|             sys, | ||||
|             "argv", | ||||
|             [ | ||||
|                 path("bin/test_script_init_method.py"), | ||||
|                 "--init_method=env://", | ||||
|             ], | ||||
|         ): | ||||
|             runpy.run_path(sys.argv[0], run_name="__main__") | ||||
|             # nothing to validate, just make sure it runs | ||||
|  | ||||
|     @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") | ||||
|     def test_init_method_env_with_torchelastic(self): | ||||
|         port = get_free_port() | ||||
|         launch.main( | ||||
|             [ | ||||
|                 "--run_path", | ||||
|                 "--nnodes=1", | ||||
|                 "--nproc_per_node=4", | ||||
|                 "--master_addr=localhost", | ||||
|                 f"--master_port={port}", | ||||
|                 "--monitor_interval=1", | ||||
|                 path("bin/test_script_init_method.py"), | ||||
|                 "--init_method=env://", | ||||
|             ] | ||||
|         ) | ||||
|         # nothing to validate, just make sure it runs | ||||
|  | ||||
| @ -5373,6 +5373,18 @@ class TestONNXRuntime(unittest.TestCase): | ||||
|         x = torch.randn(2, 3, 4) | ||||
|         self.run_test(SiLUModel(), (x)) | ||||
|  | ||||
|     def test_mish(self): | ||||
|         class MishModel(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|                 super(MishModel, self).__init__() | ||||
|                 self.mish = torch.nn.Mish() | ||||
|  | ||||
|             def forward(self, x): | ||||
|                 return self.mish(x) | ||||
|  | ||||
|         x = torch.randn(2, 3, 4) | ||||
|         self.run_test(MishModel(), (x)) | ||||
|  | ||||
|     def test_remainder(self): | ||||
|         class RemainderModel(torch.nn.Module): | ||||
|             def forward(self, input, other): | ||||
|  | ||||
| @ -3,7 +3,6 @@ import sys | ||||
| from tempfile import NamedTemporaryFile | ||||
|  | ||||
| from torch.testing._internal.common_utils import IS_WINDOWS, TestCase | ||||
| import torch.package.package_exporter | ||||
|  | ||||
|  | ||||
| class PackageTestCase(TestCase): | ||||
| @ -29,8 +28,6 @@ class PackageTestCase(TestCase): | ||||
|         self.package_test_dir = os.path.dirname(os.path.realpath(__file__)) | ||||
|         self.orig_sys_path = sys.path.copy() | ||||
|         sys.path.append(self.package_test_dir) | ||||
|         torch.package.package_exporter._gate_torchscript_serialization = False | ||||
|  | ||||
|  | ||||
|     def tearDown(self): | ||||
|         super().tearDown() | ||||
|  | ||||
| @ -6,7 +6,7 @@ from unittest import skipIf | ||||
|  | ||||
| from torch.package import EmptyMatchError, Importer, PackageExporter, PackageImporter | ||||
| from torch.package.package_exporter import PackagingError | ||||
| from torch.testing._internal.common_utils import run_tests | ||||
| from torch.testing._internal.common_utils import IS_WINDOWS, run_tests | ||||
|  | ||||
| try: | ||||
|     from .common import PackageTestCase | ||||
| @ -224,19 +224,27 @@ class TestDependencyAPI(PackageTestCase): | ||||
|  | ||||
|         buffer = BytesIO() | ||||
|  | ||||
|         try: | ||||
|         with self.assertRaises(PackagingError) as e: | ||||
|             with PackageExporter(buffer, verbose=False) as he: | ||||
|                 he.save_pickle("obj", "obj.pkl", obj2) | ||||
|         except PackagingError as e: | ||||
|             self.assertEqual(e.unhandled, set(["package_a", "package_a.subpackage"])) | ||||
|         else: | ||||
|             self.fail("PackagingError should have been raised") | ||||
|  | ||||
|         self.assertEqual( | ||||
|             str(e.exception), | ||||
|             dedent( | ||||
|                 """ | ||||
|                 * Module did not match against any action pattern. Extern, mock, or intern it. | ||||
|                     package_a | ||||
|                     package_a.subpackage | ||||
|                 """ | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|         # Interning all dependencies should work | ||||
|         with PackageExporter(buffer, verbose=False) as he: | ||||
|             he.intern(["package_a", "package_a.subpackage"]) | ||||
|             he.save_pickle("obj", "obj.pkl", obj2) | ||||
|  | ||||
|     @skipIf(IS_WINDOWS, "extension modules have a different file extension on windows") | ||||
|     def test_broken_dependency(self): | ||||
|         """A unpackageable dependency should raise a PackagingError.""" | ||||
|  | ||||
| @ -262,16 +270,42 @@ class TestDependencyAPI(PackageTestCase): | ||||
|  | ||||
|         buffer = BytesIO() | ||||
|  | ||||
|         try: | ||||
|         with self.assertRaises(PackagingError) as e: | ||||
|             with PackageExporter( | ||||
|                 buffer, verbose=False, importer=BrokenImporter() | ||||
|             ) as exporter: | ||||
|                 exporter.intern(["foo", "bar"]) | ||||
|                 exporter.save_source_string("my_module", "import foo; import bar") | ||||
|         except PackagingError as e: | ||||
|             self.assertEqual(set(e.broken.keys()), set(["foo", "bar"])) | ||||
|         else: | ||||
|             self.fail("PackagingError should have been raised") | ||||
|  | ||||
|         self.assertEqual( | ||||
|             str(e.exception), | ||||
|             dedent( | ||||
|                 """ | ||||
|                 * Module is a C extension module. torch.package supports Python modules only. | ||||
|                     foo | ||||
|                     bar | ||||
|                 """ | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|     def test_invalid_import(self): | ||||
|         """An incorrectly-formed import should raise a PackagingError.""" | ||||
|         buffer = BytesIO() | ||||
|         with self.assertRaises(PackagingError) as e: | ||||
|             with PackageExporter(buffer, verbose=False) as exporter: | ||||
|                 # This import will fail to load. | ||||
|                 exporter.save_source_string("foo", "from ........ import lol") | ||||
|  | ||||
|         self.assertEqual( | ||||
|             str(e.exception), | ||||
|             dedent( | ||||
|                 """ | ||||
|                 * Dependency resolution failed. | ||||
|                     foo | ||||
|                       Context: attempted relative import beyond top-level package | ||||
|                 """ | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|  | ||||
| @ -3272,6 +3272,24 @@ class TestQuantizeFxOps(QuantizationTestCase): | ||||
|         self._test_default_node_quant_handler_ops( | ||||
|             module, functional, qconfig, is_reference, node_list) | ||||
|  | ||||
|     def test_mish_reference(self): | ||||
|         module = torch.nn.Mish | ||||
|         functional = torch.nn.functional.mish | ||||
|         qconfig = float16_static_qconfig | ||||
|         is_reference = True | ||||
|         node_list = [ | ||||
|             ns.call_method("to"), | ||||
|             ns.call_method("dequantize"), | ||||
|             ns.call_module(module), | ||||
|             ns.call_method("to"), | ||||
|             ns.call_method('dequantize'), | ||||
|             ns.call_function(functional), | ||||
|             ns.call_method("to"), | ||||
|             ns.call_method('dequantize') | ||||
|         ] | ||||
|         self._test_default_node_quant_handler_ops( | ||||
|             module, functional, qconfig, is_reference, node_list) | ||||
|  | ||||
|     def test_bmm_int_reference(self): | ||||
|         class M(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|  | ||||
| @ -197,7 +197,7 @@ class TestStaticQuantizedModule(QuantizationTestCase): | ||||
|         self.checkScriptable(qlinear, [[X_q]], check_save_load=True) | ||||
|  | ||||
|         # Make sure `from_float` works for all linear variants | ||||
|         modules_under_test = [torch.nn.Linear, torch.nn.modules.linear._LinearWithBias] | ||||
|         modules_under_test = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear] | ||||
|  | ||||
|         for mut in modules_under_test: | ||||
|             # Test from_float. | ||||
| @ -978,7 +978,7 @@ class TestDynamicQuantizedModule(QuantizationTestCase): | ||||
|         # Test JIT | ||||
|         self.checkScriptable(qlinear, [[X]], check_save_load=True) | ||||
|  | ||||
|         modules_under_test = [torch.nn.Linear, torch.nn.modules.linear._LinearWithBias] | ||||
|         modules_under_test = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear] | ||||
|         for mut in modules_under_test: | ||||
|             # Test from_float | ||||
|             float_linear = mut(in_features, out_features).float() | ||||
|  | ||||
| @ -886,7 +886,7 @@ class TestCppExtensionJIT(common.TestCase): | ||||
|             #include <torch/torch.h> | ||||
|  | ||||
|             int fail() {{ | ||||
|                 torch::crash_handler::_enable_minidump_collection("{destination}"); | ||||
|                 torch::crash_handler::enable_minidumps("{destination}"); | ||||
|  | ||||
|                 volatile int* bad = nullptr; | ||||
|                 return *bad; | ||||
|  | ||||
| @ -14,13 +14,15 @@ from unittest import skipIf | ||||
| import torch | ||||
| import torch.nn as nn | ||||
| from torch.testing._internal.common_utils import (TestCase, run_tests) | ||||
| from torch.utils.data import \ | ||||
|     (IterDataPipe, RandomSampler, DataLoader, | ||||
|      argument_validation, runtime_validation_disabled, runtime_validation) | ||||
| from torch.utils.data import ( | ||||
|     IterDataPipe, RandomSampler, DataLoader, | ||||
|     argument_validation, runtime_validation_disabled, runtime_validation | ||||
| ) | ||||
|  | ||||
| from typing import \ | ||||
|     (Any, Dict, Generic, Iterator, List, NamedTuple, Optional, Tuple, Type, | ||||
|      TypeVar, Set, Union) | ||||
| from typing import ( | ||||
|     Any, Awaitable, Dict, Generic, Iterator, List, NamedTuple, Optional, Tuple, | ||||
|     Type, TypeVar, Set, Union | ||||
| ) | ||||
|  | ||||
| import torch.utils.data.datapipes as dp | ||||
| from torch.utils.data.datapipes.utils.decoder import ( | ||||
| @ -707,7 +709,7 @@ class TestTyping(TestCase): | ||||
|         dp2 = DP1(5) | ||||
|         self.assertEqual(dp1.type, dp2.type) | ||||
|  | ||||
|         with self.assertRaisesRegex(TypeError, r"Can not subclass a DataPipe"): | ||||
|         with self.assertRaisesRegex(TypeError, r"is not a generic class"): | ||||
|             class InvalidDP5(DP1[tuple]):  # type: ignore[type-arg] | ||||
|                 def __iter__(self) -> Iterator[tuple]:  # type: ignore[override] | ||||
|                     yield (0, ) | ||||
| @ -754,7 +756,8 @@ class TestTyping(TestCase): | ||||
|  | ||||
|         self.assertTrue(issubclass(DP5, IterDataPipe)) | ||||
|         dp = DP5()  # type: ignore[assignment] | ||||
|         self.assertTrue(dp.type.param == Any) | ||||
|         from torch.utils.data._typing import issubtype | ||||
|         self.assertTrue(issubtype(dp.type.param, Any) and issubtype(Any, dp.type.param)) | ||||
|  | ||||
|         class DP6(IterDataPipe[int]): | ||||
|             r""" DataPipe with plain Iterator""" | ||||
| @ -765,6 +768,19 @@ class TestTyping(TestCase): | ||||
|         dp = DP6()  # type: ignore[assignment] | ||||
|         self.assertTrue(dp.type.param == int) | ||||
|  | ||||
|         class DP7(IterDataPipe[Awaitable[T_co]]): | ||||
|             r""" DataPipe with abstract base class""" | ||||
|  | ||||
|         self.assertTrue(issubclass(DP7, IterDataPipe)) | ||||
|         self.assertTrue(DP7.type.param == Awaitable[T_co]) | ||||
|  | ||||
|         class DP8(DP7[str]): | ||||
|             r""" DataPipe subclass from a DataPipe with abc type""" | ||||
|  | ||||
|         self.assertTrue(issubclass(DP8, IterDataPipe)) | ||||
|         self.assertTrue(DP8.type.param == Awaitable[str]) | ||||
|  | ||||
|  | ||||
|     def test_construct_time(self): | ||||
|         class DP0(IterDataPipe[Tuple]): | ||||
|             @argument_validation | ||||
|  | ||||
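The added DP7/DP8 cases extend the typing checks to DataPipes parametrized with an abstract type (Awaitable[T_co]) and to subclasses that bind it. The behaviour they build on, that the element type annotation is recorded as .type.param on the class and its instances, can be illustrated with a small standalone pipe (name and values are arbitrary):

    from typing import Iterator
    from torch.utils.data import IterDataPipe

    class Numbers(IterDataPipe[int]):
        # The int parameter from IterDataPipe[int] is recorded on the class.
        def __iter__(self) -> Iterator[int]:
            yield from range(3)

    dp = Numbers()
    assert dp.type.param == int   # mirrors the DP6 check above
    assert list(dp) == [0, 1, 2]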
| @ -2728,6 +2728,7 @@ class TestFunctionalTracing(JitTestCase): | ||||
|         "rrelu": CONTROL_FLOW, | ||||
|         "selu": CONTROL_FLOW, | ||||
|         "silu": CONTROL_FLOW, | ||||
|         "mish": CONTROL_FLOW, | ||||
|         "smooth_l1_loss": CONTROL_FLOW, | ||||
|         "soft_margin_loss": CONTROL_FLOW, | ||||
|         "threshold": CONTROL_FLOW, | ||||
|  | ||||
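mish joins the functionals tagged CONTROL_FLOW, i.e. ones whose Python body contains a data-dependent branch (for mish, the inplace flag), so tracing the functional directly with torch.fx hits the control-flow restriction. The generic failure mode looks like this (a toy module, not mish itself):

    import torch
    import torch.fx

    class Gate(torch.nn.Module):
        def forward(self, x):
            # Data-dependent branch: tracing cannot decide which path to record.
            if x.sum() > 0:
                return x * 2
            return x

    try:
        torch.fx.symbolic_trace(Gate())
    except torch.fx.proxy.TraceError as err:
        print(err)  # traced values cannot be used as inputs to control flow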
| @ -1699,10 +1699,10 @@ graph(%Ra, %Rb): | ||||
|         self.checkScript(test_sparse_addmm_alpha_beta, (torch.randn(2, 4), get_sparse(), torch.randn(3, 4))) | ||||
|  | ||||
|     @suppress_warnings | ||||
|     def test_sparse_csr_tensors(self): | ||||
|     def test__sparse_csr_tensors(self): | ||||
|         @torch.jit.ignore | ||||
|         def get_sparse_csr(): | ||||
|             return torch.randn(3, 3).to_sparse_csr() | ||||
|             return torch.randn(3, 3)._to_sparse_csr() | ||||
|  | ||||
|         @torch.jit.script | ||||
|         def test_is_sparse_csr(input): | ||||
|  | ||||
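The renamed test keeps the same shape: a @torch.jit.ignore helper builds the input in eager Python, and a @torch.jit.script function consumes it. That pattern in isolation, using an ordinary dense tensor since the CSR constructor used here is private (the ignored function's return type is annotated so the compiler knows it yields a Tensor):

    import torch

    @torch.jit.ignore
    def make_input() -> torch.Tensor:
        # Runs as plain Python; TorchScript treats the call as opaque.
        return torch.randn(3, 3)

    @torch.jit.script
    def double_it() -> torch.Tensor:
        return make_input() * 2

    print(double_it().shape)  # torch.Size([3, 3])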
| @ -5947,6 +5947,11 @@ scipy_lobpcg  | {:10.2e}  | {:10.2e}  | {:6} | N/A | ||||
|                     m2 = torch.randn(k, m, device=device).to(dtype) | ||||
|                     self._test_addmm_addmv(torch.addmm, M, m1, m2) | ||||
|  | ||||
|                     m1 = torch.randn(n, k + 1, device=device).to(dtype) | ||||
|                     m2 = torch.randn(k, m, device=device).to(dtype) | ||||
|                     self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) | ||||
|                     self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) | ||||
|  | ||||
|     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") | ||||
|     @onlyCUDA | ||||
|     def test_matmul_45724(self, device): | ||||
|  | ||||
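The added assertions pin down the error message for mismatched inner dimensions: torch.addmm(M, m1, m2) computes beta * M + alpha * (m1 @ m2), so m1 must be n x k and m2 must be k x m. A standalone sketch of the passing and failing shapes (values chosen arbitrarily):

    import torch

    n, k, m = 2, 3, 4
    M = torch.randn(n, m)
    m1 = torch.randn(n, k)
    m2 = torch.randn(k, m)
    torch.addmm(M, m1, m2)        # OK: (n x k) @ (k x m) -> (n x m)

    bad = torch.randn(n, k + 1)   # inner dimensions no longer match
    try:
        torch.addmm(M, bad, m2)
    except RuntimeError as err:
        print(err)                # names the 2x4 and 3x4 shapes, as the regex expects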
Some files were not shown because too many files have changed in this diff.