Compare commits

...

87 Commits

Author SHA1 Message Date
96ca226a73 feat(dockerfile): shrink layers & build cleaner (#95375)
This change reduces the layer size, since intermediate layers are no longer saved. It also builds more cleanly on other machines, as the build no longer asks for user interaction.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95375
Approved by: https://github.com/ezyang

(cherry picked from commit 1c526664d5b9556fa7c24d492cec05e622c58c2e)
2023-05-12 16:21:40 -07:00
a78b5f6680 Update PyTorch docker base image to Ubuntu-20.04 (take 2) (#101310)
Followup after https://github.com/pytorch/pytorch/pull/101301


> _`BASE_RUNTIME` changed_
> _Ubuntu twenty oh four_
> _Spring of new features_
Pull Request resolved: https://github.com/pytorch/pytorch/pull/101310
Approved by: https://github.com/atalman

(cherry picked from commit 47c99e3a1c137e53e58299279dd7bbd1d9d33cdf)
2023-05-12 14:51:45 -07:00
51b42d98d6 [CI] s#@master#@release/2.0#
As release builds should not depend on trunk changes
2023-05-12 14:11:09 -07:00
0bd6be966b Use 20.04 as base image (#101301)
As Ubuntu 18.04 just reached end of life two weeks ago.

Fixes https://github.com/pytorch/pytorch/issues/81120

Pull Request resolved: https://github.com/pytorch/pytorch/pull/101301
Approved by: https://github.com/seemethere, https://github.com/atalman

(cherry picked from commit c772d56966574deee7b94d060ec77880f581b762)
2023-05-12 14:01:38 -07:00
e9ebda29d8 [2.0.1] Disable SDPA FlashAttention backward and mem eff attention on sm86+ for head_dim above 64 (#99736)
* Disable SDPA FlashAttention backward and mem eff attention on sm86+ for head_dim above 64 (#99105)

Expand sdpa_utils.h check to disable FlashAttention when using autograd and mem eff attention for the following cases
- head_dim > 64
- sm86 or newer

Previously we only disabled these kernels on sm86 and for head_dim equal to 128.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/99105
Approved by: https://github.com/malfet

* remove master only test

---------

Co-authored-by: albanD <desmaison.alban@gmail.com>
2023-04-24 09:27:27 -04:00
9e8bd61836 Fix tuple iterator issue (#99443)
* Fix tuple iterator issue

* Lintrunner
2023-04-24 09:23:39 -04:00
e4bdb86e80 Support Modules with custom __getitem__ method through fallback (#97932) (#98381)
This PR allows torch.compile to handle torch.nn.Module instances with custom __getitem__ methods by falling back to Python.
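A small sketch of the kind of module this enables (illustrative only; `Container` is a made-up example, and the custom `__getitem__` is handled via the Python fallback / graph break):

```python
import torch

class Container(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(2))

    def __getitem__(self, idx):          # custom __getitem__, resolved via fallback
        return self.layers[idx]

    def forward(self, x):
        return self[0](x) + self[1](x)

mod = Container()
out = torch.compile(mod)(torch.randn(2, 4))
```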

Fixes #97720

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97932
Approved by: https://github.com/yanboliang
2023-04-24 09:20:07 -04:00
55b4f95cd8 aot autograd: handle detach() and no_grad() mutations on input (#95980) (#99740)
Fixes https://github.com/pytorch/pytorch/issues/95167

More details are in that issue. To summarize, the issue shows up when we have some code like this:

```
def f(x):
    x.detach().mul_(2) # can also happen if the mul_() happens under torch.no_grad()
    return x + 1
```

AOTAutograd will then spit out code like this:
```
def compiled_fn(x):
    x_updated = x.mul(2)
    out = x_updated + 1
    return x_updated, out

def CompiledFunction.forward(x):  # pseudocode, this is part of an autograd.Function
    x_updated, out = compiled_fn(x)
    return x_updated, out

def runtime_wrapper(x):
    x_updated, out = CompiledFunction.apply(x)
    x.copy_(x_updated)
    return out

x = torch.ones(2, requires_grad=True)
out = runtime_wrapper(x)
```

However, the call to `x.copy_(x_updated)` will fail with the error: `a leaf Variable that requires grad is being used in an in-place operation`. This is because `x` is an autograd leaf, and autograd doesn't allow you to mutate leaves.

In this case though, the data mutation should be entirely opaque to autograd - all mutations happened underneath a `.detach()` or a `torch.no_grad()`.

As Ed pointed out in the issue, we can detect this situation by checking if the mutated input is an autograd leaf. If it is, then it must have been the case that any mutations on it must have been hidden from autograd, since otherwise the eager code would have error'd. The solution I added is to detect this situation, and manually run `x.detach().copy_(x_updated)`, to hide the update from autograd.
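A minimal eager-mode sketch of the leaf behavior described above (not the AOTAutograd code itself):

```python
import torch

x = torch.ones(2, requires_grad=True)   # autograd leaf
x_updated = torch.full((2,), 2.0)

# x.copy_(x_updated)         # RuntimeError: a leaf Variable that requires grad
#                            # is being used in an in-place operation.
x.detach().copy_(x_updated)  # OK: the update is hidden from autograd
print(x)                     # tensor([2., 2.], requires_grad=True)
```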

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95980
Approved by: https://github.com/ezyang
2023-04-23 09:43:44 -04:00
6943c4b15e Remove redundant found_inf recompute from _step_supports_amp_unscaling path (#98620) (#99666)
following https://github.com/pytorch/pytorch/pull/97415#issuecomment-1499787115.

Rel: https://github.com/pytorch/pytorch/pull/98613

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98620
Approved by: https://github.com/janeyx99
2023-04-23 09:41:56 -04:00
91c455e85d Update MHA doc string (#97046) (#99746)
Summary: Update MHA doc string

Test Plan: sandcastle & github

Differential Revision: D44179519

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97046
Approved by: https://github.com/voznesenskym

Co-authored-by: Michael Gschwind <mikekg@meta.com>
2023-04-21 14:11:02 -07:00
c83bbdc032 Fix NumPy scalar arrays to tensor conversion (#97696) (#99732)
By performing the cast from scalar to 0-dim array only if the object is not already an array.

Fixes https://github.com/pytorch/pytorch/issues/97021

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97696
Approved by: https://github.com/albanD

(cherry picked from commit b756fd98bb0acd10827a8efb4972e5a326a3d966)
2023-04-21 17:10:14 -04:00
661fa0c5e7 Remove rocm python 3.11 restriction (#99552)
* Remove rocm python 3.11 restriction

* fix manywheel nightly
2023-04-21 09:29:28 -04:00
0f49e97be9 [release 2.0.1] [fix] fix load_sharded_optimizer_state_dict error on multi node (#99103)
* fix load_sharded_optimizer_state_dict error on multi node (#98063)

Fixes #95892

This PR fixes the placement error in ChunkShardingSpec when training on multiple nodes. 'rank:{global_rank}/cuda:{local_rank}' should be used, but 'rank:{global_rank}/cuda:{global_rank}' was used instead, resulting in a CUDA error: invalid device ordinal.
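A small sketch of the corrected placement strings (a hypothetical helper assuming 8 GPUs per node, not the code from the PR):

```python
GPUS_PER_NODE = 8  # assumption for this sketch

def chunk_placement(global_rank: int) -> str:
    local_rank = global_rank % GPUS_PER_NODE
    return f"rank:{global_rank}/cuda:{local_rank}"    # correct: local device index
    # f"rank:{global_rank}/cuda:{global_rank}"        # wrong: invalid device ordinal on node 2+

print(chunk_placement(9))  # rank:9/cuda:1
```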

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98063
Approved by: https://github.com/kumpera

* Update optimizer.py

Fix the cherry-pick by removing the code formatting change from the original PR.

---------

Co-authored-by: Iris <wz337@cornell.edu>
Co-authored-by: Rodrigo Kumpera <kumpera@users.noreply.github.com>
2023-04-21 09:26:38 -04:00
b90fd01221 Fix flaky Dynamo export tests (#96488) (#99459)
Planning to do a full writeup later. The short story is, sometimes the following chain of events happens:

1. We turn on Dynamo's custom frame handler
2. GC triggers (and all of the finalizers run under Dynamo)
3. GC hits a GeneratorExit frame
4. You end up in the custom frame handler with throw_flag == TRUE and PyErr_Occurred() != NULL

If this happens and we blindly call into other Python functions (like the Python callback), the executed Python code will immediately raise an exception (because there's already an ambient exception set.) This is very, very confusing. The fix is to defer to the regular handler when throw_flag is TRUE.

I triggered this locally with

```
PYTHONUNBUFFERED=1 pytest test/dynamo/test_dynamic_shapes.py   -k 'Unspec and export and not dupes and not reorder' -v -x -s
```

But I also have some tests which trigger the problem synthetically.

Fixes https://github.com/pytorch/pytorch/issues/93781

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96488
Approved by: https://github.com/albanD

Co-authored-by: Edward Z. Yang <ezyang@meta.com>
2023-04-20 13:24:32 -04:00
3aae95a884 Release branch cherry-pick of #96009 (#98717)
* Transformers: fix src and key padding mask bool regression (#96009)

Summary: fix src and pad mask bool regression

This fixes a regression introduced previously with #92733. That PR unified the validation of masks, removing Byte Tensors as a permissible mask, introducing a mask-compatibility check, and converting masks to FP masks. The problem addressed in this PR is that after the first mask had been converted, the mask-compatibility check would fail.

Test Plan: sandcastle & github

Differential Revision: D43782858

Fixes  https://github.com/pytorch/pytorch/issues/95702

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96009
Approved by: https://github.com/malfet

* lint

---------

Co-authored-by: Michael Gschwind <mikekg@meta.com>
2023-04-19 16:24:47 -04:00
34dd578b91 [Release only change] mitigate issue caused by deprecated pip option (#99487)
Modify torchdeploy test
2023-04-19 14:02:00 -04:00
654da19c7c merge (#98716)
Co-authored-by: Michael Gschwind <mikekg@meta.com>
2023-04-19 10:44:42 -04:00
a470c041b0 [MPS] Fix batch_norm_backwards key (#98794) (#99456)
One needs different graphs for batch_norm_backwards depending on whether or not gradients are required for some of the params

Fixes https://github.com/pytorch/pytorch/issues/98602

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98794
Approved by: https://github.com/kulinseth

(cherry picked from commit 583193e1d90e795d1b4bb607368a18bc9ae3f09c)
2023-04-19 07:45:09 -04:00
a943df045f Workaround for CuDNN-8.7+ load bug (#98644) (#99451)
Preload `cudnn_cnn_infer` and consume `dlerror` to prevent spurious call to `abort()` from `libcudnn.so.8`, if `libnvrtc.so` is missing on the system.

Fixes https://github.com/pytorch/pytorch/issues/97041

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98644
Approved by: https://github.com/ngimel

(cherry picked from commit c00fd71a95e0c52a14921e8d906d21d60dc9667c)
2023-04-18 21:24:08 -04:00
bbf91554d2 Cherry-pick of #97214 (#98718)
* merge

* add missing import

* better merge conflict resolution

* cherry pick not clean

---------

Co-authored-by: Michael Gschwind <mikekg@meta.com>
2023-04-18 21:17:48 -04:00
3f25fc3b86 Adds 3D attn_mask support to merge_masks() for Multihead Attention fast path (#98991) (#99092)
Fixes #97409

Adds support for 3D attn_mask by always expanding attn_mask to 4D as per https://github.com/pytorch/pytorch/pull/98375#issuecomment-1499504721

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98991
Approved by: https://github.com/jbschlosser
2023-04-18 15:27:45 -04:00
763cfc219d [MPS] Fix the MacOS 13.3 selector check. (#98910) 2023-04-18 15:24:15 -04:00
ea9ea40290 [release 2.0.1] Warn once for TypedStorage deprecation (#98777)
* Only warn once for TypedStorage deprecation (#97379)

Fixes #97207

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97379
Approved by: https://github.com/ezyang

* Specify file encoding in test_torch.py (#97628)

Attempt to fix
```
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 5260: ordinal not in range(128)
```
in https://github.com/pytorch/pytorch/actions/runs/4522628359/jobs/7965372405

In general, it's good practice to explicitly specify the encoding, as otherwise it depends on environment variables and makes test failures unpredictable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97628
Approved by: https://github.com/dagitses, https://github.com/kit1980

---------

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-04-18 13:39:19 -04:00
2e761498b4 Revert "GradScaler recomputes optimizer_state["found_inf_per_device"] before optimizer.step (#97415) (#97886)" (#98624)
This reverts commit 57cd423a4fd119a7ecf944cd7aa43e0a2ea5cc90.
2023-04-17 13:11:25 -04:00
c8f2470047 Fix MPI rank and world size pg initialization (#98623)
ghstack-source-id: 231450a5193ce592327be062733a76295c681ece
Pull Request resolved: https://github.com/pytorch/pytorch/pull/98545
2023-04-17 11:26:59 -04:00
eqy
f08d20ade0 [cuDNN][cuDNN V8 API] Fix incorrect use of emplace in the benchmark cache (#97838) (#98528)
* [cuDNN][cuDNN V8 API] Fix incorrect use of emplace in the benchmark cache #97838

* Update Conv_v8.cpp
2023-04-06 17:27:44 -04:00
94f88b342b Forward fix for DataLoader to accept custom Sharding DataPipe (#97287) (#98523)
Fixes #96975

Changes:
- Make sure custom ShardingDataPipe with `apply_sharding` can be used by `DataLoader`
  - Allow the `apply_sharding` function without the last argument of `sharding_group`
- Make `DataLoader` not rely on `sharding_group`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97287
Approved by: https://github.com/NivekT

Co-authored-by: erjia <erjia@fb.com>
2023-04-06 17:16:02 -04:00
5c035a5131 Add support for custom backend (#95072) (#98513)
Fixes https://github.com/pytorch/pytorch/issues/92344

A custom backend can be specified by passing in a string with format `"<device_type1>:<backend_name>,<device_type2>:<backend_name>"`, e.g. `"cpu:gloo,cuda:custom_backend"`.
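A minimal sketch of the call, assuming the usual `env://` rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) are already set; `custom_backend` would be the name of a registered third-party backend:

```python
import torch.distributed as dist

# One backend per device type, in the "<device_type>:<backend_name>" format described above.
dist.init_process_group(backend="cpu:gloo,cuda:nccl", init_method="env://")
# or, with a registered custom backend:
# dist.init_process_group(backend="cpu:gloo,cuda:custom_backend")
```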

Differential Revision: [D43630050](https://our.internmc.facebook.com/intern/diff/D43630050)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95072
Approved by: https://github.com/kwen2501
2023-04-06 17:13:09 -04:00
8963d77163 Clarify the saving of intermediates in the "extending torch.func" docs (#98020) (#98081)
Fixes https://github.com/pytorch/pytorch/issues/97260

We got some feedback that the page reads like "in order to save an input
for backward, you must return it as an output of the
autograd.Function.forward".

Doing so actually raises an error (on master and as of 2.1), but results
in an ambiguous situation on 2.0.0. To avoid more users running into
this, we clarify the documentation so it doesn't read like the above
and clearly mentions that you can save things from the inputs or
outputs.
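
A minimal sketch of what the clarified docs describe: saving an *input* for backward via `setup_context`, without returning it from `forward`:

```python
import torch

class Square(torch.autograd.Function):
    @staticmethod
    def forward(x):
        return x ** 2

    @staticmethod
    def setup_context(ctx, inputs, output):
        (x,) = inputs
        ctx.save_for_backward(x)          # save an input; it need not be an output

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return 2 * x * grad_out

x = torch.randn(3, requires_grad=True)
Square.apply(x).sum().backward()
print(torch.allclose(x.grad, 2 * x))      # True
```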
Pull Request resolved: https://github.com/pytorch/pytorch/pull/98020
Approved by: https://github.com/soulitzer, https://github.com/kshitij12345
2023-04-05 17:21:26 -04:00
c039d2f59e [release v2.0.1] Revisit torch._six.string_classes removal (#97737, #97789, #97863) (#98055)
* [DataLoader] Short circuit pin_memory recursion when operating on bytes (#97737)

Slack thread: https://pytorch.slack.com/archives/GEEQ2K4MD/p1679962409906099

I was seeing some massive (~2x) slowdowns on a job after running it on PyTorch 2.0. From some profiling in `py-spy` it looked like the pin_memory thread was doing a lot more work than before. Looking at a trace in `nsys` I saw the thread doing the forward pass having a bunch of `pthread_cond_timedwait` with GIL reacquire calls in its call stack, and it seemed like the thread doing the forward pass was getting blocked (waiting for the GIL) by the pin memory thread (which was holding the GIL).

After some debugging I found out the issue. If a `bytes` was passed into `pin_memory`, previously in 1.13 (before https://github.com/pytorch/pytorch/pull/94709) it would short-circuit and return here
d922c29a22/torch/utils/data/_utils/pin_memory.py (L54-L55)
since `bytes` was in `torch._six.string_classes`:
```
>>> from torch._six import string_classes
>>> string_classes
(<class 'str'>, <class 'bytes'>)
>>>
```

However after https://github.com/pytorch/pytorch/pull/94709, if a `bytes` was passed into `pin_memory` it would fall into here instead
c263bd43e8/torch/utils/data/_utils/pin_memory.py (L68-L73)
because the previous check is now doing `isinstance(data, str)` instead of `isinstance(data, (str, bytes))`!
c263bd43e8/torch/utils/data/_utils/pin_memory.py (L56-L57)

As a result, `pin_memory` gets called recursively for each element in the `bytes` leading to a ton of wasted recursion. This also explains the slowdown / GIL contention I was seeing.

This PR simply changes `isinstance(data, str)` to `isinstance(data, (str, bytes))` to match the behavior before https://github.com/pytorch/pytorch/pull/94709

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97737
Approved by: https://github.com/albanD, https://github.com/NivekT

* [DataLoader] Fix  collation logic (#97789)

Similar to #97737, a previous auto-refactor changed how `bytes` are handled during collation, which can potentially lead to performance regression. This PR undoes that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97789
Approved by: https://github.com/albanD

* Revisit `torch._six.string_classes` removal (#94709) (#97863)

Revisit `torch._six.string_classes` (which is `(str, bytes)`) removal: `isinstance(obj, string_classes) -> isinstance(obj, str)`.

Both `str` and `bytes` are `Sequence` classes.

```python
In [1]: from typing import Sequence

In [2]: issubclass(bytes, Sequence)
Out[2]: True

In [3]: issubclass(str, Sequence)
Out[3]: True
```

Re-add `bytes` to type guards like:

```python
def is_seq(obj):
    return isinstance(obj, Sequence) and not isinstance(obj, (str, bytes))
```

Ref:

- https://github.com/pytorch/pytorch/pull/94709#issuecomment-1487282912
- #97737
- #97789
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97863
Approved by: https://github.com/Skylion007, https://github.com/albanD

---------

Co-authored-by: Eric Zhang <ezhang887@gmail.com>
Co-authored-by: Kevin Tse <ktse@fb.com>
2023-04-05 17:19:07 -04:00
81ea036fa6 [vmap] Fix index_select support when dim is negative (#97916) (#98019)
Fixes https://github.com/pytorch/pytorch/issues/96854

Previously, this would segfault (via indexing -2 into a SmallVector).
This PR fixes it so that we wrap negative dimensions.
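A small illustration of the pattern being fixed (a sketch, not the test from the PR):

```python
import torch
from torch.func import vmap

x = torch.randn(4, 3, 5)
idx = torch.tensor([0, 2])

# Per-sample index_select along a *negative* dim; previously this could segfault.
out = vmap(lambda t: torch.index_select(t, -1, idx))(x)
print(out.shape)  # torch.Size([4, 3, 2])
```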

Test Plan:
- changed the index_select OpInfo to use dim=-1 instead of dim=1,
because it's much more common for the negative dimension to break
while the positive one works.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97916
Approved by: https://github.com/ngimel, https://github.com/janeyx99
2023-04-05 17:15:07 -04:00
c0c76df6a6 Fix module backward pre-hooks to actually update gradient (#97983) (#98017)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97983
Approved by: https://github.com/albanD
2023-04-05 17:12:55 -04:00
cb51da829e [fix] jacrev and jacfwd : support non-tensor args again (#97746) (#97908)
Fixes https://github.com/pytorch/pytorch/issues/97636

The code that checks whether argument tensors are complex assumed that all arguments are tensors (which is not the case), which led to the error.
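A minimal sketch of the call pattern that used to error (a tensor argument mixed with a plain Python float):

```python
import torch
from torch.func import jacrev

def f(x, scale):                  # `scale` is a non-tensor argument
    return (x * scale).sin()

x = torch.randn(3)
jac = jacrev(f, argnums=0)(x, 2.0)   # differentiate w.r.t. x only
print(jac.shape)                     # torch.Size([3, 3])
```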

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97746
Approved by: https://github.com/zou3519
2023-04-05 17:10:51 -04:00
57cd423a4f GradScaler recomputes optimizer_state["found_inf_per_device"] before optimizer.step (#97415) (#97886)
I found a discrepancy between non-fused and fused optimizers: whether to use `optimizer_state["found_inf"]` or to recompute `found_inf`.

- non fused: e64ddd1ab9/torch/cuda/amp/grad_scaler.py (L289)
- fused: e64ddd1ab9/torch/cuda/amp/grad_scaler.py (L353)
    - where `_check_inf_per_device` is e64ddd1ab9/torch/cuda/amp/grad_scaler.py (L564-L573)

The other way to align the behavior is to use the existing `found_inf` in e64ddd1ab9/torch/cuda/amp/grad_scaler.py (L353).

I'd say this PR is for the sake of "safety" and the alternative is to keep the existing behavior.
I honestly have no idea if it's expected to double-check the sanity of gradients in `GradScaler.step`.

---

What I've observed in the huggingface/transformers T5-base example so far is that non-fused optimizers lead to invalid parameters while the fused ones do not.
The cause seems to be that `gradients` become inf/nan before `GradScaler.step(optimizer)` and after `GradScaler._unscale_grads_` (more precisely, the call to `torch._amp_foreach_non_finite_check_and_unscale_`) in the script of the issue linked below, i.e. the gradient clipping and/or unscaling lead to inf/nan as these happen after the grad check. See
788300cc2a/aten/src/ATen/native/cuda/AmpKernels.cu (L165-L174).

Fixes #96755 🙏

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97415
Approved by: https://github.com/ngimel, https://github.com/janeyx99
2023-04-05 17:08:41 -04:00
78e2cc6d9c Fix wrong handling of grad_scale & found_inf in fused optimizers (#95847) (#97885)
Fixes #95781.
The cause seems to be that the current implementation doesn't correctly pass `found_inf` when `grad_scale` is `None`. Therefore parameters can get mistakenly updated by gradients in which some elements are invalid, i.e. nan or inf.

Related #94060

I forgot about this wrong handling after #94344

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95847
Approved by: https://github.com/janeyx99
2023-04-05 17:02:11 -04:00
e9606385ec Fix type hint for torch.Tensor.grad_fn (#96804) (#97867)
Fix type hint for `torch.Tensor.grad_fn`, which can be a `torch.autograd.graph.Node` or `None`.

This is a regression in `torch` 2.0. It makes `mypy` fail in downstream projects.
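A small sketch of the annotation that the corrected hint allows (illustrative only):

```python
from typing import Optional

import torch

x = torch.ones(1, requires_grad=True)
y = (x * 2).sum()

node: Optional[torch.autograd.graph.Node] = y.grad_fn   # a Node for non-leaf results
leaf: Optional[torch.autograd.graph.Node] = x.grad_fn   # None for leaf tensors

if y.grad_fn is not None:
    print(y.grad_fn.name())   # e.g. SumBackward0
```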

Ref:

- https://github.com/pytorch/pytorch/issues/94937#issuecomment-1469344993
- metaopt/torchopt#149
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96804
Approved by: https://github.com/Skylion007
2023-04-05 16:55:22 -04:00
e31e512f7d cherry-pick: 1726c6f7a7723aa8222539db4e7747284c4b14de (#97857) 2023-04-05 16:51:17 -04:00
c263bd43e8 [inductor] use triu ref instead of lowering (#96040) (#96462)
Fixes #95958
Generated code is functionally identical between the ref and the lowering, with only minor differences.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96040
Approved by: https://github.com/jansel

Co-authored-by: Natalia Gimelshein <ngimel@fb.com>
2023-03-09 17:42:00 -05:00
c9913cf66f Add jinja2 as mandatory dependency (#95691) (#96450)
Should fix #95671 for the nightly wheels issue. The v2.0.0 RC does not need this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95691
Approved by: https://github.com/malfet

Co-authored-by: Wei Wang <weiwangmeta@meta.com>
2023-03-09 17:31:12 -05:00
2f7d8bbf17 Fix expired deprecation of comparison dtype for NumPy 1.24+ (#91517) (#96452)
> The `dtype=` argument to comparison ufuncs is now applied correctly. That
> means that only `bool` and `object` are valid values and `dtype=object` is
> enforced.

Source: https://numpy.org/doc/stable/release/1.24.0-notes.html#expired-deprecations

Fixes #91516

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91517
Approved by: https://github.com/zou3519, https://github.com/huydhn

Co-authored-by: Johnson <j3.soon@msa.hinet.net>
2023-03-09 14:30:00 -08:00
ca0cdf52ca dl_open_guard should restore flag even after exception (#96231) (#96457)
I.e., follow the pattern outlined in https://docs.python.org/3.8/library/contextlib.html#contextlib.contextmanager

Also, return early on non-unix platforms (when `sys.getdlopenflags` is not defined)
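A sketch of that pattern, assuming the flag being toggled is `RTLD_GLOBAL` (not the exact torch implementation):

```python
import contextlib
import ctypes
import sys

@contextlib.contextmanager
def dl_open_guard():
    if not hasattr(sys, "getdlopenflags"):    # non-unix platforms: no-op
        yield
        return
    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield                                 # dlopen with the temporary flags
    finally:
        sys.setdlopenflags(old_flags)         # restored even if the body raises
```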

Fixes https://github.com/pytorch/pytorch/issues/96159

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96231
Approved by: https://github.com/atalman

(cherry picked from commit 941ff109d32d51d6e93a2c2f4a028ff3826ece31)
2023-03-09 14:29:17 -08:00
9cfa076da8 [Release/2.0] Use Triton from PYPI (#96010)
* [Release/2.0] Use Triton from PYPI

Remove `[dynamo]` extras from setup.py

Build torchtriton conda wheels as 2.0.0

* Also, upload triton conda packages to test channel
2023-03-03 20:15:48 -05:00
8e05e41dbc [Release/2.0] Use builder release branch for tests 2023-03-03 16:22:04 -08:00
d8ffc60bc1 Remove mention of dynamo.optimize() in docs (#95802) (#96007)
This should be self-contained enough to merge, but other stuff that's been bugging me is:
* Instructions on debugging IMA issues
* Dynamic shape instructions
* Explaining config options better

Will look at adding a config options doc

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95802
Approved by: https://github.com/svekars
2023-03-03 17:43:31 -05:00
1483723037 [MPS] Disallow reshape in slice (#95905) (#95978)
Disallow reshapes for arrayViews.
The current code allows a base shape of `[2, 4, 256]` to be sliced into `[4, 1, 256]` (the view's shape), which is not possible. Slicing a smaller dimension into a bigger one will always error out.

Fixes https://github.com/pytorch/pytorch/issues/95883
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95905
Approved by: https://github.com/razarmehr, https://github.com/kulinseth

Co-authored-by: Denis Vieriu <dvieriu@apple.com>
2023-03-03 10:15:10 -08:00
c4572aa1b7 [MPS] Add fixes for div with floor (#95869)
* [MPS] Add fixes for div with floor and raise error for div_trunc (#95769)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95769
Approved by: https://github.com/DenisVieriu97

* Add back the unittest skip for MacOS 12.
2023-03-02 12:36:02 -08:00
82b078ba64 [MPS] Fix views with 3 or more sliced dimensions (#95762) (#95871)
Fixes https://github.com/pytorch/pytorch/issues/95482
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95762
Approved by: https://github.com/razarmehr

Co-authored-by: Denis Vieriu <dvieriu@apple.com>
2023-03-02 12:27:46 -08:00
77f7bc5f9d Remove torch._inductor.config.triton.convolution (#95840) 2023-03-02 13:49:20 -05:00
0865964576 [optim] _actually_ default to foreach (#95862)
* [optim] include nn.Parameter as foreach supported (#95811)

This PR is the result of a realization that models are NOT subscribed to the foreach defaulting, as has been claimed in our documentation for months now. BIG OOPS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95811
Approved by: https://github.com/albanD

* [optim] Widen the cases for defaulting to foreach (#95820)

Big OOP correction continued. Also added a test this time to verify the defaulting was as expected.

The key here is realizing that the grouping for foreach already assumes that the non-param tensorlists follow suit in dtype and device, so it is too narrow to check that _all_ tensors were on CUDA. The main leeway this allowed was state_steps, which are sometimes cpu tensors. Since foreach _can_ handle cpu tensors, this should not introduce breakage.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95820
Approved by: https://github.com/albanD
2023-03-02 13:33:57 -05:00
f18ac1b386 Release version of fixed nll decomp (#95853)
* fix nll loss decomposition to properly ignore ignore_index

* remove branch
2023-03-02 13:26:45 -05:00
c04134cdb1 [ROCM] Restrict pytorch rocm to only use triton 2.0.x (#95793) (#95834)
To align with upstream, we are requiring the triton dependency to be between 2.0.0 and 2.1. This will allow PyTorch 2.0 on ROCm to stay flexible enough to pick up any performance/stability improvements from Triton, without needing to cut a separate PyTorch version.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95793
Approved by: https://github.com/huydhn
2023-03-01 19:10:00 -05:00
72d0863ab2 [BE] Fix TORCH_WARN_ONCE (#95559) (#95822)
It does not take a condition as its first argument, unlike `TORCH_CHECK`.
Test plan: run ` python3 -c "import torch;print(torch.arange(1., 10.,device='mps').view(3, 3).trace())"` and observe no warning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95559
Approved by: https://github.com/Skylion007

(cherry picked from commit 9bca9df42b5898e45e2a80e03a4a4ba9a6fe654a)
2023-03-01 19:03:41 -05:00
1bd334dc25 Update copyright (#95652) (#95700)
Updating the copyright so that it is reflected correctly on the website.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95652
Approved by: https://github.com/atalman

Co-authored-by: Svetlana Karslioglu <svekars@fb.com>
2023-02-28 11:00:18 -05:00
93e13cd429 [MPS] Remove FFT from the fallback as it's causing crashes in test_ops and TestConsistency tests. (#95625) 2023-02-27 17:26:32 -08:00
4e4d4b0afe [MPS] Add TORCH_CHECK for Convolution (#95495)
* Raise errors for Conv and remove FFTs from Fallback list.

* Move the FFT to a separate commit.
2023-02-27 17:25:14 -08:00
Wei
c4fa850827 Reserve the tensorrt backend name for torch-tensorrt (#95627) 2023-02-27 17:17:47 -08:00
36ead09873 Add float to list of allowed ops (#94910) (#95661)
By adding `BINFLOAT` op support

Fixes https://github.com/pytorch/pytorch/issues/94670
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94910
Approved by: https://github.com/albanD

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-02-27 15:09:00 -08:00
66d23dbad7 fix spurious aot autograd warning (#95521) (#95614)
The _make_boxed logic probably needs a cleanup, but this fixes a spurious warning; the fix should land before the release.

Confirmed that this used to emit a warning and no longer does:
```
import torch

lin = torch.nn.Linear(100, 10)
def f(x):
    return lin(x)

opt_f = torch.compile(f)
opt_f(torch.randn(10, 100, requires_grad=False))
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95521
Approved by: https://github.com/ngimel
2023-02-27 14:31:02 -05:00
e2fff58844 [CUDA][CUBLAS] Explicitly link against cuBLASLt (#95094) (#95615)
An issue surfaced recently that revealed that we were never explicitly linking against `cuBLASLt`, this fixes it by linking explicitly rather than depending on linker magic.

CC @ptrblck @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95094
Approved by: https://github.com/malfet, https://github.com/ngimel, https://github.com/atalman

Co-authored-by: eqy <eddiey@nvidia.com>
2023-02-27 14:27:06 -05:00
735333a7ff Update triton hash (#95540) (#95577)
Fixes #95523

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95540
Approved by: https://github.com/ngimel
2023-02-27 09:03:50 -08:00
6017488801 [MPS] LSTM fixes (#95388)
* [MPS] Fix LSTM backward and forward pass (#95137)

Fixes #91694
Fixes #92615

Several transpositions were missing in the backward graph in the case of `batch_first=True`. #91694 is not reproduced with `batch_first=False`.

After fixing the transpose issue, I thought I could finally use LSTM freely in my project. And then I got horrific results on training. Seems related to #92615.

After that I decided to fix LSTM's backward step completely. I collected all my findings in this thread, and it seems like I succeeded.

Funny enough, backward tests were completely disabled before and were not passing:
```python
    @unittest.skipIf(True, "Backward of lstm returns wrong result")
    def test_lstm_2(self, device="mps", dtype=torch.float32):
```

UPD: the forward pass of the multi-layer version was also wrong due to incorrect `initState, initCell` slices. Tests were passing because the states were initialized with zeros. *Accidentally* fixed this too.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95137
Approved by: https://github.com/jhavukainen, https://github.com/kulinseth, https://github.com/soulitzer

* Update the allowlist for lstm_mps_backward

* More update to the BC allowlist

---------

Co-authored-by: alexdremov <dremov.me@gmail.com>
Co-authored-by: albanD <desmaison.alban@gmail.com>
2023-02-25 14:04:15 -05:00
e51e5e721c [optim] Add general documentation on our algorithm defaults (#95391) (#95516)
I added a section + table under Algorithms
https://docs-preview.pytorch.org/95391/optim.html?highlight=optim#module-torch.optim
<img width="725" alt="image" src="https://user-images.githubusercontent.com/31798555/221246256-99325a27-9016-407b-a9fe-404d61e41a82.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95391
Approved by: https://github.com/albanD
2023-02-25 14:02:57 -05:00
91739a0279 hotfix for memory leak in aot autograd induced by saving tensors for backward (#95101) (#95477)
Workaround fix in AOTAutograd for https://github.com/pytorch/pytorch/issues/94990 (see the comments for more details / discussion)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95101
Approved by: https://github.com/albanD
2023-02-24 16:40:30 -05:00
531f097b6f inductor: fix compiler error when trying to vectorize logical_and and logical_or (#95361) (#95439)
Currently, `operator&&` and `operator||` don't have vectorized implementations; disable them now as a quick fix for the 2.0 release.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95361
Approved by: https://github.com/ngimel, https://github.com/EikanWang
2023-02-24 09:23:29 -05:00
00eb7b0d78 [optim] Set defaults to foreach, NOT fused (#95241) (#95415)
Rolling back the default change for Adam and rectifying the docs to reflect that AdamW never defaulted to fused.

Since our fused implementations are relatively newer, let's give them a longer bake-in time before flipping the switch for every user.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95241
Approved by: https://github.com/ngimel
2023-02-24 09:19:40 -05:00
2180f342c4 [SDPA] Fix bug in parsing scaled_dot_product_attention arguments (#95311) (#95397)
Fixes #95266

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95311
Approved by: https://github.com/cpuhrsch
2023-02-24 09:18:19 -05:00
a90b4f09ac use 4 warps for small block config in mm (#95383)
* use 4 warps for small block config in mm

* Update test/inductor/test_select_algorithm.py

* Update test/inductor/test_select_algorithm.py
2023-02-24 09:12:36 -05:00
1211ceeaa4 [MPS] Fix issues with max_pool2d (#95325)
* [MPS] Fix upsample for NHWC output  (#94963)

Fixes https://github.com/huggingface/diffusers/issues/941

(Before/after screenshots demonstrating the NHWC upsample fix are attached to the PR.)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94963
Approved by: https://github.com/razarmehr

* [MPS] Move max_pool2d to mps dispatch key (#90772)

Related issue: #77394

This PR also modifies some assertions in the codegen; an explanatory comment has been added for it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90772
Approved by: https://github.com/albanD

* [MPS] Convert output back to ChannelsLast for MaxPool2D (#94877)

Since we re-stride the indices and output in MPS pooling from ChannelsLast to Contiguous, we need to convert the results back to ChannelsLast.
This will fix the failure with test_memory_format with MaxPool2D in test_modules.py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94877
Approved by: https://github.com/kulinseth, https://github.com/DenisVieriu97

---------

Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Co-authored-by: Li-Huai (Allan) Lin <qqaatw@gmail.com>
Co-authored-by: Ramin Azarmehr <razarmehr@apple.com>
2023-02-24 09:10:49 -05:00
beaa5c5908 [MPS] View fixes (#95323)
* [MPS] Fix the uint8 type issue with View ops kernels (#95145)

This should fix the problem in the ResNet model with image artifacts due to saturation on the int8 type, and also the incorrect class recognition reported in #86954.

Fixes #86954

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95145
Approved by: https://github.com/kulinseth, https://github.com/DenisVieriu97

* [MPS] Fix tensor with non-zero storage offset graph gathering (#91071)

Previously, the "can slice" flag in Placeholder constructor in `OperationUtils.mm` is conditioned on whether the numbers of dimensions of base shape and view shape are the same. This doesn't consider the situation that a view tensor could be the base tensor's sliced and then unsqueezed version, resulting in different num of dims.

For example, if we want to stack `y_mps` and `x_mps` on the last dim:
```
t_mps = torch.tensor([1, 2, 3, 4], device="mps")
x_mps = t_mps[2:]  # [3, 4]
y_mps = t_mps[:2]  # [1, 2]

res_mps = torch.stack((y_mps, x_mps), dim=-1)
```

the kernel will unsqueeze both of them on the last dim and then concatenate them, which is equivalent to:

```
res_mps = torch.cat((y_mps.unsqueeze(-1), x_mps.unsqueeze(-1)), dim=-1)
```

`x_mps.unsqueeze(-1)` is an unsqueezed and contiguous tensor with a storage offset; tensors of this kind should be sliceable without cloning their storage.

Fixes #87856
Fixes #91065

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91071
Approved by: https://github.com/kulinseth

* [MPS] Fix fill_ where input tensor has a storage offset (#95113)

Fixes #94390

Apart from fixing the issue above, this PR also fixes a bug where, when an input tensor can be sliced, a sliced array view is created. This array view seems to be non-writable or to have different storage from the original tensor, causing incorrect results with the in-place `fill`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95113
Approved by: https://github.com/kulinseth

* [MPS] Fix view op slicing for 2nd dim in case of 0 offset (#95381)

* Fix view op slicing for 2nd dim in case of 0 offset

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95381
Approved by: https://github.com/razarmehr

---------

Co-authored-by: Ramin Azarmehr <razarmehr@apple.com>
Co-authored-by: Li-Huai (Allan) Lin <qqaatw@gmail.com>
Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
2023-02-24 09:09:49 -05:00
4bd5c1e4f4 Fix warning if backend registers timer (#91702) (#95363)
Currently the logger timer is registered by default for cpu/cuda. Other backends may or may not register this timer, and the code reports a warning and returns for them, which is not expected.
This fails if the backend has in fact registered the timer. For example, the HPU (Habana) backend registers this timer, so in that case reporting a warning and returning is incorrect.

The other case is where the lazy backend timer is never registered, so returning with a warning is appropriate; that is why the check was added, but it fails for the other cases.

Add a generic check: if the timer is registered, don't report a warning.

Signed-off-by: Jeeja <jeejakp@habana.ai>

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91702
Approved by: https://github.com/kit1980
2023-02-23 18:57:09 -05:00
f3c97a4e43 Raise error on 3.11 dynamo export (#95088) (#95396)
For https://github.com/pytorch/pytorch/issues/94914. Realized that `dynamo.export` doesn't immediately raise an error when dynamo is trying to run on 3.11/windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95088
Approved by: https://github.com/weiwangmeta
2023-02-23 18:55:32 -05:00
30cf0e70f7 [MPS] Copy fixes for MPS backend (#95321)
* [MPS] Handle broadcasting by expanding src tensor in Copy.mm (#95272)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95272
Approved by: https://github.com/DenisVieriu97

* [MPS] Fix copy_cast_mps() on tensors with storage offset (#95093)

- The copy_cast path requires storage_offset to be applied before casting
- This should fix some correctness issues in transformer models

Fixes #94980

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95093
Approved by: https://github.com/kulinseth

---------

Co-authored-by: Ramin Azarmehr <razarmehr@apple.com>
2023-02-23 18:17:20 -05:00
96f627dcde [MPS] Fixes in backward functions of the MPS ops (#95327)
* [MPS] Fix bilinear backward pass (#94892)

Fixes backward pass for bilinear.

Summary of changes:
- the bilinear op is able to produce **contiguous, non-view** tensors with a storage offset, such as: shape=`[1, 1, 1, 1]`, `storage_offset=12`. This seems like a weird case, but it is valid, and for these types of tensors we wouldn't be able to gather/scatter since we look at the view flag (which is not set here). This change looks at `storage_offset` only rather than the is_view flag, which is not being set.
- **reduction sum** must return a zeroed-out output if passed an input with 0 elements (e.g. a shape of (0, 5)).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94892
Approved by: https://github.com/kulinseth

* [MPS] Fix the crash in elu_backward() (#94923)

Fixes a crash where the inputTensor could go null and cause a crash.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94923
Approved by: https://github.com/DenisVieriu97, https://github.com/kulinseth

* [MPS] Fix prelu backward pass (#94933)

Allocate the correct shape for the weights gradient
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94933
Approved by: https://github.com/razarmehr

* [MPS] Fix embedding_backward() issue with Float16 (#94950)

- Casting the float16 input tensor to float32 and cast back the output tensor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94950
Approved by: https://github.com/DenisVieriu97

---------

Co-authored-by: Denis Vieriu <dvieriu@apple.com>
Co-authored-by: Ramin Azarmehr <razarmehr@apple.com>
2023-02-23 16:28:33 -05:00
6f11e6d6a1 [MPS] Convolution fixes (#95318)
* [MPS] Convolution cleanup; remove unnecessary contiguous calls (#95078)

- Fixes convolution crashes in backward with weights
- Removes unnecessary contiguous calls
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95078
Approved by: https://github.com/kulinseth

* [MPS] Fix nn.functional.conv_transpose2d grad (#94871)

- add _mps_convolution_impl that takes optional shape
- for conv_transpose2d grad, use the shape from forward pass directly
- for conv, calculate the shape from input
- remove nn.functional.conv_transpose2d grad from blocklist

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94871
Approved by: https://github.com/kulinseth

---------

Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Co-authored-by: Denis Vieriu <dvieriu@apple.com>
2023-02-23 12:31:30 -05:00
fcec27f7d5 [MPS] Numerical stability and reduction fixes (#95317)
* [MPS] Fixes for LSTM. (#94889)

- The backward pass has to provide an explicit bias tensor of zeros if none is passed to the op, or the bias gradient will not be calculated.
- Fixed the bias tensor mistakenly getting overwritten with zeros.
- Fixes a crash when the lstm op is called with has_biases set to false. The change takes into account the changed shape of the input params TensorList depending on the bias flag.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94889
Approved by: https://github.com/DenisVieriu97

* [MPS] LogSoftmax numerical stability (#95091)

Fixes #94043

Calculations are now consistent with the numerically stable formula and with the CPU:

$\mathrm{LogSoftmax}(X, \mathrm{dim}) = X - \max(X, \mathrm{dim}) - \log\big(\mathrm{sum}(\exp(X - \max(X, \mathrm{dim})), \mathrm{dim})\big)$
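
A quick check of the stable formula against `torch.log_softmax` (an illustration, not the MPS kernel):

```python
import torch

x = torch.randn(4, 8)
m = x.max(dim=-1, keepdim=True).values
stable = x - m - (x - m).exp().sum(dim=-1, keepdim=True).log()
print(torch.allclose(stable, torch.log_softmax(x, dim=-1)))  # True
```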

@malfet

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95091
Approved by: https://github.com/malfet, https://github.com/kulinseth

* [MPS] Cast int64 to int32 for reduction ops (#95231)

- give warnings of converting int64 for reduction ops
- use cast tensor for reduction sum on trace
- unblock trace from running
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95231
Approved by: https://github.com/razarmehr

* [MPS] Fix Float16 issue with Reduction ops for macOS 12 (#94952)

This would fix the issue with `__rdiv__` with float16
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94952
Approved by: https://github.com/kulinseth

---------

Co-authored-by: alexdremov <dremov.me@gmail.com>
Co-authored-by: Denis Vieriu <dvieriu@apple.com>
Co-authored-by: Ramin Azarmehr <razarmehr@apple.com>
2023-02-23 12:27:40 -05:00
cddcb1e526 Raise error if torch.compile is called from windows or py 3.11 (#94940) (#95329)
For https://github.com/pytorch/pytorch/issues/94914

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94940
Approved by: https://github.com/albanD

Co-authored-by: William Wen <williamwen@fb.com>
2023-02-23 07:56:44 -05:00
0553b46df1 [profiler] update docs with repeat=1 (#95085) (#95242)
Specifying the number of times to repeat is now required when defining the schedule.
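A sketch of the documented usage with `repeat` given explicitly:

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule

sched = schedule(wait=1, warmup=1, active=3, repeat=1)

with profile(activities=[ProfilerActivity.CPU], schedule=sched) as prof:
    for _ in range(6):
        torch.randn(64, 64) @ torch.randn(64, 64)
        prof.step()
```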
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95085
Approved by: https://github.com/aaronenyeshi
2023-02-22 16:15:37 -08:00
b45d7697a5 fix numpy1.24 deprecations in unittests (#93997) (#95150)
Fixes https://github.com/pytorch/pytorch/issues/91329

Pull Request resolved: https://github.com/pytorch/pytorch/pull/93997
Approved by: https://github.com/ngimel, https://github.com/jerryzh168
2023-02-22 17:48:57 -05:00
7ebb309457 Revert "[CI] Use prebuilt triton from nightly repo (#94732)" (#95310)
This reverts commit 18d93cdc5dba50633a72363625601f9cf7253162.

Reverted https://github.com/pytorch/pytorch/pull/94732 on behalf of https://github.com/kit1980 due to Reverting per offline discussion to try to fix dynamo test failures after triton update

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
2023-02-22 15:52:19 -05:00
cedfcdab46 Upgrade setuptools before building wheels (#95288)
* [BE] Cleanup triton builds (#95026)

Remove Python-3.7 clause
Do not install llvm-11, as llvm-14 is installed by triton/python/setup.py script

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95026
Approved by: https://github.com/osalpekar, https://github.com/weiwangmeta

* Upgrade setuptools before building wheels (#95265)

Should fix https://github.com/pytorch/builder/issues/1318

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95265
Approved by: https://github.com/ngimel

---------

Co-authored-by: Nikita Shulga <nshulga@fb.com>
Co-authored-by: Wei Wang <weiwangmeta@meta.com>
2023-02-22 12:23:21 -08:00
0b21e62406 Update triton hash (#95247) (#95285)
Should fix #95082
This commit hash is supposed to fix the sm_89 issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95247
Approved by: https://github.com/ngimel, https://github.com/seemethere

Co-authored-by: Wei Wang <weiwangmeta@meta.com>
2023-02-22 12:19:13 -08:00
91994c999f Deprecate Caffe2 ONNX exporter (#95071) 2023-02-21 11:01:34 -08:00
0b82f58866 inductor(cpu): fix C++ compile error when sigmoid's post ops is a reduction op (#94890) (#95054)
For the timm **nfnet_l0** model, the CPU path has the following error: `torch._dynamo.exc.BackendCompilerFailed: inductor raised CppCompileError: C++ compile error`.

Here is a simple test case:

```
import torch
import torch._dynamo

def fn(x):
    x = torch.ops.aten.sigmoid.default(x)
    return torch.ops.aten.mean.dim(x, [-1, -2], True)

x = torch.randn((1, 8, 8, 8))
opt_fn = torch._dynamo.optimize("inductor")(fn)
opt_fn(x)

real_out = fn(x)
compiled_out = opt_fn(x)
tol = 0.0001
print(torch.allclose(real_out, compiled_out, atol=tol, rtol=tol))

```

before:

```
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0)
{
    auto out_ptr0 = in_out_ptr0;
    {
        #pragma GCC ivdep
        for(long i0=0; i0<8; i0+=1)
        {
            {
                #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out += omp_in) initializer(omp_priv={{0}})
                float tmp2 = 0;
                auto tmp2_vec = at::vec::Vectorized<float>(tmp2);
                for(long i1=0; i1<4; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (64*i0));
                    auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
                    tmp2_vec += tmp1;
                }
                #pragma omp simd simdlen(8)  reduction(+:tmp3)
                for(long i1=64; i1<64; i1+=1)
                {
                    auto tmp0 = in_ptr0[i1 + (64*i0)];
                    auto tmp1 = std::exp(-tmp0);
                    auto tmp2 = 1 / (1 + tmp1);
                    tmp3 += tmp2;
                }
                tmp2 += at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>&y) {return x + y;}, tmp2_vec);
                out_ptr0[i0] = tmp3;
            }
        }
    }
    {
        for(long i0=0; i0<0; i0+=1)
        {
            auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + 16*i0);
            auto tmp1 = at::vec::Vectorized<float>(static_cast<float>(64));
            auto tmp2 = tmp0 / tmp1;
            tmp2.store(in_out_ptr0 + 16*i0);
        }
        #pragma omp simd simdlen(8)
        for(long i0=0; i0<8; i0+=1)
        {
            auto tmp0 = out_ptr0[i0];
            auto tmp1 = static_cast<float>(64);
            auto tmp2 = tmp0 / tmp1;
            in_out_ptr0[i0] = tmp2;
        }
    }
}
```

after:
```
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0)
{
    auto out_ptr0 = in_out_ptr0;
    #pragma omp parallel num_threads(40)
    {
        {
            #pragma omp for
            for(long i0=0; i0<8; i0+=1)
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out += omp_in) initializer(omp_priv={{0}})
                    float tmp2 = 0;
                    auto tmp2_vec = at::vec::Vectorized<float>(tmp2);
                    for(long i1=0; i1<4; i1+=1)
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (64*i0));
                        auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
                        tmp2_vec += tmp1;
                    }
                    #pragma omp simd simdlen(8)  reduction(+:tmp2)
                    for(long i1=64; i1<64; i1+=1)
                    {
                        auto tmp0 = in_ptr0[i1 + (64*i0)];
                        auto tmp1 = decltype(tmp0)(1) / (decltype(tmp0)(1) + std::exp(-tmp0));
                        tmp2 += tmp1;
                    }
                    tmp2 += at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>&y) {return x + y;}, tmp2_vec);
                    out_ptr0[i0] = tmp2;
                }
            }
        }
        #pragma omp single
        {
            {
                for(long i0=0; i0<0; i0+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + 16*i0);
                    auto tmp1 = at::vec::Vectorized<float>(static_cast<float>(64));
                    auto tmp2 = tmp0 / tmp1;
                    tmp2.store(in_out_ptr0 + 16*i0);
                }
                #pragma omp simd simdlen(8)
                for(long i0=0; i0<8; i0+=1)
                {
                    auto tmp0 = out_ptr0[i0];
                    auto tmp1 = static_cast<float>(64);
                    auto tmp2 = tmp0 / tmp1;
                    in_out_ptr0[i0] = tmp2;
                }
            }
        }
    }
}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94890
Approved by: https://github.com/EikanWang, https://github.com/jgong5, https://github.com/lezcano
2023-02-20 14:35:40 -05:00
1f7ab1c823 fix performance issue in torch.sparse.mm reduce mode (#94969) (#95018)
Fix performance bug for `torch.sparse.mm()` with reduce flag.

Found this bug during internal benchmarking.
I made a mistake when updating a previous patch, which caused load imbalance between threads:

Test on ogbn-products datasets on Xeon CLX with 24 cores:

#### before
```
sparse.mm: mean: 1156.148 ms
sparse.mm: sum: 1163.754 ms
sparse.mm: (using mkl): 703.227 ms
```

#### after
```
sparse.mm: mean: 662.578 ms
sparse.mm: sum: 662.301 ms
sparse.mm: (using mkl): 700.178 ms
```

The result also indicates that the current spmm kernel is no worse than MKL's sparse_mm.

Also update results on `pyg benchmark` with:
```
python gnn.py --use_sage --epochs=3 --runs=1 --inference
```

* Out of box: `13.32s`
* Without the fix in this PR: `5.87s`
* With the fix in this PR: `3.19s`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94969
Approved by: https://github.com/jgong5
2023-02-20 14:30:52 -05:00
52a27dd0ee [release only change] Add change to template - fix lint (#94981) 2023-02-16 09:53:48 -05:00
e0c728c545 Changes for release 2.0 only (#94934)
* Changes for release 2.0 only

* Delete the refs during pytorch checkout

* Bug fix and add xla r2.0 hash
2023-02-15 18:08:38 -05:00
dbcd11f3a7 try to fix OSS CI error (#94785) (#94936)
Differential Revision: D43259005

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94785
Approved by: https://github.com/weiwangmeta, https://github.com/digantdesai

Co-authored-by: Cuiqing Li <cuiqingli123@meta.com>
2023-02-15 17:47:26 -05:00
186 changed files with 2859 additions and 1129 deletions

View File

@@ -119,7 +119,7 @@ function install_torchvision() {
function clone_pytorch_xla() {
if [[ ! -d ./xla ]]; then
git clone --recursive --quiet https://github.com/pytorch/xla.git
git clone --recursive -b r2.0 --quiet https://github.com/pytorch/xla.git
pushd xla
# pin the xla hash so that we don't get broken by changes to xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@@ -135,16 +135,10 @@ function install_filelock() {
function install_triton() {
local commit
commit=$(get_pinned_commit triton)
local short_hash
short_hash=$(echo "${commit}"|cut -c -10)
local index_url
index_url=https://download.pytorch.org/whl/nightly/cpu
if [[ "${TEST_CONFIG}" == *rocm* ]]; then
echo "skipping triton due to rocm"
elif pip install "pytorch-triton==2.0.0+${short_hash}" --index-url "${index_url}"; then
echo "Using prebuilt version ${short_hash}"
else
commit=$(get_pinned_commit triton)
if [[ "${BUILD_ENVIRONMENT}" == *gcc7* ]]; then
# Trition needs gcc-9 to build
sudo apt-get install -y g++-9
@@ -181,7 +175,7 @@ function checkout_install_torchdeploy() {
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
pip install -e . --install-option="--cudatests"
pip install -e .
popd
popd
}
@@ -190,7 +184,6 @@ function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}

View File

@@ -62,7 +62,7 @@ git --no-pager log --max-count 1
popd
# Clone the Builder master repo
retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
retry git clone -q https://github.com/pytorch/builder.git -b release/2.0 "$BUILDER_ROOT"
pushd "$BUILDER_ROOT"
echo "Using builder from "
git --no-pager log --max-count 1

View File

@@ -1 +1 @@
c8bfe3f548b164f745ada620a560f87f41ab8465
b8b470bc597c1c5bd03682c09fe3e6b7c53787fd

View File

@@ -1 +1 @@
d29eb67c27af0f18d4f487d76b86f43b0a69aade
r2.0

View File

@@ -1 +1 @@
pytorch-triton-rocm>=2.0.0.dev
pytorch-triton-rocm>=2.0.0,<2.1

View File

@@ -38,7 +38,7 @@ def build_triton(commit_hash: str, build_conda: bool = False, py_version : Optio
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
if build_conda:
with open(triton_basedir / "meta.yaml", "w") as meta:
print(f"package:\n name: torchtriton\n version: 2.0.0+{commit_hash[:10]}\n", file=meta)
print("package:\n name: torchtriton\n version: 2.0.0\n", file=meta)
print("source:\n path: .\n", file=meta)
print("build:\n string: py{{py}}\n number: 1\n script: cd python; "
"python setup.py install --single-version-externally-managed --record=record.txt\n", file=meta)

View File

@@ -196,10 +196,11 @@ def generate_wheels_matrix(os: str,
for python_version in python_versions:
for arch_version in arches:
gpu_arch_type = arch_type(arch_version)
gpu_arch_version = "" if arch_version == "cpu" or arch_version == "cpu-cxx11-abi" else arch_version
# Skip rocm 3.11 binaries for now as the docker image are not correct
if python_version == "3.11" and gpu_arch_type == "rocm":
continue
gpu_arch_version = (
""
if arch_version == "cpu" or arch_version == "cpu-cxx11-abi"
else arch_version
)
# special 11.7 wheels package without dependencies
# dependency downloaded via pip install
@@ -226,7 +227,8 @@
"nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'",
"nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
"build_name":
f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn"
.replace(

View File

@@ -8,7 +8,7 @@
# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference
# the binary builds will check out
{%- set builder_repo = "pytorch/builder" -%}
{%- set builder_branch = "main" -%}
{%- set builder_branch = "release/2.0" -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@@ -87,8 +87,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"

View File

@ -74,8 +74,8 @@ jobs:
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@v2.8.2
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}

View File

@ -62,8 +62,8 @@ jobs:
steps:
!{{ common.setup_ec2_windows() }}
!{{ set_runner_specific_vars() }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: Populate binary env
shell: bash
run: |
@ -98,8 +98,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: Populate binary env
shell: bash
run: |

View File

@ -35,7 +35,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -35,7 +35,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -35,7 +35,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -131,7 +131,7 @@ jobs:
with:
github-secret: ${{ secrets.github-token }}
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Chown workspace
@ -145,7 +145,6 @@ jobs:
- name: Checkout PyTorch to pytorch dir
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -158,7 +157,7 @@ jobs:
- name: Checkout pytorch/builder to builder dir
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -128,7 +128,7 @@ jobs:
github-secret: ${{ secrets.github-token }}
# Setup the environment
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Chown workspace
@ -142,7 +142,6 @@ jobs:
- name: Checkout PyTorch to pytorch dir
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
- name: Clean PyTorch checkout
@ -154,7 +153,7 @@ jobs:
- name: Checkout pytorch/builder to builder dir
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -96,7 +96,7 @@ jobs:
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
no-sudo: true

View File

@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Set up JDK 8
uses: actions/setup-java@v3

View File

@ -22,7 +22,7 @@ jobs:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
fetch-depth: 1

View File

@ -76,7 +76,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -38,7 +38,7 @@ jobs:
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Populate CI build options
run: |

View File

@ -76,7 +76,7 @@ jobs:
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -48,7 +48,7 @@ jobs:
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
fetch-depth: 1
submodules: false
@ -79,7 +79,7 @@ jobs:
docker exec -it $(docker container ps --format '{{.ID}}') bash
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -79,7 +79,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Set xcode version
env:

View File

@ -47,7 +47,7 @@ jobs:
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
fetch-depth: 1
submodules: false
@ -96,7 +96,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts

View File

@ -54,7 +54,7 @@ jobs:
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
fetch-depth: 1
submodules: false
@ -78,7 +78,7 @@ jobs:
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
no-sudo: true

View File

@ -13,7 +13,7 @@ jobs:
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup miniconda
uses: pytorch/test-infra/.github/actions/setup-miniconda@main

View File

@ -68,7 +68,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
no-sudo: true

View File

@ -36,7 +36,7 @@ jobs:
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
fetch-depth: 1
submodules: false
@ -119,7 +119,7 @@ jobs:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
no-sudo: true

View File

@ -38,7 +38,7 @@ jobs:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
@ -65,9 +65,6 @@ jobs:
# Determine python executable for given version
case $PY_VERS in
3.7)
PYTHON_EXECUTABLE=/opt/python/cp37-cp37m/bin/python
;;
3.8)
PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
;;
@ -86,7 +83,8 @@ jobs:
;;
esac
docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
docker exec -t "${container_name}" yum install -y zlib-devel
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==67.4.0
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" /pytorch/.github/scripts/build_triton_wheel.py
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
@ -139,7 +137,7 @@ jobs:
run: |
set -ex
pip install -q awscli
s3_dir="${UPLOAD_BUCKET}/whl/nightly/"
s3_dir="${UPLOAD_BUCKET}/whl/test/"
for pkg in "${PKG_DIR}/"*.whl; do
aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
done
@ -162,7 +160,7 @@ jobs:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
@ -195,7 +193,7 @@ jobs:
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
run: |
container_name=$(docker container ps --format '{{.ID}}')
docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-nightly --label main --no-progress --force"
docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-test --label main --no-progress --force"
- name: Chown artifacts
run: |

View File

@ -15,7 +15,7 @@ jobs:
runs-on: linux.20_04.4x
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
fetch-depth: 1

View File

@ -62,7 +62,7 @@ jobs:
# [see note: pytorch repo ref]
# deep clone (fetch-depth 0) required for git merge-base
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- name: Setup Linux
uses: ./.github/actions/setup-linux

View File

@ -829,7 +829,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -841,7 +840,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -933,7 +932,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -945,7 +943,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1037,7 +1035,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1049,7 +1046,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1141,7 +1138,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1153,7 +1149,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -829,7 +829,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -841,7 +840,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -933,7 +932,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -945,7 +943,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1037,7 +1035,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1049,7 +1046,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1141,7 +1138,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1153,7 +1149,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -47,7 +47,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -169,7 +169,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -393,7 +392,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -482,7 +481,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -494,7 +492,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -669,7 +667,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -881,7 +879,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -893,7 +890,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -982,7 +979,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -994,7 +990,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1169,7 +1165,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1381,7 +1377,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1393,7 +1388,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1482,7 +1477,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1494,7 +1488,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1669,7 +1663,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1834,3 +1828,203 @@ jobs:
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-rocm5_3-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.3
GPU_ARCH_VERSION: 5.3
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_3
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-rocm5_3-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-rocm5_3-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.3
GPU_ARCH_VERSION: 5.3
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
DESIRED_PYTHON: "3.11"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_11-rocm5_3
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm5.3
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_11-rocm5_3-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-rocm5_3-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.3
GPU_ARCH_VERSION: 5.3
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-rocm5_4_2-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.4.2
GPU_ARCH_VERSION: 5.4.2
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_4_2
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-rocm5_4_2-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-rocm5_4_2-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.4.2
GPU_ARCH_VERSION: 5.4.2
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
DESIRED_PYTHON: "3.11"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_11-rocm5_4_2
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm5.4.2
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_11-rocm5_4_2-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-rocm5_4_2-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm5.4.2
GPU_ARCH_VERSION: 5.4.2
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_4_2
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -75,7 +75,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -87,7 +86,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -187,7 +186,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -199,7 +197,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -299,7 +297,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -311,7 +308,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -411,7 +408,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -423,7 +419,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -75,7 +75,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -87,7 +86,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -187,7 +186,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -199,7 +197,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -299,7 +297,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -311,7 +308,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -411,7 +408,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -423,7 +419,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -73,7 +73,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -85,7 +84,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -185,7 +184,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -197,7 +195,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -297,7 +295,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -309,7 +306,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -409,7 +406,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -421,7 +417,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -77,7 +77,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -89,7 +88,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -194,7 +193,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -206,7 +204,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -311,7 +309,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -323,7 +320,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -428,7 +425,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -440,7 +436,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -77,7 +77,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -89,7 +88,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -194,7 +193,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -206,7 +204,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -311,7 +309,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -323,7 +320,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -428,7 +425,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -440,7 +436,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -73,7 +73,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -85,7 +84,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -185,7 +184,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -197,7 +195,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -297,7 +295,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -309,7 +306,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -409,7 +406,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -421,7 +417,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -87,7 +87,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -99,7 +98,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -198,7 +197,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -210,7 +208,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -316,7 +314,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -328,7 +325,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -428,7 +425,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -440,7 +436,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -547,7 +543,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -559,7 +554,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -659,7 +654,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -671,7 +665,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -777,7 +771,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -789,7 +782,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -888,7 +881,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -900,7 +892,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1006,7 +998,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1018,7 +1009,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1118,7 +1109,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1130,7 +1120,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1237,7 +1227,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1249,7 +1238,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1349,7 +1338,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1361,7 +1349,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1467,7 +1455,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1479,7 +1466,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1578,7 +1565,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1590,7 +1576,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1696,7 +1682,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1708,7 +1693,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1808,7 +1793,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1820,7 +1804,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1927,7 +1911,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1939,7 +1922,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2039,7 +2022,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2051,7 +2033,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2157,7 +2139,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2169,7 +2150,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2268,7 +2249,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2280,7 +2260,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2386,7 +2366,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2398,7 +2377,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2498,7 +2477,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2510,7 +2488,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2617,7 +2595,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2629,7 +2606,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2729,7 +2706,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2741,7 +2717,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -86,7 +86,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -98,7 +97,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -201,7 +200,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -213,7 +211,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -91,7 +91,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -103,7 +102,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -206,7 +205,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -218,7 +216,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -331,7 +329,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -343,7 +340,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -446,7 +443,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -458,7 +454,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -571,7 +567,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -583,7 +578,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -686,7 +681,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -698,7 +692,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -811,7 +805,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -823,7 +816,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -926,7 +919,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -938,7 +930,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1052,7 +1044,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1064,7 +1055,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1168,7 +1159,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1180,7 +1170,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1295,7 +1285,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1307,7 +1296,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1411,7 +1400,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1423,7 +1411,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1538,7 +1526,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1550,7 +1537,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1654,7 +1641,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1666,7 +1652,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1781,7 +1767,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1793,7 +1778,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1897,7 +1882,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1909,7 +1893,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2024,7 +2008,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2036,7 +2019,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2140,7 +2123,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2152,7 +2134,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2267,7 +2249,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2279,7 +2260,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2383,7 +2364,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2395,7 +2375,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2510,7 +2490,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2522,7 +2501,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,7 +2605,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2638,7 +2616,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2753,7 +2731,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2765,7 +2742,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2869,7 +2846,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2881,7 +2857,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -86,7 +86,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -98,7 +97,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -201,7 +200,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -213,7 +211,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -91,7 +91,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -103,7 +102,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -206,7 +205,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -218,7 +216,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -331,7 +329,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -343,7 +340,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -446,7 +443,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -458,7 +454,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -571,7 +567,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -583,7 +578,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -686,7 +681,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -698,7 +692,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -811,7 +805,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -823,7 +816,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -926,7 +919,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -938,7 +930,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1052,7 +1044,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1064,7 +1055,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1168,7 +1159,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1180,7 +1170,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1295,7 +1285,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1307,7 +1296,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1411,7 +1400,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1423,7 +1411,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1538,7 +1526,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1550,7 +1537,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1654,7 +1641,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1666,7 +1652,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1781,7 +1767,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1793,7 +1778,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1897,7 +1882,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1909,7 +1893,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2024,7 +2008,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2036,7 +2019,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2140,7 +2123,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2152,7 +2134,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2267,7 +2249,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2279,7 +2260,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2383,7 +2364,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2395,7 +2375,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2510,7 +2490,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2522,7 +2501,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,7 +2605,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2638,7 +2616,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2753,7 +2731,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2765,7 +2742,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2869,7 +2846,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2881,7 +2857,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -87,7 +87,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -99,7 +98,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -198,7 +197,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -210,7 +208,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -316,7 +314,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -328,7 +325,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -428,7 +425,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -440,7 +436,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -547,7 +543,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -559,7 +554,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -659,7 +654,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -671,7 +665,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -777,7 +771,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -789,7 +782,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -888,7 +881,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -900,7 +892,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1006,7 +998,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1018,7 +1009,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1118,7 +1109,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1130,7 +1120,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1237,7 +1227,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1249,7 +1238,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1349,7 +1338,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1361,7 +1349,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1467,7 +1455,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1479,7 +1466,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1578,7 +1565,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1590,7 +1576,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1696,7 +1682,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1708,7 +1693,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1808,7 +1793,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1820,7 +1804,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -1927,7 +1911,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -1939,7 +1922,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2039,7 +2022,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2051,7 +2033,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2157,7 +2139,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2169,7 +2150,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2268,7 +2249,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2280,7 +2260,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2386,7 +2366,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2398,7 +2377,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2498,7 +2477,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2510,7 +2488,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2617,7 +2595,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2629,7 +2606,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder
@ -2729,7 +2706,6 @@ jobs:
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
@ -2741,7 +2717,7 @@ jobs:
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -106,7 +106,7 @@ jobs:
if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
fetch-depth: -1
@ -216,7 +216,7 @@ jobs:
# [see note: pytorch repo ref]
# deep clone (fetch-depth 0) required, to allow us to use git log
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
submodules: false
fetch-depth: 1

View File

@ -14,7 +14,7 @@ jobs:
if: ${{ github.repository == 'pytorch/pytorch' }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
with:
fetch-depth: 1
submodules: false

View File

@ -37,7 +37,7 @@ jobs:
run: echo "${TRIGGERING_WORKFLOW}"
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
- run: |
pip3 install requests==2.26

View File

@ -7,11 +7,11 @@
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/
ARG BASE_IMAGE=ubuntu:18.04
ARG BASE_IMAGE=ubuntu:20.04
ARG PYTHON_VERSION=3.8
FROM ${BASE_IMAGE} as dev-base
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
@ -82,15 +82,16 @@ ARG TRITON_VERSION
ARG TARGETPLATFORM
ARG CUDA_VERSION
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libjpeg-dev \
libpng-dev
libpng-dev \
&& rm -rf /var/lib/apt/lists/*
COPY --from=conda-installs /opt/conda /opt/conda
RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then \
apt install -y --no-install-recommends gcc; \
DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends gcc; \
rm -rf /var/lib/apt/lists/*; \
fi
RUN rm -rf /var/lib/apt/lists/*
ENV PATH /opt/conda/bin:$PATH
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility

View File

@ -25,6 +25,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
OP_DECOMPOSE(feature_dropout_);
}
void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
TORCH_CHECK(false, "mutating directly with `.data` under vmap transform is not allowed.");
}
TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE2(__and__, Scalar);
OP_DECOMPOSE2(__and__, Tensor);
@ -327,6 +331,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_tensor);
OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_float);
m.impl("_has_compatible_shallow_copy_type", torch::CppFunction::makeFromBoxedFunction<&unsupportedData>());
}
}}

View File

@ -836,6 +836,7 @@ Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t di
if (index.dim() == 0) {
return index.expand(self_size);
}
dim = maybe_wrap_dim(dim, self_size.size());
// setup new_index_shape as [BS, 1, ..., idx_size, ..., 1]
// to reshape index_
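
The added maybe_wrap_dim call normalizes negative dimension indices before the new index shape is built. A minimal standalone sketch of the wrapping rule it applies (hypothetical helper, not the ATen implementation):

#include <cstdint>
#include <stdexcept>

// Map dim in [-ndim, ndim-1] onto [0, ndim-1]; e.g. dim = -1 with ndim = 4 becomes 3.
int64_t wrap_dim(int64_t dim, int64_t ndim) {
  if (dim < -ndim || dim >= ndim) {
    throw std::out_of_range("dim out of range");
  }
  return dim < 0 ? dim + ndim : dim;
}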

View File

@ -103,6 +103,24 @@ const char* BatchedTensorImpl::tensorimpl_type_name() const {
return "BatchedTensorImpl";
}
c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const {
TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed");
return nullptr;
}
c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed");
return nullptr;
}
void BatchedTensorImpl::shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) {
TORCH_CHECK(false, "mutating directly with `.data` under vmap transform is not allowed.");
}
Tensor makeBatched(const Tensor& tensor, int64_t bdim, int64_t level) {
DispatchKeySet key_set = getKeysToPropagateToWrapper(tensor);
auto* batched = maybeGetBatchedImpl(tensor);
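
The shallow_copy_and_detach / shallow_copy_from overrides added above deliberately fail, so any path that would peel the batched wrapper off a tensor (as `.data` access does) errors instead of silently escaping the vmap transform. A minimal sketch of that "poisoned override" pattern using hypothetical types, not the ATen classes:

#include <memory>
#include <stdexcept>

struct Impl {
  virtual ~Impl() = default;
  // Default behavior: detaching hands out an ordinary shallow copy.
  virtual std::unique_ptr<Impl> shallow_copy_and_detach() const {
    return std::make_unique<Impl>(*this);
  }
};

struct BatchedImpl : Impl {
  // Poisoned override: refuse to produce an unwrapped copy under the transform.
  std::unique_ptr<Impl> shallow_copy_and_detach() const override {
    throw std::runtime_error("accessing `data` under vmap transform is not allowed");
  }
};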

View File

@ -71,6 +71,13 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
void set_size(int64_t dim, int64_t new_size) override;
void set_stride(int64_t dim, int64_t new_stride) override;
void set_storage_offset(int64_t storage_offset) override;
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const override;
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override;
void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override;
#ifdef DEBUG
bool has_storage() const override;
#endif

View File

@ -94,13 +94,12 @@ MPSDevice::MPSDevice(): _mtl_device(nil), _mtl_indexing_library(nil) {
bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
id mpsCD = NSClassFromString(@"MPSGraph");
static auto compileOptions = [[[MTLCompileOptions alloc] init] autorelease];
static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector(
sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES;
static bool _macos_13_2_plus = [mpsCD instancesRespondToSelector:@selector(convolution3DWithSourceTensor:weightsTensor:descriptor:name:)] == YES;
static bool _macos_13_3_plus = NO;
if (@available(macOS 13.3, *))
_macos_13_3_plus = YES;
static bool _macos_13_3_plus = [compileOptions respondsToSelector:@selector(maxTotalThreadsPerThreadgroup)] == YES;
switch (version) {
case MacOSVersion::MACOS_VER_13_0_PLUS: return _macos_13_0_plus;

View File

@ -54,8 +54,6 @@ TORCH_LIBRARY_IMPL(aten, MPS, m) {
m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in preprocessing by nn.Unfold
m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());

View File

@ -9,7 +9,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_mps_max_pool2d.h>
#include <ATen/ops/adaptive_avg_pool1d_native.h>
#include <ATen/ops/adaptive_avg_pool2d.h>
#include <ATen/ops/adaptive_max_pool1d_native.h>
@ -141,12 +140,6 @@ Tensor max_pool2d(
return at::mkldnn_max_pool2d(
self, kernel_size, stride, padding, dilation, ceil_mode);
}
#ifdef USE_MPS
if (self.is_mps()) {
return at::_mps_max_pool2d(
self, kernel_size, stride, padding, dilation, ceil_mode);
}
#endif
#if defined(C10_MOBILE)
if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride,
dilation, ceil_mode)) {

View File

@ -1428,7 +1428,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
}
#ifdef USE_MPS
if (_input.is_mps() && !bidirectional) {
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases,
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases,
num_layers, dropout_p, train, bidirectional, batch_first);
std::tuple<Tensor, Tensor, Tensor> return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output));
return return_values;

View File

@ -29,7 +29,7 @@ void spmm_reduce_kernel_impl(
const Tensor& values,
const Tensor& other_) {
int64_t nnz = other_.numel();
int64_t nnz = values.numel();
if (nnz == 0) {
return;
}
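
The fix above derives nnz from the sparse tensor's values buffer rather than from the dense operand. As a reminder of the CSR invariant it relies on, a hedged sketch with a hypothetical container (not the ATen sparse CSR implementation):

#include <cstdint>
#include <vector>

struct CsrMatrix {
  std::vector<int64_t> crow_indices;  // length rows + 1
  std::vector<int64_t> col_indices;   // length nnz
  std::vector<float>   values;        // length nnz: one entry per stored nonzero
  // nnz depends only on the stored values, never on the other operand's size.
  int64_t nnz() const { return static_cast<int64_t>(values.size()); }
};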

View File

@ -40,6 +40,10 @@ C10_DIAGNOSTIC_POP()
#include <ATen/ops/empty.h>
#endif
#ifdef __linux__
#include <dlfcn.h>
#endif
namespace at { namespace native {
namespace {
@ -62,6 +66,22 @@ uint8_t getAlignment(const Tensor &t) {
}
cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) {
#if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && CUDNN_MINOR > 5
// Workaround for a cuDNN error-handling deficiency that results in a crash on Ubuntu 22+
// if `libnvrtc.so` is not found on the system, even though it is strictly speaking not
// necessary for the use cases below
// See https://github.com/pytorch/pytorch/issues/97041
static C10_UNUSED auto cudnn_cnn_infer_handler = [] {
void *handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY);
char *err = dlerror();
if (!handle) {
TORCH_WARN("Attempt to open cnn_infer failed: handle=", handle, " error: ", err);
} else if (err) {
TORCH_WARN("Applied workaround for CuDNN issue, install nvrtc.so");
}
return handle;
}();
#endif
auto sizes = t.sizes();
auto strides = t.strides();
bool channels_last = memory_format == at::MemoryFormat::ChannelsLast ||
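
The workaround above preloads libcudnn_cnn_infer exactly once by initializing a function-local static from an immediately invoked lambda. A minimal sketch of that one-time-initialization idiom, assuming a Linux/glibc target and a hypothetical library name:

#include <dlfcn.h>
#include <cstdio>

void* ensure_helper_loaded() {
  // The lambda runs once, on first call, under C++11 thread-safe static initialization.
  static void* handle = [] {
    void* h = dlopen("libexample_helper.so", RTLD_LAZY);
    if (!h) {
      std::fprintf(stderr, "optional helper not loaded: %s\n", dlerror());
    }
    return h;
  }();
  return handle;
}
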
@ -153,8 +173,9 @@ cudnn_frontend::ExecutionPlan* find(const KeyType& key) {
return &(it->second);
}
void emplace(const KeyType& key, T& results) {
void update(const KeyType& key, T& results) {
std::lock_guard<std::mutex> guard(mutex);
engine_cache.erase(key);
engine_cache.emplace(key, std::move(results));
}
@ -548,7 +569,7 @@ void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKey& key, con
for (auto & plan : plans) {
try {
run_conv_plan(handle, x, y, w, plan);
benchmark_cache.emplace(key, plan);
benchmark_cache.update(key, plan);
return;
} catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {}
catch (c10::OutOfMemoryError &e) {
@ -562,7 +583,7 @@ void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFuse
for (auto & plan : plans) {
try {
run_conv_plan_fused(handle, x, y, w, z, b, plan);
benchmark_cache_fused.emplace(key, plan);
benchmark_cache_fused.update(key, plan);
return;
} catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {}
catch (c10::OutOfMemoryError &e) {
@ -583,7 +604,7 @@ bool try_configs(cudnn_frontend::EngineConfigList& configs, const std::string& o
continue;
}
run_conv_plan(handle, x, y, w, plan);
benchmark_cache.emplace(key, plan);
benchmark_cache.update(key, plan);
return true;
} catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {}
catch (c10::OutOfMemoryError &e) {
@ -604,7 +625,7 @@ bool try_configs_fused(cudnn_frontend::EngineConfigList& configs, const std::str
continue;
}
run_conv_plan_fused(handle, x, y, w, z, b, plan);
benchmark_cache_fused.emplace(key, plan);
benchmark_cache_fused.update(key, plan);
return true;
} catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {}
catch (c10::OutOfMemoryError &e) {
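
The emplace-to-update rename matters because std::unordered_map::emplace is a no-op when the key already exists, so a stale execution plan would never be replaced. A minimal sketch of the erase-then-emplace update under a lock, using hypothetical key/value types rather than the cuDNN frontend ones:

#include <mutex>
#include <string>
#include <unordered_map>

struct PlanCache {
  std::unordered_map<std::string, std::string> cache;
  std::mutex mutex;

  // Replace any existing entry; a plain emplace would silently keep the old one.
  void update(const std::string& key, std::string value) {
    std::lock_guard<std::mutex> guard(mutex);
    cache.erase(key);
    cache.emplace(key, std::move(value));
  }
};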

View File

@ -138,4 +138,7 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
constantValue:(double) constantValue
name:(NSString * _Nullable) name;
- (MPSGraphTensor * _Nonnull) truncateWithTensor:(MPSGraphTensor * _Nonnull) tensor
name:(NSString * _Nullable) name;
@end

View File

@ -265,7 +265,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSS
id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
bool sliceViewTensor = canSliceViewTensor(src, mpsShape);
// a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
Tensor emptyShell = Tensor();
// use "_tensor" from Placeholder to retain view's output during its usage in other ops
_tensor = gatherViewTensor(src, emptyShell);
@ -289,7 +289,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSS
} else {
if (!mpsShape) {
mpsShape = getMPSShape(_tensor);
}
}
_value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
shape:mpsShape

View File

@ -311,11 +311,25 @@ TORCH_IMPL_FUNC(log_softmax_mps_out) (
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
MPSGraphTensor* softmaxTensor = [mpsGraph softMaxWithTensor:inputTensor
axis:dim
name:nil];
MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:softmaxTensor
name:nil];
MPSGraphTensor* maximumsTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
axis:dim
name:nil];
MPSGraphTensor* inputTensorSubMax = [mpsGraph subtractionWithPrimaryTensor:inputTensor
secondaryTensor:maximumsTensor
name:nil];
MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:inputTensorSubMax
name:nil];
MPSGraphTensor* exponentTensorReduced = [mpsGraph reductionSumWithTensor:exponentTensor
axis:dim
name:nil];
MPSGraphTensor* logSumExpTensor = [mpsGraph logarithmWithTensor:exponentTensorReduced
name:nil];
MPSGraphTensor* outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensorSubMax
secondaryTensor:logSumExpTensor
name:nil];
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->outputTensor_ = outputTensor;
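
For reference, the reworked graph above computes log-softmax via the numerically stable log-sum-exp form instead of taking the log of an already-computed softmax; the standard identity it implements is

\log\operatorname{softmax}(x)_i = (x_i - m) - \log\sum_j e^{x_j - m}, \qquad m = \max_j x_j

Subtracting the row maximum keeps every exponent at or below zero, avoiding overflow and the log(0) that a saturated softmax can produce.
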
@ -1208,8 +1222,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
{
CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
MPSGraphTensor *gradOutputTensor_ = nil;
MPSGraphTensor *inputTensor_ = nil;
MPSGraphTensor *resultTensor_ = nil;
MPSGraphTensor *selfOrResultTensor_ = nil;
MPSGraphTensor *gradInputTensor_ = nil;
};
@ -1218,7 +1231,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
MPSStream* stream = getCurrentMPSStream();
@autoreleasepool {
string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" +
string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
to_string(alpha.to<double>()) + ":" +
to_string(scale.to<double>()) + ":" +
to_string(input_scale.to<double>()) + ":" +
@ -1235,18 +1248,14 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
newCachedGraph = new CachedGraph(mpsGraph);
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* inputTensor = nil;
MPSGraphTensor* resultTensor = nil;
MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* lessThanZeroGradTensor = nil;
if(is_result) {
resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[@1]
dataType:getMPSDataType(grad_output.scalar_type())];
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
secondaryTensor:alphaTensor
name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>();
@ -1258,11 +1267,10 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
name:nil];
}
else {
inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[@1]
dataType:getMPSDataType(grad_output.scalar_type())];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor
@ -1282,7 +1290,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
shape:@[@1]
dataType:getMPSDataType(grad_output.scalar_type())];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
@ -1294,8 +1302,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
name:nil];
newCachedGraph->gradOutputTensor_ = gradOutputTensor;
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->resultTensor_ = resultTensor;
newCachedGraph->selfOrResultTensor_ = selfOrResultTensor;
newCachedGraph->gradInputTensor_ = gradInputTensor;
}
return newCachedGraph;
@ -1304,28 +1311,14 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) (
}
Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
Placeholder selfPlaceholder = Placeholder();
Placeholder resultPlaceholder = Placeholder();
if(is_result)
resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp);
else
selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp);
Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
// Create dictionary of inputs and outputs
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil;
if(is_result)
feeds = @{
gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()
};
else
feeds = @{
gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
};
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData()
};
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
};
@ -1840,7 +1833,7 @@ std::tuple<Tensor, Tensor> prelu_backward_mps(const Tensor& grad_output, const T
using namespace mps;
Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous);
if (grad_output.numel() == 0) {
return std::tuple<Tensor, Tensor>{grad_input, weight_grad};
}

View File

@ -177,10 +177,6 @@ void div_mode_template(const Tensor& self, const Tensor& other,
c10::optional<c10::string_view> rounding_mode,
const Tensor& output, const string op_name)
{
if(rounding_mode.has_value() && *rounding_mode == "floor"){
TORCH_CHECK(self.scalar_type() != ScalarType::Long,
"MPS: does not support floor_divide op with int64 input");
}
BinaryOpBlock div_mode_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();
bool isFloatInput = ([primaryCastTensor dataType] & MPSDataTypeFloatBit) != 0;

View File

@ -12,7 +12,7 @@ Tensor& fill_scalar_mps_impl(Tensor& self, const Scalar& value) {
}
Tensor output = self;
bool needsCopyToOutput = false;
if (!self.is_contiguous()) {
if (!self.is_contiguous() || self.storage_offset()) {
output = empty_mps(self.sizes(), self.scalar_type(), c10::nullopt, kMPS);
needsCopyToOutput = true;
}
@ -89,7 +89,7 @@ bool fill_mps_tensor_(Tensor& self, uint8_t value) {
if (self.is_contiguous()) {
MPSStream* stream = getCurrentMPSStream();
auto storage_byte_offset = self.storage_offset() * self.itemsize();
stream->fill(mps::getMTLBufferStorage(self), 0, self.nbytes(), storage_byte_offset);
stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset);
return true;
}
return false;

View File

@ -56,15 +56,17 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
descriptor_.groups = groups;
}
Tensor _mps_convolution(
Tensor _mps_convolution_impl(
const Tensor& input_t,
const Tensor& weight_t,
const c10::optional<Tensor>& bias_opt,
IntArrayRef padding,
IntArrayRef stride,
IntArrayRef dilation,
int64_t groups) {
int64_t groups,
c10::optional<IntArrayRef> input_shape) {
TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS");
TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
namespace native_mps = at::native::mps;
CheckedFrom c = "mps_convolution";
@ -83,6 +85,8 @@ Tensor _mps_convolution(
auto memory_format = input_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
auto output_t = at::empty(
input_shape.has_value() ?
input_shape.value() :
conv_output_size(input->sizes(), weight->sizes(),
padding, stride, dilation),
input->scalar_type(),
@ -237,21 +241,30 @@ Tensor _mps_convolution(
return *output;
}
Tensor _mps_convolution(
const Tensor& input_t,
const Tensor& weight_t,
const c10::optional<Tensor>& bias_opt,
IntArrayRef padding,
IntArrayRef stride,
IntArrayRef dilation,
int64_t groups) {
return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt);
}
Tensor mps_convolution_backward_input(
IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
namespace native_mps = at::native::mps;
using namespace mps;
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
CheckedFrom c = "mps_convolution_backward_input";
TensorArg grad_output{ grad_output_, "grad_output", 1 },
weight{ weight_, "weight", 2 };
TensorArg grad_output{ grad_output_t, "grad_output", 1 },
weight{ weight_t, "weight", 2 };
checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight});
auto memory_format = grad_output_.suggest_memory_format();
auto memory_format = grad_output_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
Tensor grad_output_t = grad_output_.contiguous(memory_format);
Tensor weight_t = weight_.contiguous(memory_format);
MPSShape* weightShape = getMPSShape(weight_);
auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
// Avoid "grad_input" when this is being used as transposed convolution
@ -327,10 +340,10 @@ Tensor mps_convolution_backward_input(
}
MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape);
MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
if (is_channels_last) {
gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
}
MPSGraphTensor* gradInputTensor;
@ -359,7 +372,7 @@ Tensor mps_convolution_backward_input(
}
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@ -377,17 +390,15 @@ Tensor mps_convolution_backward_input(
}
Tensor mps_convolution_backward_weights(
IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_,
IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
namespace native_mps = at::native::mps;
using namespace mps;
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
CheckedFrom c = "mps_convolution_backward_weights";
auto memory_format = input_.suggest_memory_format();
auto memory_format = grad_output_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
auto grad_output_t = grad_output_.to(memory_format);
auto input_t = input_.to(memory_format);
MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);
// For uniformity with everything else, although it seems grad_weight
@ -475,7 +486,7 @@ Tensor mps_convolution_backward_weights(
MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
if (is_channels_last) {
gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
}
@ -525,12 +536,9 @@ Tensor mps_convolution_backward_weights(
}
std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward(
const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
std::array<bool,3> output_mask) {
Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
Tensor grad_input, grad_weight, grad_bias;
if (input.numel() == 0) {
if (output_mask[0]) {
@ -576,10 +584,10 @@ Tensor _mps_convolution_transpose(
Tensor mps_convolution_transpose_backward_input(
const Tensor& grad_output_t, const Tensor& weight_t,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups)
int64_t groups, IntArrayRef input_shape)
{
return at::_mps_convolution(
grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups);
return _mps_convolution_impl(
grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape);
}
Tensor mps_convolution_transpose_backward_weight(
@ -595,15 +603,12 @@ Tensor mps_convolution_transpose_backward_weight(
std::tuple<Tensor,Tensor> mps_convolution_transpose_backward(
const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
const Tensor& input, const Tensor& grad_output, const Tensor& weight,
IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
std::array<bool,2> output_mask) {
Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
Tensor grad_input, grad_weight;
if (output_mask[0]) {
grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups);
grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
}
if (output_mask[1]) {
grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups);
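For context, a minimal sketch (assumption: MPS-capable machine) of a transposed-convolution backward, the path that now threads the original input shape into _mps_convolution_impl:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
conv_t = torch.nn.ConvTranspose2d(4, 8, kernel_size=3, stride=2).to(device)
x = torch.randn(1, 4, 7, 7, device=device, requires_grad=True)
y = conv_t(x)
y.sum().backward()     # backward-input receives input.sizes() explicitly (see hunk above)
print(x.grad.shape)    # torch.Size([1, 4, 7, 7])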

View File

@ -251,8 +251,11 @@ static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, boo
bool returnGatherOutput = dst_.is_contiguous();
Tensor src;
auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format());
const bool sameDataType = src_.dtype() == dst_.dtype();
if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) ||
// the copy_cast path requires storage_offset to be applied before casting
(src_.storage_offset() && !sameDataType)) {
Tensor emptyShell = Tensor();
src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell);
@ -282,7 +285,7 @@ static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, boo
src._set_neg(src_.is_neg());
const size_t src_size = src.nbytes();
if (src.dtype() == dst_.dtype()) {
if (sameDataType) {
MPSStream* stream = getCurrentMPSStream();
// for GPU to GPU copies we only encode to stream's command buffer (no flushing)
stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset);
@ -297,22 +300,27 @@ at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking)
TORCH_CHECK(dst.defined(), "dst is undefined");
TORCH_CHECK(src.defined(), "src is undefined");
bool needs_broadcasting = false;
if (src.numel() == 0 || dst.is_same(src)) {
return dst;
}
if (dst.numel() == 0) {
dst.resize_as_(src);
}
if (dst.dim() > src.dim()) {
needs_broadcasting = true;
}
if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) {
return copy_from_mps_(dst, src, non_blocking);
return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
}
if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) {
return copy_to_mps_(dst, src, non_blocking);
return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
}
if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) {
return copy_kernel_mps(dst, src, non_blocking);
return copy_kernel_mps(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
}
TORCH_INTERNAL_ASSERT(
src.device().type() == DeviceType::MPS,
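Illustrative only: the broadcasting that the needs_broadcasting branches above add to mps_copy_, sketched at the Python level (assumes an MPS device; the CPU-to-MPS path is shown).

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
dst = torch.empty(2, 3, device=device)
src = torch.tensor([1.0, 2.0, 3.0])   # CPU tensor with fewer dims than dst
dst.copy_(src)                        # src is expanded to dst's shape before the copy
print(dst)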

View File

@ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps(
MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
MPSGraphTensor* castGradTensor = incomingGradTensor;
MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type());
// issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16
if (dataType == MPSDataTypeFloat16) {
castGradTensor = [mpsGraph castTensor: incomingGradTensor
toType: MPSDataTypeFloat32
name: @"castGradTensor"];
}
if (num_indices_dims != 0) {
reshapedIndicesTensor = [mpsGraph expandDimsOfTensor: indicesTensor
axes: @[@-1]
name: nil];
}
auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor
auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor
indicesTensor: reshapedIndicesTensor
shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape))
batchDimensions: 0
mode: MPSGraphScatterModeAdd
name: @"edb"];
if (dataType == MPSDataTypeFloat16) {
outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
toType: MPSDataTypeFloat16
name: @"castGradTensor"];
}
newCachedGraph->incomingGradTensor_ = incomingGradTensor;
newCachedGraph->indicesTensor_ = indicesTensor;
newCachedGraph->outgoingGradTensor_ = outgoingGradTensor;
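A small sketch (not part of the diff) of the float16 embedding backward that the cast-to-float32 workaround above targets; it assumes an MPS device.

import torch
import torch.nn.functional as F

device = "mps" if torch.backends.mps.is_available() else "cpu"
weight = torch.randn(10, 4, device=device, dtype=torch.float16, requires_grad=True)
idx = torch.tensor([1, 2, 1, 0], device=device)
out = F.embedding(idx, weight)
out.sum().backward()     # embedding_dense_backward scatters/accumulates per-index grads
print(weight.grad[1])    # index 1 occurs twice, so its row accumulates two contributions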

View File

@ -609,11 +609,9 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_mps
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":"
+ std::to_string(train) + ":"
+ std::to_string(has_running_mean) + ":"
+ std::to_string(has_weight) + ":"
+ [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(input.scalar_type());
string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":" +
std::to_string(train) + ":" + std::to_string(has_running_mean) + ":" + std::to_string(has_weight) + ":" +
[ns_shape_key UTF8String] + ":" + c10::Join(",", grad_input_mask) + ":" + native_mps::getMPSTypeString(input.scalar_type());
auto input_mps_dtype = native_mps::getMPSDataType(input.scalar_type());
CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));

View File

@ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
auto output_memory_format = output.suggest_memory_format();
// the output and indices are 'empty', so we can avoid an unnecessary gatherView on empty tensors
// by simply restriding them (instead of calling the costly Contiguous()).
if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) {
@ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
outputSizes.insert(outputSizes.begin(), nbatch);
}
output.resize_(outputSizes);
} else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) {
} else if (output_memory_format == MemoryFormat::ChannelsLast) {
output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
output_memory_format = MemoryFormat::Contiguous;
}
if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) {
@ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
}
runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results);
if (output_memory_format != suggested_memory_format) {
const_cast<Tensor&>(output) = output.to(suggested_memory_format);
}
}
}
@ -302,7 +308,7 @@ static void avg_pool2d_template(const Tensor& input, const Tensor& output,
} // namespace mps
Tensor _mps_max_pool2d(
Tensor mps_max_pool2d(
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -356,6 +362,8 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)(
const Tensor& output,
const Tensor& indices) {
auto indices_memory_format = indices.suggest_memory_format();
mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
MPSGraph* mpsGraph = cachedGraph.graph();
NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor
@ -366,6 +374,10 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)(
};
mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride,
padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices");
if (indices_memory_format == MemoryFormat::ChannelsLast) {
const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
}
}
TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)(
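For context (illustrative, assuming an MPS device): max pooling with indices on a channels_last input, the combination whose memory formats the hunks above now restore after running the graph.

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.randn(1, 3, 8, 8, device=device).to(memory_format=torch.channels_last)
out, idx = torch.nn.functional.max_pool2d(x, kernel_size=2, return_indices=True)
print(out.is_contiguous(memory_format=torch.channels_last))
print(idx.shape)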

View File

@ -139,6 +139,10 @@ void reduction_out_mps(
MPSReductionType reduction_type,
const std::string& func_name) {
// issue 103641234, reduction ops do not have int64 support
if (input_t.scalar_type() == ScalarType::Long) {
TORCH_WARN_ONCE("MPS: no support for int64 reduction ops, casting it to int32");
}
IntArrayRef input_shape = input_t.sizes();
if (opt_dim.has_value()) {
@ -163,6 +167,9 @@ void reduction_out_mps(
if (reduction_type == MPSReductionType::PROD) {
output_t.fill_(1);
}
else if (reduction_type == MPSReductionType::SUM) {
output_t.zero_();
}
return;
}
@ -197,7 +204,10 @@ void reduction_out_mps(
(dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
inputCastDtype = getMPSDataType(dtype.value());
} else if (input_type != MPSDataTypeInt32 &&
input_type != MPSDataTypeFloat32) {
input_type != MPSDataTypeFloat32 &&
input_type != MPSDataTypeFloat16) {
inputCastDtype = MPSDataTypeFloat32;
} else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) {
inputCastDtype = MPSDataTypeFloat32;
}
@ -241,7 +251,7 @@ void reduction_out_mps(
axes:wrappedAxes
name:nil];
} else if (reduction_type == MPSReductionType::TRACE) {
MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor
numLower:0
numUpper:0
name:nil];
@ -1257,7 +1267,9 @@ Tensor min_max_mps
(const Tensor& input_t,
MPSReductionType reduction_type,
const std::string& func_name) {
TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 min/max ops, casting it to int32");
if (input_t.scalar_type() == ScalarType::Long) {
TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32");
}
using CachedGraph = MPSUnaryCachedGraph;
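As a user-level sketch (assumption: MPS device) of the two behaviours touched above — int64 reductions are cast to int32 with a one-time warning, and empty-input sums are zero-filled:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.arange(6, device=device).reshape(2, 3)    # int64 input
print(x.sum(dim=1))                                 # cast to int32 internally; a single warning is emitted
print(x.min(), x.max())                             # same int64 caveat for min/max
print(torch.empty(0, device=device).sum())          # empty-input sum is explicitly zeroed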

View File

@ -233,7 +233,7 @@ Tensor repeat_interleave_mps(const Tensor& repeat_, c10::optional<int64_t> outpu
if (repeat.scalar_type() == kLong) {
// #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output,
// which currently doesn't support int64_t as input. Internally, the indices are cast to int32_t.
TORCH_WARN_ONCE(false, "MPS: no support for int64 repeats mask, casting it to int32");
TORCH_WARN_ONCE("MPS: no support for int64 repeats mask, casting it to int32");
repeat = repeat.to(kInt);
}
AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
@ -243,4 +243,4 @@ Tensor repeat_interleave_mps(const Tensor& repeat_, c10::optional<int64_t> outpu
return output;
}
} // namespace at::native
} // namespace at::native
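Illustrative only (assumes an MPS device): int64 repeats are what the warning above covers; they are cast to int32 before the cumsum.

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
values = torch.tensor([10, 20, 30], device=device)
repeats = torch.tensor([2, 1, 3], device=device)    # int64 repeats -> cast to int32 internally
print(torch.repeat_interleave(values, repeats))     # tensor([10, 10, 20, 30, 30, 30])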

View File

@ -23,17 +23,31 @@ std::vector<long long> getTensorShape(MPSGraphTensor* mpsTensor) {
return output_dimensions;
}
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
using namespace mps;
//Projections are not currently supported, raise an error if needed
bool has_projections = (hx[0].size(2) != hx[1].size(2));
if(has_projections) {
AT_ERROR("LSTM with projections is not currently supported with MPS.");
}
TORCH_CHECK(!(!is_macos_13_or_newer() && num_layers > 1), "Multi-layer LSTM support in MPS available only on MacOS 13 onwards");
std::vector<Tensor> kernel_weights;
std::vector<Tensor> recurrent_kernel_weights;
std::vector<Tensor> biases;
std::vector<Tensor> recurrent_biases;
for (size_t i = 0; i < num_layers; i+=1) {
kernel_weights.push_back(params[i*4]);
recurrent_kernel_weights.push_back(params[i*4+1]);
biases.push_back(params[i*4+2]);
recurrent_biases.push_back(params[i*4+3]);
if (has_biases) {
kernel_weights.push_back(params[i*4]);
recurrent_kernel_weights.push_back(params[i*4+1]);
biases.push_back(params[i*4+2]);
recurrent_biases.push_back(params[i*4+3]);
} else {
kernel_weights.push_back(params[i*2]);
recurrent_kernel_weights.push_back(params[i*2+1]);
}
}
struct CachedGraph : public MPSCachedGraph {
@ -44,8 +58,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil;
NSMutableArray<MPSGraphTensor*> *biasList_ = nil;
NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil;
std::vector<MPSGraphTensor*> outputCellStateFwdVector_;
std::vector<MPSGraphTensor*> outputZStateVector_;
};
MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@ -67,12 +79,15 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()];
NSMutableArray<MPSGraphTensor*> *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()];
NSMutableArray<MPSGraphTensor*> *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()];
NSMutableArray<MPSGraphTensor*> *layersOutputsList = [[NSMutableArray alloc] initWithCapacity:num_layers];
for (size_t i = 0; i < num_layers; i += 1) {
[kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
[recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
[kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
[recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
if(has_biases) {
[kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
[recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
}
}
MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor];
@ -93,25 +108,28 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
}
MPSGraphTensor* inputTensor_ = inputTensor;
MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
dimension:0
start:0
length:1
name:nil];
MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
dimension:0
start:0
length:1
name:nil];
NSArray<MPSGraphTensor*>* outputs = nil;
NSMutableArray<MPSGraphTensor*>* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
for(int i = 0; i < num_layers; i++) {
MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
secondaryTensor:recurrentBiasList[i]
name:nil];
MPSGraphTensor* biasTensor = nil;
if(has_biases) {
biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
secondaryTensor:recurrentBiasList[i]
name:nil];
}
MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
dimension:0
start:i
length:1
name:nil];
MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
dimension:0
start:i
length:1
name:nil];
outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_
recurrentWeight:recurrentKernelWeightsList[i]
inputWeight:kernelWeightsList[i]
@ -121,18 +139,14 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
descriptor:opDesc
name:nil];
stateTensor_ = [mpsGraph sliceTensor:stateTensor
dimension:0
start:i
length:1
name:nil];
cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
dimension:0
start:i
length:1
name:nil];
inputTensor_ = [outputs objectAtIndex:0];
// no need to keep a final layer output copy as it is
// returned anyway and not used in backprop
if(i != num_layers - 1) {
[layersOutputsList addObject:[mpsGraph expandDimsOfTensor:inputTensor_
axis:0
name:nil]];
}
if(dropout_p>0.0 && train && (i!=num_layers-1)) {
inputTensor_ = [mpsGraph dropoutTensor:inputTensor_
rate:dropout_p
@ -150,7 +164,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
name:nil]];
}
MPSGraphTensor* outputTensor = [outputs objectAtIndex:0];
MPSGraphTensor* outputTensor = inputTensor_;
if (batch_first) {
outputTensor = [mpsGraph transposeTensor:outputTensor
dimension:0
@ -169,8 +183,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray
dimension:0
name:nil];
MPSGraphTensor* layersOutputs = (num_layers > 1)
? [mpsGraph concatTensors:layersOutputsList dimension:0 name:nil]
: nil;
std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd};
std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd, layersOutputs};
newCachedGraph->inputTensors_ = inputTensors;
newCachedGraph->outputTensors_ = outputTensors;
newCachedGraph->kernelWeightsList_ = kernelWeightsList;
@ -188,20 +205,20 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
NSMutableArray<MPSGraphTensor*> *biasList = cachedGraph->biasList_;
NSMutableArray<MPSGraphTensor*> *recurrentBiasList = cachedGraph->recurrentBiasList_;
Placeholder kernelWeight;
Placeholder recurrentKernelWeight;
Placeholder bias;
Placeholder recurrentBias;
Placeholder kernelWeight, recurrentKernelWeight, bias, recurrentBias;
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease];
for (size_t i = 0; i < num_layers; i+=1) {
kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
bias = Placeholder([biasList objectAtIndex:i], biases[i]);
recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
[feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
[feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
[feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
[feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
if(has_biases) {
bias = Placeholder([biasList objectAtIndex:i], biases[i]);
recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
[feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
[feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
}
}
Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input);
@ -218,6 +235,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
Tensor cy = at::empty_like(hx[1], input.options());
Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options());
Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options());
Tensor layerOutputs = (num_layers > 1)
? at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[5])), input.options())
: at::empty({ 1 }, input.options()); // not used if num_layers == 1
Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output);
Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy);
@ -225,20 +245,25 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input
Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState);
Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd);
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = [@{
outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(),
outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(),
outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(),
outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(),
outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData()
};
outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData(),
} mutableCopy];
if (num_layers > 1) {
Placeholder outputPlaceholder5 = Placeholder(cachedGraph->outputTensors_[5], layerOutputs);
[results setObject:outputPlaceholder5.getMPSGraphTensorData() forKey: outputPlaceholder5.getMPSGraphTensor()];
}
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
return std::make_tuple(output, hy, cy, zState, cellStateFwd);
return std::make_tuple(output, hy, cy, zState, cellStateFwd, layerOutputs);
}
}
std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, const Tensor& layersOutputs, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
using namespace mps;
const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] {return Tensor();});
const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] {return Tensor();});
@ -250,10 +275,15 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
std::vector<Tensor> biases;
std::vector<Tensor> recurrent_biases;
for (size_t i = 0; i < num_layers; i+=1) {
kernel_weights.push_back(params[i*4]);
recurrent_kernel_weights.push_back(params[i*4+1]);
biases.push_back(params[i*4+2]);
recurrent_biases.push_back(params[i*4+3]);
if(has_biases) {
kernel_weights.push_back(params[i*4]);
recurrent_kernel_weights.push_back(params[i*4+1]);
biases.push_back(params[i*4+2]);
recurrent_biases.push_back(params[i*4+3]);
} else {
kernel_weights.push_back(params[i*2]);
recurrent_kernel_weights.push_back(params[i*2+1]);
}
}
struct CachedGraph : public MPSCachedGraph {
@ -264,12 +294,12 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil;
NSMutableArray<MPSGraphTensor*> *biasList_ = nil;
NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil;
NSMutableArray<MPSGraphTensor*> *gradOutput_ = nil;
NSMutableArray<MPSGraphTensor*> *gradRecWeights_ = nil;
NSMutableArray<MPSGraphTensor*> *gradWeights_ = nil;
NSMutableArray<MPSGraphTensor*> *gradBias_ = nil;
NSMutableArray<MPSGraphTensor*> *gradState_ = nil;
NSMutableArray<MPSGraphTensor*> *gradCellState_ = nil;
MPSGraphTensor* gradOutput_ = nil;
MPSGraphTensor* gradState_ = nil;
MPSGraphTensor* gradCellState_ = nil;
};
MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@ -296,8 +326,10 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
for (size_t i = 0; i < num_layers; i += 1) {
[kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
[recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
[kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
[recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
if(has_biases) {
[kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
[recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
}
}
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input));
@ -308,8 +340,22 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
MPSGraphTensor* gradientCyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_cy.scalar_type()), getMPSShape(grad_cy));
MPSGraphTensor* gradientHyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_hy.scalar_type()), getMPSShape(grad_hy));
MPSGraphTensor* cellStateFwdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(cell_state_fwd.scalar_type()), getMPSShape(cell_state_fwd));
MPSGraphTensor* layersOutputsTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(layersOutputs.scalar_type()), getMPSShape(layersOutputs));
std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor, layersOutputsTensor};
if (batch_first) {
inputTensor = [mpsGraph transposeTensor: inputTensor
dimension: 0
withDimension: 1
name: nil];
gradientTensor = [mpsGraph transposeTensor: gradientTensor
dimension: 0
withDimension: 1
name: nil];
}
std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor};
newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList;
newCachedGraph->kernelWeightsList_ = kernelWeightsList;
newCachedGraph->biasList_ = kernelBiasList;
@ -325,7 +371,6 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
NSArray<MPSGraphTensor*>* outputs = nil;
NSMutableArray<MPSGraphTensor*>* gradOutputArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* gradRecWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* gradWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
NSMutableArray<MPSGraphTensor*>* gradBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
@ -349,9 +394,15 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd
axis:0
name:nil];
MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
secondaryTensor:recurrentBiasList[i]
name:nil];
MPSGraphTensor* biasTensor = nil;
if(has_biases) {
biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
secondaryTensor:recurrentBiasList[i]
name:nil];
} else {
biasTensor = [mpsGraph constantWithScalar:0.0
dataType:inputTensor.dataType];
}
MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
dimension:0
@ -375,7 +426,23 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
length:1
name:nil];
outputs = [mpsGraph LSTMGradientsWithSourceTensor: inputTensor
MPSGraphTensor* iterationInputTensor_ = nil;
if (i == 0) {
iterationInputTensor_ = inputTensor;
} else {
iterationInputTensor_ = [mpsGraph sliceTensor:layersOutputsTensor
dimension: 0
// last element in layersOutputsTensor contains
// **inputs** for the last layer
start: i - num_layers
length: 1
name: nil];
iterationInputTensor_ = [mpsGraph squeezeTensor:iterationInputTensor_
axis:0
name: nil];
}
outputs = [mpsGraph LSTMGradientsWithSourceTensor: iterationInputTensor_
recurrentWeight: recurrentKernelWeightsList[i]
sourceGradient: gradientTensor_
zState: zState
@ -391,24 +458,31 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
descriptor: opDesc
name: nil];
gradientTensor_ = [outputs objectAtIndex:0];
[gradOutputArray addObject:[outputs objectAtIndex:0]];
[gradRecWeightsArray addObject:[outputs objectAtIndex:1]];
[gradWeightsArray addObject:[outputs objectAtIndex:2]];
[gradBiasArray addObject:[outputs objectAtIndex:3]];
[gradStateArray addObject:[outputs objectAtIndex:4]];
[gradCellStateArray addObject:[outputs objectAtIndex:5]];
[gradRecWeightsArray insertObject:[outputs objectAtIndex:1] atIndex:0];
[gradWeightsArray insertObject:[outputs objectAtIndex:2] atIndex:0];
[gradBiasArray insertObject: [outputs objectAtIndex:3] atIndex:0];
[gradStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:4] axis:0 name:nil] atIndex:0];
[gradCellStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:5] axis:0 name:nil] atIndex:0];
}
std::vector<MPSGraphTensor*> outputTensors = {[outputs objectAtIndex:0],[outputs objectAtIndex:1],[outputs objectAtIndex:2],[outputs objectAtIndex:3], [outputs objectAtIndex:4], [outputs objectAtIndex:5]};
if (batch_first) {
MPSGraphTensor* gradientTensorTransposed = [mpsGraph transposeTensor:gradientTensor_
dimension: 0
withDimension: 1
name:nil];
newCachedGraph->gradOutput_ = gradientTensorTransposed;
} else {
newCachedGraph->gradOutput_ = gradientTensor_;
}
newCachedGraph->outputTensors_ = outputTensors;
newCachedGraph->gradOutput_ = gradOutputArray;
newCachedGraph->gradRecWeights_ = gradRecWeightsArray;
newCachedGraph->gradWeights_ = gradWeightsArray;
newCachedGraph->gradBias_ = gradBiasArray;
newCachedGraph->gradState_ = gradStateArray;
newCachedGraph->gradCellState_ = gradCellStateArray;
newCachedGraph->gradState_ = [mpsGraph concatTensors:gradStateArray dimension: 0 name: nil];
newCachedGraph->gradCellState_ = [mpsGraph concatTensors:gradCellStateArray dimension: 0 name: nil];
}
return newCachedGraph;
});
@ -423,6 +497,7 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
Placeholder cellStateFwdPlaceholder = Placeholder(cachedGraph->inputTensors_[5], cell_state_fwd);
Placeholder gradientHyPlaceholder = Placeholder(cachedGraph->inputTensors_[6], grad_hy);
Placeholder gradientCyPlaceholder = Placeholder(cachedGraph->inputTensors_[7], grad_cy);
Placeholder layersOutputsPlaceholder = Placeholder(cachedGraph->inputTensors_[8], layersOutputs);
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease];
[feeds setObject:gradientPlaceholder.getMPSGraphTensorData() forKey:gradientPlaceholder.getMPSGraphTensor()];
@ -433,6 +508,7 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
[feeds setObject:cellStatePlaceholder.getMPSGraphTensorData() forKey:cellStatePlaceholder.getMPSGraphTensor()];
[feeds setObject:zStatePlaceholder.getMPSGraphTensorData() forKey:zStatePlaceholder.getMPSGraphTensor()];
[feeds setObject:cellStateFwdPlaceholder.getMPSGraphTensorData() forKey:cellStateFwdPlaceholder.getMPSGraphTensor()];
[feeds setObject:layersOutputsPlaceholder.getMPSGraphTensorData() forKey:layersOutputsPlaceholder.getMPSGraphTensor()];
NSMutableArray<MPSGraphTensor*> *kernelWeightsList = cachedGraph->kernelWeightsList_;
NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_;
@ -445,68 +521,65 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
for (size_t i = 0; i < num_layers; i+=1) {
kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
bias = Placeholder([biasList objectAtIndex:i], biases[i]);
recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
[feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
[feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
[feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
[feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
if(has_biases) {
bias = Placeholder([biasList objectAtIndex:i], biases[i]);
recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
[feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
[feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
}
}
Tensor output = at::empty_like(input);
Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]);
Tensor grad_weights = at::empty_like(kernel_weights[0]);
Tensor grad_bias = at::empty_like(biases[0]);
Tensor grad_state = at::empty_like(hx[0]);
Tensor grad_cell_state = at::empty_like(hx[1]);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensors_[0], output);
Placeholder gradRecWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[1], grad_rec_weights);
Placeholder gradWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[2], grad_weights);
Placeholder gradBiasPlaceholder = Placeholder(cachedGraph->outputTensors_[3], grad_bias);
Placeholder gradStatePlaceholder = Placeholder(cachedGraph->outputTensors_[4], grad_state);
Placeholder gradCellStatePlaceholder = Placeholder(cachedGraph->outputTensors_[5], grad_cell_state);
Tensor output_out = at::empty_like(input);
Tensor grad_state_out = at::empty_like(hx[0]);
Tensor grad_cell_state_out = at::empty_like(hx[1]);
std::vector<Tensor> grad_hx = {grad_state, grad_cell_state};
std::vector<Tensor> grad_hx = {grad_state_out, grad_cell_state_out};
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *results = [[[NSMutableDictionary alloc] init] autorelease];
NSMutableArray<MPSGraphTensor*> *gradOutputArray = cachedGraph->gradOutput_;
NSMutableArray<MPSGraphTensor*> *gradRecWeightsArray = cachedGraph->gradRecWeights_;
NSMutableArray<MPSGraphTensor*> *gradWeightsArray = cachedGraph->gradWeights_;
NSMutableArray<MPSGraphTensor*> *gradBiasArray = cachedGraph->gradBias_;
NSMutableArray<MPSGraphTensor*> *gradStateArray = cachedGraph->gradState_;
NSMutableArray<MPSGraphTensor*> *gradCellStateArray = cachedGraph->gradCellState_;
Placeholder gradOutPlaceholder;
MPSGraphTensor* gradOutput = cachedGraph->gradOutput_;
MPSGraphTensor* gradState = cachedGraph->gradState_;
MPSGraphTensor* gradCellState = cachedGraph->gradCellState_;
Placeholder gradStatePlaceholder = Placeholder(gradState, grad_state_out);
Placeholder gradCellStatePlaceholder = Placeholder(gradCellState, grad_cell_state_out);
Placeholder outputPlaceholder = Placeholder(gradOutput, output_out);
[results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
[results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
[results setObject:outputPlaceholder.getMPSGraphTensorData() forKey:outputPlaceholder.getMPSGraphTensor()];
Placeholder gradRecWeightsPlaceholder, gradWeightsPlaceholder, gradBiasPlaceholder;
std::vector<Tensor> weights;
for (int i = 0; i < num_layers; i++) {
Tensor output = at::empty_like(input);
Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]);
Tensor grad_weights = at::empty_like(kernel_weights[i]);
Tensor grad_bias = at::empty_like(biases[i]);
Tensor grad_state = at::empty_like(hx[0]);
Tensor grad_cell_state = at::empty_like(hx[1]);
Tensor grad_bias = at::empty((kernel_weights[i].size(0)), kernel_weights[i].options());
weights.push_back(grad_weights);
weights.push_back(grad_rec_weights);
weights.push_back(grad_bias);
weights.push_back(grad_bias);
gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output);
gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights);
gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights);
gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex:i], grad_bias);
gradStatePlaceholder = Placeholder([gradStateArray objectAtIndex:i], grad_state);
gradCellStatePlaceholder = Placeholder([gradCellStateArray objectAtIndex:i], grad_cell_state);
[results setObject:gradOutPlaceholder.getMPSGraphTensorData() forKey:gradOutPlaceholder.getMPSGraphTensor()];
[results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
if(has_biases) {
weights.push_back(grad_bias);
weights.push_back(grad_bias);
}
gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex: i], grad_rec_weights);
gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex: i], grad_weights);
gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex: i], grad_bias);
[results setObject:gradBiasPlaceholder.getMPSGraphTensorData() forKey:gradBiasPlaceholder.getMPSGraphTensor()];
[results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
[results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
[results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
[results setObject:gradWeightsPlaceholder.getMPSGraphTensorData() forKey:gradWeightsPlaceholder.getMPSGraphTensor()];
}
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output, grad_hx, weights);
return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output_out, grad_hx, weights);
}
}
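A user-level sketch (not part of the diff) of the configurations these hunks enable or fix: bias-less and multi-layer LSTMs on MPS, with the per-layer outputs now carried into the backward pass. It assumes an MPS device and, for num_layers > 1, macOS 13 or newer.

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
lstm = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2,
                     bias=False, batch_first=True).to(device)
x = torch.randn(4, 5, 8, device=device, requires_grad=True)
out, (h, c) = lstm(x)
out.sum().backward()                  # exercises lstm_mps_backward across both layers
print(out.shape, h.shape, c.shape)    # (4, 5, 16), (2, 4, 16), (2, 4, 16)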

View File

@ -35,7 +35,9 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
indices.copy_(cpu_indices);
return;
}
TORCH_WARN_ONCE(self.scalar_type() != ScalarType::Long, "MPS: no support for int64 min/max ops, casting it to int32");
if (self.scalar_type() == ScalarType::Long) {
TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32");
}
MPSStream* stream = getCurrentMPSStream();
struct CachedGraph : public MPSCachedGraph {

View File

@ -75,15 +75,20 @@ MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor)
return inputTensor;
}
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
dataType:inputTensor.dataType];
MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];
return [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil]
falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil]
name:nil];
if(!is_macos_13_or_newer()) {
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
dataType:inputTensor.dataType];
MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];
return [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil]
falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil]
name:nil];
} else {
return [mpsGraph truncateWithTensor:inputTensor
name:nil];
}
};
} // namespace mps
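For reference, the pre-macOS-13 fallback above implements truncation as a select between ceil (negative inputs) and floor (non-negative inputs); a quick sketch of that identity:

import torch

x = torch.tensor([-2.7, -0.5, 0.5, 2.7])
emulated = torch.where(x < 0, torch.ceil(x), torch.floor(x))  # the fallback's select pattern
print(torch.equal(emulated, torch.trunc(x)))                  # True: both round toward zero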

View File

@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input,
} else {
native::upsample_2d_common_check(input.sizes(), output_size);
}
Tensor out;
if (!output.is_contiguous()) {
out = at::empty_like(output, MemoryFormat::Contiguous);
}
bool centerResults = false;
MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input,
MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false);
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
};
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
if (out.has_storage()) {
output.copy_(out);
}
}
}
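Illustrative only (assumes an MPS device): a resize whose output is not default-contiguous — e.g. a channels_last input, whose layout the output typically inherits — now goes through the temporary contiguous buffer added above.

import torch
import torch.nn.functional as F

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.randn(1, 3, 8, 8, device=device).to(memory_format=torch.channels_last)
y = F.interpolate(x, scale_factor=2, mode="nearest")
print(y.shape, y.is_contiguous())   # (1, 3, 16, 16), False in the default memory format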

View File

@ -424,22 +424,54 @@ MPSGraphTensor* asStridedLayer_pattern(MPSGraph *graph, MPSGraphTensor *inputTen
}
static
std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape) {
std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape, const bool squeeze) {
bool hasMPSShape = (mpsShape != nil);
std::vector<int64_t> src_view_shape;
if (hasMPSShape) {
int src_ndim_view = [mpsShape count];
src_view_shape.resize(src_ndim_view);
for (const auto i : c10::irange(src_ndim_view)) {
src_view_shape[i] = [mpsShape[i] intValue];
if (squeeze) {
for (const auto i : c10::irange(src_ndim_view)) {
if ([mpsShape[i] intValue] == 1)
continue;
src_view_shape.emplace_back([mpsShape[i] intValue]);
}
} else {
src_view_shape.resize(src_ndim_view);
for (const auto i : c10::irange(src_ndim_view)) {
src_view_shape[i] = [mpsShape[i] intValue];
}
}
} else {
src_view_shape = src.sizes().vec();
if (squeeze) {
IntArrayRef src_shape = src.sizes();
size_t src_ndim_view = src_shape.size();
for (const auto i : c10::irange(src_ndim_view)) {
if (src_shape[i] == 1)
continue;
src_view_shape.emplace_back(src_shape[i]);
}
} else {
src_view_shape = src.sizes().vec();
}
}
return src_view_shape;
}
std::vector<int64_t> getSqueezedBaseShape(const Tensor& src, IntArrayRef shape) {
std::vector<int64_t> src_base_shape;
for (const auto i : c10::irange(shape.size())) {
if (shape[i] == 1)
continue;
src_base_shape.emplace_back(shape[i]);
}
return src_base_shape;
}
bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
if (!src.is_contiguous()) {
return false;
@ -447,57 +479,79 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
size_t src_ndim_base = src_base_shape.size();
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
size_t src_ndim_view = src_view_shape.size();
if (src_ndim_base != src_ndim_view) {
return false;
}
for (const auto i: c10::irange(src_ndim_base)) {
if (src_view_shape[i] > src_base_shape[i]) {
return false;
}
}
if (src_view_shape[i] > src_base_shape[i]) {
return false;
}
}
return true;
}
MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
int src_ndim_base = src_base_shape.size();
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
int src_ndim_view = src_view_shape.size();
TORCH_CHECK(src_ndim_base == src_ndim_view);
size_t src_ndim_base = src_base_shape.size();
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
size_t src_ndim_view = src_view_shape.size();
MPSNDArray *srcTensorNDArrayView = nil;
MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
MPSNDArray *srcTensorNDArray = nil;
id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
int64_t base_idx = 0;
std::vector<int64_t> src_base_shape_vec;
if (src_ndim_view != src_ndim_base) {
src_base_shape_vec.reserve(src_ndim_view);
for (const auto i : c10::irange(src_ndim_view)) {
if (src_view_shape[i] == 1 && src_base_shape[base_idx] != 1) {
src_base_shape_vec.emplace_back(1);
} else {
src_base_shape_vec.emplace_back(src_base_shape[base_idx]);
if (base_idx < src_ndim_base - 1)
base_idx += 1;
}
}
src_base_shape = IntArrayRef(src_base_shape_vec);
src_ndim_base = src_base_shape.size();
}
srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
int firstDimToSlice = 0;
size_t firstDimToSlice = 0;
while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
firstDimToSlice++;
}
int view_numel = 1;
int64_t view_numel = 1;
for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
view_numel *= src_base_shape[i];
}
int sliceOffset = src.storage_offset() / view_numel;
// There are cases where both dimensions of a view can shrink
// E.g: x = torch.randn((3,6))[1, 1:3]
int nextSliceOffset = src.storage_offset() % view_numel;
int64_t sliceOffset = src.storage_offset() / view_numel;
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice
withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
if (nextSliceOffset) {
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
// Slice any remaining dimensions
for (const auto crtSliceOffset: c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
if (src_view_shape[crtSliceOffset] != src_base_shape[crtSliceOffset]) {
if (crtSliceOffset == src_base_shape.size() - 1) {
sliceOffset = src.storage_offset() % src_base_shape[src_base_shape.size() - 1];
} else {
sliceOffset = (src.storage_offset() % view_numel) / (view_numel / src_base_shape[crtSliceOffset]);
}
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - crtSliceOffset
withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[crtSliceOffset])}];
}
}
srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
descriptor:srcTensorNDArrayDesc
aliasing:MPSAliasingStrategyShallAlias];
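A small sketch of the offset arithmetic the updated slicing path performs for the `x = torch.randn((3,6))[1, 1:3]` case mentioned in the comment above; the variable names here are illustrative, not from the source.

import torch

base = torch.randn(3, 6)
view = base[1, 1:3]             # both dimensions of the view shrink
off = view.storage_offset()     # 1 * 6 + 1 = 7
ncols = base.size(1)
print(off // ncols)             # 1 -> slice start along dim 0 (the sliceOffset above)
print(off % ncols)              # 1 -> remaining offset, sliced along the next dimension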
@ -696,7 +750,7 @@ const std::string& getGatherScatterScalarType(const Tensor& t) {
{c10::ScalarType::Int, "int"},
{c10::ScalarType::Short, "short"},
{c10::ScalarType::Char, "char"},
{c10::ScalarType::Byte, "char"},
{c10::ScalarType::Byte, "uchar"},
{c10::ScalarType::Bool, "bool"},
};

View File

@ -3567,19 +3567,14 @@
- func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
# native_functions.yaml
# https://github.com/pytorch/pytorch/issues/77394
- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
dispatch:
MPS: _mps_max_pool2d
autogen: _mps_max_pool2d.out
CompositeImplicitAutograd: max_pool2d
MPS: mps_max_pool2d
- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
dispatch:
MPS: mps_max_pool2d_backward
autogen: mps_max_pool2d_backward.out
autogen: max_pool2d_backward.out
- func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
dispatch:
@ -7188,12 +7183,12 @@
# MPS LSTM implementation
- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
dispatch:
MPS: _lstm_mps
autogen: _lstm_mps.out
- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
dispatch:
MPS: lstm_mps_backward
autogen: lstm_mps_backward.out

View File

@ -379,16 +379,33 @@ inline bool check_gpu_sm50_or_greater(sdp_params params, bool debug) {
return true;
}
inline bool check_gpu_sm86_head_dim_128(sdp_params params, bool debug) {
inline bool check_head_dim_gt64_and_sm_ge86(sdp_params params, bool debug) {
// Memory Efficient Attention is throwing a cuda illegal memory error
// on sm86 when head_dim is 128.
// on sm86 or newer when head_dim is greater than 64.
auto dprops = at::cuda::getCurrentDeviceProperties();
bool is_sm86 = (dprops->major == 8) && (dprops->minor == 6);
if (is_sm86 && (params.query.size(-1) == 128)) {
bool is_sm86_or_newer = (dprops->major == 8) && (dprops->minor >= 6);
// Categorically disable sm90 as well. Will want to fix this once we have H100s available for testing.
is_sm86_or_newer = is_sm86_or_newer || (dprops->major > 8);
if (is_sm86_or_newer && (params.query.sym_size(-1) > 64)) {
if (debug) {
TORCH_WARN(
"Memory Efficient Attention does not currently support head_dim == 128 on sm86",
"because it is throwing a cuda illegal memory error on sm86 when head_dim is 128.");
"Memory Efficient Attention does not currently support head_dim greater than 64 on sm86 or newer");
}
return false;
}
return true;
}
inline bool check_requires_grad_and_head_dim_gt64_and_sm_ge86(
sdp_params params,
bool debug) {
// Flash Attention will raise an error in the backward pass if the head_dim
// size is greater than 64 and the device is sm86 or newer.
if (!check_requires_grad(params, false) &&
!check_head_dim_gt64_and_sm_ge86(params, false)) {
if (debug) {
TORCH_WARN(
"Flash attention currently doesn't support training with head_dim greater than 64 on sm86 or newer.");
}
return false;
}
@ -422,13 +439,14 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
return false;
#endif
// Define gate functions that determine if a flash kernel can be run
constexpr std::array<bool(*)(sdp_params, bool), 8> constraints {{
constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
check_runtime_disabled_flash,
check_tensor_shapes,
check_equal_batch_size_and_num_heads,
check_for_attn_mask,
check_head_dim_size,
check_gpu_sm75_or_greater,
check_requires_grad_and_head_dim_gt64_and_sm_ge86,
check_for_nested_inputs,
check_for_seq_len_1_nested_tensor}};
for (auto& constraint : constraints) {
@ -465,7 +483,7 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
check_equal_batch_size_and_num_heads,
check_for_attn_mask,
check_head_dim_size_mem_efficient,
check_gpu_sm86_head_dim_128,
check_head_dim_gt64_and_sm_ge86,
check_for_seq_len_1_nested_tensor,
check_for_non_zero_dropout,
check_use_deterministic_algorithms}};
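As a user-level sketch (assumes a CUDA build and an sm86-or-newer GPU): with head_dim above 64 the gates above reject the fused kernels, so scaled_dot_product_attention falls back to the math path; torch.backends.cuda.sdp_kernel can make that choice explicit.

import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 8, 128, 128, device="cuda", dtype=torch.float16,
                        requires_grad=True)          # head_dim = 128
out = F.scaled_dot_product_attention(q, k, v)         # rejected by the sm86+ gates -> math backend
out.sum().backward()

with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False,
                                    enable_math=True):
    out = F.scaled_dot_product_attention(q, k, v)     # same choice, made explicit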

View File

@ -630,6 +630,7 @@ macro(cuda_unset_include_and_libraries)
unset(CUDA_cublas_LIBRARY CACHE)
unset(CUDA_cublas_device_LIBRARY CACHE)
unset(CUDA_cublasemu_LIBRARY CACHE)
unset(CUDA_cublasLt_LIBRARY CACHE)
unset(CUDA_cufft_LIBRARY CACHE)
unset(CUDA_cufftemu_LIBRARY CACHE)
unset(CUDA_cupti_LIBRARY CACHE)
@ -963,6 +964,7 @@ endif()
find_cuda_helper_libs(cufft)
find_cuda_helper_libs(cublas)
find_cuda_helper_libs(cublasLt)
# cusparse showed up in version 3.2
find_cuda_helper_libs(cusparse)
find_cuda_helper_libs(curand)
@ -993,7 +995,7 @@ if (CUDA_BUILD_EMULATION)
set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
else()
set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
endif()
########################
@ -1962,7 +1964,7 @@ macro(CUDA_ADD_CUBLAS_TO_TARGET target)
if (CUDA_BUILD_EMULATION)
target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublasemu_LIBRARY})
else()
target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
endif()
endmacro()

View File

@ -10,8 +10,8 @@ endif
CUDA_VERSION = 11.7.0
CUDNN_VERSION = 8
BASE_RUNTIME = ubuntu:18.04
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04
BASE_RUNTIME = ubuntu:20.04
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu20.04
# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia

View File

@ -351,7 +351,7 @@ master_doc = 'index'
# General information about the project.
project = 'PyTorch'
copyright = '2022, PyTorch Contributors'
copyright = '2023, PyTorch Contributors'
author = 'PyTorch Contributors'
torch_version = str(torch.__version__)

View File

@ -6,13 +6,12 @@ significant speedups the newer your GPU is.
.. code:: python
from torch._dynamo import optimize
import torch
def fn(x, y):
a = torch.cos(x).cuda()
b = torch.sin(y).cuda()
return a + b
new_fn = optimize("inductor")(fn)
new_fn = torch.compile(fn, backend="inductor")
input_tensor = torch.randn(10000).to(device="cuda:0")
a = new_fn(input_tensor, input_tensor)
@ -54,7 +53,7 @@ with the actual generated kernel being
tmp2 = tl.sin(tmp1)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
And you can verify that fusing the two ``sins`` did actually occur
And you can verify that fusing the two ``sin`` did actually occur
because the two ``sin`` operations occur within a single Triton kernel
and the temporary variables are held in registers with very fast access.
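To reproduce a kernel dump like the one above, one hedged option (assuming this release honors the TORCH_COMPILE_DEBUG environment variable) is:

import os
os.environ["TORCH_COMPILE_DEBUG"] = "1"  # ask inductor to write generated code to disk

import torch

def fn(x, y):
    a = torch.cos(x).cuda()
    b = torch.sin(y).cuda()
    return a + b

new_fn = torch.compile(fn, backend="inductor")
new_fn(torch.randn(10000), torch.randn(10000))
# Generated Triton kernels typically land under ./torch_compile_debug/ for inspection.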
@ -69,13 +68,12 @@ hub.
.. code-block:: python
import torch
import torch._dynamo as dynamo
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
opt_model = dynamo.optimize("inductor")(model)
opt_model = torch.compile(model, backend="inductor")
model(torch.randn(1,3,64,64))
And that is not the only available backend, you can run in a REPL
``dynamo.list_backends()`` to see all the available backends. Try out the
``torch._dynamo.list_backends()`` to see all the available backends. Try out the
``cudagraphs`` or ``nvfuser`` next as inspiration.
Let's do something a bit more interesting now; our community frequently
@ -92,11 +90,10 @@ HuggingFace hub and optimize it:
import torch
from transformers import BertTokenizer, BertModel
import torch._dynamo as dynamo
# Copy pasted from here https://huggingface.co/bert-base-uncased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed
model = torch.compile(model, backend="inductor") # This is the only line of code that we changed
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
output = model(**encoded_input)
@ -116,7 +113,7 @@ Similarly lets try out a TIMM example
import torch._dynamo as dynamo
import torch
model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
opt_model = dynamo.optimize("inductor")(model)
opt_model = torch.compile(model, backend="inductor")
opt_model(torch.randn(64,3,7,7))
Our goal with Dynamo and inductor is to build the highest coverage ML compiler
@ -132,16 +129,16 @@ or ``torch._dynamo.list_backends()`` each of which with its optional dependencie
Some of the most commonly used backends include:
**Training & inference backends**:
* ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
* ``dynamo.optimize("aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``dynamo.optimize("nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``dynamo.optimize("cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
* ``torch.compile(m, backend="inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
* ``torch.compile(m, backend="aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``torch.compile(m, backend=""nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``torch.compile(m, backend="cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
**Inference-only backends**:
* ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
* ``dynamo.optimize("tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
* ``dynamo.optimize("tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__
* ``torch.compile(m, backend="onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
* ``torch.compile(m, backend="tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
* ``torch.compile(m, backend="ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
* ``torch.compile(m, backend="tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__
Why do you need another way of optimizing PyTorch code?
-------------------------------------------------------

View File

@ -15,7 +15,7 @@ Where a complete example looks like this:
from typing import List
import torch
import torchdynamo
from torch import _dynamo as torchdynamo
def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
print("my_compiler() called with FX graph:")
gm.graph.print_tabular()
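A hedged sketch of running such a custom backend through the compile entry point (returning ``gm.forward`` simply executes the captured graph unmodified):

from typing import List
import torch

def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
    print("my_compiler() called with FX graph:")
    gm.graph.print_tabular()
    return gm.forward  # hand back a callable; here we just run the graph as-is

@torch.compile(backend=my_compiler)
def toy(a, b):
    return torch.relu(a) + b

toy(torch.randn(4), torch.randn(4))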

View File

@ -14,7 +14,7 @@ worlds — usability and performance.
TorchDynamo makes it easy to experiment with different compiler
backends to make PyTorch code faster with a single line decorator
``torch._dynamo.optimize()``
``torch._dynamo.optimize()`` which is wrapped for convenience by ``torch.compile()``
.. image:: ../_static/img/dynamo/TorchDynamo.png

View File

@ -27,7 +27,7 @@ TorchDynamo dependencies (for CUDA 11.7):
.. code-block:: shell
pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
CPU requirements
~~~~~~~~~~~~~~~~
@ -41,16 +41,6 @@ To install, run the following command:
pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
Install from Local Source
~~~~~~~~~~~~~~~~~~~~~~~~~
Alternatively, you can build PyTorch from `source
<https://github.com/pytorch/pytorch#from-source>`__, which has TorchDynamo
included.
To install GPU TorchDynamo dependencies, run ``make triton`` in the
PyTorch repo root directory.
Verify Installation
~~~~~~~~~~~~~~~~~~~

View File

@ -37,8 +37,13 @@ Only the latter is supported with function transforms:
(by calling ``ctx.save_for_backward(*tensors)``), or save non-Tensors
(by assigning them to the ``ctx`` object).
Any intermediates that need to be saved must be returned as an output from
:meth:`~Function.forward`.
Because :meth:`~Function.setup_context` accepts only ``inputs`` and ``output``,
the only quantities that can be saved are either objects (such as Tensors) in
the inputs or outputs or quantities (like ``Tensor.shape``) derived from them.
If you wish to save a non-input intermediate activation from
:meth:`Function.forward` for backward, then you'll need to return it as an
output from :meth:`~Function.forward` so that it gets passed to
:meth:`~Function.setup_context`.
Depending on the transform,
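A minimal, hedged sketch of the pattern described above (names are illustrative): the intermediate ``e`` is returned from ``forward`` so that ``setup_context`` can save it.

import torch
from torch.autograd import Function

class ScaledExp(Function):
    @staticmethod
    def forward(x):
        e = x.exp()        # intermediate also needed in backward
        return 2 * e, e    # expose it as an extra output so setup_context can see it
    @staticmethod
    def setup_context(ctx, inputs, output):
        _, e = output
        ctx.save_for_backward(e)
    @staticmethod
    def backward(ctx, grad_out, grad_e):
        (e,) = ctx.saved_tensors
        # d(2*exp(x))/dx = 2*exp(x) and d(exp(x))/dx = exp(x)
        return (2 * grad_out + grad_e) * e

x = torch.randn(3, requires_grad=True)
out, _ = ScaledExp.apply(x)
out.sum().backward()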

View File

@ -129,6 +129,49 @@ Algorithms
Rprop
SGD
Many of our algorithms have various implementations optimized for performance,
readability and/or generality, so we attempt to default to the generally fastest
implementation for the current device if no particular implementation has been
specified by the user.
We have 3 major categories of implementations: for-loop, foreach (multi-tensor), and
fused. The most straightforward implementations are for-loops over the parameters with
big chunks of computation. For-looping is usually slower than our foreach
implementations, which combine parameters into a multi-tensor and run the big chunks
of computation all at once, thereby saving many sequential kernel calls. A few of our
optimizers have even faster fused implementations, which fuse the big chunks of
computation into one kernel. We can think of foreach implementations as fusing
horizontally and fused implementations as fusing vertically on top of that.
In general, the performance ordering of the 3 implementations is fused > foreach > for-loop.
So when applicable, we default to foreach over for-loop. Applicable means the foreach
implementation is available, the user has not specified any implementation-specific kwargs
(e.g., fused, foreach, differentiable), and all tensors are native and on CUDA. Note that
while fused should be even faster than foreach, the implementations are newer and we would
like to give them more bake-in time before flipping the switch everywhere. You are welcome
to try them out though!
Below is a table showing the available and default implementations of each algorithm:
.. csv-table::
:header: "Algorithm", "Default", "Has foreach?", "Has fused?"
:widths: 25, 25, 25, 25
:delim: ;
:class:`Adadelta`;foreach;yes;no
:class:`Adagrad`;foreach;yes;no
:class:`Adam`;foreach;yes;yes
:class:`AdamW`;foreach;yes;yes
:class:`SparseAdam`;for-loop;no;no
:class:`Adamax`;foreach;yes;no
:class:`ASGD`;foreach;yes;no
:class:`LBFGS`;for-loop;no;no
:class:`NAdam`;foreach;yes;no
:class:`RAdam`;foreach;yes;no
:class:`RMSprop`;foreach;yes;no
:class:`Rprop`;foreach;yes;no
:class:`SGD`;foreach;yes;no
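A hedged sketch of selecting an implementation explicitly (fused Adam is assumed to require CUDA floating-point parameters, per the discussion above):

import torch

params = [torch.randn(10, 10, device="cuda", requires_grad=True)]
opt_forloop = torch.optim.Adam(params, lr=1e-3, foreach=False)  # single-tensor for-loop path
opt_foreach = torch.optim.Adam(params, lr=1e-3, foreach=True)   # multi-tensor ("horizontally fused") path
opt_fused   = torch.optim.Adam(params, lr=1e-3, fused=True)     # fully fused CUDA kernel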
How to adjust learning rate
---------------------------

View File

@ -1024,17 +1024,12 @@ def main():
'typing-extensions',
'sympy',
'networkx',
'jinja2',
]
extras_require = {
'opt-einsum': ['opt-einsum>=3.3']
}
if platform.system() == 'Linux':
triton_pin_file = os.path.join(cwd, ".github", "ci_commit_pins", "triton.txt")
if os.path.exists(triton_pin_file):
with open(triton_pin_file) as f:
triton_pin = f.read().strip()
extras_require['dynamo'] = ['pytorch-triton==2.0.0+' + triton_pin[:10], 'jinja2']
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.

View File

@ -504,7 +504,7 @@ class TestFSDPUseOrigParamsUnshardReshard(FSDPTest):
fsdp_kwargs=fsdp_kwargs,
deterministic=True,
)
optim = torch.optim.Adam(fsdp_model.parameters(), lr=LR)
optim = torch.optim.Adam(fsdp_model.parameters(), foreach=False, lr=LR)
fsdp_kwargs["use_orig_params"] = True
fsdp_model_orig_params = TransformerWithSharedParams.init(
self.process_group,
@ -513,7 +513,9 @@ class TestFSDPUseOrigParamsUnshardReshard(FSDPTest):
fsdp_kwargs=fsdp_kwargs,
deterministic=True,
)
optim_orig_params = torch.optim.Adam(fsdp_model_orig_params.parameters(), lr=LR)
optim_orig_params = torch.optim.Adam(
fsdp_model_orig_params.parameters(), foreach=False, lr=LR
)
return fsdp_model, optim, fsdp_model_orig_params, optim_orig_params
def _check_fsdp_parameter_parity(self, fsdp1: FSDP, fsdp2: FSDP) -> None:

View File

@ -1444,6 +1444,59 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
PythonProcessGroupExtensionTest.create_dummy
)
def test_backend_config(self):
dist.Backend.register_backend(
"dummy",
PythonProcessGroupExtensionTest.create_dummy
)
# Ensure backend config can be created with the following arguments
backend_config_strings_and_expected_values = [
(dist.Backend.GLOO, "cpu:gloo,cuda:gloo"),
(dist.Backend.NCCL, "cpu:nccl,cuda:nccl"),
(dist.Backend.MPI, "cpu:mpi,cuda:mpi"),
(dist.Backend.UCC, "cpu:ucc,cuda:ucc"),
(dist.Backend.DUMMY, "cpu:dummy,cuda:dummy"),
("DUMMY", "cpu:dummy,cuda:dummy"),
("dummy", "cpu:dummy,cuda:dummy"),
("cpu:dummy,cuda:dummy", "cpu:dummy,cuda:dummy"),
("cpu:dummy,cuda:nccl", "cpu:dummy,cuda:nccl"),
("cpu:gloo,cuda:dummy", "cpu:gloo,cuda:dummy"),
("cpu:gloo,cuda:nccl", "cpu:gloo,cuda:nccl"),
("cPu:gLoO,cuDa:NcCl", "cpu:gloo,cuda:nccl")
]
for config_str, expected_value in backend_config_strings_and_expected_values:
with self.subTest(config_str):
# ensures these configs strings are valid and no ValueError is raised
config = dist.BackendConfig(config_str)
self.assertEqual(str(config), expected_value)
# Ensure backend config will raise ValueError with the following arguments
invalid_backend_config_strings = [
"cpu:gloo,cuda:nccl,", # trailing comma
"cpu:gloo,cuda:nccl,cpu:dummy", # duplicate device
]
for config_str in invalid_backend_config_strings:
with self.subTest(config_str):
with self.assertRaises(ValueError):
dist.BackendConfig(config_str)
def test_init_process_group_with_multiple_backends(self):
dist.Backend.register_backend("dummy", PythonProcessGroupExtensionTest.create_dummy)
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '6789'
dist.init_process_group("cpu:dummy,cuda:dummy", rank=self.rank, world_size=self.world_size)
# test all_gather
input_tensor = torch.ones(2, 2) * 7
output_tensor_list = [torch.zeros(2, 2) for _ in range(self.world_size)]
dist.all_gather(output_tensor_list, input_tensor)
dist.barrier()
dist.destroy_process_group()
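Outside the test harness, the device-to-backend mapping string exercised above can be sketched like this (hedged; only the parsing object is touched, so no process group is created):

import torch.distributed as dist

cfg = dist.BackendConfig("cpu:gloo,cuda:nccl")
print(str(cfg))                           # "cpu:gloo,cuda:nccl"
print(str(dist.BackendConfig("gloo")))    # a bare backend expands to "cpu:gloo,cuda:gloo"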
class Options:
def __init__(self):
pass
@ -1570,6 +1623,11 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase):
world_size=self.world_size,
store=store
)
pg = c10d._get_default_group()
self.assertEqual(pg.rank(), self.rank)
self.assertEqual(pg.size(), self.world_size)
self.assertEqual(pg.name(), str(backend))
dist.destroy_process_group()
def _call_collective_with_varying_tensors(self, backend, collective, *args):

View File

@ -335,7 +335,7 @@ class MyPythonStore(dist.Store):
self.store = {}
def set(self, key, value):
if not isinstance(key, str):
if not isinstance(key, (str, bytes)):
raise AssertionError("Expected set to be called with string key")
if type(value) is not bytes:
raise AssertionError("Expected set to be called with bytes value")

View File

@ -51,11 +51,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase):
y = torch.randn(4)
x = torch.nn.Parameter(torch.randn(4))
aot_fn = torch._dynamo.optimize("aot_eager")(fn)
with self.assertRaisesRegex(
RuntimeError,
"a leaf Variable that requires grad is being used in an in-place operation.",
):
aot_fn(x, y)
# This should not error: we mutated an autograd leaf under no_grad mode.
aot_fn(x, y)
def test_mutation1(self):
def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor):
@ -179,11 +176,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase):
# Run exported graph with AOT
aot_fn = torch._dynamo.optimize("aot_eager")(graph)
with self.assertRaisesRegex(
RuntimeError,
"a leaf Variable that requires grad is being used in an in-place operation.",
):
aot_fn(x, y)
# This should not error: we mutated an autograd leaf under no_grad mode.
aot_fn(x, y)
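The pattern these tests now accept, sketched standalone (hedged; the function body is illustrative, not the exact fn used in the tests):

import torch
import torch._dynamo

def fn(x, y):
    with torch.no_grad():
        x.mul_(2)        # in-place mutation of an autograd leaf, but under no_grad
    return x * y

x = torch.nn.Parameter(torch.randn(4))
y = torch.randn(4)
aot_fn = torch._dynamo.optimize("aot_eager")(fn)
aot_fn(x, y)             # expected to run instead of raising the leaf-mutation error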
def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self):
class ModuleSpecialFwd(torch.nn.Module):

View File

@ -60,6 +60,11 @@ unittest.expectedFailure(
# Cannot call sizes() on tensor with symbolic sizes/strides
)
unittest.expectedFailure(
DynamicShapesMiscTests.test_parsing_sdpa_dynamic_shapes
# Cannot call sizes() on tensor with symbolic sizes/strides
)
# DynamicShapesSubGraphTests
unittest.expectedFailure(

View File

@ -2294,7 +2294,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
self.assertIs(x_ref(), None)
def test_release_module_memory(self):
mod = torch.nn.Linear(10, 10)
x = torch.rand([10, 10])
mod_weight_ref = weakref.ref(mod.weight)
@ -2640,7 +2639,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
self.names = []
def forward(self, idx, targets=None):
b, t = idx.size()
assert (
t <= self.block_size
@ -3145,6 +3143,53 @@ class MiscTests(torch._dynamo.test_case.TestCase):
self.assertEqual(compiled.device.index, 0)
self.assertEqual(compiled.dtype, torch.float16)
@unittest.skipIf(
not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater,
"Can't run fused SDPA on this platform",
)
def test_parsing_sdpa(self):
class MyModule(torch.nn.Module):
def forward(self, query, key, value):
out = F.scaled_dot_product_attention(query, key, value, None, 0, True)
out = F.scaled_dot_product_attention(
query=query,
key=key,
value=value,
attn_mask=None,
dropout_p=0,
is_causal=True,
)
out = F.scaled_dot_product_attention(
query,
key=key,
value=value,
attn_mask=None,
dropout_p=0,
is_causal=True,
)
out = F.scaled_dot_product_attention(
query, key, value, None, dropout_p=0, is_causal=True
)
return out
device = "cuda"
dtype = torch.float16
seq_len_q = 1
seq_len_k = 1
head_dim = 8
query = torch.ones(
1, 8, seq_len_q, head_dim, device=device, dtype=dtype, requires_grad=True
)
key = torch.ones(
1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
)
value = torch.ones(
1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
)
module = MyModule()
opt_mod = torch._dynamo.optimize("inductor")(module)
opt_mod(query, key, value)
def test_autocast_cpu(self):
class MyModule(torch.nn.Module):
def forward(self, x):
@ -3716,7 +3761,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
self.assertTrue(same(ref, res))
def test_disable_flag(self):
cnt = torch._dynamo.testing.CompileCounter()
with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}):
@ -3999,6 +4043,23 @@ class MiscTests(torch._dynamo.test_case.TestCase):
res = opt_fn(x, y)
self.assertTrue(same(ref, res))
def test_tuple_from_tuple_iter(self):
def inner_fn(*args):
acc = torch.ones(10, 10)
for arg in args:
acc.add_(arg)
return acc
@torch._dynamo.optimize("eager")
def fn(inputs, params):
y = tuple(inputs) + tuple(params)
return inner_fn(*y)
inputs = [torch.randn(10, 10) for _ in range(3)]
fn(inputs, iter(tuple(inputs)))
def test_torch_package_working_with_trace(self):
# from torch._dynamo.test_case import run_tests

View File

@ -295,6 +295,31 @@ class ModuleList(torch.nn.Module):
return x
class CustomGetItemModuleList(torch.nn.Module):
def __init__(self):
super().__init__()
self.layers = torch.nn.ModuleList(
[
torch.nn.Linear(10, 10),
torch.nn.ReLU(),
torch.nn.Linear(10, 10),
torch.nn.ReLU(),
]
)
def __getitem__(self, idx: int):
return self.layers[idx]
def __len__(self) -> int:
return len(self.layers)
def forward(self, x):
for i in range(len(self)):
x = self[i](x)
return x
class ModuleDict(torch.nn.Module):
def __init__(self):
super().__init__()
@ -310,6 +335,23 @@ class ModuleDict(torch.nn.Module):
return x
class CustomGetItemModuleDict(torch.nn.Module):
def __init__(self):
super().__init__()
self.layers = torch.nn.ModuleDict(
{
"0": torch.nn.Linear(10, 10),
}
)
def __getitem__(self, key: str) -> torch.nn.Module:
return self.layers[key]
def forward(self, x):
x = self["0"](x)
return x
class TensorList(torch.nn.Module):
def __init__(self):
super().__init__()
@ -728,7 +770,9 @@ class NNModuleTests(torch._dynamo.test_case.TestCase):
test_cfgmod = make_test(CfgModule())
test_stringmember = make_test(StringMember())
test_modulelist = make_test(ModuleList())
test_modulelist = make_test(CustomGetItemModuleList())
test_moduledict = make_test(ModuleDict())
test_moduledict = make_test(CustomGetItemModuleDict())
test_super1 = make_test(SuperModule())
test_super2 = make_test(SuperModule2())
test_super_class_method = make_test(SuperChildCallsClassMethod())
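An end-to-end hedged sketch of what these tests cover: compiling a module whose item lookup is a plain Python ``__getitem__``, for which Dynamo falls back to Python:

import torch

class Blocks(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList([torch.nn.Linear(10, 10), torch.nn.ReLU()])
    def __getitem__(self, idx: int):
        return self.layers[idx]
    def forward(self, x):
        for i in range(len(self.layers)):
            x = self[i](x)
        return x

opt = torch.compile(Blocks(), backend="eager")
opt(torch.randn(2, 10))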

View File

@ -1,5 +1,6 @@
# Owner(s): ["module: dynamo"]
import collections
import contextlib
import copy
import inspect
import itertools
@ -2173,6 +2174,81 @@ class ReproTests(torch._dynamo.test_case.TestCase):
self.assertEqual(cnt.frame_count, 2)
self.assertEqual(cnt.op_count, 2)
def test_exception_in_dynamo_handling(self):
hit_handler = False
# See https://github.com/pytorch/pytorch/pull/96488
@contextlib.contextmanager
def ctx():
try:
yield
except RuntimeError:
nonlocal hit_handler
hit_handler = True
@torch._dynamo.optimize("eager")
def f():
with ctx():
h()
def h():
raise RuntimeError("boof")
# Should not error
f()
self.assertTrue(hit_handler)
def test_generator_dealloc(self):
# See https://github.com/pytorch/pytorch/pull/96488
#
# NB: yes, [(...)] is intentional, this is a list containing a
# generator
generator_box = [(x for x in [1, 2, 3])]
counter = torch._dynamo.testing.CompileCounter()
def g(x):
return x + 2
# TODO: This test is pretty delicate. To test if it's actually doing
# anything, rebuild eval_frame.c with '#define TORCHDYNAMO_DEBUG 1'
# and then look at the logs for:
#
# TRACE[_custom_eval_frame:650] begin <genexpr> test_repros.py 2276 -1 0 0
# TRACE[_custom_eval_frame:664] throw <genexpr>
#
# This means we're actually hitting the relevant codepath
# NB: Make sure we don't actually Dynamo this frame; if we do Dynamo
# this frame, Dynamo actually DOES understand list.clear and will
# arrange for the generator deallocation to happen when the eval frame
# handler is disabled, which will prevent the bug from happening (we
# specifically want to trigger the generator deallocation WHILE the
# dynamo eval frame handler is active), as that will cause the
# generator to become exhausted and trigger the throw_flag == TRUE
# case.
@torch._dynamo.skip
def f(x):
generator_box.clear()
return g(x)
self.assertNoUnraisable(
lambda: torch._dynamo.optimize(counter)(f)(torch.randn(3))
)
# Make sure the x + 2 is captured (a previous incorrect implementation
# of this fix would have disabled the eval frame callback, which means
# g wouldn't get traced
self.assertEqual(counter.op_count, 1)
def test_error_return_without_exception_set(self):
# https://github.com/pytorch/pytorch/issues/93781
@torch.compile
def f():
_generator_type = type((_ for _ in ()))
self.assertNoUnraisable(f)
@skip_if_pytest
@torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
def test_rewrite_assert_with_msg(self):

View File

@ -377,8 +377,6 @@ aten::_mps_convolution
aten::_mps_convolution.out
aten::_mps_convolution_transpose
aten::_mps_convolution_transpose.out
aten::_mps_max_pool2d
aten::_mps_max_pool2d.out
aten::_native_batch_norm_legit.no_stats_out
aten::_native_batch_norm_legit.out
aten::_native_decoder_only_multi_head_attention
@ -857,6 +855,8 @@ aten::max
aten::max.dim
aten::max.dim_max
aten::max.unary_out
aten::max_pool2d_backward
aten::max_pool2d_backward.out
aten::max_pool2d_with_indices
aten::max_pool2d_with_indices.out
aten::max_pool2d_with_indices_backward
@ -930,8 +930,6 @@ aten::mps_convolution_backward
aten::mps_convolution_backward.out
aten::mps_convolution_transpose_backward
aten::mps_convolution_transpose_backward.out
aten::mps_max_pool2d_backward
aten::mps_max_pool2d_backward.out
aten::multi_margin_loss
aten::multi_margin_loss.out
aten::multi_margin_loss_backward

Some files were not shown because too many files have changed in this diff.