Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 12:15:03 +08:00)

Compare commits: ciflow/tru...release/2. (87 commits)
| SHA1 |
|---|
| 96ca226a73 |
| a78b5f6680 |
| 51b42d98d6 |
| 0bd6be966b |
| e9ebda29d8 |
| 9e8bd61836 |
| e4bdb86e80 |
| 55b4f95cd8 |
| 6943c4b15e |
| 91c455e85d |
| c83bbdc032 |
| 661fa0c5e7 |
| 0f49e97be9 |
| b90fd01221 |
| 3aae95a884 |
| 34dd578b91 |
| 654da19c7c |
| a470c041b0 |
| a943df045f |
| bbf91554d2 |
| 3f25fc3b86 |
| 763cfc219d |
| ea9ea40290 |
| 2e761498b4 |
| c8f2470047 |
| f08d20ade0 |
| 94f88b342b |
| 5c035a5131 |
| 8963d77163 |
| c039d2f59e |
| 81ea036fa6 |
| c0c76df6a6 |
| cb51da829e |
| 57cd423a4f |
| 78e2cc6d9c |
| e9606385ec |
| e31e512f7d |
| c263bd43e8 |
| c9913cf66f |
| 2f7d8bbf17 |
| ca0cdf52ca |
| 9cfa076da8 |
| 8e05e41dbc |
| d8ffc60bc1 |
| 1483723037 |
| c4572aa1b7 |
| 82b078ba64 |
| 77f7bc5f9d |
| 0865964576 |
| f18ac1b386 |
| c04134cdb1 |
| 72d0863ab2 |
| 1bd334dc25 |
| 93e13cd429 |
| 4e4d4b0afe |
| c4fa850827 |
| 36ead09873 |
| 66d23dbad7 |
| e2fff58844 |
| 735333a7ff |
| 6017488801 |
| e51e5e721c |
| 91739a0279 |
| 531f097b6f |
| 00eb7b0d78 |
| 2180f342c4 |
| a90b4f09ac |
| 1211ceeaa4 |
| beaa5c5908 |
| 4bd5c1e4f4 |
| f3c97a4e43 |
| 30cf0e70f7 |
| 96f627dcde |
| 6f11e6d6a1 |
| fcec27f7d5 |
| cddcb1e526 |
| 0553b46df1 |
| b45d7697a5 |
| 7ebb309457 |
| cedfcdab46 |
| 0b21e62406 |
| 91994c999f |
| 0b82f58866 |
| 1f7ab1c823 |
| 52a27dd0ee |
| e0c728c545 |
| dbcd11f3a7 |
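The same commit range can be listed locally; a minimal sketch (the branch names in the header above are truncated, so both refs below are placeholders, not the exact names):

```bash
# Minimal sketch: list the commits in a compare range locally.
# <base-ref> and <head-ref> are placeholders; the page truncates the real
# names to "ciflow/tru" and "release/2.".
git clone https://github.com/pytorch/pytorch.git
cd pytorch
git log --oneline <base-ref>..<head-ref>   # 87 commits for this compare
```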
@@ -119,7 +119,7 @@ function install_torchvision() {

 function clone_pytorch_xla() {
   if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.0 --quiet https://github.com/pytorch/xla.git
     pushd xla
     # pin the xla hash so that we don't get broken by changes to xla
     git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@@ -135,16 +135,10 @@ function install_filelock() {

 function install_triton() {
   local commit
-  commit=$(get_pinned_commit triton)
-  local short_hash
-  short_hash=$(echo "${commit}"|cut -c -10)
-  local index_url
-  index_url=https://download.pytorch.org/whl/nightly/cpu
   if [[ "${TEST_CONFIG}" == *rocm* ]]; then
     echo "skipping triton due to rocm"
-  elif pip install "pytorch-triton==2.0.0+${short_hash}" --index-url "${index_url}"; then
-     echo "Using prebuilt version ${short_hash}"
   else
+    commit=$(get_pinned_commit triton)
     if [[ "${BUILD_ENVIRONMENT}" == *gcc7* ]]; then
       # Trition needs gcc-9 to build
       sudo apt-get install -y g++-9
@@ -181,7 +175,7 @@ function checkout_install_torchdeploy() {
   pushd multipy
   git checkout "${commit}"
   python multipy/runtime/example/generate_examples.py
-  pip install -e . --install-option="--cudatests"
+  pip install -e .
   popd
   popd
 }
@@ -190,7 +184,6 @@ function test_torch_deploy(){
  pushd ..
  pushd multipy
  ./multipy/runtime/build/test_deploy
- ./multipy/runtime/build/test_deploy_gpu
  popd
  popd
 }
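The lines removed from install_triton formed a fast path that tried a prebuilt nightly wheel before falling back to a source build. Reassembled from the deletions above, that path was:

```bash
# The deleted fast path, reassembled from the removed lines: derive a local
# version tag from the pinned commit and try a prebuilt nightly wheel first.
commit=$(get_pinned_commit triton)            # reads the triton commit pin
short_hash=$(echo "${commit}" | cut -c -10)   # first 10 chars of the pin
pip install "pytorch-triton==2.0.0+${short_hash}" \
    --index-url https://download.pytorch.org/whl/nightly/cpu
```

On the release branch this is dropped entirely, so CI always builds triton from the pinned commit.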
@@ -62,7 +62,7 @@ git --no-pager log --max-count 1
 popd

 # Clone the Builder master repo
-retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
+retry git clone -q https://github.com/pytorch/builder.git -b release/2.0 "$BUILDER_ROOT"
 pushd "$BUILDER_ROOT"
 echo "Using builder from "
 git --no-pager log --max-count 1
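`retry` is defined elsewhere in these scripts; a common shape for such a helper looks like the following (an assumption for illustration, not the repo's exact definition):

```bash
# A typical retry helper (assumption; the real definition is not in this diff).
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry git clone -q https://github.com/pytorch/builder.git -b release/2.0 "$BUILDER_ROOT"
```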
							
								
								
									
.github/ci_commit_pins/triton.txt (vendored, 2 lines changed)

@@ -1 +1 @@
-c8bfe3f548b164f745ada620a560f87f41ab8465
+b8b470bc597c1c5bd03682c09fe3e6b7c53787fd
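Judging from its use in install_triton above, get_pinned_commit amounts to reading this pin file; a sketch (the exact helper definition is not shown in this diff):

```bash
# Sketch of how a pin file is consumed (assumption: get_pinned_commit reads
# .github/ci_commit_pins/<name>.txt, as its usage above suggests).
get_pinned_commit() {
  cat ".github/ci_commit_pins/${1}.txt"
}
get_pinned_commit triton   # -> b8b470bc597c1c5bd03682c09fe3e6b7c53787fd
```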
							
								
								
									
.github/ci_commit_pins/xla.txt (vendored, 2 lines changed)

@@ -1 +1 @@
-d29eb67c27af0f18d4f487d76b86f43b0a69aade
+r2.0
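Note that the xla pin changes from a commit SHA to the branch name r2.0. The consuming code in clone_pytorch_xla (shown earlier) keeps working unchanged, because git checkout accepts either form:

```bash
# git checkout resolves a SHA or a branch name the same way, so switching the
# pin file's contents requires no script change.
pushd xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"   # now checks out r2.0
popd
```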
@@ -1 +1 @@
-pytorch-triton-rocm>=2.0.0.dev
+pytorch-triton-rocm>=2.0.0,<2.1
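The new specifier is stricter in two ways: `>=2.0.0.dev` explicitly named a pre-release and therefore opted pip into development builds, while `>=2.0.0,<2.1` matches only stable releases and caps the series at 2.0.x:

```bash
# Old: ">=2.0.0.dev" opted in to dev pre-releases.
# New: stable 2.0.x releases only.
pip install "pytorch-triton-rocm>=2.0.0,<2.1"
```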
							
								
								
									
.github/scripts/build_triton_wheel.py (vendored, 2 lines changed)

@@ -38,7 +38,7 @@ def build_triton(commit_hash: str, build_conda: bool = False, py_version : Optio
         check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
         if build_conda:
             with open(triton_basedir / "meta.yaml", "w") as meta:
-                print(f"package:\n  name: torchtriton\n  version: 2.0.0+{commit_hash[:10]}\n", file=meta)
+                print("package:\n  name: torchtriton\n  version: 2.0.0\n", file=meta)
                 print("source:\n  path: .\n", file=meta)
                 print("build:\n  string: py{{py}}\n  number: 1\n  script: cd python; "
                       "python setup.py install --single-version-externally-managed --record=record.txt\n", file=meta)
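For clarity, here is what the first two print() calls now write to meta.yaml; the `+<commit hash>` local-version suffix is dropped so the release wheel carries a plain 2.0.0 version:

```bash
# Sketch: the start of the meta.yaml generated by the changed print() calls.
cat > meta.yaml <<'EOF'
package:
  name: torchtriton
  version: 2.0.0

source:
  path: .
EOF
```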
							
								
								
									
.github/scripts/generate_binary_build_matrix.py (vendored, 12 lines changed)

@@ -196,10 +196,11 @@ def generate_wheels_matrix(os: str,
     for python_version in python_versions:
         for arch_version in arches:
             gpu_arch_type = arch_type(arch_version)
-            gpu_arch_version = "" if arch_version == "cpu" or arch_version == "cpu-cxx11-abi" else arch_version
-            # Skip rocm 3.11 binaries for now as the docker image are not correct
-            if python_version == "3.11" and gpu_arch_type == "rocm":
-                continue
+            gpu_arch_version = (
+                ""
+                if arch_version == "cpu" or arch_version == "cpu-cxx11-abi"
+                else arch_version
+            )

             # special 11.7 wheels package without dependencies
             # dependency downloaded via pip install
@@ -226,7 +227,8 @@ def generate_wheels_matrix(os: str,
                         "nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
                         "nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
                         "nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                        "nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'",
+                        "nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
                         "build_name":
                         f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn"
                         .replace(
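The appended triton requirement uses a PEP 508 environment marker, so it only takes effect on x86_64 Linux; the equivalent manual install would be:

```bash
# On non-matching platforms pip treats the marker as unsatisfied and skips
# the requirement entirely.
pip install "triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'"
```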
							
								
								
									
.github/templates/common.yml.j2 (vendored, 2 lines changed)

@@ -8,7 +8,7 @@
 # NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference
 #       the binary builds will check out
 {%- set builder_repo = "pytorch/builder" -%}
-{%- set builder_branch = "main" -%}
+{%- set builder_branch = "release/2.0" -%}

 {%- macro concurrency(build_environment) -%}
 concurrency:
@@ -87,8 +87,8 @@ jobs:
         with:
           name: !{{ config["build_name"] }}
           path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
+      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
       - name: ROCm set GPU_FLAG
         run: |
           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"

@@ -74,8 +74,8 @@ jobs:
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
           echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
+      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
       - name: Install sccache (only for non-forked PRs, and pushes to trunk)
         uses: nick-fields/retry@v2.8.2
         if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}

@@ -62,8 +62,8 @@ jobs:
     steps:
       !{{ common.setup_ec2_windows() }}
       !{{ set_runner_specific_vars() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
+      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
       - name: Populate binary env
         shell: bash
         run: |
@@ -98,8 +98,8 @@ jobs:
         with:
           name: !{{ config["build_name"] }}
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
+      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
       - name: Populate binary env
         shell: bash
         run: |
							
								
								
									
.github/workflows/_android-build-test.yml (vendored, 2 lines changed)

@@ -35,7 +35,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux

@@ -35,7 +35,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/_bazel-build-test.yml (vendored, 2 lines changed)

@@ -35,7 +35,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/_binary-build-linux.yml (vendored, 5 lines changed)

@@ -131,7 +131,7 @@ jobs:
         with:
           github-secret: ${{ secrets.github-token }}
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
       - name: Chown workspace
@@ -145,7 +145,6 @@ jobs:
       - name: Checkout PyTorch to pytorch dir
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -158,7 +157,7 @@ jobs:
       - name: Checkout pytorch/builder to builder dir
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
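The deleted `ref:` expression chose between the PR head SHA and the push SHA; with it removed, the checkout action falls back to its default ref. A bash analogue of that GitHub Actions expression, for illustration only (PR_HEAD_SHA is a hypothetical stand-in variable):

```bash
# Bash analogue of the removed expression:
#   ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
  ref="${PR_HEAD_SHA}"    # hypothetical stand-in for the PR head SHA
else
  ref="${GITHUB_SHA}"     # the pushed commit
fi
```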
							
								
								
									
.github/workflows/_binary-test-linux.yml (vendored, 5 lines changed)

@@ -128,7 +128,7 @@ jobs:
           github-secret: ${{ secrets.github-token }}
         # Setup the environment
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
       - name: Chown workspace
@@ -142,7 +142,6 @@ jobs:
       - name: Checkout PyTorch to pytorch dir
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
       - name: Clean PyTorch checkout
@@ -154,7 +153,7 @@ jobs:
       - name: Checkout pytorch/builder to builder dir
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
							
								
								
									
.github/workflows/_binary-upload.yml (vendored, 2 lines changed)

@@ -96,7 +96,7 @@ jobs:
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           no-sudo: true
							
								
								
									
.github/workflows/_buck-build-test.yml (vendored, 2 lines changed)

@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Set up JDK 8
         uses: actions/setup-java@v3

@@ -22,7 +22,7 @@ jobs:
       docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           submodules: false
           fetch-depth: 1
							
								
								
									
.github/workflows/_docs.yml (vendored, 2 lines changed)

@@ -76,7 +76,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/_ios-build-test.yml (vendored, 2 lines changed)

@@ -38,7 +38,7 @@ jobs:
     steps:
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Populate CI build options
         run: |
							
								
								
									
.github/workflows/_linux-build.yml (vendored, 2 lines changed)

@@ -76,7 +76,7 @@ jobs:
       # checkout because when we run this action we don't *have* a local
       # checkout. In other cases you should prefer a local checkout.
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/_linux-test.yml (vendored, 4 lines changed)

@@ -48,7 +48,7 @@ jobs:
       keep-going: ${{ steps.filter.outputs.keep-going }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           fetch-depth: 1
           submodules: false
@@ -79,7 +79,7 @@ jobs:
               docker exec -it $(docker container ps --format '{{.ID}}') bash

       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/_mac-build.yml (vendored, 2 lines changed)

@@ -79,7 +79,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Set xcode version
         env:
							
								
								
									
.github/workflows/_mac-test.yml (vendored, 4 lines changed)

@@ -47,7 +47,7 @@ jobs:
       is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           fetch-depth: 1
           submodules: false
@@ -96,7 +96,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Download build artifacts
         uses: ./.github/actions/download-build-artifacts
							
								
								
									
.github/workflows/_rocm-test.yml (vendored, 4 lines changed)

@@ -54,7 +54,7 @@ jobs:
       is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           fetch-depth: 1
           submodules: false
@@ -78,7 +78,7 @@ jobs:
     steps:
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           no-sudo: true
							
								
								
									
.github/workflows/_run_android_tests.yml (vendored, 2 lines changed)

@@ -13,7 +13,7 @@ jobs:
     steps:
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup miniconda
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
							
								
								
									
.github/workflows/_win-build.yml (vendored, 2 lines changed)

@@ -68,7 +68,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           no-sudo: true
							
								
								
									
.github/workflows/_win-test.yml (vendored, 4 lines changed)

@@ -36,7 +36,7 @@ jobs:
       is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           fetch-depth: 1
           submodules: false
@@ -119,7 +119,7 @@ jobs:

       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           no-sudo: true
							
								
								
									
.github/workflows/build-triton-wheel.yml (vendored, 14 lines changed)

@@ -38,7 +38,7 @@ jobs:
           github-secret: ${{ secrets.GITHUB_TOKEN }}

       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           submodules: false

@@ -65,9 +65,6 @@ jobs:

           # Determine python executable for given version
           case $PY_VERS in
-          3.7)
-            PYTHON_EXECUTABLE=/opt/python/cp37-cp37m/bin/python
-            ;;
           3.8)
             PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
             ;;
@@ -86,7 +83,8 @@ jobs:
             ;;
           esac

-          docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
+          docker exec -t "${container_name}" yum install -y zlib-devel
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==67.4.0
           docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" /pytorch/.github/scripts/build_triton_wheel.py
           docker exec -t "${container_name}" chown -R 1000.1000 /artifacts

@@ -139,7 +137,7 @@ jobs:
         run: |
             set -ex
             pip install -q awscli
-            s3_dir="${UPLOAD_BUCKET}/whl/nightly/"
+            s3_dir="${UPLOAD_BUCKET}/whl/test/"
             for pkg in "${PKG_DIR}/"*.whl; do
               aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
              done
@@ -162,7 +160,7 @@ jobs:
           github-secret: ${{ secrets.GITHUB_TOKEN }}

       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           submodules: false

@@ -195,7 +193,7 @@ jobs:
         if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
         run: |
           container_name=$(docker container ps --format '{{.ID}}')
-          docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-nightly --label main --no-progress --force"
+          docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-test --label main --no-progress --force"

       - name: Chown artifacts
         run: |
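With uploads redirected from whl/nightly/ to whl/test/ (and the conda user switched from pytorch-nightly to pytorch-test), the resulting wheels would be installed from the test channel. Assuming the bucket is served at download.pytorch.org as the nightly path above suggests:

```bash
# Assumption: "${UPLOAD_BUCKET}/whl/test/" is served at the URL below, by
# analogy with the nightly index used elsewhere in this diff.
pip install pytorch-triton --index-url https://download.pytorch.org/whl/test/
```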
							
								
								
									
.github/workflows/check-labels.yml (vendored, 2 lines changed)

@@ -15,7 +15,7 @@ jobs:
     runs-on: linux.20_04.4x
     steps:
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
         with:
           submodules: false
           fetch-depth: 1
							
								
								
									
.github/workflows/docker-builds.yml (vendored, 2 lines changed)

@@ -62,7 +62,7 @@ jobs:
       # [see note: pytorch repo ref]
       # deep clone (fetch-depth 0) required for git merge-base
       - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
							
								
								
									
.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml (generated, vendored, 12 lines changed)

@@ -829,7 +829,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -841,7 +840,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -933,7 +932,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -945,7 +943,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -1037,7 +1035,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -1049,7 +1046,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -1141,7 +1138,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -1153,7 +1149,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
							
								
								
									
.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml (generated, vendored, 12 lines changed)

@@ -829,7 +829,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -841,7 +840,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -933,7 +932,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -945,7 +943,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -1037,7 +1035,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -1049,7 +1046,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
@@ -1141,7 +1138,6 @@ jobs:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           submodules: recursive
           path: pytorch
           quiet-checkout: true
@@ -1153,7 +1149,7 @@ jobs:
       - name: Checkout pytorch/builder
         uses: malfet/checkout@silent-checkout
         with:
-          ref: main
+          ref: release/2.0
           submodules: recursive
           repository: pytorch/builder
           path: builder
							
								
								
									
										2
									
								
								.github/workflows/generated-linux-binary-manywheel-master.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/generated-linux-binary-manywheel-master.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -47,7 +47,7 @@ jobs: | |||||||
|       DESIRED_PYTHON: "3.8" |       DESIRED_PYTHON: "3.8" | ||||||
|       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn |       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
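The single functional change in this workflow is the `triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'` entry appended to PYTORCH_EXTRA_INSTALL_REQUIREMENTS, so the CUDA 11.7 "with-pypi-cudnn" wheels now declare Triton as a dependency alongside the NVIDIA packages. The clause after the `;` is a standard PEP 508 environment marker; a minimal sketch of how pip treats such a marker-qualified requirement (the quoting is what keeps the marker attached to the requirement):

    # Installs triton 2.0.0 on Linux/x86_64; on any other platform pip skips
    # the requirement because the environment markers do not match.
    pip install "triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'"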
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored): 226 changes
| @ -169,7 +169,7 @@ jobs: | |||||||
|       DESIRED_PYTHON: "3.8" |       DESIRED_PYTHON: "3.8" | ||||||
|       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn |       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
| @ -381,7 +381,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -393,7 +392,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -482,7 +481,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -494,7 +492,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -669,7 +667,7 @@ jobs: | |||||||
|       DESIRED_PYTHON: "3.9" |       DESIRED_PYTHON: "3.9" | ||||||
|       build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn |       build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
| @ -881,7 +879,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -893,7 +890,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -982,7 +979,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -994,7 +990,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1169,7 +1165,7 @@ jobs: | |||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn |       build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
| @ -1381,7 +1377,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1393,7 +1388,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1482,7 +1477,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1494,7 +1488,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1669,7 +1663,7 @@ jobs: | |||||||
|       DESIRED_PYTHON: "3.11" |       DESIRED_PYTHON: "3.11" | ||||||
|       build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn |       build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
| @ -1834,3 +1828,203 @@ jobs: | |||||||
|       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} |       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} | ||||||
|       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} |       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} | ||||||
|     uses: ./.github/workflows/_binary-upload.yml |     uses: ./.github/workflows/_binary-upload.yml | ||||||
|  |   manywheel-py3_11-rocm5_3-build: | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     uses: ./.github/workflows/_binary-build-linux.yml | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.3 | ||||||
|  |       GPU_ARCH_VERSION: 5.3 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |       build_name: manywheel-py3_11-rocm5_3 | ||||||
|  |       build_environment: linux-binary-manywheel | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
|  |   manywheel-py3_11-rocm5_3-test:  # Testing | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     needs: manywheel-py3_11-rocm5_3-build | ||||||
|  |     runs-on: linux.rocm.gpu | ||||||
|  |     timeout-minutes: 240 | ||||||
|  |     env: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.3 | ||||||
|  |       GPU_ARCH_VERSION: 5.3 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       SKIP_ALL_TESTS: 1 | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |     steps: | ||||||
|  |       - name: Setup ROCm | ||||||
|  |         uses: ./.github/actions/setup-rocm | ||||||
|  |       - uses: actions/download-artifact@v3 | ||||||
|  |         name: Download Build Artifacts | ||||||
|  |         with: | ||||||
|  |           name: manywheel-py3_11-rocm5_3 | ||||||
|  |           path: "${{ runner.temp }}/artifacts/" | ||||||
|  |       - name: Checkout PyTorch | ||||||
|  |         uses: malfet/checkout@silent-checkout | ||||||
|  |         with: | ||||||
|  |           submodules: recursive | ||||||
|  |           path: pytorch | ||||||
|  |           quiet-checkout: true | ||||||
|  |       - name: Clean PyTorch checkout | ||||||
|  |         run: | | ||||||
|  |           # Remove any artifacts from the previous checkouts | ||||||
|  |           git clean -fxd | ||||||
|  |         working-directory: pytorch | ||||||
|  |       - name: Checkout pytorch/builder | ||||||
|  |         uses: malfet/checkout@silent-checkout | ||||||
|  |         with: | ||||||
|  |           ref: release/2.0 | ||||||
|  |           submodules: recursive | ||||||
|  |           repository: pytorch/builder | ||||||
|  |           path: builder | ||||||
|  |           quiet-checkout: true | ||||||
|  |       - name: Clean pytorch/builder checkout | ||||||
|  |         run: | | ||||||
|  |           # Remove any artifacts from the previous checkouts | ||||||
|  |           git clean -fxd | ||||||
|  |         working-directory: builder | ||||||
|  |       - name: ROCm set GPU_FLAG | ||||||
|  |         run: | | ||||||
|  |           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" | ||||||
|  |       - name: Pull Docker image | ||||||
|  |         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||||
|  |         with: | ||||||
|  |           docker-image: pytorch/manylinux-builder:rocm5.3 | ||||||
|  |       - name: Test Pytorch binary | ||||||
|  |         uses: ./pytorch/.github/actions/test-pytorch-binary | ||||||
|  |       - name: Teardown ROCm | ||||||
|  |         uses: ./.github/actions/teardown-rocm | ||||||
|  |   manywheel-py3_11-rocm5_3-upload:  # Uploading | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     needs: manywheel-py3_11-rocm5_3-test | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.3 | ||||||
|  |       GPU_ARCH_VERSION: 5.3 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |       build_name: manywheel-py3_11-rocm5_3 | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} | ||||||
|  |       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} | ||||||
|  |       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} | ||||||
|  |     uses: ./.github/workflows/_binary-upload.yml | ||||||
|  |   manywheel-py3_11-rocm5_4_2-build: | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     uses: ./.github/workflows/_binary-build-linux.yml | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.4.2 | ||||||
|  |       GPU_ARCH_VERSION: 5.4.2 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |       build_name: manywheel-py3_11-rocm5_4_2 | ||||||
|  |       build_environment: linux-binary-manywheel | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |  | ||||||
|  |   manywheel-py3_11-rocm5_4_2-test:  # Testing | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     needs: manywheel-py3_11-rocm5_4_2-build | ||||||
|  |     runs-on: linux.rocm.gpu | ||||||
|  |     timeout-minutes: 240 | ||||||
|  |     env: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.4.2 | ||||||
|  |       GPU_ARCH_VERSION: 5.4.2 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       SKIP_ALL_TESTS: 1 | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |     steps: | ||||||
|  |       - name: Setup ROCm | ||||||
|  |         uses: ./.github/actions/setup-rocm | ||||||
|  |       - uses: actions/download-artifact@v3 | ||||||
|  |         name: Download Build Artifacts | ||||||
|  |         with: | ||||||
|  |           name: manywheel-py3_11-rocm5_4_2 | ||||||
|  |           path: "${{ runner.temp }}/artifacts/" | ||||||
|  |       - name: Checkout PyTorch | ||||||
|  |         uses: malfet/checkout@silent-checkout | ||||||
|  |         with: | ||||||
|  |           submodules: recursive | ||||||
|  |           path: pytorch | ||||||
|  |           quiet-checkout: true | ||||||
|  |       - name: Clean PyTorch checkout | ||||||
|  |         run: | | ||||||
|  |           # Remove any artifacts from the previous checkouts | ||||||
|  |           git clean -fxd | ||||||
|  |         working-directory: pytorch | ||||||
|  |       - name: Checkout pytorch/builder | ||||||
|  |         uses: malfet/checkout@silent-checkout | ||||||
|  |         with: | ||||||
|  |           ref: release/2.0 | ||||||
|  |           submodules: recursive | ||||||
|  |           repository: pytorch/builder | ||||||
|  |           path: builder | ||||||
|  |           quiet-checkout: true | ||||||
|  |       - name: Clean pytorch/builder checkout | ||||||
|  |         run: | | ||||||
|  |           # Remove any artifacts from the previous checkouts | ||||||
|  |           git clean -fxd | ||||||
|  |         working-directory: builder | ||||||
|  |       - name: ROCm set GPU_FLAG | ||||||
|  |         run: | | ||||||
|  |           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" | ||||||
|  |       - name: Pull Docker image | ||||||
|  |         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||||
|  |         with: | ||||||
|  |           docker-image: pytorch/manylinux-builder:rocm5.4.2 | ||||||
|  |       - name: Test Pytorch binary | ||||||
|  |         uses: ./pytorch/.github/actions/test-pytorch-binary | ||||||
|  |       - name: Teardown ROCm | ||||||
|  |         uses: ./.github/actions/teardown-rocm | ||||||
|  |   manywheel-py3_11-rocm5_4_2-upload:  # Uploading | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     needs: manywheel-py3_11-rocm5_4_2-test | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       BUILDER_ROOT: /builder | ||||||
|  |       PACKAGE_TYPE: manywheel | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm5.4.2 | ||||||
|  |       GPU_ARCH_VERSION: 5.4.2 | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2 | ||||||
|  |       DESIRED_PYTHON: "3.11" | ||||||
|  |       build_name: manywheel-py3_11-rocm5_4_2 | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} | ||||||
|  |       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} | ||||||
|  |       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} | ||||||
|  |     uses: ./.github/workflows/_binary-upload.yml | ||||||
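The block above is newly added: it extends the nightly manywheel matrix with full build/test/upload chains for Python 3.11 ROCm 5.3 and ROCm 5.4.2 wheels, mirroring the existing jobs (reusable `_binary-build-linux.yml` and `_binary-upload.yml` workflows, with the test job running on a `linux.rocm.gpu` runner and `SKIP_ALL_TESTS: 1`). The "ROCm set GPU_FLAG" step publishes the device pass-through flags via `$GITHUB_ENV`; a minimal sketch of how a later step could consume them (the real consumer is the `test-pytorch-binary` action, whose exact docker invocation may differ, so treat this as an illustration only):

    # Step 1 publishes the flags for later steps in the same job:
    echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
    # A later step can then hand the ROCm devices to the container
    # (GPU_FLAG is intentionally unquoted so it splits into separate flags):
    docker run ${GPU_FLAG} --rm pytorch/manylinux-builder:rocm5.4.2 ls /dev/kfd /dev/dri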
.github/workflows/generated-macos-arm64-binary-conda-nightly.yml (generated, vendored): 12 changes
| @ -75,7 +75,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -87,7 +86,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -187,7 +186,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -199,7 +197,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -299,7 +297,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -311,7 +308,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -411,7 +408,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -423,7 +419,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (generated, vendored): 12 changes
| @ -75,7 +75,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -87,7 +86,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -187,7 +186,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -199,7 +197,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -299,7 +297,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -311,7 +308,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -411,7 +408,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -423,7 +419,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
.github/workflows/generated-macos-binary-conda-nightly.yml (generated, vendored): 12 changes
| @ -73,7 +73,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -85,7 +84,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -185,7 +184,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -197,7 +195,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -297,7 +295,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -309,7 +306,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -409,7 +406,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -421,7 +417,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml (generated, vendored): 12 changes
| @ -77,7 +77,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -89,7 +88,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -194,7 +193,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -206,7 +204,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -311,7 +309,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -323,7 +320,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -428,7 +425,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -440,7 +436,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml (generated, vendored): 12 changes
| @ -77,7 +77,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -89,7 +88,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -194,7 +193,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -206,7 +204,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -311,7 +309,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -323,7 +320,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -428,7 +425,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -440,7 +436,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
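Every hunk in these generated workflows repeats the same two edits: the hardcoded `ref:` expression is removed from the `Checkout PyTorch` step (so the action falls back to the default ref of the triggering event), and the `Checkout pytorch/builder` step is re-pinned from `main` to `release/2.0`. A minimal sketch of the two steps after the change, assuming only the job context visible in the hunks (everything else trimmed):

      - name: Checkout PyTorch
        uses: malfet/checkout@silent-checkout
        with:
          # `ref:` removed: the action now checks out the default ref
          # for the event that triggered the workflow.
          submodules: recursive
          path: pytorch
          quiet-checkout: true
      - name: Checkout pytorch/builder
        uses: malfet/checkout@silent-checkout
        with:
          ref: release/2.0  # was `main`; pinned to the 2.0 release branch
          submodules: recursive
          repository: pytorch/builder
          path: builder

Pinning builder to `release/2.0` keeps the packaging scripts in lockstep with the release branch instead of tracking `main`.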
12 changed lines: .github/workflows/generated-macos-binary-wheel-nightly.yml (generated, vendored)
| @ -73,7 +73,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -85,7 +84,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -185,7 +184,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -197,7 +195,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -297,7 +295,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -309,7 +306,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -409,7 +406,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -421,7 +417,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
72 changed lines: .github/workflows/generated-windows-binary-conda-nightly.yml (generated, vendored)
| @ -87,7 +87,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -99,7 +98,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -198,7 +197,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -210,7 +208,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -316,7 +314,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -328,7 +325,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -428,7 +425,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -440,7 +436,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -547,7 +543,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -559,7 +554,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -659,7 +654,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -671,7 +665,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -777,7 +771,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -789,7 +782,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -888,7 +881,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -900,7 +892,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1006,7 +998,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1018,7 +1009,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1118,7 +1109,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1130,7 +1120,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1237,7 +1227,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1249,7 +1238,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1349,7 +1338,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1361,7 +1349,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1467,7 +1455,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1479,7 +1466,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1578,7 +1565,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1590,7 +1576,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1696,7 +1682,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1708,7 +1693,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1808,7 +1793,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1820,7 +1804,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1927,7 +1911,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1939,7 +1922,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2039,7 +2022,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2051,7 +2033,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2157,7 +2139,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2169,7 +2150,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2268,7 +2249,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2280,7 +2260,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2386,7 +2366,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2398,7 +2377,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2498,7 +2477,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2510,7 +2488,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2617,7 +2595,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2629,7 +2606,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2729,7 +2706,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2741,7 +2717,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
6 changed lines: .github/workflows/generated-windows-binary-libtorch-debug-master.yml (generated, vendored)
| @ -86,7 +86,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -98,7 +97,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -201,7 +200,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -213,7 +211,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
72 changed lines: .github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (generated, vendored)
| @ -91,7 +91,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -103,7 +102,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -206,7 +205,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -218,7 +216,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -331,7 +329,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -343,7 +340,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -446,7 +443,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -458,7 +454,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -571,7 +567,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -583,7 +578,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -686,7 +681,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -698,7 +692,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -811,7 +805,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -823,7 +816,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -926,7 +919,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -938,7 +930,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1052,7 +1044,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1064,7 +1055,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1168,7 +1159,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1180,7 +1170,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1295,7 +1285,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1307,7 +1296,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1411,7 +1400,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1423,7 +1411,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1538,7 +1526,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1550,7 +1537,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1654,7 +1641,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1666,7 +1652,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1781,7 +1767,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1793,7 +1778,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1897,7 +1882,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1909,7 +1893,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2024,7 +2008,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2036,7 +2019,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2140,7 +2123,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2152,7 +2134,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2267,7 +2249,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2279,7 +2260,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2383,7 +2364,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2395,7 +2375,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2510,7 +2490,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2522,7 +2501,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2626,7 +2605,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2638,7 +2616,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2753,7 +2731,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2765,7 +2742,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2869,7 +2846,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2881,7 +2857,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
|  | |||||||
							
								
								
									
6  .github/workflows/generated-windows-binary-libtorch-release-master.yml  (generated, vendored)
| @ -86,7 +86,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -98,7 +97,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -201,7 +200,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -213,7 +211,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
|  | |||||||
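Every hunk in these generated Windows binary workflows makes the same two-part change to the checkout pair: the explicit `ref:` expression is dropped from the "Checkout PyTorch" step, and the "Checkout pytorch/builder" step is pinned from `main` to `release/2.0`. A minimal sketch of the resulting steps, assembled from the inputs shown in the hunks (not a complete job; it assumes `malfet/checkout@silent-checkout` behaves like the `actions/checkout` fork it is, defaulting to the ref that triggered the workflow when `ref:` is omitted):

      - name: Checkout PyTorch
        uses: malfet/checkout@silent-checkout
        with:
          # ref: omitted -> the action falls back to the workflow's own triggering ref
          submodules: recursive
          path: pytorch
          quiet-checkout: true
      - name: Checkout pytorch/builder
        uses: malfet/checkout@silent-checkout
        with:
          ref: release/2.0   # previously: main; pins builder to the release branch
          submodules: recursive
          repository: pytorch/builder
          path: builder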
							
								
								
									
72  .github/workflows/generated-windows-binary-libtorch-release-nightly.yml  (generated, vendored)
| @ -91,7 +91,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -103,7 +102,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -206,7 +205,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -218,7 +216,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -331,7 +329,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -343,7 +340,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -446,7 +443,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -458,7 +454,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -571,7 +567,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -583,7 +578,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -686,7 +681,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -698,7 +692,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -811,7 +805,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -823,7 +816,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -926,7 +919,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -938,7 +930,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1052,7 +1044,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1064,7 +1055,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1168,7 +1159,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1180,7 +1170,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1295,7 +1285,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1307,7 +1296,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1411,7 +1400,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1423,7 +1411,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1538,7 +1526,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1550,7 +1537,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1654,7 +1641,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1666,7 +1652,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1781,7 +1767,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1793,7 +1778,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1897,7 +1882,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1909,7 +1893,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2024,7 +2008,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2036,7 +2019,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2140,7 +2123,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2152,7 +2134,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2267,7 +2249,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2279,7 +2260,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2383,7 +2364,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2395,7 +2375,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2510,7 +2490,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2522,7 +2501,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2626,7 +2605,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2638,7 +2616,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2753,7 +2731,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2765,7 +2742,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2869,7 +2846,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2881,7 +2857,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
|  | |||||||
							
								
								
									
72  .github/workflows/generated-windows-binary-wheel-nightly.yml  (generated, vendored)
| @ -87,7 +87,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -99,7 +98,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -198,7 +197,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -210,7 +208,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -316,7 +314,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -328,7 +325,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -428,7 +425,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -440,7 +436,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -547,7 +543,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -559,7 +554,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -659,7 +654,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -671,7 +665,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -777,7 +771,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -789,7 +782,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -888,7 +881,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -900,7 +892,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1006,7 +998,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1018,7 +1009,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1118,7 +1109,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1130,7 +1120,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1237,7 +1227,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1249,7 +1238,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1349,7 +1338,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1361,7 +1349,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1467,7 +1455,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1479,7 +1466,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1578,7 +1565,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1590,7 +1576,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1696,7 +1682,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1708,7 +1693,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1808,7 +1793,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1820,7 +1804,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -1927,7 +1911,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -1939,7 +1922,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2039,7 +2022,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2051,7 +2033,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2157,7 +2139,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2169,7 +2150,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2268,7 +2249,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2280,7 +2260,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2386,7 +2366,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2398,7 +2377,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2498,7 +2477,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2510,7 +2488,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2617,7 +2595,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2629,7 +2606,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
| @ -2729,7 +2706,6 @@ jobs: | |||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           path: pytorch |           path: pytorch | ||||||
|           quiet-checkout: true |           quiet-checkout: true | ||||||
| @ -2741,7 +2717,7 @@ jobs: | |||||||
|       - name: Checkout pytorch/builder |       - name: Checkout pytorch/builder | ||||||
|         uses: malfet/checkout@silent-checkout |         uses: malfet/checkout@silent-checkout | ||||||
|         with: |         with: | ||||||
|           ref: main |           ref: release/2.0 | ||||||
|           submodules: recursive |           submodules: recursive | ||||||
|           repository: pytorch/builder |           repository: pytorch/builder | ||||||
|           path: builder |           path: builder | ||||||
|  | |||||||
.github/workflows/lint.yml (4 changes, vendored)
| @ -106,7 +106,7 @@ jobs: | |||||||
|     if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') |     if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') | ||||||
|     steps: |     steps: | ||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master |         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0 | ||||||
|         with: |         with: | ||||||
|           submodules: false |           submodules: false | ||||||
|           fetch-depth: -1 |           fetch-depth: -1 | ||||||
| @ -216,7 +216,7 @@ jobs: | |||||||
|       # [see note: pytorch repo ref] |       # [see note: pytorch repo ref] | ||||||
|       # deep clone (fetch-depth 0) required, to allow us to use git log |       # deep clone (fetch-depth 0) required, to allow us to use git log | ||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master |         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0 | ||||||
|         with: |         with: | ||||||
|           submodules: false |           submodules: false | ||||||
|           fetch-depth: 1 |           fetch-depth: 1 | ||||||
|  | |||||||
.github/workflows/update_pytorch_labels.yml (2 changes, vendored)
| @ -14,7 +14,7 @@ jobs: | |||||||
|     if: ${{ github.repository == 'pytorch/pytorch' }} |     if: ${{ github.repository == 'pytorch/pytorch' }} | ||||||
|     steps: |     steps: | ||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master |         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0 | ||||||
|         with: |         with: | ||||||
|           fetch-depth: 1 |           fetch-depth: 1 | ||||||
|           submodules: false |           submodules: false | ||||||
|  | |||||||
.github/workflows/upload-test-stats.yml (2 changes, vendored)
| @ -37,7 +37,7 @@ jobs: | |||||||
|         run: echo "${TRIGGERING_WORKFLOW}" |         run: echo "${TRIGGERING_WORKFLOW}" | ||||||
|  |  | ||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master |         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0 | ||||||
|  |  | ||||||
|       - run: | |       - run: | | ||||||
|           pip3 install requests==2.26 |           pip3 install requests==2.26 | ||||||
|  | |||||||
Dockerfile (13 changes)
| @ -7,11 +7,11 @@ | |||||||
| # | # | ||||||
| #       For reference: | #       For reference: | ||||||
| #           https://docs.docker.com/develop/develop-images/build_enhancements/ | #           https://docs.docker.com/develop/develop-images/build_enhancements/ | ||||||
| ARG BASE_IMAGE=ubuntu:18.04 | ARG BASE_IMAGE=ubuntu:20.04 | ||||||
| ARG PYTHON_VERSION=3.8 | ARG PYTHON_VERSION=3.8 | ||||||
|  |  | ||||||
| FROM ${BASE_IMAGE} as dev-base | FROM ${BASE_IMAGE} as dev-base | ||||||
| RUN apt-get update && apt-get install -y --no-install-recommends \ | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ | ||||||
|         build-essential \ |         build-essential \ | ||||||
|         ca-certificates \ |         ca-certificates \ | ||||||
|         ccache \ |         ccache \ | ||||||
| @ -82,15 +82,16 @@ ARG TRITON_VERSION | |||||||
| ARG TARGETPLATFORM | ARG TARGETPLATFORM | ||||||
| ARG CUDA_VERSION | ARG CUDA_VERSION | ||||||
| LABEL com.nvidia.volumes.needed="nvidia_driver" | LABEL com.nvidia.volumes.needed="nvidia_driver" | ||||||
| RUN apt-get update && apt-get install -y --no-install-recommends \ | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ | ||||||
|         ca-certificates \ |         ca-certificates \ | ||||||
|         libjpeg-dev \ |         libjpeg-dev \ | ||||||
|         libpng-dev |         libpng-dev \ | ||||||
|  |         && rm -rf /var/lib/apt/lists/* | ||||||
| COPY --from=conda-installs /opt/conda /opt/conda | COPY --from=conda-installs /opt/conda /opt/conda | ||||||
| RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then \ | RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then \ | ||||||
|         apt install -y --no-install-recommends gcc; \ |         DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends gcc; \ | ||||||
|  |         rm -rf /var/lib/apt/lists/*; \ | ||||||
|     fi |     fi | ||||||
| RUN rm -rf /var/lib/apt/lists/* |  | ||||||
| ENV PATH /opt/conda/bin:$PATH | ENV PATH /opt/conda/bin:$PATH | ||||||
| ENV NVIDIA_VISIBLE_DEVICES all | ENV NVIDIA_VISIBLE_DEVICES all | ||||||
| ENV NVIDIA_DRIVER_CAPABILITIES compute,utility | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility | ||||||
|  | |||||||
| @ -25,6 +25,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) { | |||||||
|   OP_DECOMPOSE(feature_dropout_); |   OP_DECOMPOSE(feature_dropout_); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) { | ||||||
|  |     TORCH_CHECK(false, "mutating directly with `.data` under vmap transform is not allowed."); | ||||||
|  | } | ||||||
|  |  | ||||||
| TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { | TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { | ||||||
|   OP_DECOMPOSE2(__and__, Scalar); |   OP_DECOMPOSE2(__and__, Scalar); | ||||||
|   OP_DECOMPOSE2(__and__, Tensor); |   OP_DECOMPOSE2(__and__, Tensor); | ||||||
| @ -327,6 +331,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { | |||||||
|   OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_tensor); |   OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_tensor); | ||||||
|   OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_float); |   OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_float); | ||||||
|  |  | ||||||
|  |   m.impl("_has_compatible_shallow_copy_type", torch::CppFunction::makeFromBoxedFunction<&unsupportedData>()); | ||||||
| } | } | ||||||
|  |  | ||||||
| }} | }} | ||||||
|  | |||||||
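Together with the BatchedTensorImpl overrides further down, this registration turns `.data` access and mutation under vmap into a hard error instead of silent incorrectness. A minimal repro sketch, assuming the 2.0 torch.func frontend (the exact exception text may differ):

import torch
from torch.func import vmap

def f(x):
    x.data = torch.zeros_like(x)  # shallow-copies into x; unsound under vmap
    return x

try:
    vmap(f)(torch.randn(3, 4))
except RuntimeError as err:
    print(err)  # e.g. "mutating directly with `.data` under vmap transform is not allowed."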
| @ -836,6 +836,7 @@ Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t di | |||||||
|   if (index.dim() == 0) { |   if (index.dim() == 0) { | ||||||
|     return index.expand(self_size); |     return index.expand(self_size); | ||||||
|   } |   } | ||||||
|  |   dim = maybe_wrap_dim(dim, self_size.size()); | ||||||
|  |  | ||||||
|   // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1] |   // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1] | ||||||
|   // to reshape index_ |   // to reshape index_ | ||||||
|  | |||||||
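The added `maybe_wrap_dim` call normalizes a negative `dim` before `new_index_shape` is built below. A rough Python equivalent of the helper, simplified from the ATen semantics (not the actual signature):

def maybe_wrap_dim(dim, ndim):
    # map a possibly-negative dim into [0, ndim), e.g. -1 -> ndim - 1
    if not -ndim <= dim < ndim:
        raise IndexError(f"dim {dim} out of range for {ndim}-D input")
    return dim + ndim if dim < 0 else dim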
| @ -103,6 +103,24 @@ const char* BatchedTensorImpl::tensorimpl_type_name() const { | |||||||
|   return "BatchedTensorImpl"; |   return "BatchedTensorImpl"; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach( | ||||||
|  |     const c10::VariableVersion& version_counter, | ||||||
|  |     bool allow_tensor_metadata_change) const { | ||||||
|  |   TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed"); | ||||||
|  |   return nullptr; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach( | ||||||
|  |     c10::VariableVersion&& version_counter, | ||||||
|  |     bool allow_tensor_metadata_change) const { | ||||||
|  |   TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed"); | ||||||
|  |   return nullptr; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void BatchedTensorImpl::shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) { | ||||||
|  |   TORCH_CHECK(false, "mutating directly with `.data` under vmap transform is not allowed."); | ||||||
|  | } | ||||||
|  |  | ||||||
| Tensor makeBatched(const Tensor& tensor, int64_t bdim, int64_t level) { | Tensor makeBatched(const Tensor& tensor, int64_t bdim, int64_t level) { | ||||||
|   DispatchKeySet key_set = getKeysToPropagateToWrapper(tensor); |   DispatchKeySet key_set = getKeysToPropagateToWrapper(tensor); | ||||||
|   auto* batched = maybeGetBatchedImpl(tensor); |   auto* batched = maybeGetBatchedImpl(tensor); | ||||||
|  | |||||||
| @ -71,6 +71,13 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { | |||||||
|   void set_size(int64_t dim, int64_t new_size) override; |   void set_size(int64_t dim, int64_t new_size) override; | ||||||
|   void set_stride(int64_t dim, int64_t new_stride) override; |   void set_stride(int64_t dim, int64_t new_stride) override; | ||||||
|   void set_storage_offset(int64_t storage_offset) override; |   void set_storage_offset(int64_t storage_offset) override; | ||||||
|  |   c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach( | ||||||
|  |     const c10::VariableVersion& version_counter, | ||||||
|  |     bool allow_tensor_metadata_change) const override; | ||||||
|  |   c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach( | ||||||
|  |       c10::VariableVersion&& version_counter, | ||||||
|  |       bool allow_tensor_metadata_change) const override; | ||||||
|  |   void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override; | ||||||
| #ifdef DEBUG | #ifdef DEBUG | ||||||
|   bool has_storage() const override; |   bool has_storage() const override; | ||||||
| #endif | #endif | ||||||
|  | |||||||
| @ -94,13 +94,12 @@ MPSDevice::MPSDevice(): _mtl_device(nil), _mtl_indexing_library(nil)  { | |||||||
|  |  | ||||||
| bool MPSDevice::isMacOS13Plus(MacOSVersion version) const { | bool MPSDevice::isMacOS13Plus(MacOSVersion version) const { | ||||||
|   id mpsCD = NSClassFromString(@"MPSGraph"); |   id mpsCD = NSClassFromString(@"MPSGraph"); | ||||||
|  |   static auto compileOptions = [[[MTLCompileOptions alloc] init] autorelease]; | ||||||
|   static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES; |   static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES; | ||||||
|   static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector( |   static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector( | ||||||
|     sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES; |     sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES; | ||||||
|   static bool _macos_13_2_plus = [mpsCD instancesRespondToSelector:@selector(convolution3DWithSourceTensor:weightsTensor:descriptor:name:)] == YES; |   static bool _macos_13_2_plus = [mpsCD instancesRespondToSelector:@selector(convolution3DWithSourceTensor:weightsTensor:descriptor:name:)] == YES; | ||||||
|   static bool _macos_13_3_plus = NO; |   static bool _macos_13_3_plus = [compileOptions respondsToSelector:@selector(maxTotalThreadsPerThreadgroup)] == YES; | ||||||
|   if (@available(macOS 13.3, *)) |  | ||||||
|     _macos_13_3_plus = YES; |  | ||||||
|  |  | ||||||
|   switch (version) { |   switch (version) { | ||||||
|     case MacOSVersion::MACOS_VER_13_0_PLUS:  return _macos_13_0_plus; |     case MacOSVersion::MACOS_VER_13_0_PLUS:  return _macos_13_0_plus; | ||||||
|  | |||||||
| @ -54,8 +54,6 @@ TORCH_LIBRARY_IMPL(aten, MPS, m) { | |||||||
|   m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |   m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); | ||||||
|   m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |   m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); | ||||||
|   m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |   m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); | ||||||
|   m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |  | ||||||
|   m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |  | ||||||
|   m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in  preprocessing by nn.Unfold |   m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in  preprocessing by nn.Unfold | ||||||
|   m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |   m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); | ||||||
|   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); |   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); | ||||||
|  | |||||||
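With the unconditional CPU fallback for `_fft_c2c`/`_fft_r2c` removed, FFT calls on MPS surface the standard not-implemented error, which mentions the opt-in `PYTORCH_ENABLE_MPS_FALLBACK=1` escape hatch. A sketch, assuming an MPS device and no fallback variable set:

import torch

x = torch.randn(8, device="mps")
try:
    torch.fft.fft(x)
except NotImplementedError as err:
    print(err)  # points at PYTORCH_ENABLE_MPS_FALLBACK=1 for an explicit CPU fallback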
| @ -9,7 +9,6 @@ | |||||||
| #include <ATen/Functions.h> | #include <ATen/Functions.h> | ||||||
| #include <ATen/NativeFunctions.h> | #include <ATen/NativeFunctions.h> | ||||||
| #else | #else | ||||||
| #include <ATen/ops/_mps_max_pool2d.h> |  | ||||||
| #include <ATen/ops/adaptive_avg_pool1d_native.h> | #include <ATen/ops/adaptive_avg_pool1d_native.h> | ||||||
| #include <ATen/ops/adaptive_avg_pool2d.h> | #include <ATen/ops/adaptive_avg_pool2d.h> | ||||||
| #include <ATen/ops/adaptive_max_pool1d_native.h> | #include <ATen/ops/adaptive_max_pool1d_native.h> | ||||||
| @ -141,12 +140,6 @@ Tensor max_pool2d( | |||||||
|     return at::mkldnn_max_pool2d( |     return at::mkldnn_max_pool2d( | ||||||
|         self, kernel_size, stride, padding, dilation, ceil_mode); |         self, kernel_size, stride, padding, dilation, ceil_mode); | ||||||
|   } |   } | ||||||
| #ifdef USE_MPS |  | ||||||
|   if (self.is_mps()) { |  | ||||||
|     return at::_mps_max_pool2d( |  | ||||||
|         self, kernel_size, stride, padding, dilation, ceil_mode); |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
| #if defined(C10_MOBILE) | #if defined(C10_MOBILE) | ||||||
|   if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride, |   if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride, | ||||||
|                              dilation, ceil_mode)) { |                              dilation, ceil_mode)) { | ||||||
|  | |||||||
| @ -1428,7 +1428,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm( | |||||||
|   } |   } | ||||||
| #ifdef USE_MPS | #ifdef USE_MPS | ||||||
|   if (_input.is_mps() && !bidirectional) { |   if (_input.is_mps() && !bidirectional) { | ||||||
|     std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases, |     std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases, | ||||||
|             num_layers, dropout_p, train, bidirectional, batch_first); |             num_layers, dropout_p, train, bidirectional, batch_first); | ||||||
|     std::tuple<Tensor, Tensor, Tensor> return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output)); |     std::tuple<Tensor, Tensor, Tensor> return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output)); | ||||||
|     return return_values; |     return return_values; | ||||||
|  | |||||||
| @ -29,7 +29,7 @@ void spmm_reduce_kernel_impl( | |||||||
|     const Tensor& values, |     const Tensor& values, | ||||||
|     const Tensor& other_) { |     const Tensor& other_) { | ||||||
|  |  | ||||||
|   int64_t nnz = other_.numel(); |   int64_t nnz = values.numel(); | ||||||
|   if (nnz == 0) { |   if (nnz == 0) { | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  | |||||||
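The corrected early-exit counts the stored values of the sparse operand; for a CSR matrix, nnz is `values.numel()`, not the element count of the dense `other`. A small illustration:

import torch

a = torch.tensor([[0., 1., 0.],
                  [2., 0., 3.]]).to_sparse_csr()
other = torch.randn(3, 8)
print(a.values().numel())  # 3  -> the actual nnz, the right emptiness check
print(other.numel())       # 24 -> what the old code inspected by mistake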
| @ -40,6 +40,10 @@ C10_DIAGNOSTIC_POP() | |||||||
| #include <ATen/ops/empty.h> | #include <ATen/ops/empty.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #ifdef __linux__ | ||||||
|  | #include <dlfcn.h> | ||||||
|  | #endif | ||||||
|  |  | ||||||
| namespace at { namespace native { | namespace at { namespace native { | ||||||
|  |  | ||||||
| namespace { | namespace { | ||||||
| @ -62,6 +66,22 @@ uint8_t getAlignment(const Tensor &t) { | |||||||
| } | } | ||||||
|  |  | ||||||
| cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) { | cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) { | ||||||
|  | #if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && CUDNN_MINOR > 5 | ||||||
|  |   // Workaround for a cudnn error-handling deficiency that results in a crash on | ||||||
|  |   // Ubuntu 22.04+ if `libnvrtc.so` is not found on the system, even though it is | ||||||
|  |   // not strictly necessary for the use cases below | ||||||
|  |   // See https://github.com/pytorch/pytorch/issues/97041 | ||||||
|  |   static C10_UNUSED auto cudnn_cnn_infer_handler = [] { | ||||||
|  |     void *handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY); | ||||||
|  |     char *err = dlerror(); | ||||||
|  |     if (!handle) { | ||||||
|  |       TORCH_WARN("Attempt to open cnn_infer failed: handle=", handle, " error: ", err); | ||||||
|  |     } else if (err) { | ||||||
|  |       TORCH_WARN("Applied workaround for CuDNN issue, install nvrtc.so"); | ||||||
|  |     } | ||||||
|  |     return handle; | ||||||
|  |   }(); | ||||||
|  | #endif | ||||||
|   auto sizes = t.sizes(); |   auto sizes = t.sizes(); | ||||||
|   auto strides = t.strides(); |   auto strides = t.strides(); | ||||||
|   bool channels_last = memory_format == at::MemoryFormat::ChannelsLast || |   bool channels_last = memory_format == at::MemoryFormat::ChannelsLast || | ||||||
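The guarded block above dlopens `libcudnn_cnn_infer.so.8` once, via a function-local static, so a missing `libnvrtc.so` shows up as a warning instead of a later crash. A rough Python analogue of the eager-probe pattern (illustrative only, not part of the change):

import ctypes
import os

try:
    # probe the library up front; RTLD_LAZY defers symbol resolution, as in the C++ code
    handle = ctypes.CDLL("libcudnn_cnn_infer.so.8", mode=os.RTLD_LAZY)
except OSError as err:
    print(f"Attempt to open cnn_infer failed: {err}")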
| @ -153,8 +173,9 @@ cudnn_frontend::ExecutionPlan* find(const KeyType& key) { | |||||||
|   return &(it->second); |   return &(it->second); | ||||||
| } | } | ||||||
|  |  | ||||||
| void emplace(const KeyType& key, T& results) { | void update(const KeyType& key, T& results) { | ||||||
|   std::lock_guard<std::mutex> guard(mutex); |   std::lock_guard<std::mutex> guard(mutex); | ||||||
|  |   engine_cache.erase(key); | ||||||
|   engine_cache.emplace(key, std::move(results)); |   engine_cache.emplace(key, std::move(results)); | ||||||
| } | } | ||||||
|  |  | ||||||
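The rename from `emplace` to `update` is behavioral, not cosmetic: map `emplace` is a no-op when the key already exists, so a re-benchmarked plan could never replace a stale cache entry. Erasing first guarantees the insert takes effect; the semantics, sketched in Python:

def update(cache, key, plan):
    cache.pop(key, None)  # erase(key): drop any stale entry first
    cache[key] = plan     # emplace(key, plan) now always succeeds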
| @ -548,7 +569,7 @@ void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKey& key, con | |||||||
|   for (auto & plan : plans) { |   for (auto & plan : plans) { | ||||||
|     try { |     try { | ||||||
|       run_conv_plan(handle, x, y, w, plan); |       run_conv_plan(handle, x, y, w, plan); | ||||||
|       benchmark_cache.emplace(key, plan); |       benchmark_cache.update(key, plan); | ||||||
|       return; |       return; | ||||||
|     } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} |     } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} | ||||||
|       catch (c10::OutOfMemoryError &e) { |       catch (c10::OutOfMemoryError &e) { | ||||||
| @ -562,7 +583,7 @@ void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFuse | |||||||
|   for (auto & plan : plans) { |   for (auto & plan : plans) { | ||||||
|     try { |     try { | ||||||
|       run_conv_plan_fused(handle, x, y, w, z, b, plan); |       run_conv_plan_fused(handle, x, y, w, z, b, plan); | ||||||
|       benchmark_cache_fused.emplace(key, plan); |       benchmark_cache_fused.update(key, plan); | ||||||
|       return; |       return; | ||||||
|     } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} |     } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} | ||||||
|       catch (c10::OutOfMemoryError &e) { |       catch (c10::OutOfMemoryError &e) { | ||||||
| @ -583,7 +604,7 @@ bool try_configs(cudnn_frontend::EngineConfigList& configs, const std::string& o | |||||||
|         continue; |         continue; | ||||||
|       } |       } | ||||||
|       run_conv_plan(handle, x, y, w, plan); |       run_conv_plan(handle, x, y, w, plan); | ||||||
|       benchmark_cache.emplace(key, plan); |       benchmark_cache.update(key, plan); | ||||||
|       return true; |       return true; | ||||||
|     } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} |     } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} | ||||||
|       catch (c10::OutOfMemoryError &e) { |       catch (c10::OutOfMemoryError &e) { | ||||||
| @ -604,7 +625,7 @@ bool try_configs_fused(cudnn_frontend::EngineConfigList& configs, const std::str | |||||||
|         continue; |         continue; | ||||||
|       } |       } | ||||||
|       run_conv_plan_fused(handle, x, y, w, z, b, plan); |       run_conv_plan_fused(handle, x, y, w, z, b, plan); | ||||||
|       benchmark_cache_fused.emplace(key, plan); |       benchmark_cache_fused.update(key, plan); | ||||||
|       return true; |       return true; | ||||||
|     } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} |     } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} | ||||||
|       catch (c10::OutOfMemoryError &e) { |       catch (c10::OutOfMemoryError &e) { | ||||||
|  | |||||||
| @ -138,4 +138,7 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) | |||||||
|                                      nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode |                                      nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode | ||||||
|                                            constantValue:(double) constantValue |                                            constantValue:(double) constantValue | ||||||
|                                                     name:(NSString * _Nullable) name; |                                                     name:(NSString * _Nullable) name; | ||||||
|  | - (MPSGraphTensor * _Nonnull) truncateWithTensor:(MPSGraphTensor * _Nonnull) tensor | ||||||
|  |                                             name:(NSString * _Nullable) name; | ||||||
|  |  | ||||||
| @end | @end | ||||||
|  | |||||||
| @ -265,7 +265,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSS | |||||||
|   id<MTLBuffer> srcBuf = getMTLBufferStorage(src); |   id<MTLBuffer> srcBuf = getMTLBufferStorage(src); | ||||||
|   bool sliceViewTensor = canSliceViewTensor(src, mpsShape); |   bool sliceViewTensor = canSliceViewTensor(src, mpsShape); | ||||||
|   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) |   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) | ||||||
|   if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { |   if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { | ||||||
|      Tensor emptyShell = Tensor(); |      Tensor emptyShell = Tensor(); | ||||||
|     // use "_tensor" from Placeholder to retain view's output during its usage in other ops |     // use "_tensor" from Placeholder to retain view's output during its usage in other ops | ||||||
|     _tensor = gatherViewTensor(src, emptyShell); |     _tensor = gatherViewTensor(src, emptyShell); | ||||||
| @ -289,7 +289,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSS | |||||||
|   } else { |   } else { | ||||||
|     if (!mpsShape) { |     if (!mpsShape) { | ||||||
|       mpsShape = getMPSShape(_tensor); |       mpsShape = getMPSShape(_tensor); | ||||||
|   } |     } | ||||||
|  |  | ||||||
|     _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf |     _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf | ||||||
|                                                       shape:mpsShape |                                                       shape:mpsShape | ||||||
|  | |||||||
| @ -311,11 +311,25 @@ TORCH_IMPL_FUNC(log_softmax_mps_out) ( | |||||||
|  |  | ||||||
|           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); |           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); | ||||||
|  |  | ||||||
|           MPSGraphTensor* softmaxTensor = [mpsGraph softMaxWithTensor:inputTensor |           MPSGraphTensor* maximumsTensor = [mpsGraph reductionMaximumWithTensor:inputTensor | ||||||
|                                                                  axis:dim |                                                                             axis:dim | ||||||
|                                                                  name:nil]; |                                                                             name:nil]; | ||||||
|           MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:softmaxTensor |           MPSGraphTensor* inputTensorSubMax = [mpsGraph subtractionWithPrimaryTensor:inputTensor | ||||||
|                                                                   name:nil]; |                                                                      secondaryTensor:maximumsTensor | ||||||
|  |                                                                                 name:nil]; | ||||||
|  |           MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:inputTensorSubMax | ||||||
|  |                                                                    name:nil]; | ||||||
|  |  | ||||||
|  |           MPSGraphTensor* exponentTensorReduced = [mpsGraph reductionSumWithTensor:exponentTensor | ||||||
|  |                                                                               axis:dim | ||||||
|  |                                                                               name:nil]; | ||||||
|  |  | ||||||
|  |           MPSGraphTensor* logSumExpTensor = [mpsGraph logarithmWithTensor:exponentTensorReduced | ||||||
|  |                                                                     name:nil]; | ||||||
|  |  | ||||||
|  |           MPSGraphTensor* outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensorSubMax | ||||||
|  |                                                                        secondaryTensor:logSumExpTensor | ||||||
|  |                                                                                   name:nil]; | ||||||
|  |  | ||||||
|           newCachedGraph->inputTensor_ = inputTensor; |           newCachedGraph->inputTensor_ = inputTensor; | ||||||
|           newCachedGraph->outputTensor_ = outputTensor; |           newCachedGraph->outputTensor_ = outputTensor; | ||||||
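The rewritten graph computes log-softmax directly via the log-sum-exp identity, log_softmax(x) = (x - max(x)) - log(sum(exp(x - max(x)))), rather than log(softmax(x)), which overflows for large inputs. The same identity in plain tensor ops:

import torch

def log_softmax_stable(x, dim):
    shifted = x - x.max(dim=dim, keepdim=True).values
    return shifted - shifted.exp().sum(dim=dim, keepdim=True).log()

x = torch.tensor([1000., 999., 998.])
print(log_softmax_stable(x, 0))  # finite values, matches torch.log_softmax
print(torch.exp(x).sum().log())  # inf: the naive path overflows in exp()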
| @ -1208,8 +1222,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|   { |   { | ||||||
|     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} |     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} | ||||||
|     MPSGraphTensor *gradOutputTensor_ = nil; |     MPSGraphTensor *gradOutputTensor_ = nil; | ||||||
|     MPSGraphTensor *inputTensor_ = nil; |     MPSGraphTensor *selfOrResultTensor_ = nil; | ||||||
|     MPSGraphTensor *resultTensor_ = nil; |  | ||||||
|     MPSGraphTensor *gradInputTensor_ = nil; |     MPSGraphTensor *gradInputTensor_ = nil; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @ -1218,7 +1231,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|   MPSStream* stream = getCurrentMPSStream(); |   MPSStream* stream = getCurrentMPSStream(); | ||||||
|  |  | ||||||
|   @autoreleasepool { |   @autoreleasepool { | ||||||
|     string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + |     string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" + | ||||||
|                                                  to_string(alpha.to<double>()) + ":" + |                                                  to_string(alpha.to<double>()) + ":" + | ||||||
|                                                  to_string(scale.to<double>()) + ":" + |                                                  to_string(scale.to<double>()) + ":" + | ||||||
|                                                  to_string(input_scale.to<double>()) + ":" + |                                                  to_string(input_scale.to<double>()) + ":" + | ||||||
| @ -1235,18 +1248,14 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|           newCachedGraph = new CachedGraph(mpsGraph); |           newCachedGraph = new CachedGraph(mpsGraph); | ||||||
|  |  | ||||||
|           MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); |           MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); | ||||||
|  |           MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); | ||||||
|           MPSGraphTensor* inputTensor = nil; |  | ||||||
|           MPSGraphTensor* resultTensor = nil; |  | ||||||
|  |  | ||||||
|           MPSGraphTensor* lessThanZeroGradTensor = nil; |           MPSGraphTensor* lessThanZeroGradTensor = nil; | ||||||
|  |  | ||||||
|           if(is_result) { |           if(is_result) { | ||||||
|             resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); |  | ||||||
|             MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>() |             MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>() | ||||||
|                                                                shape:@[@1] |                                                                shape:@[@1] | ||||||
|                                                             dataType:getMPSDataType(grad_output.scalar_type())]; |                                                             dataType:getMPSDataType(grad_output.scalar_type())]; | ||||||
|             MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor |             MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor | ||||||
|                                                                         secondaryTensor:alphaTensor |                                                                         secondaryTensor:alphaTensor | ||||||
|                                                                                    name:nil]; |                                                                                    name:nil]; | ||||||
|             auto constMul = scale.to<double>() * input_scale.to<double>(); |             auto constMul = scale.to<double>() * input_scale.to<double>(); | ||||||
| @ -1258,11 +1267,10 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|                                                                           name:nil]; |                                                                           name:nil]; | ||||||
|           } |           } | ||||||
|           else { |           else { | ||||||
|             inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); |  | ||||||
|             MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>() |             MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>() | ||||||
|                                                                     shape:@[@1] |                                                                     shape:@[@1] | ||||||
|                                                                  dataType:getMPSDataType(grad_output.scalar_type())]; |                                                                  dataType:getMPSDataType(grad_output.scalar_type())]; | ||||||
|             MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor |             MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor | ||||||
|                                                                           secondaryTensor:inputScaleTensor |                                                                           secondaryTensor:inputScaleTensor | ||||||
|                                                                                      name:nil]; |                                                                                      name:nil]; | ||||||
|             MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor |             MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor | ||||||
| @ -1282,7 +1290,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|           MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f |           MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f | ||||||
|                                                               shape:@[@1] |                                                               shape:@[@1] | ||||||
|                                                            dataType:getMPSDataType(grad_output.scalar_type())]; |                                                            dataType:getMPSDataType(grad_output.scalar_type())]; | ||||||
|           MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor |           MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor | ||||||
|                                                                    secondaryTensor:zeroTensor |                                                                    secondaryTensor:zeroTensor | ||||||
|                                                                               name:nil]; |                                                                               name:nil]; | ||||||
|           MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor |           MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor | ||||||
| @ -1294,8 +1302,7 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|                                                                                  name:nil]; |                                                                                  name:nil]; | ||||||
|  |  | ||||||
|           newCachedGraph->gradOutputTensor_ = gradOutputTensor; |           newCachedGraph->gradOutputTensor_ = gradOutputTensor; | ||||||
|           newCachedGraph->inputTensor_ = inputTensor; |           newCachedGraph->selfOrResultTensor_ = selfOrResultTensor; | ||||||
|           newCachedGraph->resultTensor_ = resultTensor; |  | ||||||
|           newCachedGraph->gradInputTensor_ = gradInputTensor; |           newCachedGraph->gradInputTensor_ = gradInputTensor; | ||||||
|         } |         } | ||||||
|         return newCachedGraph; |         return newCachedGraph; | ||||||
| @ -1304,28 +1311,14 @@ TORCH_IMPL_FUNC(elu_backward_out_mps) ( | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp); |     Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp); | ||||||
|     Placeholder selfPlaceholder = Placeholder(); |     Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp); | ||||||
|     Placeholder resultPlaceholder = Placeholder(); |  | ||||||
|     if(is_result) |  | ||||||
|       resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp); |  | ||||||
|     else |  | ||||||
|       selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp); |  | ||||||
|     Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false); |     Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false); | ||||||
|  |  | ||||||
|     // Create dictionary of inputs and outputs |     // Create dictionary of inputs and outputs | ||||||
|     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil; |     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{ | ||||||
|  |       gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), | ||||||
|     if(is_result) |       selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData() | ||||||
|       feeds = @{ |     }; | ||||||
|         gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), |  | ||||||
|         resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() |  | ||||||
|       }; |  | ||||||
|     else |  | ||||||
|       feeds = @{ |  | ||||||
|         gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), |  | ||||||
|         selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() |  | ||||||
|       }; |  | ||||||
|  |  | ||||||
|     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{ |     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{ | ||||||
|       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() |       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() | ||||||
|     }; |     }; | ||||||
| @ -1840,7 +1833,7 @@ std::tuple<Tensor, Tensor> prelu_backward_mps(const Tensor& grad_output, const T | |||||||
|     using namespace mps; |     using namespace mps; | ||||||
|  |  | ||||||
|     Tensor grad_input = at::empty_like(self, self.suggest_memory_format()); |     Tensor grad_input = at::empty_like(self, self.suggest_memory_format()); | ||||||
|     Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous); |     Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous); | ||||||
|     if (grad_output.numel() == 0) { |     if (grad_output.numel() == 0) { | ||||||
|       return std::tuple<Tensor, Tensor>{grad_input, weight_grad}; |       return std::tuple<Tensor, Tensor>{grad_input, weight_grad}; | ||||||
|     } |     } | ||||||
|  | |||||||
| @ -177,10 +177,6 @@ void div_mode_template(const Tensor& self, const Tensor& other, | |||||||
|                        c10::optional<c10::string_view> rounding_mode, |                        c10::optional<c10::string_view> rounding_mode, | ||||||
|                        const Tensor& output, const string op_name) |                        const Tensor& output, const string op_name) | ||||||
| { | { | ||||||
|   if(rounding_mode.has_value() && *rounding_mode == "floor"){ |  | ||||||
|     TORCH_CHECK(self.scalar_type() != ScalarType::Long, |  | ||||||
|                 "MPS: does not support floor_divide op with int64 input"); |  | ||||||
|   } |  | ||||||
|   BinaryOpBlock div_mode_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) { |   BinaryOpBlock div_mode_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) { | ||||||
|     MPSGraph* mpsGraph = cachedGraph->graph(); |     MPSGraph* mpsGraph = cachedGraph->graph(); | ||||||
|     bool isFloatInput = ([primaryCastTensor dataType] & MPSDataTypeFloatBit) != 0; |     bool isFloatInput = ([primaryCastTensor dataType] & MPSDataTypeFloatBit) != 0; | ||||||
|  | |||||||
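Dropping this check (together with the `truncateWithTensor:` declaration added earlier) enables int64 floor division on MPS. A quick check, assuming an MPS-capable machine:

import torch

a = torch.tensor([7, -7], device="mps")
b = torch.tensor([2, 2], device="mps")
print(torch.div(a, b, rounding_mode="floor"))  # tensor([ 3, -4], device='mps:0')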
| @ -12,7 +12,7 @@ Tensor& fill_scalar_mps_impl(Tensor& self, const Scalar& value) { | |||||||
|   } |   } | ||||||
|   Tensor output = self; |   Tensor output = self; | ||||||
|   bool needsCopyToOutput = false; |   bool needsCopyToOutput = false; | ||||||
|   if (!self.is_contiguous()) { |   if (!self.is_contiguous() || self.storage_offset()) { | ||||||
|     output = empty_mps(self.sizes(), self.scalar_type(), c10::nullopt, kMPS); |     output = empty_mps(self.sizes(), self.scalar_type(), c10::nullopt, kMPS); | ||||||
|     needsCopyToOutput = true; |     needsCopyToOutput = true; | ||||||
|   } |   } | ||||||
| @ -89,7 +89,7 @@ bool fill_mps_tensor_(Tensor& self, uint8_t value) { | |||||||
|   if (self.is_contiguous()) { |   if (self.is_contiguous()) { | ||||||
|     MPSStream* stream = getCurrentMPSStream(); |     MPSStream* stream = getCurrentMPSStream(); | ||||||
|     auto storage_byte_offset = self.storage_offset() * self.itemsize(); |     auto storage_byte_offset = self.storage_offset() * self.itemsize(); | ||||||
|     stream->fill(mps::getMTLBufferStorage(self), 0, self.nbytes(), storage_byte_offset); |     stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset); | ||||||
|     return true; |     return true; | ||||||
|   } |   } | ||||||
|   return false; |   return false; | ||||||
|  | |||||||
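Both hunks deal with views that carry a nonzero storage offset: scalar fills now route such views through the copy path, and the byte-level fill sizes itself from the underlying storage. A hypothetical repro of the class of issue addressed (behavior assumed, not taken from the commits):

import torch

base = torch.ones(4, device="mps")
view = base[2:]  # contiguous view with a nonzero storage offset
view.fill_(0)    # previously the graph path could mishandle the offset
print(base)      # expected: tensor([1., 1., 0., 0.], device='mps:0')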
| @ -56,15 +56,17 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, | |||||||
|   descriptor_.groups = groups; |   descriptor_.groups = groups; | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor _mps_convolution( | Tensor _mps_convolution_impl( | ||||||
|     const Tensor& input_t, |     const Tensor& input_t, | ||||||
|     const Tensor& weight_t, |     const Tensor& weight_t, | ||||||
|     const c10::optional<Tensor>& bias_opt, |     const c10::optional<Tensor>& bias_opt, | ||||||
|     IntArrayRef padding, |     IntArrayRef padding, | ||||||
|     IntArrayRef stride, |     IntArrayRef stride, | ||||||
|     IntArrayRef dilation, |     IntArrayRef dilation, | ||||||
|     int64_t groups) { |     int64_t groups, | ||||||
|  |     c10::optional<IntArrayRef> input_shape) { | ||||||
|   TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS"); |   TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS"); | ||||||
|  |   TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); | ||||||
|  |  | ||||||
|   namespace native_mps = at::native::mps; |   namespace native_mps = at::native::mps; | ||||||
|   CheckedFrom c = "mps_convolution"; |   CheckedFrom c = "mps_convolution"; | ||||||
| @ -83,6 +85,8 @@ Tensor _mps_convolution( | |||||||
|   auto memory_format = input_t.suggest_memory_format(); |   auto memory_format = input_t.suggest_memory_format(); | ||||||
|   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); |   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); | ||||||
|   auto output_t = at::empty( |   auto output_t = at::empty( | ||||||
|  |                     input_shape.has_value() ? | ||||||
|  |                     input_shape.value() : | ||||||
|                     conv_output_size(input->sizes(), weight->sizes(), |                     conv_output_size(input->sizes(), weight->sizes(), | ||||||
|                                      padding, stride, dilation), |                                      padding, stride, dilation), | ||||||
|                     input->scalar_type(), |                     input->scalar_type(), | ||||||
| @ -237,21 +241,30 @@ Tensor _mps_convolution( | |||||||
|   return *output; |   return *output; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | Tensor _mps_convolution( | ||||||
|  |     const Tensor& input_t, | ||||||
|  |     const Tensor& weight_t, | ||||||
|  |     const c10::optional<Tensor>& bias_opt, | ||||||
|  |     IntArrayRef padding, | ||||||
|  |     IntArrayRef stride, | ||||||
|  |     IntArrayRef dilation, | ||||||
|  |     int64_t groups) { | ||||||
|  |     return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt); | ||||||
|  | } | ||||||
|  |  | ||||||
| Tensor mps_convolution_backward_input( | Tensor mps_convolution_backward_input( | ||||||
|     IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_, |     IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, | ||||||
|     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { |     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { | ||||||
|   namespace native_mps = at::native::mps; |   namespace native_mps = at::native::mps; | ||||||
|   using namespace mps; |   using namespace mps; | ||||||
|  |   TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); | ||||||
|   CheckedFrom c = "mps_convolution_backward_input"; |   CheckedFrom c = "mps_convolution_backward_input"; | ||||||
|   TensorArg grad_output{ grad_output_, "grad_output", 1 }, |   TensorArg grad_output{ grad_output_t, "grad_output", 1 }, | ||||||
|             weight{ weight_, "weight", 2 }; |             weight{ weight_t, "weight", 2 }; | ||||||
|   checkAllSameType(c, {grad_output, weight}); |   checkAllSameType(c, {grad_output, weight}); | ||||||
|   checkAllSameGPU(c, {grad_output, weight}); |   checkAllSameGPU(c, {grad_output, weight}); | ||||||
|   auto memory_format = grad_output_.suggest_memory_format(); |   auto memory_format = grad_output_t.suggest_memory_format(); | ||||||
|   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); |   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); | ||||||
|   Tensor grad_output_t = grad_output_.contiguous(memory_format); |  | ||||||
|   Tensor weight_t = weight_.contiguous(memory_format); |  | ||||||
|   MPSShape* weightShape = getMPSShape(weight_); |  | ||||||
|   auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt); |   auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt); | ||||||
|  |  | ||||||
|   // Avoid "grad_input" when this is being used as transposed convolution |   // Avoid "grad_input" when this is being used as transposed convolution | ||||||
| @ -327,10 +340,10 @@ Tensor mps_convolution_backward_input( | |||||||
|           } |           } | ||||||
|  |  | ||||||
|           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape); |           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape); | ||||||
|           MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape); |           MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); | ||||||
|  |  | ||||||
|           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; |           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; | ||||||
|           if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { |           if (is_channels_last) { | ||||||
|             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); |             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); | ||||||
|           } |           } | ||||||
|           MPSGraphTensor* gradInputTensor; |           MPSGraphTensor* gradInputTensor; | ||||||
| @ -359,7 +372,7 @@ Tensor mps_convolution_backward_input( | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape); |     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape); | ||||||
|     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape); |     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); | ||||||
|     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); |     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); | ||||||
|  |  | ||||||
|     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{ |     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{ | ||||||
| @ -377,17 +390,15 @@ Tensor mps_convolution_backward_input( | |||||||
| } | } | ||||||
|  |  | ||||||
| Tensor mps_convolution_backward_weights( | Tensor mps_convolution_backward_weights( | ||||||
|     IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_, |     IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, | ||||||
|     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { |     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { | ||||||
|   namespace native_mps = at::native::mps; |   namespace native_mps = at::native::mps; | ||||||
|   using namespace mps; |   using namespace mps; | ||||||
|  |   TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); | ||||||
|   CheckedFrom c = "mps_convolution_backward_weights"; |   CheckedFrom c = "mps_convolution_backward_weights"; | ||||||
|   auto memory_format = input_.suggest_memory_format(); |   auto memory_format = grad_output_t.suggest_memory_format(); | ||||||
|   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); |   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); | ||||||
|  |  | ||||||
|   auto grad_output_t = grad_output_.to(memory_format); |  | ||||||
|   auto input_t = input_.to(memory_format); |  | ||||||
|  |  | ||||||
|   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format); |   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format); | ||||||
|  |  | ||||||
|   // For uniformity with everything else, although it seems grad_weight |   // For uniformity with everything else, although it seems grad_weight | ||||||
| @ -475,7 +486,7 @@ Tensor mps_convolution_backward_weights( | |||||||
|           MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); |           MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); | ||||||
|  |  | ||||||
|           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; |           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; | ||||||
|           if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { |           if (is_channels_last) { | ||||||
|             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); |             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); | ||||||
|           } |           } | ||||||
|  |  | ||||||
| @ -525,12 +536,9 @@ Tensor mps_convolution_backward_weights( | |||||||
| } | } | ||||||
|  |  | ||||||
| std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward( | std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward( | ||||||
|     const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, |     const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, | ||||||
|     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, |     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, | ||||||
|     std::array<bool,3> output_mask) { |     std::array<bool,3> output_mask) { | ||||||
|  |  | ||||||
|   Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); |  | ||||||
|  |  | ||||||
|   Tensor grad_input, grad_weight, grad_bias; |   Tensor grad_input, grad_weight, grad_bias; | ||||||
|   if (input.numel() == 0) { |   if (input.numel() == 0) { | ||||||
|     if (output_mask[0]) { |     if (output_mask[0]) { | ||||||
| @ -576,10 +584,10 @@ Tensor _mps_convolution_transpose( | |||||||
| Tensor mps_convolution_transpose_backward_input( | Tensor mps_convolution_transpose_backward_input( | ||||||
|     const Tensor& grad_output_t, const Tensor& weight_t, |     const Tensor& grad_output_t, const Tensor& weight_t, | ||||||
|     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, |     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, | ||||||
|     int64_t groups) |     int64_t groups, IntArrayRef input_shape) | ||||||
| { | { | ||||||
|   return at::_mps_convolution( |   return _mps_convolution_impl( | ||||||
|     grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups); |     grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape); | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor mps_convolution_transpose_backward_weight( | Tensor mps_convolution_transpose_backward_weight( | ||||||
| @ -595,15 +603,12 @@ Tensor mps_convolution_transpose_backward_weight( | |||||||
|  |  | ||||||
|  |  | ||||||
| std::tuple<Tensor,Tensor> mps_convolution_transpose_backward( | std::tuple<Tensor,Tensor> mps_convolution_transpose_backward( | ||||||
|     const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, |     const Tensor& input, const Tensor& grad_output, const Tensor& weight, | ||||||
|     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, |     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, | ||||||
|     std::array<bool,2> output_mask) { |     std::array<bool,2> output_mask) { | ||||||
|  |  | ||||||
|   Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); |  | ||||||
|  |  | ||||||
|   Tensor grad_input, grad_weight; |   Tensor grad_input, grad_weight; | ||||||
|   if (output_mask[0]) { |   if (output_mask[0]) { | ||||||
|     grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups); |     grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes()); | ||||||
|   } |   } | ||||||
|   if (output_mask[1]) { |   if (output_mask[1]) { | ||||||
|     grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); |     grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); | ||||||
|  | |||||||
| @ -251,8 +251,11 @@ static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, boo | |||||||
|   bool returnGatherOutput = dst_.is_contiguous(); |   bool returnGatherOutput = dst_.is_contiguous(); | ||||||
|   Tensor src; |   Tensor src; | ||||||
|   auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); |   auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); | ||||||
|  |   const bool sameDataType = src_.dtype() == dst_.dtype(); | ||||||
|  |  | ||||||
|   if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) { |   if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) || | ||||||
|  |       // the copy_cast path requires storage_offset to be applied before casting | ||||||
|  |       (src_.storage_offset() && !sameDataType)) { | ||||||
|     Tensor emptyShell = Tensor(); |     Tensor emptyShell = Tensor(); | ||||||
|     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell); |     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell); | ||||||
|  |  | ||||||
| @ -282,7 +285,7 @@ static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, boo | |||||||
|   src._set_neg(src_.is_neg()); |   src._set_neg(src_.is_neg()); | ||||||
|  |  | ||||||
|   const size_t src_size = src.nbytes(); |   const size_t src_size = src.nbytes(); | ||||||
|   if (src.dtype() == dst_.dtype()) { |   if (sameDataType) { | ||||||
|     MPSStream* stream = getCurrentMPSStream(); |     MPSStream* stream = getCurrentMPSStream(); | ||||||
|     // for GPU to GPU copies we only encode to stream's command buffer (no flushing) |     // for GPU to GPU copies we only encode to stream's command buffer (no flushing) | ||||||
|     stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset); |     stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset); | ||||||
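The new condition routes dtype-converting copies out of offset views through the gather path, because the copy_cast kernel cannot apply the storage offset itself. A small sketch of the case being guarded, with CPU standing in for MPS:

    import torch

    src = torch.arange(10, dtype=torch.float32)
    view = src[3:7]                      # storage_offset() == 3
    dst = torch.empty(4, dtype=torch.float16)
    dst.copy_(view)                      # the cast must honor the offset
    assert torch.equal(dst, view.half())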
| @ -297,22 +300,27 @@ at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking) | |||||||
|   TORCH_CHECK(dst.defined(), "dst is undefined"); |   TORCH_CHECK(dst.defined(), "dst is undefined"); | ||||||
|   TORCH_CHECK(src.defined(), "src is undefined"); |   TORCH_CHECK(src.defined(), "src is undefined"); | ||||||
|  |  | ||||||
|  |   bool needs_broadcasting = false; | ||||||
|  |  | ||||||
|   if (src.numel() == 0 || dst.is_same(src)) { |   if (src.numel() == 0 || dst.is_same(src)) { | ||||||
|     return dst; |     return dst; | ||||||
|   } |   } | ||||||
|   if (dst.numel() == 0) { |   if (dst.numel() == 0) { | ||||||
|     dst.resize_as_(src); |     dst.resize_as_(src); | ||||||
|   } |   } | ||||||
|  |   if (dst.dim() > src.dim()) { | ||||||
|  |     needs_broadcasting = true; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { |   if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { | ||||||
|     return copy_from_mps_(dst, src, non_blocking); |     return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); | ||||||
|   } |   } | ||||||
|   if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { |   if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { | ||||||
|     return copy_to_mps_(dst, src, non_blocking); |     return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { |   if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { | ||||||
|     return copy_kernel_mps(dst, src, non_blocking); |     return copy_kernel_mps(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); | ||||||
|   } |   } | ||||||
|   TORCH_INTERNAL_ASSERT( |   TORCH_INTERNAL_ASSERT( | ||||||
|       src.device().type() == DeviceType::MPS, |       src.device().type() == DeviceType::MPS, | ||||||
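`mps_copy_` now expands a lower-rank source before dispatching, matching copy_'s broadcasting semantics on the other backends. Illustrative sketch (CPU shown; the MPS path applies the same expand_as):

    import torch

    dst = torch.empty(2, 3)
    src = torch.tensor([1.0, 2.0, 3.0])  # src.dim() < dst.dim()
    dst.copy_(src)                       # handled as src.expand_as(dst)
    assert torch.equal(dst, src.expand_as(dst))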
|  | |||||||
| @ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps( | |||||||
|  |  | ||||||
|             MPSGraphTensor* reshapedIndicesTensor = indicesTensor; |             MPSGraphTensor* reshapedIndicesTensor = indicesTensor; | ||||||
|  |  | ||||||
|  |             MPSGraphTensor* castGradTensor = incomingGradTensor; | ||||||
|  |             MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type()); | ||||||
|  |             // issue 105486100, scatterNDWithUpdatesTensor produces incorrect results for float16 | ||||||
|  |             if (dataType == MPSDataTypeFloat16) { | ||||||
|  |               castGradTensor = [mpsGraph castTensor: incomingGradTensor | ||||||
|  |                                              toType: MPSDataTypeFloat32 | ||||||
|  |                                                name: @"castGradTensor"]; | ||||||
|  |             } | ||||||
|             if (num_indices_dims != 0) { |             if (num_indices_dims != 0) { | ||||||
|               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor |               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor | ||||||
|                                                                axes: @[@-1] |                                                                axes: @[@-1] | ||||||
|                                                                name: nil]; |                                                                name: nil]; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor |             auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor | ||||||
|                                                              indicesTensor: reshapedIndicesTensor |                                                              indicesTensor: reshapedIndicesTensor | ||||||
|                                                                      shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape)) |                                                                      shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape)) | ||||||
|                                                            batchDimensions: 0 |                                                            batchDimensions: 0 | ||||||
|                                                                       mode: MPSGraphScatterModeAdd |                                                                       mode: MPSGraphScatterModeAdd | ||||||
|                                                                       name: @"edb"]; |                                                                       name: @"edb"]; | ||||||
|  |             if (dataType == MPSDataTypeFloat16) { | ||||||
|  |               outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor | ||||||
|  |                                                  toType: MPSDataTypeFloat16 | ||||||
|  |                                                    name: @"castGradTensor"]; | ||||||
|  |             } | ||||||
|             newCachedGraph->incomingGradTensor_ = incomingGradTensor; |             newCachedGraph->incomingGradTensor_ = incomingGradTensor; | ||||||
|             newCachedGraph->indicesTensor_ = indicesTensor; |             newCachedGraph->indicesTensor_ = indicesTensor; | ||||||
|             newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; |             newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; | ||||||
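The workaround above accumulates the scatter in float32 and casts back, since scatterNDWithUpdatesTensor miscomputes in half precision. A CPU analogue of the accumulate-in-fp32 pattern; the tensor names are illustrative:

    import torch

    idx = torch.tensor([0, 1, 1, 2])
    grad = torch.randn(4, 5, dtype=torch.float16)
    out = torch.zeros(3, 5, dtype=torch.float32)
    out.index_add_(0, idx, grad.float()) # accumulate in fp32, as the graph does
    out = out.half()                     # cast back to the caller's dtype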
|  | |||||||
| @ -609,11 +609,9 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_mps | |||||||
|  |  | ||||||
|     NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; |     NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; | ||||||
|  |  | ||||||
|     string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":" |     string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":" + | ||||||
|                       + std::to_string(train) + ":" |         std::to_string(train) + ":" + std::to_string(has_running_mean) + ":" + std::to_string(has_weight) + ":" + | ||||||
|                       + std::to_string(has_running_mean) + ":" |         [ns_shape_key UTF8String] + ":" + c10::Join(",", grad_input_mask) + ":" + native_mps::getMPSTypeString(input.scalar_type()); | ||||||
|                       + std::to_string(has_weight) + ":" |  | ||||||
|                       + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(input.scalar_type()); |  | ||||||
|     auto input_mps_dtype = native_mps::getMPSDataType(input.scalar_type()); |     auto input_mps_dtype = native_mps::getMPSDataType(input.scalar_type()); | ||||||
|     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key)); |     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key)); | ||||||
|  |  | ||||||
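Folding `grad_input_mask` into the cache key matters because identical shapes with a different mask build different graphs, and a stale cache hit would return the wrong subset of gradients. A hypothetical trigger producing two distinct masks:

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 3, 8, 8, requires_grad=True)
    w = torch.randn(3, requires_grad=True)
    y = F.batch_norm(x, None, None, w, None, training=True)
    gx, = torch.autograd.grad(y.sum(), x, retain_graph=True)  # mask {1,0,0}
    gx2, gw = torch.autograd.grad(y.sum(), (x, w))            # mask {1,1,0}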
|  | |||||||
| @ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output, | |||||||
|   pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW, |   pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW, | ||||||
|                      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); |                      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); | ||||||
|  |  | ||||||
|  |   auto output_memory_format = output.suggest_memory_format(); | ||||||
|   // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors |   // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors | ||||||
|   // by simply restriding them (instead of calling the costly Contiguous()). |   // by simply restriding them (instead of calling the costly Contiguous()). | ||||||
|   if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) { |   if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) { | ||||||
| @ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output, | |||||||
|       outputSizes.insert(outputSizes.begin(), nbatch); |       outputSizes.insert(outputSizes.begin(), nbatch); | ||||||
|     } |     } | ||||||
|     output.resize_(outputSizes); |     output.resize_(outputSizes); | ||||||
|   } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) { |   } else if (output_memory_format == MemoryFormat::ChannelsLast) { | ||||||
|     output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); |     output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); | ||||||
|  |     output_memory_format = MemoryFormat::Contiguous; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) { |   if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) { | ||||||
| @ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output, | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); |     runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); | ||||||
|  |  | ||||||
|  |     if (output_memory_format != suggested_memory_format) { | ||||||
|  |       const_cast<Tensor&>(output) = output.to(suggested_memory_format); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -302,7 +308,7 @@ static void avg_pool2d_template(const Tensor& input, const Tensor& output, | |||||||
|  |  | ||||||
| } // namespace mps | } // namespace mps | ||||||
|  |  | ||||||
| Tensor _mps_max_pool2d( | Tensor mps_max_pool2d( | ||||||
|     const Tensor& input, |     const Tensor& input, | ||||||
|     IntArrayRef kernel_size, |     IntArrayRef kernel_size, | ||||||
|     IntArrayRef stride, |     IntArrayRef stride, | ||||||
| @ -356,6 +362,8 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)( | |||||||
|     const Tensor& output, |     const Tensor& output, | ||||||
|     const Tensor& indices) { |     const Tensor& indices) { | ||||||
|  |  | ||||||
|  |   auto indices_memory_format = indices.suggest_memory_format(); | ||||||
|  |  | ||||||
|   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { |   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { | ||||||
|     MPSGraph* mpsGraph = cachedGraph.graph(); |     MPSGraph* mpsGraph = cachedGraph.graph(); | ||||||
|     NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor |     NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor | ||||||
| @ -366,6 +374,10 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)( | |||||||
|   }; |   }; | ||||||
|   mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride, |   mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride, | ||||||
|                        padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices"); |                        padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices"); | ||||||
|  |  | ||||||
|  |   if (indices_memory_format == MemoryFormat::ChannelsLast) { | ||||||
|  |     const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)( | TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)( | ||||||
|  | |||||||
| @ -139,6 +139,10 @@ void reduction_out_mps( | |||||||
|   MPSReductionType reduction_type, |   MPSReductionType reduction_type, | ||||||
|   const std::string& func_name) { |   const std::string& func_name) { | ||||||
|  |  | ||||||
|  |   // issue 103641234, reduction ops do not have int64 support | ||||||
|  |   if (input_t.scalar_type() == ScalarType::Long) { | ||||||
|  |     TORCH_WARN_ONCE("MPS: no support for int64 reduction ops, casting it to int32"); | ||||||
|  |   } | ||||||
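Since MPS reductions lack int64 kernels, int64 inputs are downcast to int32 with a one-time warning; results stay exact until the true total exceeds the int32 range. Quick sanity check (run on an MPS device to see the warning fire):

    import torch

    t = torch.ones(4, dtype=torch.int64)
    assert t.sum().item() == 4  # exact here; totals beyond 2**31 - 1 may wrap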
|   IntArrayRef input_shape = input_t.sizes(); |   IntArrayRef input_shape = input_t.sizes(); | ||||||
|  |  | ||||||
|   if (opt_dim.has_value()) { |   if (opt_dim.has_value()) { | ||||||
| @ -163,6 +167,9 @@ void reduction_out_mps( | |||||||
|     if (reduction_type == MPSReductionType::PROD) { |     if (reduction_type == MPSReductionType::PROD) { | ||||||
|       output_t.fill_(1); |       output_t.fill_(1); | ||||||
|     } |     } | ||||||
|  |     else if (reduction_type == MPSReductionType::SUM) { | ||||||
|  |       output_t.zero_(); | ||||||
|  |     } | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @ -197,7 +204,10 @@ void reduction_out_mps( | |||||||
|              (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) { |              (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) { | ||||||
|             inputCastDtype = getMPSDataType(dtype.value()); |             inputCastDtype = getMPSDataType(dtype.value()); | ||||||
|           } else if (input_type != MPSDataTypeInt32   && |           } else if (input_type != MPSDataTypeInt32   && | ||||||
|                      input_type != MPSDataTypeFloat32) { |                      input_type != MPSDataTypeFloat32 && | ||||||
|  |                      input_type != MPSDataTypeFloat16) { | ||||||
|  |             inputCastDtype = MPSDataTypeFloat32; | ||||||
|  |           } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) { | ||||||
|             inputCastDtype = MPSDataTypeFloat32; |             inputCastDtype = MPSDataTypeFloat32; | ||||||
|           } |           } | ||||||
|  |  | ||||||
| @ -241,7 +251,7 @@ void reduction_out_mps( | |||||||
|                                                                axes:wrappedAxes |                                                                axes:wrappedAxes | ||||||
|                                                                name:nil]; |                                                                name:nil]; | ||||||
|           } else if (reduction_type == MPSReductionType::TRACE) { |           } else if (reduction_type == MPSReductionType::TRACE) { | ||||||
|             MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor |             MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor | ||||||
|                                                                      numLower:0 |                                                                      numLower:0 | ||||||
|                                                                      numUpper:0 |                                                                      numUpper:0 | ||||||
|                                                                          name:nil]; |                                                                          name:nil]; | ||||||
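The TRACE fix feeds `castInputTensor` rather than the raw input into bandPartWithTensor, so the diagonal extraction runs in the same dtype as the reduction that follows. Behaviorally, trace remains the sum of the main diagonal:

    import torch

    m = torch.arange(9, dtype=torch.int64).reshape(3, 3)
    assert m.trace().item() == 0 + 4 + 8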
| @ -1257,7 +1267,9 @@ Tensor min_max_mps | |||||||
|   (const Tensor& input_t, |   (const Tensor& input_t, | ||||||
|    MPSReductionType reduction_type, |    MPSReductionType reduction_type, | ||||||
|    const std::string& func_name) { |    const std::string& func_name) { | ||||||
|   TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 min/max ops, casting it to int32"); |   if (input_t.scalar_type() == ScalarType::Long) { | ||||||
|  |     TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32"); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   using CachedGraph = MPSUnaryCachedGraph; |   using CachedGraph = MPSUnaryCachedGraph; | ||||||
|  |  | ||||||
|  | |||||||
| @ -233,7 +233,7 @@ Tensor repeat_interleave_mps(const Tensor& repeat_, c10::optional<int64_t> outpu | |||||||
|   if (repeat.scalar_type() == kLong) { |   if (repeat.scalar_type() == kLong) { | ||||||
|     // #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output, |     // #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output, | ||||||
|     // which currently doesn't support int64_t as input. The indices are cast internally to int32_t. |     // which currently doesn't support int64_t as input. The indices are cast internally to int32_t. | ||||||
|     TORCH_WARN_ONCE(false, "MPS: no support for int64 repeats mask, casting it to int32"); |     TORCH_WARN_ONCE("MPS: no support for int64 repeats mask, casting it to int32"); | ||||||
|     repeat = repeat.to(kInt); |     repeat = repeat.to(kInt); | ||||||
|   } |   } | ||||||
|   AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { |   AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { | ||||||
| @ -243,4 +243,4 @@ Tensor repeat_interleave_mps(const Tensor& repeat_, c10::optional<int64_t> outpu | |||||||
|   return output; |   return output; | ||||||
| } | } | ||||||
|  |  | ||||||
| }  // namespace at::native | }  // namespace at::native | ||||||
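As the comment notes, `repeat_interleave_common` sizes its output via cumsum, which lacks int64 support on MPS, so int64 repeat counts are downcast with a warning; the op's semantics are unchanged:

    import torch

    r = torch.tensor([1, 2, 3])          # per-element repeat counts
    out = torch.repeat_interleave(torch.tensor([10, 20, 30]), r)
    assert out.tolist() == [10, 20, 20, 30, 30, 30]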
|  | |||||||
| @ -23,17 +23,31 @@ std::vector<long long> getTensorShape(MPSGraphTensor* mpsTensor) { | |||||||
|     return output_dimensions; |     return output_dimensions; | ||||||
| } | } | ||||||
|  |  | ||||||
| std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { | std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { | ||||||
|     using namespace mps; |     using namespace mps; | ||||||
|  |  | ||||||
|  |     // Projections are not currently supported; raise an error if needed | ||||||
|  |     bool has_projections = (hx[0].size(2) != hx[1].size(2)); | ||||||
|  |     if(has_projections) { | ||||||
|  |         AT_ERROR("LSTM with projections is not currently supported with MPS."); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     TORCH_CHECK(!(!is_macos_13_or_newer() && num_layers > 1), "Multi-layer LSTM support in MPS is available only on macOS 13 onwards"); | ||||||
|  |  | ||||||
|     std::vector<Tensor> kernel_weights; |     std::vector<Tensor> kernel_weights; | ||||||
|     std::vector<Tensor> recurrent_kernel_weights; |     std::vector<Tensor> recurrent_kernel_weights; | ||||||
|     std::vector<Tensor> biases; |     std::vector<Tensor> biases; | ||||||
|     std::vector<Tensor> recurrent_biases; |     std::vector<Tensor> recurrent_biases; | ||||||
|     for (size_t i = 0; i < num_layers; i+=1) { |     for (size_t i = 0; i < num_layers; i+=1) { | ||||||
|         kernel_weights.push_back(params[i*4]); |         if (has_biases) { | ||||||
|         recurrent_kernel_weights.push_back(params[i*4+1]); |             kernel_weights.push_back(params[i*4]); | ||||||
|         biases.push_back(params[i*4+2]); |             recurrent_kernel_weights.push_back(params[i*4+1]); | ||||||
|         recurrent_biases.push_back(params[i*4+3]); |             biases.push_back(params[i*4+2]); | ||||||
|  |             recurrent_biases.push_back(params[i*4+3]); | ||||||
|  |         } else { | ||||||
|  |             kernel_weights.push_back(params[i*2]); | ||||||
|  |             recurrent_kernel_weights.push_back(params[i*2+1]); | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     struct CachedGraph : public MPSCachedGraph { |     struct CachedGraph : public MPSCachedGraph { | ||||||
| @ -44,8 +58,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|       NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil; |       NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil; | ||||||
|       NSMutableArray<MPSGraphTensor*> *biasList_ = nil; |       NSMutableArray<MPSGraphTensor*> *biasList_ = nil; | ||||||
|       NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil; |       NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil; | ||||||
|       std::vector<MPSGraphTensor*> outputCellStateFwdVector_; |  | ||||||
|       std::vector<MPSGraphTensor*> outputZStateVector_; |  | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     MPSGraphCache* cache_ = MPSGraphCache::getInstance(); |     MPSGraphCache* cache_ = MPSGraphCache::getInstance(); | ||||||
| @ -67,12 +79,15 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|             NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; |             NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; | ||||||
|             NSMutableArray<MPSGraphTensor*> *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; |             NSMutableArray<MPSGraphTensor*> *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; | ||||||
|             NSMutableArray<MPSGraphTensor*> *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; |             NSMutableArray<MPSGraphTensor*> *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; | ||||||
|  |             NSMutableArray<MPSGraphTensor*> *layersOutputsList = [[NSMutableArray alloc] initWithCapacity:num_layers]; | ||||||
|  |  | ||||||
|             for (size_t i = 0; i < num_layers; i += 1) { |             for (size_t i = 0; i < num_layers; i += 1) { | ||||||
|                 [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; |                 [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; | ||||||
|                 [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; |                 [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; | ||||||
|                 [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; |                 if(has_biases) { | ||||||
|                 [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; |                     [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; | ||||||
|  |                     [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; |             MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; | ||||||
| @ -93,25 +108,28 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             MPSGraphTensor* inputTensor_ = inputTensor; |             MPSGraphTensor* inputTensor_ = inputTensor; | ||||||
|             MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor |  | ||||||
|                                                         dimension:0 |  | ||||||
|                                                         start:0 |  | ||||||
|                                                         length:1 |  | ||||||
|                                                         name:nil]; |  | ||||||
|             MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor |  | ||||||
|                                                                 dimension:0 |  | ||||||
|                                                                 start:0 |  | ||||||
|                                                                 length:1 |  | ||||||
|                                                                 name:nil]; |  | ||||||
|             NSArray<MPSGraphTensor*>* outputs = nil; |             NSArray<MPSGraphTensor*>* outputs = nil; | ||||||
|             NSMutableArray<MPSGraphTensor*>* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; |             NSMutableArray<MPSGraphTensor*>* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; | ||||||
|             NSMutableArray<MPSGraphTensor*>* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; |             NSMutableArray<MPSGraphTensor*>* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; | ||||||
|             NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; |             NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; | ||||||
|             NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; |             NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; | ||||||
|             for(int i = 0; i < num_layers; i++) { |             for(int i = 0; i < num_layers; i++) { | ||||||
|                 MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] |                 MPSGraphTensor* biasTensor = nil; | ||||||
|                                                                     secondaryTensor:recurrentBiasList[i] |                 if(has_biases) { | ||||||
|                                                                             name:nil]; |                     biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] | ||||||
|  |                                                      secondaryTensor:recurrentBiasList[i] | ||||||
|  |                                                                 name:nil]; | ||||||
|  |                 } | ||||||
|  |                 MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor | ||||||
|  |                                                            dimension:0 | ||||||
|  |                                                                start:i | ||||||
|  |                                                               length:1 | ||||||
|  |                                                                 name:nil]; | ||||||
|  |                 MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor | ||||||
|  |                                                                dimension:0 | ||||||
|  |                                                                    start:i | ||||||
|  |                                                                   length:1 | ||||||
|  |                                                                     name:nil]; | ||||||
|                 outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ |                 outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ | ||||||
|                                         recurrentWeight:recurrentKernelWeightsList[i] |                                         recurrentWeight:recurrentKernelWeightsList[i] | ||||||
|                                             inputWeight:kernelWeightsList[i] |                                             inputWeight:kernelWeightsList[i] | ||||||
| @ -121,18 +139,14 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|                                              descriptor:opDesc |                                              descriptor:opDesc | ||||||
|                                                    name:nil]; |                                                    name:nil]; | ||||||
|  |  | ||||||
|  |  | ||||||
|                 stateTensor_ = [mpsGraph sliceTensor:stateTensor |  | ||||||
|                                                             dimension:0 |  | ||||||
|                                                             start:i |  | ||||||
|                                                             length:1 |  | ||||||
|                                                             name:nil]; |  | ||||||
|                 cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor |  | ||||||
|                                                                     dimension:0 |  | ||||||
|                                                                     start:i |  | ||||||
|                                                                     length:1 |  | ||||||
|                                                                     name:nil]; |  | ||||||
|                 inputTensor_ = [outputs objectAtIndex:0]; |                 inputTensor_ = [outputs objectAtIndex:0]; | ||||||
|  |                 // no need to keep a copy of the final layer's output, as it is | ||||||
|  |                 // returned anyway and not used in backprop | ||||||
|  |                 if(i != num_layers - 1) { | ||||||
|  |                     [layersOutputsList addObject:[mpsGraph expandDimsOfTensor:inputTensor_ | ||||||
|  |                                                                          axis:0 | ||||||
|  |                                                                          name:nil]]; | ||||||
|  |                 } | ||||||
|                 if(dropout_p>0.0 && train && (i!=num_layers-1)) { |                 if(dropout_p>0.0 && train && (i!=num_layers-1)) { | ||||||
|                     inputTensor_ = [mpsGraph dropoutTensor:inputTensor_ |                     inputTensor_ = [mpsGraph dropoutTensor:inputTensor_ | ||||||
|                                                       rate:dropout_p |                                                       rate:dropout_p | ||||||
| @ -150,7 +164,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|                                                             name:nil]]; |                                                             name:nil]]; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             MPSGraphTensor* outputTensor = [outputs objectAtIndex:0]; |             MPSGraphTensor* outputTensor = inputTensor_; | ||||||
|             if (batch_first) { |             if (batch_first) { | ||||||
|                 outputTensor = [mpsGraph transposeTensor:outputTensor |                 outputTensor = [mpsGraph transposeTensor:outputTensor | ||||||
|                                                dimension:0 |                                                dimension:0 | ||||||
| @ -169,8 +183,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|             MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray |             MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray | ||||||
|                                                             dimension:0 |                                                             dimension:0 | ||||||
|                                                             name:nil]; |                                                             name:nil]; | ||||||
|  |             MPSGraphTensor* layersOutputs = (num_layers > 1) | ||||||
|  |                 ? [mpsGraph concatTensors:layersOutputsList dimension:0 name:nil] | ||||||
|  |                 : nil; | ||||||
|  |  | ||||||
|             std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd}; |             std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd, layersOutputs}; | ||||||
|             newCachedGraph->inputTensors_ = inputTensors; |             newCachedGraph->inputTensors_ = inputTensors; | ||||||
|             newCachedGraph->outputTensors_ = outputTensors; |             newCachedGraph->outputTensors_ = outputTensors; | ||||||
|             newCachedGraph->kernelWeightsList_ = kernelWeightsList; |             newCachedGraph->kernelWeightsList_ = kernelWeightsList; | ||||||
| @ -188,20 +205,20 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|       NSMutableArray<MPSGraphTensor*> *biasList = cachedGraph->biasList_; |       NSMutableArray<MPSGraphTensor*> *biasList = cachedGraph->biasList_; | ||||||
|       NSMutableArray<MPSGraphTensor*> *recurrentBiasList = cachedGraph->recurrentBiasList_; |       NSMutableArray<MPSGraphTensor*> *recurrentBiasList = cachedGraph->recurrentBiasList_; | ||||||
|  |  | ||||||
|       Placeholder kernelWeight; |       Placeholder kernelWeight, recurrentKernelWeight, bias, recurrentBias; | ||||||
|       Placeholder recurrentKernelWeight; |  | ||||||
|       Placeholder bias; |  | ||||||
|       Placeholder recurrentBias; |  | ||||||
|       NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease]; |       NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease]; | ||||||
|       for (size_t i = 0; i < num_layers; i+=1) { |       for (size_t i = 0; i < num_layers; i+=1) { | ||||||
|           kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); |           kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); | ||||||
|           recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); |           recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); | ||||||
|           bias = Placeholder([biasList objectAtIndex:i], biases[i]); |  | ||||||
|           recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); |  | ||||||
|           [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; |           [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; | ||||||
|           [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; |           [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; | ||||||
|           [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; |           if(has_biases) { | ||||||
|           [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; |             bias = Placeholder([biasList objectAtIndex:i], biases[i]); | ||||||
|  |             recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); | ||||||
|  |             [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; | ||||||
|  |             [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; | ||||||
|  |           } | ||||||
|  |  | ||||||
|       } |       } | ||||||
|       Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensors_[0], input); |       Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensors_[0], input); | ||||||
| @ -218,6 +235,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|       Tensor cy = at::empty_like(hx[1], input.options()); |       Tensor cy = at::empty_like(hx[1], input.options()); | ||||||
|       Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options()); |       Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options()); | ||||||
|       Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options()); |       Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options()); | ||||||
|  |       Tensor layerOutputs = (num_layers > 1) | ||||||
|  |           ? at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[5])), input.options()) | ||||||
|  |           : at::empty({ 1 }, input.options()); // not used if num_layers == 1 | ||||||
|  |  | ||||||
|       Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output); |       Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output); | ||||||
|       Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy); |       Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy); | ||||||
| @ -225,20 +245,25 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input | |||||||
|       Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState); |       Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState); | ||||||
|       Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd); |       Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd); | ||||||
|  |  | ||||||
|       NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{ |       NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = [@{ | ||||||
|         outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(), |         outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(), | ||||||
|         outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(), |         outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(), | ||||||
|         outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(), |         outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(), | ||||||
|         outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(), |         outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(), | ||||||
|         outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData() |         outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData(), | ||||||
|       }; |       } mutableCopy]; | ||||||
|  |  | ||||||
|  |       if (num_layers > 1) { | ||||||
|  |           Placeholder outputPlaceholder5 = Placeholder(cachedGraph->outputTensors_[5], layerOutputs); | ||||||
|  |           [results setObject:outputPlaceholder5.getMPSGraphTensorData() forKey: outputPlaceholder5.getMPSGraphTensor()]; | ||||||
|  |       } | ||||||
|  |  | ||||||
|       runMPSGraph(stream, cachedGraph->graph(), feeds, results); |       runMPSGraph(stream, cachedGraph->graph(), feeds, results); | ||||||
|       return std::make_tuple(output, hy, cy, zState, cellStateFwd); |       return std::make_tuple(output, hy, cy, zState, cellStateFwd, layerOutputs); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { | std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, const Tensor& layersOutputs, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { | ||||||
|     using namespace mps; |     using namespace mps; | ||||||
     const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] {return Tensor();});
     const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] {return Tensor();});
@@ -250,10 +275,15 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if(has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }

     struct CachedGraph : public MPSCachedGraph {
@@ -264,12 +294,12 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
       NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil;
       NSMutableArray<MPSGraphTensor*> *biasList_ = nil;
       NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradOutput_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradRecWeights_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradWeights_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradBias_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradState_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradCellState_ = nil;
+      MPSGraphTensor* gradOutput_ = nil;
+      MPSGraphTensor* gradState_ = nil;
+      MPSGraphTensor* gradCellState_ = nil;
     };

     MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@@ -296,8 +326,10 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
                     for (size_t i = 0; i < num_layers; i += 1) {
                         [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                         [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                        [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                        [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        if(has_biases) {
+                            [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                            [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        }
                     }

                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input));
@@ -308,8 +340,22 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
                     MPSGraphTensor* gradientCyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_cy.scalar_type()), getMPSShape(grad_cy));
                     MPSGraphTensor* gradientHyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_hy.scalar_type()), getMPSShape(grad_hy));
                     MPSGraphTensor* cellStateFwdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(cell_state_fwd.scalar_type()), getMPSShape(cell_state_fwd));
-                    std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor};
+                    MPSGraphTensor* layersOutputsTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(layersOutputs.scalar_type()), getMPSShape(layersOutputs));
+
+                    std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor, layersOutputsTensor};
+
+                    if (batch_first) {
+                        inputTensor = [mpsGraph transposeTensor: inputTensor
+                                                      dimension: 0
+                                                  withDimension: 1
+                                                           name: nil];
+
+                        gradientTensor = [mpsGraph transposeTensor: gradientTensor
+                                                         dimension: 0
+                                                     withDimension: 1
+                                                              name: nil];
+                    }
+
                     newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList;
                     newCachedGraph->kernelWeightsList_ = kernelWeightsList;
                     newCachedGraph->biasList_ = kernelBiasList;
@@ -325,7 +371,6 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c

                     NSArray<MPSGraphTensor*>* outputs = nil;

-                    NSMutableArray<MPSGraphTensor*>* gradOutputArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradRecWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
@@ -349,9 +394,15 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
                         cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd
                                                     axis:0
                                                     name:nil];
-                        MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                            secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                        MPSGraphTensor* biasTensor = nil;
+                        if(has_biases) {
+                            biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                            secondaryTensor:recurrentBiasList[i]
+                                                            name:nil];
+                        } else {
+                            biasTensor = [mpsGraph constantWithScalar:0.0
+                                                            dataType:inputTensor.dataType];
+                        }

                         MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                                     dimension:0
@@ -375,7 +426,23 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
                                                                             length:1
                                                                             name:nil];

-                        outputs = [mpsGraph LSTMGradientsWithSourceTensor: inputTensor
+                        MPSGraphTensor* iterationInputTensor_ = nil;
+                        if (i == 0) {
+                            iterationInputTensor_ = inputTensor;
+                        } else {
+                            iterationInputTensor_ = [mpsGraph sliceTensor:layersOutputsTensor
+                                                                dimension: 0
+                                                                    // last element in layersOutputsTensor contains
+                                                                    // **inputs** for the last layer
+                                                                    start: i - num_layers
+                                                                   length: 1
+                                                                     name: nil];
+                            iterationInputTensor_ = [mpsGraph squeezeTensor:iterationInputTensor_
+                                                                       axis:0
+                                                                       name: nil];
+                        }
+
+                        outputs = [mpsGraph LSTMGradientsWithSourceTensor: iterationInputTensor_
                                              recurrentWeight: recurrentKernelWeightsList[i]
                                               sourceGradient: gradientTensor_
                                                       zState: zState
@@ -391,24 +458,31 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
                                                   descriptor: opDesc
                                                         name: nil];


                         gradientTensor_ = [outputs objectAtIndex:0];
-                        [gradOutputArray addObject:[outputs objectAtIndex:0]];
-                        [gradRecWeightsArray addObject:[outputs objectAtIndex:1]];
-                        [gradWeightsArray addObject:[outputs objectAtIndex:2]];
-                        [gradBiasArray addObject:[outputs objectAtIndex:3]];
-                        [gradStateArray addObject:[outputs objectAtIndex:4]];
-                        [gradCellStateArray addObject:[outputs objectAtIndex:5]];
+                        [gradRecWeightsArray insertObject:[outputs objectAtIndex:1] atIndex:0];
+                        [gradWeightsArray insertObject:[outputs objectAtIndex:2] atIndex:0];
+                        [gradBiasArray insertObject: [outputs objectAtIndex:3] atIndex:0];
+                        [gradStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:4] axis:0 name:nil]  atIndex:0];
+                        [gradCellStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:5] axis:0 name:nil] atIndex:0];
                     }
                     std::vector<MPSGraphTensor*> outputTensors = {[outputs objectAtIndex:0],[outputs objectAtIndex:1],[outputs objectAtIndex:2],[outputs objectAtIndex:3], [outputs objectAtIndex:4], [outputs objectAtIndex:5]};

+                    if (batch_first) {
+                        MPSGraphTensor* gradientTensorTransposed = [mpsGraph transposeTensor:gradientTensor_
+                                                                                   dimension: 0
+                                                                               withDimension: 1
+                                                                                        name:nil];
+                        newCachedGraph->gradOutput_ = gradientTensorTransposed;
+                    } else {
+                        newCachedGraph->gradOutput_ = gradientTensor_;
+                    }
+
                     newCachedGraph->outputTensors_ = outputTensors;
-                    newCachedGraph->gradOutput_ = gradOutputArray;
                     newCachedGraph->gradRecWeights_ = gradRecWeightsArray;
                     newCachedGraph->gradWeights_ = gradWeightsArray;
                     newCachedGraph->gradBias_ = gradBiasArray;
-                    newCachedGraph->gradState_ = gradStateArray;
-                    newCachedGraph->gradCellState_ = gradCellStateArray;
+                    newCachedGraph->gradState_ = [mpsGraph concatTensors:gradStateArray dimension: 0 name: nil];
+                    newCachedGraph->gradCellState_ = [mpsGraph concatTensors:gradCellStateArray dimension: 0 name: nil];

                 }
                 return newCachedGraph;
             });
@@ -423,6 +497,7 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
         Placeholder cellStateFwdPlaceholder   = Placeholder(cachedGraph->inputTensors_[5], cell_state_fwd);
         Placeholder gradientHyPlaceholder   = Placeholder(cachedGraph->inputTensors_[6], grad_hy);
         Placeholder gradientCyPlaceholder   = Placeholder(cachedGraph->inputTensors_[7], grad_cy);
+        Placeholder layersOutputsPlaceholder   = Placeholder(cachedGraph->inputTensors_[8], layersOutputs);

         NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease];
         [feeds setObject:gradientPlaceholder.getMPSGraphTensorData() forKey:gradientPlaceholder.getMPSGraphTensor()];
@@ -433,6 +508,7 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
         [feeds setObject:cellStatePlaceholder.getMPSGraphTensorData() forKey:cellStatePlaceholder.getMPSGraphTensor()];
         [feeds setObject:zStatePlaceholder.getMPSGraphTensorData() forKey:zStatePlaceholder.getMPSGraphTensor()];
         [feeds setObject:cellStateFwdPlaceholder.getMPSGraphTensorData() forKey:cellStateFwdPlaceholder.getMPSGraphTensor()];
+        [feeds setObject:layersOutputsPlaceholder.getMPSGraphTensorData() forKey:layersOutputsPlaceholder.getMPSGraphTensor()];

         NSMutableArray<MPSGraphTensor*> *kernelWeightsList = cachedGraph->kernelWeightsList_;
         NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_;
@@ -445,68 +521,65 @@ std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(c
         for (size_t i = 0; i < num_layers; i+=1) {
             kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
             recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
             [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
             [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            if(has_biases) {
+                bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+                recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+                [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+                [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            }
         }

-        Tensor output = at::empty_like(input);
-        Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]);
-        Tensor grad_weights = at::empty_like(kernel_weights[0]);
-        Tensor grad_bias = at::empty_like(biases[0]);
-        Tensor grad_state = at::empty_like(hx[0]);
-        Tensor grad_cell_state = at::empty_like(hx[1]);
-        Placeholder outputPlaceholder   = Placeholder(cachedGraph->outputTensors_[0], output);
-        Placeholder gradRecWeightsPlaceholder   = Placeholder(cachedGraph->outputTensors_[1], grad_rec_weights);
-        Placeholder gradWeightsPlaceholder   = Placeholder(cachedGraph->outputTensors_[2], grad_weights);
-        Placeholder gradBiasPlaceholder   = Placeholder(cachedGraph->outputTensors_[3], grad_bias);
-        Placeholder gradStatePlaceholder   = Placeholder(cachedGraph->outputTensors_[4], grad_state);
-        Placeholder gradCellStatePlaceholder   = Placeholder(cachedGraph->outputTensors_[5], grad_cell_state);
+        Tensor output_out = at::empty_like(input);
+        Tensor grad_state_out = at::empty_like(hx[0]);
+        Tensor grad_cell_state_out = at::empty_like(hx[1]);

-        std::vector<Tensor> grad_hx = {grad_state, grad_cell_state};
+        std::vector<Tensor> grad_hx = {grad_state_out, grad_cell_state_out};

         NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *results = [[[NSMutableDictionary alloc] init] autorelease];
-        NSMutableArray<MPSGraphTensor*> *gradOutputArray = cachedGraph->gradOutput_;
         NSMutableArray<MPSGraphTensor*> *gradRecWeightsArray = cachedGraph->gradRecWeights_;
         NSMutableArray<MPSGraphTensor*> *gradWeightsArray = cachedGraph->gradWeights_;
         NSMutableArray<MPSGraphTensor*> *gradBiasArray = cachedGraph->gradBias_;
-        NSMutableArray<MPSGraphTensor*> *gradStateArray = cachedGraph->gradState_;
-        NSMutableArray<MPSGraphTensor*> *gradCellStateArray = cachedGraph->gradCellState_;
-        Placeholder gradOutPlaceholder;
+        MPSGraphTensor* gradOutput = cachedGraph->gradOutput_;
+        MPSGraphTensor* gradState = cachedGraph->gradState_;
+        MPSGraphTensor* gradCellState = cachedGraph->gradCellState_;

+        Placeholder gradStatePlaceholder = Placeholder(gradState, grad_state_out);
+        Placeholder gradCellStatePlaceholder = Placeholder(gradCellState, grad_cell_state_out);
+        Placeholder outputPlaceholder = Placeholder(gradOutput, output_out);
+        [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
+        [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
+        [results setObject:outputPlaceholder.getMPSGraphTensorData() forKey:outputPlaceholder.getMPSGraphTensor()];
+
+        Placeholder gradRecWeightsPlaceholder, gradWeightsPlaceholder, gradBiasPlaceholder;
+
         std::vector<Tensor> weights;
         for (int i = 0; i < num_layers; i++) {
-            Tensor output = at::empty_like(input);
             Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]);
             Tensor grad_weights = at::empty_like(kernel_weights[i]);
-            Tensor grad_bias = at::empty_like(biases[i]);
-            Tensor grad_state = at::empty_like(hx[0]);
-            Tensor grad_cell_state = at::empty_like(hx[1]);
+            Tensor grad_bias = at::empty((kernel_weights[i].size(0)), kernel_weights[i].options());
             weights.push_back(grad_weights);
             weights.push_back(grad_rec_weights);
-            weights.push_back(grad_bias);
-            weights.push_back(grad_bias);
-            gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output);
-            gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights);
-            gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights);
-            gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex:i], grad_bias);
-            gradStatePlaceholder = Placeholder([gradStateArray objectAtIndex:i], grad_state);
-            gradCellStatePlaceholder = Placeholder([gradCellStateArray objectAtIndex:i], grad_cell_state);

-            [results setObject:gradOutPlaceholder.getMPSGraphTensorData() forKey:gradOutPlaceholder.getMPSGraphTensor()];
-            [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
+            if(has_biases) {
+                weights.push_back(grad_bias);
+                weights.push_back(grad_bias);
+            }
+
+            gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex: i], grad_rec_weights);
+            gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex: i], grad_weights);
+            gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex: i], grad_bias);
+
             [results setObject:gradBiasPlaceholder.getMPSGraphTensorData() forKey:gradBiasPlaceholder.getMPSGraphTensor()];
-            [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
-            [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
+            [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
             [results setObject:gradWeightsPlaceholder.getMPSGraphTensorData() forKey:gradWeightsPlaceholder.getMPSGraphTensor()];
         }

         runMPSGraph(stream, cachedGraph->graph(), feeds, results);

-        return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output, grad_hx, weights);
+        return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output_out, grad_hx, weights);

     }
 }

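The has_biases branches above track how PyTorch flattens LSTM parameters: four tensors per layer (input-hidden weights, hidden-hidden weights, and the two bias vectors) when biases are enabled, and only the two weight tensors otherwise. A minimal Python sketch of that indexing, using the private _flat_weights attribute of torch.nn.LSTM purely for illustration:

    import torch

    lstm = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2, bias=True)
    params = list(lstm._flat_weights)  # flat parameter list, as the kernel receives it

    stride = 4 if lstm.bias else 2  # four tensors per layer with biases, two without
    for i in range(lstm.num_layers):
        w_ih = params[i * stride]          # kernel weights
        w_hh = params[i * stride + 1]      # recurrent kernel weights
        if lstm.bias:
            b_ih = params[i * stride + 2]  # kernel bias
            b_hh = params[i * stride + 3]  # recurrent bias
        assert w_ih.size(0) == 4 * lstm.hidden_size

This also suggests why grad_bias above is allocated from kernel_weights[i].size(0): that leading dimension is 4 * hidden_size, the length of each bias vector.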
@@ -35,7 +35,9 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
     indices.copy_(cpu_indices);
     return;
   }
-  TORCH_WARN_ONCE(self.scalar_type() != ScalarType::Long, "MPS: no support for int64 min/max ops, casting it to int32");
+  if (self.scalar_type() == ScalarType::Long) {
+    TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32");
+  }

   MPSStream* stream = getCurrentMPSStream();
   struct CachedGraph : public MPSCachedGraph {

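TORCH_WARN_ONCE takes only message arguments, so the original call formatted the self.scalar_type() != ScalarType::Long expression into the warning text instead of using it as a gate; the rewrite makes the condition an explicit if. Observed from Python, the intent is a single warning when int64 tensors are sorted on MPS; an illustrative snippet, assuming an MPS-enabled build:

    import torch

    if torch.backends.mps.is_available():
        t = torch.tensor([3, 1, 2], dtype=torch.int64, device="mps")
        # Warns once: "MPS: no support for int64 min/max ops, casting it to int32"
        values, indices = torch.sort(t)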
@@ -75,15 +75,20 @@ MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor)
     return inputTensor;
   }

-  MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
-                                                   dataType:inputTensor.dataType];
-  MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
-                                                        secondaryTensor:zeroTensor
-                                                                    name:nil];
-  return [mpsGraph selectWithPredicateTensor:predicateTensor
-                         truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil]
-                        falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil]
-                                        name:nil];
+  if(!is_macos_13_or_newer()) {
+    MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                    dataType:inputTensor.dataType];
+    MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                                          secondaryTensor:zeroTensor
+                                                                      name:nil];
+    return [mpsGraph selectWithPredicateTensor:predicateTensor
+                          truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil]
+                          falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil]
+                                          name:nil];
+  } else {
+    return [mpsGraph truncateWithTensor:inputTensor
+                                   name:nil];
+  }
 };

 } // namespace mps

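Both branches implement round-toward-zero truncation: macOS 13 and newer exposes a native MPSGraph truncate op, while older systems emulate it by taking ceil for negative inputs and floor for everything else. The identity the select encodes, written as a small PyTorch reference:

    import torch

    def trunc_ref(x: torch.Tensor) -> torch.Tensor:
        # Round toward zero: ceil for negative values, floor for the rest.
        return torch.where(x < 0, x.ceil(), x.floor())

    x = torch.tensor([-1.7, -0.2, 0.0, 2.9])
    assert torch.equal(trunc_ref(x), torch.trunc(x))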
@@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input,
   } else {
     native::upsample_2d_common_check(input.sizes(), output_size);
   }
+  Tensor out;
+  if (!output.is_contiguous()) {
+    out = at::empty_like(output, MemoryFormat::Contiguous);
+  }
+
   bool centerResults = false;
   MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
   MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
@@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input,
     MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];

     Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false);

     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
         inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
         outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
 }

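The new out tensor applies a common workaround for kernels that can only write dense memory: when the destination is non-contiguous, compute into a contiguous scratch buffer, then copy back. A Python sketch of the same pattern; the run_into helper is hypothetical and only mirrors the shape of the C++ change:

    import torch

    def run_into(output: torch.Tensor, compute) -> None:
        # Hypothetical helper: `compute` fills a dense tensor in place.
        if output.is_contiguous():
            compute(output)        # the kernel may write the destination directly
        else:
            scratch = torch.empty_like(output, memory_format=torch.contiguous_format)
            compute(scratch)       # compute into dense memory first
            output.copy_(scratch)  # then scatter into the strided destination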
@@ -424,22 +424,54 @@ MPSGraphTensor* asStridedLayer_pattern(MPSGraph *graph, MPSGraphTensor *inputTen
 }

 static
-std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape) {
+std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape, const bool squeeze) {
   bool hasMPSShape = (mpsShape != nil);
   std::vector<int64_t> src_view_shape;
   if (hasMPSShape) {
     int src_ndim_view = [mpsShape count];
-    src_view_shape.resize(src_ndim_view);
-    for (const auto i : c10::irange(src_ndim_view)) {
-      src_view_shape[i] = [mpsShape[i] intValue];
+    if (squeeze) {
+      for (const auto i : c10::irange(src_ndim_view)) {
+        if ([mpsShape[i] intValue] == 1)
+          continue;
+        src_view_shape.emplace_back([mpsShape[i] intValue]);
+      }
+    } else {
+      src_view_shape.resize(src_ndim_view);
+      for (const auto i : c10::irange(src_ndim_view)) {
+        src_view_shape[i] = [mpsShape[i] intValue];
+      }
     }
   } else {
-    src_view_shape = src.sizes().vec();
+    if (squeeze) {
+      IntArrayRef src_shape = src.sizes();
+      size_t src_ndim_view = src_shape.size();
+      for (const auto i : c10::irange(src_ndim_view)) {
+        if (src_shape[i] == 1)
+          continue;
+        src_view_shape.emplace_back(src_shape[i]);
+      }
+    } else {
+      src_view_shape = src.sizes().vec();
+    }
   }

   return src_view_shape;
 }

+
+std::vector<int64_t> getSqueezedBaseShape(const Tensor& src, IntArrayRef shape) {
+  std::vector<int64_t> src_base_shape;
+  for (const auto i : c10::irange(shape.size())) {
+    if (shape[i] == 1)
+      continue;
+    src_base_shape.emplace_back(shape[i]);
+  }
+
+  return src_base_shape;
+}
+
+
 bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
   if (!src.is_contiguous()) {
     return false;
@@ -447,57 +479,79 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {

   IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
   size_t src_ndim_base = src_base_shape.size();
-  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
+  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
   size_t src_ndim_view = src_view_shape.size();

   if (src_ndim_base != src_ndim_view) {
     return false;
   }

   for (const auto i: c10::irange(src_ndim_base)) {
-    if (src_view_shape[i] > src_base_shape[i]) {
-      return false;
-    }
-  }
+     if (src_view_shape[i] > src_base_shape[i]) {
+       return false;
+     }
+   }

   return true;
 }

 MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
   IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
-  int src_ndim_base = src_base_shape.size();
-  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
-  int src_ndim_view = src_view_shape.size();
+  size_t src_ndim_base = src_base_shape.size();
+  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
+  size_t src_ndim_view = src_view_shape.size();

-  TORCH_CHECK(src_ndim_base == src_ndim_view);
-
   MPSNDArray *srcTensorNDArrayView = nil;
   MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
   MPSNDArray *srcTensorNDArray = nil;
   id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
+  int64_t base_idx = 0;
+
+  std::vector<int64_t> src_base_shape_vec;
+
+  if (src_ndim_view != src_ndim_base) {
+    src_base_shape_vec.reserve(src_ndim_view);
+    for (const auto i : c10::irange(src_ndim_view)) {
+      if (src_view_shape[i] == 1 && src_base_shape[base_idx] != 1) {
+        src_base_shape_vec.emplace_back(1);
+      } else {
+        src_base_shape_vec.emplace_back(src_base_shape[base_idx]);
+        if (base_idx < src_ndim_base - 1)
+          base_idx += 1;
+      }
+    }
+    src_base_shape = IntArrayRef(src_base_shape_vec);
+    src_ndim_base = src_base_shape.size();
+  }
+
   srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
   srcTensorNDArrayDesc = srcTensorNDArray.descriptor;

-  int firstDimToSlice = 0;
+  size_t firstDimToSlice = 0;
   while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
     firstDimToSlice++;
   }

-  int view_numel = 1;
+  int64_t view_numel = 1;
   for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
     view_numel *= src_base_shape[i];
   }

-  int sliceOffset = src.storage_offset() / view_numel;
-  // There are cases where both dimensions of a view can shrink
-  // E.g: x = torch.randn((3,6))[1, 1:3]
-  int nextSliceOffset = src.storage_offset() % view_numel;
-
-  [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
-  if (nextSliceOffset) {
-    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
+  int64_t sliceOffset = src.storage_offset() / view_numel;
+  [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice
+                          withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
+
+  // Slice any remaining dimensions
+  for (const auto crtSliceOffset: c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
+    if (src_view_shape[crtSliceOffset] != src_base_shape[crtSliceOffset]) {
+      if (crtSliceOffset == src_base_shape.size() - 1) {
+        sliceOffset = src.storage_offset() % src_base_shape[src_base_shape.size() - 1];
+      } else {
+        sliceOffset = (src.storage_offset() % view_numel) / (view_numel / src_base_shape[crtSliceOffset]);
+      }
+      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - crtSliceOffset
+                              withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[crtSliceOffset])}];
+    }
   }

   srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
                                                            descriptor:srcTensorNDArrayDesc
                                                              aliasing:MPSAliasingStrategyShallAlias];
@@ -696,7 +750,7 @@ const std::string& getGatherScatterScalarType(const Tensor& t) {
     {c10::ScalarType::Int,   "int"},
     {c10::ScalarType::Short, "short"},
     {c10::ScalarType::Char,  "char"},
-    {c10::ScalarType::Byte,  "char"},
+    {c10::ScalarType::Byte,  "uchar"},
     {c10::ScalarType::Bool,  "bool"},
   };

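The slice-offset arithmetic in getMPSGraphTensorDataForView recovers where a view starts inside its base buffer from storage_offset() alone: integer-divide by the number of elements per step of the first sliced dimension, then take remainders for the inner dimensions. Worked through on the example from the deleted comment, x = torch.randn((3,6))[1, 1:3]:

    import torch

    base = torch.randn(3, 6)
    view = base[1, 1:3]        # storage_offset = 1 * 6 + 1 = 7

    view_numel = base.shape[1]                    # elements per step of dim 0: 6
    row = view.storage_offset() // view_numel     # 7 // 6 = 1 -> slice dim 0 at row 1
    col = view.storage_offset() % base.shape[-1]  # 7 %  6 = 1 -> slice dim 1 at column 1

    assert (row, col) == (1, 1)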
@@ -3567,19 +3567,14 @@
 - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor

 - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-
-# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
-# native_functions.yaml
-# https://github.com/pytorch/pytorch/issues/77394
-- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
-    MPS: _mps_max_pool2d
-  autogen: _mps_max_pool2d.out
+    CompositeImplicitAutograd: max_pool2d
+    MPS: mps_max_pool2d

-- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
     MPS: mps_max_pool2d_backward
-  autogen: mps_max_pool2d_backward.out
+  autogen: max_pool2d_backward.out

 - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
@@ -7188,12 +7183,12 @@

 # MPS LSTM implementation

-- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
   dispatch:
     MPS: _lstm_mps
   autogen: _lstm_mps.out

-- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
   dispatch:
     MPS: lstm_mps_backward
   autogen: lstm_mps_backward.out

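With CompositeImplicitAutograd as the default kernel, max_pool2d keeps one public entry point: most backends get the generic decomposition while the MPS key routes to the dedicated kernel, so caller code is identical either way. For example:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 3, 32, 32)
    y_cpu = F.max_pool2d(x, kernel_size=2)  # composite path on CPU

    # On MPS-enabled machines the same call dispatches to the MPS kernel.
    if torch.backends.mps.is_available():
        y_mps = F.max_pool2d(x.to("mps"), kernel_size=2)
        assert torch.allclose(y_cpu, y_mps.cpu())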
@@ -379,16 +379,33 @@ inline bool check_gpu_sm50_or_greater(sdp_params params, bool debug) {
   return true;
 }

-inline bool check_gpu_sm86_head_dim_128(sdp_params params, bool debug) {
+inline bool check_head_dim_gt64_and_sm_ge86(sdp_params params, bool debug) {
   // Memory Efficient Attention is throwing a cuda illegal memory error
-  // on sm86 when head_dim is 128.
+  // on sm86 or newer when head_dim is greater than 64.
   auto dprops = at::cuda::getCurrentDeviceProperties();
-  bool is_sm86 = (dprops->major == 8) && (dprops->minor == 6);
-  if (is_sm86 && (params.query.size(-1) == 128)) {
+  bool is_sm86_or_newer = (dprops->major == 8) && (dprops->minor >= 6);
+  // Categorically disable sm90 as well. Will want to fix this once we have H100s available for testing.
+  is_sm86_or_newer = is_sm86_or_newer || (dprops->major > 8);
+  if (is_sm86_or_newer && (params.query.sym_size(-1) > 64)) {
     if (debug) {
       TORCH_WARN(
-        "Memory Efficient Attention does not currently support head_dim == 128 on sm86",
-        "because it is throwing a cuda illegal memory error on sm86 when head_dim is 128.");
+          "Memory Efficient Attention does not currently support head_dim greater than 64 on sm86 or newer");
+    }
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad_and_head_dim_gt64_and_sm_ge86(
+    sdp_params params,
+    bool debug) {
+  // Flash Attention will raise an error in the backward pass if the head_dim
+  // size is greater than 64 And the device is sm86 or newer.
+  if (!check_requires_grad(params, false) &&
+      !check_head_dim_gt64_and_sm_ge86(params, false)) {
+    if (debug) {
+      TORCH_WARN(
+          "Flash attention currently doesn't support training with head_dim greater than 64 on sm86 or newer.");
     }
     return false;
   }
@@ -422,13 +439,14 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   return false;
 #endif
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints {{
+  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
       check_runtime_disabled_flash,
       check_tensor_shapes,
       check_equal_batch_size_and_num_heads,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
+      check_requires_grad_and_head_dim_gt64_and_sm_ge86,
      check_for_nested_inputs,
       check_for_seq_len_1_nested_tensor}};
   for (auto& constraint : constraints) {
@@ -465,7 +483,7 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       check_equal_batch_size_and_num_heads,
       check_for_attn_mask,
       check_head_dim_size_mem_efficient,
-      check_gpu_sm86_head_dim_128,
+      check_head_dim_gt64_and_sm_ge86,
       check_for_seq_len_1_nested_tensor,
       check_for_non_zero_dropout,
       check_use_deterministic_algorithms}};

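Both use_flash_attention and use_mem_efficient_attention gate their kernels behind an array of predicates that must all pass, which is why adding a constraint also bumps the array size from 8 to 9. The composition pattern, sketched in Python with a stand-in parameter struct and a single illustrative gate:

    from dataclasses import dataclass
    from typing import Callable, List

    @dataclass
    class SdpParams:  # stand-in for the C++ sdp_params struct
        head_dim: int

    def check_head_dim_gt64_and_sm_ge86(params: SdpParams, debug: bool) -> bool:
        # Assume an sm86+ device: reject head_dim > 64, optionally explaining why.
        if params.head_dim > 64:
            if debug:
                print("head_dim greater than 64 is not supported on sm86 or newer")
            return False
        return True

    def can_use_backend(params: SdpParams, debug: bool = False) -> bool:
        constraints: List[Callable[[SdpParams, bool], bool]] = [
            check_head_dim_gt64_and_sm_ge86,
            # ...one entry per gate, mirroring the constexpr std::array...
        ]
        return all(check(params, debug) for check in constraints)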
@@ -630,6 +630,7 @@ macro(cuda_unset_include_and_libraries)
   unset(CUDA_cublas_LIBRARY CACHE)
   unset(CUDA_cublas_device_LIBRARY CACHE)
   unset(CUDA_cublasemu_LIBRARY CACHE)
+  unset(CUDA_cublasLt_LIBRARY CACHE)
   unset(CUDA_cufft_LIBRARY CACHE)
   unset(CUDA_cufftemu_LIBRARY CACHE)
   unset(CUDA_cupti_LIBRARY CACHE)
@@ -963,6 +964,7 @@ endif()

 find_cuda_helper_libs(cufft)
 find_cuda_helper_libs(cublas)
+find_cuda_helper_libs(cublasLt)
 # cusparse showed up in version 3.2
 find_cuda_helper_libs(cusparse)
 find_cuda_helper_libs(curand)
@@ -993,7 +995,7 @@ if (CUDA_BUILD_EMULATION)
   set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
 else()
   set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
+  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
 endif()

 ########################
@@ -1962,7 +1964,7 @@ macro(CUDA_ADD_CUBLAS_TO_TARGET target)
   if (CUDA_BUILD_EMULATION)
     target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublasemu_LIBRARY})
   else()
-    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
   endif()
 endmacro()

@@ -10,8 +10,8 @@ endif

 CUDA_VERSION              = 11.7.0
 CUDNN_VERSION             = 8
-BASE_RUNTIME              = ubuntu:18.04
-BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04
+BASE_RUNTIME              = ubuntu:20.04
+BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu20.04

 # The conda channel to use to install cudatoolkit
 CUDA_CHANNEL              = nvidia

@@ -351,7 +351,7 @@ master_doc = 'index'

 # General information about the project.
 project = 'PyTorch'
-copyright = '2022, PyTorch Contributors'
+copyright = '2023, PyTorch Contributors'
 author = 'PyTorch Contributors'
 torch_version = str(torch.__version__)

| @ -6,13 +6,12 @@ significant speedups the newer your GPU is. | |||||||
|  |  | ||||||
| .. code:: python | .. code:: python | ||||||
|  |  | ||||||
|    from torch._dynamo import optimize |  | ||||||
|    import torch |    import torch | ||||||
|    def fn(x, y): |    def fn(x, y): | ||||||
|        a = torch.cos(x).cuda() |        a = torch.cos(x).cuda() | ||||||
|        b = torch.sin(y).cuda() |        b = torch.sin(y).cuda() | ||||||
|        return a + b |        return a + b | ||||||
|    new_fn = optimize("inductor")(fn) |    new_fn = torch.compile(fn, backend="inductor") | ||||||
|    input_tensor = torch.randn(10000).to(device="cuda:0") |    input_tensor = torch.randn(10000).to(device="cuda:0") | ||||||
|    a = new_fn(input_tensor, input_tensor) |    a = new_fn(input_tensor, input_tensor) | ||||||
|  |  | ||||||
| @ -54,7 +53,7 @@ with the actual generated kernel being | |||||||
|        tmp2 = tl.sin(tmp1) |        tmp2 = tl.sin(tmp1) | ||||||
|        tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask) |        tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask) | ||||||
|  |  | ||||||
| And you can verify that fusing the two ``sins`` did actually occur | And you can verify that fusing the two ``sin`` did actually occur | ||||||
| because the two ``sin`` operations occur within a single Triton kernel | because the two ``sin`` operations occur within a single Triton kernel | ||||||
| and the temporary variables are held in registers with very fast access. | and the temporary variables are held in registers with very fast access. | ||||||
|  |  | ||||||
| @ -69,13 +68,12 @@ hub. | |||||||
| .. code-block:: python | .. code-block:: python | ||||||
|  |  | ||||||
|    import torch |    import torch | ||||||
|    import torch._dynamo as dynamo |  | ||||||
|    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True) |    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True) | ||||||
|    opt_model = dynamo.optimize("inductor")(model) |    opt_model = torch.compile(model, backend="inductor") | ||||||
|    model(torch.randn(1,3,64,64)) |    model(torch.randn(1,3,64,64)) | ||||||
|  |  | ||||||
| And that is not the only available backend, you can run in a REPL | And that is not the only available backend; you can run in a REPL | ||||||
| ``dynamo.list_backends()`` to see all the available backends. Try out the | ``torch._dynamo.list_backends()`` to see all the available backends. Try out | ||||||
| ``cudagraphs`` or ``nvfuser`` next as inspiration. | ``cudagraphs`` or ``nvfuser`` next for inspiration. | ||||||
|  |  | ||||||
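|  | For a quick look at what is registered (a sketch; the exact list varies with | ||||||
|  | your build and installed extras): | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |    import torch._dynamo | ||||||
|  |  | ||||||
|  |    # Prints the names of the registered compiler backends, e.g. | ||||||
|  |    # ['aot_eager', 'cudagraphs', 'inductor', ...] | ||||||
|  |    print(torch._dynamo.list_backends()) | ||||||
|  |  | ||||||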
| Let’s do something a bit more interesting now, our community frequently | Let’s do something a bit more interesting now; our community frequently | ||||||
| @ -92,11 +90,10 @@ HuggingFace hub and optimize it: | |||||||
|  |  | ||||||
|    import torch |    import torch | ||||||
|    from transformers import BertTokenizer, BertModel |    from transformers import BertTokenizer, BertModel | ||||||
|    import torch._dynamo as dynamo |  | ||||||
|    # Copy pasted from here https://huggingface.co/bert-base-uncased |    # Copy pasted from here https://huggingface.co/bert-base-uncased | ||||||
|    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') | ||||||
|    model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0") |    model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0") | ||||||
|    model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed |    model = torch.compile(model, backend="inductor") # This is the only line of code that we changed | ||||||
|    text = "Replace me by any text you'd like." |    text = "Replace me by any text you'd like." | ||||||
|    encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0") |    encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0") | ||||||
|    output = model(**encoded_input) |    output = model(**encoded_input) | ||||||
| @ -116,7 +113,7 @@ Similarly let’s try out a TIMM example | |||||||
|    import torch._dynamo as dynamo |  | ||||||
|    import torch |    import torch | ||||||
|    model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2) |    model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2) | ||||||
|    opt_model = dynamo.optimize("inductor")(model) |    opt_model = torch.compile(model, backend="inductor") | ||||||
|    opt_model(torch.randn(64,3,7,7)) |    opt_model(torch.randn(64,3,7,7)) | ||||||
|  |  | ||||||
| Our goal with Dynamo and inductor is to build the highest coverage ML compiler | Our goal with Dynamo and inductor is to build the highest coverage ML compiler | ||||||
| @ -132,16 +129,16 @@ or ``torch._dynamo.list_backends()`` each of which with its optional dependencie | |||||||
| Some of the most commonly used backends include: | Some of the most commonly used backends include: | ||||||
|  |  | ||||||
| **Training & inference backends**: | **Training & inference backends**: | ||||||
|   * ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__ |   * ``torch.compile(m, backend="inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__ | ||||||
|   * ``dynamo.optimize("aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__ |   * ``torch.compile(m, backend="aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__ | ||||||
|   * ``dynamo.optimize("nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__ |   * ``torch.compile(m, backend=""nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__ | ||||||
|   * ``dynamo.optimize("cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__ |   * ``torch.compile(m, backend="cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__ | ||||||
|  |  | ||||||
| **Inference-only backends**: | **Inference-only backends**: | ||||||
|   * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ |   * ``torch.compile(m, backend="onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ | ||||||
|   * ``dynamo.optimize("tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__ |   * ``torch.compile(m, backend="tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__ | ||||||
|   * ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__ |   * ``torch.compile(m, backend="ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__ | ||||||
|   * ``dynamo.optimize("tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__ |   * ``torch.compile(m, backend="tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__ | ||||||
|  |  | ||||||
| Why do you need another way of optimizing PyTorch code? | Why do you need another way of optimizing PyTorch code? | ||||||
| ------------------------------------------------------- | ------------------------------------------------------- | ||||||
|  | |||||||
| @ -15,7 +15,7 @@ Where a complete example looks like this: | |||||||
|  |  | ||||||
|    from typing import List |    from typing import List | ||||||
|    import torch |    import torch | ||||||
|    import torchdynamo |    from torch import _dynamo as torchdynamo | ||||||
|    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): |    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): | ||||||
|        print("my_compiler() called with FX graph:") |        print("my_compiler() called with FX graph:") | ||||||
|        gm.graph.print_tabular() |        gm.graph.print_tabular() | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ worlds — usability and performance. | |||||||
|  |  | ||||||
| TorchDynamo makes it easy to experiment with different compiler | TorchDynamo makes it easy to experiment with different compiler | ||||||
| backends to make PyTorch code faster with a single line decorator | backends to make PyTorch code faster with a single line decorator | ||||||
| ``torch._dynamo.optimize()`` | ``torch._dynamo.optimize()``, which is wrapped for convenience by ``torch.compile()``. | ||||||
|  |  | ||||||
| .. image:: ../_static/img/dynamo/TorchDynamo.png | .. image:: ../_static/img/dynamo/TorchDynamo.png | ||||||
|  |  | ||||||
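|  | As a minimal sketch, the two spellings below are interchangeable for most | ||||||
|  | purposes: | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |    import torch | ||||||
|  |    import torch._dynamo | ||||||
|  |  | ||||||
|  |    def fn(x): | ||||||
|  |        return torch.sin(x) + torch.cos(x) | ||||||
|  |  | ||||||
|  |    # The lower-level entry point... | ||||||
|  |    opt_fn1 = torch._dynamo.optimize("inductor")(fn) | ||||||
|  |    # ...and the convenience wrapper around it. | ||||||
|  |    opt_fn2 = torch.compile(fn, backend="inductor") | ||||||
|  |  | ||||||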
|  | |||||||
| @ -27,7 +27,7 @@ TorchDynamo dependencies (for CUDA 11.7): | |||||||
|  |  | ||||||
| .. code-block:: shell | .. code-block:: shell | ||||||
|  |  | ||||||
|    pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117 |    pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117 | ||||||
|  |  | ||||||
| CPU requirements | CPU requirements | ||||||
| ~~~~~~~~~~~~~~~~ | ~~~~~~~~~~~~~~~~ | ||||||
| @ -41,16 +41,6 @@ To install, run the following command: | |||||||
|    pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu |    pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu | ||||||
|  |  | ||||||
|  |  | ||||||
| Install from Local Source |  | ||||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
|  |  | ||||||
| Alternatively, you can build PyTorch from `source |  | ||||||
| <https://github.com/pytorch/pytorch#from-source>`__, which has TorchDynamo |  | ||||||
| included. |  | ||||||
|  |  | ||||||
| To install GPU TorchDynamo dependencies, run ``make triton`` in the |  | ||||||
| PyTorch repo root directory. |  | ||||||
|  |  | ||||||
| Verify Installation | Verify Installation | ||||||
| ~~~~~~~~~~~~~~~~~~~ | ~~~~~~~~~~~~~~~~~~~ | ||||||
|  |  | ||||||
|  | |||||||
| @ -37,8 +37,13 @@ Only the latter is supported with function transforms: | |||||||
|   (by calling ``ctx.save_for_backward(*tensors)``), or save non-Tensors |   (by calling ``ctx.save_for_backward(*tensors)``), or save non-Tensors | ||||||
|   (by assigning them to the ``ctx`` object). |   (by assigning them to the ``ctx`` object). | ||||||
|  |  | ||||||
| Any intermediates that need to be saved must be returned as an output from | Because :meth:`~Function.setup_context` accepts only ``inputs`` and ``output``, | ||||||
| :meth:`~Function.forward`. | the only quantities that can be saved are either objects (such as Tensors) in | ||||||
|  | the inputs or outputs, or quantities (like ``Tensor.shape``) derived from them. | ||||||
|  | If you wish to save a non-input intermediate activation from | ||||||
|  | :meth:`Function.forward` for backward, then you'll need to return it as an | ||||||
|  | output from :meth:`~Function.forward` so that it gets passed to | ||||||
|  | :meth:`~Function.setup_context`. | ||||||
|  |  | ||||||
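|  | As a minimal sketch of that pattern (``MyCube`` is an illustrative name, not a | ||||||
|  | library API), the intermediate ``dx`` is returned from ``forward`` purely so | ||||||
|  | that ``setup_context`` can save it: | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |    import torch | ||||||
|  |  | ||||||
|  |    class MyCube(torch.autograd.Function): | ||||||
|  |        @staticmethod | ||||||
|  |        def forward(x): | ||||||
|  |            # dx is an intermediate needed in backward; since setup_context | ||||||
|  |            # only sees inputs and output, we return it as an extra output. | ||||||
|  |            dx = 3 * x ** 2 | ||||||
|  |            return x ** 3, dx | ||||||
|  |  | ||||||
|  |        @staticmethod | ||||||
|  |        def setup_context(ctx, inputs, output): | ||||||
|  |            x, = inputs | ||||||
|  |            _, dx = output | ||||||
|  |            ctx.save_for_backward(x, dx) | ||||||
|  |  | ||||||
|  |        @staticmethod | ||||||
|  |        def backward(ctx, grad_output, grad_dx): | ||||||
|  |            x, dx = ctx.saved_tensors | ||||||
|  |            return grad_output * dx + grad_dx * 6 * x | ||||||
|  |  | ||||||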
| Depending on the transform, | Depending on the transform, | ||||||
|  |  | ||||||
|  | |||||||
| @ -129,6 +129,49 @@ Algorithms | |||||||
|     Rprop |     Rprop | ||||||
|     SGD |     SGD | ||||||
|  |  | ||||||
|  | Many of our algorithms have various implementations optimized for performance, | ||||||
|  | readability and/or generality, so we attempt to default to the generally fastest | ||||||
|  | implementation for the current device if no particular implementation has been | ||||||
|  | specified by the user. | ||||||
|  |  | ||||||
|  | We have 3 major categories of implementations: for-loop, foreach (multi-tensor), and | ||||||
|  | fused. The most straightforward implementations are for-loops over the parameters with | ||||||
|  | big chunks of computation. For-looping is usually slower than our foreach | ||||||
|  | implementations, which combine parameters into a multi-tensor and run the big chunks | ||||||
|  | of computation all at once, thereby saving many sequential kernel calls. A few of our | ||||||
|  | optimizers have even faster fused implementations, which fuse the big chunks of | ||||||
|  | computation into one kernel. We can think of foreach implementations as fusing | ||||||
|  | horizontally and fused implementations as fusing vertically on top of that. | ||||||
|  |  | ||||||
|  | In general, the performance ordering of the 3 implementations is fused > foreach > for-loop. | ||||||
|  | So when applicable, we default to foreach over for-loop. Applicable means the foreach | ||||||
|  | implementation is available, the user has not specified any implementation-specific kwargs | ||||||
|  | (e.g., fused, foreach, differentiable), and all tensors are native and on CUDA. Note that | ||||||
|  | while fused should be even faster than foreach, the implementations are newer and we would | ||||||
|  | like to give them more bake-in time before flipping the switch everywhere. You are welcome | ||||||
|  | to try them out though! | ||||||
|  |  | ||||||
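|  | A small sketch of opting in or out explicitly (keyword support varies per | ||||||
|  | optimizer; see the table below): | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |    import torch | ||||||
|  |  | ||||||
|  |    params = [torch.randn(2, 2, device="cuda", requires_grad=True)] | ||||||
|  |  | ||||||
|  |    # Default: a foreach implementation is picked automatically when applicable. | ||||||
|  |    opt_default = torch.optim.Adam(params, lr=1e-3) | ||||||
|  |    # Explicitly request the fused kernel (Adam and AdamW support it). | ||||||
|  |    opt_fused = torch.optim.Adam(params, lr=1e-3, fused=True) | ||||||
|  |    # Fall back to the single-tensor for-loop implementation. | ||||||
|  |    opt_forloop = torch.optim.Adam(params, lr=1e-3, foreach=False) | ||||||
|  |  | ||||||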
|  | Below is a table showing the available and default implementations of each algorithm: | ||||||
|  |  | ||||||
|  | .. csv-table:: | ||||||
|  |     :header: "Algorithm", "Default", "Has foreach?", "Has fused?" | ||||||
|  |     :widths: 25, 25, 25, 25 | ||||||
|  |     :delim: ; | ||||||
|  |  | ||||||
|  |     :class:`Adadelta`;foreach;yes;no | ||||||
|  |     :class:`Adagrad`;foreach;yes;no | ||||||
|  |     :class:`Adam`;foreach;yes;yes | ||||||
|  |     :class:`AdamW`;foreach;yes;yes | ||||||
|  |     :class:`SparseAdam`;for-loop;no;no | ||||||
|  |     :class:`Adamax`;foreach;yes;no | ||||||
|  |     :class:`ASGD`;foreach;yes;no | ||||||
|  |     :class:`LBFGS`;for-loop;no;no | ||||||
|  |     :class:`NAdam`;foreach;yes;no | ||||||
|  |     :class:`RAdam`;foreach;yes;no | ||||||
|  |     :class:`RMSprop`;foreach;yes;no | ||||||
|  |     :class:`Rprop`;foreach;yes;no | ||||||
|  |     :class:`SGD`;foreach;yes;no | ||||||
|  |  | ||||||
| How to adjust learning rate | How to adjust learning rate | ||||||
| --------------------------- | --------------------------- | ||||||
|  |  | ||||||
|  | |||||||
							
								
								
									
setup.py (7 lines changed)
							| @ -1024,17 +1024,12 @@ def main(): | |||||||
|         'typing-extensions', |         'typing-extensions', | ||||||
|         'sympy', |         'sympy', | ||||||
|         'networkx', |         'networkx', | ||||||
|  |         'jinja2', | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|     extras_require = { |     extras_require = { | ||||||
|         'opt-einsum': ['opt-einsum>=3.3'] |         'opt-einsum': ['opt-einsum>=3.3'] | ||||||
|     } |     } | ||||||
|     if platform.system() == 'Linux': |  | ||||||
|         triton_pin_file = os.path.join(cwd, ".github", "ci_commit_pins", "triton.txt") |  | ||||||
|         if os.path.exists(triton_pin_file): |  | ||||||
|             with open(triton_pin_file) as f: |  | ||||||
|                 triton_pin = f.read().strip() |  | ||||||
|                 extras_require['dynamo'] = ['pytorch-triton==2.0.0+' + triton_pin[:10], 'jinja2'] |  | ||||||
|  |  | ||||||
|     # Parse the command line and check the arguments before we proceed with |     # Parse the command line and check the arguments before we proceed with | ||||||
|     # building deps and setup. We need to set values so `--help` works. |     # building deps and setup. We need to set values so `--help` works. | ||||||
|  | |||||||
| @ -504,7 +504,7 @@ class TestFSDPUseOrigParamsUnshardReshard(FSDPTest): | |||||||
|             fsdp_kwargs=fsdp_kwargs, |             fsdp_kwargs=fsdp_kwargs, | ||||||
|             deterministic=True, |             deterministic=True, | ||||||
|         ) |         ) | ||||||
|         optim = torch.optim.Adam(fsdp_model.parameters(), lr=LR) |         optim = torch.optim.Adam(fsdp_model.parameters(), foreach=False, lr=LR) | ||||||
|         fsdp_kwargs["use_orig_params"] = True |         fsdp_kwargs["use_orig_params"] = True | ||||||
|         fsdp_model_orig_params = TransformerWithSharedParams.init( |         fsdp_model_orig_params = TransformerWithSharedParams.init( | ||||||
|             self.process_group, |             self.process_group, | ||||||
| @ -513,7 +513,9 @@ class TestFSDPUseOrigParamsUnshardReshard(FSDPTest): | |||||||
|             fsdp_kwargs=fsdp_kwargs, |             fsdp_kwargs=fsdp_kwargs, | ||||||
|             deterministic=True, |             deterministic=True, | ||||||
|         ) |         ) | ||||||
|         optim_orig_params = torch.optim.Adam(fsdp_model_orig_params.parameters(), lr=LR) |         optim_orig_params = torch.optim.Adam( | ||||||
|  |             fsdp_model_orig_params.parameters(), foreach=False, lr=LR | ||||||
|  |         ) | ||||||
|         return fsdp_model, optim, fsdp_model_orig_params, optim_orig_params |         return fsdp_model, optim, fsdp_model_orig_params, optim_orig_params | ||||||
|  |  | ||||||
|     def _check_fsdp_parameter_parity(self, fsdp1: FSDP, fsdp2: FSDP) -> None: |     def _check_fsdp_parameter_parity(self, fsdp1: FSDP, fsdp2: FSDP) -> None: | ||||||
|  | |||||||
| @ -1444,6 +1444,59 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase): | |||||||
|             PythonProcessGroupExtensionTest.create_dummy |             PythonProcessGroupExtensionTest.create_dummy | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     def test_backend_config(self): | ||||||
|  |         dist.Backend.register_backend( | ||||||
|  |             "dummy", | ||||||
|  |             PythonProcessGroupExtensionTest.create_dummy | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         # Ensure backend config can be created with the following arguments | ||||||
|  |         backend_config_strings_and_expected_values = [ | ||||||
|  |             (dist.Backend.GLOO, "cpu:gloo,cuda:gloo"), | ||||||
|  |             (dist.Backend.NCCL, "cpu:nccl,cuda:nccl"), | ||||||
|  |             (dist.Backend.MPI, "cpu:mpi,cuda:mpi"), | ||||||
|  |             (dist.Backend.UCC, "cpu:ucc,cuda:ucc"), | ||||||
|  |             (dist.Backend.DUMMY, "cpu:dummy,cuda:dummy"), | ||||||
|  |             ("DUMMY", "cpu:dummy,cuda:dummy"), | ||||||
|  |             ("dummy", "cpu:dummy,cuda:dummy"), | ||||||
|  |             ("cpu:dummy,cuda:dummy", "cpu:dummy,cuda:dummy"), | ||||||
|  |             ("cpu:dummy,cuda:nccl", "cpu:dummy,cuda:nccl"), | ||||||
|  |             ("cpu:gloo,cuda:dummy", "cpu:gloo,cuda:dummy"), | ||||||
|  |             ("cpu:gloo,cuda:nccl", "cpu:gloo,cuda:nccl"), | ||||||
|  |             ("cPu:gLoO,cuDa:NcCl", "cpu:gloo,cuda:nccl") | ||||||
|  |         ] | ||||||
|  |  | ||||||
|  |         for config_str, expected_value in backend_config_strings_and_expected_values: | ||||||
|  |             with self.subTest(config_str): | ||||||
|  |                 # ensures these configs strings are valid and no ValueError is raised | ||||||
|  |                 config = dist.BackendConfig(config_str) | ||||||
|  |                 self.assertEqual(str(config), expected_value) | ||||||
|  |  | ||||||
|  |         # Ensure backend config will raise ValueError with the following arguments | ||||||
|  |         invalid_backend_config_strings = [ | ||||||
|  |             "cpu:gloo,cuda:nccl,",  # trailing comma | ||||||
|  |             "cpu:gloo,cuda:nccl,cpu:dummy",  # duplicate device | ||||||
|  |         ] | ||||||
|  |         for config_str in invalid_backend_config_strings: | ||||||
|  |             with self.subTest(config_str): | ||||||
|  |                 with self.assertRaises(ValueError): | ||||||
|  |                     dist.BackendConfig(config_str) | ||||||
|  |  | ||||||
|  |     def test_init_process_group_with_multiple_backends(self): | ||||||
|  |         dist.Backend.register_backend("dummy", PythonProcessGroupExtensionTest.create_dummy) | ||||||
|  |  | ||||||
|  |         os.environ['MASTER_ADDR'] = 'localhost' | ||||||
|  |         os.environ['MASTER_PORT'] = '6789' | ||||||
|  |         dist.init_process_group("cpu:dummy,cuda:dummy", rank=self.rank, world_size=self.world_size) | ||||||
|  |  | ||||||
|  |         # test all_gather | ||||||
|  |         input_tensor = torch.ones(2, 2) * 7 | ||||||
|  |         output_tensor_list = [torch.zeros(2, 2) for _ in range(self.world_size)] | ||||||
|  |         dist.all_gather(output_tensor_list, input_tensor) | ||||||
|  |  | ||||||
|  |         dist.barrier() | ||||||
|  |         dist.destroy_process_group() | ||||||
|  |  | ||||||
|     class Options: |     class Options: | ||||||
|         def __init__(self): |         def __init__(self): | ||||||
|             pass |             pass | ||||||
| @ -1570,6 +1623,11 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase): | |||||||
|                     world_size=self.world_size, |                     world_size=self.world_size, | ||||||
|                     store=store |                     store=store | ||||||
|                 ) |                 ) | ||||||
|  |                 pg = c10d._get_default_group() | ||||||
|  |                 self.assertEqual(pg.rank(), self.rank) | ||||||
|  |                 self.assertEqual(pg.size(), self.world_size) | ||||||
|  |                 self.assertEqual(pg.name(), str(backend)) | ||||||
|  |  | ||||||
|                 dist.destroy_process_group() |                 dist.destroy_process_group() | ||||||
|  |  | ||||||
|     def _call_collective_with_varying_tensors(self, backend, collective, *args): |     def _call_collective_with_varying_tensors(self, backend, collective, *args): | ||||||
|  | |||||||
| @ -335,7 +335,7 @@ class MyPythonStore(dist.Store): | |||||||
|         self.store = {} |         self.store = {} | ||||||
|  |  | ||||||
|     def set(self, key, value): |     def set(self, key, value): | ||||||
|         if not isinstance(key, str): |         if not isinstance(key, (str, bytes)): | ||||||
|             raise AssertionError("Expected set to be called with string key") |             raise AssertionError("Expected set to be called with string key") | ||||||
|         if type(value) is not bytes: |         if type(value) is not bytes: | ||||||
|             raise AssertionError("Expected set to be called with bytes value") |             raise AssertionError("Expected set to be called with bytes value") | ||||||
|  | |||||||
| @ -51,11 +51,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase): | |||||||
|         y = torch.randn(4) |         y = torch.randn(4) | ||||||
|         x = torch.nn.Parameter(torch.randn(4)) |         x = torch.nn.Parameter(torch.randn(4)) | ||||||
|         aot_fn = torch._dynamo.optimize("aot_eager")(fn) |         aot_fn = torch._dynamo.optimize("aot_eager")(fn) | ||||||
|         with self.assertRaisesRegex( |         # This should not error: we mutated an autograd leaf under no_grad mode. | ||||||
|             RuntimeError, |         aot_fn(x, y) | ||||||
|             "a leaf Variable that requires grad is being used in an in-place operation.", |  | ||||||
|         ): |  | ||||||
|             aot_fn(x, y) |  | ||||||
|  |  | ||||||
|     def test_mutation1(self): |     def test_mutation1(self): | ||||||
|         def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor): |         def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor): | ||||||
| @ -179,11 +176,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase): | |||||||
|  |  | ||||||
|         # Run exported graph with AOT |         # Run exported graph with AOT | ||||||
|         aot_fn = torch._dynamo.optimize("aot_eager")(graph) |         aot_fn = torch._dynamo.optimize("aot_eager")(graph) | ||||||
|         with self.assertRaisesRegex( |         # This should not error: we mutated an autograd leaf under no_grad mode. | ||||||
|             RuntimeError, |         aot_fn(x, y) | ||||||
|             "a leaf Variable that requires grad is being used in an in-place operation.", |  | ||||||
|         ): |  | ||||||
|             aot_fn(x, y) |  | ||||||
|  |  | ||||||
|     def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self): |     def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self): | ||||||
|         class ModuleSpecialFwd(torch.nn.Module): |         class ModuleSpecialFwd(torch.nn.Module): | ||||||
|  | |||||||
| @ -60,6 +60,11 @@ unittest.expectedFailure( | |||||||
|     # Cannot call sizes() on tensor with symbolic sizes/strides |     # Cannot call sizes() on tensor with symbolic sizes/strides | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | unittest.expectedFailure( | ||||||
|  |     DynamicShapesMiscTests.test_parsing_sdpa_dynamic_shapes | ||||||
|  |     # Cannot call sizes() on tensor with symbolic sizes/strides | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| # DynamicShapesSubGraphTests | # DynamicShapesSubGraphTests | ||||||
| unittest.expectedFailure( | unittest.expectedFailure( | ||||||
|  | |||||||
| @ -2294,7 +2294,6 @@ class MiscTests(torch._dynamo.test_case.TestCase): | |||||||
|         self.assertIs(x_ref(), None) |         self.assertIs(x_ref(), None) | ||||||
|  |  | ||||||
|     def test_release_module_memory(self): |     def test_release_module_memory(self): | ||||||
|  |  | ||||||
|         mod = torch.nn.Linear(10, 10) |         mod = torch.nn.Linear(10, 10) | ||||||
|         x = torch.rand([10, 10]) |         x = torch.rand([10, 10]) | ||||||
|         mod_weight_ref = weakref.ref(mod.weight) |         mod_weight_ref = weakref.ref(mod.weight) | ||||||
| @ -2640,7 +2639,6 @@ class MiscTests(torch._dynamo.test_case.TestCase): | |||||||
|                 self.names = [] |                 self.names = [] | ||||||
|  |  | ||||||
|             def forward(self, idx, targets=None): |             def forward(self, idx, targets=None): | ||||||
|  |  | ||||||
|                 b, t = idx.size() |                 b, t = idx.size() | ||||||
|                 assert ( |                 assert ( | ||||||
|                     t <= self.block_size |                     t <= self.block_size | ||||||
| @ -3145,6 +3143,53 @@ class MiscTests(torch._dynamo.test_case.TestCase): | |||||||
|         self.assertEqual(compiled.device.index, 0) |         self.assertEqual(compiled.device.index, 0) | ||||||
|         self.assertEqual(compiled.dtype, torch.float16) |         self.assertEqual(compiled.dtype, torch.float16) | ||||||
|  |  | ||||||
|  |     @unittest.skipIf( | ||||||
|  |         not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, | ||||||
|  |         "Can't run fused SDPA on this platform", | ||||||
|  |     ) | ||||||
|  |     def test_parsing_sdpa(self): | ||||||
|  |         class MyModule(torch.nn.Module): | ||||||
|  |             def forward(self, query, key, value): | ||||||
|  |                 out = F.scaled_dot_product_attention(query, key, value, None, 0, True) | ||||||
|  |                 out = F.scaled_dot_product_attention( | ||||||
|  |                     query=query, | ||||||
|  |                     key=key, | ||||||
|  |                     value=value, | ||||||
|  |                     attn_mask=None, | ||||||
|  |                     dropout_p=0, | ||||||
|  |                     is_causal=True, | ||||||
|  |                 ) | ||||||
|  |                 out = F.scaled_dot_product_attention( | ||||||
|  |                     query, | ||||||
|  |                     key=key, | ||||||
|  |                     value=value, | ||||||
|  |                     attn_mask=None, | ||||||
|  |                     dropout_p=0, | ||||||
|  |                     is_causal=True, | ||||||
|  |                 ) | ||||||
|  |                 out = F.scaled_dot_product_attention( | ||||||
|  |                     query, key, value, None, dropout_p=0, is_causal=True | ||||||
|  |                 ) | ||||||
|  |                 return out | ||||||
|  |  | ||||||
|  |         device = "cuda" | ||||||
|  |         dtype = torch.float16 | ||||||
|  |         seq_len_q = 1 | ||||||
|  |         seq_len_k = 1 | ||||||
|  |         head_dim = 8 | ||||||
|  |         query = torch.ones( | ||||||
|  |             1, 8, seq_len_q, head_dim, device=device, dtype=dtype, requires_grad=True | ||||||
|  |         ) | ||||||
|  |         key = torch.ones( | ||||||
|  |             1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True | ||||||
|  |         ) | ||||||
|  |         value = torch.ones( | ||||||
|  |             1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True | ||||||
|  |         ) | ||||||
|  |         module = MyModule() | ||||||
|  |         opt_mod = torch._dynamo.optimize("inductor")(module) | ||||||
|  |         opt_mod(query, key, value) | ||||||
|  |  | ||||||
|     def test_autocast_cpu(self): |     def test_autocast_cpu(self): | ||||||
|         class MyModule(torch.nn.Module): |         class MyModule(torch.nn.Module): | ||||||
|             def forward(self, x): |             def forward(self, x): | ||||||
| @ -3716,7 +3761,6 @@ class MiscTests(torch._dynamo.test_case.TestCase): | |||||||
|         self.assertTrue(same(ref, res)) |         self.assertTrue(same(ref, res)) | ||||||
|  |  | ||||||
|     def test_disable_flag(self): |     def test_disable_flag(self): | ||||||
|  |  | ||||||
|         cnt = torch._dynamo.testing.CompileCounter() |         cnt = torch._dynamo.testing.CompileCounter() | ||||||
|  |  | ||||||
|         with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}): |         with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}): | ||||||
| @ -3999,6 +4043,23 @@ class MiscTests(torch._dynamo.test_case.TestCase): | |||||||
|         res = opt_fn(x, y) |         res = opt_fn(x, y) | ||||||
|         self.assertTrue(same(ref, res)) |         self.assertTrue(same(ref, res)) | ||||||
|  |  | ||||||
|  |     def test_tuple_from_tuple_iter(self): | ||||||
|  |         def inner_fn(*args): | ||||||
|  |             acc = torch.ones(10, 10) | ||||||
|  |             for arg in args: | ||||||
|  |                 acc.add_(arg) | ||||||
|  |  | ||||||
|  |             return acc | ||||||
|  |  | ||||||
|  |         @torch._dynamo.optimize("eager") | ||||||
|  |         def fn(inputs, params): | ||||||
|  |             y = tuple(inputs) + tuple(params) | ||||||
|  |             return inner_fn(*y) | ||||||
|  |  | ||||||
|  |         inputs = [torch.randn(10, 10) for _ in range(3)] | ||||||
|  |  | ||||||
|  |         fn(inputs, iter(tuple(inputs))) | ||||||
|  |  | ||||||
|     def test_torch_package_working_with_trace(self): |     def test_torch_package_working_with_trace(self): | ||||||
|         # from torch._dynamo.test_case import run_tests |         # from torch._dynamo.test_case import run_tests | ||||||
|  |  | ||||||
|  | |||||||
| @ -295,6 +295,31 @@ class ModuleList(torch.nn.Module): | |||||||
|         return x |         return x | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CustomGetItemModuleList(torch.nn.Module): | ||||||
|  |     def __init__(self): | ||||||
|  |         super().__init__() | ||||||
|  |         self.layers = torch.nn.ModuleList( | ||||||
|  |             [ | ||||||
|  |                 torch.nn.Linear(10, 10), | ||||||
|  |                 torch.nn.ReLU(), | ||||||
|  |                 torch.nn.Linear(10, 10), | ||||||
|  |                 torch.nn.ReLU(), | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def __getitem__(self, idx: int): | ||||||
|  |         return self.layers[idx] | ||||||
|  |  | ||||||
|  |     def __len__(self) -> int: | ||||||
|  |         return len(self.layers) | ||||||
|  |  | ||||||
|  |     def forward(self, x): | ||||||
|  |         for i in range(len(self)): | ||||||
|  |             x = self[i](x) | ||||||
|  |  | ||||||
|  |         return x | ||||||
|  |  | ||||||
|  |  | ||||||
| class ModuleDict(torch.nn.Module): | class ModuleDict(torch.nn.Module): | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|         super().__init__() |         super().__init__() | ||||||
| @ -310,6 +335,23 @@ class ModuleDict(torch.nn.Module): | |||||||
|         return x |         return x | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CustomGetItemModuleDict(torch.nn.Module): | ||||||
|  |     def __init__(self): | ||||||
|  |         super().__init__() | ||||||
|  |         self.layers = torch.nn.ModuleDict( | ||||||
|  |             { | ||||||
|  |                 "0": torch.nn.Linear(10, 10), | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def __getitem__(self, key: str) -> torch.nn.Module: | ||||||
|  |         return self.layers[key] | ||||||
|  |  | ||||||
|  |     def forward(self, x): | ||||||
|  |         x = self["0"](x) | ||||||
|  |         return x | ||||||
|  |  | ||||||
|  |  | ||||||
| class TensorList(torch.nn.Module): | class TensorList(torch.nn.Module): | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|         super().__init__() |         super().__init__() | ||||||
| @ -728,7 +770,9 @@ class NNModuleTests(torch._dynamo.test_case.TestCase): | |||||||
|     test_cfgmod = make_test(CfgModule()) |     test_cfgmod = make_test(CfgModule()) | ||||||
|     test_stringmember = make_test(StringMember()) |     test_stringmember = make_test(StringMember()) | ||||||
|     test_modulelist = make_test(ModuleList()) |     test_modulelist = make_test(ModuleList()) | ||||||
|  |     test_custom_getitem_modulelist = make_test(CustomGetItemModuleList()) | ||||||
|     test_moduledict = make_test(ModuleDict()) |     test_moduledict = make_test(ModuleDict()) | ||||||
|  |     test_custom_getitem_moduledict = make_test(CustomGetItemModuleDict()) | ||||||
|     test_super1 = make_test(SuperModule()) |     test_super1 = make_test(SuperModule()) | ||||||
|     test_super2 = make_test(SuperModule2()) |     test_super2 = make_test(SuperModule2()) | ||||||
|     test_super_class_method = make_test(SuperChildCallsClassMethod()) |     test_super_class_method = make_test(SuperChildCallsClassMethod()) | ||||||
|  | |||||||
| @ -1,5 +1,6 @@ | |||||||
| # Owner(s): ["module: dynamo"] | # Owner(s): ["module: dynamo"] | ||||||
| import collections | import collections | ||||||
|  | import contextlib | ||||||
| import copy | import copy | ||||||
| import inspect | import inspect | ||||||
| import itertools | import itertools | ||||||
| @ -2173,6 +2174,81 @@ class ReproTests(torch._dynamo.test_case.TestCase): | |||||||
|         self.assertEqual(cnt.frame_count, 2) |         self.assertEqual(cnt.frame_count, 2) | ||||||
|         self.assertEqual(cnt.op_count, 2) |         self.assertEqual(cnt.op_count, 2) | ||||||
|  |  | ||||||
|  |     def test_exception_in_dynamo_handling(self): | ||||||
|  |         hit_handler = False | ||||||
|  |  | ||||||
|  |         # See https://github.com/pytorch/pytorch/pull/96488 | ||||||
|  |         @contextlib.contextmanager | ||||||
|  |         def ctx(): | ||||||
|  |             try: | ||||||
|  |                 yield | ||||||
|  |             except RuntimeError: | ||||||
|  |                 nonlocal hit_handler | ||||||
|  |                 hit_handler = True | ||||||
|  |  | ||||||
|  |         @torch._dynamo.optimize("eager") | ||||||
|  |         def f(): | ||||||
|  |             with ctx(): | ||||||
|  |                 h() | ||||||
|  |  | ||||||
|  |         def h(): | ||||||
|  |             raise RuntimeError("boof") | ||||||
|  |  | ||||||
|  |         # Should not error | ||||||
|  |         f() | ||||||
|  |         self.assertTrue(hit_handler) | ||||||
|  |  | ||||||
|  |     def test_generator_dealloc(self): | ||||||
|  |         # See https://github.com/pytorch/pytorch/pull/96488 | ||||||
|  |         # | ||||||
|  |         # NB: yes, [(...)] is intentional, this is a list containing a | ||||||
|  |         # generator | ||||||
|  |         generator_box = [(x for x in [1, 2, 3])] | ||||||
|  |  | ||||||
|  |         counter = torch._dynamo.testing.CompileCounter() | ||||||
|  |  | ||||||
|  |         def g(x): | ||||||
|  |             return x + 2 | ||||||
|  |  | ||||||
|  |         # TODO: This test is pretty delicate.  To test if it's actually doing | ||||||
|  |         # anything, rebuild eval_frame.c with '#define TORCHDYNAMO_DEBUG 1' | ||||||
|  |         # and then look at the logs for: | ||||||
|  |         # | ||||||
|  |         # TRACE[_custom_eval_frame:650] begin <genexpr> test_repros.py 2276 -1 0 0 | ||||||
|  |         # TRACE[_custom_eval_frame:664] throw <genexpr> | ||||||
|  |         # | ||||||
|  |         # This means we're actually hitting the relevant codepath | ||||||
|  |  | ||||||
|  |         # NB: Make sure we don't actually Dynamo this frame; if we do Dynamo | ||||||
|  |         # this frame, Dynamo actually DOES understand list.clear and will | ||||||
|  |         # arrange for the generator deallocation to happen when the eval frame | ||||||
|  |         # handler is disabled, which will prevent the bug from happening (we | ||||||
|  |         # specifically want to trigger the generator deallocation WHILE the | ||||||
|  |         # dynamo eval frame handler is active), as that will cause the | ||||||
|  |         # generator to become exhausted and trigger the throw_flag == TRUE | ||||||
|  |         # case. | ||||||
|  |         @torch._dynamo.skip | ||||||
|  |         def f(x): | ||||||
|  |             generator_box.clear() | ||||||
|  |             return g(x) | ||||||
|  |  | ||||||
|  |         self.assertNoUnraisable( | ||||||
|  |             lambda: torch._dynamo.optimize(counter)(f)(torch.randn(3)) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         # Make sure the x + 2 is captured (a previous incorrect implementation | ||||||
|  |         # of this fix would have disabled the eval frame callback, which means | ||||||
|  |         # g wouldn't get traced) | ||||||
|  |         self.assertEqual(counter.op_count, 1) | ||||||
|  |  | ||||||
|  |     def test_error_return_without_exception_set(self): | ||||||
|  |         # https://github.com/pytorch/pytorch/issues/93781 | ||||||
|  |         @torch.compile | ||||||
|  |         def f(): | ||||||
|  |             _generator_type = type((_ for _ in ())) | ||||||
|  |  | ||||||
|  |         self.assertNoUnraisable(f) | ||||||
|  |  | ||||||
|     @skip_if_pytest |     @skip_if_pytest | ||||||
|     @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True) |     @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True) | ||||||
|     def test_rewrite_assert_with_msg(self): |     def test_rewrite_assert_with_msg(self): | ||||||
|  | |||||||
| @ -377,8 +377,6 @@ aten::_mps_convolution | |||||||
| aten::_mps_convolution.out | aten::_mps_convolution.out | ||||||
| aten::_mps_convolution_transpose | aten::_mps_convolution_transpose | ||||||
| aten::_mps_convolution_transpose.out | aten::_mps_convolution_transpose.out | ||||||
| aten::_mps_max_pool2d |  | ||||||
| aten::_mps_max_pool2d.out |  | ||||||
| aten::_native_batch_norm_legit.no_stats_out | aten::_native_batch_norm_legit.no_stats_out | ||||||
| aten::_native_batch_norm_legit.out | aten::_native_batch_norm_legit.out | ||||||
| aten::_native_decoder_only_multi_head_attention | aten::_native_decoder_only_multi_head_attention | ||||||
| @ -857,6 +855,8 @@ aten::max | |||||||
| aten::max.dim | aten::max.dim | ||||||
| aten::max.dim_max | aten::max.dim_max | ||||||
| aten::max.unary_out | aten::max.unary_out | ||||||
|  | aten::max_pool2d_backward | ||||||
|  | aten::max_pool2d_backward.out | ||||||
| aten::max_pool2d_with_indices | aten::max_pool2d_with_indices | ||||||
| aten::max_pool2d_with_indices.out | aten::max_pool2d_with_indices.out | ||||||
| aten::max_pool2d_with_indices_backward | aten::max_pool2d_with_indices_backward | ||||||
| @ -930,8 +930,6 @@ aten::mps_convolution_backward | |||||||
| aten::mps_convolution_backward.out | aten::mps_convolution_backward.out | ||||||
| aten::mps_convolution_transpose_backward | aten::mps_convolution_transpose_backward | ||||||
| aten::mps_convolution_transpose_backward.out | aten::mps_convolution_transpose_backward.out | ||||||
| aten::mps_max_pool2d_backward |  | ||||||
| aten::mps_max_pool2d_backward.out |  | ||||||
| aten::multi_margin_loss | aten::multi_margin_loss | ||||||
| aten::multi_margin_loss.out | aten::multi_margin_loss.out | ||||||
| aten::multi_margin_loss_backward | aten::multi_margin_loss_backward | ||||||
|  | |||||||
Some files were not shown because too many files have changed in this diff.