Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-02 06:24:59 +08:00)

Compare commits: v2.1.0-rc3 ... v2.1.1-rc6 — 88 commits
Commits (SHA1):

4c55dc5035, f58669bc5f, 33106b706e, 4b4c012a60, 47ac50248a, dc96ecb8ac, 18a2ed1db1, b2e1277247,
b249946c40, ee79fc8a35, 084343ee12, 8a178f153e, 2353915d69, c1bc460377, 2dc37f4f70, f82d6e41a4,
c79d2936d0, ab5ea22c1d, 5274580eb0, cd5859373c, 7cc6081f87, af1590cdf4, 3f59221062, ab5b9192ce,
736ebd3313, 6ba919da27, cc54a5072e, 3788d86e3e, 1f0450eed2, 9570baa150, b3b274ddcb, 5bcfb1b9b4,
c496f9a40b, 39a66a66fe, ed87177528, c07240e5e4, bb96803a35, 3002bf71e6, e7892b2e02, 909fcf9b21,
0bc598a604, dd7fb44d20, 6026c29db0, 0f9ac00ac6, 209f2fa8ff, fa1db4310d, e6702486f6, e68aa76642,
88cde0c37c, e4c42a93bc, 7bcf7da3a2, 1841d54370, fca42334be, 539a971161, 9287a0cf59, c464075d5d,
1b4161c686, 28220534de, da9639c752, e534243ec2, 01fa8c140a, 5aae979614, ced78cc2a7, d8db5808ce,
889811ab5b, 1191449343, 6d9fad8474, ed62318bea, ee67c4dd6a, 5529b81631, 7e23b4907d, 71c9d5c3a6,
91e414957b, ce3ed7f293, bd372d460b, 12b8c26f35, 7397cf324c, fa8259db8d, d83c8287ea, ba19c52e31,
c5c9536aa7, 6b7a777661, ebd3224303, 6e4ae13657, 265e46e193, da7290dfbd, 828992cf13, 48246f3dfb
17 · .ci/docker/common/install_onnx.sh · Normal file → Executable file
@@ -4,6 +4,10 @@ set -ex

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

retry () {
    "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
}

# A bunch of custom pip dependencies for ONNX
pip_install \
  beartype==0.10.4 \
@@ -18,22 +22,17 @@ pip_install \
# onnx-weekly. Otherwise, onnx-weekly could be
# overwritten by onnx.
pip_install \
  onnxruntime==1.15.1 \
  parameterized==0.8.1 \
  pytest-cov==4.0.0 \
  pytest-subtests==0.10.0 \
  tabulate==0.9.0 \
  transformers==4.31.0

# Using 1.15dev branch for the following not yet released features and fixes.
# - Segfault fix for shape inference.
# - Inliner to workaround ORT segfault.
pip_install onnx-weekly==1.15.0.dev20230717
pip_install coloredlogs packaging
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001

# TODO: change this when onnx-script is on testPypi
# pip_install onnxscript-preview==0.1.0.dev20230809 --no-deps
# NOTE: temp change for CI to run on unpublished onnxscript PR.
pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@f69be19ebd3f2e0d7efe64b0c7be3329cbab3822" --no-deps
pip_install onnx==1.14.1
pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps

# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
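For context, the `retry` helper added above re-runs its entire argument list, sleeping 10, 20, then 40 seconds between attempts before giving up. A minimal standalone sketch of the same pattern (the `pip install onnx` target is illustrative, not the script's actual call):

```bash
#!/usr/bin/env bash
set -ex

# Re-run a command up to four times with growing sleeps between attempts.
retry () {
    "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
}

# Example: wrap a flaky network install, much as the script does for ort-nightly.
retry pip install --no-cache-dir --no-input onnx
```

Because `"$@"` expands to the original arguments with quoting preserved, the helper works for any command, which is why the script can simply prefix the `ort-nightly` install with `retry`.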
@@ -275,3 +275,8 @@ z3-solver==4.12.2.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:

tensorboard==2.13.0
#Description: Also included in .ci/docker/requirements-docs.txt
#Pinned versions:
#test that import: test_tensorboard
@@ -544,6 +544,10 @@ test_without_numpy() {
  python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())"
  # Regression test for https://github.com/pytorch/pytorch/issues/66353
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
  # Regression test for https://github.com/pytorch/pytorch/issues/109387
  if [[ "${TEST_CONFIG}" == *dynamo* ]]; then
    python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
  fi
  popd
}
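For context, these regression tests put a `fake_numpy` directory at the front of `sys.path` so that importing `numpy` fails inside the interpreter, verifying that torch still imports and builds tensors without NumPy. A rough sketch of the idea; the stub below is illustrative, not the repository's actual `fake_numpy` contents:

```bash
# A numpy module that refuses to import, shadowing the real package.
mkdir -p fake_numpy
echo 'raise ImportError("numpy disabled for this test")' > fake_numpy/numpy.py

# torch treats the failed import like a missing numpy and uses its numpy-free paths.
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
```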
@@ -35,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi

# TODO: Move both of them to Windows AMI
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0

# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver
@@ -90,7 +90,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
    if [[ "\${TORCH_CONDA_BUILD_FOLDER}" == "pytorch-nightly" ]]; then
      PYTORCH_CHANNEL="pytorch-nightly"
    fi
    retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "\${PYTORCH_CHANNEL}" "pytorch-cuda=\${cu_ver}"
    retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch-test "pytorch-cuda=\${cu_ver}"
  fi
  conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
@@ -98,9 +98,9 @@ elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
  if [[ "$(uname -m)" == aarch64 ]]; then
    # Using "extra-index-url" until all needed aarch64 dependencies are
    # added to "https://download.pytorch.org/whl/nightly/"
    pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
    pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
  else
    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
  fi
  retry pip install -q numpy protobuf typing-extensions
fi
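For context on the two flags whose URLs switch from `nightly` to `test` here: `--index-url` replaces the default package index outright, while `--extra-index-url` adds an index alongside PyPI, which the aarch64 branch needs while some of its dependencies still live only on PyPI. A short sketch (the channel path is an example):

```bash
# Resolve torch and all of its dependencies from the given index only.
pip install torch --index-url "https://download.pytorch.org/whl/test/cpu"

# Also consult PyPI, for dependencies the PyTorch index does not carry.
pip install torch --extra-index-url "https://download.pytorch.org/whl/test/cpu"
```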
@@ -77,7 +77,9 @@ else
  export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
fi

if [[ -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# The build with the with-pypi-cudnn suffix is only applicable to the
# pypi small wheel Linux x86 build
if [[ -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]] && [[ "$(uname)" == 'Linux' && "$(uname -m)" == "x86_64" ]]; then
  export PYTORCH_BUILD_VERSION="${PYTORCH_BUILD_VERSION}-with-pypi-cudnn"
fi
1 · .github/merge_rules.yaml · vendored
@@ -7,6 +7,7 @@
- docs/source/onnx.rst
- docs/source/onnx*
- docs/source/scripts/onnx/**
- docs/source/_static/img/onnx/**
- scripts/onnx/**
- test/onnx/**
- tools/onnx/**
@@ -25,3 +25,4 @@ sympy==1.11.1
pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
29 · .github/scripts/ensure_actions_will_cancel.py · vendored
@@ -1,6 +1,5 @@
#!/usr/bin/env python3

import argparse
import sys

from pathlib import Path
@@ -10,9 +9,11 @@ import yaml

REPO_ROOT = Path(__file__).resolve().parent.parent.parent
WORKFLOWS = REPO_ROOT / ".github" / "workflows"
EXPECTED_GROUP = (
EXPECTED_GROUP_PREFIX = (
    "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}"
    "-${{ github.event_name == 'workflow_dispatch' }}"
)
EXPECTED_GROUP = (
    EXPECTED_GROUP_PREFIX + "-${{ github.event_name == 'workflow_dispatch' }}"
)

@@ -26,15 +27,8 @@ def should_check(filename: Path) -> bool:

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Ensure all relevant GitHub actions jobs will be cancelled based on a concurrency key"
    )
    args = parser.parse_args()

    files = list(WORKFLOWS.glob("*.yml"))

    errors_found = False
    files = [f for f in files if should_check(f)]
    files = [f for f in WORKFLOWS.glob("*.yml") if should_check(f)]
    names = set()
    for filename in files:
        with open(filename) as f:
@@ -46,7 +40,18 @@ if __name__ == "__main__":
            errors_found = True
        names.add(name)
        actual = data.get("concurrency", {})
        if not actual.get("group", "").startswith(EXPECTED_GROUP):
        if filename.name == "create_release.yml":
            if not actual.get("group", "").startswith(EXPECTED_GROUP_PREFIX):
                print(
                    f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
                    file=sys.stderr,
                )
                print(
                    f"concurrency group should start with {EXPECTED_GROUP_PREFIX} but found {actual.get('group', None)}",
                    file=sys.stderr,
                )
                errors_found = True
        elif not actual.get("group", "").startswith(EXPECTED_GROUP):
            print(
                f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
                file=sys.stderr,
5 · .github/scripts/filter_test_configs.py · vendored
@@ -62,9 +62,10 @@ SUPPORTED_PERIODICAL_MODES: Dict[str, Callable[[Optional[str]], bool]] = {
}

# The link to the published list of disabled jobs
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
# Pinning Disabled and Unstable job to Oct 4, 2023.
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=EniFrNbB6taGjwKyN94j4oqUeeN8ALfI"
# and unstable jobs
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=2voGK5DSv0Hzvxhc23ChGcOLEBIO2vHf"

# Some constants used to handle disabled and unstable jobs
JOB_NAME_SEP = "/"
112 · .github/scripts/generate_binary_build_matrix.py · vendored
@@ -24,6 +24,21 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]

CPU_AARCH64_ARCH = ["cpu-aarch64"]

PYTORCH_EXTRA_INSTALL_REQUIREMENTS = (
    "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
    "nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'"
)


def arch_type(arch_version: str) -> str:
    if arch_version in CUDA_ARCHES:
@@ -39,22 +54,19 @@ def arch_type(arch_version: str) -> str:

WHEEL_CONTAINER_IMAGES = {
    **{
        gpu_arch: f"pytorch/manylinux-builder:cuda{gpu_arch}"
        for gpu_arch in CUDA_ARCHES
    },
    **{
        gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}"
        for gpu_arch in ROCM_ARCHES
    },
    "cpu": "pytorch/manylinux-builder:cpu",
    "cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi",
    "cpu-aarch64": "pytorch/manylinuxaarch64-builder:cpu-aarch64",
    "11.8": "pytorch/manylinux-builder:cuda11.8-2.1",
    "12.1": "pytorch/manylinux-builder:cuda12.1-2.1",
    "5.5": "pytorch/manylinux-builder:rocm5.5-2.1",
    "5.6": "pytorch/manylinux-builder:rocm5.6-2.1",
    "cpu": "pytorch/manylinux-builder:cpu-2.1",
    "cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1",
    "cpu-aarch64": "pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1",
}

CONDA_CONTAINER_IMAGES = {
    **{gpu_arch: f"pytorch/conda-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES},
    "cpu": "pytorch/conda-builder:cpu",
    "11.8": "pytorch/conda-builder:cuda11.8-2.1",
    "12.1": "pytorch/conda-builder:cuda12.1-2.1",
    "cpu": "pytorch/conda-builder:cpu-2.1",
}

PRE_CXX11_ABI = "pre-cxx11"
@@ -63,24 +75,46 @@ RELEASE = "release"
DEBUG = "debug"

LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = {
    **{
        (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:cuda{gpu_arch}"
        for gpu_arch in CUDA_ARCHES
    },
    **{
        (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}"
        for gpu_arch in CUDA_ARCHES
    },
    **{
        (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:rocm{gpu_arch}"
        for gpu_arch in ROCM_ARCHES
    },
    **{
        (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}"
        for gpu_arch in ROCM_ARCHES
    },
    ("cpu", PRE_CXX11_ABI): "pytorch/manylinux-builder:cpu",
    ("cpu", CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu",
    (
        "11.8",
        PRE_CXX11_ABI,
    ): "pytorch/manylinux-builder:cuda11.8-2.1",
    (
        "12.1",
        PRE_CXX11_ABI,
    ): "pytorch/manylinux-builder:cuda12.1-2.1",
    (
        "11.8",
        CXX11_ABI,
    ): "pytorch/libtorch-cxx11-builder:cuda11.8-2.1",
    (
        "12.1",
        CXX11_ABI,
    ): "pytorch/libtorch-cxx11-builder:cuda12.1-2.1",
    (
        "5.5",
        PRE_CXX11_ABI,
    ): "pytorch/manylinux-builder:rocm5.5-2.1",
    (
        "5.6",
        PRE_CXX11_ABI,
    ): "pytorch/manylinux-builder:rocm5.6-2.1",
    (
        "5.5",
        CXX11_ABI,
    ): "pytorch/libtorch-cxx11-builder:rocm5.5-2.1",
    (
        "5.6",
        CXX11_ABI,
    ): "pytorch/libtorch-cxx11-builder:rocm5.6-2.1",
    (
        "cpu",
        PRE_CXX11_ABI,
    ): "pytorch/manylinux-builder:cpu-2.1",
    (
        "cpu",
        CXX11_ABI,
    ): "pytorch/libtorch-cxx11-builder:cpu-2.1",
}

FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
@@ -238,18 +272,7 @@ def generate_wheels_matrix(
        "devtoolset": "",
        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
        "package_type": package_type,
        "pytorch_extra_install_requirements": "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
        "nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
        "pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS,
        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn".replace(  # noqa: B950
            ".", "_"
        ),
@@ -274,6 +297,9 @@ def generate_wheels_matrix(
        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
            ".", "_"
        ),
        "pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS
        if os != "linux"
        else "",
    }
)
return ret
1 · .github/templates/common.yml.j2 · vendored
@@ -37,6 +37,7 @@ concurrency:
!{{ display_ec2_information() }}
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
  uses: seemethere/add-github-ssh-key@v1
  continue-on-error: true
  with:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

@@ -55,12 +55,12 @@ jobs:
    uses: ./.github/workflows/_binary-build-linux.yml
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
      runs_on: linux.t4g.2xlarge
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- endif %}
      build_name: !{{ config["build_name"] }}
      build_environment: !{{ build_environment }}
      {%- if config.pytorch_extra_install_requirements is defined %}
      {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
      {%- endif %}
    secrets:
@@ -74,7 +74,7 @@ jobs:
      build_name: !{{ config["build_name"] }}
      build_environment: !{{ build_environment }}
      {%- if "aarch64" in build_environment %}
      runs_on: linux.t4g.2xlarge
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu

@@ -61,6 +61,9 @@ jobs:
    runs-on: macos-12-xl
    timeout-minutes: !{{ common.timeout_minutes }}
    !{{ upload.binary_env(config, true) }}
    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
    {%- endif %}
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}

@@ -59,6 +59,9 @@ jobs:
    runs-on: windows.4xlarge.nonephemeral
    timeout-minutes: !{{ common.timeout_minutes }}
    !{{ upload.binary_env(config, True) }}
    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
    {%- endif %}
    steps:
      !{{ common.setup_ec2_windows() }}
      !{{ set_runner_specific_vars() }}
1 · .github/workflows/_binary-build-linux.yml · vendored
@@ -140,6 +140,7 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}
1 · .github/workflows/_binary-test-linux.yml · vendored
@@ -128,6 +128,7 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}
365 · .github/workflows/_ios-build-test.yml · vendored
@@ -7,14 +7,6 @@ on:
      required: true
      type: string
      description: Top-level label for what's being built/tested.
    ios-platform:
      required: true
      type: string
      description: Which iOS platform to build for.
    ios-arch:
      required: true
      type: string
      description: Which iOS arch to build for.
    sync-tag:
      required: false
      type: string
@@ -31,8 +23,6 @@ on:
env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
  BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
  IOS_PLATFORM: ${{ inputs.ios-platform }}
  IOS_ARCH: ${{ inputs.ios-arch }}

jobs:
  filter:
@@ -63,6 +53,16 @@ jobs:
      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      IOS_PLATFORM: ${{ matrix.ios_platform }}
      IOS_ARCH: ${{ matrix.ios_arch }}
      BUILD_LITE_INTERPRETER: ${{ matrix.use_lite_interpreter }}
      USE_PYTORCH_METAL: ${{ matrix.use_metal }}
      USE_COREML_DELEGATE: ${{ matrix.use_coreml }}
      CUSTOM_OP_LIST: ${{ matrix.use_custom_op_list }}
      # TODO: Bump it to 2.2.0 after cherry pick this or figure out a better way
      # to get this version instead of hard coding it here
      PYTORCH_VERSION: 2.1.0
    timeout-minutes: 240
    steps:
      # [see note: pytorch repo ref]
@@ -70,26 +70,13 @@ jobs:
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1

      - name: Populate CI build options
        shell: bash
        run: |
          # Most builds use the lite interpreter, if certain builds shouldn't
          # build the lite interpreter this env variable should get over-written
          # in the following case statement
          echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}"
          set -ex

          case ${BUILD_ENVIRONMENT} in
            *metal*)
              echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}"
              ;;
            *full_jit*)
              echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}"
              ;;
            *custom*)
              echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}"
              ;;
            *coreml*)
              echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}"
              ;;
          esac
          if [ -n "${CUSTOM_OP_LIST:-}" ]; then
            echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/${CUSTOM_OP_LIST}" >> "${GITHUB_ENV}"
          fi

      - name: Install brew dependencies
        uses: nick-fields/retry@v2.8.2
@@ -116,54 +103,67 @@ jobs:
          retry_wait_seconds: 90
          command: |
            set -x
            cd ios/TestApp
            # install fastlane

            pushd ios/TestApp
            # Install fastlane
            sudo gem install bundler && bundle install
            bundle update fastlane
            popd

      - name: Build PyTorch Mobile Runtime
      - name: Build PyTorch mobile runtime
        shell: bash
        run: |
          set -eux
          # shellcheck disable=SC1091
          export TCLLIBPATH="/usr/local/lib"
          python -VV
          ${CONDA_RUN} scripts/build_ios.sh

      - name: Build TestApp
        if: inputs.ios-platform == 'SIMULATOR'
        if: matrix.ios_platform == 'SIMULATOR'
        timeout-minutes: 15
        run: |
          # run the ruby build script
          # Run the ruby build script
          if ! [ -x "$(command -v xcodebuild)" ]; then
            echo 'Error: xcodebuild is not installed.'
            exit 1
          fi
          ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}"

      - name: Run Simulator Tests
        if: inputs.ios-platform == 'SIMULATOR'
      - name: Run simulator tests
        if: matrix.ios_platform == 'SIMULATOR'
        shell: bash
        run: |
          set -eux
          # shellcheck disable=SC1091
          # use the pytorch nightly build to generate models
          ${CONDA_RUN} pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
          # generate models for different backends
          cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark"
          # Use the pytorch nightly build to generate models
          ${CONDA_RUN} pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

          # Generate models for different backends
          pushd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark"
          mkdir -p ../models
          # NB: Both of the following scripts only export models with lite interpreter
          if [ "${USE_COREML_DELEGATE}" == 1 ]; then
            ${CONDA_RUN} python coreml_backend.py
          else
            cd "${GITHUB_WORKSPACE}"
            pushd "${GITHUB_WORKSPACE}"
            ${CONDA_RUN} python test/mobile/model_test/gen_test_model.py ios-test
            popd
          fi
          cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark"

          if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
            echo "Setting up the TestApp for LiteInterpreter"
            ruby setup.rb --lite 1
          else
            # Generate some models for JIT without lite interpreter
            ${CONDA_RUN} python trace_model.py

            echo "Setting up the TestApp for Full JIT"
            ruby setup.rb
          fi
          cd "${GITHUB_WORKSPACE}/ios/TestApp"
          # instruments -s -devices
          popd

          pushd "${GITHUB_WORKSPACE}/ios/TestApp"
          # Instruments -s -devices
          if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
            if [ "${USE_COREML_DELEGATE}" == 1 ]; then
              bundle exec fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML
@@ -173,9 +173,282 @@ jobs:
          else
            bundle exec fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT
          fi
          popd

      - name: Dump Simulator Tests On a Failure
        if: failure() && inputs.ios-platform == 'SIMULATOR'
      - name: Dump simulator tests on failure
        if: failure() && matrix.ios_platform == 'SIMULATOR'
        run: |
          echo "Simulator Tests Logs:"
          cat /Users/runner/Library/Logs/scan/*.log

      - name: Prepare the build artifacts for upload
        shell: bash
        run: |
          set -eux

          # The structure of the folder is as follows:
          #
          # RUNNER_TEMP/
          # └── IOS_ARCH/
          #     ├── LICENSE
          #     ├── install
          #     │   ├── include
          #     │   │   └── headers
          #     │   └── lib
          #     │       ├── libXNNPACK.a
          #     │       ├── libc10.a
          #     │       ├── libclog.a
          #     │       ├── libcpuinfo.a
          #     │       ├── libeigen_blas.a
          #     │       ├── libpthreadpool.a
          #     │       ├── libpytorch_qnnpack.a
          #     │       ├── libtorch.a
          #     │       └── libtorch_cpu.a
          #     ├── src
          #     │   └── LibTorch-Lite.h
          #     └── version.txt
          SETUP_DIR="${RUNNER_TEMP}/${IOS_ARCH}"
          mkdir -p "${SETUP_DIR}/src"

          cp -R "${GITHUB_WORKSPACE}/build_ios/install" "${SETUP_DIR}"
          # Copy the umbrella header and license
          if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
            cp "${GITHUB_WORKSPACE}/ios/LibTorch-Lite.h" "${SETUP_DIR}/src"
          else
            cp "${GITHUB_WORKSPACE}/ios/LibTorch.h" "${SETUP_DIR}/src"
          fi

          # Copy license and version
          cp "${GITHUB_WORKSPACE}/LICENSE" "${SETUP_DIR}"
          echo "${PYTORCH_VERSION}" > "${SETUP_DIR}"/version.txt

          # Save the podspec for the upload job later
          if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
            DATE=$(date -u +%Y%m%d)
            cp "${GITHUB_WORKSPACE}"/ios/LibTorch-Lite-Nightly.podspec.template "${SETUP_DIR}"/LibTorch-Lite-Nightly.podspec
            sed -i '' -e "s/IOS_NIGHTLY_BUILD_VERSION/${PYTORCH_VERSION}.${DATE}/g" "${SETUP_DIR}"/LibTorch-Lite-Nightly.podspec

            cp "${GITHUB_WORKSPACE}"/ios/LibTorch-Lite.podspec.template "${SETUP_DIR}"/LibTorch-Lite.podspec
            sed -i '' -e "s/IOS_BUILD_VERSION/${PYTORCH_VERSION}/g" "${SETUP_DIR}"/LibTorch-Lite.podspec
          else
            # NB: There is no nightly build without lite interpreter atm
            cp "${GITHUB_WORKSPACE}"/ios/LibTorch.podspec.template "${SETUP_DIR}"/LibTorch.podspec
            sed -i '' -e "s/IOS_BUILD_VERSION/${PYTORCH_VERSION}/g" "${SETUP_DIR}"/LibTorch.podspec
          fi

          pushd "${SETUP_DIR}"
          # NB: It's important to zip all the files before uploading because the GHA will upload
          # all files sequentially which is both slow and has too many requests. More info is at
          # https://github.com/actions/upload-artifact#too-many-uploads-resulting-in-429-responses
          zip -r "${IOS_ARCH}.zip" install src version.txt LICENSE ./*.podspec
          popd

      - uses: actions/upload-artifact@v3
        with:
          name: pytorch-ios-build-artifacts-${{ matrix.ios_arch }}
          if-no-files-found: error
          path: ${{ runner.temp }}/${{ matrix.ios_arch }}/${{ matrix.ios_arch }}.zip

  upload-ios-artifacts:
    # NB: this job runs on a GitHub macOS ephemeral runner so that it can use lipo
    # to create the fat iOS binaries for both x86_64 and arm64
    runs-on: macos-12
    needs: build
    # NB: Only upload release builds; if we need it, we could also turn on nightly here
    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'ios-upload' || '' }}
    steps:
      - uses: actions/checkout@v3

      # For awscli S3 upload
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
          cache: pip

      # For cocoapods pod upload
      - uses: ruby/setup-ruby@v1
        with:
          ruby-version: '3.2'
          bundler-cache: true

      - name: Download arm64 artifacts
        uses: actions/download-artifact@v3
        with:
          name: pytorch-ios-build-artifacts-arm64

      - name: Download x86_64 artifacts
        uses: actions/download-artifact@v3
        with:
          name: pytorch-ios-build-artifacts-x86_64

      - name: Unzip arm64 and x86_64 artifacts
        shell: bash
        run: |
          set -eux

          for ARCH in "arm64" "x86_64"; do
            TMP_DIR="${RUNNER_TEMP}/${ARCH}"
            mkdir -p "${TMP_DIR}"

            cp "${ARCH}.zip" "${TMP_DIR}"

            pushd "${TMP_DIR}"
            unzip -o "${ARCH}.zip"
            popd
          done

      - name: Prepare the artifact
        env:
          IS_NIGHTLY: ${{ github.event.ref == 'refs/heads/nightly' }}
        shell: bash
        working-directory: ${{ runner.temp }}/arm64
        run: |
          set -eux

          DEST_DIR="${RUNNER_TEMP}"/ios
          echo "DEST_DIR=${DEST_DIR}" >> "$GITHUB_ENV"

          # Prepare all the sub directories
          mkdir -p "${DEST_DIR}"/install/lib

          # Copy header and share files, arm64 or x86_64 both work
          cp -R install/include "${DEST_DIR}"/install
          cp -R install/share "${DEST_DIR}"/install
          # The last dash is important to copy only files under src
          cp -R src "${DEST_DIR}"
          cp LICENSE "${DEST_DIR}"

          if [ "${IS_NIGHTLY}" == true ]; then
            PYTORCH_VERSION=$(cat version.txt)
            DATE=$(date -u +%Y%m%d)
            echo "${PYTORCH_VERSION}.${DATE}" > "${DEST_DIR}"/version.txt
          else
            cp version.txt "${DEST_DIR}"
          fi
          PYTORCH_VERSION=$(cat "${DEST_DIR}"/version.txt)
          echo "PYTORCH_VERSION=${PYTORCH_VERSION}" >> "$GITHUB_ENV"

          pushd install/lib
          # shellcheck disable=SC2207
          LIBRARIES=($(ls ./*.a))
          popd

          for LIB in "${LIBRARIES[@]}"; do
            FROM_LIBS=("${RUNNER_TEMP}"/arm64/install/lib/"${LIB}" "${RUNNER_TEMP}"/x86_64/install/lib/"${LIB}")
            # Create a fat binary for both arm64 and x86_64
            lipo -create "${FROM_LIBS[@]}" -o "${DEST_DIR}"/install/lib/"${LIB}"
            # Print the info
            lipo -i "${DEST_DIR}"/install/lib/"${LIB}"
          done

          BUILD_LITE_INTERPRETER=1
          if [ -f "${RUNNER_TEMP}"/arm64/LibTorch.podspec ]; then
            # If LibTorch.podspec is used instead of LibTorch-Lite.podspec, the artifact is built
            # without lite interpreter
            BUILD_LITE_INTERPRETER=0
          fi
          echo "BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" >> "$GITHUB_ENV"

      - name: Prepare the podspec
        env:
          IS_NIGHTLY: ${{ github.event.ref == 'refs/heads/nightly' }}
        shell: bash
        working-directory: ${{ env.DEST_DIR }}
        run: |
          set -eux

          ARTIFACT_NAME=libtorch
          SPEC_NAME=LibTorch

          if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
            ARTIFACT_NAME="${ARTIFACT_NAME}_lite_ios"
            SPEC_NAME="${SPEC_NAME}-Lite"
          else
            ARTIFACT_NAME="${ARTIFACT_NAME}_ios"
          fi

          if [ "${IS_NIGHTLY}" == true ]; then
            ARTIFACT_NAME="${ARTIFACT_NAME}_nightly_${PYTORCH_VERSION}.zip"
            SPEC_NAME="${SPEC_NAME}-Nightly"
          else
            ARTIFACT_NAME="${ARTIFACT_NAME}_${PYTORCH_VERSION}.zip"
          fi

          SPEC_NAME_WITH_VERSION="${SPEC_NAME}-${PYTORCH_VERSION}.podspec"
          SPEC_NAME="${SPEC_NAME}.podspec"

          # Also copy the spec file
          cp "${RUNNER_TEMP}"/arm64/"${SPEC_NAME}" "${SPEC_NAME_WITH_VERSION}"

          # NB: It's important to zip all the files before uploading because the GHA will upload
          # all files sequentially which is both slow and has too many requests. More info is at
          # https://github.com/actions/upload-artifact#too-many-uploads-resulting-in-429-responses
          zip -r "${ARTIFACT_NAME}" install src version.txt LICENSE

          {
            echo "ARTIFACT_NAME=${ARTIFACT_NAME}"
            echo "SPEC_NAME_WITH_VERSION=${SPEC_NAME_WITH_VERSION}"
            echo "SPEC_NAME=${SPEC_NAME}"
          } >> "$GITHUB_ENV"

      - uses: actions/upload-artifact@v3
        with:
          name: pytorch-ios-artifacts
          if-no-files-found: error
          path: ${{ env.DEST_DIR }}/${{ env.ARTIFACT_NAME }}

      - uses: actions/upload-artifact@v3
        with:
          name: pytorch-ios-podspec
          if-no-files-found: error
          path: ${{ env.DEST_DIR }}/${{ env.SPEC_NAME_WITH_VERSION }}

      - name: Set DRY_RUN
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/v'))) }}
        shell: bash
        run: |
          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

      - name: Upload the artifact to S3
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
          IS_NIGHTLY: ${{ github.event.ref == 'refs/heads/nightly' }}
        shell: bash
        working-directory: ${{ env.DEST_DIR }}
        run: |
          set -eux

          pip install -q awscli==1.29.40

          DRY_RUN=${DRY_RUN:-enabled}
          AWS_S3_CP="aws s3 cp --dryrun"
          if [ "${DRY_RUN}" == "disabled" ]; then
            AWS_S3_CP="aws s3 cp"
          fi

          if [ "${IS_NIGHTLY}" == true ]; then
            BUCKET_NAME="ossci-ios-build"
          else
            BUCKET_NAME="ossci-ios"
          fi

          ${AWS_S3_CP} "${ARTIFACT_NAME}" "s3://${BUCKET_NAME}/" --acl public-read
          ${AWS_S3_CP} "${SPEC_NAME_WITH_VERSION}" "s3://${BUCKET_NAME}/" --acl public-read

      - name: Upload the artifact to cocoapods (nightly only)
        env:
          # We need to set this secret to upload to cocoapods. However, we might want
          # to NOT set this for PROD release so that we can upload the artifacts manually
          COCOAPODS_TRUNK_TOKEN: ${{ secrets.COCOAPODS_TRUNK_TOKEN || '' }}
        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' && env.COCOAPODS_TRUNK_TOKEN != '' }}
        shell: bash
        working-directory: ${{ runner.temp }}/arm64
        run: |
          set -eux

          gem install cocoapods

          pod trunk me
          # Upload the spec to cocoapods
          pod trunk push --verbose --allow-warnings --use-libraries --skip-import-validation "${SPEC_NAME}"
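For context, the upload job above merges the per-architecture static libraries into universal binaries with `lipo`. A minimal sketch of that step outside the workflow (paths and library names are illustrative):

```bash
#!/usr/bin/env bash
set -eux

# Merge each per-arch static library into a universal ("fat") binary.
for LIB in libtorch.a libtorch_cpu.a; do
    lipo -create "arm64/install/lib/${LIB}" "x86_64/install/lib/${LIB}" \
        -output "ios/install/lib/${LIB}"
    lipo -info "ios/install/lib/${LIB}"   # report the architectures in the result
done
```

This is also why the job has to run on a macOS runner: `lipo` ships with Xcode's command-line tools.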
20 · .github/workflows/_run_android_tests.yml · vendored
@@ -41,9 +41,17 @@ jobs:
    strategy:
      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
      fail-fast: false
    # NB: This job can only run on GitHub Linux runner atm. This is an ok thing though
    # because that runner is ephemeral and could access upload secrets
    runs-on: ${{ matrix.runner }}
    env:
      # GitHub runner installs Android SDK on this path
      ANDROID_ROOT: /usr/local/lib/android
      ANDROID_NDK_VERSION: '21.4.7075529'
      BUILD_LITE_INTERPRETER: ${{ matrix.use_lite_interpreter }}
      # 4 of them are supported atm: armeabi-v7a, arm64-v8a, x86, x86_64
      SUPPORT_ABI: '${{ matrix.support_abi }}'
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1

@@ -51,7 +59,7 @@ jobs:
        uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.1
        with:
          python-version: 3.8
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}.txt

      - name: Install NDK
        uses: nick-fields/retry@v2.8.2
@@ -60,12 +68,12 @@ jobs:
          max_attempts: 3
          retry_wait_seconds: 90
          command: |
            set -eux

            # Install NDK 21 after GitHub update
            # https://github.com/actions/virtual-environments/issues/5595
            ANDROID_ROOT="/usr/local/lib/android"
            ANDROID_SDK_ROOT="${ANDROID_ROOT}/sdk"
            ANDROID_NDK="${ANDROID_SDK_ROOT}/ndk-bundle"
            ANDROID_NDK_VERSION="21.4.7075529"

            SDKMANAGER="${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager"
            # NB: This step downloads and installs NDK, thus it could be flaky.
@@ -86,8 +94,10 @@ jobs:

      - name: Build PyTorch Android
        run: |
          set -eux

          echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}"
          ${CONDA_RUN} ./scripts/build_pytorch_android.sh x86
          ${CONDA_RUN} ./scripts/build_pytorch_android.sh "${SUPPORT_ABI}"

      - name: Run tests
        uses: reactivecircus/android-emulator-runner@v2
48 · .github/workflows/build-android-binaries.yml · vendored · Normal file
@@ -0,0 +1,48 @@
name: Build Android binaries

on:
  push:
    branches:
      - nightly
    tags:
      # NOTE: Binary build pipelines should only get triggered on release candidate builds
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
      - .github/workflows/build-android-binaries.yml
      - .github/workflows/_run_android_tests.yml
      - android/**
  pull_request:
    paths:
      - .github/workflows/build-android-binaries.yml
      - .github/workflows/_run_android_tests.yml
      - android/**
  # NB: We can use this workflow dispatch to test and build the binaries manually
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  android-build-test:
    name: android-build-test
    uses: ./.github/workflows/_run_android_tests.yml
    with:
      test-matrix: |
        { include: [
          { config: 'default',
            shard: 1,
            num_shards: 1,
            runner: 'ubuntu-20.04-16x',
            use_lite_interpreter: 1,
            support_abi: 'armeabi-v7a,arm64-v8a,x86,x86_64',
          },
          { config: 'default',
            shard: 1,
            num_shards: 1,
            runner: 'ubuntu-20.04-16x',
            use_lite_interpreter: 0,
            support_abi: 'armeabi-v7a,arm64-v8a,x86,x86_64',
          },
        ]}
70 · .github/workflows/build-ios-binaries.yml · vendored · Normal file
@@ -0,0 +1,70 @@
name: Build iOS binaries

on:
  push:
    branches:
      - nightly
    tags:
      # NOTE: Binary build pipelines should only get triggered on release candidate builds
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
      - .github/workflows/build-ios-binaries.yml
      - .github/workflows/_ios-build-test.yml
  pull_request:
    paths:
      - .github/workflows/build-ios-binaries.yml
      - .github/workflows/_ios-build-test.yml
  # NB: We can use this workflow dispatch to test and build iOS binaries manually
  workflow_dispatch:
    inputs:
      use_lite_interpreter:
        description: "Use PyTorch lite interpreter?"
        type: string
        default: 1
      use_coreml:
        description: "Use Apple Core ML?"
        type: string
        default: 1
      use_custom_op_list:
        description: "Specify the custom ops list to include in the binaries"
        type: string
        default: ""

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  # TODO: Figure out how to migrate this job to M1 runner
  ios-build-test:
    name: ios-build-test
    uses: ./.github/workflows/_ios-build-test.yml
    with:
      build-environment: ios-build-test
      sync-tag: ios-build-test
      test-matrix: |
        { include: [
          { config: "default",
            shard: 1,
            num_shards: 1,
            runner: "macos-12",
            ios_platform: "SIMULATOR",
            ios_arch: "x86_64",
            use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},
            use_metal: 0,
            use_coreml: ${{ inputs.use_coreml || 1 }},
            use_custom_op_list: ${{ inputs.use_custom_op_list || '' }}
          },
          { config: "default",
            shard: 1,
            num_shards: 1,
            runner: "macos-12",
            ios_platform: "OS",
            ios_arch: "arm64",
            use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},
            use_metal: 1,
            use_coreml: ${{ inputs.use_coreml || 1 }},
            use_custom_op_list: ${{ inputs.use_custom_op_list || '' }}
          }
        ]}
7 · .github/workflows/create_release.yml · vendored
@@ -2,10 +2,9 @@ name: Create Release

on:
  push:
    tags: ['v*']
    branches:
      - main
      - nightly
      - release/*
  release:
    types: [published]
  pull_request:
@@ -31,6 +30,8 @@ jobs:
      run: |
        tag_or_branch="${PT_GITHUB_REF#refs/tags/}"
        tag_or_branch="${tag_or_branch#refs/heads/}"
        # replace directory separators with _ in branch name
        tag_or_branch="${tag_or_branch//\//_}"
        echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
        echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
    - name: Create source distribution
@@ -52,5 +53,5 @@ jobs:
        files: ${{env.PT_RELEASE_FILE}}

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
  cancel-in-progress: true
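For reference, the release-name derivation above is plain bash parameter expansion; a minimal sketch of what the added substitution line does (the input ref is an example):

```bash
#!/usr/bin/env bash
set -eu

PT_GITHUB_REF="refs/heads/release/2.1"         # example input
tag_or_branch="${PT_GITHUB_REF#refs/tags/}"    # strip a leading refs/tags/ if present
tag_or_branch="${tag_or_branch#refs/heads/}"   # strip a leading refs/heads/ if present
tag_or_branch="${tag_or_branch//\//_}"         # replace every remaining / with _
echo "pytorch-${tag_or_branch}.tar.gz"         # -> pytorch-release_2.1.tar.gz
```

Without the added substitution, a branch such as `release/2.1` would put a directory separator into the tarball name.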
29 · .github/workflows/docker-release.yml · vendored
@@ -2,6 +2,15 @@ name: Build Official Docker Images

on:
  workflow_dispatch:
    inputs:
      channel:
        description: "Channel to use (nightly, test, release)"
        required: false
        type: choice
        default: test
        options:
          - release
          - test
  pull_request:
    paths:
      - Dockerfile
@@ -15,6 +24,7 @@ on:
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
      - ciflow/nightly/*

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@@ -26,7 +36,7 @@ env:
  DOCKER_REGISTRY: ghcr.io
  NO_BUILD_SUFFIX: true
  USE_BUILDX: 1
  WITH_PUSH: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
  WITH_PUSH: ${{ inputs.channel == 'release' }}

jobs:
  build:
@@ -45,6 +55,7 @@ jobs:
    env:
      BUILD_IMAGE_TYPE: ${{ matrix.image_type }}
      BUILD_PLATFORMS: ${{ matrix.platform }}
      CHANNEL: ${{ inputs.channel }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
@@ -60,7 +71,7 @@ jobs:
      - name: Setup Linux
        uses: ./.github/actions/setup-linux
      - name: Login to GitHub Container Registry
        if: ${{ env.WITH_PUSH == 'true' }}
        if: ${{ inputs.channel == 'release' }}
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
@@ -80,14 +91,16 @@ jobs:
          set -eou pipefail
          # To get QEMU binaries in our PATH
          echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}"
          # Generate PyTorch version to use
          echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py)" >> "${GITHUB_ENV}"
      - name: Setup nightly specific variables
        if: ${{ github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/ciflow/nightly/') }}
          # Generate PyTorch version to use without suffix
          echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}"

      - name: Setup release specific variables
        run: |
          {
            echo "DOCKER_IMAGE=pytorch-nightly";
            echo "INSTALL_CHANNEL=pytorch-nightly";
            echo "INSTALL_CHANNEL=pytorch-test";
            if [[ ${CHANNEL} == "release" ]]; then
              echo "INSTALL_CHANNEL=pytorch";
            fi
            echo "TRITON_VERSION=$(cut -f 1 .ci/docker/triton_version.txt)+$(cut -c -10 .ci/docker/ci_commit_pins/triton.txt)";
          } >> "${GITHUB_ENV}"
      - name: Run docker build / push
44 · .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml · generated · vendored
@ -47,12 +47,13 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.8"
|
||||
runs_on: linux.t4g.2xlarge
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_8-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_8-cpu-aarch64-test: # Testing
|
||||
@ -67,11 +68,11 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.8"
|
||||
build_name: manywheel-py3_8-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
runs_on: linux.t4g.2xlarge
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -86,7 +87,7 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.8"
|
||||
build_name: manywheel-py3_8-cpu-aarch64
|
||||
secrets:
|
||||
@ -107,12 +108,13 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runs_on: linux.t4g.2xlarge
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-aarch64-test: # Testing
|
||||
@ -127,11 +129,11 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
runs_on: linux.t4g.2xlarge
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -146,7 +148,7 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
secrets:
|
||||
@ -167,12 +169,13 @@ jobs:
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runs_on: linux.t4g.2xlarge
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
 secrets:
 github-token: ${{ secrets.GITHUB_TOKEN }}
 manywheel-py3_10-cpu-aarch64-test: # Testing
@@ -187,11 +190,11 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu-aarch64
-DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
+DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
 DESIRED_PYTHON: "3.10"
 build_name: manywheel-py3_10-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
-runs_on: linux.t4g.2xlarge
+runs_on: linux.arm64.2xlarge
 ALPINE_IMAGE: "arm64v8/alpine"
 secrets:
 github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -206,7 +209,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu-aarch64
-DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
+DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
 DESIRED_PYTHON: "3.10"
 build_name: manywheel-py3_10-cpu-aarch64
 secrets:
@@ -227,12 +230,13 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu-aarch64
-DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
+DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
 DESIRED_PYTHON: "3.11"
-runs_on: linux.t4g.2xlarge
+runs_on: linux.arm64.2xlarge
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_11-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
 secrets:
 github-token: ${{ secrets.GITHUB_TOKEN }}
 manywheel-py3_11-cpu-aarch64-test: # Testing
@@ -247,11 +251,11 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu-aarch64
-DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
+DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
 DESIRED_PYTHON: "3.11"
 build_name: manywheel-py3_11-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
-runs_on: linux.t4g.2xlarge
+runs_on: linux.arm64.2xlarge
 ALPINE_IMAGE: "arm64v8/alpine"
 secrets:
 github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -266,7 +270,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu-aarch64
-DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64
+DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1
 DESIRED_PYTHON: "3.11"
 build_name: manywheel-py3_11-cpu-aarch64
 secrets:
72 .github/workflows/generated-linux-binary-conda-nightly.yml generated vendored
@@ -47,7 +47,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cpu
 build_environment: linux-binary-conda
@@ -65,7 +65,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cpu
 build_environment: linux-binary-conda
@@ -83,7 +83,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cpu
 secrets:
@@ -105,7 +105,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda11_8
 build_environment: linux-binary-conda
@@ -124,7 +124,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda11_8
 build_environment: linux-binary-conda
@@ -143,7 +143,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda11_8
 secrets:
@@ -165,7 +165,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda12_1
 build_environment: linux-binary-conda
@@ -184,7 +184,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda12_1
 build_environment: linux-binary-conda
@@ -203,7 +203,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.8"
 build_name: conda-py3_8-cuda12_1
 secrets:
@@ -224,7 +224,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cpu
 build_environment: linux-binary-conda
@@ -242,7 +242,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cpu
 build_environment: linux-binary-conda
@@ -260,7 +260,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cpu
 secrets:
@@ -282,7 +282,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda11_8
 build_environment: linux-binary-conda
@@ -301,7 +301,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda11_8
 build_environment: linux-binary-conda
@@ -320,7 +320,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda11_8
 secrets:
@@ -342,7 +342,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda12_1
 build_environment: linux-binary-conda
@@ -361,7 +361,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda12_1
 build_environment: linux-binary-conda
@@ -380,7 +380,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.9"
 build_name: conda-py3_9-cuda12_1
 secrets:
@@ -401,7 +401,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cpu
 build_environment: linux-binary-conda
@@ -419,7 +419,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cpu
 build_environment: linux-binary-conda
@@ -437,7 +437,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cpu
 secrets:
@@ -459,7 +459,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda11_8
 build_environment: linux-binary-conda
@@ -478,7 +478,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda11_8
 build_environment: linux-binary-conda
@@ -497,7 +497,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda11_8
 secrets:
@@ -519,7 +519,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda12_1
 build_environment: linux-binary-conda
@@ -538,7 +538,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda12_1
 build_environment: linux-binary-conda
@@ -557,7 +557,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.10"
 build_name: conda-py3_10-cuda12_1
 secrets:
@@ -578,7 +578,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cpu
 build_environment: linux-binary-conda
@@ -596,7 +596,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cpu
 build_environment: linux-binary-conda
@@ -614,7 +614,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cpu
 secrets:
@@ -636,7 +636,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda11_8
 build_environment: linux-binary-conda
@@ -655,7 +655,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda11_8
 build_environment: linux-binary-conda
@@ -674,7 +674,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
+DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda11_8
 secrets:
@@ -696,7 +696,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda12_1
 build_environment: linux-binary-conda
@@ -715,7 +715,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda12_1
 build_environment: linux-binary-conda
@@ -734,7 +734,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/conda-builder:cuda12.1
+DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.11"
 build_name: conda-py3_11-cuda12_1
 secrets:
4 .github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml generated vendored
@@ -42,7 +42,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@@ -61,7 +61,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-with-deps-cxx11-abi
104 .github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml generated vendored
@@ -47,7 +47,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@@ -66,7 +66,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@@ -85,7 +85,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@@ -107,7 +107,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-without-deps-cxx11-abi
@@ -126,7 +126,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-without-deps-cxx11-abi
@@ -145,7 +145,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-shared-without-deps-cxx11-abi
@@ -167,7 +167,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-with-deps-cxx11-abi
@@ -186,7 +186,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-with-deps-cxx11-abi
@@ -205,7 +205,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-with-deps-cxx11-abi
@@ -227,7 +227,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-without-deps-cxx11-abi
@@ -246,7 +246,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-without-deps-cxx11-abi
@@ -265,7 +265,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cpu-static-without-deps-cxx11-abi
@@ -288,7 +288,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi
@@ -308,7 +308,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi
@@ -328,7 +328,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi
@@ -351,7 +351,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-without-deps-cxx11-abi
@@ -371,7 +371,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-without-deps-cxx11-abi
@@ -391,7 +391,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-shared-without-deps-cxx11-abi
@@ -414,7 +414,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-with-deps-cxx11-abi
@@ -434,7 +434,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-with-deps-cxx11-abi
@@ -454,7 +454,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-with-deps-cxx11-abi
@@ -477,7 +477,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-without-deps-cxx11-abi
@@ -497,7 +497,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-without-deps-cxx11-abi
@@ -517,7 +517,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda11_8-static-without-deps-cxx11-abi
@@ -540,7 +540,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-with-deps-cxx11-abi
@@ -560,7 +560,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-with-deps-cxx11-abi
@@ -580,7 +580,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-with-deps-cxx11-abi
@@ -603,7 +603,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-without-deps-cxx11-abi
@@ -623,7 +623,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-without-deps-cxx11-abi
@@ -643,7 +643,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-shared-without-deps-cxx11-abi
@@ -666,7 +666,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-with-deps-cxx11-abi
@@ -686,7 +686,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-with-deps-cxx11-abi
@@ -706,7 +706,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-with-deps-cxx11-abi
@@ -729,7 +729,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-without-deps-cxx11-abi
@@ -749,7 +749,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-without-deps-cxx11-abi
@@ -769,7 +769,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-cuda12_1-static-without-deps-cxx11-abi
@@ -792,7 +792,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_5-shared-with-deps-cxx11-abi
@@ -814,7 +814,7 @@ jobs:
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 steps:
@@ -855,7 +855,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/libtorch-cxx11-builder:rocm5.5
+docker-image: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -872,7 +872,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_5-shared-with-deps-cxx11-abi
@@ -895,7 +895,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_5-static-with-deps-cxx11-abi
@@ -917,7 +917,7 @@ jobs:
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 steps:
@@ -958,7 +958,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/libtorch-cxx11-builder:rocm5.5
+docker-image: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -975,7 +975,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_5-static-with-deps-cxx11-abi
@@ -998,7 +998,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi
@@ -1020,7 +1020,7 @@ jobs:
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 steps:
@@ -1061,7 +1061,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/libtorch-cxx11-builder:rocm5.6
+docker-image: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -1078,7 +1078,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi
@@ -1101,7 +1101,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_6-static-with-deps-cxx11-abi
@@ -1123,7 +1123,7 @@ jobs:
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 steps:
@@ -1164,7 +1164,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/libtorch-cxx11-builder:rocm5.6
+docker-image: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -1181,7 +1181,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: cxx11-abi
 build_name: libtorch-rocm5_6-static-with-deps-cxx11-abi
4 .github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml generated vendored
@@ -42,7 +42,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-with-deps-pre-cxx11
@@ -61,7 +61,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-with-deps-pre-cxx11
104 .github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml generated vendored
@@ -47,7 +47,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-with-deps-pre-cxx11
@@ -66,7 +66,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-with-deps-pre-cxx11
@@ -85,7 +85,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-with-deps-pre-cxx11
@@ -107,7 +107,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-without-deps-pre-cxx11
@@ -126,7 +126,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-without-deps-pre-cxx11
@@ -145,7 +145,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-shared-without-deps-pre-cxx11
@@ -167,7 +167,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-with-deps-pre-cxx11
@@ -186,7 +186,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-with-deps-pre-cxx11
@@ -205,7 +205,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-with-deps-pre-cxx11
@@ -227,7 +227,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-without-deps-pre-cxx11
@@ -246,7 +246,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-without-deps-pre-cxx11
@@ -265,7 +265,7 @@ jobs:
 # favor of GPU_ARCH_VERSION
 DESIRED_CUDA: cpu
 GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cpu-static-without-deps-pre-cxx11
@@ -288,7 +288,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-with-deps-pre-cxx11
@@ -308,7 +308,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-with-deps-pre-cxx11
@@ -328,7 +328,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-with-deps-pre-cxx11
@@ -351,7 +351,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-without-deps-pre-cxx11
@@ -371,7 +371,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-without-deps-pre-cxx11
@@ -391,7 +391,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-shared-without-deps-pre-cxx11
@@ -414,7 +414,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-with-deps-pre-cxx11
@@ -434,7 +434,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-with-deps-pre-cxx11
@@ -454,7 +454,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-with-deps-pre-cxx11
@@ -477,7 +477,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-without-deps-pre-cxx11
@@ -497,7 +497,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-without-deps-pre-cxx11
@@ -517,7 +517,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda11_8-static-without-deps-pre-cxx11
@@ -540,7 +540,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-with-deps-pre-cxx11
@@ -560,7 +560,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-with-deps-pre-cxx11
@@ -580,7 +580,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-with-deps-pre-cxx11
@@ -603,7 +603,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-without-deps-pre-cxx11
@@ -623,7 +623,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-without-deps-pre-cxx11
@@ -643,7 +643,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: shared-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-shared-without-deps-pre-cxx11
@@ -666,7 +666,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-with-deps-pre-cxx11
@@ -686,7 +686,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-with-deps-pre-cxx11
@@ -706,7 +706,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-with-deps-pre-cxx11
@@ -729,7 +729,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-without-deps-pre-cxx11
@@ -749,7 +749,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-without-deps-pre-cxx11
@@ -769,7 +769,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 LIBTORCH_VARIANT: static-without-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-cuda12_1-static-without-deps-pre-cxx11
@@ -792,7 +792,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_5-shared-with-deps-pre-cxx11
@@ -814,7 +814,7 @@ jobs:
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 steps:
@@ -855,7 +855,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -872,7 +872,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_5-shared-with-deps-pre-cxx11
@@ -895,7 +895,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_5-static-with-deps-pre-cxx11
@@ -917,7 +917,7 @@ jobs:
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 steps:
@@ -958,7 +958,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -975,7 +975,7 @@ jobs:
 DESIRED_CUDA: rocm5.5
 GPU_ARCH_VERSION: 5.5
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_5-static-with-deps-pre-cxx11
@@ -998,7 +998,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11
@@ -1020,7 +1020,7 @@ jobs:
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 steps:
@@ -1061,7 +1061,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -1078,7 +1078,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: shared-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11
@@ -1101,7 +1101,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_6-static-with-deps-pre-cxx11
@@ -1123,7 +1123,7 @@ jobs:
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
 SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 steps:
@@ -1164,7 +1164,7 @@ jobs:
 - name: Pull Docker image
 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
 with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
 - name: Test Pytorch binary
 uses: ./pytorch/.github/actions/test-pytorch-binary
 - name: Teardown ROCm
@@ -1181,7 +1181,7 @@ jobs:
 DESIRED_CUDA: rocm5.6
 GPU_ARCH_VERSION: 5.6
 GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
 LIBTORCH_VARIANT: static-with-deps
 DESIRED_DEVTOOLSET: pre-cxx11
 build_name: libtorch-rocm5_6-static-with-deps-pre-cxx11
8 .github/workflows/generated-linux-binary-manywheel-main.yml generated vendored
@@ -43,7 +43,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.8"
 build_name: manywheel-py3_8-cuda11_8
 build_environment: linux-binary-manywheel
@@ -62,7 +62,7 @@ jobs:
 DESIRED_CUDA: cu118
 GPU_ARCH_VERSION: 11.8
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
 DESIRED_PYTHON: "3.8"
 build_name: manywheel-py3_8-cuda11_8
 build_environment: linux-binary-manywheel
@@ -82,7 +82,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.8"
 build_name: manywheel-py3_8-cuda12_1-with-pypi-cudnn
 build_environment: linux-binary-manywheel
@@ -102,7 +102,7 @@ jobs:
 DESIRED_CUDA: cu121
 GPU_ARCH_VERSION: 12.1
 GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
 DESIRED_PYTHON: "3.8"
 build_name: manywheel-py3_8-cuda12_1-with-pypi-cudnn
 build_environment: linux-binary-manywheel
184 .github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored
@ -47,7 +47,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu
build_environment: linux-binary-manywheel
@ -65,7 +65,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu
build_environment: linux-binary-manywheel
@ -83,7 +83,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu
secrets:
@ -104,7 +104,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu-cxx11-abi
@ -123,7 +123,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu-cxx11-abi
@ -142,7 +142,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cpu-cxx11-abi
@ -165,7 +165,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
@ -184,7 +184,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
@ -203,7 +203,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
secrets:
@ -225,7 +225,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -245,7 +245,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -264,7 +264,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1-with-pypi-cudnn
secrets:
@ -286,7 +286,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
@ -305,7 +305,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
@ -324,7 +324,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
secrets:
@ -346,7 +346,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-rocm5_5
build_environment: linux-binary-manywheel
@ -367,7 +367,7 @@ jobs:
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.8"
steps:
- name: Setup ROCm
@ -407,7 +407,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -424,7 +424,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-rocm5_5
secrets:
@ -446,7 +446,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-rocm5_6
build_environment: linux-binary-manywheel
@ -467,7 +467,7 @@ jobs:
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.8"
steps:
- name: Setup ROCm
@ -507,7 +507,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -524,7 +524,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-rocm5_6
secrets:
@ -545,7 +545,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
@ -563,7 +563,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
@ -581,7 +581,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
secrets:
@ -602,7 +602,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-cxx11-abi
@ -621,7 +621,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-cxx11-abi
@ -640,7 +640,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-cxx11-abi
@ -663,7 +663,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
@ -682,7 +682,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
@ -701,7 +701,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
secrets:
@ -723,7 +723,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -743,7 +743,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -762,7 +762,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1-with-pypi-cudnn
secrets:
@ -784,7 +784,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
@ -803,7 +803,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
@ -822,7 +822,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
secrets:
@ -844,7 +844,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm5_5
build_environment: linux-binary-manywheel
@ -865,7 +865,7 @@ jobs:
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
@ -905,7 +905,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -922,7 +922,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm5_5
secrets:
@ -944,7 +944,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm5_6
build_environment: linux-binary-manywheel
@ -965,7 +965,7 @@ jobs:
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
@ -1005,7 +1005,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -1022,7 +1022,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm5_6
secrets:
@ -1043,7 +1043,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu
build_environment: linux-binary-manywheel
@ -1061,7 +1061,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu
build_environment: linux-binary-manywheel
@ -1079,7 +1079,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu
secrets:
@ -1100,7 +1100,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-cxx11-abi
@ -1119,7 +1119,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-cxx11-abi
@ -1138,7 +1138,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-cxx11-abi
@ -1161,7 +1161,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
@ -1180,7 +1180,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
@ -1199,7 +1199,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
secrets:
@ -1221,7 +1221,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -1241,7 +1241,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -1260,7 +1260,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-with-pypi-cudnn
secrets:
@ -1282,7 +1282,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
@ -1301,7 +1301,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
@ -1320,7 +1320,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
secrets:
@ -1342,7 +1342,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-rocm5_5
build_environment: linux-binary-manywheel
@ -1363,7 +1363,7 @@ jobs:
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.10"
steps:
- name: Setup ROCm
@ -1403,7 +1403,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -1420,7 +1420,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-rocm5_5
secrets:
@ -1442,7 +1442,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-rocm5_6
build_environment: linux-binary-manywheel
@ -1463,7 +1463,7 @@ jobs:
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.10"
steps:
- name: Setup ROCm
@ -1503,7 +1503,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -1520,7 +1520,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-rocm5_6
secrets:
@ -1541,7 +1541,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu
build_environment: linux-binary-manywheel
@ -1559,7 +1559,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu
build_environment: linux-binary-manywheel
@ -1577,7 +1577,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu
secrets:
@ -1598,7 +1598,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-cxx11-abi
@ -1617,7 +1617,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-cxx11-abi
@ -1636,7 +1636,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
-DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-cxx11-abi
@ -1659,7 +1659,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
@ -1678,7 +1678,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
@ -1697,7 +1697,7 @@ jobs:
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
secrets:
@ -1719,7 +1719,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -1739,7 +1739,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-with-pypi-cudnn
build_environment: linux-binary-manywheel
@ -1758,7 +1758,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-with-pypi-cudnn
secrets:
@ -1780,7 +1780,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
@ -1799,7 +1799,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
@ -1818,7 +1818,7 @@ jobs:
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
-DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1
+DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
secrets:
@ -1840,7 +1840,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_5
build_environment: linux-binary-manywheel
@ -1861,7 +1861,7 @@ jobs:
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.11"
steps:
- name: Setup ROCm
@ -1901,7 +1901,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.5
+docker-image: pytorch/manylinux-builder:rocm5.5-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -1918,7 +1918,7 @@ jobs:
DESIRED_CUDA: rocm5.5
GPU_ARCH_VERSION: 5.5
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.5-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_5
secrets:
@ -1940,7 +1940,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_6
build_environment: linux-binary-manywheel
@ -1961,7 +1961,7 @@ jobs:
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.11"
steps:
- name: Setup ROCm
@ -2001,7 +2001,7 @@ jobs:
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
-docker-image: pytorch/manylinux-builder:rocm5.6
+docker-image: pytorch/manylinux-builder:rocm5.6-2.1
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
@ -2018,7 +2018,7 @@ jobs:
DESIRED_CUDA: rocm5.6
GPU_ARCH_VERSION: 5.6
GPU_ARCH_TYPE: rocm
-DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6
+DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-2.1
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-rocm5_6
secrets:
8
.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
generated
vendored
@ -135,7 +135,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: conda-py3_8-cpu
use_s3: False
@ -246,7 +246,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: conda-py3_9-cpu
use_s3: False
@ -357,7 +357,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: conda-py3_10-cpu
use_s3: False
@ -468,7 +468,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: conda-py3_11-cpu
use_s3: False
12
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
generated
vendored
@ -48,6 +48,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -135,7 +136,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: wheel-py3_8-cpu
use_s3: False
@ -159,6 +160,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -246,7 +248,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: wheel-py3_9-cpu
use_s3: False
@ -270,6 +272,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -357,7 +360,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: wheel-py3_10-cpu
use_s3: False
@ -381,6 +384,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -468,7 +472,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: wheel-py3_11-cpu
use_s3: False
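The @ -48,6 +48,7 @ style hunks in this wheel workflow (and in its Intel-macOS counterpart later in the diff) each add a single line, the PYTORCH_EXTRA_INSTALL_REQUIREMENTS value, which pins the CUDA 12 runtime wheels and triton==2.1.0 as extra install requirements carried in the wheel metadata. The PEP 508 environment markers restrict every entry to Linux x86_64, so the macOS builds here only embed the metadata. A minimal sketch, assuming the same env: nesting as above and showing just one of the pinned requirements:

env:
  # each entry is gated so it only installs on Linux x86_64
  PYTORCH_EXTRA_INSTALL_REQUIREMENTS: triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'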
8
.github/workflows/generated-macos-binary-conda-nightly.yml
generated
vendored
@ -133,7 +133,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: conda-py3_8-cpu
use_s3: False
@ -244,7 +244,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: conda-py3_9-cpu
use_s3: False
@ -355,7 +355,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: conda-py3_10-cpu
use_s3: False
@ -466,7 +466,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/conda-builder:cpu
+DOCKER_IMAGE: pytorch/conda-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: conda-py3_11-cpu
use_s3: False
8
.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
generated
vendored
@ -137,7 +137,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@ -253,7 +253,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cpu-shared-without-deps-cxx11-abi
@ -369,7 +369,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cpu-static-with-deps-cxx11-abi
@ -485,7 +485,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cpu-static-without-deps-cxx11-abi
12
.github/workflows/generated-macos-binary-wheel-nightly.yml
generated
vendored
@ -46,6 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -133,7 +134,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.8"
build_name: wheel-py3_8-cpu
use_s3: False
@ -157,6 +158,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -244,7 +246,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.9"
build_name: wheel-py3_9-cpu
use_s3: False
@ -268,6 +270,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -355,7 +358,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.10"
build_name: wheel-py3_10-cpu
use_s3: False
@ -379,6 +382,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -466,7 +470,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
-DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+DOCKER_IMAGE: pytorch/manylinux-builder:cpu-2.1
DESIRED_PYTHON: "3.11"
build_name: wheel-py3_11-cpu
use_s3: False
24
.github/workflows/generated-windows-binary-conda-nightly.yml
generated
vendored
@ -63,6 +63,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -173,6 +174,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -300,6 +302,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -411,6 +414,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -539,6 +543,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -650,6 +655,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -777,6 +783,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -887,6 +894,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1014,6 +1022,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1125,6 +1134,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1253,6 +1263,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1364,6 +1375,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1491,6 +1503,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1601,6 +1614,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1728,6 +1742,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1839,6 +1854,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -1967,6 +1983,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2078,6 +2095,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2205,6 +2223,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2315,6 +2334,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2442,6 +2462,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2553,6 +2574,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2681,6 +2703,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
@ -2792,6 +2815,7 @@ jobs:
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
+continue-on-error: true
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

.github/workflows/generated-windows-binary-libtorch-debug-main.yml (generated, vendored): 2 changes
@@ -60,6 +60,7 @@ jobs:
          echo "system info $(uname -a)"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      continue-on-error: true
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

The same hunk repeats at @@ -174,6 +175,7 @@.

.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (generated, vendored): 24 changes
@@ -67,6 +67,7 @@ jobs:
          echo "system info $(uname -a)"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      continue-on-error: true
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

The same hunk repeats at: @@ -181,6 +182,7 @@, @@ -315,6 +317,7 @@, @@ -429,6 +432,7 @@, @@ -563,6 +567,7 @@, @@ -677,6 +682,7 @@, @@ -811,6 +817,7 @@, @@ -925,6 +932,7 @@, @@ -1060,6 +1068,7 @@, @@ -1175,6 +1184,7 @@, @@ -1311,6 +1321,7 @@, @@ -1426,6 +1437,7 @@, @@ -1562,6 +1574,7 @@, @@ -1677,6 +1690,7 @@, @@ -1813,6 +1827,7 @@, @@ -1928,6 +1943,7 @@, @@ -2064,6 +2080,7 @@, @@ -2179,6 +2196,7 @@, @@ -2315,6 +2333,7 @@, @@ -2430,6 +2449,7 @@, @@ -2566,6 +2586,7 @@, @@ -2681,6 +2702,7 @@, @@ -2817,6 +2839,7 @@, and @@ -2932,6 +2955,7 @@.

.github/workflows/generated-windows-binary-libtorch-release-main.yml (generated, vendored): 2 changes
@@ -60,6 +60,7 @@ jobs:
          echo "system info $(uname -a)"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      continue-on-error: true
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

The same hunk repeats at @@ -174,6 +175,7 @@.

.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored): 24 changes
@@ -67,6 +67,7 @@ jobs:
          echo "system info $(uname -a)"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      continue-on-error: true
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

The same hunk repeats at: @@ -181,6 +182,7 @@, @@ -315,6 +317,7 @@, @@ -429,6 +432,7 @@, @@ -563,6 +567,7 @@, @@ -677,6 +682,7 @@, @@ -811,6 +817,7 @@, @@ -925,6 +932,7 @@, @@ -1060,6 +1068,7 @@, @@ -1175,6 +1184,7 @@, @@ -1311,6 +1321,7 @@, @@ -1426,6 +1437,7 @@, @@ -1562,6 +1574,7 @@, @@ -1677,6 +1690,7 @@, @@ -1813,6 +1827,7 @@, @@ -1928,6 +1943,7 @@, @@ -2064,6 +2080,7 @@, @@ -2179,6 +2196,7 @@, @@ -2315,6 +2333,7 @@, @@ -2430,6 +2449,7 @@, @@ -2566,6 +2586,7 @@, @@ -2681,6 +2702,7 @@, @@ -2817,6 +2839,7 @@, and @@ -2932,6 +2955,7 @@.

.github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored): 36 changes
@@ -46,6 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
    - name: Display EC2 information
      shell: bash

The same hunk, with an identical PYTORCH_EXTRA_INSTALL_REQUIREMENTS value, repeats for every other wheel configuration:
@@ -283,6 +286,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.8"),
@@ -522,6 +528,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.8"),
@@ -760,6 +769,7 @@ (GPU_ARCH_TYPE: cpu, DESIRED_PYTHON: "3.9"),
@@ -997,6 +1009,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.9"),
@@ -1236,6 +1251,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.9"),
@@ -1474,6 +1492,7 @@ (GPU_ARCH_TYPE: cpu, DESIRED_PYTHON: "3.10"),
@@ -1711,6 +1732,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.10"),
@@ -1950,6 +1974,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.10"),
@@ -2188,6 +2215,7 @@ (GPU_ARCH_TYPE: cpu, DESIRED_PYTHON: "3.11"),
@@ -2425,6 +2455,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.11"),
@@ -2664,6 +2697,7 @@ (GPU_ARCH_TYPE: cuda, DESIRED_PYTHON: "3.11").

In between, the same "[FB EMPLOYEES] Enable SSH" hunk as in the other generated workflows appears once:

@@ -63,6 +64,7 @@ jobs:
          echo "system info $(uname -a)"
    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
      uses: seemethere/add-github-ssh-key@v1
      continue-on-error: true
      with:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560

and repeats at: @@ -173,6 +175,7 @@, @@ -300,6 +304,7 @@, @@ -411,6 +416,7 @@, @@ -539,6 +546,7 @@, @@ -650,6 +658,7 @@, @@ -777,6 +787,7 @@, @@ -887,6 +898,7 @@, @@ -1014,6 +1027,7 @@, @@ -1125,6 +1139,7 @@, @@ -1253,6 +1269,7 @@, @@ -1364,6 +1381,7 @@, @@ -1491,6 +1510,7 @@, @@ -1601,6 +1621,7 @@, @@ -1728,6 +1750,7 @@, @@ -1839,6 +1862,7 @@, @@ -1967,6 +1992,7 @@, @@ -2078,6 +2104,7 @@, @@ -2205,6 +2233,7 @@, @@ -2315,6 +2344,7 @@, @@ -2442,6 +2473,7 @@, @@ -2553,6 +2585,7 @@, @@ -2681,6 +2715,7 @@, and @@ -2792,6 +2827,7 @@.
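
Each pin in the PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above carries a standard PEP 508 environment marker, so the nvidia-* and triton dependencies only apply when the wheel is installed on x86-64 Linux, never on the Windows machines these jobs build on. A minimal sketch of how such a marker evaluates, using the third-party `packaging` library (assumed to be installed; the marker string is copied from the pins above):

from packaging.markers import Marker

# The marker shared by every pin in PYTORCH_EXTRA_INSTALL_REQUIREMENTS above.
marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# Evaluated against the running interpreter's environment by default.
print(marker.evaluate())

# Evaluated against an explicit environment: on a Windows install the marker
# is False, so pip skips the nvidia-*/triton dependencies entirely.
print(marker.evaluate({"platform_system": "Windows", "platform_machine": "AMD64"}))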

.github/workflows/lint.yml (vendored): 5 changes
@@ -21,6 +21,7 @@ jobs:
      docker-image: pytorch-linux-focal-linter
      fetch-depth: 0
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      test-infra-ref: 'release/2.1'
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")

The same hunk repeats at @@ -68,6 +69,7 @@, @@ -122,6 +124,7 @@, @@ -157,6 +160,7 @@, and @@ -195,6 +199,7 @@.
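
All five jobs resolve the conda environment with the jq one-liner shown in the hunk context. A rough Python equivalent of that selection, as a sketch only (it assumes `conda` is available on PATH inside the job container):

import json
import subprocess

# Equivalent of: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
out = subprocess.run(
    ["conda", "env", "list", "--json"],
    check=True, capture_output=True, text=True,
).stdout
conda_env = json.loads(out)["envs"][-1]  # last environment path in the list
print(conda_env)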

.github/workflows/periodic.yml (vendored): 55 changes
@@ -112,30 +112,38 @@ jobs:
     cuda-version: "11.8"
     test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }}

-  ios-12-5-1-x86-64-coreml:
-    name: ios-12-5-1-x86-64-coreml
+  # TODO: Figure out how to migrate this job to M1 runner
+  ios-build-test:
+    name: ios-build-test
     if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6'
     uses: ./.github/workflows/_ios-build-test.yml
     with:
-      build-environment: ios-12-5-1-x86-64-coreml
-      ios-platform: SIMULATOR
-      ios-arch: x86_64
+      build-environment: ios-build-test
+      sync-tag: ios-build-test
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "macos-12" },
-        ]}
-
-  ios-12-5-1-arm64-custom-ops:
-    name: ios-12-5-1-arm64-custom-ops
-    if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6'
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-custom-ops
-      ios-platform: OS
-      ios-arch: arm64
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "macos-12" },
+          { config: "default",
+            shard: 1,
+            num_shards: 1,
+            runner: "macos-12",
+            ios_platform: "SIMULATOR",
+            ios_arch: "x86_64",
+            use_lite_interpreter: 1,
+            use_metal: 0,
+            use_coreml: 1,
+            use_custom_op_list: ""
+          },
+          { config: "default",
+            shard: 1,
+            num_shards: 1,
+            runner: "macos-12",
+            ios_platform: "OS",
+            ios_arch: "arm64",
+            use_lite_interpreter: 1,
+            use_metal: 1,
+            use_coreml: 1,
+            use_custom_op_list: "mobilenetv2.yaml"
+          }
         ]}

   buck-build-test:

@@ -180,7 +188,14 @@ jobs:
     with:
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "ubuntu-20.04-16x" },
+          { config: "default",
+            shard: 1,
+            num_shards: 1,
+            runner: "ubuntu-20.04-16x"
+            use_lite_interpreter: 1,
+            # Just set x86 for testing here
+            support_abi: x86,
+          },
         ]}

   linux-vulkan-focal-py3_11-clang10-build:
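
The test-matrix blocks above are flow-style mappings passed to the reusable workflows as literal strings. A quick local sanity check of one entry, as a sketch (it assumes PyYAML is installed; YAML parsing is used here because the unquoted keys are not strict JSON):

import yaml  # PyYAML; flow-style YAML accepts the unquoted keys used above

matrix = yaml.safe_load("""
{ include: [
    { config: "default", shard: 1, num_shards: 1, runner: "macos-12",
      ios_platform: "SIMULATOR", ios_arch: "x86_64",
      use_lite_interpreter: 1, use_metal: 0, use_coreml: 1,
      use_custom_op_list: "" },
] }
""")
for entry in matrix["include"]:
    print(entry["runner"], entry["ios_platform"], entry["ios_arch"])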

.gitignore (vendored): 4 changes
@@ -364,3 +364,7 @@ venv/
# Log files
*.log
sweep/

# Android build artifacts
android/pytorch_android/.cxx
android/pytorch_android_torchvision/.cxx
@@ -73,8 +73,8 @@ ARG TARGETPLATFORM

 # On arm64 we can only install wheel packages.
 RUN case ${TARGETPLATFORM} in \
-         "linux/arm64") pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchaudio torchtext ;; \
-         *) /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch torchvision torchaudio torchtext "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+         "linux/arm64") pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchaudio ;; \
+         *) /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
 esac && \
     /opt/conda/bin/conda clean -ya
 RUN /opt/conda/bin/pip install torchelastic
@@ -1,6 +1,6 @@
 ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64

-VERSION_NAME=2.1.0-SNAPSHOT
+VERSION_NAME=2.1.0
 GROUP=org.pytorch
 MAVEN_GROUP=org.pytorch
 SONATYPE_STAGING_PROFILE=orgpytorch
@@ -41,6 +41,7 @@ android {
        println 'Build pytorch_jni'
        exclude 'org/pytorch/LiteModuleLoader.java'
        exclude 'org/pytorch/LiteNativePeer.java'
        exclude 'org/pytorch/LitePyTorchAndroid.java'
      } else {
        println 'Build pytorch_jni_lite'
      }
@@ -17,6 +17,7 @@ sourceSets {
    java {
      srcDir '../src/main/java'
      exclude 'org/pytorch/PyTorchAndroid.java'
      exclude 'org/pytorch/LitePyTorchAndroid.java'
      exclude 'org/pytorch/LiteModuleLoader.java'
      exclude 'org/pytorch/LiteNativePeer.java'
    }
@@ -10,6 +10,7 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.junit.Test;
import org.junit.Ignore;

public abstract class PytorchTestBase {
  private static final String TEST_MODULE_ASSET_NAME = "android_api_module.ptl";

@@ -413,7 +414,10 @@ public abstract class PytorchTestBase {
  }

  @Test
  @Ignore
  public void testSpectralOps() throws IOException {
    // NB: This model fails without lite interpreter. The error is as follows:
    // RuntimeError: stft requires the return_complex parameter be given for real inputs
    runModel("spectral_ops");
  }
@ -10,12 +10,6 @@
#include <fbjni/fbjni.h>

#include "pytorch_jni_common.h"
#if defined(__ANDROID__)
#ifndef USE_PTHREADPOOL
#define USE_PTHREADPOOL
#endif /* USE_PTHREADPOOL */
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif

namespace pytorch_jni {

@ -666,32 +660,4 @@ at::IValue JIValue::JIValueToAtIValue(
      typeCode);
}

#if defined(__ANDROID__)
class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> {
 public:
  constexpr static auto kJavaDescriptor = "Lorg/pytorch/PyTorchAndroid;";

  static void registerNatives() {
    javaClassStatic()->registerNatives({
        makeNativeMethod(
            "nativeSetNumThreads", PyTorchAndroidJni::setNumThreads),
    });
  }

  static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) {
    caffe2::pthreadpool()->set_thread_count(numThreads);
  }
};
#endif

void common_registerNatives() {
  static const int once = []() {
#if defined(__ANDROID__)
    pytorch_jni::PyTorchAndroidJni::registerNatives();
#endif
    return 0;
  }();
  ((void)once);
}

} // namespace pytorch_jni

@ -17,6 +17,11 @@
#include <android/asset_manager.h>
#include <android/asset_manager_jni.h>
#include <android/log.h>

#ifndef USE_PTHREADPOOL
#define USE_PTHREADPOOL
#endif /* USE_PTHREADPOOL */
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif

namespace pytorch_jni {

@ -235,6 +240,34 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
  }
};

#if defined(__ANDROID__)
class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> {
 public:
  constexpr static auto kJavaDescriptor = "Lorg/pytorch/PyTorchAndroid;";

  static void registerNatives() {
    javaClassStatic()->registerNatives({
        makeNativeMethod(
            "nativeSetNumThreads", PyTorchAndroidJni::setNumThreads),
    });
  }

  static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) {
    caffe2::pthreadpool()->set_thread_count(numThreads);
  }
};
#endif

void common_registerNatives() {
  static const int once = []() {
#if defined(__ANDROID__)
    pytorch_jni::PyTorchAndroidJni::registerNatives();
#endif
    return 0;
  }();
  ((void)once);
}

} // namespace pytorch_jni

JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) {

@ -18,6 +18,11 @@
#include <android/asset_manager.h>
#include <android/asset_manager_jni.h>
#include <android/log.h>

#ifndef USE_PTHREADPOOL
#define USE_PTHREADPOOL
#endif /* USE_PTHREADPOOL */
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif

namespace pytorch_jni {

@ -199,6 +204,34 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
  }
};

#if defined(__ANDROID__)
class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> {
 public:
  constexpr static auto kJavaDescriptor = "Lorg/pytorch/LitePyTorchAndroid;";

  static void registerNatives() {
    javaClassStatic()->registerNatives({
        makeNativeMethod(
            "nativeSetNumThreads", PyTorchAndroidJni::setNumThreads),
    });
  }

  static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) {
    caffe2::pthreadpool()->set_thread_count(numThreads);
  }
};
#endif

void common_registerNatives() {
  static const int once = []() {
#if defined(__ANDROID__)
    pytorch_jni::PyTorchAndroidJni::registerNatives();
#endif
    return 0;
  }();
  ((void)once);
}

} // namespace pytorch_jni

JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) {

@ -0,0 +1,50 @@
package org.pytorch;

import android.content.res.AssetManager;
import com.facebook.jni.annotations.DoNotStrip;
import com.facebook.soloader.nativeloader.NativeLoader;
import com.facebook.soloader.nativeloader.SystemDelegate;

public final class LitePyTorchAndroid {
  static {
    if (!NativeLoader.isInitialized()) {
      NativeLoader.init(new SystemDelegate());
    }
    NativeLoader.loadLibrary("pytorch_jni_lite");
    PyTorchCodegenLoader.loadNativeLibs();
  }

  /**
   * Attention: This is not recommended way of loading production modules, as prepackaged assets
   * increase apk size etc. For production usage consider using loading from file on the disk {@link
   * org.pytorch.Module#load(String)}.
   *
   * <p>This method is meant to use in tests and demos.
   */
  public static Module loadModuleFromAsset(
      final AssetManager assetManager, final String assetName, final Device device) {
    return new Module(new LiteNativePeer(assetName, assetManager, device));
  }

  public static Module loadModuleFromAsset(
      final AssetManager assetManager, final String assetName) {
    return new Module(new LiteNativePeer(assetName, assetManager, Device.CPU));
  }

  /**
   * Globally sets the number of threads used on native side. Attention: Has global effect, all
   * modules use one thread pool with specified number of threads.
   *
   * @param numThreads number of threads, must be positive number.
   */
  public static void setNumThreads(int numThreads) {
    if (numThreads < 1) {
      throw new IllegalArgumentException("Number of threads cannot be less than 1");
    }

    nativeSetNumThreads(numThreads);
  }

  @DoNotStrip
  private static native void nativeSetNumThreads(int numThreads);
}

@ -10,7 +10,7 @@ public final class PyTorchAndroid {
    if (!NativeLoader.isInitialized()) {
      NativeLoader.init(new SystemDelegate());
    }
    NativeLoader.loadLibrary("pytorch_jni_lite");
    NativeLoader.loadLibrary("pytorch_jni");
    PyTorchCodegenLoader.loadNativeLibs();
  }

@ -41,6 +41,11 @@ android {
    buildConfigField("long[]", "INPUT_TENSOR_SHAPE", "new long[]{1, 3, 224, 224}")
    buildConfigField("boolean", "NATIVE_BUILD", 'false')
    buildConfigField("boolean", "USE_VULKAN_DEVICE", 'false')
    buildConfigField(
        "int",
        "BUILD_LITE_INTERPRETER",
        System.env.BUILD_LITE_INTERPRETER != null ? System.env.BUILD_LITE_INTERPRETER : "1"
    )
    addManifestPlaceholders([APP_NAME: "@string/app_name", MAIN_ACTIVITY: "org.pytorch.testapp.MainActivity"])
  }
  buildTypes {
@ -63,14 +68,15 @@
    mnet {
      dimension "model"
      applicationIdSuffix ".mnet"
      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"")
      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet_v2.ptl\"")
      addManifestPlaceholders([APP_NAME: "MNET"])
      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"")
    }
    // NB: This is not working atm https://github.com/pytorch/pytorch/issues/102966
    mnetVulkan {
      dimension "model"
      applicationIdSuffix ".mnet_vulkan"
      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"")
      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet_v2_vulkan.ptl\"")
      buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true')
      addManifestPlaceholders([APP_NAME: "MNET_VULKAN"])
      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"")
@ -78,7 +84,7 @@
    resnet18 {
      dimension "model"
      applicationIdSuffix ".resnet18"
      buildConfigField("String", "MODULE_ASSET_NAME", "\"resnet18.pt\"")
      buildConfigField("String", "MODULE_ASSET_NAME", "\"resnet18.ptl\"")
      addManifestPlaceholders([APP_NAME: "RN18"])
      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-resnet18\"")
    }
@ -149,8 +155,8 @@ dependencies {
    //nativeBuildImplementation(name: 'pytorch_android_torchvision-release', ext: 'aar')
    //extractForNativeBuild(name: 'pytorch_android-release', ext: 'aar')

    nightlyImplementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT'
    nightlyImplementation 'org.pytorch:pytorch_android_torchvision:1.12.0-SNAPSHOT'
    nightlyImplementation 'org.pytorch:pytorch_android:2.2.0-SNAPSHOT'
    nightlyImplementation 'org.pytorch:pytorch_android_torchvision:2.2.0-SNAPSHOT'

    aarImplementation(name:'pytorch_android', ext:'aar')
    aarImplementation(name:'pytorch_android_torchvision', ext:'aar')

@ -1,6 +1,7 @@
package org.pytorch.testapp;

import android.content.Context;
import android.content.res.AssetManager;
import android.os.Bundle;
import android.os.Handler;
import android.os.HandlerThread;
@ -16,6 +17,8 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.FloatBuffer;
import org.pytorch.Device;
import org.pytorch.IValue;
@ -42,7 +45,13 @@ public class MainActivity extends AppCompatActivity {
        new Runnable() {
          @Override
          public void run() {
            final Result result = doModuleForward();
            final Result result;
            try {
              result = doModuleForward();
            } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException |
                InvocationTargetException e) {
              throw new RuntimeException(e);
            }
            runOnUiThread(
                new Runnable() {
                  @Override
@ -118,7 +127,7 @@ public class MainActivity extends AppCompatActivity {

  @WorkerThread
  @Nullable
  protected Result doModuleForward() {
  protected Result doModuleForward() throws ClassNotFoundException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
    if (mModule == null) {
      final long[] shape = BuildConfig.INPUT_TENSOR_SHAPE;
      long numElements = 1;
@ -129,12 +138,29 @@ public class MainActivity extends AppCompatActivity {
      mInputTensor =
          Tensor.fromBlob(
              mInputTensorBuffer, BuildConfig.INPUT_TENSOR_SHAPE, MemoryFormat.CHANNELS_LAST);
      PyTorchAndroid.setNumThreads(1);
      mModule =
          BuildConfig.USE_VULKAN_DEVICE
              ? PyTorchAndroid.loadModuleFromAsset(
                  getAssets(), BuildConfig.MODULE_ASSET_NAME, Device.VULKAN)
              : PyTorchAndroid.loadModuleFromAsset(getAssets(), BuildConfig.MODULE_ASSET_NAME);

      Class ptAndroid;
      if (BuildConfig.BUILD_LITE_INTERPRETER == 1) {
        ptAndroid = Class.forName("org.pytorch.LitePyTorchAndroid");
      } else {
        ptAndroid = Class.forName("org.pytorch.PyTorchAndroid");
      }

      Method setNumThreads = ptAndroid.getMethod("setNumThreads", int.class);
      setNumThreads.invoke(null, 1);

      Method loadModuleFromAsset = ptAndroid.getMethod(
          "loadModuleFromAsset",
          AssetManager.class,
          String.class,
          Device.class
      );
      mModule = (Module) (BuildConfig.USE_VULKAN_DEVICE
          ? loadModuleFromAsset.invoke(
              null, getAssets(), BuildConfig.MODULE_ASSET_NAME, Device.VULKAN)
          : loadModuleFromAsset.invoke(
              null, getAssets(), BuildConfig.MODULE_ASSET_NAME, Device.CPU));
    }

    final long startTime = SystemClock.elapsedRealtime();

@ -125,6 +125,7 @@ file(GLOB native_ao_sparse_h
  "native/ao_sparse/quantized/cpu/*.h")
file(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h" "native/quantized/cudnn/*.h")
file(GLOB native_cpu_h "native/cpu/*.h")
file(GLOB native_utils_h "native/utils/*.h")

file(GLOB native_cuda_cu "native/cuda/*.cu")
file(GLOB native_cuda_cpp "native/cuda/*.cpp")
@ -540,7 +541,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"

set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${miopen_h})
  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
  # Metal
  if(USE_PYTORCH_METAL_EXPORT)
    # Add files needed from exporting metal models(optimized_for_mobile)

@ -371,6 +371,22 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3( \
          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))

#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4( \
    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \
  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)

#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND4( \
    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH( \
      TYPE, \
      NAME, \
      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4( \
          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))

#define AT_DISPATCH_CASE_INTEGRAL_TYPES(...) \
  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \

@ -5,9 +5,13 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

#include <chrono>
#include <thread>

namespace at::cuda {

static bool _cuda_graphs_debug = false;
constexpr int kSynchronizeBusyWaitMillis = 10;

MempoolId_t graph_pool_handle() {
#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
@ -55,6 +59,25 @@ CaptureId_t capture_sequence_id() {
 * describes memory management for captures.
 */

std::atomic<int> CUDAGraph::pending_event_queries = 0;

// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they
// can be resolved before the capture begins. Note that event queries are not allowed during a
// graph capture in the default capture mode.
void CUDAGraph::inc_pending_event_queries() {
  pending_event_queries++;
}

void CUDAGraph::dec_pending_event_queries() {
  TORCH_INTERNAL_ASSERT(pending_event_queries > 0,
    "Attempted to decrement the number of outstanding events to be queried, but it was <= 0.");
  pending_event_queries--;
}

int CUDAGraph::num_pending_event_queries() {
  return pending_event_queries;
}

CUDAGraph::CUDAGraph()
  // CUDAStreams may not be default-constructed.
  : capture_stream_(at::cuda::getCurrentCUDAStream()) {
@ -115,6 +138,15 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
  // due to the capture status being updated _after_ a capture had already started.
  c10::cuda::CUDACachingAllocator::beginAllocateStreamToPool(capture_dev_, capture_stream_, mempool_id_);

  // At this point, any NCCL watchdogs should be aware that we are in capture mode
  // and therefore should not enqueue any additional work that could be event-queried.
  // We still must wait on any existing work that has not been cleaned up.
  while (num_pending_event_queries()) {
    TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture.");
    std::this_thread::sleep_for(
        std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
  }

  // cudaStreamCaptureModeGlobal is the most conservative option to
  // prevent potentially unsafe CUDA API calls during capture. See
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85

@ -5,6 +5,8 @@
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>

#include <mutex>

namespace at {

struct CUDAGeneratorImpl;
@ -19,6 +21,9 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
  CUDAGraph();
  ~CUDAGraph();

  static void inc_pending_event_queries();
  static void dec_pending_event_queries();
  static int num_pending_event_queries();
  void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
  void capture_end();
  void replay();
@ -33,6 +38,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
  cudaGraphExec_t graph_exec_ = NULL;
#endif

  static std::atomic<int> pending_event_queries;

  // internal states so reset() can do its best cleaning up
  // Set to true in capture_end if cudaStreamEndCapture succeeded
  // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate

@ -161,8 +161,10 @@ CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *);
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t);
CUDA_STUB2(cuGetErrorString, CUresult, const char **);
CUDA_STUB1(cuCtxGetCurrent, CUcontext *);
CUDA_STUB1(cuCtxSetCurrent, CUcontext);
CUDA_STUB1(cuModuleUnload, CUmodule);
CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *);
CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice);
CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *);
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);

@ -51,8 +51,10 @@ namespace at { namespace cuda {
  _(cuLaunchKernel) \
  _(cuLaunchCooperativeKernel) \
  _(cuCtxGetCurrent) \
  _(cuCtxSetCurrent) \
  _(cuModuleUnload) \
  _(cuDevicePrimaryCtxGetState) \
  _(cuDevicePrimaryCtxRetain) \
  _(cuLinkCreate) \
  _(cuLinkAddData) \
  _(cuLinkComplete) \

@ -93,10 +93,17 @@ MPSDevice::MPSDevice() : _mtl_device(nil), _mtl_indexing_library(nil) {
  NSArray* devices = [MTLCopyAllDevices() autorelease];
  for (unsigned long i = 0; i < [devices count]; i++) {
    id<MTLDevice> device = devices[i];
    if (![device isLowPower]) { // exclude Intel GPUs
      _mtl_device = [device retain];
      break;
    if ([device isLowPower]) { // exclude Intel GPUs
      continue;
    }
    if (![device supportsFamily:MTLGPUFamilyMac2]) {
      // Exclude devices that does not support Metal 2.0
      // Virtualised MPS device on MacOS 12.6 should fail this check
      TORCH_WARN("Skipping device ", [[device name] UTF8String], " that does not support Metal 2.0");
      continue;
    }
    _mtl_device = [device retain];
    break;
  }
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_mtl_device);
}

@ -1393,7 +1393,7 @@ Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& othe

template <typename OutImpl>
Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
  Tensor result = at::empty(self.sizes(), self.options().dtype(kBool).device(self.device()));
  Tensor result = at::empty({0}, self.options().dtype(kBool));
  return out_impl(result, self, other);
}

@ -253,7 +253,9 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
      self.storage_offset() == src.storage_offset() &&
      self.strides().equals(src.strides()) &&
      self.sizes().equals(src.sizes()) &&
      self.scalar_type() == src.scalar_type()
      self.scalar_type() == src.scalar_type() &&
      self.is_conj() == src.is_conj() &&
      self.is_neg() == src.is_neg()
    );
  if (is_same_data) {
    return self;

@ -2075,7 +2075,27 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
      && self.layout() == other.layout()
      && self.is_neg() == other.is_neg()
      && self.is_conj() == other.is_conj()) {
    return true;
    if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
      return true;
    }
    std::atomic<bool> result{true};
    auto iter = TensorIteratorConfig().add_input(self).build();
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] {
      iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) {
        if (!result) {
          return;
        }
        char* self_data = data[0];
        for (C10_UNUSED const auto i : c10::irange(dim_size)) {
          if (isnan_(c10::load<scalar_t>(self_data))) {
            result = false;
            return;
          }
          self_data += strides[0];
        }
      });
    });
    return result.load();
  }

  std::atomic<bool> result{true};

@ -10,6 +10,7 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/cuda/CuFFTUtils.h>
#include <ATen/native/cuda/CuFFTPlanCache.h>
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -27,7 +28,6 @@
#include <cufftXt.h>

#include <cmath>
#include <vector>

namespace at::native {
@ -304,6 +304,17 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
  CUFFT_CHECK(cufftSetWorkArea(plan, workspace.mutable_data_ptr()));

  // execute transform plan
#if !defined(USE_ROCM)
  CUcontext pctx = nullptr;
  at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx);
  if (C10_UNLIKELY(!pctx)) {
    // workaround for corner case where a primary context exists but is not
    // the current context
    TORCH_WARN_ONCE("Attempting to run cuFFT, but there was no current CUDA context! Attempting to set the primary context...");
    at::globalContext().getNVRTC().cuDevicePrimaryCtxRetain(&pctx, 0);
    at::globalContext().getNVRTC().cuCtxSetCurrent(pctx);
  }
#endif /* !defined(USE_ROCM) */
  exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward);

  // Inplace reshaping to original batch shape and inverting the dimension permutation

@ -100,7 +100,7 @@ static at::Tensor& copy_from_mps_(at::Tensor& dst_, const at::Tensor& src_, bool
  MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
  NSUInteger alignedLength = 0;

  const void* host_dst = dst.storage().data();
  const void* host_dst = static_cast<const char*>(dst.storage().data()) + dst.storage_offset() * dst.itemsize();
  void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)dst_tensor_nbytes, &alignedLength);
  NSUInteger destOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
  // 4 bytes alignment required on macos for blits.

@ -622,8 +622,13 @@ at::Tensor preprocess_mask(
    return pad_bias<mem_eff_alignment>(attn_mask);
  }
  // Check and make the tensor contiguous if needed
  if (attn_mask.sym_stride(0) % 16 != 0 || attn_mask.sym_stride(1) % 16 != 0 ||
      attn_mask.sym_stride(2) % 16 != 0) {
  auto needs_contig = [](const c10::SymInt& stride) {
    return (stride % 16 != 0) || (stride == 0);
  };
  if (needs_contig(attn_mask.sym_stride(0)) ||
      needs_contig(attn_mask.sym_stride(1)) ||
      needs_contig(attn_mask.sym_stride(2)) ||
      needs_contig(attn_mask.sym_stride(3))) {
    return attn_mask.contiguous();
  }

@ -298,9 +298,11 @@ struct AttentionKernel {
    // 15/16th of tensor core compute In that case :
    // - we only launch kernels for head_id % kQueriesPerBlock == 0
    // - we iterate over heads instead of queries (strideM = strideH)
    if (num_queries == 1 && k_strideH == 0 && v_strideH == 0) {
      if (head_id % kQueriesPerBlock != 0)
    if (num_queries == 1 && k_strideH == 0 && v_strideH == 0 &&
        logsumexp_ptr == nullptr) {
      if (head_id % kQueriesPerBlock != 0) {
        return false;
      }
      q_strideM = q_strideH;
      num_queries = num_heads;
      num_heads = 1; // unused but here for intent

@ -10,7 +10,7 @@ namespace {
DriverAPI create_driver_api() {
#define OPEN_LIBRARIES(name, n)               \
  void* handle_##n = dlopen(name, RTLD_LAZY); \
  TORCH_INTERNAL_ASSERT(handle_##n);
  TORCH_INTERNAL_ASSERT(handle_##n, "Can't open ", #name, ": ", dlerror());

  C10_FORALL_DRIVER_LIBRARIES(OPEN_LIBRARIES)
#undef OPEN_LIBRARIES
@ -18,7 +18,7 @@ DriverAPI create_driver_api() {

#define LOOKUP_ENTRY(name, n)                              \
  r.name##_ = ((decltype(&name))dlsym(handle_##n, #name)); \
  TORCH_INTERNAL_ASSERT(r.name##_)
  TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
  C10_FORALL_DRIVER_API(LOOKUP_ENTRY)
#undef LOOKUP_ENTRY
  return r;

@ -19,7 +19,7 @@
  } while (0)

#define C10_FORALL_DRIVER_LIBRARIES(_) \
  _("libcuda.so", 0)                   \
  _("libcuda.so.1", 0)                 \
  _("libnvidia-ml.so.1", 1)

#define C10_FORALL_DRIVER_API(_) \

@ -53,9 +53,9 @@ set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE)
set(PKG_CONFIG_EXECUTABLE pkg-config CACHE FILEPATH "" FORCE)

# Setup iOS platform unless specified manually with IOS_PLATFORM
if(NOT DEFINED IOS_PLATFORM)
if(NOT IOS_PLATFORM)
  set(IOS_PLATFORM "OS")
endif(NOT DEFINED IOS_PLATFORM)
endif(NOT IOS_PLATFORM)
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")

# Check the platform selection and setup for developer root
@ -118,9 +118,9 @@ set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
if(NOT CMAKE_INSTALL_NAME_TOOL)
  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
endif(NOT CMAKE_INSTALL_NAME_TOOL)

# Setup iOS deployment target
set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
@ -130,17 +130,17 @@ set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS ver
exec_program(/usr/bin/xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR)
set(XCODE_POST_43_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set(XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
if(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
if(NOT CMAKE_IOS_DEVELOPER_ROOT)
  if(EXISTS ${XCODE_POST_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
  elseif(EXISTS ${XCODE_PRE_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
  endif(EXISTS ${XCODE_POST_43_ROOT})
endif(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
endif(NOT CMAKE_IOS_DEVELOPER_ROOT)
set(CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")

# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
if(NOT CMAKE_IOS_SDK_ROOT)
  file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
  if(_CMAKE_IOS_SDKS)
    list(SORT _CMAKE_IOS_SDKS)
@ -150,7 +150,7 @@ if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
    message(FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
  endif(_CMAKE_IOS_SDKS)
  message(STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif(NOT DEFINED CMAKE_IOS_SDK_ROOT)
endif(NOT CMAKE_IOS_SDK_ROOT)
set(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")

# Set the sysroot default to the most recent SDK

@ -8,7 +8,7 @@ $(warning WARNING: No docker user found using results from whoami)
DOCKER_ORG = $(shell whoami)
endif

CUDA_VERSION = 11.8.0
CUDA_VERSION = 12.1.1
CUDNN_VERSION = 8
BASE_RUNTIME = ubuntu:20.04
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu20.04

@ -18,8 +18,8 @@ figures:
	@$(PYCMD) source/scripts/build_quantization_configs.py

onnx:
	@$(PYCMD) source/scripts/onnx/build_onnx_supported_aten_op_csv_table.py
	@$(PYCMD) source/scripts/onnx/build_onnx_diagnostics_rules_md.py $(SOURCEDIR)/generated/onnx_diagnostics_rules
	@$(PYCMD) source/scripts/onnx/build_onnx_torchscript_supported_aten_op_csv_table.py
	@$(PYCMD) source/scripts/onnx/build_onnx_dynamo_diagnostics_rules_md.py $(SOURCEDIR)/generated/onnx_dynamo_diagnostics_rules

opset:
	@$(PYCMD) source/scripts/build_opsets.py

BIN docs/source/_static/img/onnx/onnx_dynamo_mlp_model.png (new binary file, 36 KiB; not shown)
BIN (new binary image file, 11 KiB; not shown)
BIN (new binary image file, 5.7 KiB; not shown)

@ -179,6 +179,7 @@ Tensor autograd functions
   torch.Tensor.detach
   torch.Tensor.detach_
   torch.Tensor.register_hook
   torch.Tensor.register_post_accumulate_grad_hook
   torch.Tensor.retain_grad

:hidden:`Function`

@ -1,10 +1,551 @@
.. _torch.export:

torch.export
=====================

.. TODO: Add torch.export() tutorial here.

.. warning::
    This feature is a prototype and may have compatibility breaking changes in the future.
    This feature is a prototype under active development and there WILL BE
    BREAKING CHANGES in the future.


Overview
--------

:func:`torch.export.export` takes an arbitrary Python callable (a
:class:`torch.nn.Module`, a function or a method) and produces a traced graph
representing only the Tensor computation of the function in an Ahead-of-Time
(AOT) fashion, which can subsequently be executed with different inputs or
serialized.

::

    import torch
    from torch.export import export

    def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        a = torch.sin(x)
        b = torch.cos(y)
        return a + b

    example_args = (torch.randn(10, 10), torch.randn(10, 10))

    exported_program: torch.export.ExportedProgram = export(
        f, args=example_args
    )
    print(exported_program)

.. code-block::

    ExportedProgram:
        class GraphModule(torch.nn.Module):
            def forward(self, arg0_1: f32[10, 10], arg1_1: f32[10, 10]):
                # code: a = torch.sin(x)
                sin: f32[10, 10] = torch.ops.aten.sin.default(arg0_1);

                # code: b = torch.cos(y)
                cos: f32[10, 10] = torch.ops.aten.cos.default(arg1_1);

                # code: return a + b
                add: f32[10, 10] = torch.ops.aten.add.Tensor(sin, cos);
                return (add,)

    Graph signature: ExportGraphSignature(
        parameters=[],
        buffers=[],
        user_inputs=['arg0_1', 'arg1_1'],
        user_outputs=['add'],
        inputs_to_parameters={},
        inputs_to_buffers={},
        buffers_to_mutate={},
        backward_signature=None,
        assertion_dep_token=None,
    )
    Range constraints: {}
    Equality constraints: []

``torch.export`` produces a clean intermediate representation (IR) with the
following invariants. More specifications about the IR can be found here (coming
soon!).

* **Soundness**: It is guaranteed to be a sound representation of the original
  program, and maintains the same calling conventions of the original program.

* **Normalized**: There are no Python semantics within the graph. Submodules
  from the original programs are inlined to form one fully flattened
  computational graph.

* **Defined Operator Set**: The graph produced contains only a small defined
  :ref:`Core ATen IR <torch.compiler_ir>` opset and registered custom
  operators.

* **Graph properties**: The graph is purely functional, meaning it does not
  contain operations with side effects such as mutations or aliasing. It does
  not mutate any intermediate values, parameters, or buffers.

* **Metadata**: The graph contains metadata captured during tracing, such as a
  stacktrace from user's code.

Under the hood, ``torch.export`` leverages the following latest technologies:

* **TorchDynamo (torch._dynamo)** is an internal API that uses a CPython feature
  called the Frame Evaluation API to safely trace PyTorch graphs. This
  provides a massively improved graph capturing experience, with much fewer
  rewrites needed in order to fully trace the PyTorch code.

* **AOT Autograd** provides a functionalized PyTorch graph and ensures the graph
  is decomposed/lowered to the small defined Core ATen operator set.

* **Torch FX (torch.fx)** is the underlying representation of the graph,
  allowing flexible Python-based transformations.


Existing frameworks
^^^^^^^^^^^^^^^^^^^

:func:`torch.compile` also utilizes the same PT2 stack as ``torch.export``, but
is slightly different:

* **JIT vs. AOT**: :func:`torch.compile` is a JIT compiler and is not intended
  to be used to produce compiled artifacts outside of deployment;
  ``torch.export``, by contrast, produces a standalone artifact ahead of time.

* **Partial vs. Full Graph Capture**: When :func:`torch.compile` runs into an
  untraceable part of a model, it will "graph break" and fall back to running
  the program in the eager Python runtime. In comparison, ``torch.export`` aims
  to get a full graph representation of a PyTorch model, so it will error out
  when something untraceable is reached. Since ``torch.export`` produces a full
  graph disjoint from any Python features or runtime, this graph can then be
  saved, loaded, and run in different environments and languages.

* **Usability tradeoff**: Since :func:`torch.compile` is able to fallback to the
  Python runtime whenever it reaches something untraceable, it is a lot more
  flexible. ``torch.export`` will instead require users to provide more
  information or rewrite their code to make it traceable.

Compared to :func:`torch.fx.symbolic_trace`, ``torch.export`` traces using
TorchDynamo which operates at the Python bytecode level, giving it the ability
to trace arbitrary Python constructs not limited by what Python operator
overloading supports. Additionally, ``torch.export`` keeps fine-grained track of
tensor metadata, so that conditionals on things like tensor shapes do not
fail tracing. In general, ``torch.export`` is expected to work on more user
programs, and produce lower-level graphs (at the ``torch.ops.aten`` operator
level). Note that users can still use :func:`torch.fx.symbolic_trace` as a
preprocessing step before ``torch.export``.

Compared to :func:`torch.jit.script`, ``torch.export`` does not capture Python
control flow or data structures, but it supports more Python language features
than TorchScript (as it is easier to have comprehensive coverage over Python
bytecodes). The resulting graphs are simpler and only have straight line control
flow (except for explicit control flow operators).

Compared to :func:`torch.jit.trace`, ``torch.export`` is sound: it is able to
trace code that performs integer computation on sizes and records all of the
side-conditions necessary to show that a particular trace is valid for other
inputs.


Exporting a PyTorch Model
-------------------------

An Example
^^^^^^^^^^

The main entrypoint is through :func:`torch.export.export`, which takes a
callable (:class:`torch.nn.Module`, function, or method) and sample inputs, and
captures the computation graph into an :class:`torch.export.ExportedProgram`. An
example:

::

    import torch
    from torch.export import export

    # Simple module for demonstration
    class M(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.conv = torch.nn.Conv2d(
                in_channels=3, out_channels=16, kernel_size=3, padding=1
            )
            self.relu = torch.nn.ReLU()
            self.maxpool = torch.nn.MaxPool2d(kernel_size=3)

        def forward(self, x: torch.Tensor, *, constant=None) -> torch.Tensor:
            a = self.conv(x)
            a.add_(constant)
            return self.maxpool(self.relu(a))

    example_args = (torch.randn(1, 3, 256, 256),)
    example_kwargs = {"constant": torch.ones(1, 16, 256, 256)}

    exported_program: torch.export.ExportedProgram = export(
        M(), args=example_args, kwargs=example_kwargs
    )
    print(exported_program)

.. code-block::

    ExportedProgram:
        class GraphModule(torch.nn.Module):
            def forward(self, arg0_1: f32[16, 3, 3, 3], arg1_1: f32[16], arg2_1: f32[1, 3, 256, 256], arg3_1: f32[1, 16, 256, 256]):

                # code: a = self.conv(x)
                convolution: f32[1, 16, 256, 256] = torch.ops.aten.convolution.default(
                    arg2_1, arg0_1, arg1_1, [1, 1], [1, 1], [1, 1], False, [0, 0], 1
                );

                # code: a.add_(constant)
                add: f32[1, 16, 256, 256] = torch.ops.aten.add.Tensor(convolution, arg3_1);

                # code: return self.maxpool(self.relu(a))
                relu: f32[1, 16, 256, 256] = torch.ops.aten.relu.default(add);
                max_pool2d_with_indices = torch.ops.aten.max_pool2d_with_indices.default(
                    relu, [3, 3], [3, 3]
                );
                getitem: f32[1, 16, 85, 85] = max_pool2d_with_indices[0];
                return (getitem,)

    Graph signature: ExportGraphSignature(
        parameters=['L__self___conv.weight', 'L__self___conv.bias'],
        buffers=[],
        user_inputs=['arg2_1', 'arg3_1'],
        user_outputs=['getitem'],
        inputs_to_parameters={
            'arg0_1': 'L__self___conv.weight',
            'arg1_1': 'L__self___conv.bias',
        },
        inputs_to_buffers={},
        buffers_to_mutate={},
        backward_signature=None,
        assertion_dep_token=None,
    )
    Range constraints: {}
    Equality constraints: []

Inspecting the ``ExportedProgram``, we can note the following:

* The :class:`torch.fx.Graph` contains the computation graph of the original
  program, along with records of the original code for easy debugging.

* The graph contains only ``torch.ops.aten`` operators found in the
  :ref:`Core ATen IR <torch.compiler_ir>` opset and custom operators, and is
  fully functional, without any inplace operators such as ``torch.add_``.

* The parameters (weight and bias to conv) are lifted as inputs to the graph,
  resulting in no ``get_attr`` nodes in the graph, which previously existed in
  the result of :func:`torch.fx.symbolic_trace`.

* The :class:`torch.export.ExportGraphSignature` models the input and output
  signature, along with specifying which inputs are parameters.

* The resulting shape and dtype of tensors produced by each node in the graph is
  noted. For example, the ``convolution`` node will result in a tensor of dtype
  ``torch.float32`` and shape (1, 16, 256, 256).


Expressing Dynamism
^^^^^^^^^^^^^^^^^^^

By default ``torch.export`` will trace the program assuming all input shapes are
**static**, and specializing the exported program to those dimensions. However,
some dimensions, such as a batch dimension, can be dynamic and vary from run to
run. Such dimensions must be marked dynamic using the
:func:`torch.export.dynamic_dim` API, and passed into
:func:`torch.export.export` through the ``constraints`` argument. An example:

::

    import torch
    from torch.export import export, dynamic_dim

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()

            self.branch1 = torch.nn.Sequential(
                torch.nn.Linear(64, 32), torch.nn.ReLU()
            )
            self.branch2 = torch.nn.Sequential(
                torch.nn.Linear(128, 64), torch.nn.ReLU()
            )
            self.buffer = torch.ones(32)

        def forward(self, x1, x2):
            out1 = self.branch1(x1)
            out2 = self.branch2(x2)
            return (out1 + self.buffer, out2)

    example_args = (torch.randn(32, 64), torch.randn(32, 128))
    constraints = [
        # First dimension of each input is a dynamic batch size
        dynamic_dim(example_args[0], 0),
        dynamic_dim(example_args[1], 0),
        # The dynamic batch size between the inputs are equal
        dynamic_dim(example_args[0], 0) == dynamic_dim(example_args[1], 0),
    ]

    exported_program: torch.export.ExportedProgram = export(
        M(), args=example_args, constraints=constraints
    )
    print(exported_program)

.. code-block::

    ExportedProgram:
        class GraphModule(torch.nn.Module):
            def forward(self, arg0_1: f32[32, 64], arg1_1: f32[32], arg2_1: f32[64, 128], arg3_1: f32[64], arg4_1: f32[32], arg5_1: f32[s0, 64], arg6_1: f32[s0, 128]):

                # code: out1 = self.branch1(x1)
                permute: f32[64, 32] = torch.ops.aten.permute.default(arg0_1, [1, 0]);
                addmm: f32[s0, 32] = torch.ops.aten.addmm.default(arg1_1, arg5_1, permute);
                relu: f32[s0, 32] = torch.ops.aten.relu.default(addmm);

                # code: out2 = self.branch2(x2)
                permute_1: f32[128, 64] = torch.ops.aten.permute.default(arg2_1, [1, 0]);
                addmm_1: f32[s0, 64] = torch.ops.aten.addmm.default(arg3_1, arg6_1, permute_1);
                relu_1: f32[s0, 64] = torch.ops.aten.relu.default(addmm_1); addmm_1 = None

                # code: return (out1 + self.buffer, out2)
                add: f32[s0, 32] = torch.ops.aten.add.Tensor(relu, arg4_1);
                return (add, relu_1)

    Graph signature: ExportGraphSignature(
        parameters=[
            'branch1.0.weight',
            'branch1.0.bias',
            'branch2.0.weight',
            'branch2.0.bias',
        ],
        buffers=['L__self___buffer'],
        user_inputs=['arg5_1', 'arg6_1'],
        user_outputs=['add', 'relu_1'],
        inputs_to_parameters={
            'arg0_1': 'branch1.0.weight',
            'arg1_1': 'branch1.0.bias',
            'arg2_1': 'branch2.0.weight',
            'arg3_1': 'branch2.0.bias',
        },
        inputs_to_buffers={'arg4_1': 'L__self___buffer'},
        buffers_to_mutate={},
        backward_signature=None,
        assertion_dep_token=None,
    )
    Range constraints: {s0: RangeConstraint(min_val=2, max_val=9223372036854775806)}
    Equality constraints: [(InputDim(input_name='arg5_1', dim=0), InputDim(input_name='arg6_1', dim=0))]

Some additional things to note:

* Through the :func:`torch.export.dynamic_dim` API, we specified the first
  dimension of each input to be dynamic. Looking at the inputs ``arg5_1`` and
  ``arg6_1``, they have a symbolic shape of (s0, 64) and (s0, 128), instead of
  the (32, 64) and (32, 128) shaped tensors that we passed in as example inputs.
  ``s0`` is a symbol representing that this dimension can be a range
  of values.

* ``exported_program.range_constraints`` describes the ranges of each symbol
  appearing in the graph. In this case, we see that ``s0`` has the range
  [2, inf]. For technical reasons that are difficult to explain here, they are
  assumed to be not 0 or 1. This is not a bug, and does not necessarily mean
  that the exported program will not work for dimensions 0 or 1. See
  `The 0/1 Specialization Problem <https://docs.google.com/document/d/16VPOa3d-Liikf48teAOmxLc92rgvJdfosIy-yoT38Io/edit?fbclid=IwAR3HNwmmexcitV0pbZm_x1a4ykdXZ9th_eJWK-3hBtVgKnrkmemz6Pm5jRQ#heading=h.ez923tomjvyk>`_
  for an in-depth discussion of this topic.

* ``exported_program.equality_constraints`` describes which dimensions are
  required to be equal. Since we specified in the constraints that the first
  dimension of each argument is equivalent,
  (``dynamic_dim(example_args[0], 0) == dynamic_dim(example_args[1], 0)``),
  we see in the equality constraints the tuple specifying that ``arg5_1``
  dimension 0 and ``arg6_1`` dimension 0 are equal.


Serialization
^^^^^^^^^^^^^

To save the ``ExportedProgram``, users can use the :func:`torch.export.save` and
:func:`torch.export.load` APIs. A convention is to save the ``ExportedProgram``
using a ``.pt2`` file extension.

An example:

::

    import torch
    import io

    class MyModule(torch.nn.Module):
        def forward(self, x):
            return x + 10

    exported_program = torch.export.export(MyModule(), (torch.randn(5),))

    torch.export.save(exported_program, 'exported_program.pt2')
    saved_exported_program = torch.export.load('exported_program.pt2')


Specialization
^^^^^^^^^^^^^^

Input shapes
~~~~~~~~~~~~

As mentioned before, by default, ``torch.export`` will trace the program
specializing on the input tensors' shapes, unless a dimension is specified as
dynamic via the :func:`torch.export.dynamic_dim` API. This means that if there
exists shape-dependent control flow, ``torch.export`` will specialize on the
branch that is being taken with the given sample inputs. For example:

::

    import torch
    from torch.export import export

    def fn(x):
        if x.shape[0] > 5:
            return x + 1
        else:
            return x - 1

    example_inputs = (torch.rand(10, 2),)
    exported_program = export(fn, example_inputs)
    print(exported_program)

.. code-block::

    ExportedProgram:
        class GraphModule(torch.nn.Module):
            def forward(self, arg0_1: f32[10, 2]):
                add: f32[10, 2] = torch.ops.aten.add.Tensor(arg0_1, 1);
                return (add,)

The conditional of (``x.shape[0] > 5``) does not appear in the
``ExportedProgram`` because the example inputs have the static
shape of (10, 2). Since ``torch.export`` specializes on the inputs' static
shapes, the else branch (``x - 1``) will never be reached. To preserve the dynamic
branching behavior based on the shape of a tensor in the traced graph,
:func:`torch.export.dynamic_dim` will need to be used to specify the dimension
of the input tensor (``x.shape[0]``) to be dynamic, and the source code will
need to be :ref:`rewritten <Data/Shape-Dependent Control Flow>`.

Non-tensor inputs
~~~~~~~~~~~~~~~~~

``torch.export`` also specializes the traced graph based on the values of inputs
that are not ``torch.Tensor``, such as ``int``, ``float``, ``bool``, and ``str``.
However, we will likely change this in the near future to not specialize on
inputs of primitive types.

For example:

::

    import torch
    from torch.export import export

    def fn(x: torch.Tensor, const: int, times: int):
        for i in range(times):
            x = x + const
        return x

    example_inputs = (torch.rand(2, 2), 1, 3)
    exported_program = export(fn, example_inputs)
    print(exported_program)

.. code-block::

    ExportedProgram:
        class GraphModule(torch.nn.Module):
            def forward(self, arg0_1: f32[2, 2], arg1_1, arg2_1):
                add: f32[2, 2] = torch.ops.aten.add.Tensor(arg0_1, 1);
                add_1: f32[2, 2] = torch.ops.aten.add.Tensor(add, 1);
                add_2: f32[2, 2] = torch.ops.aten.add.Tensor(add_1, 1);
                return (add_2,)

Because integers are specialized, the ``torch.ops.aten.add.Tensor`` operations
are all computed with the inlined constant ``1``, rather than ``arg1_1``.
Additionally, the ``times`` iterator used in the ``for`` loop is also "inlined"
in the graph through the 3 repeated ``torch.ops.aten.add.Tensor`` calls, and the
input ``arg2_1`` is never used.


Limitations of torch.export
---------------------------

Graph Breaks
^^^^^^^^^^^^

As ``torch.export`` is a one-shot process for capturing a computation graph from
a PyTorch program, it might ultimately run into untraceable parts of programs as
it is nearly impossible to support tracing all PyTorch and Python features. In
the case of ``torch.compile``, an unsupported operation will cause a "graph
break" and the unsupported operation will be run with default Python evaluation.
In contrast, ``torch.export`` will require users to provide additional
information or rewrite parts of their code to make it traceable. As the
tracing is based on TorchDynamo, which evaluates at the Python
bytecode level, there will be significantly fewer rewrites required compared to
previous tracing frameworks.

When a graph break is encountered, :ref:`ExportDB <torch.export_db>` is a great
resource for learning about the kinds of programs that are supported and
unsupported, along with ways to rewrite programs to make them traceable.

.. _Data/Shape-Dependent Control Flow:

Data/Shape-Dependent Control Flow
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Graph breaks can also be encountered on data-dependent control flow (``if
x.shape[0] > 2``) when shapes are not being specialized, as a tracing compiler
cannot possibly handle such branching without generating code for a
combinatorially exploding number of paths. In such cases, users will need to
rewrite their code using special control flow operators (coming soon!).

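An experimental form of such an operator already exists as
``functorch.experimental.control_flow.cond``. As a rough sketch only (assuming
that experimental API, and that the branched-on dimension has been marked
dynamic), a shape-dependent branch can be captured with both sides preserved:

::

    import torch
    from functorch.experimental.control_flow import cond  # experimental API
    from torch.export import export, dynamic_dim

    def true_fn(x):
        return x + 1

    def false_fn(x):
        return x - 1

    def fn(x):
        # Both branches are captured into the graph, instead of the trace
        # specializing on the branch taken for the example input.
        return cond(x.shape[0] > 5, true_fn, false_fn, [x])

    example_inputs = (torch.rand(10, 2),)
    exported_program = export(
        fn, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]
    )
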
Data-Dependent Accesses
^^^^^^^^^^^^^^^^^^^^^^^

Data dependent behavior such as using the value inside of a tensor to construct
another tensor, or using the value of a tensor to slice into another tensor, is
also something the tracer cannot fully determine. Users will need to rewrite
their code using the inline constraint APIs
:func:`torch.export.constrain_as_size` and
:func:`torch.export.constrain_as_value`.

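As a minimal sketch (assuming the data-dependent value comes from
``Tensor.item()``, the typical case):

::

    import torch
    from torch.export import export, constrain_as_value

    def fn(x, y):
        b = y.item()  # data-dependent scalar read out of a tensor
        # Tell the tracer what range this value may take, so it can be
        # reasoned about symbolically instead of failing the trace.
        constrain_as_value(b, min=0, max=5)
        return x + b

    example_inputs = (torch.rand(2, 2), torch.tensor(4))
    exported_program = export(fn, example_inputs)
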
Missing Meta Kernels for Operators
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When tracing, a META implementation (or "meta kernel") is required for all
operators. This is used to reason about the input/output shapes for this
operator.

Note that the official API for registering custom meta kernels for custom ops is
currently undergoing development. While the final API is being refined, you can
refer to the documentation `here <https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.64r4npvq0w0>`_.

In the unfortunate case where your model uses an ATen operator that does not
have a meta kernel implementation yet, please file an issue.

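As an interim illustration only (the registration API is still in flux, and the
op ``mylib::twice`` here is hypothetical), a meta kernel can be registered next
to the real kernel through ``torch.library``:

::

    import torch

    lib = torch.library.Library("mylib", "DEF")
    lib.define("twice(Tensor x) -> Tensor")

    def twice_cpu(x):
        return x * 2

    def twice_meta(x):
        # A meta kernel only propagates shape and dtype; it never touches data.
        return torch.empty_like(x)

    lib.impl("twice", twice_cpu, "CPU")
    lib.impl("twice", twice_meta, "Meta")
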
Read More
---------

.. toctree::
   :caption: Additional Links for Export Users
   :maxdepth: 1

   torch.compiler_transformations
   torch.compiler_ir
   generated/exportdb/index

.. toctree::
   :caption: Deep Dive for PyTorch Developers
   :maxdepth: 1

   torch.compiler_deepdive
   torch.compiler_dynamic_shapes
   torch.compiler_fake_tensor


API Reference
-------------

.. automodule:: torch.export
.. autofunction:: export
@ -24,10 +565,3 @@ torch.export
.. autoclass:: ArgumentSpec
.. autoclass:: ModuleCallSignature
.. autoclass:: ModuleCallEntry


.. toctree::
   :glob:
   :maxdepth: 1

   generated/exportdb/index

@ -94,7 +94,6 @@ Features described in this documentation are classified by release status:
   profiler
   nn.init
   onnx
   onnx_diagnostics
   optim
   complex_numbers
   ddp_comm_hooks

@ -185,6 +185,7 @@ If you don't see an operation listed here, but it would help your use case, plea
   :meth:`Tensor.reciprocal_`,None
   :meth:`Tensor.refine_names`,See documentation
   :meth:`Tensor.register_hook`,None
   :meth:`Tensor.register_post_accumulate_grad_hook`,None
   :meth:`Tensor.rename`,See documentation
   :meth:`Tensor.rename_`,See documentation
   :attr:`Tensor.requires_grad`,None

@ -1,745 +1,64 @@
torch.onnx
==========

.. contents:: :local:

.. automodule:: torch.onnx
Overview
--------

`Open Neural Network eXchange (ONNX) <https://onnx.ai/>`_ is an open standard
format for representing machine learning models. The torch.onnx module can export
PyTorch models to ONNX. The model can then be consumed by any of the many
`runtimes that support ONNX <https://onnx.ai/supported-tools.html#deployModel>`_.
format for representing machine learning models. The ``torch.onnx`` module captures the computation graph from a
native PyTorch :class:`torch.nn.Module` model and converts it into an
`ONNX graph <https://github.com/onnx/onnx/blob/main/docs/IR.md>`_.

Example: AlexNet from PyTorch to ONNX
-------------------------------------
The exported model can be consumed by any of the many
`runtimes that support ONNX <https://onnx.ai/supported-tools.html#deployModel>`_, including
Microsoft's `ONNX Runtime <https://www.onnxruntime.ai>`_.

Here is a simple script which exports a pretrained AlexNet to an ONNX file named ``alexnet.onnx``.
The call to ``torch.onnx.export`` runs the model once to trace its execution and then exports the
traced model to the specified file::
**There are two flavors of ONNX exporter API that you can use, as listed below:**

    import torch
    import torchvision
TorchDynamo-based ONNX Exporter
-------------------------------

    dummy_input = torch.randn(10, 3, 224, 224, device="cuda")
    model = torchvision.models.alexnet(pretrained=True).cuda()
*The TorchDynamo-based ONNX exporter is the newest (and Beta) exporter for PyTorch 2.0 and newer*

    # Providing input and output names sets the display names for values
    # within the model's graph. Setting these does not change the semantics
    # of the graph; it is only for readability.
    #
    # The inputs to the network consist of the flat list of inputs (i.e.
    # the values you would pass to the forward() method) followed by the
    # flat list of parameters. You can partially specify names, i.e. provide
    # a list here shorter than the number of inputs to the model, and we will
    # only set that subset of names, starting from the beginning.
    input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
    output_names = [ "output1" ]
TorchDynamo engine is leveraged to hook into Python's frame evaluation API and dynamically rewrite its
bytecode into an FX Graph. The resulting FX Graph is then polished before it is finally translated into an
ONNX graph.

    torch.onnx.export(model, dummy_input, "alexnet.onnx", verbose=True, input_names=input_names, output_names=output_names)
The main advantage of this approach is that the `FX graph <https://pytorch.org/docs/stable/fx.html>`_ is captured using
bytecode analysis that preserves the dynamic nature of the model instead of using traditional static tracing techniques.

The resulting ``alexnet.onnx`` file contains a binary `protocol buffer <https://developers.google.com/protocol-buffers/>`_
which contains both the network structure and parameters of the model you exported
(in this case, AlexNet). The argument ``verbose=True`` causes the
exporter to print out a human-readable representation of the model::
:doc:`Learn more about the TorchDynamo-based ONNX Exporter <onnx_dynamo>`

    # These are the inputs and parameters to the network, which have taken on
    # the names we specified earlier.
    graph(%actual_input_1 : Float(10, 3, 224, 224)
          %learned_0 : Float(64, 3, 11, 11)
          %learned_1 : Float(64)
          %learned_2 : Float(192, 64, 5, 5)
          %learned_3 : Float(192)
          # ---- omitted for brevity ----
          %learned_14 : Float(1000, 4096)
          %learned_15 : Float(1000)) {
      # Every statement consists of some output tensors (and their types),
      # the operator to be run (with its attributes, e.g., kernels, strides,
      # etc.), its input tensors (%actual_input_1, %learned_0, %learned_1)
      %17 : Float(10, 64, 55, 55) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[11, 11], pads=[2, 2, 2, 2], strides=[4, 4]](%actual_input_1, %learned_0, %learned_1), scope: AlexNet/Sequential[features]/Conv2d[0]
      %18 : Float(10, 64, 55, 55) = onnx::Relu(%17), scope: AlexNet/Sequential[features]/ReLU[1]
      %19 : Float(10, 64, 27, 27) = onnx::MaxPool[kernel_shape=[3, 3], pads=[0, 0, 0, 0], strides=[2, 2]](%18), scope: AlexNet/Sequential[features]/MaxPool2d[2]
      # ---- omitted for brevity ----
      %29 : Float(10, 256, 6, 6) = onnx::MaxPool[kernel_shape=[3, 3], pads=[0, 0, 0, 0], strides=[2, 2]](%28), scope: AlexNet/Sequential[features]/MaxPool2d[12]
      # Dynamic means that the shape is not known. This may be because of a
      # limitation of our implementation (which we would like to fix in a
      # future release) or shapes which are truly dynamic.
      %30 : Dynamic = onnx::Shape(%29), scope: AlexNet
      %31 : Dynamic = onnx::Slice[axes=[0], ends=[1], starts=[0]](%30), scope: AlexNet
      %32 : Long() = onnx::Squeeze[axes=[0]](%31), scope: AlexNet
      %33 : Long() = onnx::Constant[value={9216}](), scope: AlexNet
      # ---- omitted for brevity ----
      %output1 : Float(10, 1000) = onnx::Gemm[alpha=1, beta=1, broadcast=1, transB=1](%45, %learned_14, %learned_15), scope: AlexNet/Sequential[classifier]/Linear[6]
      return (%output1);
    }
TorchScript-based ONNX Exporter
-------------------------------

You can also verify the output using the `ONNX <https://github.com/onnx/onnx/>`_ library,
which you can install using ``pip``::
*The TorchScript-based ONNX exporter is available since PyTorch 1.2.0*

    pip install onnx
`TorchScript <https://pytorch.org/docs/stable/jit.html>`_ is leveraged to trace (through :func:`torch.jit.trace`)
the model and capture a static computation graph.

Then, you can run::
As a consequence, the resulting graph has a couple of limitations:

    import onnx
* It does not record any control-flow, like if-statements or loops;
* Does not handle nuances between ``training`` and ``eval`` mode;
* Does not truly handle dynamic inputs

    # Load the ONNX model
    model = onnx.load("alexnet.onnx")
As an attempt to address the static tracing limitations, the exporter also supports TorchScript scripting
(through :func:`torch.jit.script`), which adds support for data-dependent control-flow, for example. However, TorchScript
itself is a subset of the Python language, so not all features in Python are supported, such as in-place operations.

    # Check that the model is well formed
    onnx.checker.check_model(model)
:doc:`Learn more about the TorchScript-based ONNX Exporter <onnx_torchscript>`

    # Print a human readable representation of the graph
    print(onnx.helper.printable_graph(model.graph))

You can also run the exported model with one of the many
`runtimes that support ONNX <https://onnx.ai/supported-tools.html#deployModel>`_.
For example, after installing `ONNX Runtime <https://www.onnxruntime.ai>`_, you can
load and run the model::

    import onnxruntime as ort
    import numpy as np

    ort_session = ort.InferenceSession("alexnet.onnx")

    outputs = ort_session.run(
        None,
        {"actual_input_1": np.random.randn(10, 3, 224, 224).astype(np.float32)},
    )
    print(outputs[0])

Here is a more involved `tutorial on exporting a model and running it with ONNX Runtime <https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html>`_.

.. _tracing-vs-scripting:

Tracing vs Scripting
--------------------

Internally, :func:`torch.onnx.export()` requires a :class:`torch.jit.ScriptModule` rather than
a :class:`torch.nn.Module`. If the passed-in model is not already a ``ScriptModule``,
``export()`` will use *tracing* to convert it to one:

.. TODO(justinchuby): Add a word on recommending tracing over scripting for most use cases.

* **Tracing**: If ``torch.onnx.export()`` is called with a Module that is not already a
  ``ScriptModule``, it first does the equivalent of :func:`torch.jit.trace`, which executes the model
  once with the given ``args`` and records all operations that happen during that execution. This
  means that if your model is dynamic, e.g., changes behavior depending on input data, the exported
  model will *not* capture this dynamic behavior.
  We recommend examining the exported model and making sure the operators look
  reasonable. Tracing will unroll loops and if statements, exporting a static graph that is exactly
  the same as the traced run. If you want to export your model with dynamic control flow, you will
  need to use *scripting*.

* **Scripting**: Compiling a model via scripting preserves dynamic control flow and is valid for inputs
  of different sizes. To use scripting:

  * Use :func:`torch.jit.script` to produce a ``ScriptModule``.
  * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model. The ``args`` are still required,
    but they will be used internally only to produce example outputs, so that the types and shapes of the
    outputs can be captured. No tracing will be performed.

See `Introduction to TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_
and `TorchScript <jit.html>`_ for more details, including how to compose tracing and scripting to suit the
particular requirements of different models.

Avoiding Pitfalls
-----------------

Avoid NumPy and built-in Python types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch models can be written using NumPy or Python types and functions, but
during :ref:`tracing<tracing-vs-scripting>`, any variables of NumPy or Python
types (rather than torch.Tensor) are converted to constants, which will produce
the wrong result if those values should change depending on the inputs.

For example, rather than using numpy functions on numpy.ndarrays: ::

    # Bad! Will be replaced with constants during tracing.
    x, y = np.random.rand(1, 2), np.random.rand(1, 2)
    np.concatenate((x, y), axis=1)

Use torch operators on torch.Tensors: ::

    # Good! Tensor operations will be captured during tracing.
    x, y = torch.randn(1, 2), torch.randn(1, 2)
    torch.cat((x, y), dim=1)


And rather than use :func:`torch.Tensor.item` (which converts a Tensor to a Python
built-in number): ::

    # Bad! y.item() will be replaced with a constant during tracing.
    def forward(self, x, y):
        return x.reshape(y.item(), -1)

Use torch's support for implicit casting of single-element tensors: ::

    # Good! y will be preserved as a variable during tracing.
    def forward(self, x, y):
        return x.reshape(y, -1)

Avoid Tensor.data
^^^^^^^^^^^^^^^^^

Using the Tensor.data field can produce an incorrect trace and therefore an incorrect ONNX graph.
Use :func:`torch.Tensor.detach` instead. (Work is ongoing to
`remove Tensor.data entirely <https://github.com/pytorch/pytorch/issues/30987>`_).

Avoid in-place operations when using tensor.shape in tracing mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In tracing mode, shapes obtained from ``tensor.shape`` are traced as tensors,
and share the same memory. This might cause a mismatch in the final output values.
As a workaround, avoid the use of in-place operations in these scenarios.
For example, in the model::

    class Model(torch.nn.Module):
        def forward(self, states):
            batch_size, seq_length = states.shape[:2]
            real_seq_length = seq_length
            real_seq_length += 2
            return real_seq_length + seq_length

``real_seq_length`` and ``seq_length`` share the same memory in tracing mode.
This could be avoided by rewriting the in-place operation::

    real_seq_length = real_seq_length + 2

Limitations
-----------

Types
^^^^^

* Only :class:`torch.Tensors`, numeric types that can be trivially converted to torch.Tensors (e.g. float, int),
  and tuples and lists of those types are supported as model inputs or outputs. Dict and str inputs and
  outputs are accepted in :ref:`tracing<tracing-vs-scripting>` mode, but:

  * Any computation that depends on the value of a dict or a str input **will be replaced with the
    constant value** seen during the one traced execution.
  * Any output that is a dict will be silently replaced with a **flattened sequence of its values
    (keys will be removed)**. E.g. ``{"foo": 1, "bar": 2}`` becomes ``(1, 2)``.
  * Any output that is a str will be silently removed.

* Certain operations involving tuples and lists are not supported in
  :ref:`scripting<tracing-vs-scripting>` mode due to limited support in ONNX for nested sequences.
  In particular, appending a tuple to a list is not supported. In tracing mode, the nested sequences
  will be flattened automatically during the tracing.

Differences in Operator Implementations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to differences in implementations of operators, running the exported model on different runtimes
may produce different results from each other or from PyTorch. Normally these differences are
numerically small, so this should only be a concern if your application is sensitive to these
small differences.

.. _tensor-indexing:

Unsupported Tensor Indexing Patterns
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Tensor indexing patterns that cannot be exported are listed below.
If you are experiencing issues exporting a model that does not include any of
the unsupported patterns below, please double check that you are exporting with
the latest ``opset_version``.

Reads / Gets
~~~~~~~~~~~~

When indexing into a tensor for reading, the following patterns are not supported: ::

    # Tensor indices that include negative values.
    data[torch.tensor([[1, 2], [2, -3]]), torch.tensor([-2, 3])]
    # Workarounds: use positive index values.

Writes / Sets
~~~~~~~~~~~~~

When indexing into a Tensor for writing, the following patterns are not supported: ::

    # Multiple tensor indices if any has rank >= 2
    data[torch.tensor([[1, 2], [2, 3]]), torch.tensor([2, 3])] = new_data
    # Workarounds: use a single tensor index with rank >= 2,
    #              or multiple consecutive tensor indices with rank == 1.

    # Multiple tensor indices that are not consecutive
    data[torch.tensor([2, 3]), :, torch.tensor([1, 2])] = new_data
    # Workarounds: transpose `data` such that tensor indices are consecutive.

    # Tensor indices that include negative values.
    data[torch.tensor([1, -2]), torch.tensor([-2, 3])] = new_data
    # Workarounds: use positive index values.

    # Implicit broadcasting required for new_data.
    data[torch.tensor([[0, 2], [1, 1]]), 1:3] = new_data
    # Workarounds: expand new_data explicitly.
    # Example:
    #   data shape:     [3, 4, 5]
    #   new_data shape: [5]
    #   expected new_data shape after broadcasting: [2, 2, 2, 5]

Adding support for operators
----------------------------

When exporting a model that includes unsupported operators, you'll see an error message like:

.. code-block:: text

    RuntimeError: ONNX export failed: Couldn't export operator foo

When that happens, there are a few things you can do:

#. Change the model to not use that operator.
#. Create a symbolic function to convert the operator and register it as a custom symbolic function.
#. Contribute to PyTorch to add the same symbolic function to :mod:`torch.onnx` itself.

If you decided to implement a symbolic function (we hope you will contribute it back to PyTorch!), here is how you can get started:

ONNX exporter internals
^^^^^^^^^^^^^^^^^^^^^^^

A "symbolic function" is a function that decomposes a PyTorch operator into a
composition of a series of ONNX operators.

During export, each node (which contains a PyTorch operator) in the TorchScript
graph is visited by the exporter in topological order.
Upon visiting a node, the exporter looks for a registered symbolic function for
that operator. Symbolic functions are implemented in Python. A symbolic function for
an op named ``foo`` would look something like::

    def foo(
        g,
        input_0: torch._C.Value,
        input_1: torch._C.Value) -> Union[None, torch._C.Value, List[torch._C.Value]]:
        """
        Adds the ONNX operations representing this PyTorch function by updating the
        graph g with `g.op()` calls.

        Args:
            g (Graph): graph to write the ONNX representation into.
            input_0 (Value): value representing the variables which contain
                the first input for this operator.
            input_1 (Value): value representing the variables which contain
                the second input for this operator.

        Returns:
            A Value or List of Values specifying the ONNX nodes that compute something
            equivalent to the original PyTorch operator with the given inputs.

            None if it cannot be converted to ONNX.
        """
        ...

The ``torch._C`` types are Python wrappers around the types defined in C++ in
`ir.h <https://github.com/pytorch/pytorch/blob/main/torch/csrc/jit/ir/ir.h>`_.

The process for adding a symbolic function depends on the type of operator.

.. _adding-support-aten:

ATen operators
^^^^^^^^^^^^^^

`ATen <https://pytorch.org/cppdocs/#aten>`_ is PyTorch's built-in tensor library.
If the operator is an ATen operator (shows up in the TorchScript graph with the prefix
``aten::``), make sure it is not supported already.

List of supported operators
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Visit the auto generated :doc:`list of supported TorchScript operators <../onnx_supported_aten_ops>`
for details on which operators are supported in each ``opset_version``.

Adding support for an aten or quantized operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If the operator is not in the list above:

* Define the symbolic function in ``torch/onnx/symbolic_opset<version>.py``, for example
  `torch/onnx/symbolic_opset9.py <https://github.com/pytorch/pytorch/blob/main/torch/onnx/symbolic_opset9.py>`_.
  Make sure the function has the same name as the ATen function, which may be declared in
  ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at
  build time, so will not appear in your checkout until you build PyTorch).
* By default, the first arg is the ONNX graph.
  Other arg names must EXACTLY match the names in the ``.pyi`` file,
  because dispatch is done with keyword arguments.
* In the symbolic function, if the operator is in the
  `ONNX standard operator set <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_,
  we only need to create a node to represent the ONNX operator in the graph.
  If not, we can compose several standard operators that have the
  equivalent semantics to the ATen operator.

Here is an example of handling a missing symbolic function for the ``ELU`` operator.

If we run the following code::

    print(
        torch.jit.trace(
            torch.nn.ELU(), # module
            torch.ones(1)   # example input
        ).graph
    )

We see something like::

    graph(%self : __torch__.torch.nn.modules.activation.___torch_mangle_0.ELU,
          %input : Float(1, strides=[1], requires_grad=0, device=cpu)):
      %4 : float = prim::Constant[value=1.]()
      %5 : int = prim::Constant[value=1]()
      %6 : int = prim::Constant[value=1]()
      %7 : Float(1, strides=[1], requires_grad=0, device=cpu) = aten::elu(%input, %4, %5, %6)
      return (%7)

Since we see ``aten::elu`` in the graph, we know this is an ATen operator.

We check the `ONNX operator list <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_,
and confirm that ``Elu`` is standardized in ONNX.

We find a signature for ``elu`` in ``torch/nn/functional.pyi``::

    def elu(input: Tensor, alpha: float = ..., inplace: bool = ...) -> Tensor: ...

We add the following lines to ``symbolic_opset9.py``::

    def elu(g, input: torch.Value, alpha: torch.Value, inplace: bool = False):
        return g.op("Elu", input, alpha_f=alpha)

Now PyTorch is able to export models containing the ``aten::elu`` operator!

See the ``torch/onnx/symbolic_opset*.py`` files for more examples.

torch.autograd.Functions
^^^^^^^^^^^^^^^^^^^^^^^^

If the operator is a sub-class of :class:`torch.autograd.Function`, there are three ways
to export it.

Static Symbolic Method
~~~~~~~~~~~~~~~~~~~~~~

You can add a static method named ``symbolic`` to your function class. It should return
ONNX operators that represent the function's behavior in ONNX. For example::

    class MyRelu(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input: torch.Tensor) -> torch.Tensor:
            ctx.save_for_backward(input)
            return input.clamp(min=0)

        @staticmethod
        def symbolic(g: torch.Graph, input: torch.Value) -> torch.Value:
            return g.op("Clip", input, g.op("Constant", value_t=torch.tensor(0, dtype=torch.float)))

.. FIXME(justinchuby): PythonOps are too complicated and the example below
..   uses private methods we do not expose. We are looking to
..   improve the experience. Since SymbolicContext is deprecated, we think
..   defining a symbolic staticmethod is a better way to go for now.

.. PythonOp Symbolic
.. ~~~~~~~~~~~~~~~~~

.. Alternatively, you can register a custom symbolic function.
.. This gives the symbolic function access to more info through the
.. ``torch.onnx.SymbolicContext`` object, which gets passed in as the first
.. argument (before the ``Graph`` object).

.. All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes.
.. In order to differentiate between different ``Function`` subclasses, the
.. symbolic function should use the ``name`` kwarg which gets set to the name of the class.

.. Custom symbolic functions should add type and shape information by calling ``setType(...)``
.. on Value objects before returning them (implemented in C++ by
.. ``torch::jit::Value::setType``). This is not required, but it can help the exporter's
.. shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see
.. ``test_aten_embedding_2`` in
.. `test_operators.py <https://github.com/pytorch/pytorch/blob/main/test/onnx/test_operators.py>`_.

.. The example below shows how you can access ``requires_grad`` via the ``Node`` object:

..     class MyClip(torch.autograd.Function):
..         @staticmethod
..         def forward(ctx, input, min):
..             ctx.save_for_backward(input)
..             return input.clamp(min=min)

..     class MyRelu(torch.autograd.Function):
..         @staticmethod
..         def forward(ctx, input):
..             ctx.save_for_backward(input)
..             return input.clamp(min=0)

..     def symbolic_python_op(g: "GraphContext", *args, **kwargs):
..         n = ctx.cur_node
..         print("original node: ", n)
..         for i, out in enumerate(n.outputs()):
..             print("original output {}: {}, requires grad: {}".format(i, out, out.requiresGrad()))
..         import torch.onnx.symbolic_helper as sym_helper
..         for i, arg in enumerate(args):
..             requires_grad = arg.requiresGrad() if sym_helper._is_value(arg) else False
..             print("arg {}: {}, requires grad: {}".format(i, arg, requires_grad))

..         name = kwargs["name"]
..         ret = None
..         if name == "MyClip":
..             ret = g.op("Clip", args[0], args[1])
..         elif name == "MyRelu":
..             ret = g.op("Relu", args[0])
..         else:
..             # Logs a warning and returns None
..             return _unimplemented("prim::PythonOp", "unknown node kind: " + name)
..         # Copy type and shape from original node.
..         ret.setType(n.type())
..         return ret

..     from torch.onnx import register_custom_op_symbolic
..     register_custom_op_symbolic("prim::PythonOp", symbolic_python_op, 1)

Inline Autograd Function
~~~~~~~~~~~~~~~~~~~~~~~~

In cases where a static symbolic method is not provided for a :class:`torch.autograd.Function` subclass, and
no function registering ``prim::PythonOp`` as a custom symbolic function is provided either,
:func:`torch.onnx.export` tries to inline the graph that corresponds to that :class:`torch.autograd.Function` such that
this function is broken down into individual operators that were used within the function.
The export should be successful as long as these individual operators are supported. For example::

    class MyLogExp(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input: torch.Tensor) -> torch.Tensor:
            ctx.save_for_backward(input)
            h = input.exp()
            return h.log().log()

There is no static symbolic method present for this model, yet it is exported as follows::

    graph(%input : Float(1, strides=[1], requires_grad=0, device=cpu)):
      %1 : float = onnx::Exp[](%input)
      %2 : float = onnx::Log[](%1)
      %3 : float = onnx::Log[](%2)
      return (%3)

If you need to avoid inlining of :class:`torch.autograd.Function`, you should export models with
``operator_export_type`` set to ``ONNX_FALLTHROUGH`` or ``ONNX_ATEN_FALLBACK``.

Custom operators
^^^^^^^^^^^^^^^^

You can export your model with custom operators that include a combination of many standard ONNX ops,
or that are driven by a self-defined C++ backend.

ONNX-script functions
~~~~~~~~~~~~~~~~~~~~~

If an operator is not a standard ONNX op, but can be composed of multiple existing ONNX ops, you can utilize
`ONNX-script <https://github.com/microsoft/onnx-script>`_ to create an external ONNX function to support the operator.
You can export it by following this example::

    import onnxscript
    # There are three opset versions that need to be aligned.
    # This is (1) the opset version in the ONNX function.
    from onnxscript.onnx_opset import opset15 as op
    opset_version = 15

    x = torch.randn(1, 2, 3, 4, requires_grad=True)
    model = torch.nn.SELU()

    custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)

    @onnxscript.script(custom_opset)
    def Selu(X):
        alpha = 1.67326  # auto wrapped as Constants
        gamma = 1.0507
        alphaX = op.CastLike(alpha, X)
        gammaX = op.CastLike(gamma, X)
        neg = gammaX * (alphaX * op.Exp(X) - alphaX)
        pos = gammaX * X
        zero = op.CastLike(0, X)
        return op.Where(X <= zero, neg, pos)

    # setType API provides shape/type to ONNX shape/type inference
    def custom_selu(g: jit_utils.GraphContext, X):
        return g.onnxscript_op(Selu, X).setType(X.type())

    # Register custom symbolic function
    # There are three opset versions that need to be aligned.
    # This is (2) the opset version in the registry.
    torch.onnx.register_custom_op_symbolic(
        symbolic_name="aten::selu",
        symbolic_fn=custom_selu,
        opset_version=opset_version,
    )

    # There are three opset versions that need to be aligned.
    # This is (3) the opset version in the exporter.
    torch.onnx.export(
        model,
        x,
        "model.onnx",
        opset_version=opset_version,
        # only needed if you want to specify an opset version > 1.
        custom_opsets={"onnx-script": 2}
    )

The example above exports it as a custom operator in the "onnx-script" opset.
When exporting a custom operator, you can specify the custom domain version using the
``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.

NOTE: Be careful to align the opset versions mentioned in the example above, and make sure they are used consistently at the export step.
The way ONNX-script functions are written here is in beta and may change, given the active development of onnx-script.
Please follow the latest `ONNX-script <https://github.com/microsoft/onnx-script>`_ documentation for up-to-date usage.

C++ Operators
~~~~~~~~~~~~~

If a model uses a custom operator implemented in C++ as described in
`Extending TorchScript with Custom C++ Operators <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`_,
you can export it by following this example::

    from torch.onnx import symbolic_helper


    # Define custom symbolic function
    @symbolic_helper.parse_args("v", "v", "f", "i")
    def symbolic_foo_forward(g, input1, input2, attr1, attr2):
        return g.op("custom_domain::Foo", input1, input2, attr1_f=attr1, attr2_i=attr2)


    # Register custom symbolic function
    torch.onnx.register_custom_op_symbolic("custom_ops::foo_forward", symbolic_foo_forward, 9)


    class FooModel(torch.nn.Module):
        def __init__(self, attr1, attr2):
            super().__init__()
            self.attr1 = attr1
            self.attr2 = attr2

        def forward(self, input1, input2):
            # Calling custom op
            return torch.ops.custom_ops.foo_forward(input1, input2, self.attr1, self.attr2)


    model = FooModel(attr1, attr2)
    torch.onnx.export(
        model,
        (example_input1, example_input1),
        "model.onnx",
        # only needed if you want to specify an opset version > 1.
        custom_opsets={"custom_domain": 2}
    )

The example above exports it as a custom operator in the "custom_domain" opset.
When exporting a custom operator, you can specify the custom domain version using the
``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.

The runtime that consumes the model needs to support the custom op. See
`Caffe2 custom ops <https://caffe2.ai/docs/custom-operators.html>`_,
`ONNX Runtime custom ops <https://onnxruntime.ai/docs/reference/operators/add-custom-op.html>`_,
or your runtime of choice's documentation.

Discovering all unconvertible ATen ops at once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When export fails due to an unconvertible ATen op, there may in fact be more
than one such op but the error message only mentions the first. To discover
all of the unconvertible ops in one go, you can::

    # prepare model, args, opset_version
    ...

    torch_script_graph, unconvertible_ops = torch.onnx.utils.unconvertible_ops(
        model, args, opset_version=opset_version
    )

    print(set(unconvertible_ops))

The set is approximated because some ops may be removed during the conversion
process and don't need to be converted. Some other ops may have partial support
that will fail conversion with particular inputs, but this should give you a
general idea of what ops are not supported. Please feel free to open GitHub Issues
for op support requests.

Frequently Asked Questions
--------------------------

Q: I have exported my LSTM model, but its input size seems to be fixed?

The tracer records the shapes of the example inputs. If the model should accept
inputs of dynamic shapes, set ``dynamic_axes`` when calling :func:`torch.onnx.export`.
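
For example, a minimal sketch marking the batch and sequence dimensions as
dynamic (the axis names here are illustrative)::

    torch.onnx.export(
        model,
        dummy_input,
        "lstm.onnx",
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch", 1: "sequence"},
            "output": {0: "batch"},
        },
    )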

Q: How to export models containing loops?

See `Tracing vs Scripting`_.

Q: How to export models with primitive type inputs (e.g. int, float)?

Support for primitive numeric type inputs was added in PyTorch 1.9.
However, the exporter does not support models with str inputs.

Q: Does ONNX support implicit scalar datatype casting?

The ONNX standard does not, but the exporter will try to handle that part.
Scalars are exported as constant tensors.
The exporter will figure out the right data type for scalars. In rare cases when it is unable
to do so, you will need to manually specify the datatype with e.g. ``dtype=torch.float32``.
If you see any errors, please `create a GitHub issue <https://github.com/pytorch/pytorch/issues>`_.

Q: Are lists of Tensors exportable to ONNX?

Yes, for ``opset_version`` >= 11, since ONNX introduced the Sequence type in opset 11.

Contributing / developing
Contributing / Developing
-------------------------
`Developer docs <https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter>`_.

Functions
---------
.. autofunction:: export
.. autofunction:: export_to_pretty_string
.. autofunction:: register_custom_op_symbolic
.. autofunction:: unregister_custom_op_symbolic
.. autofunction:: select_model_mode_for_export
.. autofunction:: is_in_onnx_export
.. autofunction:: enable_log
.. autofunction:: disable_log
.. autofunction:: torch.onnx.verification.find_mismatch
The ONNX exporter is a community project and we welcome contributions. We follow the
`PyTorch guidelines for contributions <https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md>`_, but you might
also be interested in reading our `development wiki <https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter>`_.

Classes
-------
.. toctree::
   :hidden:

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   JitScalarType
   torch.onnx.verification.GraphInfo
   torch.onnx.verification.VerificationOptions

Preview: torch.onnx TorchDynamo Exporter
----------------------------------------

.. warning::
   The ONNX exporter for TorchDynamo is under active development and is
   subject to rapid change.

.. autofunction:: torch.onnx.dynamo_export
.. autofunction:: torch.onnx.enable_fake_mode
.. autofunction:: torch.onnx.is_onnxrt_backend_supported

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   torch.onnx.DiagnosticOptions
   torch.onnx.ExportOptions
   torch.onnx.ExportOutput
   torch.onnx.ExportOutputSerializer
   torch.onnx.OnnxExporterError
   torch.onnx.OnnxRegistry
   onnx_dynamo
   onnx_dynamo_onnxruntime_backend
   onnx_torchscript

@ -1,26 +0,0 @@
torch.onnx diagnostics
======================

.. contents:: :local:
.. automodule:: torch.onnx._internal.diagnostics
.. currentmodule:: torch.onnx._internal.diagnostics

Overview
--------

NOTE: This feature is under development and is subject to change.

The goal is to improve the diagnostics to help users debug and improve their model export to ONNX.

- The diagnostics are emitted in machine parsable `Static Analysis Results Interchange Format (SARIF) <https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html>`__.
- A new clearer, structured way to add new and keep track of diagnostic rules.
- Serve as foundation for more future improvements consuming the diagnostics.


Diagnostic Rules
----------------

.. toctree::
   :glob:

   generated/onnx_diagnostics_rules/*
156
docs/source/onnx_dynamo.rst
Normal file
@ -0,0 +1,156 @@
TorchDynamo-based ONNX Exporter
===============================

.. automodule:: torch.onnx
   :noindex:

.. contents:: :local:
   :depth: 3

.. warning::
   The ONNX exporter for TorchDynamo is a rapidly evolving beta technology.

Overview
--------

The ONNX exporter leverages the TorchDynamo engine to hook into Python's frame evaluation API
and dynamically rewrite its bytecode into an FX Graph.
The resulting FX Graph is then polished before it is finally translated into an ONNX graph.

The main advantage of this approach is that the `FX graph <https://pytorch.org/docs/stable/fx.html>`_ is captured using
bytecode analysis that preserves the dynamic nature of the model instead of using traditional static tracing techniques.

The exporter is designed to be modular and extensible. It is composed of the following components
(a small usage sketch follows the list):

- **ONNX Exporter**: :class:`Exporter` main class that orchestrates the export process.
- **ONNX Export Options**: :class:`ExportOptions` has a set of options that control the export process.
- **ONNX Registry**: :class:`OnnxRegistry` is the registry of ONNX operators and functions.
- **FX Graph Extractor**: :class:`FXGraphExtractor` extracts the FX graph from the PyTorch model.
- **Fake Mode**: :class:`ONNXFakeContext` is a context manager that enables fake mode for large scale models.
- **ONNX Export Output**: :class:`ExportOutput` is the output of the exporter that contains the exported ONNX graph and diagnostics.
- **ONNX Export Output Serializer**: :class:`ExportOutputSerializer` serializes the exported model to a file.
- **ONNX Diagnostic Options**: :class:`DiagnosticOptions` has a set of options that control the diagnostics emitted by the exporter.

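For instance, a minimal sketch of wiring some of these components together
(``MyModel`` and its example input are placeholders):

.. code-block:: python

    import torch

    # Control the export through ExportOptions, then run the exporter.
    export_options = torch.onnx.ExportOptions(dynamic_shapes=True)
    export_output = torch.onnx.dynamo_export(
        MyModel(), torch.randn(2, 8), export_options=export_options
    )
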
Dependencies
------------

The ONNX exporter depends on extra Python packages:

- `ONNX <https://onnx.ai>`_
- `ONNX Script <https://onnxscript.ai>`_

They can be installed through `pip <https://pypi.org/project/pip/>`_:

.. code-block:: bash

    pip install --upgrade onnx onnxscript

A simple example
----------------

Below is a demonstration of the exporter API in action, using a simple Multilayer Perceptron (MLP) as an example:

.. code-block:: python

    import torch
    import torch.nn as nn

    class MLPModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc0 = nn.Linear(8, 8, bias=True)
            self.fc1 = nn.Linear(8, 4, bias=True)
            self.fc2 = nn.Linear(4, 2, bias=True)
            self.fc3 = nn.Linear(2, 2, bias=True)

        def forward(self, tensor_x: torch.Tensor):
            tensor_x = self.fc0(tensor_x)
            tensor_x = torch.sigmoid(tensor_x)
            tensor_x = self.fc1(tensor_x)
            tensor_x = torch.sigmoid(tensor_x)
            tensor_x = self.fc2(tensor_x)
            tensor_x = torch.sigmoid(tensor_x)
            output = self.fc3(tensor_x)
            return output

    model = MLPModel()
    tensor_x = torch.rand((97, 8), dtype=torch.float32)
    export_output = torch.onnx.dynamo_export(model, tensor_x)

As the code above shows, all you need to do is provide :func:`torch.onnx.dynamo_export` with an instance of the model and its input.
The exporter will then return an instance of :class:`torch.onnx.ExportOutput` that contains the exported ONNX graph along with extra information.

The in-memory model available through ``export_output.model_proto`` is an ``onnx.ModelProto`` object in compliance with the `ONNX IR spec <https://github.com/onnx/onnx/blob/main/docs/IR.md>`_.
The ONNX model may then be serialized into a `Protobuf file <https://protobuf.dev/>`_ using the :meth:`torch.onnx.ExportOutput.save` API.

.. code-block:: python

    export_output.save("mlp.onnx")

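If ONNX Runtime is installed, one possible sanity check of the saved model
(a sketch reusing ``model`` and ``tensor_x`` from above; default tolerances are
an assumption) is:

.. code-block:: python

    import onnxruntime
    import torch

    ort_session = onnxruntime.InferenceSession("mlp.onnx")
    input_name = ort_session.get_inputs()[0].name
    # Run the exported model and compare against eager PyTorch.
    (ort_output,) = ort_session.run(None, {input_name: tensor_x.numpy()})
    torch.testing.assert_close(model(tensor_x), torch.from_numpy(ort_output))
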
Inspecting the ONNX model using GUI
-----------------------------------

You can view the exported model using `Netron <https://netron.app/>`__.

.. image:: _static/img/onnx/onnx_dynamo_mlp_model.png
    :width: 40%
    :alt: MLP model as viewed using Netron

Note that each layer is represented in a rectangular box with an *f* icon in the top right corner.

.. image:: _static/img/onnx/onnx_dynamo_mlp_model_function_highlight.png
    :width: 40%
    :alt: ONNX function highlighted on MLP model

Expanding it shows the function body.

.. image:: _static/img/onnx/onnx_dynamo_mlp_model_function_body.png
    :width: 50%
    :alt: ONNX function body

The function body is a sequence of ONNX operators or other functions.

Diagnosing issues with SARIF
----------------------------

ONNX diagnostics goes beyond regular logs through the adoption of
`Static Analysis Results Interchange Format (aka SARIF) <https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html>`__
to help users debug and improve their model using a GUI, such as
Visual Studio Code's `SARIF Viewer <https://marketplace.visualstudio.com/items?itemName=MS-SarifVSCode.sarif-viewer>`_.

The main advantages are:

- The diagnostics are emitted in machine parseable `Static Analysis Results Interchange Format (SARIF) <https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html>`__.
- A new clearer, structured way to add new and keep track of diagnostic rules.
- Serve as foundation for more future improvements consuming the diagnostics.

.. toctree::
   :maxdepth: 1
   :caption: ONNX Diagnostic SARIF Rules
   :glob:

   generated/onnx_dynamo_diagnostics_rules/*

API Reference
-------------

.. autofunction:: torch.onnx.dynamo_export

.. autoclass:: torch.onnx.ExportOptions
   :members:

.. autofunction:: torch.onnx.enable_fake_mode

.. autoclass:: torch.onnx.ExportOutput
   :members:

.. autoclass:: torch.onnx.ExportOutputSerializer
   :members:

.. autoclass:: torch.onnx.OnnxExporterError
   :members:

.. autoclass:: torch.onnx.OnnxRegistry
   :members:

.. autoclass:: torch.onnx.DiagnosticOptions
   :members:
9
docs/source/onnx_dynamo_onnxruntime_backend.rst
Normal file
@ -0,0 +1,9 @@
ONNX Backend for TorchDynamo
============================

For a quick overview of ``torch.compiler``, see :ref:`torch.compiler_overview`.

.. warning::
   The ONNX backend for torch.compile is a rapidly evolving beta technology.

.. autofunction:: torch.onnx.is_onnxrt_backend_supported
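
A minimal usage sketch (``model`` and ``x`` are placeholders for your own
module and input) might look like:

.. code-block:: python

    import torch

    if torch.onnx.is_onnxrt_backend_supported():
        # Compile through TorchDynamo with the ONNX Runtime backend.
        compiled_model = torch.compile(model, backend="onnxrt")
        output = compiled_model(x)
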
719
docs/source/onnx_torchscript.rst
Normal file
@ -0,0 +1,719 @@
TorchScript-based ONNX Exporter
===============================

.. note::
   To export an ONNX model using TorchDynamo instead of TorchScript, see :func:`torch.onnx.dynamo_export`.

.. contents:: :local:

Example: AlexNet from PyTorch to ONNX
-------------------------------------

Here is a simple script which exports a pretrained AlexNet to an ONNX file named ``alexnet.onnx``.
The call to ``torch.onnx.export`` runs the model once to trace its execution and then exports the
traced model to the specified file::

    import torch
    import torchvision

    dummy_input = torch.randn(10, 3, 224, 224, device="cuda")
    model = torchvision.models.alexnet(pretrained=True).cuda()

    # Providing input and output names sets the display names for values
    # within the model's graph. Setting these does not change the semantics
    # of the graph; it is only for readability.
    #
    # The inputs to the network consist of the flat list of inputs (i.e.
    # the values you would pass to the forward() method) followed by the
    # flat list of parameters. You can partially specify names, i.e. provide
    # a list here shorter than the number of inputs to the model, and we will
    # only set that subset of names, starting from the beginning.
    input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
    output_names = [ "output1" ]

    torch.onnx.export(model, dummy_input, "alexnet.onnx", verbose=True, input_names=input_names, output_names=output_names)

The resulting ``alexnet.onnx`` file contains a binary `protocol buffer <https://developers.google.com/protocol-buffers/>`_
which contains both the network structure and parameters of the model you exported
(in this case, AlexNet). The argument ``verbose=True`` causes the
exporter to print out a human-readable representation of the model::

    # These are the inputs and parameters to the network, which have taken on
    # the names we specified earlier.
    graph(%actual_input_1 : Float(10, 3, 224, 224)
          %learned_0 : Float(64, 3, 11, 11)
          %learned_1 : Float(64)
          %learned_2 : Float(192, 64, 5, 5)
          %learned_3 : Float(192)
          # ---- omitted for brevity ----
          %learned_14 : Float(1000, 4096)
          %learned_15 : Float(1000)) {
      # Every statement consists of some output tensors (and their types),
      # the operator to be run (with its attributes, e.g., kernels, strides,
      # etc.), its input tensors (%actual_input_1, %learned_0, %learned_1)
      %17 : Float(10, 64, 55, 55) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[11, 11], pads=[2, 2, 2, 2], strides=[4, 4]](%actual_input_1, %learned_0, %learned_1), scope: AlexNet/Sequential[features]/Conv2d[0]
      %18 : Float(10, 64, 55, 55) = onnx::Relu(%17), scope: AlexNet/Sequential[features]/ReLU[1]
      %19 : Float(10, 64, 27, 27) = onnx::MaxPool[kernel_shape=[3, 3], pads=[0, 0, 0, 0], strides=[2, 2]](%18), scope: AlexNet/Sequential[features]/MaxPool2d[2]
      # ---- omitted for brevity ----
      %29 : Float(10, 256, 6, 6) = onnx::MaxPool[kernel_shape=[3, 3], pads=[0, 0, 0, 0], strides=[2, 2]](%28), scope: AlexNet/Sequential[features]/MaxPool2d[12]
      # Dynamic means that the shape is not known. This may be because of a
      # limitation of our implementation (which we would like to fix in a
      # future release) or shapes which are truly dynamic.
      %30 : Dynamic = onnx::Shape(%29), scope: AlexNet
      %31 : Dynamic = onnx::Slice[axes=[0], ends=[1], starts=[0]](%30), scope: AlexNet
      %32 : Long() = onnx::Squeeze[axes=[0]](%31), scope: AlexNet
      %33 : Long() = onnx::Constant[value={9216}](), scope: AlexNet
      # ---- omitted for brevity ----
      %output1 : Float(10, 1000) = onnx::Gemm[alpha=1, beta=1, broadcast=1, transB=1](%45, %learned_14, %learned_15), scope: AlexNet/Sequential[classifier]/Linear[6]
      return (%output1);
    }

You can also verify the output using the `ONNX <https://github.com/onnx/onnx/>`_ library,
which you can install using ``pip``::

    pip install onnx

Then, you can run::

    import onnx

    # Load the ONNX model
    model = onnx.load("alexnet.onnx")

    # Check that the model is well formed
    onnx.checker.check_model(model)

    # Print a human readable representation of the graph
    print(onnx.helper.printable_graph(model.graph))

You can also run the exported model with one of the many
`runtimes that support ONNX <https://onnx.ai/supported-tools.html#deployModel>`_.
For example, after installing `ONNX Runtime <https://www.onnxruntime.ai>`_, you can
load and run the model::

    import onnxruntime as ort
    import numpy as np

    ort_session = ort.InferenceSession("alexnet.onnx")

    outputs = ort_session.run(
        None,
        {"actual_input_1": np.random.randn(10, 3, 224, 224).astype(np.float32)},
    )
    print(outputs[0])

Here is a more involved `tutorial on exporting a model and running it with ONNX Runtime <https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html>`_.

.. _tracing-vs-scripting:

Tracing vs Scripting
--------------------

Internally, :func:`torch.onnx.export()` requires a :class:`torch.jit.ScriptModule` rather than
a :class:`torch.nn.Module`. If the passed-in model is not already a ``ScriptModule``,
``export()`` will use *tracing* to convert it to one:

.. TODO(justinchuby): Add a word on recommending tracing over scripting for most use cases.

* **Tracing**: If ``torch.onnx.export()`` is called with a Module that is not already a
  ``ScriptModule``, it first does the equivalent of :func:`torch.jit.trace`, which executes the model
  once with the given ``args`` and records all operations that happen during that execution. This
  means that if your model is dynamic, e.g., changes behavior depending on input data, the exported
  model will *not* capture this dynamic behavior.
  We recommend examining the exported model and making sure the operators look
  reasonable. Tracing will unroll loops and if statements, exporting a static graph that is exactly
  the same as the traced run. If you want to export your model with dynamic control flow, you will
  need to use *scripting*.

* **Scripting**: Compiling a model via scripting preserves dynamic control flow and is valid for inputs
  of different sizes. To use scripting:

  * Use :func:`torch.jit.script` to produce a ``ScriptModule``.
  * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model. The ``args`` are still required,
    but they will be used internally only to produce example outputs, so that the types and shapes of the
    outputs can be captured. No tracing will be performed.

See `Introduction to TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_
and `TorchScript <jit.html>`_ for more details, including how to compose tracing and scripting to suit the
particular requirements of different models.

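As a minimal sketch of the scripting path described above (the module and file
name here are hypothetical)::

    import torch

    class Loop(torch.nn.Module):
        def forward(self, x, n: int):
            for _ in range(n):
                x = x + 1  # data-dependent loop, preserved by scripting
            return x

    scripted = torch.jit.script(Loop())
    # args are used only to compute example outputs; no tracing happens.
    torch.onnx.export(scripted, (torch.randn(2, 2), 3), "loop.onnx",
                      input_names=["x", "n"], opset_version=13)
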
Avoiding Pitfalls
-----------------

Avoid NumPy and built-in Python types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch models can be written using NumPy or Python types and functions, but
during :ref:`tracing<tracing-vs-scripting>`, any variables of NumPy or Python
types (rather than torch.Tensor) are converted to constants, which will produce
the wrong result if those values should change depending on the inputs.

For example, rather than using numpy functions on numpy.ndarrays: ::

    # Bad! Will be replaced with constants during tracing.
    x, y = np.random.rand(1, 2), np.random.rand(1, 2)
    np.concatenate((x, y), axis=1)

Use torch operators on torch.Tensors: ::

    # Good! Tensor operations will be captured during tracing.
    x, y = torch.randn(1, 2), torch.randn(1, 2)
    torch.cat((x, y), dim=1)


And rather than use :func:`torch.Tensor.item` (which converts a Tensor to a Python
built-in number): ::

    # Bad! y.item() will be replaced with a constant during tracing.
    def forward(self, x, y):
        return x.reshape(y.item(), -1)

Use torch's support for implicit casting of single-element tensors: ::

    # Good! y will be preserved as a variable during tracing.
    def forward(self, x, y):
        return x.reshape(y, -1)

Avoid Tensor.data
^^^^^^^^^^^^^^^^^

Using the Tensor.data field can produce an incorrect trace and therefore an incorrect ONNX graph.
Use :func:`torch.Tensor.detach` instead. (Work is ongoing to
`remove Tensor.data entirely <https://github.com/pytorch/pytorch/issues/30987>`_).

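For example, a minimal illustration: ::

    # Bad! x.data bypasses autograd and may be recorded incorrectly.
    y = x.data

    # Good! x.detach() is the supported way to get a detached view.
    y = x.detach()
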
Avoid in-place operations when using tensor.shape in tracing mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In tracing mode, shapes obtained from ``tensor.shape`` are traced as tensors,
and share the same memory. This might cause a mismatch in the final output values.
As a workaround, avoid the use of in-place operations in these scenarios.
For example, in the model::

    class Model(torch.nn.Module):
        def forward(self, states):
            batch_size, seq_length = states.shape[:2]
            real_seq_length = seq_length
            real_seq_length += 2
            return real_seq_length + seq_length

``real_seq_length`` and ``seq_length`` share the same memory in tracing mode.
This could be avoided by rewriting the in-place operation::

    real_seq_length = real_seq_length + 2

Limitations
-----------

Types
^^^^^

* Only :class:`torch.Tensors`, numeric types that can be trivially converted to torch.Tensors (e.g. float, int),
  and tuples and lists of those types are supported as model inputs or outputs. Dict and str inputs and
  outputs are accepted in :ref:`tracing<tracing-vs-scripting>` mode, but:

  * Any computation that depends on the value of a dict or a str input **will be replaced with the
    constant value** seen during the one traced execution.
  * Any output that is a dict will be silently replaced with a **flattened sequence of its values
    (keys will be removed)**. E.g. ``{"foo": 1, "bar": 2}`` becomes ``(1, 2)``.
  * Any output that is a str will be silently removed.

* Certain operations involving tuples and lists are not supported in
  :ref:`scripting<tracing-vs-scripting>` mode due to limited support in ONNX for nested sequences.
  In particular, appending a tuple to a list is not supported. In tracing mode, the nested sequences
  will be flattened automatically during the tracing.

Differences in Operator Implementations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to differences in implementations of operators, running the exported model on different runtimes
may produce different results from each other or from PyTorch. Normally these differences are
numerically small, so this should only be a concern if your application is sensitive to these
small differences.

.. _tensor-indexing:

Unsupported Tensor Indexing Patterns
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Tensor indexing patterns that cannot be exported are listed below.
If you are experiencing issues exporting a model that does not include any of
the unsupported patterns below, please double check that you are exporting with
the latest ``opset_version``.

Reads / Gets
|
||||
~~~~~~~~~~~~
|
||||
|
||||
When indexing into a tensor for reading, the following patterns are not supported: ::
|
||||
|
||||
# Tensor indices that includes negative values.
|
||||
data[torch.tensor([[1, 2], [2, -3]]), torch.tensor([-2, 3])]
|
||||
# Workarounds: use positive index values.
|
||||
|
||||
Writes / Sets
~~~~~~~~~~~~~

When indexing into a Tensor for writing, the following patterns are not supported::

    # Multiple tensor indices if any has rank >= 2
    data[torch.tensor([[1, 2], [2, 3]]), torch.tensor([2, 3])] = new_data
    # Workaround: use a single tensor index with rank >= 2,
    #             or multiple consecutive tensor indices with rank == 1.

    # Multiple tensor indices that are not consecutive
    data[torch.tensor([2, 3]), :, torch.tensor([1, 2])] = new_data
    # Workaround: transpose `data` such that tensor indices are consecutive.

    # Tensor indices that include negative values.
    data[torch.tensor([1, -2]), torch.tensor([-2, 3])] = new_data
    # Workaround: use positive index values.

    # Implicit broadcasting required for new_data.
    data[torch.tensor([[0, 2], [1, 1]]), 1:3] = new_data
    # Workaround: expand new_data explicitly (see the sketch below).
    # Example:
    #   data shape:     [3, 4, 5]
    #   new_data shape: [5]
    #   expected new_data shape after broadcasting: [2, 2, 2, 5]
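
A sketch of that last workaround (our example, using the shapes from the comment
above)::

    import torch

    data = torch.randn(3, 4, 5)
    new_data = torch.randn(5)

    # Expand new_data to the exact shape the indexed assignment produces
    # ([2, 2, 2, 5]) so no implicit broadcasting is needed during export.
    data[torch.tensor([[0, 2], [1, 1]]), 1:3] = new_data.expand(2, 2, 2, 5)
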
Adding support for operators
----------------------------

When exporting a model that includes unsupported operators, you'll see an error message like:

.. code-block:: text

    RuntimeError: ONNX export failed: Couldn't export operator foo

When that happens, there are a few things you can do:

#. Change the model to not use that operator.
#. Create a symbolic function to convert the operator and register it as a custom symbolic function.
#. Contribute to PyTorch to add the same symbolic function to :mod:`torch.onnx` itself.

If you decided to implement a symbolic function (we hope you will contribute it back to PyTorch!), here is how you can get started:
ONNX exporter internals
^^^^^^^^^^^^^^^^^^^^^^^

A "symbolic function" is a function that decomposes a PyTorch operator into a
composition of a series of ONNX operators.

During export, each node (which contains a PyTorch operator) in the TorchScript
graph is visited by the exporter in topological order.
Upon visiting a node, the exporter looks for a registered symbolic function for
that operator. Symbolic functions are implemented in Python. A symbolic function for
an op named ``foo`` would look something like::

    def foo(
        g,
        input_0: torch._C.Value,
        input_1: torch._C.Value) -> Union[None, torch._C.Value, List[torch._C.Value]]:
        """
        Adds the ONNX operations representing this PyTorch function by updating the
        graph g with `g.op()` calls.

        Args:
            g (Graph): graph to write the ONNX representation into.
            input_0 (Value): value representing the variables which contain
                the first input for this operator.
            input_1 (Value): value representing the variables which contain
                the second input for this operator.

        Returns:
            A Value or List of Values specifying the ONNX nodes that compute something
            equivalent to the original PyTorch operator with the given inputs.

            None if it cannot be converted to ONNX.
        """
        ...

The ``torch._C`` types are Python wrappers around the types defined in C++ in
`ir.h <https://github.com/pytorch/pytorch/blob/main/torch/csrc/jit/ir/ir.h>`_.

The process for adding a symbolic function depends on the type of operator.

.. _adding-support-aten:
ATen operators
^^^^^^^^^^^^^^

`ATen <https://pytorch.org/cppdocs/#aten>`_ is PyTorch's built-in tensor library.
If the operator is an ATen operator (shows up in the TorchScript graph with the prefix
``aten::``), make sure it is not supported already.

List of supported operators
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Visit the auto generated :doc:`list of supported TorchScript operators <../onnx_torchscript_supported_aten_ops>`
for details on which operators are supported in each ``opset_version``.

Adding support for an aten or quantized operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If the operator is not in the list above:

* Define the symbolic function in ``torch/onnx/symbolic_opset<version>.py``, for example
  `torch/onnx/symbolic_opset9.py <https://github.com/pytorch/pytorch/blob/main/torch/onnx/symbolic_opset9.py>`_.
  Make sure the function has the same name as the ATen function, which may be declared in
  ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at
  build time, so will not appear in your checkout until you build PyTorch).
* By default, the first arg is the ONNX graph.
  Other arg names must EXACTLY match the names in the ``.pyi`` file,
  because dispatch is done with keyword arguments.
* In the symbolic function, if the operator is in the
  `ONNX standard operator set <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_,
  we only need to create a node to represent the ONNX operator in the graph.
  If not, we can compose several standard operators that have
  equivalent semantics to the ATen operator.

Here is an example of handling a missing symbolic function for the ``ELU`` operator.
If we run the following code::

    print(
        torch.jit.trace(
            torch.nn.ELU(),  # module
            torch.ones(1)    # example input
        ).graph
    )

We see something like::

    graph(%self : __torch__.torch.nn.modules.activation.___torch_mangle_0.ELU,
          %input : Float(1, strides=[1], requires_grad=0, device=cpu)):
      %4 : float = prim::Constant[value=1.]()
      %5 : int = prim::Constant[value=1]()
      %6 : int = prim::Constant[value=1]()
      %7 : Float(1, strides=[1], requires_grad=0, device=cpu) = aten::elu(%input, %4, %5, %6)
      return (%7)

Since we see ``aten::elu`` in the graph, we know this is an ATen operator.

We check the `ONNX operator list <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_,
and confirm that ``Elu`` is standardized in ONNX.

We find a signature for ``elu`` in ``torch/nn/functional.pyi``::

    def elu(input: Tensor, alpha: float = ..., inplace: bool = ...) -> Tensor: ...

We add the following lines to ``symbolic_opset9.py``::

    def elu(g, input: torch.Value, alpha: torch.Value, inplace: bool = False):
        return g.op("Elu", input, alpha_f=alpha)

Now PyTorch is able to export models containing the ``aten::elu`` operator!

See the ``torch/onnx/symbolic_opset*.py`` files for more examples.
torch.autograd.Functions
^^^^^^^^^^^^^^^^^^^^^^^^

If the operator is a sub-class of :class:`torch.autograd.Function`, there are three ways
to export it.

Static Symbolic Method
~~~~~~~~~~~~~~~~~~~~~~

You can add a static method named ``symbolic`` to your function class. It should return
ONNX operators that represent the function's behavior in ONNX. For example::

    class MyRelu(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input: torch.Tensor) -> torch.Tensor:
            ctx.save_for_backward(input)
            return input.clamp(min=0)

        @staticmethod
        def symbolic(g: torch.Graph, input: torch.Value) -> torch.Value:
            return g.op("Clip", input, g.op("Constant", value_t=torch.tensor(0, dtype=torch.float)))
.. FIXME(justinchuby): PythonOps are too complicated and the example below
..   uses private methods we do not expose. We are looking to
..   improve the experience. Since SymbolicContext is deprecated, we think
..   defining a symbolic staticmethod is a better way to go for now.

.. PythonOp Symbolic
.. ~~~~~~~~~~~~~~~~~

.. Alternatively, you can register a custom symbolic function.
.. This gives the symbolic function access to more info through the
.. ``torch.onnx.SymbolicContext`` object, which gets passed in as the first
.. argument (before the ``Graph`` object).

.. All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes.
.. In order to differentiate between different ``Function`` subclasses, the
.. symbolic function should use the ``name`` kwarg which gets set to the name of the class.

.. Custom symbolic functions should add type and shape information by calling ``setType(...)``
.. on Value objects before returning them (implemented in C++ by
.. ``torch::jit::Value::setType``). This is not required, but it can help the exporter's
.. shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see
.. ``test_aten_embedding_2`` in
.. `test_operators.py <https://github.com/pytorch/pytorch/blob/main/test/onnx/test_operators.py>`_.

.. The example below shows how you can access ``requires_grad`` via the ``Node`` object:

..     class MyClip(torch.autograd.Function):
..         @staticmethod
..         def forward(ctx, input, min):
..             ctx.save_for_backward(input)
..             return input.clamp(min=min)

..     class MyRelu(torch.autograd.Function):
..         @staticmethod
..         def forward(ctx, input):
..             ctx.save_for_backward(input)
..             return input.clamp(min=0)

..     def symbolic_python_op(g: "GraphContext", *args, **kwargs):
..         n = ctx.cur_node
..         print("original node: ", n)
..         for i, out in enumerate(n.outputs()):
..             print("original output {}: {}, requires grad: {}".format(i, out, out.requiresGrad()))
..         import torch.onnx.symbolic_helper as sym_helper
..         for i, arg in enumerate(args):
..             requires_grad = arg.requiresGrad() if sym_helper._is_value(arg) else False
..             print("arg {}: {}, requires grad: {}".format(i, arg, requires_grad))

..         name = kwargs["name"]
..         ret = None
..         if name == "MyClip":
..             ret = g.op("Clip", args[0], args[1])
..         elif name == "MyRelu":
..             ret = g.op("Relu", args[0])
..         else:
..             # Logs a warning and returns None
..             return _unimplemented("prim::PythonOp", "unknown node kind: " + name)
..         # Copy type and shape from original node.
..         ret.setType(n.type())
..         return ret

..     from torch.onnx import register_custom_op_symbolic
..     register_custom_op_symbolic("prim::PythonOp", symbolic_python_op, 1)
Inline Autograd Function
~~~~~~~~~~~~~~~~~~~~~~~~

If a :class:`torch.autograd.Function` subclass neither provides a static symbolic method
nor has a custom symbolic function registered for its ``prim::PythonOp`` node,
:func:`torch.onnx.export` tries to inline the graph that corresponds to that :class:`torch.autograd.Function` such that
this function is broken down into the individual operators that were used within it.
The export should be successful as long as these individual operators are supported. For example::

    class MyLogExp(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input: torch.Tensor) -> torch.Tensor:
            ctx.save_for_backward(input)
            h = input.exp()
            return h.log().log()

There is no static symbolic method present for this model, yet it is exported as follows::

    graph(%input : Float(1, strides=[1], requires_grad=0, device=cpu)):
        %1 : float = onnx::Exp[](%input)
        %2 : float = onnx::Log[](%1)
        %3 : float = onnx::Log[](%2)
        return (%3)

If you need to avoid inlining of :class:`torch.autograd.Function`, you should export models with
``operator_export_type`` set to ``ONNX_FALLTHROUGH`` or ``ONNX_ATEN_FALLBACK``.
Custom operators
^^^^^^^^^^^^^^^^

You can export your model with custom operators that include a combination of many standard ONNX ops,
or are driven by a self-defined C++ backend.

ONNX-script functions
~~~~~~~~~~~~~~~~~~~~~

If an operator is not a standard ONNX op, but can be composed of multiple existing ONNX ops, you can utilize
`ONNX-script <https://github.com/microsoft/onnx-script>`_ to create an external ONNX function to support the operator.
You can export it by following this example::

    import onnxscript
    # There are three opset versions that need to be aligned.
    # This is (1) the opset version of the ONNX function.
    from onnxscript.onnx_opset import opset15 as op
    opset_version = 15

    x = torch.randn(1, 2, 3, 4, requires_grad=True)
    model = torch.nn.SELU()

    custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)

    @onnxscript.script(custom_opset)
    def Selu(X):
        alpha = 1.67326  # auto wrapped as Constants
        gamma = 1.0507
        alphaX = op.CastLike(alpha, X)
        gammaX = op.CastLike(gamma, X)
        neg = gammaX * (alphaX * op.Exp(X) - alphaX)
        pos = gammaX * X
        zero = op.CastLike(0, X)
        return op.Where(X <= zero, neg, pos)

    # setType API provides shape/type to ONNX shape/type inference
    def custom_selu(g: jit_utils.GraphContext, X):
        return g.onnxscript_op(Selu, X).setType(X.type())

    # Register custom symbolic function
    # There are three opset versions that need to be aligned.
    # This is (2) the opset version of the registry.
    torch.onnx.register_custom_op_symbolic(
        symbolic_name="aten::selu",
        symbolic_fn=custom_selu,
        opset_version=opset_version,
    )

    # There are three opset versions that need to be aligned.
    # This is (3) the opset version of the exporter.
    torch.onnx.export(
        model,
        x,
        "model.onnx",
        opset_version=opset_version,
        # only needed if you want to specify an opset version > 1.
        custom_opsets={"onnx-script": 2}
    )

The example above exports it as a custom operator in the "onnx-script" opset.
When exporting a custom operator, you can specify the custom domain version using the
``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.

NOTE: Be careful to align the three opset versions mentioned in the example above, and make sure
they are the versions consumed in the exporter step. Writing ONNX-script functions is in beta and
subject to the active development of ONNX-script, so please follow the latest
`ONNX-script <https://github.com/microsoft/onnx-script>`_ documentation.
C++ Operators
~~~~~~~~~~~~~

If a model uses a custom operator implemented in C++ as described in
`Extending TorchScript with Custom C++ Operators <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`_,
you can export it by following this example::

    from torch.onnx import symbolic_helper


    # Define custom symbolic function
    @symbolic_helper.parse_args("v", "v", "f", "i")
    def symbolic_foo_forward(g, input1, input2, attr1, attr2):
        return g.op("custom_domain::Foo", input1, input2, attr1_f=attr1, attr2_i=attr2)


    # Register custom symbolic function
    torch.onnx.register_custom_op_symbolic("custom_ops::foo_forward", symbolic_foo_forward, 9)


    class FooModel(torch.nn.Module):
        def __init__(self, attr1, attr2):
            super().__init__()
            self.attr1 = attr1
            self.attr2 = attr2

        def forward(self, input1, input2):
            # Calling custom op
            return torch.ops.custom_ops.foo_forward(input1, input2, self.attr1, self.attr2)


    model = FooModel(attr1, attr2)
    torch.onnx.export(
        model,
        (example_input1, example_input2),
        "model.onnx",
        # only needed if you want to specify an opset version > 1.
        custom_opsets={"custom_domain": 2}
    )

The example above exports it as a custom operator in the "custom_domain" opset.
When exporting a custom operator, you can specify the custom domain version using the
``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.

The runtime that consumes the model needs to support the custom op. See
`Caffe2 custom ops <https://caffe2.ai/docs/custom-operators.html>`_,
`ONNX Runtime custom ops <https://onnxruntime.ai/docs/reference/operators/add-custom-op.html>`_,
or your runtime of choice's documentation.
Discovering all unconvertible ATen ops at once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When export fails due to an unconvertible ATen op, there may in fact be more
than one such op but the error message only mentions the first. To discover
all of the unconvertible ops in one go you can::

    # prepare model, args, opset_version
    ...

    torch_script_graph, unconvertible_ops = torch.onnx.utils.unconvertible_ops(
        model, args, opset_version=opset_version
    )

    print(set(unconvertible_ops))

The set is approximate because some ops may be removed during the conversion
process and don't need to be converted. Some other ops may have partial support
that will fail conversion with particular inputs, but this should give you a
general idea of which ops are not supported. Please feel free to open GitHub issues
for op support requests.
Frequently Asked Questions
--------------------------
Q: I have exported my LSTM model, but its input size seems to be fixed?

  The tracer records the shapes of the example inputs. If the model should accept
  inputs of dynamic shapes, set ``dynamic_axes`` when calling :func:`torch.onnx.export`.
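  As a sketch (the axis names and file name here are ours; pick your own)::

      model = torch.nn.LSTM(10, 20, 2)
      dummy = torch.randn(5, 3, 10)  # (seq_len, batch, input_size)

      # Mark the sequence-length and batch dimensions of the first input and
      # the first output as dynamic, so other sizes are accepted at runtime.
      torch.onnx.export(
          model,
          (dummy,),
          "lstm.onnx",
          input_names=["input"],
          output_names=["output"],
          dynamic_axes={
              "input": {0: "seq_len", 1: "batch"},
              "output": {0: "seq_len", 1: "batch"},
          },
      )
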
Q: How to export models containing loops?

  See `Tracing vs Scripting`_.
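  As a sketch (module and file names are ours), scripting captures a
  data-dependent loop as an ONNX ``Loop`` instead of unrolling it at trace time::

      class Repeat(torch.nn.Module):
          def forward(self, x, n: int):
              for _ in range(n):
                  x = x + 1
              return x

      scripted = torch.jit.script(Repeat())
      torch.onnx.export(scripted, (torch.randn(2), 3), "repeat.onnx", opset_version=11)
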
Q: How to export models with primitive type inputs (e.g. int, float)?

  Support for primitive numeric type inputs was added in PyTorch 1.9.
  However, the exporter does not support models with str inputs.
Q: Does ONNX support implicit scalar datatype casting?

  The ONNX standard does not, but the exporter will try to handle that part.
  Scalars are exported as constant tensors.
  The exporter will figure out the right data type for scalars. In rare cases when it is unable
  to do so, you will need to manually specify the datatype with e.g. ``dtype=torch.float32``.
  If you see any errors, please `create a GitHub issue <https://github.com/pytorch/pytorch/issues>`_.
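  As a sketch of pinning the dtype manually (module and file names are ours)::

      class AddTwo(torch.nn.Module):
          def forward(self, x):
              # The exporter turns the scalar into a constant tensor; spelling
              # out the dtype removes any ambiguity about what it should be.
              return x + torch.tensor(2, dtype=torch.float32)

      torch.onnx.export(AddTwo(), (torch.randn(3),), "add_two.onnx")
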
Q: Are lists of Tensors exportable to ONNX?

  Yes, for ``opset_version`` >= 11, since ONNX introduced the Sequence type in opset 11.

Python API
----------

.. automodule:: torch.onnx

Functions
^^^^^^^^^

.. autofunction:: export
.. autofunction:: export_to_pretty_string
.. autofunction:: register_custom_op_symbolic
.. autofunction:: unregister_custom_op_symbolic
.. autofunction:: select_model_mode_for_export
.. autofunction:: is_in_onnx_export
.. autofunction:: enable_log
.. autofunction:: disable_log
.. autofunction:: torch.onnx.verification.find_mismatch

Classes
^^^^^^^

.. autosummary::
    :toctree: generated
    :nosignatures:
    :template: classtemplate.rst

    JitScalarType
    torch.onnx.verification.GraphInfo
    torch.onnx.verification.VerificationOptions
@ -5,7 +5,7 @@ ONNX supported TorchScript operators

.. This file is automatically generated during the documentation build
.. by cross referencing ONNX operator symbolics with TorchScript operators via
.. ``docs/source/scripts/build_onnx_supported_aten_op_csv_table.py``.
.. ``docs/source/scripts/build_onnx_torchscript_supported_aten_op_csv_table.py``.
.. Do not modify directly and instead `rebuild the docs <https://github.com/pytorch/pytorch#building-the-documentation>`_.

This page lists the TorchScript operators that are supported/unsupported by ONNX export.
@ -119,7 +119,9 @@ def generate_index_rst(example_cases, tag_to_modules, support_level_to_modules):
        blurb = file.read()

    # Generate contents of the .rst file
    doc_contents = f"""ExportDB
    doc_contents = f""".. _torch.export_db:

ExportDB
========

{blurb}
@ -575,6 +575,7 @@ Tensor class reference
    Tensor.reciprocal_
    Tensor.record_stream
    Tensor.register_hook
    Tensor.register_post_accumulate_grad_hook
    Tensor.remainder
    Tensor.remainder_
    Tensor.renorm
@ -37,7 +37,7 @@ TorchDynamo requires a backend that converts the captured graphs into a fast
machine code. Different backends can result in various optimization gains.
The default backend is called TorchInductor, also known as *inductor*.
TorchDynamo has a list of supported backends developed by our partners,
which can be seen by running ``torch.compile.list_backends()``, each of which
which can be seen by running ``torch.compiler.list_backends()``, each of which
with its optional dependencies.

Some of the most commonly used backends include:
@ -54,6 +54,10 @@ Some of the most commonly used backends include:
     - Uses the TorchInductor backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
   * - ``torch.compile(m, backend="cudagraphs")``
     - CUDA graphs with AOT Autograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
   * - ``torch.compile(m, backend="ipex")``
     - Uses IPEX on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
   * - ``torch.compile(m, backend="onnxrt")``
     - Uses ONNX Runtime for training on CPU/GPU. :doc:`Read more <onnx_dynamo_onnxruntime_backend>`

**Inference-only backends**
@ -63,10 +67,8 @@ Some of the most commonly used backends include:

   * - Backend
     - Description
   * - ``torch.compile(m, backend="onnxrt")``
     - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
   * - ``torch.compile(m, backend="tensorrt")``
     - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
     - Uses ONNX Runtime to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
   * - ``torch.compile(m, backend="ipex")``
     - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
   * - ``torch.compile(m, backend="tvm")``
@ -19,25 +19,27 @@ In supporting dynamic shapes, we chose not to support dynamic rank programs, e.g
Abridged public API
-------------------

The eventual plan:
The default dynamic behavior in PyTorch 2.1 is:

- PT2 assumes everything is static by default
- If we recompile because a size changed, we will instead attempt to recompile that size as being dynamic (so we will never recompile because of that size again)
- If you know ahead of time something will be dynamic, you can skip the first recompile with ``torch._dynamo.mark_dynamic(tensor, dim)``
- If you say ``torch.compile(dynamic=True)`` we will attempt to make as much dynamic as possible

Unbacked integers for eager mode:
- If we recompile because a size changed, we will instead attempt to recompile
  that size as being dynamic (sizes that have changed are likely to change in
  the future). This generalization may fail (e.g., because user code does a
  conditional branch on the size in question or missing dynamic shapes support
  in PT2). If you are trying to understand why PT2 has overspecialized some
  code, run with ``TORCH_LOGS=dynamic`` and look for "eval" entries that say
  when guards are added and why.

What we have currently:
- If you know ahead of time something will be dynamic, you can skip the first
  recompile with ``torch._dynamo.mark_dynamic(tensor, dim)`` (see the sketch
  after this list).

- You must explicitly opt into dynamic shapes with ``torch._dynamo.config.automatic_dynamic_shapes = True`` or ``torch.compile(dynamic=True)``
- ``torch.compile(dynamic=True)`` proactively attempts to make everything dynamic
- ``torch._dynamo.config.automatic_dynamic_shapes`` will assume everything is
  static, but if we recompile because a size varied, the next time we will try
  to compile it dynamically
- ``torch._dynamo.mark_dynamic`` works

Use ``TORCH_LOGS=dynamic`` to view more information about what is going on with dynamic shapes.
- If you say ``torch.compile(dynamic=False)``, we will turn off automatic
  dynamic shapes on recompiles and always recompile for each distinct size.
  Conversely, if you say ``torch.compile(dynamic=True)``, we will try to make
  everything as dynamic as possible. This is mostly useful for small
  operators; if you try it on a big model it will (1) probably crash PT2 and
  (2) run slow for no good reason.
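
A sketch of ``mark_dynamic`` under these semantics (our example)::

    import torch

    @torch.compile
    def f(x):
        return x * x

    x = torch.randn(8)
    # Mark dim 0 as dynamic before the first call so the initial compile is
    # already dynamic and later size changes do not trigger a recompile.
    torch._dynamo.mark_dynamic(x, 0)
    f(x)
    f(torch.randn(16))  # reuses the size-generic compiled code
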
The Guard Model
---------------
@ -114,3 +116,10 @@ Naively implemented, this is too restrictive: most PyTorch programs will immedia
- On tensor creation, PyTorch precomputes a lot of data about a tensor; for example, if you use ``empty_strided`` to create a tensor, we will eagerly sort the strides and determine if the tensor is non-overlapping and dense. Sorts produce a lot of guards. However, it is more common to produce a tensor directly with a higher-level API like ``empty``, which is guaranteed to produce a non-overlapping and dense tensor. We modified PyTorch to avoid needlessly recomputing these properties.
- Even if nontrivial compute is needed, sometimes a property is never actually queried at all. Making these precomputed properties lazy allows us to avoid guarding on an unbacked symbolic integer unless it is actually needed.
- The data in an integer tensor is generally not known to be non-negative. However, we provide an API ``constrain_range`` whereby a user can specify that a size is bounded above and below by known limits.

In future versions of PT2 (beyond PT2.1), we will extend our reasoning system
to infer that an unbacked symbolic integer is size-like based on usage. For
example, if you pass the result of an ``.item()`` call to a factory function
like ``torch.empty``, we will automatically infer that the result is a size
(because if it was not, it would fail). This assumption would get validated
at runtime, raising an error if it was not fulfilled.
@ -317,8 +317,8 @@ them by default: ``env TORCHDYNAMO_DYNAMIC_SHAPES=0 python model.py`` 2.
CUDA graphs with Triton are enabled by default in inductor but removing
them may alleviate some OOM issues: ``torch._inductor.config.triton.cudagraphs = False``.

``torch.func`` works with ``torch.compile`` (for `grad` and `vmap` transforms)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Does ``torch.func`` work with ``torch.compile`` (for `grad` and `vmap` transforms)?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Applying a ``torch.func`` transform to a function that uses ``torch.compile``
does not work:
@ -528,6 +528,160 @@ invokes an ``nn.Module``. This is because the outputs now depend on the
parameters of the ``nn.Module``. To get this to work, use
``torch.func.functional_call`` to extract the module state.

Does NumPy work with ``torch.compile``?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Starting in 2.1, ``torch.compile`` understands native NumPy programs that
work on NumPy arrays, and mixed PyTorch-NumPy programs that convert from PyTorch
to NumPy and back via ``x.numpy()``, ``torch.from_numpy``, and related functions.

.. _nonsupported-numpy-feats:

Which NumPy features does ``torch.compile`` support?
----------------------------------------------------

NumPy within ``torch.compile`` follows the NumPy 2.0 pre-release.

Generally, ``torch.compile`` is able to trace through most NumPy constructions,
and when it cannot, it falls back to eager and lets NumPy execute that piece of
code. Even then, there are a few features where ``torch.compile`` semantics
slightly deviate from those of NumPy:

- NumPy scalars: We model them as 0-D arrays. That is, ``np.float32(3)`` returns
  a 0-D array under ``torch.compile``. To avoid a graph break, it is best to use this 0-D
  array. If this breaks your code, you can work around this by casting the NumPy scalar
  to the relevant Python scalar type ``bool/int/float``.

- Negative strides: ``np.flip`` and slicing with a negative step return a copy.

- Type promotion: NumPy's type promotion will change in NumPy 2.0. The new rules
  are described in `NEP 50 <https://numpy.org/neps/nep-0050-scalar-promotion.html>`__.
  ``torch.compile`` implements NEP 50 rather than the current soon-to-be deprecated rules.

- ``{tril,triu}_indices_from/{tril,triu}_indices`` return arrays rather than a tuple of arrays.

There are other features for which we do not support tracing and we gracefully
fall back to NumPy for their execution:

- Non-numeric dtypes like datetimes, strings, chars, void, structured dtypes and recarrays.

- Long dtypes ``np.float128/np.complex256`` and some unsigned dtypes ``np.uint16/np.uint32/np.uint64``.

- ``ndarray`` subclasses.

- Masked arrays.

- Esoteric ufunc machinery like ``axes=[(n,k),(k,m)->(n,m)]`` and ufunc methods (e.g., ``np.add.reduce``).

- Sorting / ordering ``complex64/complex128`` arrays.

- NumPy ``np.poly1d`` and ``np.polynomial``.

- Positional ``out1, out2`` args in functions with 2 or more returns (``out=tuple`` does work).

- ``__array_function__``, ``__array_interface__`` and ``__array_wrap__``.

- ``ndarray.ctypes`` attribute.
Can I execute NumPy code on CUDA via ``torch.compile``?
-------------------------------------------------------

Yes you can! To do so, you may simply execute your code within a ``torch.device("cuda")``
context. Consider the example

.. code-block:: python

   import torch
   import numpy as np

   @torch.compile
   def numpy_fn(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
       return np.sum(X[:, :, None] * Y[:, None, :], axis=(-2, -1))

   X = np.random.randn(1024, 64)
   Y = np.random.randn(1024, 64)
   with torch.device("cuda"):
       Z = numpy_fn(X, Y)


In this example, ``numpy_fn`` will be executed in CUDA. For this to be
possible, ``torch.compile`` automatically moves ``X`` and ``Y`` from CPU
to CUDA, and then it moves the result ``Z`` from CUDA to CPU. If we are
executing this function several times in the same program run, we may want
to avoid all these rather expensive memory copies. To do so, we just need
to tweak our ``numpy_fn`` so that it accepts cuda Tensors and returns tensors:

.. code-block:: python

   @torch.compile
   def numpy_fn(X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
       X, Y = X.numpy(), Y.numpy()
       Z = np.sum(X[:, :, None] * Y[:, None, :], axis=(-2, -1))
       return torch.from_numpy(Z)

   X = torch.randn(1024, 64, device="cuda")
   Y = torch.randn(1024, 64, device="cuda")
   with torch.device("cuda"):
       Z = numpy_fn(X, Y)

By doing this, we explicitly create the tensors in CUDA memory, and we keep
them there. In this case ``X.numpy()`` and ``from_numpy()`` are hints to the compiler
but no real data movement happens. Note that the original program would not run
in eager mode now. If you want to run it in eager mode, you would need to call
``.numpy(force=True)`` and do ``Z = Z.cuda()`` before returning
``Z``. Of course, doing this would execute the program in eager mode NumPy, and
on CPU.
How do I debug NumPy code under ``torch.compile``?
--------------------------------------------------

Debugging JIT compiled code is challenging, given the complexity of modern
compilers and the daunting errors that they raise.
`The tutorial on how to diagnose runtime errors within torch.compile <https://pytorch.org/docs/main/torch.compiler_troubleshooting.html#diagnosing-runtime-errors>`__
contains a few tips and tricks on how to tackle this task.

If the above is not enough to pinpoint the origin of the issue, there are still
a few other NumPy-specific tools we can use. We can discern whether the bug
is entirely in the PyTorch code by disabling tracing through NumPy functions:

.. code-block:: python

   from torch._dynamo import config
   config.trace_numpy = False

If the bug lies in the traced NumPy code, we can execute the NumPy code eagerly (without ``torch.compile``)
using PyTorch as a backend by importing ``import torch._numpy as np``.
This should just be used for **debugging purposes** and is in no way a
replacement for the PyTorch API, as it is **much less performant** and, as a
private API, **may change without notice**. At any rate, ``torch._numpy`` is a
Python implementation of NumPy in terms of PyTorch and it is used internally by ``torch.compile`` to
transform NumPy code into PyTorch code. It is rather easy to read and modify,
so if you find any bug in it feel free to submit a PR fixing it or simply open
an issue.

If the program does work when importing ``torch._numpy as np``, chances are
that the bug is in TorchDynamo. If this is the case, please feel free to open an issue
with a `minimal reproducer <https://pytorch.org/docs/2.1/torch.compiler_troubleshooting.html>`__.
I ``torch.compile`` some NumPy code and I did not see any speed-up.
-------------------------------------------------------------------

The best place to start is the
`tutorial with general advice for how to debug these sort of torch.compile issues <https://pytorch.org/docs/main/torch.compiler_faq.html#why-am-i-not-seeing-speedups>`__.

Some graph breaks may happen because of the use of unsupported features. See
:ref:`nonsupported-numpy-feats`. More generally, it is useful to keep in mind
that some widely used NumPy features do not play well with compilers. For
example, in-place modifications make reasoning difficult within the compiler and
often yield worse performance than their out-of-place counterparts. As such, it is best to avoid
them. The same goes for the use of the ``out=`` parameter. Instead, prefer
out-of-place ops and let ``torch.compile`` optimize the memory use. The same goes
for data-dependent ops like masked indexing through boolean masks, or
data-dependent control flow like ``if`` or ``while`` constructions.


Which API to use for fine grain tracing?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -92,7 +92,7 @@ hub.
   opt_model(torch.randn(1,3,64,64))

And that is not the only available backend, you can run in a REPL
``torch.compile.list_backends()`` to see all the available backends. Try out the
``torch.compiler.list_backends()`` to see all the available backends. Try out the
``cudagraphs`` next as inspiration.

Using a pretrained model
@ -1,3 +1,5 @@
.. _torch.compiler_ir:

IRs
===============
@ -1,6 +1,6 @@
Pod::Spec.new do |s|
    s.name = 'LibTorch-Lite'
    s.version = '1.13.0'
    s.version = 'IOS_BUILD_VERSION'
    s.authors = 'PyTorch Team'
    s.license = { :type => 'BSD' }
    s.homepage = 'https://github.com/pytorch/pytorch'
@ -33,5 +33,5 @@ Pod::Spec.new do |s|
    'VALID_ARCHS' => 'x86_64 arm64'
    }
    s.library = ['c++', 'stdc++']
    s.frameworks = 'Accelerate'
    s.frameworks = 'Accelerate', 'MetalPerformanceShaders', 'CoreML'
end