mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-23 14:59:34 +08:00
Compare commits
3 Commits
dev/joona/
...
validate_f
Author | SHA1 | Date | |
---|---|---|---|
c9f7963aeb | |||
b774deb215 | |||
2de5e650db |
@ -85,7 +85,7 @@ fi
|
||||
else
|
||||
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
|
||||
|
||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
|
||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
|
||||
conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
|
||||
else
|
||||
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
|
||||
|
@ -230,10 +230,6 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
|
||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
export CMAKE_BUILD_TYPE=RelWithAssert
|
||||
fi
|
||||
|
||||
# Do not change workspace permissions for ROCm CI jobs
|
||||
# as it can leave workspace with bad permissions for cancelled jobs
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
|
@ -249,7 +249,9 @@ fi
|
||||
# This tests that the debug asserts are working correctly.
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
|
||||
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
|
||||
# TODO: Enable the check after we setup the build to run debug asserts without having
|
||||
# to do a full (and slow) debug build
|
||||
# (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
|
||||
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||
# Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
|
||||
echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
|
||||
|
@ -29,11 +29,6 @@ if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
|
||||
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
|
||||
fi
|
||||
|
||||
# this is special build with all dependencies packaged
|
||||
if [[ ${BUILD_NAME} == *-full* ]]; then
|
||||
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
|
||||
fi
|
||||
|
||||
# Sleep 2 minutes between retries for conda upload
|
||||
retry () {
|
||||
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")
|
||||
|
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
69b2a0adc2ec03ab99990d7e8be3d4510438c148
|
||||
b829e936f7cc61b48149f5f957a451a38bf2a178
|
||||
|
22
.github/merge_rules.yaml
vendored
22
.github/merge_rules.yaml
vendored
@ -286,7 +286,6 @@
|
||||
- test/cpp/dist_autograd/**
|
||||
- test/cpp/rpc/**
|
||||
approved_by:
|
||||
- wconstab
|
||||
- mrshenli
|
||||
- pritamdamania87
|
||||
- zhaojuanmao
|
||||
@ -313,25 +312,6 @@
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: DCP
|
||||
patterns:
|
||||
- torch/distributed/checkpoint/**
|
||||
approved_by:
|
||||
- LucasLLC
|
||||
- fegin
|
||||
- wz337
|
||||
- saumishr
|
||||
- daulet-askarov
|
||||
- pradeepdfb
|
||||
- kirtiteja
|
||||
- mhorowitz
|
||||
- saiteja64
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
|
||||
- name: IDEEP
|
||||
patterns:
|
||||
- third_party/ideep
|
||||
@ -407,7 +387,7 @@
|
||||
- torch/_inductor/codegen/cpp_template.py
|
||||
- torch/_inductor/codegen/cpp_gemm_template.py
|
||||
- test/inductor/test_mkldnn_pattern_matcher.py
|
||||
- test/inductor/test_cpu_repro.py
|
||||
- test/inductor/test_cpu_repo.py
|
||||
- test/inductor/test_cpu_cpp_wrapper.py
|
||||
- test/inductor/test_cpu_select_algorithm.py
|
||||
- aten/src/ATen/cpu/**
|
||||
|
30
.github/scripts/generate_binary_build_matrix.py
vendored
30
.github/scripts/generate_binary_build_matrix.py
vendored
@ -325,7 +325,7 @@ def generate_wheels_matrix(
|
||||
package_type = "manywheel"
|
||||
|
||||
if python_versions is None:
|
||||
python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
|
||||
python_versions = FULL_PYTHON_VERSIONS
|
||||
|
||||
if arches is None:
|
||||
# Define default compute archivectures
|
||||
@ -357,10 +357,6 @@ def generate_wheels_matrix(
|
||||
else arch_version
|
||||
)
|
||||
|
||||
# TODO: Enable python 3.13 on rocm, aarch64, windows
|
||||
if (gpu_arch_type == "rocm" or os != "linux") and python_version == "3.13":
|
||||
continue
|
||||
|
||||
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
|
||||
if (
|
||||
arch_version in ["12.4", "12.1", "11.8"]
|
||||
@ -400,7 +396,9 @@ def generate_wheels_matrix(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "True",
|
||||
"devtoolset": "",
|
||||
"devtoolset": (
|
||||
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
|
||||
),
|
||||
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
|
||||
"package_type": package_type,
|
||||
"pytorch_extra_install_requirements": (
|
||||
@ -413,26 +411,6 @@ def generate_wheels_matrix(
|
||||
),
|
||||
}
|
||||
)
|
||||
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
|
||||
if python_version == "3.10" and arch_version == "12.1":
|
||||
ret.append(
|
||||
{
|
||||
"python_version": python_version,
|
||||
"gpu_arch_type": gpu_arch_type,
|
||||
"gpu_arch_version": gpu_arch_version,
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "False",
|
||||
"devtoolset": "",
|
||||
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
|
||||
"package_type": package_type,
|
||||
"pytorch_extra_install_requirements": "",
|
||||
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
|
||||
".", "_"
|
||||
),
|
||||
}
|
||||
)
|
||||
else:
|
||||
ret.append(
|
||||
{
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
set -eoux pipefail
|
||||
|
||||
SYNC_BRANCH=pytorch-stable-prototype
|
||||
SYNC_BRANCH=fbcode/pytorch-stable-prototype
|
||||
|
||||
git config user.email "fake@example.com"
|
||||
git config user.name "PyTorch Stable Bot"
|
||||
@ -11,9 +11,7 @@ git fetch origin main
|
||||
git fetch origin "$SYNC_BRANCH"
|
||||
git checkout "$SYNC_BRANCH"
|
||||
|
||||
# Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo.
|
||||
# This specific SHA was chosen as it was before the "branch point" of the stable branch
|
||||
for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
|
||||
for SHA in $(git log 4333e122d4b74cdf84351ed2907045c6a767b4cd..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
|
||||
do
|
||||
# `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
|
||||
if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
|
||||
@ -22,12 +20,7 @@ do
|
||||
continue
|
||||
fi
|
||||
echo "Copying $SHA"
|
||||
git cherry-pick -x "$SHA" -X theirs
|
||||
git reset --soft HEAD~1
|
||||
git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed
|
||||
git checkout .
|
||||
git commit --reuse-message=HEAD@{1}
|
||||
git clean -f
|
||||
git cherry-pick -x "$SHA"
|
||||
done
|
||||
|
||||
if [[ "${WITH_PUSH}" == true ]]; then
|
||||
|
1
.github/scripts/td_llm_indexer.sh
vendored
1
.github/scripts/td_llm_indexer.sh
vendored
@ -7,7 +7,6 @@ cd llm-target-determinator
|
||||
pip install -q -r requirements.txt
|
||||
cd ../codellama
|
||||
pip install -e .
|
||||
pip install numpy==1.26.0
|
||||
|
||||
# Run indexer
|
||||
cd ../llm-target-determinator
|
||||
|
573
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
573
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -1844,71 +1844,6 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_10-cuda12_1-full-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -3671,511 +3606,3 @@ jobs:
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cpu-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cpu-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cpu-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cpu-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cpu-cxx11-abi-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu-cxx11-abi
|
||||
GPU_ARCH_TYPE: cpu-cxx11-abi
|
||||
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-cxx11-abi
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cpu-cxx11-abi-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cpu-cxx11-abi-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu-cxx11-abi
|
||||
GPU_ARCH_TYPE: cpu-cxx11-abi
|
||||
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-cxx11-abi
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cpu-cxx11-abi-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cpu-cxx11-abi-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu-cxx11-abi
|
||||
GPU_ARCH_TYPE: cpu-cxx11-abi
|
||||
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-cxx11-abi
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda11_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda11_8-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda11_8-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda11_8-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda11_8-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda11_8-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda11_8-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda11_8-split-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda11_8-split-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda11_8-split-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda12_1-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_1-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda12_1-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_1-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda12_1-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda12_1-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_1-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda12_1-split-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_1-split-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda12_1-split-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_4-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda12_4-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_4-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda12_4-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda12_4-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_4-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: manywheel-py3_13-cuda12_4-split-build
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_4-split-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda12_4-split-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
19
.github/workflows/upload-test-stats.yml
vendored
19
.github/workflows/upload-test-stats.yml
vendored
@ -28,8 +28,6 @@ jobs:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: ubuntu-22.04
|
||||
environment: upload-stats
|
||||
permissions:
|
||||
id-token: write
|
||||
name: Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}
|
||||
steps:
|
||||
- name: Print workflow information
|
||||
@ -40,13 +38,6 @@ jobs:
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
|
||||
- name: Configure aws credentials
|
||||
uses: aws-actions/configure-aws-credentials@v3
|
||||
continue-on-error: true
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats
|
||||
aws-region: us-east-1
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.11'
|
||||
@ -58,6 +49,8 @@ jobs:
|
||||
- name: Upload test artifacts
|
||||
id: upload-s3
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
WORKFLOW_ARTIFACTS_URL: ${{ github.event.workflow_run.artifacts_url }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
@ -73,6 +66,8 @@ jobs:
|
||||
- name: Upload test stats
|
||||
env:
|
||||
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
|
||||
@ -86,6 +81,8 @@ jobs:
|
||||
|
||||
- name: Analyze disabled tests rerun
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
WORKFLOW_ARTIFACTS_URL: ${{ github.event.workflow_run.artifacts_url }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
@ -99,12 +96,14 @@ jobs:
|
||||
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
|
||||
env:
|
||||
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
|
||||
REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
|
||||
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
|
||||
run: |
|
||||
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection oss_ci_benchmark --rockset-workspace benchmarks --dynamodb-table torchci-oss-ci-benchmark --match-filename "^gpt_fast_benchmark"
|
||||
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection oss_ci_benchmark --rockset-workspace benchmarks --match-filename "^gpt_fast_benchmark"
|
||||
|
||||
check-api-rate:
|
||||
if: ${{ always() && github.repository_owner == 'pytorch' }}
|
||||
|
@ -26,8 +26,6 @@ jobs:
|
||||
github.event.workflow_run.conclusion == 'failure' || needs.get-conclusion.outputs.conclusion == 'failure'
|
||||
runs-on: ubuntu-22.04
|
||||
environment: upload-stats
|
||||
permissions:
|
||||
id-token: write
|
||||
name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
@ -36,13 +34,6 @@ jobs:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Configure aws credentials
|
||||
uses: aws-actions/configure-aws-credentials@v3
|
||||
continue-on-error: true
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats
|
||||
aws-region: us-east-1
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.11'
|
||||
@ -54,6 +45,8 @@ jobs:
|
||||
- name: Upload torch dynamo performance stats to S3
|
||||
id: upload-s3
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
WORKFLOW_ARTIFACTS_URL: ${{ github.event.workflow_run.artifacts_url }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
@ -68,9 +61,11 @@ jobs:
|
||||
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success'
|
||||
env:
|
||||
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
|
||||
REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
|
||||
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
|
||||
run: |
|
||||
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection torch_dynamo_perf_stats --rockset-workspace inductor --dynamodb-table torchci-dynamo-perf-stats --match-filename "^inductor_"
|
||||
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection torch_dynamo_perf_stats --rockset-workspace inductor --match-filename "^inductor_"
|
||||
|
@ -208,6 +208,7 @@ endif()
|
||||
include(CMakeDependentOption)
|
||||
option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
|
||||
option(BUILD_BINARY "Build C++ binaries" OFF)
|
||||
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
|
||||
option(BUILD_CUSTOM_PROTOBUF
|
||||
"Build and use Caffe2's own protobuf under third_party" ON)
|
||||
option(BUILD_PYTHON "Build Python binaries" ON)
|
||||
@ -749,6 +750,7 @@ if(NOT TORCH_BUILD_VERSION)
|
||||
CACHE STRING "Torch build version" FORCE)
|
||||
endif()
|
||||
caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
|
||||
caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION})
|
||||
set(TORCH_SOVERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}")
|
||||
|
||||
# ---[ CMake scripts + modules
|
||||
@ -1221,6 +1223,45 @@ endif()
|
||||
add_subdirectory(c10)
|
||||
add_subdirectory(caffe2)
|
||||
|
||||
# --[ Documentation
|
||||
if(BUILD_DOCS)
|
||||
# check if Doxygen is installed
|
||||
find_package(Doxygen)
|
||||
if(DOXYGEN_FOUND)
|
||||
message("Generating documentation")
|
||||
|
||||
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
|
||||
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
|
||||
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
|
||||
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
|
||||
|
||||
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
endif()
|
||||
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
|
||||
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
|
||||
|
||||
add_custom_target(
|
||||
doc_doxygen_c ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating C++ API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
|
||||
add_custom_target(
|
||||
doc_doxygen_python ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating Python API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
else()
|
||||
message(
|
||||
FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ CMake related files Uninistall option.
|
||||
if(NOT TARGET caffe2_uninstall)
|
||||
configure_file(
|
||||
|
11
RELEASE.md
11
RELEASE.md
@ -51,7 +51,6 @@ Following is the Release Compatibility Matrix for PyTorch releases:
|
||||
|
||||
| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
|
||||
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
|
||||
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
|
||||
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
|
||||
@ -61,19 +60,15 @@ Following is the Release Compatibility Matrix for PyTorch releases:
|
||||
|
||||
## Release Cadence
|
||||
|
||||
Following is the release cadence for year 2023/2024. All dates below are tentative, for latest updates on the release scheduled please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
|
||||
Following is the release cadence for year 2023/2024. All dates below are tentative, for latest updates on the release scheduled please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27).
|
||||
|
||||
| Minor Version | Release branch cut | Release date | First patch release date | Second patch release date|
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 2.1 | Aug 2023 | Oct 2023 | Nov 2023 | Dec 2023 |
|
||||
| 2.2 | Dec 2023 | Jan 2024 | Feb 2024 | Mar 2024 |
|
||||
| 2.3 | Mar 2024 | Apr 2024 | Jun 2024 | Not planned |
|
||||
| 2.4 | Jun 2024 | Jul 2024 | (Sept 2024) | Not planned |
|
||||
| 2.5 | Aug 2024 | Oct 2024 | (Nov 2024) | (Dec 2024) |
|
||||
| 2.6 | Dec 2024 | Jan 2025 | (Feb 2025) | (Mar 2025) |
|
||||
| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) |
|
||||
| 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sept 2025) |
|
||||
| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
|
||||
| 2.4 | Jun 2024 | Jul 2024 | Aug 2024 | Sep 2024 |
|
||||
| 2.5 | Aug 2024 | Oct 2024 | Nov 2024 | Dec 2024 |
|
||||
|
||||
## General Overview
|
||||
|
||||
|
@ -156,7 +156,6 @@ file(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip")
|
||||
file(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp")
|
||||
file(GLOB native_transformers_cuda_cu "native/transformers/cuda/*.cu")
|
||||
file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp")
|
||||
file(GLOB native_transformers_mps_mm "native/transformers/mps/*.mm")
|
||||
file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip")
|
||||
file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp")
|
||||
file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp")
|
||||
@ -551,7 +550,7 @@ if(USE_CUDA)
|
||||
endif()
|
||||
|
||||
if(USE_MPS)
|
||||
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_transformers_mps_mm})
|
||||
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h})
|
||||
endif()
|
||||
|
||||
if(USE_ROCM)
|
||||
|
@ -222,7 +222,7 @@ c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
|
||||
static const size_t size = sizeof(CPUGeneratorImplState);
|
||||
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
|
||||
|
||||
auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
|
||||
auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
|
||||
auto rng_state = state_tensor.data_ptr();
|
||||
|
||||
// accumulate generator data to be copied into byte tensor
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include <ATen/core/Generator.h>
|
||||
#include <ATen/core/MT19937RNGEngine.h>
|
||||
#include <c10/core/GeneratorImpl.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
namespace at {
|
||||
|
||||
|
@ -278,24 +278,7 @@ void Context::setLinalgPreferredBackend(at::LinalgBackend b) {
|
||||
}
|
||||
}
|
||||
|
||||
at::BlasBackend Context::blasPreferredBackend() {
|
||||
#ifdef USE_ROCM
|
||||
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
|
||||
static const bool hipblaslt_unsupported = []() {
|
||||
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
|
||||
for (auto index: c10::irange(getNumGPUs())) {
|
||||
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
|
||||
TORCH_WARN_ONCE(
|
||||
"Attempting to use hipBLASLt on an unsupported architecture! "
|
||||
"Overriding blas backend to hipblas");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}();
|
||||
if (hipblaslt_unsupported) blas_preferred_backend = at::BlasBackend::Cublas;
|
||||
}
|
||||
#endif
|
||||
at::BlasBackend Context::blasPreferredBackend() const {
|
||||
return blas_preferred_backend;
|
||||
}
|
||||
|
||||
|
@ -59,7 +59,7 @@ class TORCH_API Context {
|
||||
}
|
||||
}
|
||||
const AcceleratorHooksInterface& getAcceleratorHooksInterface(
|
||||
std::optional<c10::DeviceType> opt_device_type = std::nullopt) {
|
||||
std::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
|
||||
c10::DeviceType device_type = opt_device_type.has_value()
|
||||
? opt_device_type.value()
|
||||
: at::getAccelerator(true).value();
|
||||
@ -224,7 +224,7 @@ class TORCH_API Context {
|
||||
at::LinalgBackend linalgPreferredBackend() const;
|
||||
void setLinalgPreferredBackend(at::LinalgBackend);
|
||||
|
||||
at::BlasBackend blasPreferredBackend();
|
||||
at::BlasBackend blasPreferredBackend() const;
|
||||
void setBlasPreferredBackend(at::BlasBackend);
|
||||
|
||||
// Note [Enabling Deterministic Operations]
|
||||
@ -407,7 +407,7 @@ class TORCH_API Context {
|
||||
bool release_original_weights = false;
|
||||
#endif
|
||||
bool display_vmap_fallback_warnings_ = false;
|
||||
std::optional<at::QEngine> quantized_engine = std::nullopt;
|
||||
std::optional<at::QEngine> quantized_engine = c10::nullopt;
|
||||
bool enable_sparse_tensor_invariant_checks = false;
|
||||
bool allow_fp16_reduction_cpu = false;
|
||||
|
||||
|
@ -115,9 +115,6 @@ static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) {
|
||||
ctx.device_id =
|
||||
at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device());
|
||||
break;
|
||||
case DeviceType::MAIA:
|
||||
ctx.device_type = DLDeviceType::kDLMAIA;
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
|
||||
}
|
||||
@ -144,8 +141,6 @@ static Device getATenDevice(const DLDevice& ctx, void* data) {
|
||||
#endif
|
||||
case DLDeviceType::kDLOneAPI:
|
||||
return at::detail::getXPUHooks().getDeviceFromPtr(data);
|
||||
case DLDeviceType::kDLMAIA:
|
||||
return at::Device(DeviceType::MAIA, ctx.device_id);
|
||||
default:
|
||||
TORCH_CHECK(
|
||||
false, "Unsupported device_type: ", std::to_string(ctx.device_type));
|
||||
|
@ -17,14 +17,14 @@ namespace at {
|
||||
/// Return the Device of a Tensor, if the Tensor is defined.
|
||||
inline std::optional<Device> device_of(const Tensor& t) {
|
||||
if (t.defined()) {
|
||||
return std::make_optional(t.device());
|
||||
return c10::make_optional(t.device());
|
||||
} else {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
inline std::optional<Device> device_of(const std::optional<Tensor>& t) {
|
||||
return t.has_value() ? device_of(t.value()) : std::nullopt;
|
||||
return t.has_value() ? device_of(t.value()) : c10::nullopt;
|
||||
}
|
||||
|
||||
/// Return the Device of a TensorList, if the list is non-empty and
|
||||
@ -34,7 +34,7 @@ inline std::optional<Device> device_of(ITensorListRef t) {
|
||||
if (!t.empty()) {
|
||||
return device_of(t.front());
|
||||
} else {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,7 @@ TORCH_API TensorBase empty_cpu(
|
||||
IntArrayRef size,
|
||||
ScalarType dtype,
|
||||
bool pin_memory = false,
|
||||
std::optional<c10::MemoryFormat> memory_format_opt = std::nullopt);
|
||||
std::optional<c10::MemoryFormat> memory_format_opt = c10::nullopt);
|
||||
|
||||
TORCH_API TensorBase empty_cpu(
|
||||
IntArrayRef size,
|
||||
@ -110,7 +110,7 @@ TORCH_API TensorBase empty_strided_cpu(
|
||||
TORCH_API TensorBase empty_meta(
|
||||
IntArrayRef size,
|
||||
ScalarType dtype,
|
||||
std::optional<c10::MemoryFormat> memory_format_opt = std::nullopt);
|
||||
std::optional<c10::MemoryFormat> memory_format_opt = c10::nullopt);
|
||||
|
||||
TORCH_API TensorBase empty_meta(
|
||||
IntArrayRef size,
|
||||
|
@@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
  return Tensor();
}

Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const std::optional<Tensor>& min_seqlen, const std::optional<Tensor>& max_seqlen) {
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
  auto values = at::_nested_get_values(mutated_view);
  if (inverse_return_mode != InverseReturnMode::NeverView) {
    return values;
@@ -321,8 +321,8 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
  auto max_seqlen = at::_nested_get_max_seqlen(base);
  auto nt = at::_nested_view_from_jagged(
      mutated_view, offsets, dummy, lengths, ragged_idx,
      (min_seqlen.defined() ? std::optional<Tensor>(min_seqlen) : std::nullopt),
      (max_seqlen.defined() ? std::optional<Tensor>(max_seqlen) : std::nullopt));
      (min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : c10::nullopt),
      (max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : c10::nullopt));

  if (inverse_return_mode != InverseReturnMode::NeverView) {
    return nt;

@@ -531,9 +531,9 @@ Tensor to_functional_tensor(const Tensor& tensor) {
}
std::optional<Tensor> to_functional_tensor(const std::optional<Tensor>& tensor) {
  if (tensor.has_value()) {
    return std::make_optional<Tensor>(to_functional_tensor(*tensor));
    return c10::make_optional<Tensor>(to_functional_tensor(*tensor));
  }
  return std::nullopt;
  return c10::nullopt;
}
c10::List<::std::optional<Tensor>> to_functional_tensor(const c10::List<::std::optional<Tensor>>& t_list) {
  c10::List<::std::optional<Tensor>> outputs;
@@ -569,9 +569,9 @@ Tensor from_functional_tensor(const Tensor& tensor, bool assert_functional) {
}
std::optional<Tensor> from_functional_tensor(const std::optional<Tensor>& t, bool assert_functional) {
  if (t.has_value()) {
    return std::make_optional<Tensor>(from_functional_tensor(*t, assert_functional));
    return c10::make_optional<Tensor>(from_functional_tensor(*t, assert_functional));
  }
  return std::nullopt;
  return c10::nullopt;
}
std::vector<Tensor> from_functional_tensor(ITensorListRef t_list) {
  std::vector<Tensor> outputs;

@@ -217,7 +217,7 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) {
    // we will end up hitting PreDispatch stack first. So, we should
    // directly redispatch to the functionalize key manually.
    static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed<at::Tensor(const at::Tensor &, std::optional<at::MemoryFormat>)>();
    return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, std::nullopt);
    return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, c10::nullopt);
  }

  at::AutoDispatchSkipFunctionalize guard;

@@ -4,7 +4,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/SymIntArrayRef.h>
#include <c10/util/DimVector.h>
#include <optional>
#include <c10/util/Optional.h>
#include <sstream>
#include <vector>

@@ -512,7 +512,7 @@ static optional<int64_t> maximum_indexable_location(
    IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) {
  auto result = native::storage_size_for(sizes, strides);
  if (result == 0) {
    return std::nullopt;
    return nullopt;
  }
  return result + storage_offset;
}

@@ -173,7 +173,7 @@ NestedTensorImpl::NestedTensorImpl(
      nested_sizes_(std::move(nested_sizes)),
      nested_strides_(std::move(nested_strides)),
      storage_offsets_(std::move(storage_offsets)),
      opt_sizes_(std::nullopt) {
      opt_sizes_(c10::nullopt) {
  C10_LOG_API_USAGE_ONCE("torch.NestedTensor");
  TORCH_WARN_ONCE(
      "The PyTorch API of nested tensors is in prototype stage and will change "
@@ -230,7 +230,7 @@ NestedTensorImpl::NestedTensorImpl(
      nested_sizes_(std::move(nested_sizes)),
      nested_strides_(std::move(nested_strides)),
      storage_offsets_(std::move(storage_offsets)),
      opt_sizes_(std::nullopt) {
      opt_sizes_(c10::nullopt) {
  validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_);
  refresh_dim();
  set_custom_sizes_strides(c10::TensorImpl::SizesStridesPolicy::CustomSizes);
@@ -239,11 +239,11 @@ NestedTensorImpl::NestedTensorImpl(
std::optional<int64_t> NestedTensorImpl::opt_size(int64_t d) const {
  if (C10_UNLIKELY(!opt_sizes_.has_value())) {
    // Cache the metadata to avoid recomputing it each time.
    opt_sizes_ = std::make_optional(construct_opt_sizes(nested_sizes_));
    opt_sizes_ = c10::make_optional(construct_opt_sizes(nested_sizes_));
  }
  d = at::maybe_wrap_dim(d, dim(), false);
  if ((*opt_sizes_)[d] == -1) {
    return std::nullopt;
    return c10::nullopt;
  }
  return (*opt_sizes_)[d];
}

@@ -32,7 +32,7 @@ void SavedTensorDefaultHooks::disable(const std::string& message) {
}

void SavedTensorDefaultHooks::enable() {
  tls.disabled_error_message = std::nullopt;
  tls.disabled_error_message = c10::nullopt;
}

/* static */ bool SavedTensorDefaultHooks::set_tracing(bool is_tracing) {

@@ -1,8 +1,8 @@
#pragma once

#include <c10/macros/Export.h>
#include <c10/util/Optional.h>
#include <c10/util/python_stub.h>
#include <optional>
#include <stack>
#include <string>

@@ -27,7 +27,7 @@ Tensor scalar_tensor_static(const Scalar& s, std::optional<ScalarType> dtype_opt
  at::tracer::impl::NoTracerDispatchMode tracer_guard;
  at::AutoDispatchBelowAutograd mode;
  Tensor result = at::detail::empty_cpu(
      {}, dtype_opt, std::nullopt, device_opt, std::nullopt, std::nullopt);
      {}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
  scalar_fill(result, s);
  return result;
}

@@ -5,8 +5,8 @@
#include <ATen/core/Tensor.h>
#include <ATen/core/TensorBody.h>
#include <c10/core/SymInt.h>
#include <c10/util/Optional.h>
#include <c10/util/irange.h>
#include <optional>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -29,7 +29,7 @@ constexpr int64_t INDEX_MAX = -(INDEX_MIN + 1);

enum class TensorIndexType { None, Ellipsis, SymInt, Boolean, Slice, Tensor };

constexpr std::nullopt_t None = std::nullopt;
constexpr c10::nullopt_t None = c10::nullopt;

struct TORCH_API EllipsisIndexType final {
  EllipsisIndexType() = default;
@@ -39,9 +39,9 @@ TORCH_API extern const EllipsisIndexType Ellipsis;
struct TORCH_API Slice final {
 public:
  Slice(
      std::optional<c10::SymInt> start_index = std::nullopt,
      std::optional<c10::SymInt> stop_index = std::nullopt,
      std::optional<c10::SymInt> step_index = std::nullopt) {
      std::optional<c10::SymInt> start_index = c10::nullopt,
      std::optional<c10::SymInt> stop_index = c10::nullopt,
      std::optional<c10::SymInt> step_index = c10::nullopt) {
    if (!step_index.has_value()) {
      step_ = c10::SymInt(1);
    } else {
@@ -110,7 +110,7 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
struct TORCH_API TensorIndex final {
  // Case 1: `at::indexing::None`
  TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
  TensorIndex(c10::nullopt_t) : type_(TensorIndexType::None) {}

  // Case 2: "..." / `at::indexing::Ellipsis`
  TensorIndex(at::indexing::EllipsisIndexType)
@ -530,7 +530,7 @@ inline Tensor applySlicing(
|
||||
auto& obj = indices[i];
|
||||
// See NOTE [nested tensor size for indexing]
|
||||
std::optional<SymIntArrayRef> result_sizes = result.is_nested()
|
||||
? std::optional<SymIntArrayRef>(std::nullopt)
|
||||
? std::optional<SymIntArrayRef>(c10::nullopt)
|
||||
: std::optional<SymIntArrayRef>(result.sym_sizes());
|
||||
result = handleDimInMultiDimIndexing(
|
||||
/*prev_dim_result=*/result,
|
||||
@ -606,7 +606,7 @@ inline Tensor get_item(
|
||||
// as null may need to be changed after we reach a better solution for nested
|
||||
// tensor size
|
||||
std::optional<SymIntArrayRef> self_sizes = self.is_nested()
|
||||
? std::optional<SymIntArrayRef>(std::nullopt)
|
||||
? std::optional<SymIntArrayRef>(c10::nullopt)
|
||||
: std::optional<SymIntArrayRef>(self.sym_sizes());
|
||||
|
||||
// handle simple types: integers, slices, none, ellipsis, bool
|
||||
|
@ -171,7 +171,7 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha
|
||||
// This will bypass all shape checking in the TensorIterator. Kernels which call this method
|
||||
// are expected to check shapes before calling `add_owned_input` or `add_owned_output`.
|
||||
TORCH_CHECK(!resize_outputs_, "resize_outputs() must be called before declare_static_shape(...)")
|
||||
static_shape_ = std::make_optional(DimVector(shape));
|
||||
static_shape_ = c10::make_optional(DimVector(shape));
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -147,7 +147,7 @@ struct TORCH_API OperandInfo {
|
||||
/// promotion target_dtype value can become different from tensor's dtype
|
||||
/// also, during type promotion target_dtype and device can be set for an
|
||||
/// undefined tensor so that tensor can be properly constructed later.
|
||||
std::optional<Device> device = std::nullopt;
|
||||
std::optional<Device> device = c10::nullopt;
|
||||
ScalarType target_dtype = ScalarType::Undefined;
|
||||
// Caches dtype of the tensor, because scalar_type is an expensive operation
|
||||
// If dtype of the tensor is changed (e.g. as a result of type promotion or in
|
||||
@ -971,9 +971,9 @@ class TORCH_API TensorIteratorConfig final {
|
||||
int num_outputs_ = 0;
|
||||
int num_inputs_ = 0;
|
||||
|
||||
std::optional<DimVector> static_shape_ = std::nullopt;
|
||||
std::optional<ScalarType> static_dtype_ = std::nullopt;
|
||||
std::optional<Device> static_device_ = std::nullopt;
|
||||
std::optional<DimVector> static_shape_ = c10::nullopt;
|
||||
std::optional<ScalarType> static_dtype_ = c10::nullopt;
|
||||
std::optional<Device> static_device_ = c10::nullopt;
|
||||
bool check_mem_overlap_ = true;
|
||||
bool allow_cpu_scalars_ = false;
|
||||
bool is_reduction_ = false;
|
||||
|
@ -380,7 +380,7 @@ inline std::optional<ResultVec> computeStride_impl(
|
||||
view_d--;
|
||||
}
|
||||
if (view_numel != tensor_numel) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
if (tensor_d > 0) {
|
||||
chunk_base_stride = oldstride[tensor_d - 1];
|
||||
@ -390,7 +390,7 @@ inline std::optional<ResultVec> computeStride_impl(
|
||||
}
|
||||
}
|
||||
if (view_d != -1) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return newstride;
|
||||
}
|
||||
|
@ -304,7 +304,7 @@ inline std::optional<Tensor> cached_cast(
|
||||
if (arg.has_value()) {
|
||||
return cached_cast(to_type, *arg, device_type);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/llvmMathExtras.h>
|
||||
#include <optional>
|
||||
|
||||
#include <deque>
|
||||
#include <mutex>
|
||||
@ -258,6 +258,7 @@ struct CachingHostAllocatorImpl {
|
||||
}
|
||||
|
||||
virtual void process_events() {
|
||||
|
||||
while (true) {
|
||||
// Avoid calling cudaEventDestroy while holding a mutex, so move
|
||||
// intermediate events out of the lock into this object.
|
||||
@ -349,7 +350,7 @@ struct CachingHostAllocatorImpl {
|
||||
|
||||
template <typename T>
|
||||
struct CachingHostAllocatorInterface : public at::Allocator {
|
||||
CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
|
||||
CachingHostAllocatorInterface() :impl_(std::make_unique<T>()) {}
|
||||
|
||||
at::DataPtr allocate(size_t size) override {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
|
||||
|
@ -7,7 +7,7 @@ check_tensor_options_and_extract_memory_format(
|
||||
const TensorOptions& options,
|
||||
std::optional<MemoryFormat> memory_format) {
|
||||
TORCH_CHECK(
|
||||
options.requires_grad_opt() == std::nullopt ||
|
||||
options.requires_grad_opt() == c10::nullopt ||
|
||||
options.requires_grad_opt().value() == false,
|
||||
"Operators taking TensorOptions cannot take a TensorOptions with "
|
||||
"options.requires_grad set as true. This isn't implemented yet.");
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <c10/util/order_preserving_flat_hash_map.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <ATen/core/TensorBody.h>
|
||||
#include <ATen/core/jit_type_base.h>
|
||||
|
||||
|
@ -57,7 +57,7 @@ Dimname Dimname::wildcard() {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::optional<Dimname> Dimname::unify(Dimname other) const {
|
||||
optional<Dimname> Dimname::unify(Dimname other) const {
|
||||
if (other.type() == NameType::WILDCARD) {
|
||||
return *this;
|
||||
}
|
||||
@ -67,7 +67,7 @@ std::optional<Dimname> Dimname::unify(Dimname other) const {
|
||||
if (name_ == other.symbol()) {
|
||||
return *this;
|
||||
}
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
|
||||
bool Dimname::matches(Dimname other) const {
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include <ATen/core/symbol.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <ostream>
|
||||
|
||||
namespace at {
|
||||
|
@ -5,12 +5,12 @@
|
||||
#include <c10/util/Half.h>
|
||||
#include <c10/util/BFloat16.h>
|
||||
#include <c10/util/MathConstants.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
|
||||
/**
|
||||
* Distributions kernel adapted from THRandom.cpp
|
||||
|
@ -6,7 +6,7 @@ namespace at {
|
||||
static std::mutex _generator_mutex_lock;
|
||||
|
||||
std::optional<GeneratorFuncType>& GetGeneratorPrivate() {
|
||||
static std::optional<GeneratorFuncType> generator_privateuse1 = std::nullopt;
|
||||
static std::optional<GeneratorFuncType> generator_privateuse1 = c10::nullopt;
|
||||
return generator_privateuse1;
|
||||
}
|
||||
|
||||
|
@ -18,8 +18,8 @@ static std::vector<at::Tensor> get_tensor_vector() {
|
||||
return tensors;
|
||||
}
|
||||
|
||||
static std::vector<std::optional<at::Tensor>> get_boxed_opt_tensor_vector() {
|
||||
std::vector<std::optional<at::Tensor>> optional_tensors;
|
||||
static std::vector<optional<at::Tensor>> get_boxed_opt_tensor_vector() {
|
||||
std::vector<optional<at::Tensor>> optional_tensors;
|
||||
const size_t SIZE = 5;
|
||||
for (size_t i = 0; i < SIZE * 2; i++) {
|
||||
auto opt_tensor = (i % 2 == 0) ? optional<at::Tensor>(at::empty({0})) : nullopt;
|
||||
@ -234,7 +234,7 @@ TEST(ITensorListRefIteratorTest, Unboxed_Iterate) {
|
||||
|
||||
TEST(IOptTensorListRefTest, Boxed_Iterate) {
|
||||
auto vec = get_boxed_opt_tensor_vector();
|
||||
const List<std::optional<at::Tensor>> boxed(vec);
|
||||
const List<optional<at::Tensor>> boxed(vec);
|
||||
at::IOptTensorListRef list(boxed);
|
||||
size_t i = 0;
|
||||
for (const auto t : list) {
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <vector>
|
||||
|
||||
namespace at {
|
||||
|
@ -1127,7 +1127,7 @@ TEST(ListTest, canAccessStringByReference) {
|
||||
}
|
||||
|
||||
TEST(ListTest, canAccessOptionalStringByReference) {
|
||||
List<std::optional<std::string>> list({"one", "two", std::nullopt});
|
||||
List<std::optional<std::string>> list({"one", "two", c10::nullopt});
|
||||
const auto& listRef = list;
|
||||
static_assert(
|
||||
std::is_same_v<decltype(listRef[1]), std::optional<std::reference_wrapper<const std::string>>>,
|
||||
|
@ -130,7 +130,7 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names,
|
||||
optional<DimnameList> get_opt_names(const TensorImpl* impl) {
|
||||
const auto* meta = get_named_tensor_meta(impl);
|
||||
if (meta == nullptr) {
|
||||
return std::nullopt;
|
||||
return nullopt;
|
||||
} else {
|
||||
return meta->names();
|
||||
}
|
||||
|
@ -4,9 +4,9 @@
|
||||
#include <c10/core/SymNodeImpl.h>
|
||||
#include <c10/macros/Export.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace c10 {
|
||||
|
@ -33,7 +33,7 @@ struct StashTLSOnEntryGuard {
|
||||
public:
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
StashTLSOnEntryGuard(): saved_(tls_on_entry.value()) {
|
||||
tls_on_entry = std::nullopt;
|
||||
tls_on_entry = c10::nullopt;
|
||||
}
|
||||
|
||||
~StashTLSOnEntryGuard() {
|
||||
@ -124,7 +124,7 @@ void preDispatchFallback(const c10::OperatorHandle& op, c10::DispatchKeySet disp
|
||||
namespace at::impl {
|
||||
|
||||
RestorePythonTLSSnapshot::RestorePythonTLSSnapshot() : saved_(safe_get_tls_on_entry()), guard_(safe_get_tls_on_entry()) {
|
||||
tls_on_entry = std::nullopt;
|
||||
tls_on_entry = c10::nullopt;
|
||||
}
|
||||
|
||||
RestorePythonTLSSnapshot::~RestorePythonTLSSnapshot() {
|
||||
@ -143,7 +143,7 @@ MaybeSetTLSOnEntryGuard::MaybeSetTLSOnEntryGuard() {
|
||||
MaybeSetTLSOnEntryGuard::~MaybeSetTLSOnEntryGuard() {
|
||||
if (value_set_) {
|
||||
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
|
||||
tls_on_entry = std::nullopt;
|
||||
tls_on_entry = c10::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
#include <c10/util/ExclusivelyOwned.h>
|
||||
#include <c10/util/ExclusivelyOwnedTensorTraits.h>
|
||||
#include <c10/util/MaybeOwned.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
|
||||
#include <ATen/core/NamedTensor.h>
|
||||
@ -147,7 +147,7 @@ class TORCH_API TensorBase {
|
||||
const TensorBase& fill_(const c10::Scalar& scalar) const;
|
||||
const TensorBase& zero_() const;
|
||||
|
||||
TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, std::optional<at::MemoryFormat> memory_format=std::nullopt) const;
|
||||
TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, std::optional<at::MemoryFormat> memory_format=c10::nullopt) const;
|
||||
|
||||
bool is_complex() const {
|
||||
return at::isComplexType(this->scalar_type());
|
||||
@ -712,7 +712,7 @@ class TORCH_API TensorBase {
|
||||
/// // f requires grad, has no operation creating it
|
||||
/// @endcode
|
||||
|
||||
/// \fn void backward(const Tensor & gradient={}, std::optional<bool> retain_graph=std::nullopt, bool create_graph=false, std::optional<TensorList> inputs=std::nullopt) const;
|
||||
/// \fn void backward(const Tensor & gradient={}, std::optional<bool> retain_graph=c10::nullopt, bool create_graph=false, std::optional<TensorList> inputs=c10::nullopt) const;
|
||||
///
|
||||
/// Computes the gradient of current tensor with respect to graph leaves.
|
||||
///
|
||||
|
@ -1,17 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <c10/core/impl/TorchDispatchModeTLS.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <torch/library.h>
|
||||
#include <optional>
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/core/impl/TorchDispatchModeTLS.h>
|
||||
|
||||
namespace at::impl {
|
||||
|
||||
TORCH_API bool tensor_has_dispatch(const at::Tensor& t);
|
||||
TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li);
|
||||
TORCH_API bool tensorlist_has_dispatch(
|
||||
const c10::List<std::optional<at::Tensor>>& li);
|
||||
TORCH_API bool tensorlist_has_dispatch(const c10::List<std::optional<at::Tensor>>& li);
|
||||
using c10::impl::dispatch_mode_enabled;
|
||||
|
||||
} // namespace at::impl
|
||||
}
|
||||
|
@ -72,12 +72,12 @@ inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(c10::SymIn
|
||||
|
||||
template <>
|
||||
inline typename remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(std::optional<c10::SymInt> x) {
|
||||
return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__)) : std::nullopt;
|
||||
return x.has_value() ? c10::make_optional(x->guard_int(__FILE__, __LINE__)) : c10::nullopt;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(at::OptionalSymIntArrayRef x) {
|
||||
return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) : std::nullopt;
|
||||
return x.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) : c10::nullopt;
|
||||
}
|
||||
|
||||
template<class Return, class... Args>
|
||||
|
@ -180,7 +180,7 @@ void boxed_func_for_outofplace_multi_op(const OperatorHandle& /*opHandle*/, Stac
|
||||
// functional
|
||||
|
||||
void expectBoxedCallingWithReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
vector<IValue> stack {3, 4};
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
@ -194,7 +194,7 @@ void expectBoxedCallingWithReturnWorks(const KernelFunction& func) {
|
||||
}
|
||||
|
||||
void expectBoxedCallingWithoutReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
vector<IValue> stack {3, 4};
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
@ -206,7 +206,7 @@ void expectBoxedCallingWithoutReturnWorks(const KernelFunction& func) {
|
||||
}
|
||||
|
||||
void expectBoxedCallingWithMultiReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
vector<IValue> stack {3, 4};
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
@ -284,7 +284,7 @@ void expectOutOfPlaceMultiBoxedCallingWorks(const KernelFunction& func) {
|
||||
// make an unboxed call to a kernel that returns a single value.
|
||||
//
|
||||
void expectUnboxedCallingWithReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
int64_t result = func.call<int64_t, int64_t, int64_t>(dummy, CPU_TEST_SET, 3, 4);
|
||||
@ -297,7 +297,7 @@ void expectUnboxedCallingWithReturnWorks(const KernelFunction& func) {
|
||||
// make an unboxed call to a kernel that returns nothing.
|
||||
//
|
||||
void expectUnboxedCallingWithoutReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
func.call<void, int64_t, int64_t>(dummy, CPU_TEST_SET, 3, 4);
|
||||
@ -310,7 +310,7 @@ void expectUnboxedCallingWithoutReturnWorks(const KernelFunction& func) {
|
||||
// When calling unboxed, multiple values are returned as a tuple.
|
||||
//
|
||||
void expectUnboxedCallingWithMultiReturnWorks(const KernelFunction& func) {
|
||||
called_with_args = std::nullopt;
|
||||
called_with_args = c10::nullopt;
|
||||
OperatorHandle dummy = makeDummyOperatorHandle();
|
||||
|
||||
auto result = func.call<std::tuple<int64_t, int64_t>, int64_t, int64_t>(dummy, CPU_TEST_SET, 3, 4);
|
||||
|
@ -793,9 +793,9 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenFallbackKernelWitho
|
||||
EXPECT_EQ(4, outputs[0].toInt());
|
||||
}
|
||||
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional<Tensor>& arg2, std::optional<int64_t> arg3, std::optional<std::string> arg4) {
|
||||
called = true;
|
||||
|
@ -550,9 +550,9 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenFallbackKernelWithoutTens
|
||||
EXPECT_EQ(4, outputs[0].toInt());
|
||||
}
|
||||
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional<Tensor>& arg2, std::optional<int64_t> arg3, std::optional<std::string> arg4) {
|
||||
called = true;
|
||||
|
@ -732,9 +732,9 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenFallbackKernelWithout
|
||||
|
||||
TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) {
|
||||
bool called = false;
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
auto registrar = RegisterOperators().op(
|
||||
"_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()",
|
||||
@ -771,9 +771,9 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInp
|
||||
|
||||
TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) {
|
||||
bool called = false;
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
auto registrar = RegisterOperators().op(
|
||||
"_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?",
|
||||
|
@ -466,9 +466,9 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenFallbackKernelWithoutTensor
|
||||
EXPECT_EQ(4, outputs[0].toInt());
|
||||
}
|
||||
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) {
|
||||
auto registrar = RegisterOperators().op(
|
||||
|
@ -668,9 +668,9 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenFallbackKernelWithoutTenso
|
||||
EXPECT_EQ(4, outputs[0].toInt());
|
||||
}
|
||||
|
||||
std::optional<Tensor> called_arg2 = std::nullopt;
|
||||
std::optional<int64_t> called_arg3 = std::nullopt;
|
||||
std::optional<std::string> called_arg4 = std::nullopt;
|
||||
std::optional<Tensor> called_arg2 = c10::nullopt;
|
||||
std::optional<int64_t> called_arg3 = c10::nullopt;
|
||||
std::optional<std::string> called_arg4 = c10::nullopt;
|
||||
|
||||
struct KernelWithOptInputWithoutOutput final : OperatorKernel {
|
||||
void operator()(Tensor arg1, const std::optional<Tensor>& arg2, std::optional<int64_t> arg3, std::optional<std::string> arg4) {
|
||||
|
@ -631,7 +631,7 @@ std::optional<IValue> ClassType::findConstant(const std::string& name) const {
|
||||
}
|
||||
|
||||
if (pos >= constantNames_.size()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return constantValues_[pos];
|
||||
}
|
||||
@ -659,7 +659,7 @@ std::optional<ClassType::Property> ClassType::getProperty(const std::string& nam
|
||||
}
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
|
||||
void ClassType::addProperty(const std::string& name, torch::jit::Function* getter, torch::jit::Function* setter) {
|
||||
@ -676,7 +676,7 @@ std::optional<size_t> ClassType::findConstantSlot(const std::string& name) const
|
||||
}
|
||||
slot++;
|
||||
}
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
|
||||
const std::string& ClassType::getConstantName(size_t slot) const {
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#include <ATen/core/ivalue.h>
|
||||
#include <ATen/core/jit_type_base.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
|
||||
namespace torch::jit {
|
||||
@ -160,7 +160,7 @@ struct TORCH_API ClassType : public NamedType {
|
||||
}
|
||||
slot++;
|
||||
}
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
size_t getAttributeSlot(const std::string& name) const {
|
||||
if (auto r = findAttributeSlot(name)) {
|
||||
|
@ -42,7 +42,7 @@ inline DispatchKeySet computeDispatchKeySet(
|
||||
// be nice to only do one. Can always_included be folded into the TLS? Well,
|
||||
// it's a bit troublesome, because fastpath TLS access requires the type of
|
||||
// the TLS in question to be zero-initialized, so you don't actually win
|
||||
// anything in that case.
|
||||
// anyting in that case.
|
||||
return (((ks | local.included_) - local.excluded_) & key_mask);
|
||||
}
|
||||
|
||||
|
@ -80,7 +80,7 @@ std::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& overload_na
|
||||
return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> std::optional<OperatorHandle> {
|
||||
auto found = operatorLookupTable.find(overload_name);
|
||||
if (found == operatorLookupTable.end()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return found->second;
|
||||
});
|
||||
@ -93,7 +93,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) {
|
||||
using namespace std::chrono_literals;
|
||||
std::unique_lock<std::mutex> lock(guard_->mutex);
|
||||
bool r = cond_var_.wait_for(lock, 2s, [&]{
|
||||
return findOp(schema.operator_name()) != std::nullopt;
|
||||
return findOp(schema.operator_name()) != c10::nullopt;
|
||||
});
|
||||
TORCH_INTERNAL_ASSERT(r,
|
||||
"Expected main interpreter to define ", schema.operator_name(),
|
||||
@ -127,7 +127,7 @@ std::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& overloa
|
||||
if (it->hasSchema()) {
|
||||
return it;
|
||||
} else {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
} else {
|
||||
return it;
|
||||
@ -164,7 +164,7 @@ const std::vector<OperatorName> Dispatcher::getAllOpNames() {
|
||||
// are done
|
||||
OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
|
||||
const auto found = findOp(op_name);
|
||||
if (found != std::nullopt) {
|
||||
if (found != c10::nullopt) {
|
||||
return *found;
|
||||
}
|
||||
|
||||
@ -279,7 +279,7 @@ std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(Operato
|
||||
std::lock_guard<std::mutex> lock(guard_->mutex);
|
||||
auto found = pythonModulesSingleton().find(op_name);
|
||||
if (found == pythonModulesSingleton().end()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return found->second;
|
||||
}
|
||||
|
@ -97,7 +97,7 @@ void OperatorEntry::registerSchema(FunctionSchema&& schema, std::string&& debug,
|
||||
|
||||
void OperatorEntry::deregisterSchema() {
|
||||
TORCH_INTERNAL_ASSERT(schema_.has_value());
|
||||
schema_ = std::nullopt;
|
||||
schema_ = c10::nullopt;
|
||||
dispatchKeyExtractor_.deregisterSchema();
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/core/function_schema.h>
|
||||
#include <c10/util/Metaprogramming.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/core/DispatchKey.h>
|
||||
#include <c10/core/PyHandleCache.h>
|
||||
#include <c10/core/SafePyObject.h>
|
||||
@ -15,9 +16,8 @@
|
||||
#include <ATen/core/dispatch/RegistrationHandleRAII.h>
|
||||
#include <ATen/core/enum_tag.h>
|
||||
|
||||
#include <optional>
|
||||
#include <array>
|
||||
#include <list>
|
||||
#include <array>
|
||||
|
||||
#ifdef C10_MOBILE
|
||||
#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include <type_traits>
|
||||
|
||||
#include <ATen/core/jit_type_base.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
|
@ -69,7 +69,7 @@ bool FunctionSchema::canAliasTypeSetsAlias(const std::optional<AliasTypeSet> &lh
|
||||
|
||||
std::optional<AliasTypeSet> FunctionSchema::getAliasTypeSetContainedTypes(const std::optional<AliasTypeSet> &aliasTypeSet) const {
|
||||
if (!aliasTypeSet) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
std::unordered_set<TypePtr> containedTypes;
|
||||
std::stack<TypePtr> typeStack;
|
||||
@ -114,7 +114,7 @@ std::optional<AliasTypeSet> FunctionSchema::mapTypeToAliasTypeSet(const TypePtr&
|
||||
}
|
||||
}
|
||||
if (mutable_types.empty()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return mutable_types;
|
||||
}
|
||||
@ -135,12 +135,12 @@ std::optional<AliasTypeSet> FunctionSchema::mapTypeToAliasTypeSet(const TypePtr&
|
||||
}
|
||||
}
|
||||
if (mutable_types.empty()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return {AliasTypeSet{TupleType::create(std::move(mutable_types))}};
|
||||
}
|
||||
default:
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -29,20 +29,20 @@ struct Argument {
|
||||
Argument(
|
||||
std::string name = "",
|
||||
const TypePtr& type = nullptr,
|
||||
std::optional<int32_t> N = std::nullopt,
|
||||
std::optional<IValue> default_value = std::nullopt,
|
||||
std::optional<int32_t> N = c10::nullopt,
|
||||
std::optional<IValue> default_value = c10::nullopt,
|
||||
bool kwarg_only = false,
|
||||
std::optional<AliasInfo> alias_info = std::nullopt)
|
||||
std::optional<AliasInfo> alias_info = c10::nullopt)
|
||||
: Argument(std::move(name), type, type, N, std::move(default_value), kwarg_only, std::move(alias_info)) {}
|
||||
|
||||
Argument(
|
||||
std::string name,
|
||||
TypePtr fake_type,
|
||||
TypePtr real_type,
|
||||
std::optional<int32_t> N = std::nullopt,
|
||||
std::optional<IValue> default_value = std::nullopt,
|
||||
std::optional<int32_t> N = c10::nullopt,
|
||||
std::optional<IValue> default_value = c10::nullopt,
|
||||
bool kwarg_only = false,
|
||||
std::optional<AliasInfo> alias_info = std::nullopt)
|
||||
std::optional<AliasInfo> alias_info = c10::nullopt)
|
||||
: name_(std::move(name)),
|
||||
type_(fake_type ? std::move(fake_type) : TensorType::get()),
|
||||
real_type_(real_type ? std::move(real_type) : type_),
|
||||
@ -150,7 +150,7 @@ struct Argument {
|
||||
N_,
|
||||
default_value_,
|
||||
kwarg_only_,
|
||||
alias_info_ ? std::optional<AliasInfo>(*alias_info_) : std::nullopt);
|
||||
alias_info_ ? std::optional<AliasInfo>(*alias_info_) : c10::nullopt);
|
||||
}
|
||||
|
||||
// this function checks whether this Argument is backward compatible with
|
||||
@ -397,7 +397,7 @@ struct TORCH_API FunctionSchema {
|
||||
bool is_mutable(c10::string_view name) const {
|
||||
std::optional<int> index = argumentIndexWithName(name);
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
index != std::nullopt, "Schema has no argument named ", name);
|
||||
index != c10::nullopt, "Schema has no argument named ", name);
|
||||
|
||||
return is_mutable({c10::SchemaArgType::input, static_cast<size_t>(*index)});
|
||||
}
|
||||
@ -436,7 +436,7 @@ struct TORCH_API FunctionSchema {
|
||||
if(name == arguments()[i].name())
|
||||
return i;
|
||||
}
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
FunctionSchema cloneWithName(std::string name, std::string overload_name) const {
|
||||
return FunctionSchema(
|
||||
@ -470,8 +470,8 @@ struct TORCH_API FunctionSchema {
|
||||
std::string formatTypeMismatchMsg(
|
||||
const Argument& expected,
|
||||
const std::string& actual_type,
|
||||
std::optional<size_t> position = std::nullopt,
|
||||
std::optional<std::string> value = std::nullopt) const;
|
||||
std::optional<size_t> position = c10::nullopt,
|
||||
std::optional<std::string> value = c10::nullopt) const;
|
||||
|
||||
FunctionSchema cloneWithRemappedTypes(
|
||||
const std::function<TypePtr(TypePtr)> type_map) const;
|
||||
|
@ -120,14 +120,14 @@ struct OptionalArray {
|
||||
|
||||
operator std::optional<c10::ArrayRef<T>>() {
|
||||
if (!list) {
|
||||
return std::nullopt;
|
||||
return nullopt;
|
||||
}
|
||||
return *list;
|
||||
}
|
||||
|
||||
operator c10::OptionalArrayRef<T>() {
|
||||
if (!list) {
|
||||
return std::nullopt;
|
||||
return nullopt;
|
||||
}
|
||||
return *list;
|
||||
}
|
||||
@ -820,7 +820,7 @@ struct TORCH_API IValue final {
|
||||
IValue(std::optional<T> v);
|
||||
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
|
||||
IValue(c10::OptionalArrayRef<T> v);
|
||||
IValue(std::nullopt_t);
|
||||
IValue(c10::nullopt_t);
|
||||
|
||||
// ClassType
|
||||
IValue(c10::intrusive_ptr<ivalue::Object> v);
|
||||
@ -1145,10 +1145,10 @@ struct TORCH_API IValue final {
|
||||
// TODO: There are several places that recurse over IValue. This is fragile.
|
||||
// This visitor should be used to recurse over ivalues.
|
||||
void visit(const std::function<bool(const IValue&)>& visitor) const;
|
||||
IValue deepcopy(std::optional<at::Device> device = std::nullopt) const;
|
||||
IValue deepcopy(std::optional<at::Device> device = c10::nullopt) const;
|
||||
IValue deepcopy(
|
||||
HashIdentityIValueMap& memo,
|
||||
std::optional<at::Device> device = std::nullopt) const;
|
||||
std::optional<at::Device> device = c10::nullopt) const;
|
||||
|
||||
private:
|
||||
static c10::intrusive_ptr_target* null_to_undefined_tensor(
|
||||
@ -1523,24 +1523,24 @@ struct TORCH_API WeakTypePtr {
|
||||
struct WeakOrStrongCompilationUnit {
|
||||
explicit WeakOrStrongCompilationUnit(
|
||||
std::shared_ptr<torch::jit::CompilationUnit> shared_cu)
|
||||
: strong_ptr_(std::move(shared_cu)), weak_ptr_(std::nullopt) {}
|
||||
: strong_ptr_(std::move(shared_cu)), weak_ptr_(c10::nullopt) {}
|
||||
|
||||
explicit WeakOrStrongCompilationUnit(
|
||||
std::weak_ptr<torch::jit::CompilationUnit> weak_cu)
|
||||
: strong_ptr_(std::nullopt), weak_ptr_(std::move(weak_cu)) {}
|
||||
: strong_ptr_(c10::nullopt), weak_ptr_(std::move(weak_cu)) {}
|
||||
|
||||
std::shared_ptr<torch::jit::CompilationUnit> getStrongRefOrThrow() const {
|
||||
TORCH_INTERNAL_ASSERT(strong_ptr_ != std::nullopt);
|
||||
TORCH_INTERNAL_ASSERT(strong_ptr_ != c10::nullopt);
|
||||
return *strong_ptr_;
|
||||
}
|
||||
|
||||
std::weak_ptr<torch::jit::CompilationUnit> getWeakRefOrThrow() const {
|
||||
TORCH_INTERNAL_ASSERT(weak_ptr_ != std::nullopt);
|
||||
TORCH_INTERNAL_ASSERT(weak_ptr_ != c10::nullopt);
|
||||
return *weak_ptr_;
|
||||
}
|
||||
|
||||
bool holdingStrongRef() const {
|
||||
return strong_ptr_ != std::nullopt;
|
||||
return strong_ptr_ != c10::nullopt;
|
||||
}
|
||||
|
||||
bool holdingEmptyStrongRef() const {
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
#include <condition_variable>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
@ -910,7 +909,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
|
||||
using WeakStorage = c10::weak_intrusive_ptr<c10::StorageImpl>;
|
||||
void markCompleted(
|
||||
IValue value,
|
||||
std::optional<std::vector<WeakStorage>> storages = std::nullopt) {
|
||||
std::optional<std::vector<WeakStorage>> storages = c10::nullopt) {
|
||||
// Start by performing all steps that can throw, before setting any field.
|
||||
// Do this before even acquiring the mutex, because extractStorages might
|
||||
// acquire the GIL, which could lead to a lock inversion with our mutex.
|
||||
@ -1587,11 +1586,11 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
|
||||
c10::intrusive_ptr<Object> copy() const;
|
||||
|
||||
c10::intrusive_ptr<Object> deepcopy(
|
||||
std::optional<at::Device> device = std::nullopt) const;
|
||||
std::optional<at::Device> device = c10::nullopt) const;
|
||||
|
||||
c10::intrusive_ptr<Object> deepcopy(
|
||||
IValue::HashIdentityIValueMap& memo,
|
||||
std::optional<at::Device> device = std::nullopt) const;
|
||||
std::optional<at::Device> device = c10::nullopt) const;
|
||||
|
||||
bool is_weak_compilation_ref() const {
|
||||
return !type_.holds_strong_ref();
|
||||
@ -1614,7 +1613,7 @@ struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
|
||||
public:
|
||||
virtual PyObject* getPyObject() = 0;
|
||||
virtual c10::InferredType tryToInferType() = 0;
|
||||
virtual IValue toIValue(const TypePtr& type, std::optional<int32_t> N = std::nullopt) = 0;
|
||||
virtual IValue toIValue(const TypePtr& type, std::optional<int32_t> N = c10::nullopt) = 0;
|
||||
virtual std::string toStr() = 0;
|
||||
virtual std::vector<at::Tensor> extractTensors() = 0;
|
||||
|
||||
@ -1912,7 +1911,7 @@ std::unordered_map<K, V> generic_to(
|
||||
template <typename T>
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
|
||||
if (ivalue.isNone()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return std::move(ivalue).to<T>();
|
||||
}
|
||||
@ -2281,7 +2280,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
|
||||
}
|
||||
}
|
||||
|
||||
inline IValue::IValue(std::nullopt_t) : IValue() {}
|
||||
inline IValue::IValue(c10::nullopt_t) : IValue() {}
|
||||
|
||||
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
|
||||
: tag(Tag::Object) {
|
||||
@ -2364,7 +2363,7 @@ inline const std::string& IValue::toStringRef() const {
|
||||
inline std::optional<std::reference_wrapper<const std::string>> IValue::
|
||||
toOptionalStringRef() const {
|
||||
if (isNone()) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
AT_ASSERT(isString(), "Expected optional<string> but got ", tagKind());
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
@ -2392,7 +2391,7 @@ inline PyObject* IValue::toPyObject() const {
|
||||
template <typename T>
|
||||
inline optional<T> IValue::toOptional() {
|
||||
if (this->isNone()) {
|
||||
return std::nullopt;
|
||||
return nullopt;
|
||||
}
|
||||
return this->to<T>();
|
||||
}
|
||||
@ -2400,7 +2399,7 @@ inline optional<T> IValue::toOptional() {
|
||||
template <typename T>
|
||||
inline optional<T> IValue::toOptional() const {
|
||||
if (this->isNone()) {
|
||||
return std::nullopt;
|
||||
return nullopt;
|
||||
}
|
||||
return this->to<T>();
|
||||
}
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include <ATen/core/type_factory.h>
|
||||
#include <ATen/core/qualified_name.h>
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/core/SymFloat.h>
|
||||
#include <c10/core/SymBool.h>
|
||||
#include <c10/core/Device.h>
|
||||
@ -187,7 +187,7 @@ struct OptionalType;
|
||||
using OptionalTypePtr = std::shared_ptr<OptionalType>;
|
||||
// This type represents an optional type. There is one `Optional` for
|
||||
// each element type. `Optional[T]` can accept both `T` and
|
||||
// `None`(`std::nullopt` in C++)
|
||||
// `None`(`c10::nullopt` in C++)
|
||||
// Subtype hierarchy for Optional:
|
||||
// - Optional[T] <: Optional[R] iff T <: R
|
||||
// - T <: Optional[R] if T <: R
|
||||
@ -372,10 +372,10 @@ inline ShapeSymbol merge_primitive(
|
||||
// dims, partially known and fully known shapes are all supported.
|
||||
struct TORCH_API SymbolicShape {
|
||||
// Unranked shape constructor.
|
||||
SymbolicShape() : dims_(std::nullopt) {}
|
||||
SymbolicShape() : dims_(c10::nullopt) {}
|
||||
|
||||
// Known rank but unknown dimentions.
|
||||
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
|
||||
SymbolicShape(std::optional<size_t> rank) : dims_(c10::nullopt) {
|
||||
if(!rank) {
|
||||
return;
|
||||
}
|
||||
@ -432,7 +432,7 @@ struct TORCH_API SymbolicShape {
|
||||
// Returns rank or nullopt in case of unranked shape.
|
||||
std::optional<size_t> rank() const {
|
||||
if(!dims_) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
return dims_->size();
|
||||
}
|
||||
@ -443,7 +443,7 @@ struct TORCH_API SymbolicShape {
|
||||
|
||||
std::optional<std::vector<bool>> symbolicDims() const {
|
||||
if (!dims_) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
auto symbolic_dims = std::vector<bool>();
|
||||
for (const ShapeSymbol& s : *dims_) {
|
||||
@ -505,7 +505,7 @@ struct VaryingShape {
|
||||
VaryingShape(c10::ArrayRef<T> vec)
|
||||
: VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {}
|
||||
|
||||
VaryingShape(std::optional<size_t> size = std::nullopt) : dims_(std::nullopt) {
|
||||
VaryingShape(std::optional<size_t> size = c10::nullopt) : dims_(c10::nullopt) {
|
||||
if (size) {
|
||||
dims_ = ListOfOptionalElements(*size);
|
||||
}
|
||||
@ -528,7 +528,7 @@ struct VaryingShape {
|
||||
|
||||
std::optional<size_t> size() const {
|
||||
if (!dims_) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
const auto& dims = dims_.value();
|
||||
return dims.size();
|
||||
@ -542,13 +542,13 @@ struct VaryingShape {
|
||||
|
||||
std::optional<std::vector<T>> concrete_sizes() const {
|
||||
if (!dims_) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
std::vector<T> sizes;
|
||||
sizes.reserve(dims_.value().size());
|
||||
for (auto d : *dims_) {
|
||||
if (!d) {
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
sizes.push_back(d.value());
|
||||
}
|
||||
@ -780,7 +780,7 @@ struct TORCH_API TensorType : public SharedType {
|
||||
|
||||
TensorTypePtr withPossiblyUndefined() {
|
||||
auto r = clone();
|
||||
r->undefined_ = std::nullopt;
|
||||
r->undefined_ = c10::nullopt;
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -854,9 +854,9 @@ struct TORCH_API TensorType : public SharedType {
|
||||
// with `withUndefined`
|
||||
// This will also mean that `undefined` tensors will fail
|
||||
// `subtypeOf(TensorType::get())` check
|
||||
// undefined_ may become `std::nullopt` if the tensor was observed to be both
|
||||
// undefined_ may become `c10::nullopt` if the tensor was observed to be both
|
||||
// defined and undefined. However, no tensor type starts out with
|
||||
// `undefined_` set to `std::nullopt`
|
||||
// `undefined_` set to `c10::nullopt`
|
||||
std::optional<bool> undefined_;
|
||||
// Represents whether or not this type was inferred.
|
||||
bool is_inferred_ = false;
|
||||
@ -1161,7 +1161,7 @@ struct TORCH_API TupleType : public NamedType {
|
||||
std::vector<TypePtr> types) {
|
||||
return TupleTypePtr(new TupleType(
|
||||
std::move(types),
|
||||
std::nullopt,
|
||||
c10::nullopt,
|
||||
nullptr)); // NOLINT(modernize-make-shared)
|
||||
}
|
||||
static TupleTypePtr create() {
|
||||
@ -1739,7 +1739,7 @@ inline TypePtr TensorType::fromNumberType(const Type& typ) {
|
||||
} else if (typ.isSubtypeOf(*BoolType::get())) {
|
||||
return TensorType::createContiguous(at::kBool, at::kCPU, {});
|
||||
} else if (typ.kind() == NumberType::Kind) {
|
||||
return TensorType::create(std::nullopt, at::kCPU, {}, std::nullopt);
|
||||
return TensorType::create(c10::nullopt, at::kCPU, {}, c10::nullopt);
|
||||
}
|
||||
TORCH_CHECK(false, "Unknown number type: ", typ.str());
|
||||
}
|
||||
@ -1755,7 +1755,7 @@ inline std::optional<c10::ScalarType> tryScalarTypeFromJitType(const Type& type)
|
||||
} else if (type == *BoolType::get()) {
|
||||
return at::ScalarType::Bool;
|
||||
}
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
|
||||
inline at::ScalarType scalarTypeFromJitType(const Type& type) {
|
||||
@ -2040,7 +2040,7 @@ struct getMaybeFakeTypePtr_<c10::Dict<K, V>, fake> final {
|
||||
};
|
||||
|
||||
template <class T, bool fake>
|
||||
struct getMaybeFakeTypePtr_<std::optional<T>, fake> final {
|
||||
struct getMaybeFakeTypePtr_<at::optional<T>, fake> final {
|
||||
static const auto& call() {
|
||||
static auto inner_type = getMaybeFakeTypePtr_<T, fake>::call();
|
||||
// The "per optional<T>" static singleton needs to live in a .cpp file,
|
||||
@ -2131,7 +2131,7 @@ struct MatchTypeReturn {
|
||||
|
||||
private:
|
||||
MatchTypeReturn()
|
||||
: reason_(std::nullopt) {}
|
||||
: reason_(c10::nullopt) {}
|
||||
std::optional<std::string> reason_; // is there is no match, this contains the reason
|
||||
};
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <optional>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
@ -73,7 +73,7 @@ struct Type;
|
||||
struct SharedType;
|
||||
|
||||
// Use this to customize how a Type is printed using `annotation_str()`. If
|
||||
// std::nullopt is returned, `annotation_str()` falls through to its default
|
||||
// c10::nullopt is returned, `annotation_str()` falls through to its default
|
||||
// implementation.
|
||||
using TypePrinter = std::function<std::optional<std::string>(const Type&)>;
|
||||
|
||||
@ -455,7 +455,7 @@ struct TORCH_API Type {
|
||||
// this method.
|
||||
std::string annotation_str(const TypePrinter& printer) const {
|
||||
if (printer) {
|
||||
// the printer can return std::nullopt to fall through to the default impl
|
||||
// the printer can return nullopt to fall through to the default impl
|
||||
if (auto renamed = printer(*this)) {
|
||||
return *renamed;
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ void Library::reset() {
|
||||
|
||||
Library::Library(Kind kind, std::string ns, std::optional<c10::DispatchKey> k, const char* file, uint32_t line)
|
||||
: kind_(kind)
|
||||
, ns_(ns == "_" ? std::nullopt : std::make_optional(std::move(ns)))
|
||||
, ns_(ns == "_" ? c10::nullopt : c10::make_optional(std::move(ns)))
|
||||
, dispatch_key_(k.value_or(CatchAll) == CatchAll ? std::optional<c10::DispatchKey>() : k)
|
||||
, file_(file)
|
||||
, line_(line)
|
||||
|
@ -88,7 +88,7 @@ std::optional<std::string> findSchemaDifferences(
|
||||
}
|
||||
|
||||
// no differences found
|
||||
return std::nullopt;
|
||||
return c10::nullopt;
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
@ -71,7 +71,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(
|
||||
opName,
|
||||
" because there is no kernel specified.");
|
||||
|
||||
std::optional<FunctionSchema> inferred_schema = std::nullopt;
|
||||
std::optional<FunctionSchema> inferred_schema = c10::nullopt;
|
||||
for (const auto& kernel : options.kernels) {
|
||||
if (nullptr != kernel.inferred_function_schema.get()) {
|
||||
if (!inferred_schema.has_value()) {
|
||||
|
@ -76,7 +76,7 @@ public:
|
||||
// internal-only for registering stack based catch-all kernels
|
||||
template<KernelFunction::BoxedKernelFunction* kernel_func>
|
||||
Options&& catchAllKernel() && {
|
||||
return std::move(*this).kernel(std::nullopt, KernelFunction::makeFromBoxedFunction<kernel_func>(), nullopt, nullptr);
|
||||
return std::move(*this).kernel(c10::nullopt, KernelFunction::makeFromBoxedFunction<kernel_func>(), nullopt, nullptr);
|
||||
}
|
||||
|
||||
// internal only for registering caffe2 ops
|
||||
@ -215,7 +215,7 @@ public:
|
||||
static_assert(std::is_constructible<KernelFunctor, ConstructorParameters...>::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel<Functor>(arguments...) must match one of the constructors of Functor.");
|
||||
|
||||
return std::move(*this).kernel(
|
||||
std::nullopt,
|
||||
c10::nullopt,
|
||||
KernelFunction::makeFromUnboxedFunctor<false, KernelFunctor>(std::make_unique<KernelFunctor>(std::forward<ConstructorParameters>(constructorParameters)...)),
|
||||
impl::CppSignature::make<KernelFunctor>(),
|
||||
detail::inferFunctionSchemaFromFunctor<KernelFunctor>()
|
||||
@ -272,7 +272,7 @@ public:
|
||||
static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");
|
||||
|
||||
return std::move(*this).kernel(
|
||||
std::nullopt,
|
||||
c10::nullopt,
|
||||
KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)),
|
||||
impl::CppSignature::make<FuncType>(),
|
||||
// TODO Do schema inference without relying on WrapFunctionIntoFunctor
|
||||
@ -302,7 +302,7 @@ public:
|
||||
TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");
|
||||
|
||||
return std::move(*this).kernel(
|
||||
std::nullopt,
|
||||
c10::nullopt,
|
||||
KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func),
|
||||
impl::CppSignature::make<FuncType>(),
|
||||
// TODO Do schema inference without relying on WrapFunctionIntoFunctor
|
||||
@ -384,7 +384,7 @@ public:
|
||||
static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel<Functor>() instead.");

return std::move(*this).kernel(
std::nullopt,
c10::nullopt,
KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(lambda)),
impl::CppSignature::make<Lambda>(),
// TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
@ -410,18 +410,18 @@ public:
}

Options()
: schemaOrName_(std::nullopt)
: schemaOrName_(c10::nullopt)
, kernels()
, aliasAnalysisKind_(std::nullopt)
, aliasAnalysisKind_(c10::nullopt)
{}

// KernelRegistrationConfig accumulates all information from the config
// parameters passed to a RegisterOperators::op() call into one object.
struct KernelRegistrationConfig final {
KernelRegistrationConfig()
: dispatch_key(std::nullopt)
: dispatch_key(c10::nullopt)
, func()
, cpp_signature(std::nullopt)
, cpp_signature(c10::nullopt)
, inferred_function_schema(nullptr)
{}

@ -522,7 +522,7 @@ public:
op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && {
constexpr bool AllowLegacyTypes = true;
return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
std::nullopt,
c10::nullopt,
KernelFunction::makeFromUnboxedRuntimeFunction<AllowLegacyTypes>(func),
impl::CppSignature::make<FuncType>(),
// TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
@ -553,7 +553,7 @@ public:

constexpr bool AllowLegacyTypes = true;
return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
std::nullopt,
c10::nullopt,
KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
impl::CppSignature::make<Lambda>(),
// TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
@ -570,7 +570,7 @@ public:

constexpr bool AllowLegacyTypes = true;
return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
std::nullopt,
c10::nullopt,
KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
impl::CppSignature::make<Lambda>(),
// TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor

@ -909,28 +909,28 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) {

// optional types (with has_value() == false)
testArgTypes<std::optional<double>>::test(
std::optional<double>(std::nullopt), [] (const std::optional<double>& v) {EXPECT_FALSE(v.has_value());},
std::optional<double>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<double>(c10::nullopt), [] (const std::optional<double>& v) {EXPECT_FALSE(v.has_value());},
std::optional<double>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(float? a) -> float?");
testArgTypes<std::optional<int64_t>>::test(
std::optional<int64_t>(std::nullopt), [] (const std::optional<int64_t>& v) {EXPECT_FALSE(v.has_value());},
std::optional<int64_t>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<int64_t>(c10::nullopt), [] (const std::optional<int64_t>& v) {EXPECT_FALSE(v.has_value());},
std::optional<int64_t>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(int? a) -> int?");
testArgTypes<std::optional<bool>>::test(
std::optional<bool>(std::nullopt), [] (const std::optional<bool>& v) {EXPECT_FALSE(v.has_value());},
std::optional<bool>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<bool>(c10::nullopt), [] (const std::optional<bool>& v) {EXPECT_FALSE(v.has_value());},
std::optional<bool>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(bool? a) -> bool?");
testArgTypes<std::optional<bool>>::test(
std::optional<bool>(std::nullopt), [] (const std::optional<bool>& v) {EXPECT_FALSE(v.has_value());},
std::optional<bool>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<bool>(c10::nullopt), [] (const std::optional<bool>& v) {EXPECT_FALSE(v.has_value());},
std::optional<bool>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(bool? a) -> bool?");
testArgTypes<std::optional<std::string>>::test(
std::optional<std::string>(std::nullopt), [] (const std::optional<std::string>& v) {EXPECT_FALSE(v.has_value());},
std::optional<std::string>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<std::string>(c10::nullopt), [] (const std::optional<std::string>& v) {EXPECT_FALSE(v.has_value());},
std::optional<std::string>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(str? a) -> str?");
testArgTypes<std::optional<Tensor>>::test(
std::optional<Tensor>(std::nullopt), [] (const std::optional<Tensor>& v) {EXPECT_FALSE(v.has_value());},
std::optional<Tensor>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<Tensor>(c10::nullopt), [] (const std::optional<Tensor>& v) {EXPECT_FALSE(v.has_value());},
std::optional<Tensor>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(Tensor? a) -> Tensor?");

@ -1136,8 +1136,8 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) {

// Test optional of list (with nullopt)
testArgTypes<std::optional<c10::List<int64_t>>>::test(
std::optional<c10::List<int64_t>>(std::nullopt), [] (const std::optional<c10::List<int64_t>>& v) {EXPECT_FALSE(v.has_value());},
std::optional<c10::List<int64_t>>(std::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
std::optional<c10::List<int64_t>>(c10::nullopt), [] (const std::optional<c10::List<int64_t>>& v) {EXPECT_FALSE(v.has_value());},
std::optional<c10::List<int64_t>>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());},
"(int[]? a) -> int[]?");

// Test optional of list (with empty list)
@ -1160,8 +1160,8 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) {

// Test list of optional (with values)
testArgTypes<c10::List<::std::optional<int64_t>>>::test(
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, std::nullopt, 2})), [] (const c10::List<::std::optional<int64_t>>& v) {expectListEquals<std::optional<int64_t>>({3, std::nullopt, 2}, v);},
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, std::nullopt, 2})), [] (const IValue& v) {expectListEquals<std::optional<int64_t>>({3, std::nullopt, 2}, v.to<c10::List<::std::optional<int64_t>>>());},
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional<int64_t>>& v) {expectListEquals<std::optional<int64_t>>({3, c10::nullopt, 2}, v);},
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals<std::optional<int64_t>>({3, c10::nullopt, 2}, v.to<c10::List<::std::optional<int64_t>>>());},
"(int?[] a) -> int?[]");

// dict types
@ -2141,7 +2141,7 @@ TEST(OperatorRegistrationTest, callKernelsWithDispatchKeySetConvention_mixedCall

TEST(OperatorRegistrationTest, getRegistrationsForDispatchKey) {
// should return every registered op
auto all_ops = Dispatcher::singleton().getRegistrationsForDispatchKey(std::nullopt);
auto all_ops = Dispatcher::singleton().getRegistrationsForDispatchKey(c10::nullopt);
// should return every registered op with a cpu kernel
auto cpu_ops = Dispatcher::singleton().getRegistrationsForDispatchKey(c10::DispatchKey::CPU);
ASSERT_TRUE(all_ops.size() > 0);
@ -2,11 +2,11 @@

#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <c10/util/string_view.h>
#include <optional>
#include <ostream>
#include <string>
#include <utility>
#include <ostream>

namespace c10 {

@ -26,9 +26,9 @@ struct OperatorName final {
std::optional<c10::string_view> getNamespace() const {
auto pos = name.find("::");
if (pos == std::string::npos) {
return std::nullopt;
return c10::nullopt;
} else {
return std::make_optional(c10::string_view(name.data(), pos));
return c10::make_optional(c10::string_view(name.data(), pos));
}
}

@ -39,8 +39,7 @@ struct OperatorName final {
const auto old_name_size = name.size();
name.resize(ns_len + 2 + old_name_size);
// Shift current value of name to the end of the new space.
name.replace(
name.size() - old_name_size, old_name_size, name, 0, old_name_size);
name.replace(name.size() - old_name_size, old_name_size, name, 0, old_name_size);
name.replace(0, ns_len, ns, ns_len);
name[ns_len] = ':';
name[ns_len + 1] = ':';
@ -57,10 +56,8 @@ struct OperatorName final {
struct OperatorNameView final {
c10::string_view name;
c10::string_view overload_name;
constexpr OperatorNameView(
c10::string_view name,
c10::string_view overload_name)
: name(name), overload_name(overload_name) {}
constexpr OperatorNameView(c10::string_view name, c10::string_view overload_name)
: name(name), overload_name(overload_name) {}
// Parses strings like "foo.overload" and also "foo"
constexpr static OperatorNameView parse(c10::string_view full_name) {
auto i = full_name.find('.');
@ -86,11 +83,10 @@ TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
} // namespace c10

namespace std {
template <>
struct hash<::c10::OperatorName> {
size_t operator()(const ::c10::OperatorName& x) const {
return std::hash<std::string>()(x.name) ^
(~std::hash<std::string>()(x.overload_name));
}
};
} // namespace std
template <>
struct hash<::c10::OperatorName> {
size_t operator()(const ::c10::OperatorName& x) const {
return std::hash<std::string>()(x.name) ^ (~ std::hash<std::string>()(x.overload_name));
}
};
}
@ -350,7 +350,7 @@ VaryingShape<int64_t> TensorType::sizes() const {
// we turn symbolic shapes into unknowns
return ss.is_static()
? std::optional<int64_t>(ss.static_size())
: std::nullopt;
: c10::nullopt;
}));
}

@ -456,7 +456,7 @@ TensorTypePtr TensorType::createContiguous(
device,
VaryingShape<int64_t>(sizes),
VaryingShape<int64_t>(strides),
std::nullopt);
c10::nullopt);
}

const SymbolicShape& TensorType::symbolic_sizes() const {
@ -403,14 +403,14 @@ static std::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t
auto tuple1 = t1->castRaw<TupleType>();
auto tuple2 = t2->castRaw<TupleType>();
if (tuple1->elements().size() != tuple2->elements().size()) {
return std::nullopt;
return c10::nullopt;
}
std::vector<TypePtr> elements;
for (size_t i = 0; i < tuple1->elements().size(); i++) {
if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_union)) {
elements.push_back(*std::move(elem));
} else {
return std::nullopt;
return c10::nullopt;
}
}
return static_cast<TypePtr>(TupleType::create(std::move(elements)));
@ -443,7 +443,7 @@ static std::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t
return type_hint;
}

return std::nullopt;
return c10::nullopt;
}

std::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, const TypePtr& type_hint) {
@ -463,7 +463,7 @@ std::optional<TypePtr> unifyTypeList(
const TypePtr& type_hint) {
if (elements.empty()) {
why_not << "Cannot get unified type from empty list";
return std::nullopt;
return c10::nullopt;
}

TypePtr ret_type = elements.at(0);
@ -474,7 +474,7 @@ std::optional<TypePtr> unifyTypeList(
<< elements.at(i)->repr_str()
<< " did not match the types before it ("
<< ret_type->repr_str() << ")";
return std::nullopt;
return c10::nullopt;
}
ret_type = *maybe_unified;
}
@ -50,7 +50,7 @@ std::optional<TypePtr> subtractTypeSetFrom(std::vector<TypePtr>& to_subtract, Ar
});

if (types.empty()) {
return std::nullopt;
return c10::nullopt;
} else if (types.size() == 1) {
return types[0];
} else {
@ -98,7 +98,7 @@ void filterDuplicateSubtypes(std::vector<TypePtr>* types) {
// `Optional` could prevent us from coalescing other types
if ((t1->isSubtypeOf(*NoneType::get()) && !t2->isSubtypeOf(*NoneType::get()))
|| (!t1->isSubtypeOf(*NoneType::get()) && t2->isSubtypeOf(*NoneType::get()))) {
return std::nullopt;
return c10::nullopt;
} else {
return unifyTypes(t1, t2, /*default_to_union=*/false);
}
@ -278,7 +278,7 @@ std::optional<TypePtr> UnionType::subtractTypeSet(std::vector<TypePtr>& to_subtr

std::optional<TypePtr> UnionType::toOptional() const {
if (!canHoldType(*NoneType::get())) {
return std::nullopt;
return c10::nullopt;
}

std::vector<TypePtr> copied_types = this->containedTypes().vec();
@ -286,7 +286,7 @@ std::optional<TypePtr> UnionType::toOptional() const {
auto maybe_opt = UnionType::create(std::move(copied_types));

if (maybe_opt->kind() == UnionType::Kind) {
return std::nullopt;
return c10::nullopt;
} else {
return maybe_opt;
}
@ -8,29 +8,6 @@

namespace at::vec {

template <typename scalar_t>
inline Vectorized<scalar_t> div_floor_floating_vec(
const Vectorized<scalar_t>& a,
const Vectorized<scalar_t>& b) {
using vec_t = Vectorized<scalar_t>;
const auto basic_div = a / b;
vec_t inf(std::numeric_limits<scalar_t>::infinity());
auto mod = a.fmod(b);
// Fixup for a case that isn't properly handled by Sleef_fmod
auto floor = vec_t::blendv(a - mod, a, (basic_div.abs() == inf) & (a.abs() != inf));
auto div = floor / b;
const auto zero = vec_t(0);
auto mask = (mod != zero) & ((b < zero) ^ (mod < zero));
const auto one = vec_t(1);
div = vec_t::blendv(div, div - one, mask);
auto floordiv = div.floor();
mask = (div - floordiv) > vec_t(0.5);
floordiv = vec_t::blendv(floordiv, floordiv + one, mask);
floordiv = vec_t::blendv(floordiv, zero.copysign(basic_div), div == zero);
floordiv = vec_t::blendv(floordiv, basic_div, b == zero);
return floordiv;
};

// slow path
template <typename scalar_t, typename Op>
inline scalar_t vec_reduce_all(
@ -346,7 +346,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abcType = CUDA_R_16BF;
} else {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name());
}

globalContext().alertCuBLASConfigNotDeterministic();
@ -456,7 +456,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {

template <typename Dtype>
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented");
AT_ERROR("at::cuda::blas::bgemm_internal_cublas: not implemented for ", typeid(Dtype).name());
}

template <>
@ -789,7 +789,7 @@ inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {

template <typename Dtype>
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
AT_ERROR("at::cuda::blas::gemm_internal_cublas: not implemented for ", typeid(Dtype).name());
}

template <>
@ -326,7 +326,7 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;

auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
auto rng_state = state_tensor.data_ptr<uint8_t>();
auto current_seed = this->current_seed();
auto offset = static_cast<int64_t>(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic<int64_t>
@ -100,7 +100,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt

// default generator is always registered
auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
std::nullopt, cuda::detail::getDefaultCUDAGenerator());
c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
gen->register_graph(this);

for (auto& [generator_state, wholegraph_increments] :
@ -68,9 +68,7 @@ TensorBase empty_strided_cuda(
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt) {
TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned");
// TODO: remove check for jagged, see https://github.com/pytorch/pytorch/issues/130073
const auto layout = layout_or_default(layout_opt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout == Layout::Strided || layout == Layout::Jagged);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);

const auto dtype = dtype_or_default(dtype_opt);
return at::detail::empty_strided_cuda(size, stride, dtype, device_opt);
@ -440,20 +440,6 @@ int CUDAHooks::getNumGPUs() const {
return at::cuda::device_count();
}

#ifdef USE_ROCM
bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const {
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index);
std::string device_arch = prop->gcnArchName;
for (std::string arch : archs) {
size_t substring = device_arch.find(arch);
if (substring != std::string::npos) {
return true;
}
}
return false;
}
#endif

void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
c10::cuda::device_synchronize();
@ -3,7 +3,7 @@
#include <ATen/detail/CUDAHooksInterface.h>

#include <ATen/Generator.h>
#include <optional>
#include <c10/util/Optional.h>

// TODO: No need to have this whole header, we can just put it all in
// the cpp file
@ -49,9 +49,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override;
void cuFFTClearPlanCache(DeviceIndex device_index) const override;
int getNumGPUs() const override;
#ifdef USE_ROCM
bool isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const override;
#endif
void deviceSynchronize(DeviceIndex device_index) const override;
};

@ -19,12 +19,14 @@ inline cudnnDataType_t getDataType(const at::Tensor& t) {
} else if (scalar_type == at::kDouble) {
return CUDNN_DATA_DOUBLE;
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
else if (scalar_type == at::kBFloat16) {
return CUDNN_DATA_BFLOAT16;
} else if (scalar_type == at::kQInt8) {
return CUDNN_DATA_INT8;
}
TORCH_CHECK(false, "TensorDescriptor does not support ", scalar_type);
#endif
throw std::runtime_error("TensorDescriptor only supports double, float and half tensors");
}

} // anonymous namespace
@ -55,7 +57,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr
void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) {
size_t dim = t_sizes.size();
if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX)
TORCH_CHECK(false, "cuDNN supports only up to ", CUDNN_DIM_MAX, " dimensions");
#define _STR(X) #X
#define STR(X) _STR(X)
throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions");
#undef _STR
#undef STR
int size[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
for (const auto i : c10::irange(dim)) {
@ -77,18 +83,22 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
return "CUDNN_DATA_DOUBLE";
case CUDNN_DATA_HALF:
return "CUDNN_DATA_HALF";
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
case CUDNN_DATA_BFLOAT16:
return "CUDNN_DATA_BFLOAT16";
#endif
case CUDNN_DATA_INT8:
return "CUDNN_DATA_INT8";
case CUDNN_DATA_INT32:
return "CUDNN_DATA_INT32";
case CUDNN_DATA_INT8x4:
return "CUDNN_DATA_INT8x4";
#if CUDNN_VERSION >= 7100
case CUDNN_DATA_UINT8:
return "CUDNN_DATA_UINT8";
case CUDNN_DATA_UINT8x4:
return "CUDNN_DATA_UINT8x4";
#endif
default:
std::ostringstream oss;
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
@ -124,7 +134,11 @@ void TensorDescriptor::print() { std::cout << *this; }
void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) {
auto dim = t.ndimension();
if (dim > CUDNN_DIM_MAX || pad > CUDNN_DIM_MAX)
TORCH_CHECK(false, "cuDNN supports only up to ", CUDNN_DIM_MAX, " dimensions");
#define _STR(X) #X
#define STR(X) _STR(X)
throw std::runtime_error("cuDNN supports only up to " STR(CUDNN_DIM_MAX) " dimensions");
#undef _STR
#undef STR
// NB: It is possible for this test to be insufficient, because the
// Tensor passed in to set the filter descriptor may not be the actual
// Tensor whose data pointer is passed to cuDNN. Nevertheless,
@ -31,7 +31,9 @@ std::string cudnnTypeToString(cudnnDataType_t dtype);
inline int dataSize(cudnnDataType_t dataType)
{
switch (dataType) {
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
case CUDNN_DATA_BFLOAT16:
#endif
case CUDNN_DATA_HALF: return 2;
case CUDNN_DATA_FLOAT: return 4;
default: return 8;
@ -13,7 +13,9 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) {
return CUDNN_DATA_DOUBLE;
} else if (dtype == at::kHalf) {
return CUDNN_DATA_HALF;
} else if (dtype == at::kBFloat16) {
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
else if (dtype == at::kBFloat16) {
return CUDNN_DATA_BFLOAT16;
} else if (dtype == at::kInt) {
return CUDNN_DATA_INT32;
@ -22,6 +24,7 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) {
} else if (dtype == at::kChar) {
return CUDNN_DATA_INT8;
}
#endif
std::string msg("getCudnnDataTypeFromScalarType() not supported for ");
msg += toString(dtype);
throw std::runtime_error(msg);
@ -186,12 +186,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return 0;
}

#ifdef USE_ROCM
virtual bool isGPUArch(DeviceIndex /*device_index*/, const std::vector<std::string>& /*archs*/) const {
TORCH_CHECK(false, "Cannot check GPU arch without ATen_cuda library. ", CUDA_HELP);
}
#endif

virtual void deviceSynchronize(DeviceIndex /*device_index*/) const {
TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP);
}
@ -89,8 +89,6 @@ typedef enum {
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
/*! \brief Microsoft AI Accelerator */
kDLMAIA = 17,
} DLDeviceType;

/*!
@ -149,7 +149,7 @@ static std::tuple<Tensor,optional<int64_t>> linspace_Tensor_Tensor_batch_rule(
std::optional<at::Layout> layout,
std::optional<at::Device> device,
std::optional<bool> pin_memory){
return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, std::nullopt, dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory);
}

static std::tuple<Tensor,optional<int64_t>> linspace_Tensor_Scalar_batch_rule(
@ -162,7 +162,7 @@ static std::tuple<Tensor,optional<int64_t>> linspace_Tensor_Scalar_batch_rule(
std::optional<bool> pin_memory){

auto end_t = at::native::wrapped_scalar_tensor(end, start.device());
return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, std::nullopt, steps, std::nullopt, dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::nullopt, dtype, layout, device, pin_memory);
}

static std::tuple<Tensor,optional<int64_t>> linspace_Scalar_Tensor_batch_rule(
@ -175,7 +175,7 @@ static std::tuple<Tensor,optional<int64_t>> linspace_Scalar_Tensor_batch_rule(
std::optional<bool> pin_memory){

auto start_t = at::native::wrapped_scalar_tensor(start, end.device());
return linspace_logspace_batch_rule_helper(start_t, std::nullopt, end, end_bdim, steps, std::nullopt, dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory);
}

static std::tuple<Tensor,optional<int64_t>> logspace_Tensor_Tensor_batch_rule(
@ -187,7 +187,7 @@ static std::tuple<Tensor,optional<int64_t>> logspace_Tensor_Tensor_batch_rule(
std::optional<at::Layout> layout,
std::optional<at::Device> device,
std::optional<bool> pin_memory){
return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, std::make_optional(base), dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory);
}

static std::tuple<Tensor,optional<int64_t>> logspace_Tensor_Scalar_batch_rule(
@ -201,7 +201,7 @@ static std::tuple<Tensor,optional<int64_t>> logspace_Tensor_Scalar_batch_rule(
std::optional<bool> pin_memory){

auto end_t = at::native::wrapped_scalar_tensor(end, start.device());
return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, std::nullopt, steps, std::make_optional(base), dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::make_optional(base), dtype, layout, device, pin_memory);
}

static std::tuple<Tensor,optional<int64_t>> logspace_Scalar_Tensor_batch_rule(
@ -215,7 +215,7 @@ static std::tuple<Tensor,optional<int64_t>> logspace_Scalar_Tensor_batch_rule(
std::optional<bool> pin_memory){

auto start_t = at::native::wrapped_scalar_tensor(start, end.device());
return linspace_logspace_batch_rule_helper(start_t, std::nullopt, end, end_bdim, steps, std::make_optional(base), dtype, layout, device, pin_memory);
return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory);
}

static bool _has_same_storage_numel_batch_rule(const Tensor& a, const Tensor& b) {
@ -38,7 +38,7 @@ optional<int64_t> valIfNonempty(optional<int64_t> maybe_empty, int64_t new_val)
if (maybe_empty.has_value()) {
return new_val;
}
return std::nullopt;
return nullopt;
}

int64_t getPhysicalDim(const Tensor& tensor, bool has_batch_dim, int64_t logical_dim) {
@ -25,7 +25,7 @@ static optional<int64_t> compute_stat_bdim(
if (input_bdim.has_value() && !is_empty_tensor(stat)) {
return 0;
}
return std::nullopt;
return nullopt;
}

static Tensor padRight(const Tensor& tensor, optional<int64_t> has_bdim, int64_t logical_rank) {
@ -377,7 +377,7 @@ namespace {
// /aten/src/ATen/native/TensorAdvancedIndexing.cpp#L379-L405
VmapDimVector get_indexed_shape(Tensor self, const torch::List<std::optional<at::Tensor>> &orig)
{
at::native::checkIndexTensorTypes(orig, /*allow_int*/ true);
at::native::checkIndexTensorTypes(orig);
// first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
auto indices = at::native::expandTensors(self, orig);
// next broadcast all index tensors together
@ -393,7 +393,7 @@ std::optional<size_t> findAliasedOutput(const FunctionSchema& schema, const int6
return res_idx; // for everything currently in native_functions, each input aliases at most one output (tensor list counts as one output)
}
}
return std::nullopt;
return nullopt;
}

#ifdef HAS_TORCH_SHOW_DISPATCH_TRACE
@ -8,7 +8,7 @@
#include <ATen/functorch/Macros.h>
#include <c10/core/DispatchKey.h>
#include <ATen/core/function_schema.h>
#include <optional>
#include <c10/util/Optional.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <ATen/functorch/Interpreter.h>
#include <ATen/functorch/VmapInterpreter.h>
@ -3,7 +3,7 @@
#include <ATen/functorch/Macros.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <optional>
#include <c10/util/Optional.h>
#include <bitset>
#include <utility>
#include <variant>
@ -149,7 +149,7 @@ struct Interpreter {
}
void clearSavedLocalDispatchKeySet() {
TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
savedLocalDispatchKeySet_ = std::nullopt;
savedLocalDispatchKeySet_ = c10::nullopt;
}
c10::impl::LocalDispatchKeySet getSavedLocalDispatchKeySet() const {
TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
@ -289,7 +289,7 @@ static optional<c10::SymInt> maximum_indexable_location(
c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) {
auto result = native::storage_size_for(sizes, strides);
if (result == 0) {
return std::nullopt;
return nullopt;
}
return result + storage_offset;
}
@ -536,7 +536,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
// we'll just slice the tensor to get a Tensor of shape [0] to pass to at::cat.
std::vector<Tensor> tensors_to_cat;
tensors_to_cat.reserve(tensors.size());
std::optional<int64_t> bdim_size = std::nullopt;
std::optional<int64_t> bdim_size = c10::nullopt;

// find the bdim size. Might not exist if all BatchedTensors should be skipped
// by cat's special case.
@ -573,7 +573,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
}

auto new_dim = bdim_size.has_value() ? dim + 1 : dim;
std::optional<int64_t> new_bdim = bdim_size.has_value() ? std::make_optional((int64_t)0) : nullopt;
std::optional<int64_t> new_bdim = bdim_size.has_value() ? c10::make_optional((int64_t)0) : nullopt;
auto result = at::cat(tensors_to_cat, new_dim);
return makeBatched(result, new_bdim, get_current_level());
}
Some files were not shown because too many files have changed in this diff.