Compare commits


12 Commits

SHA1 Message Date
f10edf1ecd update int8 sdpa 2024-12-03 05:58:44 -05:00
9d645a6025 update int8 sdpa 2024-12-03 00:49:02 -08:00
676da3c16a update fa int8 2024-11-19 01:54:23 -05:00
46769004e5 add kernel for small size 2024-10-30 01:35:28 -07:00
9cb324d903 update int8 sdpa 2024-10-29 02:51:50 -07:00
325db8f2a3 update int8 sdpa 2024-10-21 18:15:19 -07:00
97922c4754 update fa_u8_brgemm 2024-10-17 20:01:34 -07:00
d72ab195da update fa_u8_brgemm 2024-10-17 19:49:32 -07:00
43b5c4101d int8 optimization 2024-10-14 23:45:57 -07:00
b640cf15ab test 2024-08-18 23:34:46 -07:00
67ccb2ce72 update fa u8 brgemm 2024-07-16 19:56:05 -07:00
4a2715e652 add fa u8 brgemm 2024-07-12 01:58:44 -07:00
1601 changed files with 24911 additions and 29480 deletions

View File

@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)

View File

@ -18,7 +18,6 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper

View File

@ -3,7 +3,6 @@ import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"

View File

@ -3,7 +3,6 @@ import sys
import numpy
sample_data_list = sys.argv[1:]
sample_data_list = [float(v.strip()) for v in sample_data_list]

View File

@ -1,7 +1,6 @@
import json
import sys
data_file_path = sys.argv[1]
commit_hash = sys.argv[2]

View File

@ -1,6 +1,5 @@
import sys
log_file_path = sys.argv[1]
with open(log_file_path) as f:

View File

@ -4,7 +4,6 @@ import os
import subprocess
import sys
COMMON_TESTS = [
(
"Checking that torch is available",

View File

@ -5,7 +5,6 @@ import sys
import yaml
# Need to import modules that lie on an upward-relative path
sys.path.append(os.path.join(sys.path[0], ".."))

View File

@ -118,18 +118,9 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
cd /tmp/libtorch
fi
if [[ "$GPU_ARCH_TYPE" == xpu ]]; then
# Workaround for __mkl_tmp_MOD unbound variable issue, refer https://github.com/pytorch/pytorch/issues/130543
set +u
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
fi
# Test the package
/builder/check_binary.sh
# Clean temp files
cd /builder && git clean -ffdx
# =================== The above code will be executed inside Docker container ===================
EOL
echo

View File

@ -100,20 +100,6 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
else
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
fi
fi
cat >"$envfile" <<EOL
# =================== The following code will be executed inside Docker container ===================
export TZ=UTC
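The removed hunk above builds a pip requirement string for the XPU Triton wheel, appending a ten-character short hash from a commit-pin file for dev builds. A minimal Python sketch of that same string construction, for illustration only (the function name and pin-file handling here are assumptions, not part of the original scripts):

```python
# Illustrative sketch only; names are hypothetical, not from the repo.
def triton_xpu_requirement(triton_version: str, build_version: str, pin_file: str) -> str:
    requirement = f"pytorch-triton-xpu=={triton_version}"
    if "dev" in build_version:
        # Mirror `cut -c1-10` on the ci_commit_pins file.
        with open(pin_file) as f:
            shorthash = f.read().strip()[:10]
        requirement = f"{requirement}+{shorthash}"
    return requirement
```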

View File

@ -29,11 +29,6 @@ if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
fi
# Sleep 2 minutes between retries for conda upload
retry () {
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")

View File

@ -8,7 +8,6 @@ import time
import requests
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
PIPELINE_ID = "911"

View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead

View File

@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
# 2024-01-02 clangformat: fused adam #116583
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
# 2024-06-28 enable UFMT in `torch/storage.py`
d80939e5e9337e8078f11489afefec59fd42f93b
# 2024-06-28 enable UFMT in `torch.utils.data`
7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3

View File

@ -47,5 +47,3 @@ self-hosted-runner:
- macos-latest-xlarge
- macos-13-xlarge
- macos-14-xlarge
# Organization-wide Intel hosted XPU runners
- linux.idc.xpu

View File

@ -36,8 +36,7 @@ runs:
"${DOCKER_IMAGE}"
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
@ -48,9 +47,10 @@ runs:
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
- name: Cleanup docker
if: always() && (env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel' || env.GPU_ARCH_TYPE == 'xpu')
if: always() && env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x or xpu stop the container for clean worker stop
# on s390x stop the container for clean worker stop
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop "${{ env.CONTAINER_NAME }}" || true
docker stop $(docker ps -q) || true

View File

@ -407,7 +407,7 @@
- torch/_inductor/codegen/cpp_template.py
- torch/_inductor/codegen/cpp_gemm_template.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/inductor/test_cpu_repro.py
- test/inductor/test_cpu_repo.py
- test/inductor/test_cpu_cpp_wrapper.py
- test/inductor/test_cpu_select_algorithm.py
- aten/src/ATen/cpu/**

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
import os
import shutil
import sys
@ -8,7 +7,6 @@ from subprocess import check_call
from tempfile import TemporaryDirectory
from typing import Optional
SCRIPT_DIR = Path(__file__).parent
REPO_DIR = SCRIPT_DIR.parent.parent

View File

@ -5,6 +5,7 @@ import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from label_utils import has_required_labels, is_label_err_comment, LABEL_ERR_MSG
from trymerge import GitHubPR

View File

@ -4,9 +4,11 @@ import json
import os
import re
from typing import Any, cast, Dict, List, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR

View File

@ -10,7 +10,6 @@ import requests
import rockset # type: ignore[import]
from gitutils import retries_decorator
LOGS_QUERY = """
with
shas as (

View File

@ -1,12 +1,10 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from typing import Any, cast, Dict, List, Set
import yaml
GITHUB_DIR = Path(__file__).parent.parent

View File

@ -1,6 +1,7 @@
import json
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import NamedTuple, Optional

View File

@ -9,7 +9,6 @@ from typing import Any, Callable, Dict, List, Set
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
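The retention constants above are plain seconds-based arithmetic: roughly 30 days for closed PRs and about 1.5 years otherwise. A tiny sketch of how such constants might be applied; the `past_retention` helper is hypothetical and not taken from the script:

```python
import time

SEC_IN_DAY = 24 * 60 * 60                      # 86_400 seconds
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY          # ~30 days
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY       # ~1.5 years

# Hypothetical usage: decide whether something is older than its retention window.
def past_retention(last_update_epoch: float, retention_s: float) -> bool:
    return time.time() - last_update_epoch > retention_s
```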

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
import yaml

View File

@ -14,6 +14,7 @@ import json
from typing import Any
import boto3 # type: ignore[import]
from label_utils import gh_get_labels

View File

@ -15,7 +15,6 @@ from urllib.request import Request, urlopen
import yaml
REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
PREFIX = "test-config/"
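REENABLE_TEST_REGEX above matches the usual "Fixes #NNNN" / "Closes &lt;issue URL&gt;" phrases case-insensitively, with the issue number in the last capture group. A small, self-contained illustration of how such a pattern extracts issue numbers; the surrounding script is not shown here, so this usage is an assumption:

```python
import re

REENABLE_TEST_REGEX = (
    "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) "
    "(#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
)

body = "Fixes #12345 and closes https://github.com/pytorch/pytorch/issues/67890"
# Group 6 is the trailing digit run, i.e. the issue number.
issues = [m.group(6) for m in re.finditer(REENABLE_TEST_REGEX, body)]
print(issues)  # ['12345', '67890']
```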

View File

@ -8,13 +8,11 @@ architectures:
* CPU
* Latest CUDA
* Latest ROCM
* Latest XPU
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1", "12.4"]
@ -26,7 +24,6 @@ CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
ROCM_ARCHES = ["6.0", "6.1"]
XPU_ARCHES = ["xpu"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -135,8 +132,6 @@ def arch_type(arch_version: str) -> str:
return "cuda"
elif arch_version in ROCM_ARCHES:
return "rocm"
elif arch_version in XPU_ARCHES:
return "xpu"
elif arch_version in CPU_CXX11_ABI_ARCH:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
@ -161,7 +156,6 @@ WHEEL_CONTAINER_IMAGES = {
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
@ -227,7 +221,6 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": "cu124",
"rocm": f"rocm{gpu_arch_version}",
"xpu": "xpu",
}.get(gpu_arch_type, gpu_arch_version)
@ -338,7 +331,7 @@ def generate_wheels_matrix(
# Define default compute archivectures
arches = ["cpu"]
if os == "linux":
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
elif os == "windows":
arches += CUDA_ARCHES
elif os == "linux-aarch64":
@ -361,14 +354,11 @@ def generate_wheels_matrix(
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "cuda-aarch64"
or arch_version == "xpu"
else arch_version
)
# TODO: Enable python 3.13 on rocm, xpu, aarch64, windows
if (
gpu_arch_type in ["rocm", "xpu"] or os != "linux"
) and python_version == "3.13":
# TODO: Enable python 3.13 on rocm, aarch64, windows
if (gpu_arch_type == "rocm" or os != "linux") and python_version == "3.13":
continue
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -410,7 +400,9 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"devtoolset": "",
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
@ -423,26 +415,6 @@ def generate_wheels_matrix(
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -453,9 +425,7 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi"
if arch_version in ["cpu-cxx11-abi", "xpu"]
else ""
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,

View File

@ -8,8 +8,8 @@ from typing import Dict, Iterable, List, Literal, Set
from typing_extensions import TypedDict # Python 3.11+
import generate_binary_build_matrix # type: ignore[import]
import jinja2
import jinja2
Arch = Literal["windows", "linux", "macos"]

View File

@ -16,7 +16,6 @@ from typing import Dict, List
import generate_binary_build_matrix
DOCKER_IMAGE_TYPES = ["runtime", "devel"]

View File

@ -4,11 +4,11 @@ import argparse
import os
import re
import subprocess
from datetime import datetime
from distutils.util import strtobool
from pathlib import Path
LEADING_V_PATTERN = re.compile("^v")
TRAILING_RC_PATTERN = re.compile("-rc[0-9]*$")
LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")

View File

@ -11,6 +11,7 @@ import sys
import time
import urllib
import urllib.parse
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.request import Request, urlopen

View File

@ -3,6 +3,7 @@
import json
import os
import warnings
from dataclasses import dataclass
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from urllib.error import HTTPError

View File

@ -19,7 +19,6 @@ from typing import (
Union,
)
T = TypeVar("T")
RE_GITHUB_URL_MATCH = re.compile("^https://.*@?github.com/(.+)/(.+)$")

View File

@ -1,12 +1,12 @@
"""GitHub Label Utilities."""
import json
from functools import lru_cache
from typing import Any, List, Tuple, TYPE_CHECKING, Union
from github_utils import gh_fetch_url_and_headers, GitHubComment
# TODO: this is a temp workaround to avoid circular dependencies,
# and should be removed once GitHubPR is refactored out of trymerge script.
if TYPE_CHECKING:

View File

@ -9,7 +9,6 @@ from pytest_caching_utils import (
upload_pytest_cache,
)
TEMP_DIR = "./tmp" # a backup location in case one isn't provided

View File

@ -14,7 +14,6 @@ from file_io_utils import (
zip_folder,
)
PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
PYTEST_CACHE_DIR_NAME = ".pytest_cache"
BUCKET = "gha-artifacts"

View File

@ -2,7 +2,7 @@
set -eoux pipefail
SYNC_BRANCH=pytorch-stable-prototype
SYNC_BRANCH=fbcode/pytorch-stable-prototype
git config user.email "fake@example.com"
git config user.name "PyTorch Stable Bot"
@ -11,9 +11,7 @@ git fetch origin main
git fetch origin "$SYNC_BRANCH"
git checkout "$SYNC_BRANCH"
# Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo.
# This specific SHA was chosen as it was before the "branch point" of the stable branch
for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
for SHA in $(git log 4333e122d4b74cdf84351ed2907045c6a767b4cd..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
do
# `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
@ -22,12 +20,7 @@ do
continue
fi
echo "Copying $SHA"
git cherry-pick -x "$SHA" -X theirs
git reset --soft HEAD~1
git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed
git checkout .
git commit --reuse-message=HEAD@{1}
git clean -f
git cherry-pick -x "$SHA"
done
if [[ "${WITH_PUSH}" == true ]]; then

View File

@ -41,7 +41,7 @@ def main() -> None:
)
options = parser.parse_args()
tagged_images: Dict[str, bool] = {}
tagged_images: Dict[str, bool] = dict()
platform_images = [
generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,

View File

@ -7,7 +7,6 @@ cd llm-target-determinator
pip install -q -r requirements.txt
cd ../codellama
pip install -e .
pip install numpy==1.26.0
# Run indexer
cd ../llm-target-determinator

View File

@ -17,7 +17,9 @@ from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
from github_utils import gh_graphql
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import (
categorize_checks,
DRCI_CHECKRUN_NAME,
@ -37,7 +39,6 @@ from trymerge import (
validate_revert,
)
if "GIT_REMOTE_URL" not in os.environ:
os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"

View File

@ -45,6 +45,7 @@ from github_utils import (
gh_update_pr_state,
GitHubComment,
)
from gitutils import (
are_ghstack_branches_in_sync,
get_git_remote_name,
@ -61,7 +62,6 @@ from label_utils import (
)
from trymerge_explainer import get_revert_message, TryMergeExplainer
# labels
MERGE_IN_PROGRESS_LABEL = "merging"
MERGE_COMPLETE_LABEL = "merged"

View File

@ -11,7 +11,6 @@ from github_utils import gh_post_pr_comment as gh_post_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import GitHubPR
SAME_SHA_ERROR = (
"\n```\nAborting rebase because rebasing the branch resulted in the same sha as the target branch.\n"
+ "This usually happens because the PR has already been merged. Please rebase locally and push.\n```"

View File

@ -81,7 +81,7 @@ jobs:
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: !{{ config["build_name"] }}-build
{%- if config["gpu_arch_type"] not in ["rocm", "xpu"] %}
{%- if config["gpu_arch_type"] != "rocm" %}
uses: ./.github/workflows/_binary-test-linux.yml
with:!{{ upload.binary_env_as_input(config) }}
build_name: !{{ config["build_name"] }}
@ -101,40 +101,6 @@ jobs:
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
{%- elif config["gpu_arch_type"] == "xpu" %}
runs-on: linux.idc.xpu
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: !{{ config["container_image"] }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
{%- else %}
runs-on: linux.rocm.gpu
timeout-minutes: !{{ common.timeout_minutes }}

.github/workflows/_linux-build-rg.yml (vendored, new file, 105 lines)
View File

@ -0,0 +1,105 @@
name: linux-build-rg
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
docker-image-name:
required: true
type: string
description: Name of the base docker image to build with.
build-generates-artifacts:
required: false
type: boolean
default: true
description: If set, upload generated build artifacts.
build-with-debug:
required: false
type: boolean
default: false
description: If set, build in debug mode.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
cuda-arch-list:
required: false
type: string
default: "5.2"
description: |
List of CUDA architectures CI build should target.
runner-group:
required: false
type: string
default: "arc-lf-linux.2xlarge"
description: Runner group to select group type
test-matrix:
required: false
type: string
description: |
An option JSON description of what test configs to run later on. This
is moved here from the Linux test workflow so that we can apply filter
logic using test-config labels earlier and skip unnecessary builds
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
outputs:
docker-image:
value: ${{ jobs.build.outputs.docker-image }}
description: The docker image containing the built PyTorch.
test-matrix:
value: ${{ jobs.build.outputs.test-matrix }}
description: An optional JSON description of what test configs to run later on.
jobs:
build:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on:
group: ${{ inputs.runner-group }}
timeout-minutes: 240
outputs:
docker-image: ${{ steps.linux-build.outputs.docker-image }}
test-matrix: ${{ steps.linux-build.outputs.test-matrix }}
steps:
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Build
id: linux-build
uses: ./.github/actions/linux-build
with:
build-environment: ${{ inputs.build-environment }}
docker-image-name: ${{ inputs.docker-image-name }}
build-generates-artifacts: ${{ inputs.build-generates-artifacts }}
build-with-debug: ${{ inputs.build-with-debug }}
sync-tag: ${{ inputs.sync-tag }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
test-matrix: ${{ inputs.test-matrix }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

.github/workflows/_linux-test-label.yml (vendored, new file, 85 lines)
View File

@ -0,0 +1,85 @@
name: linux-test-rg
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
timeout-minutes:
required: false
type: number
default: 240
description: |
Set the maximum (in minutes) how long the workflow should take to finish
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos or empty test matrix
if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Test
id: linux-test
uses: ./.github/actions/linux-test
with:
build-environment: ${{ inputs.build-environment }}
test-matrix: ${{ inputs.test-matrix }}
docker-image: ${{ inputs.docker-image }}
sync-tag: ${{ inputs.sync-tag }}
use-gha: ${{ inputs.use-gha }}
dashboard-tag: ${{ inputs.dashboard-tag }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/_linux-test-rg.yml (vendored, new file, 86 lines)
View File

@ -0,0 +1,86 @@
name: linux-test-label
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
timeout-minutes:
required: false
type: number
default: 240
description: |
Set the maximum (in minutes) how long the workflow should take to finish
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos or empty test matrix
if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on:
group: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Test
id: linux-test
uses: ./.github/actions/linux-test
with:
build-environment: ${{ inputs.build-environment }}
test-matrix: ${{ inputs.test-matrix }}
docker-image: ${{ inputs.docker-image }}
sync-tag: ${{ inputs.sync-tag }}
use-gha: ${{ inputs.use-gha }}
dashboard-tag: ${{ inputs.dashboard-tag }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
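Both new test workflows above guard the job with `toJSON(fromJSON(inputs.test-matrix).include) != '[]'` so that an empty test matrix short-circuits the run. A Python analogue of that check, just to show the intent (the function and its arguments are made up for illustration):

```python
import json

def should_run(repository_owner: str, test_matrix: str) -> bool:
    # Skip forks, and skip when the matrix carries no `include` entries.
    include = json.loads(test_matrix).get("include", [])
    return repository_owner == "pytorch" and len(include) > 0

print(should_run("pytorch", '{"include": []}'))                       # False
print(should_run("pytorch", '{"include": [{"config": "default"}]}'))  # True
```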

View File

@ -54,7 +54,6 @@ jobs:
# Hardcoding below is temporary for testing ALI runners
# This file below should match the script found in .github/scripts/runner_determinator.py
- name: Hardcode runner-determinator script
id: hardcode-script
run: |
cat <<EOF > runner_determinator.py
# flake8: noqa: G004

View File

@ -751,118 +751,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_8-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_8-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_8-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_8-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1577,118 +1465,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_9-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_9-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_9-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2068,71 +1844,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_10-cuda12_1-full-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2468,118 +2179,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_10-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_10-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_10-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -3294,118 +2893,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_11-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_11-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -4120,118 +3607,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_12-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -39,10 +39,10 @@ jobs:
update-vision-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-vision-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: vision
branch: main
@ -54,10 +54,10 @@ jobs:
update-audio-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-audio-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: audio
branch: main
@ -69,10 +69,10 @@ jobs:
update-executorch-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-executorch-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: executorch
branch: main

View File

@ -1,42 +0,0 @@
name: runner-determinator
on:
workflow_dispatch:
pull_request:
branches: [main]
paths:
- .github/workflows/_runner-determinator.yaml
- .github/workflows/_runner_determinator_script_sync.yaml
- .github/workflows/scripts/runner_determinator.py
jobs:
python-script-sync-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: |
.github
- name: Extract the script from runner_determinator
run: |
# Runner determinator files
RUNNER_DETERMINATOR_WORKFLOW_FILE=.github/workflows/_runner-determinator.yml
RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE=.github/scripts/runner_determinator.py
# Parse the job file, extract the script and run it, up to the final EOF,
# to generate the python file in the local folder
yq '.jobs.runner-determinator.steps[] | select(.id == "hardcode-script") | .run' \
"${RUNNER_DETERMINATOR_WORKFLOW_FILE}" | sed '/^EOF$/q' | bash
set +e
DIFF="$(diff "$(basename ${RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE})" ${RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE})"
IS_DIFF=$?
set -e
if [ $IS_DIFF -eq 0 ]; then
echo "Scripts are in sync! ^_^";
else
echo -e "Scripts are *NOT* in sync:\n ${DIFF}";
exit 1
fi

View File

@ -203,14 +203,25 @@ jobs:
cuda-version: cpu
test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
win-vs2019-cuda12_1-py3-build:
name: win-vs2019-cuda12.1-py3
win-vs2019-cuda11_8-py3-build:
name: win-vs2019-cuda11.8-py3
uses: ./.github/workflows/_win-build.yml
needs: get-label-type
with:
build-environment: win-vs2019-cuda12.1-py3
cuda-version: "12.1"
build-environment: win-vs2019-cuda11.8-py3
cuda-version: "11.8"
sync-tag: win-cuda-build
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "force_on_cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
]}
linux-focal-rocm6_1-py3_8-build:
name: linux-focal-rocm6.1-py3.8

View File

@ -38,7 +38,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8-comprehensions==3.12.0',
'flake8-executable==2.1.3',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
@ -1531,6 +1531,10 @@ exclude_patterns = [
'torch/signal/__init__.py',
'torch/signal/windows/__init__.py',
'torch/signal/windows/windows.py',
'torch/sparse/__init__.py',
'torch/sparse/_semi_structured_conversions.py',
'torch/sparse/_triton_ops.py',
'torch/sparse/semi_structured.py',
'torch/special/__init__.py',
'torch/testing/_internal/__init__.py',
'torch/testing/_internal/autocast_test_lists.py',
@ -1779,7 +1783,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.5.2',
'ruff==0.5.0',
]
is_formatter = true

View File

@ -208,6 +208,7 @@ endif()
include(CMakeDependentOption)
option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF
"Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
@ -749,6 +750,7 @@ if(NOT TORCH_BUILD_VERSION)
CACHE STRING "Torch build version" FORCE)
endif()
caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION})
set(TORCH_SOVERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}")
# ---[ CMake scripts + modules
@ -1221,6 +1223,45 @@ endif()
add_subdirectory(c10)
add_subdirectory(caffe2)
# --[ Documentation
if(BUILD_DOCS)
# check if Doxygen is installed
find_package(Doxygen)
if(DOXYGEN_FOUND)
message("Generating documentation")
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
endif()
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
add_custom_target(
doc_doxygen_c ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating C++ API documentation with Doxygen"
VERBATIM)
add_custom_target(
doc_doxygen_python ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating Python API documentation with Doxygen"
VERBATIM)
else()
message(
FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
endif()
endif()
# ---[ CMake related files Uninistall option.
if(NOT TARGET caffe2_uninstall)
configure_file(

View File

@ -156,12 +156,12 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/csrc/jit/python/init.cpp @mikaylagawarecki
# CUDA and CUDA math libraries
aten/src/ATen/cuda/ @eqy @syed-ahmed
aten/src/ATen/cudnn/ @eqy @syed-ahmed
aten/src/ATen/native/cuda/ @eqy @syed-ahmed
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
c10/cuda @eqy @syed-ahmed
torch/cuda/ @eqy @syed-ahmed
torch/csrc/cuda/ @eqy @syed-ahmed
torch/backends/cuda/ @eqy @syed-ahmed
torch/backends/cudnn/ @eqy @syed-ahmed
aten/src/ATen/cuda/ @eqy
aten/src/ATen/cudnn/ @eqy
aten/src/ATen/native/cuda/ @eqy
aten/src/ATen/native/cudnn/ @eqy
c10/cuda @eqy
torch/cuda/ @eqy
torch/csrc/cuda/ @eqy
torch/backends/cuda/ @eqy
torch/backends/cudnn/ @eqy

View File

@ -3,7 +3,6 @@ from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor
OUTPUT_DIR = "src/androidTest/assets/"

View File

@ -119,7 +119,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
}
deviceType_ = deviceJniCodeToDeviceType(device);
module_ = torch::jit::load(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
if (has_extra) {
static auto putMethod =
facebook::jni::JMap<facebook::jni::JString, facebook::jni::JString>::

View File

@ -84,9 +84,9 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
}
deviceType_ = deviceJniCodeToDeviceType(device);
module_ = torch::jit::_load_for_mobile(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
torch::jit::_load_extra_only_for_mobile(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
if (has_extra) {
static auto putMethod =
facebook::jni::JMap<facebook::jni::JString, facebook::jni::JString>::

View File

@ -2,7 +2,6 @@ from torchvision import models
import torch
print(torch.version.__version__)
resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

View File

@ -9,7 +9,6 @@ from torchvision import models
import torch
# Download and trace the model.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
model.eval()

View File

@ -82,7 +82,6 @@ using acc_type = typename AccumulateType<T, is_cuda>::type;
using type = acc_t; \
};
#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS)
#define XPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::XPU)
#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA)
#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU)
@ -105,25 +104,6 @@ MPS_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
MPS_ACC_TYPE(c10::complex<float>, c10::complex<float>);
MPS_ACC_TYPE(c10::complex<double>, c10::complex<float>);
XPU_ACC_TYPE(BFloat16, float);
XPU_ACC_TYPE(Half, float);
XPU_ACC_TYPE(Float8_e5m2, float);
XPU_ACC_TYPE(Float8_e4m3fn, float);
XPU_ACC_TYPE(Float8_e5m2fnuz, float);
XPU_ACC_TYPE(Float8_e4m3fnuz, float);
XPU_ACC_TYPE(float, float);
XPU_ACC_TYPE(double, double);
XPU_ACC_TYPE(int8_t, int64_t);
XPU_ACC_TYPE(uint8_t, int64_t);
XPU_ACC_TYPE(char, int64_t);
XPU_ACC_TYPE(int16_t, int64_t);
XPU_ACC_TYPE(int32_t, int64_t);
XPU_ACC_TYPE(int64_t, int64_t);
XPU_ACC_TYPE(bool, bool);
XPU_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
XPU_ACC_TYPE(c10::complex<float>, c10::complex<float>);
XPU_ACC_TYPE(c10::complex<double>, c10::complex<double>);
#if defined(__CUDACC__) || defined(__HIPCC__)
CUDA_ACC_TYPE(half, float);
#endif
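
(Editor's note, not part of the diff: the ACC_TYPE / XPU_ACC_TYPE entries above build a per-device table mapping a storage type to a wider accumulation type so reductions keep precision. A minimal self-contained sketch of the same trait pattern follows; the names AccumulateOf and acc_of_t are made up for illustration and are not ATen's.)

#include <cstdint>
#include <type_traits>

template <typename T> struct AccumulateOf { using type = T; };        // default: accumulate in the same type
template <> struct AccumulateOf<int8_t>   { using type = int64_t; };  // narrow ints widen to int64_t
template <> struct AccumulateOf<uint8_t>  { using type = int64_t; };

template <typename T>
using acc_of_t = typename AccumulateOf<T>::type;

static_assert(std::is_same_v<acc_of_t<uint8_t>, int64_t>,
              "narrow integer types accumulate in int64_t");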

View File

@ -283,7 +283,7 @@ at::BlasBackend Context::blasPreferredBackend() {
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
for (auto index: c10::irange(getNumGPUs())) {
for (auto index = 0; index < at::getNumGPUs(); index++) {
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
TORCH_WARN_ONCE(
"Attempting to use hipBLASLt on an unsupported architecture! "

View File

@ -73,8 +73,6 @@ class TORCH_API Context {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else if (device_type == at::kHIP) {
return at::detail::getHIPHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@ -96,22 +94,8 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
bool isPinnedPtr(
const void* data,
std::optional<DeviceType> device_type = std::nullopt) {
auto opt_device_type =
device_type.has_value() ? device_type.value() : at::getAccelerator();
if (!opt_device_type.has_value() || // there is no accelerator
!at::isAccelerator(
opt_device_type.value())) { // passed device not an accelerator
return false;
}
return getAcceleratorHooksInterface(opt_device_type.value())
.isPinnedPtr(data);
}
Allocator* getPinnedMemoryAllocator(
std::optional<DeviceType> device_type = std::nullopt) {
return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
static bool isPinnedPtr(const void* data) {
return detail::getCUDAHooks().isPinnedPtr(data);
}
static bool hasOpenMP();
static bool hasMKL();
@ -432,73 +416,73 @@ class TORCH_API Context {
TORCH_API Context& globalContext();
inline void init() {
static inline void init() {
globalContext();
}
TORCH_API Allocator* getCPUAllocator();
inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
Backend p,
ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
p, s);
}
inline DeprecatedTypeProperties& CPU(ScalarType s) {
static inline DeprecatedTypeProperties& CPU(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::CPU, s);
}
inline DeprecatedTypeProperties& CUDA(ScalarType s) {
static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::CUDA, s);
}
inline DeprecatedTypeProperties& HIP(ScalarType s) {
static inline DeprecatedTypeProperties& HIP(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::HIP, s);
}
inline DeprecatedTypeProperties& MPS(ScalarType s) {
static inline DeprecatedTypeProperties& MPS(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::MPS, s);
}
inline bool hasCUDA() {
static inline bool hasCUDA() {
return globalContext().hasCUDA();
}
inline bool hasMTIA() {
static inline bool hasMTIA() {
return globalContext().hasMTIA();
}
inline bool hasHIP() {
static inline bool hasHIP() {
return globalContext().hasHIP();
}
inline bool hasIPU() {
static inline bool hasIPU() {
return globalContext().hasIPU();
}
inline bool hasXLA() {
static inline bool hasXLA() {
return globalContext().hasXLA();
}
inline bool hasMPS() {
static inline bool hasMPS() {
return globalContext().hasMPS();
}
inline bool hasMAIA() {
static inline bool hasMAIA() {
return globalContext().hasMAIA();
}
inline bool hasXPU() {
static inline bool hasXPU() {
return globalContext().hasXPU();
}
// Despite its name, this function returns the number of *CUDA* GPUs.
inline size_t getNumGPUs() {
static inline size_t getNumGPUs() {
// WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
// FUNCTION. If you are interested in interrogating the number of
// devices for a specific device type, add that function to the
@ -517,27 +501,27 @@ inline size_t getNumGPUs() {
}
}
inline bool hasOpenMP() {
static inline bool hasOpenMP() {
return globalContext().hasOpenMP();
}
inline bool hasMKL() {
static inline bool hasMKL() {
return globalContext().hasMKL();
}
inline bool hasLAPACK() {
static inline bool hasLAPACK() {
return globalContext().hasLAPACK();
}
inline bool hasMAGMA() {
static inline bool hasMAGMA() {
return globalContext().hasMAGMA();
}
inline bool hasMKLDNN() {
static inline bool hasMKLDNN() {
return globalContext().hasMKLDNN();
}
inline void manual_seed(uint64_t seed) {
static inline void manual_seed(uint64_t seed) {
auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);
{
// See Note [Acquire lock when using random generators]

View File

@ -2,7 +2,7 @@
#include <ATen/DeviceAccelerator.h>
namespace at {
std::optional<DeviceType> getAccelerator(bool checked) {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
if (at::has##device_name()) { \
device_type = k##device_name; \
@ -25,8 +25,6 @@ std::optional<DeviceType> getAccelerator(bool checked) {
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
DETECT_AND_ASSIGN_ACCELERATOR(HIP)
DETECT_AND_ASSIGN_ACCELERATOR(MPS)
if (checked) {
TORCH_CHECK(
device_type, "Cannot access accelerator device when none is available.")
@ -36,18 +34,4 @@ std::optional<DeviceType> getAccelerator(bool checked) {
#undef DETECT_AND_ASSIGN_ACCELERATOR
}
bool isAccelerator(c10::DeviceType d) {
switch (d) {
case at::kCUDA:
case at::kMTIA:
case at::kXPU:
case at::kHIP:
case at::kMPS:
case at::kPrivateUse1:
return true;
default:
return false;
}
}
} // namespace at
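
(Editor's note, not part of the diff: DETECT_AND_ASSIGN_ACCELERATOR above expands one "if this backend is available, pick it" branch per device and checks that only one accelerator is present. A condensed stand-alone version with fake availability probes follows; hasCUDA/hasXPU/hasMPS here are stubs, not the real runtime checks.)

#include <cassert>
#include <optional>

enum class DeviceType { CUDA, XPU, MPS };

bool hasCUDA() { return true;  }   // stubbed probes for the sketch
bool hasXPU()  { return false; }
bool hasMPS()  { return false; }

std::optional<DeviceType> getAcceleratorSketch() {
  std::optional<DeviceType> device;
#define DETECT(name)                                     \
  if (has##name()) {                                     \
    assert(!device && "only one accelerator may be up"); \
    device = DeviceType::name;                           \
  }
  DETECT(CUDA)
  DETECT(XPU)
  DETECT(MPS)
#undef DETECT
  return device;
}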

View File

@ -13,7 +13,9 @@
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
// CUDA, MTIA, XPU, PrivateUse1
// We want to add the following once all the proper APIs are supported and tested:
// HIP, MPS
namespace at {
@ -22,6 +24,4 @@ namespace at {
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
TORCH_API bool isAccelerator(c10::DeviceType d);
} // namespace at

View File

@ -499,7 +499,7 @@ inline Tensor sum_to(
return _sum_to(std::move(tensor), shape, always_return_non_view);
}
inline bool is_expandable_to(
static inline bool is_expandable_to(
SymIntArrayRef shape,
c10::SymIntArrayRef desired) {
size_t ndim = shape.size();
@ -517,7 +517,7 @@ inline bool is_expandable_to(
return true;
}
inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
auto sym_shape = c10::SymIntArrayRef(
reinterpret_cast<const c10::SymInt*>(shape.data()), shape.size());
auto sym_desired = c10::SymIntArrayRef(

View File

@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
return Tensor();
}
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const std::optional<Tensor>& min_seqlen, const std::optional<Tensor>& max_seqlen) {
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
auto values = at::_nested_get_values(mutated_view);
if (inverse_return_mode != InverseReturnMode::NeverView) {
return values;
@ -321,8 +321,8 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
auto max_seqlen = at::_nested_get_max_seqlen(base);
auto nt = at::_nested_view_from_jagged(
mutated_view, offsets, dummy, lengths, ragged_idx,
(min_seqlen.defined() ? std::optional<Tensor>(min_seqlen) : std::nullopt),
(max_seqlen.defined() ? std::optional<Tensor>(max_seqlen) : std::nullopt));
(min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : std::nullopt),
(max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : std::nullopt));
if (inverse_return_mode != InverseReturnMode::NeverView) {
return nt;

View File

@ -62,7 +62,7 @@ static bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
return dim == 0 || dim == -1;
}
Tensor sum_batching_rule(const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, std::optional<ScalarType> dtype) {
Tensor sum_batching_rule(const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, optional<ScalarType> dtype) {
if (opt_dims.has_value()) {
auto dims = opt_dims.value();
// PyTorch has a special case where sum(scalar_tensor, dim=0) does not fail
@ -198,7 +198,7 @@ std::vector<Tensor> chunk_batching_rule(const Tensor& self, int64_t chunks, int6
return result;
}
Tensor clamp_batching_rule(const Tensor& self, const std::optional<Scalar>& min, const std::optional<Scalar>& max) {
Tensor clamp_batching_rule(const Tensor& self, const optional<Scalar>& min, const optional<Scalar>& max) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
auto result = at::clamp(self_physical.tensor(), min, max);
return self_physical.getPhysicalToLogicalMap().apply(result);
@ -508,11 +508,11 @@ static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t
// given (sizes, strides, storage_offset) returns the maximum location that
// can be indexed (or nullopt if such a location doesn't exist, e.g., tensors
// with zero-size dims).
static std::optional<int64_t> maximum_indexable_location(
static optional<int64_t> maximum_indexable_location(
IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) {
auto result = native::storage_size_for(sizes, strides);
if (result == 0) {
return std::nullopt;
return nullopt;
}
return result + storage_offset;
}
@ -526,7 +526,7 @@ static void checkBasicAsStridedValidForSlice(
int64_t num_batch_dims,
IntArrayRef sizes,
IntArrayRef strides,
std::optional<int64_t> maybe_storage_offset) {
optional<int64_t> maybe_storage_offset) {
auto slice_sizes = physical_tensor.sizes().slice(num_batch_dims);
auto slice_strides = physical_tensor.strides().slice(num_batch_dims);
auto base_offset = physical_tensor.storage_offset();
@ -614,7 +614,7 @@ Tensor as_strided_batching_rule(
const Tensor& tensor,
IntArrayRef sizes,
IntArrayRef strides,
std::optional<int64_t> storage_offset) {
optional<int64_t> storage_offset) {
auto physical_view = at::MultiBatchVmapTransform::logicalToPhysical(tensor);
auto num_batch_dims = physical_view.numBatchDims();
auto physical_sizes = physical_view.getPhysicalShape(sizes);
@ -763,7 +763,7 @@ Tensor pow_scalar_Tensor_batching_rule(const Scalar& other, const Tensor& self)
return makeBatched(output_physical, BatchDims(old_bdims.begin(), old_bdims.end()));
}
Tensor clone_batching_rule(const Tensor& self, std::optional<MemoryFormat> memory_format) {
Tensor clone_batching_rule(const Tensor& self, optional<MemoryFormat> memory_format) {
// Memory format support is a little tricky because vmap is allowed to move
// around batch dimensions and some memory formats are rank-dependent.
// Another weird case is:
@ -958,12 +958,12 @@ Tensor stack_batching_rule(TensorList tensors, int64_t dim) {
// unwrap_and_call<..., at::to> because at::to takes TensorOptions& (!!)
Tensor to_dtype_layout_batching_rule(
const Tensor& self,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory,
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory,
bool non_blocking, bool copy,
std::optional<MemoryFormat> memory_format) {
optional<MemoryFormat> memory_format) {
auto options = TensorOptions()
.dtype(dtype)
.layout(layout)
@ -978,10 +978,10 @@ Tensor to_dtype_layout_batching_rule(
Tensor new_zeros_batching_rule(
const Tensor& self,
IntArrayRef size,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory) {
auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self);
auto physical_size = physical_view.getPhysicalShape(size);
auto options = TensorOptions()
@ -1010,10 +1010,10 @@ Tensor new_empty_strided_batching_rule(
const Tensor& self,
IntArrayRef size,
IntArrayRef stride,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory) {
auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self);
auto physical_size = physical_view.getPhysicalShape(size);
@ -1181,9 +1181,9 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
m.impl(name, unwrap_and_call_method< \
to_type, &Tensor::to, __VA_ARGS__>);\
}
TO_BATCHING_RULE("to.device", Device, ScalarType, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.dtype", ScalarType, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.other", const Tensor&, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.device", Device, ScalarType, bool, bool, optional<MemoryFormat>)
TO_BATCHING_RULE("to.dtype", ScalarType, bool, bool, optional<MemoryFormat>)
TO_BATCHING_RULE("to.other", const Tensor&, bool, bool, optional<MemoryFormat>)
m.impl("to.dtype_layout", to_dtype_layout_batching_rule);
#undef TO_BATCHING_RULE
m.impl("clone", clone_batching_rule);

View File

@ -33,15 +33,15 @@ namespace at {
_(==, x.eq(y), y.eq(x)) \
_(!=, x.ne(y), y.ne(x))
#define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
inline Tensor operator op(const Tensor& x, const Tensor& y) { \
return body; \
} \
inline Tensor operator op(const Tensor& x, const Scalar& y) { \
return body; \
} \
inline Tensor operator op(const Scalar& x, const Tensor& y) { \
return reverse_scalar_body; \
#define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
static inline Tensor operator op(const Tensor& x, const Tensor& y) { \
return body; \
} \
static inline Tensor operator op(const Tensor& x, const Scalar& y) { \
return body; \
} \
static inline Tensor operator op(const Scalar& x, const Tensor& y) { \
return reverse_scalar_body; \
}
AT_FORALL_BINARY_OPS(DEFINE_OPERATOR)
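
(Editor's note, not part of the diff: the hunk only changes "static inline" to "inline", but the surrounding code is a classic X-macro, one list of operators plus one macro that stamps out the overloads. A minimal illustration with made-up types follows.)

#include <iostream>

struct Num { double v; };

#define FORALL_BINARY_OPS(_) \
  _(+, x.v + y.v)            \
  _(*, x.v * y.v)

#define DEFINE_OP(op, body) \
  inline Num operator op(const Num& x, const Num& y) { return Num{body}; }

FORALL_BINARY_OPS(DEFINE_OP)
#undef DEFINE_OP

int main() {
  Num a{2}, b{3};
  std::cout << (a + b).v << " " << (a * b).v << "\n";  // prints: 5 6
  return 0;
}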

View File

@ -65,9 +65,7 @@ inline bool areAnyOptionalTensorSubclassLike(
if (c10::impl::dispatch_mode_enabled())
return true;
return std::any_of(
tensors.begin(),
tensors.end(),
[](const std::optional<Tensor>& opt_tensor) {
tensors.begin(), tensors.end(), [](const optional<Tensor>& opt_tensor) {
return (
opt_tensor.has_value() && isTensorSubclassLike(opt_tensor.value()));
});

View File

@ -113,12 +113,12 @@
namespace at::tracer::impl {
inline bool is_dispatch_enabled() {
static inline bool is_dispatch_enabled() {
return c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Tracer) &&
!c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer);
}
inline void set_dispatch_enabled(bool enabled) {
static inline void set_dispatch_enabled(bool enabled) {
TORCH_INTERNAL_ASSERT(
!c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer),
"Cannot enable tracing within the scope of NoTracerDispatchMode!");

View File

@ -29,7 +29,7 @@ TORCH_API int _crash_if_asan(int);
// Converts a TensorList (i.e. ArrayRef<Tensor>) to a vector of TensorImpl*
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
// Once cat is ported entirely to ATen this can be deleted!
inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
static inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
ArrayRef<Tensor> tensors,
const char* name,
int pos,

View File

@ -42,70 +42,70 @@ TORCH_LIBRARY_IMPL(aten, VmapMode, m) {
#define TENSOROPTIONS std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>
// random operations (out-of-place)
m.impl("bernoulli", unsupportedRandomOp<const Tensor&, std::optional<Generator>>);
m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, std::optional<Generator>>);
m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, std::optional<Generator>>);
m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("bernoulli", unsupportedRandomOp<const Tensor&, optional<Generator>>);
m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, optional<Generator>, Tensor&>);
m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, optional<Generator>>);
m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, std::optional<Generator>>);
m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, std::optional<Generator>, Tensor&>);
m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, optional<Generator>>);
m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, optional<Generator>, Tensor&>);
m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, std::optional<Generator>>);
m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, std::optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, std::optional<Generator>>);
m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, std::optional<Generator>>);
m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, optional<Generator>>);
m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, optional<Generator>>);
m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, optional<Generator>, Tensor&>);
m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("poisson", unsupportedRandomOp<const Tensor&, std::optional<Generator>>);
m.impl("poisson", unsupportedRandomOp<const Tensor&, optional<Generator>>);
m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, std::optional<int64_t>, std::optional<Generator>>);
m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, std::optional<Generator>>);
m.impl("random_", unsupportedRandomOp_<Tensor&, std::optional<Generator>>);
m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, optional<int64_t>, optional<Generator>>);
m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, optional<Generator>>);
m.impl("random_", unsupportedRandomOp_<Tensor&, optional<Generator>>);
m.impl("rand_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randn_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("rand_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randn_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randint_like", unsupportedRandomOp<const Tensor&, int64_t, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randint_like.low_dtype", unsupportedRandomOp<const Tensor&, int64_t, int64_t, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randint_like", unsupportedRandomOp<const Tensor&, int64_t, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randint_like.low_dtype", unsupportedRandomOp<const Tensor&, int64_t, int64_t, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("rand", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("rand.names", unsupportedRandomOp<IntArrayRef, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("rand.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randn", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randn.names", unsupportedRandomOp<IntArrayRef, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randn.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randperm", unsupportedRandomOp<int64_t, TENSOROPTIONS>);
m.impl("randperm.generator", unsupportedRandomOp<int64_t, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randperm.generator", unsupportedRandomOp<int64_t, optional<Generator>, TENSOROPTIONS>);
m.impl("randperm.out", unsupportedRandomOp_<int64_t, Tensor&>);
m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, std::optional<Generator>, Tensor&>);
m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, optional<Generator>, Tensor&>);
m.impl("randint", unsupportedRandomOp<int64_t, IntArrayRef, TENSOROPTIONS>);
m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randint.low", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, TENSOROPTIONS>);
m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randint.out", unsupportedRandomOp_<int64_t, IntArrayRef, Tensor&>);
m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randint.low_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, Tensor&>);
m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
#undef TENSOROPTIONS
}
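
(Editor's note, not part of the diff: each registration above points at a templated "unsupported" kernel whose only job is to raise a clear error when a random op is reached under vmap. The general shape of such a kernel is sketched below with a plain exception standing in for TORCH_CHECK; the name and message are illustrative.)

#include <stdexcept>

template <typename... Args>
void unsupportedRandomOpSketch(Args&&...) {
  throw std::runtime_error(
      "vmap: random operations are not supported inside vmap");
}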

View File

@ -13,7 +13,7 @@ namespace at {
constexpr size_t dim_bitset_size = 64;
inline std::bitset<dim_bitset_size> dim_list_to_bitset(
static inline std::bitset<dim_bitset_size> dim_list_to_bitset(
OptionalIntArrayRef opt_dims,
size_t ndims) {
TORCH_CHECK(

View File

@ -296,7 +296,7 @@ TORCH_API Tensor cached_cast(
const Tensor& arg,
c10::DeviceType device_type = c10::DeviceType::CUDA);
// Overload to process std::optional<Tensor>
// Overload to process optional<Tensor>
inline std::optional<Tensor> cached_cast(
at::ScalarType to_type,
const std::optional<Tensor>& arg,

View File

@ -364,7 +364,7 @@ public:
bool is(const Dict& rhs) const;
// private API for now because the return type will change to TypePtr
// instead of std::optional<TypePtr> once types are mandatory.
// instead of optional<TypePtr> once types are mandatory.
TypePtr keyType() const;
TypePtr valueType() const;

View File

@ -18,7 +18,7 @@ TORCH_API std::ostream& print(
std::ostream& stream,
const Tensor& tensor,
int64_t linesize);
inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
return print(out,t,80);
}
TORCH_API void print(const Tensor & t, int64_t linesize=80);

View File

@ -159,7 +159,7 @@ class IListRefTagImpl<IListRefTag::Unboxed, at::OptionalTensorRef>
template <>
class IListRefTagImpl<IListRefTag::Boxed, at::OptionalTensorRef>
: public IListRefTagImplBase<IListRefTag::Boxed, at::OptionalTensorRef, std::optional<at::Tensor>> {
: public IListRefTagImplBase<IListRefTag::Boxed, at::OptionalTensorRef, optional<at::Tensor>> {
public:
/*

View File

@ -18,11 +18,11 @@ static std::vector<at::Tensor> get_tensor_vector() {
return tensors;
}
static std::vector<std::optional<at::Tensor>> get_boxed_opt_tensor_vector() {
std::vector<std::optional<at::Tensor>> optional_tensors;
static std::vector<optional<at::Tensor>> get_boxed_opt_tensor_vector() {
std::vector<optional<at::Tensor>> optional_tensors;
const size_t SIZE = 5;
for (size_t i = 0; i < SIZE * 2; i++) {
auto opt_tensor = (i % 2 == 0) ? std::optional<at::Tensor>(at::empty({0})) : nullopt;
auto opt_tensor = (i % 2 == 0) ? optional<at::Tensor>(at::empty({0})) : nullopt;
optional_tensors.emplace_back(opt_tensor);
}
return optional_tensors;
@ -234,7 +234,7 @@ TEST(ITensorListRefIteratorTest, Unboxed_Iterate) {
TEST(IOptTensorListRefTest, Boxed_Iterate) {
auto vec = get_boxed_opt_tensor_vector();
const List<std::optional<at::Tensor>> boxed(vec);
const List<optional<at::Tensor>> boxed(vec);
at::IOptTensorListRef list(boxed);
size_t i = 0;
for (const auto t : list) {

View File

@ -16,7 +16,7 @@ void NamesMode::set_enabled(bool enabled) {
c10::impl::tls_set_dispatch_key_excluded(DispatchKey::Named, !enabled);
}
const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::optional<DimnameList> names) {
const TensorBase& internal_set_names_inplace(const TensorBase& tensor, optional<DimnameList> names) {
impl::internal_set_names_inplace(tensor.unsafeGetTensorImpl(), names, /*validate_names=*/true);
return tensor;
}
@ -84,7 +84,7 @@ void check_names_valid_for(TensorImpl* impl, DimnameList names) {
check_names_valid_for(impl->dim(), names);
}
void internal_set_names_inplace(TensorImpl* impl, std::optional<DimnameList> names, bool validate_names) {
void internal_set_names_inplace(TensorImpl* impl, optional<DimnameList> names, bool validate_names) {
TORCH_CHECK(impl->layout() == Layout::Strided,
"NYI: named tensors only support strided layout");
TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_xpu() || impl->device().is_privateuseone(),
@ -130,7 +130,7 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names,
optional<DimnameList> get_opt_names(const TensorImpl* impl) {
const auto* meta = get_named_tensor_meta(impl);
if (meta == nullptr) {
return std::nullopt;
return nullopt;
} else {
return meta->names();
}

View File

@ -16,7 +16,7 @@ class TensorBase;
// actually exists outside of c10 and needs to be moved in.
// TensorImpl has a unique_ptr<NamedTensorMetaInterface> field.
// XXX: Ideally we would just put std::optional<vector<Dimname>> into TensorImpl.
// XXX: Ideally we would just put optional<vector<Dimname>> into TensorImpl.
//
// This class has an important invariant: there must be at least ONE
// non-wildcard

View File

@ -93,7 +93,7 @@ torch::jit::Stack boxArgs(Args... args) {
}
template <class T>
inline constexpr size_t boxed_size_one() {
static inline constexpr size_t boxed_size_one() {
static_assert(!std::is_same<std::decay_t<T>, c10::TensorOptions>::value, "need to patch this path to support TensorOptions passed by reference");
return 1;
}

View File

@ -393,9 +393,9 @@ namespace impl {
};
template<class T, bool AllowDeprecatedTypes>
struct ivalue_to_arg<optional<ArrayRef<T>>, AllowDeprecatedTypes> final {
// If an argument is std::optional<ArrayRef<T>>, convert the IValue to an std::optional<std::vector<T>> and pass that
// to the operator. OptionalArray<T> is basically a std::optional<std::vector<T>> but implicitly convertible
// to std::optional<ArrayRef<T>>.
// If an argument is optional<ArrayRef<T>>, convert the IValue to an optional<std::vector<T>> and pass that
// to the operator. OptionalArray<T> is basically a optional<std::vector<T>> but implicitly convertible
// to optional<ArrayRef<T>>.
static OptionalArray<T> call(IValue& v) {
return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);
}
@ -404,8 +404,8 @@ namespace impl {
template<class T, bool AllowDeprecatedTypes>
struct ivalue_to_arg<OptionalArrayRef<T>, AllowDeprecatedTypes> final {
// If an argument is OptionalArrayRef<T>, convert the IValue to an
// std::optional<std::vector<T>> and pass that to the operator. OptionalArray<T>
// is basically a std::optional<std::vector<T>> but implicitly convertible to
// optional<std::vector<T>> and pass that to the operator. OptionalArray<T>
// is basically a optional<std::vector<T>> but implicitly convertible to
// OptionalArrayRef<T>
static OptionalArray<T> call(IValue& v) {
return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);

View File

@ -325,7 +325,7 @@ struct TORCH_API FunctionSchema {
std::optional<AliasAnalysisKind> alias_kind_;
template <typename T>
void checkArg(const IValue& value, const Argument& argument, std::optional<size_t> pos) const;
void checkArg(const IValue& value, const Argument& argument, optional<size_t> pos) const;
void checkSchema() const {
bool seen_default_arg = false;

View File

@ -328,7 +328,7 @@ template<typename T>
inline void FunctionSchema::checkArg(
const IValue& value,
const Argument& argument,
std::optional<size_t> pos) const {
optional<size_t> pos) const {
if (value.isTensor() && argument.type() == TensorType::get()) {
// Fast-path for the common case
return;

View File

@ -87,7 +87,7 @@ struct StreamData3Holder : c10::intrusive_ptr_target {
} // namespace ivalue
// This is an owning wrapper for a std::optional<std::vector<T>>
// that can be implicitly converted to a (non-owning) std::optional<ArrayRef<T>>.
// that can be implicitly converted to a (non-owning) optional<ArrayRef<T>>.
// Its purpose is to be used in generated code to keep the vector alive
// either until the end of a statement (as a temporary), or as a saved arg
// in autograd.
@ -120,14 +120,14 @@ struct OptionalArray {
operator std::optional<c10::ArrayRef<T>>() {
if (!list) {
return std::nullopt;
return nullopt;
}
return *list;
}
operator c10::OptionalArrayRef<T>() {
if (!list) {
return std::nullopt;
return nullopt;
}
return *list;
}
@ -1021,9 +1021,9 @@ struct TORCH_API IValue final {
// ToOptional: convert a IValue to the Optional obj that accepts both T and
// None
template <typename T>
std::optional<T> toOptional();
optional<T> toOptional();
template <typename T>
std::optional<T> toOptional() const;
optional<T> toOptional() const;
/// @private [doxygen private]
/// this is a shallow comparison of two IValues to test the object identity

View File

@ -1375,7 +1375,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
// The device that was current when markCompleted was called, which we'll
// restore when invoking callbacks. It's optional because we'll only store it
// if the future completes successfully.
std::optional<c10::Device> currentDevice_;
optional<c10::Device> currentDevice_;
// The events that correspond to the completion of the async I/O kernels. They
// are recorded on the appropriate streams when the future is marked completed
@ -1748,7 +1748,7 @@ template <class T>
struct _fake_type {};
// generic_to<T> converts an IValue from a generic list or generic dict
// to a concrete list/dict type like List<T>, Dict<...> or std::optional<T>.
// to a concrete list/dict type like List<T>, Dict<...> or optional<T>.
// Note that in the case of lists, this only works for IValue-based lists,
// i.e. not for int64_t, double, ...
// generic_to<T> is an implementation detail of IValue::to<T> and not
@ -1949,7 +1949,7 @@ inline T IValue::to() && {
template <>
inline std::optional<c10::string_view> IValue::to() && {
// In the default implementation, the IValue is destroyed with std::move.
// But if the unboxed type is std::optional<string_view> we cannot destroy
// But if the unboxed type is optional<string_view> we cannot destroy
// the IValue.
return generic_to(*this, _fake_type<std::optional<c10::string_view>>{});
}
@ -2366,7 +2366,7 @@ inline std::optional<std::reference_wrapper<const std::string>> IValue::
if (isNone()) {
return std::nullopt;
}
AT_ASSERT(isString(), "Expected std::optional<string> but got ", tagKind());
AT_ASSERT(isString(), "Expected optional<string> but got ", tagKind());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
"called toOptionalStringRef on null intrusive_ptr IValue");
@ -2390,17 +2390,17 @@ inline PyObject* IValue::toPyObject() const {
}
template <typename T>
inline std::optional<T> IValue::toOptional() {
inline optional<T> IValue::toOptional() {
if (this->isNone()) {
return std::nullopt;
return nullopt;
}
return this->to<T>();
}
template <typename T>
inline std::optional<T> IValue::toOptional() const {
inline optional<T> IValue::toOptional() const {
if (this->isNone()) {
return std::nullopt;
return nullopt;
}
return this->to<T>();
}

View File

@ -2043,7 +2043,7 @@ template <class T, bool fake>
struct getMaybeFakeTypePtr_<std::optional<T>, fake> final {
static const auto& call() {
static auto inner_type = getMaybeFakeTypePtr_<T, fake>::call();
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto type = OptionalType::get(inner_type);
return type;
@ -2055,7 +2055,7 @@ template<>
struct getTypePtr_<at::OptionalIntArrayRef> final {
static const auto& call() {
static auto inner_type = getMaybeFakeTypePtr_<IntArrayRef, false>::call();
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto type = OptionalType::get(inner_type);
return type;
@ -2065,7 +2065,7 @@ struct getTypePtr_<at::OptionalIntArrayRef> final {
template <bool fake>
struct getMaybeFakeTypePtr_<at::OptionalSymIntArrayRef, fake> final {
static const auto& call() {
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto inner_type = getMaybeFakeTypePtr_<SymIntArrayRef, fake>::call();
static auto type = OptionalType::get(inner_type);

View File

@ -455,7 +455,7 @@ struct TORCH_API Type {
// this method.
std::string annotation_str(const TypePrinter& printer) const {
if (printer) {
// the printer can return std::nullopt to fall through to the default impl
// the printer can return nullopt to fall through to the default impl
if (auto renamed = printer(*this)) {
return *renamed;
}

View File

@ -9,11 +9,11 @@
* [Note: hacky wrapper removal for optional tensor]
*
* The kernel implementation takes an optional tensor marked in the schema as
* Tensor? but the C++ function takes Tensor instead of the std::optional<Tensor>
* Tensor? but the C++ function takes Tensor instead of the optional<Tensor>
* expected by the dispatcher.
*
* To remove the hacky wrapper, the C++ function is changed to take
* std::optional<Tensor> and unwrap the Tensor value at the beginning of
* optional<Tensor> and unwrap the Tensor value at the beginning of
* the function, e.g.:
* > c10::MaybeOwned<Tensor> weight_maybe_owned =
* > at::borrow_from_optional_tensor(weight_opt);
@ -62,7 +62,7 @@ inline void check_and_update_common_device(optional<Device>& common_device, cons
}
}
inline void check_and_update_common_device(optional<Device>& common_device, const std::optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
inline void check_and_update_common_device(optional<Device>& common_device, const optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
if (tensor.has_value()) {
check_and_update_common_device(common_device, tensor.value(), methodName, argName);
}
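
(Editor's note, not part of the diff: the note above describes unwrapping an optional tensor argument once, at the top of the kernel, so the body can work with a plain reference. The borrowing idea in miniature, with a made-up Buffer type instead of Tensor and no claim about at::borrow_from_optional_tensor's actual signature:)

#include <optional>
#include <vector>

using Buffer = std::vector<float>;

const Buffer& unwrap_or_empty(const std::optional<Buffer>& maybe) {
  static const Buffer empty;                  // shared fallback, never modified
  return maybe.has_value() ? *maybe : empty;  // no copy in either branch
}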

View File

@ -434,7 +434,7 @@ public:
std::optional<std::variant<OperatorName, FunctionSchema>> schemaOrName_;
std::vector<KernelRegistrationConfig> kernels;
std::optional<AliasAnalysisKind> aliasAnalysisKind_;
optional<AliasAnalysisKind> aliasAnalysisKind_;
friend class RegisterOperators;
friend class Library;
};

View File

@ -133,6 +133,32 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};
template <>
struct VecConvert<int32_t, 1, float, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<float, 1>& src) {
return Vectorized<int32_t>(_mm256_cvttps_epi32(src[0]));
}
};
template <>
struct VecConvert<float, 1, int32_t, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int32_t, 1>& src) {
return Vectorized<float>(_mm256_cvtepi32_ps(src[0]));
}
};
template <>
struct VecConvert<int16_t, 1, uint8_t, 1> {
static inline VectorizedN<int16_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src128 = _mm256_castsi256_si128(src[0]);
return Vectorized<int16_t>(_mm256_cvtepu8_epi16(src128));
}
};
template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,
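
(Editor's note, not part of the diff: the new VecConvert specializations above wrap single-instruction lane-wise conversions. Outside the Vectorized<> wrapper the same AVX2 intrinsics look like this; compile with -mavx2.)

#include <immintrin.h>
#include <cstdio>

int main() {
  float in[8] = {0.5f, 1.9f, -2.7f, 3.0f, 4.4f, -5.5f, 6.6f, 7.7f};
  __m256  f    = _mm256_loadu_ps(in);
  __m256i i    = _mm256_cvttps_epi32(f);   // float -> int32, truncating toward zero
  __m256  back = _mm256_cvtepi32_ps(i);    // int32 -> float

  int   out_i[8];
  float out_f[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_i), i);
  _mm256_storeu_ps(out_f, back);
  std::printf("%d %d %d %.1f\n", out_i[0], out_i[1], out_i[2], out_f[2]);  // prints: 0 1 -2 -2.0
  return 0;
}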

View File

@ -246,6 +246,12 @@ public:
return _mm256_floor_pd(values);
}
Vectorized<double> frac() const;
double reduce_add() const {
return values[0];
}
double reduce_max() const {
return values[0];
}
Vectorized<double> neg() const {
return _mm256_xor_pd(_mm256_set1_pd(-0.), values);
}

View File

@ -342,6 +342,12 @@ public:
}
return loadu(tmp);
}
float reduce_add() const {
return values[0];
}
float reduce_max() const {
return values[0];
}
Vectorized<float> neg() const {
return _mm256_xor_ps(_mm256_set1_ps(-0.f), values);
}

View File

@ -241,6 +241,12 @@ public:
Vectorized<int32_t> abs() const {
return _mm256_abs_epi32(values);
}
int32_t reduce_add() const {
return values[0];
}
int32_t reduce_max() const {
return values[0];
}
Vectorized<int32_t> real() const {
return *this;
}

View File

@ -11,6 +11,7 @@
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
#include <iostream>
namespace at {
namespace vec {
@ -43,6 +44,9 @@ static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
}
static inline __m256i cvtfp32_bf16(const __m512& src) {
// #if defined(CPU_CAPABILITY_AVX512_BF16)
// return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src));
// #else
__m512i value = _mm512_castps_si512(src);
__m512i nan = _mm512_set1_epi32(0xffff);
auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q);
@ -59,6 +63,7 @@ static inline __m256i cvtfp32_bf16(const __m512& src) {
// Check NaN before converting back to bf16
t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value);
return _mm512_cvtusepi32_epi16(t_value);
// #endif
}
static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) {
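
(Editor's note, not part of the diff: the vector code above rounds fp32 to bf16 with round-to-nearest-even by adding 0x7FFF plus the lowest kept bit and truncating to the top 16 bits, forcing NaN lanes to a NaN encoding. A scalar restatement of that logic, one value at a time:)

#include <cmath>
#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if (std::isnan(f)) {
    return 0xFFFF;                    // a quiet-NaN encoding; the vector path blends in 0xFFFF
  }
  uint32_t lsb = (bits >> 16) & 1;    // lowest bit that survives truncation
  bits += 0x7FFF + lsb;               // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}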

View File

@ -117,6 +117,49 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};
template <>
struct VecConvert<int32_t, 1, float, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<float, 1>& src) {
return Vectorized<int32_t>(_mm512_cvttps_epi32(src[0]));
}
};
template <>
struct VecConvert<float, 1, int32_t, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int32_t, 1>& src) {
return Vectorized<float>(_mm512_cvtepi32_ps(src[0]));
}
};
template <>
struct VecConvert<int16_t, 1, uint8_t, 1> {
static inline VectorizedN<int16_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src256 = _mm512_castsi512_si256(src[0]);
return Vectorized<int16_t>(_mm512_cvtepu8_epi16(src256));
}
};
template <>
struct VecConvert<int8_t, 1, int32_t, 1> {
static inline VectorizedN<int8_t, 1> apply(
const VectorizedN<int32_t, 1>& src) {
auto src128 = _mm512_cvtepi32_epi8(src[0]);
return Vectorized<int8_t>(_mm512_castsi128_si512(src128));
}
};
template <>
struct VecConvert<int8_t, 1, int16_t, 1> {
static inline VectorizedN<int8_t, 1> apply(
const VectorizedN<int16_t, 1>& src) {
auto src256 = _mm512_cvtepi16_epi8(src[0]);
return Vectorized<int8_t>(_mm512_castsi256_si512(src256));
}
};
template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,

View File

@ -255,6 +255,12 @@ public:
return _mm512_floor_pd(values);
}
Vectorized<double> frac() const;
double reduce_add() const {
return values[0];
}
double reduce_max() const {
return values[0];
}
Vectorized<double> neg() const {
return _mm512_xor_pd(_mm512_set1_pd(-0.), values);
}

View File

@ -236,27 +236,27 @@ public:
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
static __m512 vec_factorial_1 =
const __m512 vec_factorial_1 =
_mm512_set1_ps(0.999999701f); // 1/factorial(1)
static __m512 vec_factorial_2 =
const __m512 vec_factorial_2 =
_mm512_set1_ps(0.499991506f); // 1/factorial(2)
static __m512 vec_factorial_3 =
const __m512 vec_factorial_3 =
_mm512_set1_ps(0.166676521f); // 1/factorial(3)
static __m512 vec_factorial_4 =
const __m512 vec_factorial_4 =
_mm512_set1_ps(0.0418978221f); // 1/factorial(4)
static __m512 vec_factorial_5 =
const __m512 vec_factorial_5 =
_mm512_set1_ps(0.00828929059f); // 1/factorial(5)
static __m512 vec_exp_log2ef =
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m512 vec_half = _mm512_set1_ps(0.5f);
static __m512 vec_one = _mm512_set1_ps(1.f);
static __m512 vec_zero = _mm512_set1_ps(0.f);
static __m512 vec_two = _mm512_set1_ps(2.f);
static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
const __m512 vec_half = _mm512_set1_ps(0.5f);
const __m512 vec_one = _mm512_set1_ps(1.f);
const __m512 vec_zero = _mm512_set1_ps(0.f);
const __m512 vec_two = _mm512_set1_ps(2.f);
const __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
const __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
const int n_mantissa_bits = 23;
// exp(x) =
// = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
@ -364,6 +364,12 @@ public:
}
return loadu(tmp);
}
float reduce_add() const {
return _mm512_reduce_add_ps(values);
}
float reduce_max() const {
return _mm512_reduce_max_ps(values);
}
Vectorized<float> neg() const {
return _mm512_xor_ps(_mm512_set1_ps(-0.f), values);
}
@ -473,26 +479,26 @@ inline Vectorized<float> Vectorized<float>::frac() const {
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
auto zero_vec = _mm512_set1_epi32(0);
auto max = _mm512_max_ps(a, b);
auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
0xFFFFFFFF));
// Exploit the fact that all-ones is a NaN.
return _mm512_or_ps(max, isnan);
// auto zero_vec = _mm512_set1_epi32(0);
return _mm512_max_ps(a, b);
// auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
// auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
// 0xFFFFFFFF));
// // Exploit the fact that all-ones is a NaN.
// return _mm512_or_ps(max, isnan);
}
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
auto zero_vec = _mm512_set1_epi32(0);
auto min = _mm512_min_ps(a, b);
auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
0xFFFFFFFF));
// auto zero_vec = _mm512_set1_epi32(0);
return _mm512_min_ps(a, b);
// auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
// auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
// 0xFFFFFFFF));
// Exploit the fact that all-ones is a NaN.
return _mm512_or_ps(min, isnan);
// return _mm512_or_ps(min, isnan);
}
template <>
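
(Editor's note, not part of the diff: the commented-out lines implement the NaN-propagating maximum described in the comment above, while the bare _mm512_max_ps returns its second operand when either input is NaN. A scalar analogue of the propagate-NaN trick, relying on the all-ones bit pattern being a NaN:)

#include <cmath>
#include <cstdint>
#include <cstring>

float maximum_propagate_nan(float a, float b) {
  float m = std::fmax(a, b);             // stand-in for the vector max
  if (std::isnan(a) || std::isnan(b)) {  // the vector code builds this test as a lane mask
    uint32_t all_ones = 0xFFFFFFFFu;     // all-ones is a quiet-NaN bit pattern
    std::memcpy(&m, &all_ones, sizeof(m));
  }
  return m;
}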

View File

@ -267,6 +267,12 @@ public:
Vectorized<int32_t> abs() const {
return _mm512_abs_epi32(values);
}
int32_t reduce_add() const {
return _mm512_reduce_add_epi32(values);
}
int32_t reduce_max() const {
return _mm512_reduce_max_epi32(values);
}
Vectorized<int32_t> real() const {
return *this;
}
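
(Editor's note, not part of the diff: the new reduce_add()/reduce_max() members above expose horizontal reductions; on AVX-512 they map onto the _mm512_reduce_* helper intrinsics. Stand-alone usage, compile with -mavx512f:)

#include <immintrin.h>
#include <cstdio>

int main() {
  int v[16];
  for (int i = 0; i < 16; ++i) v[i] = i + 1;  // 1..16
  __m512i x = _mm512_loadu_si512(v);
  std::printf("sum=%d max=%d\n",
              _mm512_reduce_add_epi32(x),     // 136
              _mm512_reduce_max_epi32(x));    // 16
  return 0;
}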

Some files were not shown because too many files have changed in this diff.