Compare commits


316 Commits

Author SHA1 Message Date
b9caa336a0 Made everything unranked 2024-09-12 15:01:52 -07:00
dab7d646d5 Use a better decomposition for split_with_sizes (#135728)
This decomposition has fewer checks and improves the performance
of torch.compile.
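
For reference, a minimal illustration of the op this decomposition covers (shapes are illustrative):

```python
import torch

x = torch.arange(10)
# split_with_sizes returns views whose lengths are given explicitly
a, b, c = x.split_with_sizes([2, 3, 5])
print(a.shape, b.shape, c.shape)  # torch.Size([2]) torch.Size([3]) torch.Size([5])
```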
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135728
Approved by: https://github.com/ezyang
2024-09-12 16:38:51 +00:00
7647c398ff Allow optional positional arguments for torch.func.functional_call (#134643)
This PR resolves #134408. It adds an additional test, which passes locally.

Do you think we should add a post-check to ensure `args` and `kwargs` are not both `None`? It seems to be possible to have modules without inputs.

This PR does not include any such post-check.
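
A minimal sketch of the kwargs-only call pattern this enables (the module and input name are illustrative; it assumes `args` now defaults to `None`):

```python
import torch
import torch.nn as nn

lin = nn.Linear(3, 2)
params = dict(lin.named_parameters())
x = torch.randn(4, 3)

# positional args omitted; the module input is passed via kwargs only
out = torch.func.functional_call(lin, params, kwargs={"input": x})
print(out.shape)  # torch.Size([4, 2])
```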

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134643
Approved by: https://github.com/zou3519
2024-09-12 15:22:06 +00:00
d67cc58181 [ONNX] Fix symbolic values and numpy implementation (#135786)
1. Remove `__eq__` to make `SymbolicTensor` hashable and test for that
2. Update the `__array__` method so that it works for tensors on GPU

Fixes https://github.com/pytorch/pytorch/issues/135700
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135786
Approved by: https://github.com/titaiwangms
2024-09-12 14:24:43 +00:00
dddaadac6c [dynamo] Dont graph break on inner torch.compile (#135819)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135819
Approved by: https://github.com/jansel
2024-09-12 11:39:09 +00:00
02169364e1 [inductor] Split reduction loops when there is no shared reads (#134307)
Fixes #129102

![image](https://github.com/user-attachments/assets/0d00f75b-2bb9-4ce6-a0d9-2daceaff539c)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134307
Approved by: https://github.com/shunting314
2024-09-12 09:45:08 +00:00
c30042fbeb [GPT-fast] Update compilation time target for Llama & Mixtral (#135817)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135817
Approved by: https://github.com/xmfan, https://github.com/huydhn
2024-09-12 07:13:44 +00:00
6700175531 [Inductor] simplify indexing_exprs in LoopBody._init_with_copy (#135574)
This PR uses `var_ranges` information to simplify `indexing_exprs` in `LoopBody._init_with_copy` to reduce occurrences of `FloorDiv` and `ModularIndexing` in the `indexing_exprs`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135574
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-12 06:56:34 +00:00
de8a8653c0 [dtensor][BE] replace compute_local_shape with compute_local_shape_and_global_offset (#135554)
**Summary**
1. This PR removes the public API `compute_local_shape` and replaces its use with the more general API `compute_local_shape_and_global_offset`.
2. To keep `compute_local_shape_and_global_offset` consistent with `compute_local_shape` on empty shards, it now returns a local tensor shape of `(0,)` for empty shards, which is more aligned with DTensor's semantics on non-participating ranks.

**Test**
`pytest test/distributed/_tensor/test_dtensor.py`
`pytest test/distributed/_tensor/test_init.py`
`pytest test/distributed/_tensor/test_tensor_ops.py`

Differential Revision: [D62415591](https://our.internmc.facebook.com/intern/diff/D62415591)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135554
Approved by: https://github.com/tianyu-l, https://github.com/wz337
2024-09-12 06:30:09 +00:00
86335e9135 [reland 3/3][fx] Bypass custom __setattr__ in Node.__init__ (#135735)
Relands #135079 which was reverted by #135562

I broke this up into three parts to test internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135735
Approved by: https://github.com/oulgen
2024-09-12 05:50:39 +00:00
14e3f3c062 [aoti] Remove nlohmann/json.hpp from header (#135765)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135765
Approved by: https://github.com/malfet
2024-09-12 05:38:51 +00:00
9852c6d236 xpu: fix 3rd party builds on systems with cmake<3.25 (#135767)
The CMake LINUX variable is only available starting from CMake 3.25. It is better to use CMAKE_SYSTEM_NAME instead to relax the CMake version requirement.

See: https://cmake.org/cmake/help/v3.25/variable/LINUX.html
Fixes: #135766
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135767
Approved by: https://github.com/malfet, https://github.com/guangyey
2024-09-12 05:31:01 +00:00
6354271178 [inductor] Skip unused call to get_estimated_runtime() (#135776)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135776
Approved by: https://github.com/oulgen
ghstack dependencies: #135445, #135446
2024-09-12 05:22:23 +00:00
12902f6ecf [inductor] Cache get_operation_names/get_buffer_names (#135446)
Before:
![image](https://github.com/user-attachments/assets/db5b6fce-d849-4512-a21d-7a09efc72311)

After:
![image](https://github.com/user-attachments/assets/097e340c-03b2-491e-ad36-132350b37892)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135446
Approved by: https://github.com/oulgen
ghstack dependencies: #135445
2024-09-12 05:22:23 +00:00
3decb676aa [inductor] Optimize cache_on_self (#135445)
This is a small compile time win, but also makes profiles more readable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135445
Approved by: https://github.com/oulgen
2024-09-12 05:22:23 +00:00
8d68a02905 OpenReg: Split the daemon into driver/executor (#135646)
Split the daemon into a proper user-process driver vs device-process executor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135646
Approved by: https://github.com/albanD
2024-09-12 05:03:46 +00:00
28330a8a39 [reland 1/3][fx] Bypass custom __setattr__ in Node.__init__ (#135733)
Relands #135079 which was reverted by #135562

I broke this up into three parts to test internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135733
Approved by: https://github.com/oulgen
2024-09-12 04:29:37 +00:00
eaba287adb [dynamo] Bug fix for _torchdynamo_inline source handling (#135612)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135612
Approved by: https://github.com/drisspg
2024-09-12 04:05:08 +00:00
cyy f5f1d0a753 Fix build warnings for torch_python (#134981)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134981
Approved by: https://github.com/ezyang
2024-09-12 03:59:34 +00:00
5bc238c73e torch.hub: add get_dir/set_dir type hints (#134906)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134906
Approved by: https://github.com/Skylion007
2024-09-12 03:53:29 +00:00
79223114db Avoid inserting extra transpose when the input to group norm is NHWC (#135575)
When the input format for group norm is NHWC and the device is privateuseone, an additional transpose operation is introduced. To avoid this, a check for the privateuseone device needs to be added here.
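
For context, a minimal example of an NHWC (channels-last) input to group norm (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 32, 16, 16).to(memory_format=torch.channels_last)  # NHWC layout
y = F.group_norm(x, num_groups=4)
print(y.shape)  # torch.Size([8, 32, 16, 16])
```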

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135575
Approved by: https://github.com/ezyang
2024-09-12 03:36:05 +00:00
cyy 7cfd23636c Fix clang-tidy warnings in Caffe2 code (#134935)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134935
Approved by: https://github.com/ezyang
2024-09-12 03:27:09 +00:00
0d1d69fd25 Update torch-xpu-ops pin (ATen XPU implementation) (#135647)
Release cycle for PyTorch 2.5
1. Fixes a runtime error on Windows where torch_xpu_ops_unary_binary_kernels.dll fails to load because the binary size is too large.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135647
Approved by: https://github.com/EikanWang
2024-09-12 03:16:08 +00:00
21a64d57b1 [BE] typing for decorators - masked/_ops (#135108)
Differential Revision: D62184735

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135108
Approved by: https://github.com/Skylion007
2024-09-12 01:34:09 +00:00
1a74952925 "Remove BLOCK_LIST" (#135729)
Summary:
Skip test_prepare_qat_conv_bn_fusion_getitem_placeholder when we use training IR, since it only targets the bn-getitem pattern, which doesn't exist in training IR.

Remove BLOCK_LIST since it's empty.
Now all internal unittests will use training ir.

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan'  caffe2/test/quantization:test_quantization -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
buck2 run 'fbcode//mode/dev-nosan'  caffe2/test:quantization_pt2e_qat -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
```

Differential Revision: D62387987

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135729
Approved by: https://github.com/tugsbayasgalan
2024-09-12 01:22:06 +00:00
a130ed828a Fix the upload of x86 micro benchmark results (#135780)
The upload stats workflow currently skips this https://github.com/pytorch/pytorch/actions/runs/10807251335/job/29977650639; this is a miss from https://github.com/pytorch/pytorch/pull/135042. So, the workflow is running but nothing has been uploaded yet.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135780
Approved by: https://github.com/atalman
2024-09-12 01:16:38 +00:00
eb0fe02933 [PT2][inductor][Optimus] Add pad_aten_mm_pass pattern to resolve long computation kernel in LCE (#135167)
Summary:
We observed another long-computation issue for the OBA_AFOC pyper model, so we add a pattern to avoid the perf regression.

- Only happens in A100
- We do not want to use force_shape_pad since it will pad all GEMMs, which may not be optimal. The Optimus pass has more flexibility to customize GEMM shapes and do the corresponding padding
- To enable it, we pass the pass to the config, where "k_threshold_to_pad" can be customized:

inductor_config.patch(post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad" : 8388608}})
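
A rough sketch of applying that config patch around compilation (the options dict is taken from the line above; the surrounding model and inputs are illustrative):

```python
import torch
import torch.nn as nn
import torch._inductor.config as inductor_config

model = nn.Linear(1024, 1024).cuda().half()
x = torch.randn(8, 1024, device="cuda", dtype=torch.half)

# enable the Optimus pattern via post_grad_fusion_options; threshold taken from the description above
with inductor_config.patch(
    post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad": 8388608}}
):
    out = torch.compile(model)(x)
```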

Test Plan:
# unit test

```
buck2 test mode/opt //caffe2/test/inductor:pad_mm
```
Buck UI: https://www.internalfb.com/buck2/58b0f272-f405-45be-bc8d-aec2dc4d5841
Test UI: https://www.internalfb.com/intern/testinfra/testrun/10133099209954651
Network: Up: 9.0KiB  Down: 142B  (reSessionID-8eb71a37-a5ca-4aff-a4f1-93ade3e47e4e)
Jobs completed: 9. Time elapsed: 3:18.0s.
Cache hits: 0%. Commands: 3 (cached: 0, remote: 0, local: 3)
Tests finished: Pass 17. Fail 0. Fatal 0. Skip 0. Build failure 0

# e2e test
see [D62388582](https://www.internalfb.com/diff/D62388582)

Differential Revision: D62220158

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135167
Approved by: https://github.com/jackiexu1992
2024-09-12 00:51:34 +00:00
d270e2d240 [FSDP2] better error msg for cpu offloading (#135156)
When cpu offloading is enabled, if a user loads a gpu state dict, FSDP2 will throw a less obvious error at backward:
```
RuntimeError: attempting to assign a gradient with device type 'cpu' to a tensor with device type 'cuda'. Please ensure that the gradient and the tensor are on the same device
```

This PR throws the error more explicitly by specifying which parameters should be moved because of cpu offloading:

```
FSDP parameters should be materialized on cpu when enabling cpu offloading. For example, load cpu state dict or call module.to_empty(device="cpu"). Found following parameters on non-cpu device: ['0.weight']
```
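
A minimal sketch of the remedy the message points at, using a plain module for illustration (materialize on cpu, then load a cpu state dict):

```python
import torch
import torch.nn as nn

with torch.device("meta"):
    m = nn.Linear(8, 8)                # parameters start on the meta device

cpu_state_dict = {"weight": torch.randn(8, 8), "bias": torch.randn(8)}
m.to_empty(device="cpu")               # allocate real (uninitialized) cpu storage
m.load_state_dict(cpu_state_dict)      # parameters now live on cpu, as cpu offloading expects
print(m.weight.device)                 # cpu
```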

`pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_cpu_offload`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135156
Approved by: https://github.com/awgu
2024-09-12 00:05:07 +00:00
16b37b309f [Inductor] Rename cpp_wrapper_cuda.py as cpp_wrapper_gpu.py (#135313)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135313
Approved by: https://github.com/jansel, https://github.com/desertfire
ghstack dependencies: #135312
2024-09-11 23:59:54 +00:00
13ee85ca5e [Inductor] Generalize cuda cpp wrapper as common triton based GPU cpp wrapper, will be reused by xpu in next PR. (#135312)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135312
Approved by: https://github.com/jansel, https://github.com/desertfire, https://github.com/eellison
2024-09-11 23:59:54 +00:00
94d2471d1f [Traceable FSDP2] Use .copy_ instead of .set_ for unsharded_param inplace update; Replace unsharded_param graph input usage with graph intermediate; Support FSDP2+LoRA (#133730)
Using `fsdp.set_` for the unsharded_param inplace update causes difficult-to-debug errors when enabling Traceable FSDP2 on TorchTune models. In this PR, we change it to use `fsdp.copy_`, which fixes the error and also strictly follows eager semantics (i.e. if the user explicitly stores an alias of the unsharded_param during execution of the user's module code, that alias will get updated correctly when the unsharded_param is copy_'d into; whereas if we just swap out the unsharded_param storage via set_, that user-saved alias will not get updated, which is not good).
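
A toy illustration of that aliasing difference with plain tensors (not FSDP internals):

```python
import torch

p = torch.zeros(4)
alias = p.view(2, 2)             # user-saved alias sharing p's storage
p.copy_(torch.ones(4))           # in-place write: the alias observes the update
print(alias)                     # tensor([[1., 1.], [1., 1.]])

q = torch.zeros(4)
alias_q = q.view(2, 2)
q.set_(torch.full((4,), 2.0))    # storage swap: alias_q still points at the old storage
print(alias_q)                   # tensor([[0., 0.], [0., 0.]])
```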

This PR also implements the graph pass to remove the resizes and copy if there is a resize_(full) -> copy_ -> resize_(0) pattern.

------

Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_trace_fsdp_copy_`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_partitioner_cse_respects_mutation_boundaries`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_fsdp_set_input_mutation_applied_when_input_gets_no_gradients`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_mutation_op_matching`
- `python test/inductor/test_distributed_patterns.py DistributedPatternTests.test_fake_distributed_aot_eager`
- `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=1 PYTORCH_TEST_WITH_CROSSREF=1 python test/functorch/test_aotdispatch.py TestEagerFusionOpInfoCPU.test_aot_autograd_exhaustive_norm_cpu_float32`
- `python test/distributed/test_inductor_collectives.py TestCollectivesInductor.test_backwards`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133730
Approved by: https://github.com/bdhirsh
2024-09-11 23:01:05 +00:00
5ca46be15e Fix/torch cat doc attr (#135698)
The `torch.cat` attr name for tensors in the docs differs from the method signature, unlike other methods.
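For reference, a call using the signature's parameter name `tensors`:

```python
import torch

a, b = torch.zeros(2, 3), torch.ones(2, 3)
out = torch.cat(tensors=(a, b), dim=0)  # first parameter is named `tensors` in the signature
print(out.shape)  # torch.Size([4, 3])
```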
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135698
Approved by: https://github.com/albanD

Co-authored-by: Alexander Jipa <azzhipa@amazon.com>
2024-09-11 22:32:55 +00:00
9a04cfbeff fix for fp16 (#134106)
This PR is a replacement for https://github.com/pytorch/pytorch/pull/133085 for pushing a quick fix for RMSNorm.
The original author is @kkontny

Previous PR summary:
Since FP16 has a quite small dynamic range, it is very easy to overflow while computing `at::pow(input, 2)`, and it happens in real-world computation.

I've tried to use the `nn.RMSNorm` fused implementation instead of `LlamaRMSNorm` inside the `transformers` implementation of Llama (`src/transformers/models/llama/modeling_llama.py`). It started to give wrong answers in FP16 while still giving good ones in FP32. I figured out this happens due to overflow while computing the square of the input tensor.

The original `LlamaRMSNorm` implementation upcasts the input to FP32 to prevent this and give better numerical stability.

```
import torch
from torch import nn


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```

The proposed commit fixes the issue. FP16 in RMSNorm has to be treated in a special way to be usable in real-world implementations.
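
A quick repro of the FP16 overflow in the squaring step (FP16 max is about 65504):

```python
import torch

x = torch.tensor([300.0], dtype=torch.float16)
print(x.pow(2))                 # tensor([inf], dtype=torch.float16): 300**2 = 90000 overflows fp16
print(x.float().pow(2).mean())  # upcasting to fp32 first keeps the value finite
```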

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134106
Approved by: https://github.com/mikaylagawarecki, https://github.com/eqy
2024-09-11 22:02:07 +00:00
66db61f0d1 [ONNX] Update fake mode usage in onnx docs (#135512)
Update fake mode usage in onnx docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135512
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-11 21:29:04 +00:00
c025f7becc Revert "[Partitioner] Reuse partition to check whether nodes exist (#135317)"
This reverts commit e004d539da3335d97a8134c9081245628f18eb67.

Reverted https://github.com/pytorch/pytorch/pull/135317 on behalf of https://github.com/izaitsevfb due to BC-breaking, breaks executorch and internal meta builds ([comment](https://github.com/pytorch/pytorch/pull/135317#issuecomment-2344730294))
2024-09-11 21:27:53 +00:00
8c4e1148b8 Refactoring byte_order (#135558)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135558
Approved by: https://github.com/mikaylagawarecki
2024-09-11 21:06:43 +00:00
e20ee39558 Expand bitwise ops to unsigned types (#135525)
Fixes https://github.com/pytorch/pytorch/issues/135436

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135525
Approved by: https://github.com/ezyang
2024-09-11 20:48:52 +00:00
74fd1bf965 [ROCm] Update to AOTriton 0.7b (#134498)
Notable changes:
1. Enable CudaGraph related tests
2. Fix UT problems
3. EXPERIMENTAL Navi31 support. User should enable Navi31 support with Env Var `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`

Known problem:
1. `test/test_transformers.py` will show massive failures and/or NaN outputs with `--use-pytest`
    + Update: Confirmed that skipping `class TestSDPAPrivateUse1Only` fixes the problem with `--use-pytest`

Note:
AOTriton 0.7b adds support for nested tensors + SDPA but needs more work (and consequently a separate PR) to enable it.

Fixes #133540

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134498
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily, https://github.com/malfet
2024-09-11 20:34:01 +00:00
5d964a5eb7 [Export] Fix SDPA decomposition (#135297)
Summary: Update the SDPA decomposition to match the updated stride from D62009189, which aligns strides with `aten._scaled_dot_product_attention_math.default` and makes `t.permute().contiguous().permute()` no longer necessary.

Test Plan: CI

Differential Revision: D62278378

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135297
Approved by: https://github.com/drisspg
2024-09-11 20:21:59 +00:00
118d7e1480 [Inductor] add _dynamo.reset to test_cat_slice_cat_cuda (#135694)
Summary: test_cat_slice_cat_cuda runs inductor multiple times and checks counters["inductor"] in between, and thus we need to reset properly.

Differential Revision: D62500331

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135694
Approved by: https://github.com/masnesral
2024-09-11 20:07:11 +00:00
dd47f6f623 Simplify expr before getting implications in _maybe_evaluate_static (#135499)
Fixes #134268

Previously we weren't simplifying these expressions before calling get_implications, resulting in inconsistent application of FloorDiv/CleanDiv. See #134268  for more details.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135499
Approved by: https://github.com/ezyang
2024-09-11 19:48:29 +00:00
e05ea2b179 Add decomposition for transpose_copy (#130943)
* Extracted from #128416
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130943
Approved by: https://github.com/amjames, https://github.com/eellison
2024-09-11 19:45:22 +00:00
ad75b09d89 Replace capture_pre_autograd_graph with export_for_training in torch tests (#135623)
Summary: as title

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:test_export -- -r test_conv_dynamic
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r matcher
 buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r x86
```

CI

Differential Revision: D62448302

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135623
Approved by: https://github.com/tugsbayasgalan
2024-09-11 19:23:08 +00:00
a2cb9b7331 Flip triton kernel default layout constraint to "needs_fixed_stride_order" (#135581)
This is to match the default layout constraint for custom operators. By
default, Inductor should match the stride order of inputs to a triton
kernel.

Test Plan:
- existing tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135581
Approved by: https://github.com/eellison
ghstack dependencies: #135530
2024-09-11 18:43:18 +00:00
451eaf0ff2 Log full exception trace when error raised in Dynamo (#135697)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135697
Approved by: https://github.com/Skylion007
2024-09-11 18:14:33 +00:00
09519eb195 Support rolling over a percentage of workflows (#134816)
In order to support adding a rollover percentage, this ended up being a complete rewrite of runner_determinator.py.

Details of the new format are in the comments up top.

On the plus side, this now includes some unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134816
Approved by: https://github.com/PaliC, https://github.com/zxiiro
2024-09-11 18:01:26 +00:00
5314ae2660 Don't use exception chaining for BackendCompilerFailed (#135545)
Commandeered from https://github.com/pytorch/pytorch/pull/135496 as I'm now helping @ezyang ship dynamic float arguments in PT2.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135545
Approved by: https://github.com/ezyang
2024-09-11 17:49:18 +00:00
da587de9cb [ROCm] [BUGFIX] Re-enable rocm-specific tuning parameters v2 (#133852)
Small bug fix - https://github.com/pytorch/pytorch/pull/124592 replaced the torch.version.hip with device_props but made a mistake in porting the original logic.

The original code was:
`if torch.version.hip is not None:`

Which was incorrectly replaced by:
`if self.device_props.type != "hip":`

Another occurrence of https://github.com/pytorch/pytorch/pull/130617

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133852
Approved by: https://github.com/masnesral, https://github.com/malfet
2024-09-11 17:21:40 +00:00
82a4df2d5f [CI] [ROCm] Run rocm workflow on every push to main branch (#135644)
Dial the frequency back up from https://github.com/pytorch/pytorch/pull/131637

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135644
Approved by: https://github.com/huydhn
2024-09-11 17:21:05 +00:00
18a9030952 [CI] Fix update slow tests (#135390)
* Add pytorchbot to list of approvers for file
* Add labels to the auto created PR

The auto generated PR is currently not merging due to some failing tests on slow workflow that were supposed to be moved back to normal

idk if this has much value, clearly we've been managing without the update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135390
Approved by: https://github.com/ZainRizvi
2024-09-11 17:02:17 +00:00
03f23d07b4 Optimize ShapeEnv.replace (#135652)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135652
Approved by: https://github.com/ezyang
ghstack dependencies: #135621, #135622
2024-09-11 16:50:59 +00:00
8c738c9270 Improve performance of sympy_generic_le (#135622)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135622
Approved by: https://github.com/ezyang
ghstack dependencies: #135621
2024-09-11 16:20:03 +00:00
7ddacaf40a Improve performance of canonicalize_bool_expr (#135621)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135621
Approved by: https://github.com/ezyang
2024-09-11 16:20:03 +00:00
183c32fd3b Revert "[Dynamo] Trace torch function modes entered outside of torch.compile (#133137)"
This reverts commit 0d15122092c27fec1143b800bab7c996d126b547.

Reverted https://github.com/pytorch/pytorch/pull/133137 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/133137#issuecomment-2344054339))
2024-09-11 15:57:00 +00:00
3ab12e2596 Revert "[Dynamo] Support thread local setattr (#135443)"
This reverts commit 160c228a4bd60ceffa62b045a6b0a6f9413835c5.

Reverted https://github.com/pytorch/pytorch/pull/135443 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/135443#issuecomment-2344042800))
2024-09-11 15:53:55 +00:00
596e93b506 Revert "[dynamo] Bug fix for _torchdynamo_inline source handling (#135612)"
This reverts commit 5c3d0a2dedbc0e85f3b256ce56ac674078a5fae1.

Reverted https://github.com/pytorch/pytorch/pull/135612 on behalf of https://github.com/clee2000 due to broke inductor/test_cpu_select_algorithm.py::TestSelectAlgorithmCPU::test_linear_input_transpose_bias_True_cpu_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/10805518363/job/29982386304) [HUD commit link](5c3d0a2ded), bad TD ([comment](https://github.com/pytorch/pytorch/pull/135612#issuecomment-2344039370))
2024-09-11 15:51:12 +00:00
f96e8041b1 Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
This reverts commit 444b52ff40cf4afce7bc3fdcf021a88eab3b954c.

Reverted https://github.com/pytorch/pytorch/pull/135444 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/135444#issuecomment-2344036843))
2024-09-11 15:48:27 +00:00
7cf9c81918 Revert "[Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)"
This reverts commit 6a3edfcc1e474e6ebd0c06624000a6d6bf1a0dee.

Reverted https://github.com/pytorch/pytorch/pull/134732 on behalf of https://github.com/clee2000 due to broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2344016694))
2024-09-11 15:39:21 +00:00
49e0b88aab Fix test_triton_kernel_float64_constant (#135583)
Summary: Landed https://github.com/pytorch/pytorch/pull/135260 too soon, and the test in that PR doesn't do exactly what I tested (actually testing different dtypes).

Test Plan: `python test/inductor/test_triton_kernels.py -k float64_constant`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135583
Approved by: https://github.com/isuruf, https://github.com/eellison, https://github.com/Skylion007
2024-09-11 15:16:23 +00:00
ee8c5cc1cc For S444023: Back out "deprecate search_autotune_cache (#133628)" (#135186)
Summary: For S444023

Test Plan:
Revert prevented the NaN errors - f639391901
Training job ran for 7767 iterations. NaN errors show up within the first 1k.

Reviewed By: nmacchioni

Differential Revision: D62224747

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135186
Approved by: https://github.com/kit1980
2024-09-11 14:08:40 +00:00
ce4d146f56 ATen | Fix MPSCNNNeuron creation on Mac Catalyst. (#135595)
Summary:
These are still utilized directly when using relu/sigmoid/tanh tensors from here: https://fburl.com/code/k6n7ofzd
However, on Mac Catalyst we were always returning `nil`, which in most cases rendered the entire graph useless and most often left stray `MPSTemporaryImage` references that were never written into.

This fixes the issue completely by making sure that we always return valid kernels, so they can be executed.

Test Plan: Test with segmentation net that uses a combination of relu and other tensors together - run this via Mac Catalyst build - it works! {F1858576745}

Reviewed By: MichaelTay

Differential Revision: D62430010

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135595
Approved by: https://github.com/MichaelTay
2024-09-11 11:12:23 +00:00
0226fcaacf Disable cuda specific restrictions in _scaled_mm for other devices (#135579)
Fixes #135576

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135579
Approved by: https://github.com/drisspg
2024-09-11 11:05:38 +00:00
4cde5096c4 [Inductor][FlexAttention] Supports dynamic shapes with block mask (#135629)
Fixes #134560 and #135206

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135629
Approved by: https://github.com/drisspg
2024-09-11 08:10:50 +00:00
443c015393 [Distributed] Improve efficiency of NaN checker (#135414)
Some customers would like to run the NaN checks on the fly, so we are improving its efficiency.

## Benchmarking
Allreduce 2G floats. `TORCH_NCCL_NAN_CHECK=1`
Red kernel: ncclAllreduce
Blue kernel: Nan check

<img width="1093" alt="Screenshot 2024-09-06 at 10 00 05 PM" src="https://github.com/user-attachments/assets/5501bc31-024f-4115-adb2-dd66eb4025d3">

## Comparison with torch ops:
Let's say a user manually checks for NaNs with the following torch ops before all-reduce:
```
torch.any(torch.isnan(x))
```
<img width="1091" alt="Screenshot 2024-09-06 at 10 14 53 PM" src="https://github.com/user-attachments/assets/1f8b5f63-c955-4612-bb96-241b6c69959b">

So our perf is on-par with torch ops.

## Changes
- Load from vidmem using "big packs" of 16 bytes
- Bump `blockDim.x` from 256 to 512
- Separate loads and checks into two loops, each of 8 iterations
- Unroll the loops
- Templated functions for checking NaN in a "big pack" based on dtype

Special thanks to @jbachan from NCCL!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135414
Approved by: https://github.com/wconstab
2024-09-11 07:53:42 +00:00
4ae6d7c18f Back out "[pytorch][PR] [export] fix re-export custom metadata" (#135634)
Summary: Broke some tests. Revert this diff

Test Plan: CI

Differential Revision: D62474337

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135634
Approved by: https://github.com/tugsbayasgalan
2024-09-11 06:16:26 +00:00
3084b7b5c0 [cuDNN][SDPA] Support attn_bias in cuDNN (#130482)
CC @drisspg

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130482
Approved by: https://github.com/drisspg, https://github.com/Skylion007, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-11 05:59:25 +00:00
5c3d0a2ded [dynamo] Bug fix for _torchdynamo_inline source handling (#135612)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135612
Approved by: https://github.com/drisspg
ghstack dependencies: #135588
2024-09-11 05:23:42 +00:00
c608b17f60 [PTD][BE][c10d] Add some code documents for TCPStore code and cosmetic changes to libUVStore code (#130496)
While designing something else that needs TCPStore, I spent some time digging into the TCPStore codebase and found that the code is a little bit challenging to understand without proper documentation. Although people from the OSS community must be smarter than me, I still want to document my findings in the code so that devs and users can use them as a reference down the road.

Also, for libuv, we need to prefix private variables with a "_", so this is a pure renaming of private variables such as `tcpServer`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130496
Approved by: https://github.com/wconstab
2024-09-11 04:42:25 +00:00
444b52ff40 [Dynamo] Simplify torch function mode stack guard (#135444)
The semantics of ignored modes previously had edge cases; this eliminates them by, in essence, filtering any ignored modes out of both the ref stack and the current torch function mode stack. This is purely to fix complexity in #135422. The ignored-modes handling will be removed in a future PR after https://github.com/pytorch/pytorch/pull/135422 lands, since we will then trace through DeviceContexts vs inserting them into the graph, which needed these extra workarounds for correctness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135444
Approved by: https://github.com/anijain2305, https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443
2024-09-11 04:18:22 +00:00
160c228a4b [Dynamo] Support thread local setattr (#135443)
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66)), this PR adds support for calling the setattr of thread-local objects. These objects have a slots impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations, since calling `object.__setattr__` on these objects results in a type error.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
2024-09-11 04:18:22 +00:00
0d15122092 [Dynamo] Trace torch function modes entered outside of torch.compile (#133137)
This PR adds initial tracing for torch function modes.

Details:
In essence, this adds tracing into the torch function of modes entered outside of the torch.compile call.
This does not yet support tracing enter/exit of a torch function mode/ tracing set_default_device properly using the new mode infra (this will be a very good stress test for modes). I am adding more PRs to this stack to support these. The overall plan is to support tracing enter/exit and handling graph breaks like we do other torch.* context managers.

Previously landed:
https://github.com/pytorch/pytorch/pull/133135
https://github.com/pytorch/pytorch/pull/133136
https://github.com/pytorch/pytorch/pull/133134
https://github.com/pytorch/pytorch/pull/133133
https://github.com/pytorch/pytorch/pull/133132
https://github.com/pytorch/pytorch/pull/133131
https://github.com/pytorch/pytorch/pull/133729
https://github.com/pytorch/pytorch/pull/133130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133137
Approved by: https://github.com/jansel, https://github.com/zou3519
ghstack dependencies: #134732
2024-09-11 04:18:22 +00:00
6a3edfcc1e [Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
2024-09-11 04:18:22 +00:00
356f14e7b7 Fix the output of FileCheck when not run and add unit tests (#135345)
When FileCheck is destructed without execution, it should output all rules.
For example:
```
>>> fc = FileCheck().check("test")
>>> del fc
You have not run this instance of FileCheck!
FileCheck checks:
        CHECK: test
```

Additionally, unit tests for the Python interface of FileCheck will be added.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135345
Approved by: https://github.com/eellison
2024-09-11 04:13:24 +00:00
34dc8f69a1 Adding entry-point based support for out-of-tree rendezvous plugins (#132633)
Fixes #127519

Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their out-of-tree rendezvous backend implementations as Python packages.

#### AUTHORING NEW PLUGIN
Any new plugin will be a python package exposing entry-points. For example, the structure of redis plugin is as follows:

```
plugin_root
|_ pyproject.toml
|_ src
   |_ redis
      |_ __init__.py
      |_ redis_store.py
      |_ redis_backend.py
```

The contents of the `pyproject.toml` should indicate that this package exposes a torchrun entry-point by mentioning the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:

```
[project]
name = "redis"
version = "0.0.1"

[project.entry-points.'torchrun.plugins']
redis = 'redis'
```

The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:

```
def getPluginHandler():
    def _create_redis_handler(params: RendezvousParameters):
        from redis_rendezvous_backend import create_backend
        backend, store = create_backend(params)
        return create_handler(store, backend, params)
    return _create_redis_handler
```

The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.

#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin packages. For example, published packages can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.

Once installed, the new backend can be used in torchrun as follows:

```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/fduwjj
2024-09-11 03:35:02 +00:00
cd9ee49a69 [aoti] Add cpp loader (#135374)
* Added a cpp loader, AOTIModelPackageLoader, which can load the .pt2, build the .so, and create a runner. The python-facing API is that users can directly call the `run` function, whereas in cpp users can directly access the `runner_` if they are more familiar with that. I couldn't figure out how to bind the `get_runner()` function to python...
* Added a new config, `aot_inductor.package_cpp_only` which will **not** package the so. This means that whenever the package is loaded, we will need to build the so. This is turned off by default so that new environments do not need to rebuild their so. The `package_cpp_only` is a feature which torchchat intends to use to provide flexibility to users.
* Added a new config, `aot_inductor.metadata` which stores user-provided metadata, serialized to the pt2 as a json file. It also stores the device used when exporting, "cuda" or "cpu", so that during load time, we can use that data to determine which AOTIModelContainerRunner to use. The metadata can be accessed through `loader.get_metadata()`. TODO is to move this metadata to the toplevel `package_aoti` function so that we can remove the metadata as a config.
* Separated out `package_aoti` as a standalone function, instead of it automatically being called in inductor. This is to prepare for the case where users will compile multiple models and want to bundle them in one package. The specific use case is in torchchat, where we want to package the separately-exported encoder and decoder layers. An example of how to use this is in `test_multiple_methods`.
* `load_package` will load a singular model, given the model name.
* The loader doesn't support Windows for now; I think I need to add some more casing to make the build commands work on Windows.

Differential Revision: [D62329906](https://our.internmc.facebook.com/intern/diff/D62329906)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135374
Approved by: https://github.com/desertfire, https://github.com/malfet
2024-09-11 03:00:01 +00:00
26e5572dd2 Bump triton xpu pin and release version (#135638)
Similar with https://github.com/pytorch/pytorch/pull/135627

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135638
Approved by: https://github.com/atalman
2024-09-11 00:56:15 +00:00
693897df42 [dynamo] Missing guard source keys for corner case of NNModuleVariabl… (#135041)
Potentially fixes - https://fb.workplace.com/groups/1286739428954016/permalink/1319662695661689/

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135041
Approved by: https://github.com/ezyang
2024-09-11 00:43:26 +00:00
3bf6be457d [MPS] Add missing dispatch to rshift.Tensor (#135607)
Missed it while working on https://github.com/pytorch/pytorch/pull/131813
Test plan: `python -c "import torch;print(torch.randint(100, 500, (64,), device='mps') >> torch.tensor([3,], device='mps'))"`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135607
Approved by: https://github.com/manuelcandales
2024-09-11 00:20:53 +00:00
492f064f15 [ONNX] Add assertion nodes to ignoring list (#135591)
Fixes #135419

PS: there are 104 empty output nodes, I suggest we add them one by one when we run into them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135591
Approved by: https://github.com/justinchuby
2024-09-11 00:18:17 +00:00
29408ea81a Add option to tweak inductor stride settings for user-defined triton kernels (#135530)
Previously, Inductor was allowed to modify the stride/storage_offset
(layout) for inputs to user-defined triton kernels. This can cause
silent incorrectness because most triton kernels are written for a
specific striding pattern (usually contiguous).

This PR adds a config to allow the user to choose Inductor's behavior on
this. The options are:
- "flexible_layout" (default): Inductor can modify the layout for inputs
  to user-defined triton kernels as much as it wants.
- "needs_fixed_stride_order": Inductor must preserve the stride order
  (when compared to tracing) for inputs to user-defined triton kernels.

This matches our handling for custom operators. In the future, we'll
want a "needs_exact_strides" option (this is the safest option).

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135530
Approved by: https://github.com/FindHao, https://github.com/oulgen
2024-09-11 00:11:17 +00:00
02dcb07765 Add boolean support in pack segments ops for both cpu and cuda impls (#132897) (#135620)
Summary:

Same as int types, forward only.

bypass-github-export-checks diff has been synced to github

Test Plan:
buck test mode/dev-nosan //caffe2/torch/fb/sparsenn:test -- test_pack_segments
https://www.internalfb.com/intern/testinfra/testconsole/testrun/16888498646804437/

Reviewed By: garroud

Differential Revision: D60785563

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135620
Approved by: https://github.com/kit1980

Co-authored-by: Haoming Lu <haominglu@meta.com>
2024-09-11 00:03:17 +00:00
5c38aa72c0 [dynamo][dicts][nv-embed] Support update with kwargs (#135588)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135588
Approved by: https://github.com/yanboliang
2024-09-10 23:50:23 +00:00
5134ba7458 Bump triton pin and release version (#135627)
Update the pin and release version to sync with https://github.com/triton-lang/triton/tree/release/3.1.x

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135627
Approved by: https://github.com/Chillee, https://github.com/drisspg, https://github.com/malfet
2024-09-10 23:46:36 +00:00
e48ee2cf50 [ONNX] Fix scaled_dot_product_attention with float scale (#135594)
Fixes #125158

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135594
Approved by: https://github.com/justinchuby
2024-09-10 23:04:02 +00:00
eb38ee21ba [ROCm] slow torch.sum optimization by increasing max_values_per_thread in reduce config (#135397)
Fixes #132964

This change optimizes torch.sum() performance by increasing max_values_per_thread in setReduceConfig() for the ROCm platform.
By increasing this parameter, the kernel uses fewer thread blocks, which improves performance.

Test:
Tested on MI300x and H100, and now the MI300x perf improved to 3205GByte/s from ~1690GByte/s for the test case and is slightly better than H100 (3136GByte/s).

Also tested with other tensor sizes and likewise saw perf improvements.

```python
import torch
from triton.testing import do_bench

x = torch.randn(2**30, device='cuda')

ms = do_bench(lambda: x.sum(dim=-1))

bandwidth_gbyte = x.numel() * x.dtype.itemsize / (10**9)

time_s = ms / 1000

bw_per_second = bandwidth_gbyte / time_s

print(bw_per_second)
```

Co-author: @carlobertolli

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135397
Approved by: https://github.com/eqy, https://github.com/malfet
2024-09-10 21:03:01 +00:00
8057b72763 [ez][inductor] don't benchmark cloning if there are no mutated args (#135533)
When a kernel does not have mutated args (this is quite common?), benchmarking the cost of cloning actually benchmarks a no-op. This still takes >100ms since triton.testing.do_bench will allocate 100 ms budget to run the kernel.
Skipping this benchmarking can save quite some compilation time if the code path is hit multiple times. Let's say, if the code path is hit 100 times when the graph is large, we would save >10s.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135533
Approved by: https://github.com/jansel
ghstack dependencies: #135531
2024-09-10 20:54:31 +00:00
7b17918dc9 [inductor] fix a device sync issue for benchmarking fusion (#135531)
Fix https://github.com/pytorch/pytorch/issues/134768 .

When we benchmark the latency for a fused node set, we do benchmarking twice:
1. benchmark the latency of the kernel including cloning mutated args
2. benchmark the latency of cloning mutated args without running the kernel

We subtract result 2 from result 1 to get the latency of the kernel itself.

But when the tensors are not on cuda device 0, we get equal numbers for result 1 and result 2 no matter how much work the kernel does. The root cause is that in `triton.testing.do_bench` the `torch.cuda.synchronize` call syncs the current cuda device (which is device 0 if it's not overridden). But since the tensors and kernels are located on another device, the sync actually does nothing (unless there happen to be other kernels on device 0).

The fix is to set the correct current device in our benchmarking code.
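
A minimal sketch of the fix (device index and workload are illustrative):

```python
import torch
from triton.testing import do_bench

x = torch.randn(4096, 4096, device="cuda:1")  # tensors live on a non-default device

# do_bench's torch.cuda.synchronize() syncs the *current* device (cuda:0 by default),
# so set the current device to where the tensors/kernels actually live before benchmarking
with torch.cuda.device(x.device):
    ms = do_bench(lambda: x @ x)
print(ms)
```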

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135531
Approved by: https://github.com/jansel
2024-09-10 20:54:31 +00:00
66c45f3ed9 [export] fix re-export custom metadata (#135282)
Fixes #134778

When a model is exported and debug handles are added to the "custom" field of non-placeholder and non-output nodes in the graph, re-exporting it will change the metadata of placeholder nodes (the "custom" field will be added or copied to these nodes, depending on whether `ExportedProgram` or `ExportedProgram.module()` is passed to `generate_numeric_debug_handle()`).

This occurs because when we re-export the model, `placeholder` nodes are unlifted to `get_attr` nodes. These nodes remain as `get_attr` after being exported to `gm_torch_level`.  Their metadata are modified [here](https://github.com/pytorch/pytorch/blob/main/torch/export/_trace.py#L1347) based on `params_buffers_to_node_meta` which is collected [here](https://github.com/pytorch/pytorch/blob/main/torch/export/_trace.py#L1312).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135282
Approved by: https://github.com/jerryzh168, https://github.com/zhxchen17, https://github.com/tugsbayasgalan
2024-09-10 20:15:02 +00:00
0a9d55d2ee Revert "[AOTI] Fix assert_function call in cpu autotune template (#135086)"
This reverts commit 16c3b8f87cfa9cb5acee8104820baa389e7ee2bd.

Reverted https://github.com/pytorch/pytorch/pull/135086 on behalf of https://github.com/izaitsevfb due to breaks internal tests, see D62405818 ([comment](https://github.com/pytorch/pytorch/pull/135086#issuecomment-2341889428))
2024-09-10 19:51:16 +00:00
4ca65d3323 [CI] Increase sharding for jobs that are timing out (#135582)
Increase sharding for
* slow grad check
* slow cuda tests slow / linux-focal-cuda12.1-py3.10-gcc9-sm86 / test
* avx

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135582
Approved by: https://github.com/huydhn, https://github.com/malfet
2024-09-10 19:45:13 +00:00
c932b39739 [FSDP2] Added _set_unshard_async_op (#135523)
This PR adds a private API `_set_unshard_async_op` that allows for running pre-forward and pre-backward all-gathers using the `async_op=True` path so that all-gather allocations happen in the default stream to avoid inter-stream fragmentation.

If using this option, forward requires explicit prefetching e.g. via the `unshard(async_op=True)` API for overlap. fp32 -> bf16 casts and the all-gather copy-in will not overlap with compute.

Differential Revision: [D62401551](https://our.internmc.facebook.com/intern/diff/D62401551)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135523
Approved by: https://github.com/weifengpy
2024-09-10 19:28:02 +00:00
1f15973657 [AOTI][Tooling][7/n] Add debug printing support for JIT inductor codegen path as well (#135285)
Summary:
1. Add the debug printer call one level lower for the triton kernel python wrapper codegen path
2. Add `torch.save()` for jit inductor as well
3. This also fixes the issue introduced in D61949020 (at the python wrapper code level, triton kernels were not printed)

Test Plan:
```
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1  TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+graph, inductor, +schedule, output_code" buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100 @//mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_addmm_abi_compatible_cuda
```

Differential Revision: D62272588

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135285
Approved by: https://github.com/chenyang78
2024-09-10 19:24:58 +00:00
fc88ba260f [amdsmi][torch] Update amdsmi API usages (#135504)
Summary: In ROCm 6.2.0 there were API name changes; we check if the new APIs exist and use them in this diff. See 7b2463abe0 for the changes.

Test Plan: CI

Differential Revision: D62325661

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135504
Approved by: https://github.com/eqy, https://github.com/houseroad
2024-09-10 19:15:39 +00:00
bf8d0e3107 [inductor] Enable subprocess parallel compile internally with killswitch (#132467)
Differential Revision: [D60629630](https://our.internmc.facebook.com/intern/diff/D60629630)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132467
Approved by: https://github.com/eellison
2024-09-10 19:05:46 +00:00
3a1239a248 [Profiler] Harden Record Function Kwargs (#135365)
Summary:
In S445839, we had HTA break because of the "stream" parameter that was added to gpu traces. This brought up discussions regarding hardening our post-processing of said inputs so as to not break the JSON schema or downstream tools. For this reason, this diff does the following:

1. Only allow int, double, bool and string values to be processed as kwinputs for JSON output. We can handle lists if needed in the future.
2. Make sure that any boolean is lowercased when rendered as a string so that the JSON does not break when parsing it
3. Force the stream parameter to be an int

Test Plan: Added unit tests to ensure that the list of requirements above is true for kwargs only.

Differential Revision: D62304843

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135365
Approved by: https://github.com/aaronenyeshi
2024-09-10 18:44:05 +00:00
4f9f1775d8 Fix flaky TestCudaWrapper.test_randint_cuda_cuda_wrapper (#135370)
Summary: This test is flaky when run after `test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper` because the TestCase sets config options globally in its setUp() that stick around for subsequent tests. For test isolation, we use a contextlib.ExitStack pattern in other tests to patch the config options and restore them in tearDown(). Update all TestCases in `test/inductor/test_combo_kernels.py` to use that pattern.
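
A sketch of the isolation pattern referred to here (the base class import and config name are illustrative assumptions):

```python
import contextlib
import torch._inductor.config as inductor_config
from torch._inductor.test_case import TestCase  # assumed base class for inductor tests

class ComboKernelTests(TestCase):
    def setUp(self):
        super().setUp()
        # patch config for this test only; restored in tearDown
        self._stack = contextlib.ExitStack()
        self._stack.enter_context(inductor_config.patch({"combo_kernels": True}))

    def tearDown(self):
        self._stack.close()
        super().tearDown()
```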

Test Plan:
```
python test/inductor/test_combo_kernels.py
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper TestCudaWrapper.test_randint_cuda_cuda_wrapper
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135370
Approved by: https://github.com/jansel
2024-09-10 18:43:14 +00:00
5e0788befb Migrate remaining jobs to use runner determinator (#134867)
At this point all self-hosted runner jobs should be using the runner determinator to switch between LF and Meta runners. This change updates the remaining jobs that have not yet been migrated over.

Issue: https://lf-pytorch.atlassian.net/browse/PC-25

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134867
Approved by: https://github.com/ZainRizvi
2024-09-10 18:14:00 +00:00
440f8f57af Revert "[fx] Bypass custom __setattr__ in Node.__init__ (#135079)" (#135562)
This reverts commit 66da3b3b2acacb116a9b23e91b24934830eaf6b8.

#135079 breaks internal tests and needs to be reverted. Reverting with mergebot doesn't work as this PR is technically part of the stack, but, according to @jansel, it should be possible to revert it individually.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135562
Approved by: https://github.com/jansel, https://github.com/seemethere
2024-09-10 18:07:11 +00:00
e004d539da [Partitioner] Reuse partition to check whether nodes exist (#135317)
Checking whether a node is in a NodeList has O(n) time complexity. Reuse the partition to speed this up, since partition.nodes is a hash table containing the same elements.
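
The underlying point in plain Python: membership tests on a list are a linear scan, while a set or dict gives average O(1) lookups.

```python
import timeit

nodes = list(range(100_000))
node_set = set(nodes)

print(timeit.timeit(lambda: 99_999 in nodes, number=1_000))     # O(n) scan per lookup
print(timeit.timeit(lambda: 99_999 in node_set, number=1_000))  # ~O(1) hash lookup
```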

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135317
Approved by: https://github.com/ezyang
2024-09-10 17:45:29 +00:00
c4b84a46a9 Add more logging to TunableOp validators (#135396)
Summary: Add more logging to TunableOp validators

Test Plan:
Verified additional logging when loading kernel selections:
```
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
PT_VERSION validation: expect 2.5.0 to match 2.5.0
```

```
[qizixi@devgpu039.atn3 /data/users/qizixi/fbsource/fbcode (f9305317d|remote/master)]$ PYTORCH_TUNABLEOP_VERBOSE=1 buck2 run mode/{opt,amd-gpu} -c fbcode.enable_gpu_sections=true //scripts/xdwang/example:fc_llama -- --enable-tuning
File changed: fbcode//hipblas_tuning_pt_llama0.csv
Buck UI: https://www.internalfb.com/buck2/1ed2fac4-743e-49ef-805f-7fb6b9300022
Network: Up: 0B  Down: 0B
Jobs completed: 4189. Time elapsed: 0.2s.
BUILD SUCCEEDED
Enabled tuning
- Run Linear (matmul) 2 x 1280 x 8192, dtype = torch.bfloat16
INFO:2024-09-06 14:38:07 2834864:2835138 CuptiActivityProfiler.cpp:260] HIP versions. Roctracer: 4.1; Runtime: 60032830; Driver: 60032830
INFO:2024-09-06 14:38:07 2834864:2836083 DynoConfigLoader.cpp:61] Setting communication fabric enabled = 0
reading tuning results from hipblas_tuning_pt_llama0.csv
Validator PT_VERSION=2.5.0
Validator ROCM_VERSION=6.0.0.0-12969-1544e39
Validator HIPBLASLT_VERSION=800-a15e4178
Validator GCN_ARCH_NAME=gfx942:sramecc+:xnack-
Validator ROCBLAS_VERSION=4.0.0-72e57364-dirty
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
PT_VERSION validation: expect 2.5.0 to match 2.5.0
Loading results
Avg time: 13.165860176086426 us, Achieved 3.19 TFLOPS, 1598.24 GB/s

- Run Linear (matmul) 2 x 8192 x 1024, dtype = torch.bfloat16
Avg time: 13.230760097503662 us, Achieved 2.54 TFLOPS, 1271.14 GB/s

- Run Linear (matmul) 2 x 7168 x 8192, dtype = torch.bfloat16
Avg time: 26.804399490356445 us, Achieved 8.76 TFLOPS, 4384.90 GB/s

- Run Linear (matmul) 2 x 8192 x 3584, dtype = torch.bfloat16
Avg time: 13.407809734344482 us, Achieved 8.76 TFLOPS, 4384.14 GB/s

2x1280x8192-torch.bfloat16,13.165860176086426,3.18574247630113,1598.237845349412
2x8192x1024-torch.bfloat16,13.230760097503662,2.536092541374924,1271.1420867780075
2x7168x8192-torch.bfloat16,26.804399490356445,8.762778814892096,4384.9040543618985
2x8192x3584-torch.bfloat16,13.407809734344482,8.759112362638383,4384.138585247748
```
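
A hedged sketch of how the validator logging above can be reproduced from Python; the environment variables are the ones visible in the log, and the matrix shapes are only illustrative:

```python
# Hedged sketch: set the TunableOp environment variables (shown in the log above)
# before the first GEMM runs, then execute a linear layer that TunableOp can tune.
import os

os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"   # turn tuning on
os.environ["PYTORCH_TUNABLEOP_VERBOSE"] = "1"   # print validator lines like those above

import torch

if torch.cuda.is_available():
    x = torch.randn(2, 8192, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(1280, 8192, device="cuda", dtype=torch.bfloat16)
    y = torch.nn.functional.linear(x, w)        # a 2 x 1280 x 8192 GEMM, as in the log
```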

Reviewed By: leitian

Differential Revision: D62322830

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135396
Approved by: https://github.com/eqy
2024-09-10 17:20:59 +00:00
bc1b8f094d Check function declarations of Core ML code (#135467)
Relax the restrictions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135467
Approved by: https://github.com/ezyang
2024-09-10 16:05:22 +00:00
f65a564fa2 [inductor] Flip custom_op_default_layout_constraint (#135239)
By default, Inductor should respect the stride order of input Tensors to
custom operators.
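
A hedged sketch of what "respecting the stride order" means for a user-defined op; the op name and implementation below are made up for illustration:

```python
# Hedged sketch (hypothetical op "mylib::scale"): with the flipped default, Inductor
# is expected to hand the custom op an input with the same stride order as in eager,
# e.g. a channels-last tensor stays channels-last at the op boundary.
import torch

@torch.library.custom_op("mylib::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor

@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)

@torch.compile
def f(x):
    return scale(x, 2.0)

x = torch.randn(2, 3, 8, 8).to(memory_format=torch.channels_last)
y = f(x)
```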

Test Plan:
- new tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135239
Approved by: https://github.com/albanD
ghstack dependencies: #135391
2024-09-10 14:27:43 +00:00
386b313028 Handle KeyError for compiler collective in scalars too (#135385)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135385
Approved by: https://github.com/jansel
2024-09-10 12:33:04 +00:00
6d7cbc20d2 Add dynamo itertools.pairwise support (#135416)
Fixes #133766
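
A hedged sketch of the newly supported pattern (requires Python >= 3.10 for `itertools.pairwise`):

```python
# Hedged sketch: with this support, dynamo can trace itertools.pairwise without a
# graph break; fullgraph=True would raise if a break were still required.
import itertools
import torch

@torch.compile(fullgraph=True)
def sum_adjacent(xs):
    return [a + b for a, b in itertools.pairwise(xs)]

out = sum_adjacent([torch.ones(3), 2 * torch.ones(3), 3 * torch.ones(3)])
```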

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135416
Approved by: https://github.com/XuehaiPan, https://github.com/jansel

Co-authored-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
2024-09-10 11:37:59 +00:00
ca16956b20 [Inductor] Generalize device guard codegen for cpp_wrapper mode. (#134761)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134761
Approved by: https://github.com/jansel, https://github.com/EikanWang
ghstack dependencies: #134693
2024-09-10 10:11:52 +00:00
67735d1ee8 [Inductor] Generalize is_cuda to specific device_type to make cpp_wrapper mode be extensible (#134693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134693
Approved by: https://github.com/ezyang, https://github.com/EikanWang, https://github.com/jansel
2024-09-10 10:11:13 +00:00
6e13f5eb38 [FlexAttention] Add broadcast support for kv batch dimension (#135505)
This PR adds broadcast support for the KV batch dimension.

## Details
Consider Q of shape `[Bq, Hq, Q_LEN, D]`, and K, V of shape `[Bkv, Hkv, KV_LEN, D]`. Prior to this PR, we required `Bq == Bkv`. However, for some use cases we may have `Bkv < Bq`. For example, in paged attention we provide K, V of shape `[1, Hkv, MAX_LEN, D]` while still providing Q of shape `[Bq, Hq, Q_LEN, D]`. Here, MAX_LEN is the maximal number of tokens supported by paged attention.

This PR relaxes the requirement to `Bq == Bkv or (Bq > 1 and Bkv == 1)`, i.e. the KV batch dimension can be broadcast. This support covers flex decoding as well as the flex attention forward and backward passes.
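
A hedged sketch of the relaxed shape requirement (illustrative sizes; a GPU is assumed):

```python
# Hedged sketch: Q has batch Bq while K/V share a single KV batch (Bkv == 1),
# matching the paged-attention-style layout described above.
import torch
from torch.nn.attention.flex_attention import flex_attention

Bq, Hq, Q_LEN, D = 4, 16, 128, 64
Hkv, KV_LEN = 16, 256

q = torch.randn(Bq, Hq, Q_LEN, D, device="cuda", dtype=torch.bfloat16)
k = torch.randn(1, Hkv, KV_LEN, D, device="cuda", dtype=torch.bfloat16)
v = torch.randn(1, Hkv, KV_LEN, D, device="cuda", dtype=torch.bfloat16)

out = torch.compile(flex_attention)(q, k, v)   # out: [Bq, Hq, Q_LEN, D]
```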

## Benchmark
GPU: H100

We see a negligible (1-2%) performance change from this PR when `Bq == Bkv`.

```
python benchmarks/transformer/score_mod.py --calculate-bwd
```
### Perf before this PR

**FWD**

| Type    |   Speedup | score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)        |
|---------|-----------|---------------|------------|----------------|------------------------------|
| Average |     0.743 |               |            |                |                              |
| Max     |     0.955 | head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)   |
| Min     |     0.548 | relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128) |

**BWD**

| Type    |   Speedup | score_mod   | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)       |
|---------|-----------|-------------|------------|----------------|-----------------------------|
| Average |     0.834 |             |            |                |                             |
| Max     |     1.261 | head_bias   | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)   |
| Min     |     0.456 | None        | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128) |

<details>
<summary> Full performance sweep </summary>

| score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)         |   fwd_eager_time |   fwd_compiled_time |   bwd_eager_time |   bwd_compiled_time |   fwd_speedup |   bwd_speedup |
|---------------|------------|----------------|-------------------------------|------------------|---------------------|------------------|---------------------|---------------|---------------|
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.264 |              17.184 |          107.040 |             140.800 |         0.888 |         0.760 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.840 |              19.744 |          112.576 |             140.064 |         0.802 |         0.804 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.232 |              17.344 |           87.744 |             142.496 |         0.878 |         0.616 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.264 |              17.184 |          108.192 |             143.328 |         0.888 |         0.755 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.904 |              22.400 |          106.432 |             136.512 |         0.889 |         0.780 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.424 |              26.752 |           91.712 |             106.688 |         0.726 |         0.860 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.808 |              22.432 |           89.024 |             101.920 |         0.883 |         0.873 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.840 |              22.272 |           88.896 |             102.592 |         0.891 |         0.867 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.240 |              32.416 |          116.768 |             112.256 |         0.933 |         1.040 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           29.536 |              37.024 |          113.664 |             102.688 |         0.798 |         1.107 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.656 |              32.800 |          116.992 |             127.008 |         0.935 |         0.921 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.592 |              32.480 |          116.928 |             112.160 |         0.942 |         1.043 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.448 |              61.920 |          198.656 |             204.512 |         0.653 |         0.971 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           37.760 |              62.528 |          189.536 |             170.624 |         0.604 |         1.111 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.896 |              62.368 |          198.304 |             205.824 |         0.656 |         0.963 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.448 |              61.952 |          198.432 |             203.648 |         0.653 |         0.974 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          318.528 |             355.904 |          947.232 |            1162.496 |         0.895 |         0.815 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          199.776 |             252.128 |          677.792 |             813.184 |         0.792 |         0.834 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          316.512 |             363.328 |          947.712 |            1361.984 |         0.871 |         0.696 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          317.984 |             356.864 |          947.264 |            1165.024 |         0.891 |         0.813 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          446.656 |             734.656 |         1664.288 |            2172.960 |         0.608 |         0.766 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          278.688 |             467.648 |         1182.624 |            1339.296 |         0.596 |         0.883 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          447.872 |             744.096 |         1662.944 |            2196.544 |         0.602 |         0.757 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          448.128 |             732.928 |         1663.072 |            2156.800 |         0.611 |         0.771 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.648 |              16.640 |          107.520 |             143.008 |         0.940 |         0.752 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.776 |              18.240 |          129.056 |             141.920 |         0.865 |         0.909 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.168 |              16.640 |          103.616 |             139.648 |         0.912 |         0.742 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.616 |              16.640 |          128.608 |             164.448 |         0.938 |         0.782 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.776 |              21.952 |          125.344 |             170.304 |         0.901 |         0.736 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.776 |              23.712 |          104.288 |             196.896 |         0.834 |         0.530 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.072 |              21.952 |          102.080 |             177.056 |         0.869 |         0.577 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.648 |              21.920 |          109.920 |             170.848 |         0.896 |         0.643 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.464 |              31.936 |          127.808 |             228.832 |         0.954 |         0.559 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           29.472 |              33.856 |          113.152 |             215.072 |         0.871 |         0.526 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.496 |              32.160 |          116.576 |             231.744 |         0.948 |         0.503 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.464 |              31.904 |          116.320 |             229.824 |         0.955 |         0.506 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.480 |              61.440 |          176.448 |             345.312 |         0.659 |         0.511 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           38.304 |              59.424 |          169.312 |             371.360 |         0.645 |         0.456 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.960 |              61.760 |          176.512 |             358.912 |         0.663 |         0.492 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.352 |              61.696 |          176.512 |             344.928 |         0.654 |         0.512 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          316.224 |             357.728 |          905.728 |            1668.448 |         0.884 |         0.543 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          199.904 |             248.416 |          636.544 |            1109.088 |         0.805 |         0.574 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          314.880 |             363.616 |          906.304 |            1658.176 |         0.866 |         0.547 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          316.160 |             354.368 |          906.080 |            1649.024 |         0.892 |         0.549 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.912 |             739.840 |         1555.808 |            2521.952 |         0.604 |         0.617 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          279.776 |             463.904 |         1068.928 |            1849.888 |         0.603 |         0.578 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.080 |             748.960 |         1553.504 |            2629.888 |         0.596 |         0.591 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.208 |             740.608 |         1558.880 |            2524.960 |         0.602 |         0.617 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           33.568 |              41.280 |          170.016 |             147.584 |         0.813 |         1.152 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           30.688 |              43.040 |          159.552 |             146.720 |         0.713 |         1.087 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           34.112 |              41.504 |          170.112 |             152.672 |         0.822 |         1.114 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           34.240 |              41.152 |          170.272 |             134.976 |         0.832 |         1.261 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.672 |              76.416 |          295.296 |             263.648 |         0.637 |         1.120 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           45.088 |              72.576 |          281.920 |             237.664 |         0.621 |         1.186 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.032 |              76.672 |          295.520 |             265.248 |         0.626 |         1.114 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.096 |              76.096 |          295.456 |             262.112 |         0.632 |         1.127 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           93.920 |             111.232 |          401.568 |             382.944 |         0.844 |         1.049 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           68.192 |              95.232 |          338.752 |             326.816 |         0.716 |         1.037 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           93.984 |             111.840 |          401.856 |             444.224 |         0.840 |         0.905 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           94.176 |             110.496 |          401.600 |             383.136 |         0.852 |         1.048 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.488 |             227.040 |          727.424 |             739.712 |         0.579 |         0.983 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |           95.616 |             169.760 |          616.864 |             574.112 |         0.563 |         1.074 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.680 |             228.672 |          727.616 |             746.048 |         0.576 |         0.975 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.104 |             225.696 |          727.904 |             735.392 |         0.581 |         0.990 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1227.296 |            1386.656 |         3720.192 |            4539.904 |         0.885 |         0.819 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |          691.360 |             831.712 |         2515.872 |            3067.808 |         0.831 |         0.820 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1228.192 |            1403.136 |         3715.520 |            5309.280 |         0.875 |         0.700 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1229.024 |            1384.992 |         3715.904 |            4550.368 |         0.887 |         0.817 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1784.832 |            2865.888 |         6539.840 |            8460.224 |         0.623 |         0.773 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1017.408 |            1660.480 |         4369.824 |            5056.992 |         0.613 |         0.864 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1792.448 |            2904.864 |         6546.080 |            8537.024 |         0.617 |         0.767 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1795.552 |            2856.864 |         6544.672 |            8400.160 |         0.629 |         0.779 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              38.880 |          148.832 |             179.936 |         0.881 |         0.827 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           31.168 |              38.080 |          138.528 |             167.552 |         0.818 |         0.827 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              39.168 |          148.512 |             181.248 |         0.874 |         0.819 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              38.784 |          148.864 |             180.224 |         0.883 |         0.826 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.832 |              76.352 |          253.632 |             295.968 |         0.640 |         0.857 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           45.760 |              65.792 |          239.040 |             290.752 |         0.696 |         0.822 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.768 |              76.576 |          253.312 |             304.032 |         0.637 |         0.833 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.768 |              76.192 |          253.600 |             296.096 |         0.640 |         0.856 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.728 |             109.728 |          357.696 |             498.912 |         0.854 |         0.717 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           68.704 |              92.288 |          295.616 |             386.240 |         0.744 |         0.765 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.632 |             111.392 |          357.408 |             512.448 |         0.841 |         0.697 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.280 |             109.952 |          357.696 |             501.440 |         0.848 |         0.713 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.392 |             230.496 |          612.224 |             807.552 |         0.570 |         0.758 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |           96.512 |             165.184 |          502.624 |             672.384 |         0.584 |         0.748 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.360 |             232.608 |          612.064 |             832.320 |         0.565 |         0.735 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.008 |             230.528 |          612.640 |             804.320 |         0.568 |         0.762 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1227.968 |            1377.408 |         3477.920 |            5324.384 |         0.892 |         0.653 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |          695.264 |             824.544 |         2268.224 |            3210.208 |         0.843 |         0.707 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1228.640 |            1404.576 |         3476.832 |            5463.456 |         0.875 |         0.636 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1228.416 |            1378.752 |         3478.048 |            5367.712 |         0.891 |         0.648 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1788.736 |            2867.712 |         6039.520 |            8616.256 |         0.624 |         0.701 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1021.952 |            1653.824 |         3866.208 |            5306.848 |         0.618 |         0.729 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1786.752 |            2896.352 |         6044.128 |            8871.360 |         0.617 |         0.681 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1786.080 |            2868.672 |         6040.160 |            8550.144 |         0.623 |         0.706 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           57.504 |              71.552 |          312.768 |             255.040 |         0.804 |         1.226 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           49.472 |              71.104 |          285.696 |             243.520 |         0.696 |         1.173 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           58.112 |              72.896 |          312.768 |             288.256 |         0.797 |         1.085 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           57.952 |              71.680 |          312.768 |             255.552 |         0.808 |         1.224 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.336 |             144.256 |          580.128 |             500.160 |         0.571 |         1.160 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           76.160 |             123.712 |          552.544 |             447.648 |         0.616 |         1.234 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.400 |             145.184 |          580.032 |             504.032 |         0.568 |         1.151 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.368 |             143.904 |          580.192 |             499.936 |         0.572 |         1.161 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.216 |             209.568 |          787.872 |             747.712 |         0.846 |         1.054 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          121.984 |             168.256 |          651.968 |             628.256 |         0.725 |         1.038 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.088 |             211.488 |          788.320 |             864.352 |         0.837 |         0.912 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.440 |             208.576 |          787.424 |             749.120 |         0.851 |         1.051 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          249.472 |             441.376 |         1405.440 |            1431.648 |         0.565 |         0.982 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          172.960 |             312.064 |         1172.064 |            1096.448 |         0.554 |         1.069 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          249.632 |             446.336 |         1405.408 |            1448.480 |         0.559 |         0.970 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          250.944 |             440.128 |         1406.624 |            1421.952 |         0.570 |         0.989 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2418.720 |            2747.936 |         7330.432 |            9023.712 |         0.880 |         0.812 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         1353.696 |            1608.480 |         4941.696 |            6078.752 |         0.842 |         0.813 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2427.456 |            2746.816 |         7329.792 |           10539.968 |         0.884 |         0.695 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2426.688 |            2763.168 |         7336.256 |            9057.536 |         0.878 |         0.810 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3554.240 |            5634.400 |        12919.872 |           16843.489 |         0.631 |         0.767 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         2003.648 |            3250.784 |         8610.144 |           10015.424 |         0.616 |         0.860 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3582.080 |            5710.944 |        12923.328 |           17011.871 |         0.627 |         0.760 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3581.920 |            5618.144 |        12934.528 |           16745.888 |         0.638 |         0.772 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.120 |              71.232 |          269.760 |             295.680 |         0.802 |         0.912 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           49.408 |              65.312 |          242.304 |             253.952 |         0.756 |         0.954 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.504 |              72.544 |          269.632 |             298.976 |         0.793 |         0.902 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.760 |              71.040 |          269.600 |             296.640 |         0.813 |         0.909 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           82.336 |             147.168 |          466.080 |             487.456 |         0.559 |         0.956 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           76.704 |             115.040 |          435.392 |             453.248 |         0.667 |         0.961 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           81.856 |             147.424 |          465.920 |             499.552 |         0.555 |         0.933 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           81.760 |             146.656 |          466.176 |             485.984 |         0.557 |         0.959 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          176.608 |             206.976 |          678.080 |             866.976 |         0.853 |         0.782 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          121.664 |             164.768 |          538.240 |             636.160 |         0.738 |         0.846 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          176.608 |             209.664 |          677.696 |             883.424 |         0.842 |         0.767 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          177.440 |             207.840 |          677.248 |             868.288 |         0.854 |         0.780 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          250.272 |             449.536 |         1163.424 |            1420.832 |         0.557 |         0.819 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          173.472 |             305.376 |          929.408 |            1104.544 |         0.568 |         0.841 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          249.376 |             454.976 |         1163.648 |            1455.296 |         0.548 |         0.800 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          250.368 |             450.144 |         1163.520 |            1409.984 |         0.556 |         0.825 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2416.576 |            2726.208 |         6835.520 |           10442.784 |         0.886 |         0.655 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         1357.440 |            1590.752 |         4433.664 |            5975.296 |         0.853 |         0.742 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2427.360 |            2747.040 |         6853.056 |           10670.784 |         0.884 |         0.642 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2441.120 |            2718.944 |         6836.640 |           10433.792 |         0.898 |         0.655 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3555.392 |            5620.960 |        11944.000 |           16504.801 |         0.633 |         0.724 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         2010.848 |            3241.152 |         7636.064 |            9870.464 |         0.620 |         0.774 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3557.440 |            5688.352 |        11935.744 |           17090.496 |         0.625 |         0.698 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3562.720 |            5630.432 |        11939.168 |           16392.033 |         0.633 |         0.728 |

</details>

### Perf after this PR

**FWD**

| Type    |   Speedup | score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)      |
|---------|-----------|---------------|------------|----------------|----------------------------|
| Average |     0.776 |               |            |                |                            |
| Max     |     1.006 | None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64) |
| Min     |     0.566 | relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128) |

**BWD**

| Type    |   Speedup | score_mod   | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)       |
|---------|-----------|-------------|------------|----------------|-----------------------------|
| Average |     0.817 |             |            |                |                             |
| Max     |     1.150 | None        | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128) |
| Min     |     0.454 | None        | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128) |

<details>
<summary> Full performance sweep </summary>

| score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)         |   fwd_eager_time |   fwd_compiled_time |   bwd_eager_time |   bwd_compiled_time |   fwd_speedup |   bwd_speedup |
|---------------|------------|----------------|-------------------------------|------------------|---------------------|------------------|---------------------|---------------|---------------|
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.680 |              17.056 |           64.544 |              73.376 |         0.919 |         0.880 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.712 |              19.872 |           65.408 |              72.864 |         0.791 |         0.898 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           16.160 |              17.280 |           64.896 |              73.888 |         0.935 |         0.878 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           16.192 |              17.120 |           64.896 |              75.424 |         0.946 |         0.860 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.648 |              22.496 |           89.184 |              82.592 |         0.873 |         1.080 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           20.320 |              26.816 |           91.264 |              82.880 |         0.758 |         1.101 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           20.096 |              22.528 |           89.184 |              83.776 |         0.892 |         1.065 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.680 |              22.432 |           89.184 |             120.096 |         0.877 |         0.743 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.384 |              32.512 |          119.232 |             128.960 |         0.996 |         0.925 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.176 |              37.248 |          113.664 |             119.520 |         0.810 |         0.951 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.512 |              32.928 |          119.264 |             131.456 |         0.987 |         0.907 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.448 |              32.704 |          119.200 |             128.352 |         0.992 |         0.929 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           41.952 |              62.176 |          199.040 |             214.304 |         0.675 |         0.929 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           39.744 |              62.880 |          189.504 |             179.968 |         0.632 |         1.053 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           41.472 |              62.784 |          199.136 |             217.664 |         0.661 |         0.915 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           42.048 |              61.952 |          199.168 |             214.496 |         0.679 |         0.929 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          341.184 |             357.632 |          980.256 |            1328.896 |         0.954 |         0.738 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          212.576 |             252.960 |          673.888 |             824.864 |         0.840 |         0.817 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          340.000 |             363.296 |          980.768 |            1375.808 |         0.936 |         0.713 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          340.768 |             356.832 |          980.960 |            1326.272 |         0.955 |         0.740 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          459.392 |             737.120 |         1678.240 |            2205.248 |         0.623 |         0.761 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          292.672 |             468.096 |         1178.016 |            1371.584 |         0.625 |         0.859 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          462.144 |             745.312 |         1680.000 |            2252.512 |         0.620 |         0.746 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          462.112 |             736.576 |         1679.008 |            2216.480 |         0.627 |         0.758 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.064 |              16.704 |          105.120 |             120.768 |         0.962 |         0.870 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.552 |              18.144 |          107.136 |             121.696 |         0.857 |         0.880 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.096 |              16.768 |          102.688 |             120.864 |         0.960 |         0.850 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.032 |              16.576 |          104.736 |             124.672 |         0.967 |         0.840 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.392 |              21.952 |          104.736 |             174.656 |         0.883 |         0.600 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           20.128 |              23.712 |          105.216 |             199.008 |         0.849 |         0.529 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.904 |              21.888 |          103.744 |             179.520 |         0.909 |         0.578 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.968 |              21.952 |          104.640 |             177.312 |         0.910 |         0.590 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.096 |              31.904 |          118.720 |             231.968 |         1.006 |         0.512 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.528 |              33.952 |          112.480 |             218.304 |         0.899 |         0.515 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.160 |              32.224 |          118.752 |             237.312 |         0.998 |         0.500 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.128 |              32.032 |          118.240 |             233.120 |         1.003 |         0.507 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.312 |              61.280 |          177.408 |             350.688 |         0.674 |         0.506 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           39.552 |              59.360 |          168.832 |             371.488 |         0.666 |         0.454 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.984 |              61.696 |          177.376 |             360.416 |         0.680 |         0.492 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.312 |              61.760 |          177.184 |             355.744 |         0.669 |         0.498 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          339.744 |             357.888 |          939.712 |            1665.376 |         0.949 |         0.564 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          212.608 |             248.832 |          633.280 |            1122.848 |         0.854 |         0.564 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          339.712 |             363.232 |          940.448 |            1689.440 |         0.935 |         0.557 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          341.056 |             355.264 |          940.128 |            1641.152 |         0.960 |         0.573 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.736 |             741.024 |         1569.824 |            2559.552 |         0.622 |         0.613 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          293.856 |             464.192 |         1066.240 |            1840.416 |         0.633 |         0.579 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.704 |             753.152 |         1570.112 |            2641.088 |         0.612 |         0.594 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.832 |             745.536 |         1570.144 |            2602.560 |         0.618 |         0.603 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.680 |              41.280 |          171.840 |             158.176 |         0.864 |         1.086 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           31.360 |              42.976 |          158.912 |             139.264 |         0.730 |         1.141 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.168 |              41.600 |          171.648 |             161.344 |         0.845 |         1.064 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.136 |              41.152 |          171.808 |             158.336 |         0.854 |         1.085 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.832 |              76.384 |          295.680 |             277.696 |         0.639 |         1.065 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           45.632 |              72.512 |          281.760 |             250.752 |         0.629 |         1.124 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           49.504 |              76.608 |          295.584 |             279.712 |         0.646 |         1.057 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.864 |              75.904 |          295.456 |             277.568 |         0.644 |         1.064 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           99.392 |             111.232 |          408.640 |             442.656 |         0.894 |         0.923 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           71.392 |              95.168 |          338.784 |             341.760 |         0.750 |         0.991 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           99.808 |             112.256 |          408.608 |             456.160 |         0.889 |         0.896 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |          100.032 |             110.816 |          408.512 |             444.192 |         0.903 |         0.920 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.040 |             226.112 |          726.880 |             774.176 |         0.597 |         0.939 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |           99.904 |             169.696 |          616.448 |             607.104 |         0.589 |         1.015 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.488 |             228.384 |          727.776 |             782.368 |         0.593 |         0.930 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.744 |             225.664 |          728.000 |             773.600 |         0.602 |         0.941 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1324.192 |            1387.808 |         3866.944 |            5217.184 |         0.954 |         0.741 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |          738.464 |             832.608 |         2507.392 |            3146.688 |         0.887 |         0.797 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1326.016 |            1404.256 |         3867.872 |            5382.624 |         0.944 |         0.719 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1326.144 |            1386.688 |         3867.552 |            5203.264 |         0.956 |         0.743 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1847.488 |            2866.336 |         6612.704 |            8597.696 |         0.645 |         0.769 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1066.592 |            1660.640 |         4357.696 |            5174.016 |         0.642 |         0.842 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1850.464 |            2905.408 |         6616.928 |            8793.280 |         0.637 |         0.752 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1848.896 |            2834.720 |         6623.872 |            8637.920 |         0.652 |         0.767 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.384 |              38.656 |          150.336 |             182.624 |         0.941 |         0.823 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           31.360 |              38.112 |          137.664 |             171.840 |         0.823 |         0.801 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.608 |              39.040 |          150.528 |             183.872 |         0.938 |         0.819 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.064 |              38.656 |          150.560 |             183.520 |         0.933 |         0.820 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.344 |              76.352 |          253.920 |             301.440 |         0.646 |         0.842 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           46.720 |              65.824 |          239.424 |             296.384 |         0.710 |         0.808 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.248 |              76.416 |          253.728 |             307.808 |         0.644 |         0.824 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.376 |              76.288 |          253.728 |             304.736 |         0.647 |         0.833 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.264 |             110.144 |          364.960 |             503.072 |         0.901 |         0.725 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           71.136 |              92.384 |          294.432 |             393.056 |         0.770 |         0.749 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.200 |             111.360 |          365.152 |             512.640 |         0.891 |         0.712 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.264 |             110.240 |          365.088 |             504.224 |         0.900 |         0.724 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.680 |             230.336 |          613.472 |             816.896 |         0.589 |         0.751 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          100.256 |             165.088 |          502.144 |             676.480 |         0.607 |         0.742 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.008 |             232.480 |          613.184 |             836.672 |         0.581 |         0.733 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.232 |             230.624 |          613.536 |             827.136 |         0.586 |         0.742 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1324.064 |            1378.688 |         3631.808 |            5308.384 |         0.960 |         0.684 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |          731.776 |             826.688 |         2263.168 |            3241.344 |         0.885 |         0.698 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1316.128 |            1403.200 |         3625.088 |            5550.688 |         0.938 |         0.653 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1311.904 |            1378.880 |         3616.320 |            5353.696 |         0.951 |         0.675 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1837.856 |            2887.392 |         6121.632 |            8586.656 |         0.637 |         0.713 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1066.976 |            1654.368 |         3843.136 |            5291.040 |         0.645 |         0.726 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1854.208 |            2896.832 |         6130.112 |            8745.984 |         0.640 |         0.701 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1860.512 |            2889.344 |         6135.648 |            8750.592 |         0.644 |         0.701 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           60.640 |              71.552 |          315.968 |             296.512 |         0.847 |         1.066 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           50.784 |              71.040 |          284.288 |             258.880 |         0.715 |         1.098 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           61.312 |              72.704 |          315.680 |             302.016 |         0.843 |         1.045 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           60.800 |              71.776 |          316.320 |             297.152 |         0.847 |         1.065 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.576 |             144.416 |          580.576 |             535.936 |         0.586 |         1.083 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           76.064 |             123.648 |          553.344 |             481.376 |         0.615 |         1.150 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.160 |             145.248 |          581.024 |             540.000 |         0.579 |         1.076 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.512 |             143.552 |          581.088 |             535.776 |         0.589 |         1.085 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.152 |             209.408 |          798.400 |             868.704 |         0.903 |         0.919 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          127.552 |             168.800 |          650.816 |             663.328 |         0.756 |         0.981 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.376 |             211.360 |          798.080 |             895.552 |         0.896 |         0.891 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.440 |             208.576 |          797.888 |             873.152 |         0.908 |         0.914 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          257.536 |             441.760 |         1408.960 |            1514.720 |         0.583 |         0.930 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          179.328 |             312.096 |         1170.368 |            1177.472 |         0.575 |         0.994 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          259.264 |             446.944 |         1408.768 |            1530.400 |         0.580 |         0.921 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          258.080 |             440.480 |         1408.864 |            1514.144 |         0.586 |         0.930 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2595.808 |            2771.456 |         7616.704 |           10405.248 |         0.937 |         0.732 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         1435.744 |            1610.336 |         4927.520 |            6220.000 |         0.892 |         0.792 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2595.264 |            2745.056 |         7611.232 |           10631.392 |         0.945 |         0.716 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2576.256 |            2735.456 |         7626.400 |           10346.976 |         0.942 |         0.737 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3679.744 |            5634.816 |        13077.056 |           17182.528 |         0.653 |         0.761 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         2099.360 |            3250.176 |         8589.664 |           10236.672 |         0.646 |         0.839 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3676.800 |            5716.288 |        13073.088 |           17311.071 |         0.643 |         0.755 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3679.136 |            5570.496 |        13070.720 |           17192.863 |         0.660 |         0.760 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.600 |              71.008 |          272.320 |             300.000 |         0.868 |         0.908 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           50.176 |              65.344 |          241.568 |             258.912 |         0.768 |         0.933 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.120 |              72.512 |          272.672 |             305.408 |         0.843 |         0.893 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.248 |              71.136 |          272.640 |             301.120 |         0.861 |         0.905 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.872 |             146.784 |          466.912 |             496.832 |         0.571 |         0.940 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           76.704 |             115.072 |          435.584 |             462.112 |         0.667 |         0.943 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.392 |             147.392 |          466.656 |             504.448 |         0.566 |         0.925 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.360 |             146.688 |          466.656 |             499.040 |         0.568 |         0.935 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          189.024 |             207.584 |          684.768 |             873.568 |         0.911 |         0.784 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          126.944 |             164.288 |          536.192 |             645.984 |         0.773 |         0.830 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          188.768 |             209.760 |          684.096 |             897.504 |         0.900 |         0.762 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          189.408 |             207.776 |          685.024 |             876.384 |         0.912 |         0.782 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          259.168 |             449.536 |         1167.936 |            1433.280 |         0.577 |         0.815 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          180.000 |             305.312 |          928.000 |            1113.920 |         0.590 |         0.833 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          258.464 |             455.136 |         1167.808 |            1462.848 |         0.568 |         0.798 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          257.824 |             450.208 |         1167.744 |            1448.000 |         0.573 |         0.806 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2598.368 |            2729.120 |         7134.400 |           10381.632 |         0.952 |         0.687 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         1435.456 |            1591.040 |         4424.768 |            6035.808 |         0.902 |         0.733 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2594.752 |            2725.952 |         7128.384 |           10822.496 |         0.952 |         0.659 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2597.888 |            2716.960 |         7101.568 |           10385.440 |         0.956 |         0.684 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3647.648 |            5581.632 |        12089.952 |           16667.233 |         0.654 |         0.725 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         2093.952 |            3241.440 |         7579.392 |            9847.936 |         0.646 |         0.770 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3650.528 |            5650.688 |        12105.568 |           16963.680 |         0.646 |         0.714 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3680.064 |            5585.312 |        12117.504 |           16935.040 |         0.659 |         0.716 |

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135505
Approved by: https://github.com/Chillee
2024-09-10 09:30:02 +00:00
23b1486185 [MPS] Allow nan mean reduction in nll_loss (#135434)
This PR allows results from `nll_loss` to be `nan`, which is the same behavior as with CUDA and CPU https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162.

Fixes #134431

Ref #64572 #119108
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135434
Approved by: https://github.com/malfet
2024-09-10 08:37:59 +00:00
9902b349cb [Inductor] Make static_input_idxs a set for faster lookup (#135314)
`static_input_idxs` is only used for lookups. With large models, this is a large list. This takes over a millisecond in some cases.
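A minimal sketch of the data-structure change being described (the index values and helper below are made up for illustration):

```python
# Membership checks against a Python list are O(n) per lookup; a set gives
# O(1) average-case lookups, which matters when the check runs per input.
static_input_idxs = list(range(0, 100_000, 2))   # hypothetical large list of indices
static_input_idxs_set = set(static_input_idxs)   # one-time conversion

def is_static(idx: int) -> bool:
    return idx in static_input_idxs_set          # constant-time lookup instead of a scan
```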

Profile before change:
<img width="824" alt="image" src="https://github.com/user-attachments/assets/002a0775-fd2f-4d27-8cf2-812b502d7d5e">

Profile after change: gaps are smaller, 1ms speedup before launching the cuda graph
<img width="794" alt="image" src="https://github.com/user-attachments/assets/12a0a0b9-2cc1-4d53-ac87-9bd5140a46f5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135314
Approved by: https://github.com/oulgen
2024-09-10 07:27:55 +00:00
5a9ac83e94 Fix doc (#135551)
Differential Revision: [D62412667](https://our.internmc.facebook.com/intern/diff/D62412667/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135551
Approved by: https://github.com/yushangdi
ghstack dependencies: #135549
2024-09-10 07:18:44 +00:00
1adf28a5c0 [inductor] print triton float64 constants correctly (#135260)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135260
Approved by: https://github.com/jansel
2024-09-10 07:05:02 +00:00
c18052da0e Add some minor doc improvement and ban using training IR for unflattener (#135549)
Title

Differential Revision: [D62412490](https://our.internmc.facebook.com/intern/diff/D62412490/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135549
Approved by: https://github.com/yushangdi
2024-09-10 06:48:42 +00:00
c0d2f991b1 Increase TRITON_MAX_BLOCK['X'] (#135181)
Fixes #135028

As title, increase `TRITON_MAX_BLOCK['X']` to 4096 and fix an error, thanks to @Chillee: https://github.com/pytorch/pytorch/pull/133300/files#r1744706189

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135181
Approved by: https://github.com/jansel
2024-09-10 05:54:37 +00:00
e889252493 Implementation of scan (#134102)
This operation is supposed to be the counterpart to `associative_scan`, but it can also operate with non-associative functions.
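For reference, a pure-Python sketch of what such a scan computes; this is only a conceptual reference implementation, not the actual higher-order-op signature:

```python
import torch

def reference_scan(combine_fn, init, xs):
    # Carry an accumulator through the sequence and collect the per-step outputs.
    carry, ys = init, []
    for x in xs:
        carry, y = combine_fn(carry, x)
        ys.append(y)
    return carry, torch.stack(ys)

# A non-associative combine function (a running subtraction) still works here,
# which is what distinguishes scan from associative_scan.
xs = torch.arange(5, dtype=torch.float32)
final_carry, outputs = reference_scan(lambda c, x: (c - x, c - x), torch.tensor(0.0), xs)
print(final_carry, outputs)  # tensor(-10.) tensor([  0.,  -1.,  -3.,  -6., -10.])
```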

@ydwu4

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134102
Approved by: https://github.com/ydwu4
2024-09-10 04:51:16 +00:00
6546c6186d do not raise when flatten_fn_with_keys not found when suggesting fixes (#135518)
Test Plan: added test

Differential Revision: D62395371

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135518
Approved by: https://github.com/zhxchen17
2024-09-10 03:47:36 +00:00
1d9fefff19 [DCP] Fixes the stateless optimizer issue of distributed state_dict (#135535)
Some optimizers don't have states, which can cause get_state_dict/set_state_dict to behave incorrectly. This PR fixes the issue.

fixes: https://github.com/pytorch/pytorch/issues/133415

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135535
Approved by: https://github.com/wz337
2024-09-10 03:10:00 +00:00
7ec17b49cf Fix dynamo benchmark skip logic for cpu device (#135193)
Fixes #132380. Adjust the torchbench and huggingface skip-model lists so that `--no-skip` can be removed when running benchmarks on the 3 suites.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135193
Approved by: https://github.com/chuanqi129, https://github.com/jansel
2024-09-10 03:02:19 +00:00
146921007a [inductor] [cpp] fix the input contiguous check in max-autotune (#134982)
## Description
Fixes the FP32 accuracy failure of `resmlp_12_224` and BF16 accuracy failure of `volo_d1_224` in timm.

In this PR, we check whether the input is contiguous in the following way:
If it has a `FixedLayout`, we know the exact strides. For a `FlexibleLayout`, if its data is a `ComputedBuffer`, we can get the fill order of the buffer to decide whether it's contiguous. For all other cases, we don't use the GEMM template since we can't infer whether the input is contiguous.
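A hypothetical sketch of the stride condition in question (the helper name is made up):

```python
# The CPP GEMM template only supports inputs whose innermost stride is 1,
# i.e. tensors that are contiguous in the last dimension.
def gemm_template_supported(stride: tuple) -> bool:
    return len(stride) > 0 and stride[-1] == 1

print(gemm_template_supported((196, 1)))   # True: row-major layout of a (3072, 196) tensor
print(gemm_template_supported((1, 3072)))  # False: the column-major FixedLayout described below
```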

## Additional context
The current GEMM template only supports this case: `input.get_stride()[-1] == 1`. In `resmlp_12_224`, when we run into this check, the layout of `input` is a `FlexibleLayout`. The reason is that when realizing the input which is a `View` IR, the `convert_to_reinterpret_view` call fails:
d14fe3ffed/torch/_inductor/ir.py (L4712-L4715)

And it finally runs into this `copy_input` and returns a `FlexibleLayout`.
d14fe3ffed/torch/_inductor/ir.py (L4722)

When checking its stride, this `FlexibleLayout` indeed satisfies `input.get_stride()[-1] == 1`, but it is later decided to be a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`, which is not supported by the GEMM template, thus causing the accuracy issue in this model.
The `FlexibleLayout` is converted to `FixedLayout` during [CppPackedGemmTemplate.add_choices](d14fe3ffed/torch/_inductor/mkldnn_lowerings.py (L1051)) which calls [slice_nd](d14fe3ffed/torch/_inductor/codegen/cpp_template_kernel.py (L150)) when rendering the kernel (`slice_nd(X)`). When creating the `SliceView` IR, [as_storage_and_layout](d14fe3ffed/torch/_inductor/ir.py (L2288)) invokes
[decide_layout](d14fe3ffed/torch/_inductor/ir.py (L2135)) and converts it to a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134982
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-10 02:47:38 +00:00
a71e5509bc [inductor]Add profiler to operatorbench (#135515)
Add profiling to operatorbench. The new argument `--profile` is added and the profiling trace is like the following figure.
<img width="954" alt="image" src="https://github.com/user-attachments/assets/5b00d6e3-4905-4a77-a5e9-9f62620a5fd5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135515
Approved by: https://github.com/shunting314
2024-09-10 02:33:30 +00:00
136e28f616 Enable forward AD in functional.affine_grid (#135494)
Fixes #121411
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135494
Approved by: https://github.com/zou3519, https://github.com/soulitzer
2024-09-10 00:07:07 +00:00
39a61795e3 remove amax_ptr from scaled_gemm (#135421)
amax was removed from _scaled_mm by #128683. Remove it from the internal at::cuda::blas::scaled_gemm, as well.  This allows hipBLASLt to find additional solutions rather than forcing amax to be used and then discarding the result.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135421
Approved by: https://github.com/drisspg, https://github.com/eqy
2024-09-09 23:04:36 +00:00
b4feec9782 [xplat][XNNPACK] don't prefer static linkage in xplat for main target (#135529)
Building XNNPACK as a static library has some issues because of multiple global params floating around.

Let's try to get rid of it in xplat and see how it fares.

Differential Revision: [D60776152](https://our.internmc.facebook.com/intern/diff/D60776152/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D60776152/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135529
Approved by: https://github.com/kimishpatel, https://github.com/mcr229, https://github.com/kirklandsign
2024-09-09 22:47:01 +00:00
d81731615f [Dynamo] Adding CallFunctionNoArgsSource and (#135425)
CallFunctionNoArgsGuardAccessor to support torch.cuda.current_device()

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135425
Approved by: https://github.com/anijain2305
2024-09-09 22:46:00 +00:00
e2f9a83b85 [ONNX] Drop final None values as inputs for nodes in exporter graph (#135520)
When the value for an optional input is not provided, it defaults to `None`, which gets translated to "" in the ONNX graph. To avoid this, if we have a list of inputs and the final few are all `None`, we strip them from the graph.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135520
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-09 22:28:41 +00:00
70a65a8bd5 Revert "NJT <-> padded dense conversions (#125947)"
This reverts commit 09a5e88bef04d5485b70d8f65f46a675aaa52942.

Reverted https://github.com/pytorch/pytorch/pull/125947 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing dynamo test 09a5e88bef, maybe a landrace ([comment](https://github.com/pytorch/pytorch/pull/125947#issuecomment-2339228570))
2024-09-09 22:01:09 +00:00
689d278543 Revert "Add __init__.py to shape inference folder. (#135461)"
This reverts commit dced0d6d9f05f0962f74a3c6227f774111c15715.

Reverted https://github.com/pytorch/pytorch/pull/135461 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it exposes some public function without appropriate doc. I will reopen the issue with hi-prio so that it can be fixed properly ([comment](https://github.com/pytorch/pytorch/pull/135461#issuecomment-2339218382))
2024-09-09 21:55:13 +00:00
9b764491e3 Use upload-artifact@v4.4.0 for create_release.yml (#135528)
Fixes failure: https://github.com/pytorch/pytorch/actions/runs/10780281005/job/29895846007

Due to a broken sync between
```
actions/upload-artifact@v2
and
actions/download-artifact@v4.1.7
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135528
Approved by: https://github.com/kit1980, https://github.com/malfet
2024-09-09 20:48:52 +00:00
cbc6b30a24 Fix broken E2E tests on Linux machines (#135394)
Summary:
I'm not entirely sure why this is failing with an `ImportError` (according to lastnameye, a superclass of `ModuleNotFoundError`), but in our E2E tests on Linux machines (but not Macs?), we're seeing the import failure not getting caught --
`ImportError: cannot import name 'parutil' from 'libfb.py' (/data/sandcastle/boxes/eden-trunk-hg-full-fbsource/buck-out/v2/gen/fbsource/d0c916ec8d40ce11/arvr/libraries/ctrl/studies/replay/__ctrl-r__/ctrl-r#link-tree/libfb/py/__init__.py)` from this test run https://www.internalfb.com/sandcastle/workflow/2522015791331601269, an instance of this job:  https://www.internalfb.com/intern/test/844425085172858?ref_report_id=0 is the overall job

Test Plan:
`arc skycastle schedule tools/skycastle/workflows2/ctrl/js_tests.sky:test_js_e2e_replay_tests --sandcastle-spec-overrides '{"type": "fbcode", "unicastle_size": "I1_MEDIUM"}'`
->
https://www.internalfb.com/sandcastle/workflow/256705178764255769

Differential Revision: D62321167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135394
Approved by: https://github.com/laithsakka
2024-09-09 20:18:08 +00:00
5b368de7f7 Revert "[ONNX] Update fake mode usage in onnx docs (#135512)"
This reverts commit a13c118994b4f118388d97a35abcb91a396cd437.

Reverted https://github.com/pytorch/pytorch/pull/135512 on behalf of https://github.com/davidberard98 due to failing test  https://github.com/pytorch/pytorch/actions/runs/10778813316/job/29891679127 ([comment](https://github.com/pytorch/pytorch/pull/135512#issuecomment-2338999090))
2024-09-09 20:15:12 +00:00
09a5e88bef NJT <-> padded dense conversions (#125947)
This PR:
* Implements the pre-existing `nt.to_padded_tensor(padding_val)` ATen op via the FBGEMM kernel + appropriate view gymnastics (since that kernel only handles 2D values); usage is sketched after this list
* Introduces a new `_nested_from_padded_tensor` op for the reverse conversion, implemented via the reverse FBGEMM kernel + view gymnastics
    * Note: there is currently no public API for this; design booted to a future PR
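A hedged usage sketch of the forward conversion (assuming the jagged-layout nested tensor factory; the shapes are made up). The reverse direction is omitted since, per the note above, it has no public API yet:

```python
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
)
padded = nt.to_padded_tensor(0.0)  # dense (2, 3, 4) tensor; the shorter row is zero-padded
print(padded.shape)
```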

TODO:
* ~~Propagate min / max sequence length via the new factory function `_nested_from_padded_tensor`~~
* ~~Verify that Inductor does computation fusion via test logic~~

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125947
Approved by: https://github.com/soulitzer
2024-09-09 19:37:32 +00:00
a4e6a0b240 [split build] move periodic split builds into own concurrency group (#135510)
To avoid nightly workflows cancelling each other
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135510
Approved by: https://github.com/clee2000, https://github.com/huydhn, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-09 19:35:57 +00:00
4ab232d0c4 Fix symbolic number's type and tensor's dtype mismatch bug in Tensor ctor (#135433)
Fixes #135432

In the current implementation, if we try to store a symbolic number in Tensor's constructor, it assumes that the tensor's dtype and the symbolic number's type are matched, which is not the case.

In other words, if we try to store a `SymInt`, current implementation assumes tensor's dtype is `torch.int32`, `torch.int64` or something. And if we try to store a `SymFloat`, it assumes tensor's dtype is `torch.float32` or `torch.float64`. However, the tensor's dtype could also be `torch.float32` or something else when we try to store `SymInt`, which would be wrong.

This PR stores symbolic numbers according to the tensor's scalar type by wrapping the guarded number of a `SymInt` or `SymFloat` into a PyObject.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135433
Approved by: https://github.com/ezyang
2024-09-09 19:32:18 +00:00
2032f107d7 Don't try to tag s390x docker images (#135509)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135509
Approved by: https://github.com/atalman
2024-09-09 19:07:48 +00:00
5f7d956362 Fix bugs blocking flipping the default layout constraint for custom ops (#135391)
Fixes two things:
- For regular PyTorch ops, the default layout constraint tag is always
flexible_layout. This was a bug with #135238
- Mark the new quantized _wrapped_linear_prepack ops as flexible_layout.
  The metas for these are incorrect, I didn't want to fix them (and
  changing the default requires the metas actually be correct).

Test Plan:
- The next PR up in the stack. The PRs are split because the next one is
  riskier.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135391
Approved by: https://github.com/albanD
2024-09-09 18:24:21 +00:00
a13c118994 [ONNX] Update fake mode usage in onnx docs (#135512)
Update fake mode usage in onnx docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135512
Approved by: https://github.com/justinchuby
2024-09-09 18:10:37 +00:00
21241bfeee [CP] Extend CP to support load-balancing shards (#132442)
This PR extends the current ring attention to support load-balancing shards -- the context/sequence is divided into `2 * world_size` shards and each rank gets shards `rank` and `(world_size * 2 - rank - 1)`. The data re-shuffling is done in the `context_parallel` API.
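An illustrative sketch of the shard assignment (not the library code):

```python
# With 2 * world_size shards, pairing shard `rank` with shard `2 * world_size - rank - 1`
# balances the causal-attention workload across ranks.
def shard_ids(rank: int, world_size: int) -> tuple:
    return (rank, 2 * world_size - rank - 1)

world_size = 4
for rank in range(world_size):
    print(rank, shard_ids(rank, world_size))
# 0 (0, 7), 1 (1, 6), 2 (2, 5), 3 (3, 4)
```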

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132442
Approved by: https://github.com/wconstab
2024-09-09 18:04:38 +00:00
73a6fc6e30 Revert "[Inductor] Make static_input_idxs a set for faster lookup (#135314)"
This reverts commit 011cae9570fb3c44b7f6f0c8004c470579ed21da.

Reverted https://github.com/pytorch/pytorch/pull/135314 on behalf of https://github.com/ZainRizvi due to Lint is failing on this file in trunk. See [GH job link](https://github.com/pytorch/pytorch/actions/runs/10777258770/job/29885960050) [HUD commit link](011cae9570) ([comment](https://github.com/pytorch/pytorch/pull/135314#issuecomment-2338678219))
2024-09-09 17:33:01 +00:00
09287e3af4 [MPS] Add regression test for fft.fftfreq (#135440)
The issue reported in #135223 was already solved in #128393. This PR adds a regression test for it.

Fixes #135223

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135440
Approved by: https://github.com/ezyang
2024-09-09 17:12:36 +00:00
16c3b8f87c [AOTI] Fix assert_function call in cpu autotune template (#135086)
Summary: In the ABI-compatible mode, assert_function should be AOTI_TORCH_CHECK.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135086
Approved by: https://github.com/chenyang78, https://github.com/angelayi
ghstack dependencies: #134857
2024-09-09 16:54:12 +00:00
9c6dff4941 [AOTI] Add C shim for aten.mkldnn_rnn_layer in cpp wrapper (#134857)
Summary: Support aten.mkldnn_rnn_layer in the ABI-compatible mode. Because aten.mkldnn_rnn_layer is an aten op, it is easier to add a C shim function for it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134857
Approved by: https://github.com/angelayi
2024-09-09 16:54:12 +00:00
0eb425a563 [Release] Apply Release changes scripts after release 2.4 (#135495)
Based on additional changes required for https://github.com/pytorch/pytorch/pull/128347
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135495
Approved by: https://github.com/kit1980
2024-09-09 16:49:04 +00:00
011cae9570 [Inductor] Make static_input_idxs a set for faster lookup (#135314)
`static_input_idxs` is only used for lookups. With large models, this is a large list. This takes over a millisecond in some cases.

Profile before change:
<img width="824" alt="image" src="https://github.com/user-attachments/assets/002a0775-fd2f-4d27-8cf2-812b502d7d5e">

Profile after change: gaps are smaller, 1ms speedup before launching the cuda graph
<img width="794" alt="image" src="https://github.com/user-attachments/assets/12a0a0b9-2cc1-4d53-ac87-9bd5140a46f5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135314
Approved by: https://github.com/oulgen
2024-09-09 16:24:58 +00:00
dfb2b661f7 Use float data type for Half var_sum in batchnorm stats updating on CPU (#126525)
Using float data type for Half `var_sum` in batchnorm stats updating on CPU to avoid `var_sum` overflow since the representation range of Half is small.
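An illustration of the overflow being avoided (this is not the kernel code; the shapes and values are made up):

```python
import torch

# Half's largest finite value is ~65504, so a large sum of squares overflows
# when the result is kept in Half, but fits easily in float.
x = torch.full((100_000,), 4.0, dtype=torch.half)
print(x.pow(2).sum())           # inf: the Half result overflows
print(x.float().pow(2).sum())   # tensor(1600000.): accumulate var_sum in float instead
```
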
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126525
Approved by: https://github.com/jgong5, https://github.com/peterbell10
2024-09-09 15:31:38 +00:00
5a69e0ebbe [MPS] Update decorator comments with issue ref (#135448)
Updating the comments with references to better places for context now that the bugs have been identified.

xref #135442 #135447 #134184

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135448
Approved by: https://github.com/ezyang
2024-09-09 15:18:52 +00:00
5e145861f2 [ONNX] Improves documentation of ONNX exporter (#135372)
The PR updates the documentation to reflect the changes introduced in pytorch 2.5 and related to onnx exporter.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135372
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-09 15:09:01 +00:00
c35b953531 Fix wrong error msg (#135423)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135423
Approved by: https://github.com/ezyang
2024-09-09 13:28:31 +00:00
dced0d6d9f Add __init__.py to shape inference folder. (#135461)
Fixes #135196

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135461
Approved by: https://github.com/ezyang
2024-09-09 13:27:58 +00:00
c0436c5701 [inductor][cpp][gemm] fix perf regression xcit_large_24_p8_224 (#134686) (#135438)
Fix #134686.

PR https://github.com/pytorch/pytorch/pull/132729 makes GEMM template faster for one of the GEMMs in xcit_large_24_p8_224:
```
SingleProcess AUTOTUNE benchmarking takes 1.7088 seconds and 1.9207 seconds precompiling
AUTOTUNE linear_unary(12544x3072, 768x3072, 768)
  cpp_packed_gemm_2 2.9371 ms 100.0%
  _linear_pointwise 3.1584 ms 93.0%
```

But it is slower than ATen in the e2e run due to different cache behavior. Access to the input data (12544x3072) is LLC-latency bound, and bottlenecks are seen due to memory synchronization (data transfers and coherence updates across processors). This PR tries to mitigate the problem by cooperatively loading different chunks of input data from different processors that share the input data.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135438
Approved by: https://github.com/leslie-fang-intel
2024-09-09 05:16:02 +00:00
cyy
60e8dc4374 Check function declarations in Caffe2 code (#134925)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134925
Approved by: https://github.com/ezyang
2024-09-09 05:03:29 +00:00
e6c3f58584 Fix example: Address broadcasting error in the addition of `attn_bias… (#135427)
…` and `attn_mask`, and correct device assignment for newly created variables in the method.

Fix example: Address broadcasting error in the addition of `attn_bias` and `attn_mask`, and correct device assignment for newly created variables in the method.

1. Adding `attn_bias += attn_mask` results in a broadcasting error. The expected shape of `attn_bias` is (L, S), so the output should also have the shape (L, S). However, when `attn_mask` has shape (N, num_heads, L, S), broadcasting occurs, leading to an output shape of (N, num_heads, L, S), which is not desired (see the sketch after this list).
2. `attn_bias` is a newly created variable within the method, but it is not assigned to the correct device.
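A small sketch of point 1 above (the shapes are hypothetical):

```python
import torch

N, num_heads, L, S = 2, 4, 8, 8
attn_bias = torch.zeros(L, S)
attn_mask = torch.zeros(N, num_heads, L, S)

out = attn_bias + attn_mask  # silently broadcasts
print(out.shape)             # torch.Size([2, 4, 8, 8]) -- not the intended (L, S)
```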

**This is my retry of PR #130209 . The PR has been merged into commit `d4a79d4a7c746068d25fe5cf9333495561f4ce1f`, but the modifications were overwritten by subsequent commits.**

Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
@mikaylagawarecki  provided a more elegant implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135427
Approved by: https://github.com/ezyang
2024-09-09 03:47:34 +00:00
90e12cf63d Fix return type of nansum example. (#135435)
One of the examples in the documentation of `torch.nansum` contains a wrong return type. This fixes it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135435
Approved by: https://github.com/ezyang
2024-09-09 03:34:52 +00:00
44c08f4984 [Partitioner] Query whether nodes exist in graph faster (#135316)
Finding whether a node exists in graph.nodes (a linked list) takes too long. Use graph._find_nodes_lookup_table (a hash table) instead to speed up the lookup.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135316
Approved by: https://github.com/ezyang
2024-09-09 03:34:02 +00:00
b6186353c6 enable lazy_init for hpu (#135203)
Enables lazy_init for the hpu device.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135203
Approved by: https://github.com/ezyang
2024-09-09 03:32:20 +00:00
b7eb7256fb docs: torch.nn.utils.rnn.pack_padded_sequence: docs improve (#135417)
docs: `torch.nn.utils.rnn.pack_padded_sequence`: docs improve

/cc @mikaylagawarecki
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135417
Approved by: https://github.com/ezyang
2024-09-09 03:16:11 +00:00
c1ae78be92 [inductor] calibration inductor windows uts (18/N) (#135449)
Skip the test_quantized_* UTs of `test/inductor/test_cpu_select_algorithm.py`.
The Windows inductor doesn't support quantization so far.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135449
Approved by: https://github.com/ezyang
2024-09-09 03:10:54 +00:00
defb515306 [NJT]Add permute ops support (#135336)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135336
Approved by: https://github.com/davidberard98
2024-09-08 21:00:41 +00:00
31c4e0d37d [inductor] Cleanup analysis done at lowering time (#135412)
Before this we would take multiple passes over the body of each IRNode as we did lowering.  This combines most analysis into `OpCounterCSE` so it can be done in a single pass.

Before:
![image](https://github.com/user-attachments/assets/0047db09-4258-4491-a9a6-b078e183092a)

After:
![image](https://github.com/user-attachments/assets/1e03adcb-8303-4bb1-8bbb-cc42dacd44d7)

This stack:
![image](https://github.com/user-attachments/assets/d6b50b24-c30c-4d23-8b1a-344b3ba65d7a)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135412
Approved by: https://github.com/oulgen
ghstack dependencies: #135286, #135306, #135377, #135400
2024-09-08 18:02:36 +00:00
53290ca00b [inductor] Refactor BaseSchedulerNode.__init__ (#135400)
Might be a small compile time improvement since we remove a call to extract_read_writes().

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135400
Approved by: https://github.com/oulgen
ghstack dependencies: #135286, #135306, #135377
2024-09-08 18:02:36 +00:00
16f5155992 [inductor] Fast path for extract_read_writes without tracing (#135377)
Before (bottom of stack):
![image](https://github.com/user-attachments/assets/13060ff9-b31d-42a9-8e8f-c50b2bf3dc2f)

After (this PR):
![image](https://github.com/user-attachments/assets/7d190821-b614-46b7-9e9e-9087443df654)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135377
Approved by: https://github.com/oulgen
ghstack dependencies: #135286, #135306
2024-09-08 18:02:32 +00:00
37144be03d [inductor] Remove ReadWrites.op_counts (#135306)
This was (almost) unused.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135306
Approved by: https://github.com/oulgen
ghstack dependencies: #135286
2024-09-08 18:02:28 +00:00
3bdc54ed18 [inductor] Refactor LoopBody.memory_usage (#135286)
This is preparing for some other changes where I speed up extract_read_writes tracing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135286
Approved by: https://github.com/oulgen
2024-09-08 18:02:24 +00:00
cyy
2196f32475 [22/N] Fix clang-tidy warnings in jit (#135319)
Follows #134537
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135319
Approved by: https://github.com/titaiwangms
2024-09-08 17:18:29 +00:00
cfc227ad43 [reland][dtensor] move DTensor to public namespace (#134203)
reland of https://github.com/pytorch/pytorch/pull/133113

I have to create a new PR because the previously reverted PR could not be rebased or imported successfully :(

----

Moving DTensor to be in the public namespace, to formally add the documentation page that includes all the public APIs. This includes:

* many path renames and path import fixes
* a dedicated doc page without too much content yet (adding in the next PRs)
* To preserve the BC for users still using the torch.distributed._tensor, I added a shim script to redirect old path calls to the new module

The BC preservation is evidenced by the fact that all DTensor tests still pass without changing the public imports, so it's safe to land the changes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134203
Approved by: https://github.com/tianyu-l
2024-09-08 17:08:40 +00:00
20cab91a12 [dynamo] Remove skip from jit freeze tests (#135281)
Fixes https://github.com/pytorch/pytorch/issues/119781
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135281
Approved by: https://github.com/zou3519
2024-09-08 15:11:12 +00:00
a6fae2e811 Use BRGEMM for Half flash attention forward kernel (#131879)
Use oneDNN BRGEMM on packed data to get better performance on the 5th generation of Xeon where Intel® Advanced Matrix Extensions (AMX) will have fp16 support, e.g. amx-fp16.
Multiple models have achieved acceleration, for instance, FP16 stable diffusion v2.1 has achieved over 50% improvement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131879
Approved by: https://github.com/jgong5, https://github.com/peterbell10
ghstack dependencies: #131878
2024-09-08 12:32:23 +00:00
042f2f7746 [ONNX] Re-raise the exception if the dynamic shapes cannot be refined (#135418)
Improve error reporting. Otherwise, users will mostly just see that the shapes could not be refined, without the original exception.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135418
Approved by: https://github.com/titaiwangms
2024-09-08 05:30:34 +00:00
fd494dd426 Change wrapped_linear_prepack and wrapped_quantized_linear_prepacked to private by adding _ as prefix (#135401)
Summary: In https://github.com/pytorch/pytorch/pull/134232, we added two new ops wrapped_linear_prepack and wrapped_quantized_linear_prepacked. From the review comments and offline discussion, we are changing them to private by adding `_` as prefix

Differential Revision: D62325142

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135401
Approved by: https://github.com/houseroad
2024-09-08 04:16:24 +00:00
8334cb2fb9 remove commented out breakpoints (#135363)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135363
Approved by: https://github.com/oulgen
2024-09-08 02:15:45 +00:00
e72ed4717e [Dynamo] Fix Huggingface PretrainedConfig get non const attr (#135413)
Fixes #135329

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135413
Approved by: https://github.com/anijain2305
2024-09-07 19:16:29 +00:00
3bebc09be9 [FlexAttention] Align the matmul tensorcore usage (#135168)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135168
Approved by: https://github.com/Chillee
2024-09-07 16:33:41 +00:00
a2db22e6bb [inductor] Catch BrokenProcessPool and print a more helpful message. (#135120)
Summary: BrokenProcessPool means a parallel-compile subprocess exited, which we never expect. It's likely due to a crash, so print a more meaningful error message and instructions that it's probably easier to debug by turning off parallel compile. Output looks like:
```
...
  File "/data/users/slarsen/pytorch/torch/_inductor/runtime/compile_tasks.py", line 45, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_slarsen/4q/c4qw7xk5lbb7whg5txnk4hwbc7z6kepak3o666tr3d64gcad5r5b.py", line 815, in <module>
    async_compile.wait(globals())
  File "/data/users/slarsen/pytorch/torch/_inductor/async_compile.py", line 265, in wait
    raise RuntimeError(
RuntimeError: A compilation subprocess exited unexpectedly. This is likely due to a crash. To facilitate debugging, you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 to cause compilation to occur in the main process.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135120
Approved by: https://github.com/Chillee
2024-09-07 16:33:37 +00:00
eac5e12548 [inductor] Move LoopBody to its own file (#135257)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135257
Approved by: https://github.com/oulgen
2024-09-07 16:29:15 +00:00
18479c5f70 [Doc] update max-autotune for CPU (#134986)
The current doc for `max-autotune` is applicable only for GPU. This PR adds the corresponding content for CPU.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134986
Approved by: https://github.com/jgong5, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-07 13:42:40 +00:00
f7c0c06692 Add oneDNN BRGEMM support on CPU (#131878)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131878
Approved by: https://github.com/jgong5, https://github.com/peterbell10
2024-09-07 13:22:30 +00:00
b53d97c7be [Intel GPU] Add XPU memory-related APIs (#129919)
# Motivation
According to https://github.com/pytorch/pytorch/issues/116322, we will help unify the device allocator. So we introduce a simple xpu device allocator with only the key functionality first, and expect to add memory-statistics-related functionality after the unification.
But now, some memory-statistics-related APIs listed in https://github.com/pytorch/pytorch/issues/127929 have been requested, and we need more time to unify the device allocator. To facilitate the user experience, we expect to support these APIs before the unification.

# Additional Context
Fixes: #127929

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129919
Approved by: https://github.com/dvrogozh, https://github.com/abhilash1910, https://github.com/gujinghui, https://github.com/EikanWang, https://github.com/albanD
ghstack dependencies: #130923
2024-09-07 11:15:17 +00:00
6c1da66407 [Reland] Refactor caching device allocator utils (#130923)
# Motivation
Following [[RFC] Intel GPU Runtime Upstreaming for Allocator ](https://github.com/pytorch/pytorch/issues/116322), this PR aims to refactor the caching device allocator utils to improve code reuse.
This is the first PR, we could prepare some follow-up PRs continuing to refactor the device caching allocator.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130923
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD, https://github.com/eqy
2024-09-07 11:14:17 +00:00
d7c97e7245 [inductor][cpp][gemm] cache blocking config for dynamic shapes (#133538)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133538
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #135277, #133447

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:30 +00:00
be9f4ffe88 [inductor][cpp][gemm] enable dynamic M for k-slicing (#133447)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133447
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #135277

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:30 +00:00
692faa9bc6 [inductor][cpp][gemm] reduce memory alloc overhead by allocating local acc once per thread (#135277)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135277
Approved by: https://github.com/leslie-fang-intel

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:25 +00:00
32f3af72b7 [ONNX] Support FakeTensor in ONNXProgram (#135399)
Sync with https://github.com/justinchuby/torch-onnx/compare/v0.1.20...v0.1.21 to support FakeTensors in ONNXProgram. Specifically, this PR implements the `apply_weights` method to allow users to supply a dictionary of concrete tensors to replace FakeTensors in the exported model weights.
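A hedged sketch of the workflow (the export call, the model, and the checkpoint path are assumptions; the commit itself only specifies that `apply_weights` takes a dictionary of concrete tensors):

```python
import torch

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(16, 4)

    def forward(self, x):
        return self.fc(x)

# In the FakeTensor workflow the model would be constructed under fake mode;
# a concrete model is used here only to keep the sketch self-contained.
onnx_program = torch.onnx.export(TinyModel(), (torch.randn(2, 16),), dynamo=True)

state_dict = torch.load("real_weights.pt")  # hypothetical checkpoint of concrete tensors
onnx_program.apply_weights(state_dict)      # replace FakeTensors with real values
onnx_program.save("model.onnx")             # serializing FakeTensors directly raises instead
```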

An error is raised when users try to serialize a FakeTensor to avoid segfaults.

Also fixed a bug in `.save()` when `keep_initializers_as_inputs` is True and `include_initializers` is False.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135399
Approved by: https://github.com/titaiwangms
2024-09-07 04:48:18 +00:00
ebab5c85c4 [FlexAttention] Skip very small block size unit tests on H100 due to Triton bug (#135393)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135393
Approved by: https://github.com/BoyuanFeng
2024-09-07 04:35:22 +00:00
3d734d837b [ONNX] Handle mixed sequence inputs properly (#135378)
Previously, when an input contains a mixture of `Value` and python constants like `[SymbolicTensor('sym_size_int_3', type=Tensor(INT64), shape=[], producer=node_Shape_0, index=0), 512]`, we get errors like

```pytb
Traceback (most recent call last):
  File "/Users/justinc/Documents/GitHub/torch-onnx/src/torch_onnx/_building.py", line 367, in _call_op
    converted_named_inputs = _process_python_constants_and_sequences(
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/justinc/Documents/GitHub/torch-onnx/src/torch_onnx/_building.py", line 275, in _process_python_constants_and_sequences
    raise TypeError(
TypeError: Constant input '[SymbolicTensor('sym_size_int_3', type=Tensor(INT64), shape=[], producer=node_Shape_0, index=0), 512]' of type '<class 'list'>' is not supported
```

This PR updates Sequence handling to support this case, as well as variadic inputs and ONNX Sequence inputs.

Synced from https://github.com/justinchuby/torch-onnx/pull/187
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135378
Approved by: https://github.com/titaiwangms
2024-09-07 03:07:39 +00:00
c92227c41a [quant][pt2e] fix placeholder typo and related quantization tests (#135379)
A previous typo in "placeholder" and the related tests in quantization are fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135379
Approved by: https://github.com/jerryzh168
2024-09-07 02:31:43 +00:00
e6a0221fc6 [Inductor] Optionally allow padding on non-GPU devices (#135280)
This is the OSS component of a larger MTIA diff.

Currently, Inductor disables padding for non-GPU devices. We need to change this behavior to enable padding on MTIA.

This PR adds a config option to enable padding on the CPU, or any other non-GPU device. In the future, we might want to enable padding on all devices by default. However, that might require supporting device-dependent padding defaults, since CPUs will likely use different settings than H100 GPUs.

Differential Revision: D61038114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135280
Approved by: https://github.com/jfix71, https://github.com/shunting314
2024-09-07 02:19:14 +00:00
a6b9d444fb [ONNX] Refactor exporter errors (#135180)
Refactor exporter errors to combine old errors and new errors for API consistency.

This PR also

1. Removes the `_C._check_onnx_proto(proto)` call in the old exporter. We don't need the ONNX checker because it is limited.
2. Removes the `OnnxExporterError` defined in the dynamo module. This class unnecessarily stores the onnx program object, making it very bulky. Instead, we revert to using the plain OnnxExporterError defined in the `errors` module and use it as the base class for all errors.
3. Continues to expose `OnnxExporterError` in `torch.onnx` and the rest of the errors in `torch.onnx.errors`.
4. Removes the `CheckerError` and `InvalidExportOptionsError` from `torch.onnx`. This is BC breaking but should have low impact.
5. I did not rename existing errors out of compatibility considerations, even though `ExporterError` would have been more succinct.

Fixes https://github.com/pytorch/pytorch/issues/135125
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135180
Approved by: https://github.com/titaiwangms
2024-09-07 00:50:15 +00:00
d42b0c8f22 Add release matrix for 2.5 (#135383)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135383
Approved by: https://github.com/huydhn
2024-09-07 00:49:53 +00:00
941d094dd1 [Dynamo][DTensor] Fixes SymNodeVariable() is not a constant error in Compiled DDP + TP unit test (#135315)
Before the fix, the unit test will fail at forward Dynamo tracing:
```
  File "/data/users/willfeng/pytorch/test/distributed/_composable/test_replicate_with_compiler.py", line 415, in test_ddp_tp
    loss = compiled_replicate_model(data).sum()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
torch._dynamo.exc.InternalTorchDynamoError: SymNodeVariable() is not a constant

from user code:
   File "/data/users/willfeng/pytorch/torch/distributed/tensor/parallel/_data_parallel_utils.py", line 34, in _unflatten_tensor
    result = DTensor.from_local(
```
After the fix, the compilation fails at a later step (Compiled Autograd tracing), due to needing "pre-dispatch tracing of backward graph" feature (see details at https://github.com/pytorch/pytorch/issues/127797#issuecomment-2291695474).

I believe this PR is a net improvement, because it should also fix the 1D Traceable FSDP2 failure case on internal models (https://github.com/pytorch/pytorch/issues/130978#issuecomment-2319476690), which is much harder to build a minimal unit test for.

Fixes https://github.com/pytorch/pytorch/issues/130978.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135315
Approved by: https://github.com/bdhirsh
2024-09-07 00:11:25 +00:00
b1a934741e Change test_constant_prop_preserve_metadata (#135268)
Summary: In new export_for_training, "stack_trace" does not exist in node meta anymore.

Test Plan:
```
buck run fbcode//mode/dev-nosan fbcode//caffe2/test:quantization_pt2e -- -r test_constant_prop_preserve_metadata
```

Reviewed By: angelayi

Differential Revision: D62219974

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135268
Approved by: https://github.com/angelayi
2024-09-07 00:02:35 +00:00
0c661f3e1a [Split Build] Refactor split build binary builds into their own workflows and move split build binary builds to periodic (#134624)
As we need to move split build binary tests from trunk to periodic, this PR refactors those jobs out into their own workflow.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134624
Approved by: https://github.com/malfet
2024-09-06 23:57:56 +00:00
2c7e314803 [Inductor][CPP] Fix the issue of view dtype (#135301)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/135160, it's a regression introduced by https://github.com/pytorch/pytorch/pull/134569, where the dtype of `to_dtype_bitcast` was incorrectly handled when using the scalarize implementation.

**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_view_dtype
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135301
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 23:36:44 +00:00
ead4407f57 [inductor] Fix loop split optimization (#135303)
Fix https://github.com/pytorch/pytorch/issues/135274.

Improve the check for whether the div expr matches: add a check that `split_var` is in `original_body.iter_vars`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135303
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-06 23:06:25 +00:00
2f5b40c099 [aoti test] Disable FP8 funz dtypes in fp8 runtime check test (#135373)
Fixing https://github.com/pytorch/pytorch/issues/126734

The key is that the fnuz FP8 types are for AMD only.

source: https://github.com/openxla/stablehlo/blob/main/rfcs/20230321-fp8_fnuz.md

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135373
Approved by: https://github.com/chenyang78
2024-09-06 23:05:47 +00:00
993b5647ab [export] fix placeholder name collision tests by removing map call (#135366)
The current test is failing because of the currently unstable state of map: torch.compile and non-strict export take two separate routes, unlike cond and while_loop. This PR fixes the test itself. We'll fix map in follow-up PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135366
Approved by: https://github.com/angelayi
2024-09-06 22:02:50 +00:00
2ab26806f1 Require tlparse for failing tests in test_structured_trace.py (#135376)
Summary: These tests are currently failing internally. Per discussion, skip if tlparse is unavailable

Test Plan:
```
feature remove tlparse
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --run-disabled --regex test_structured_trace.py
feature install tlparse
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --run-disabled --regex test_structured_trace.py
```

Differential Revision: D62310342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135376
Approved by: https://github.com/ezyang
2024-09-06 21:53:41 +00:00
b1612569f6 [BE] Clarify defaulting behavior in optimizer (#135384)
Fixes #135340

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135384
Approved by: https://github.com/drisspg, https://github.com/jainapurva
2024-09-06 21:52:55 +00:00
dc0e818738 [FR] Automatically infer a common filename prefix (#135158)
Save the annoyance of specifying this on the command line each time
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135158
Approved by: https://github.com/fduwjj, https://github.com/c-p-i-o
ghstack dependencies: #135157
2024-09-06 21:44:27 +00:00
06e414d7fe [FR] Make trace_dir a required argument (#135157)
Ensures users get a clean error if they forget to specify the dir, and
improves the help message.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135157
Approved by: https://github.com/c-p-i-o, https://github.com/fduwjj
2024-09-06 21:44:27 +00:00
a681260caf Revert "[ONNX] Refactor exporter errors (#135180)"
This reverts commit 5eebd9315a72422d59b6f8d8ca8e4e573e231d5c.

Reverted https://github.com/pytorch/pytorch/pull/135180 on behalf of https://github.com/clee2000 due to I think this broke test_public_bindings.py::TestPublicBindings::test_correct_module_names [GH job link](https://github.com/pytorch/pytorch/actions/runs/10743909338/job/29800779403) [HUD commit link](5eebd9315a), possibly a landrace with the PR that landed before it ([comment](https://github.com/pytorch/pytorch/pull/135180#issuecomment-2334844191))
2024-09-06 21:39:18 +00:00
95e976a63f [dynamo] recursively skip frames when Dynamo cache limit is hit (#135144)
Fixes https://github.com/pytorch/pytorch/pull/135144 and [T197117723](https://www.internalfb.com/intern/tasks/?t=197117723).

In general, adds `SkipCodeRecursiveException` to Dynamo - when raised in Dynamo, convert_frame will return a `skip_code_recursive_flag` back to C Dynamo, signaling it to skip the current frame and all recursive calls.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135144
Approved by: https://github.com/jansel, https://github.com/anijain2305
2024-09-06 21:38:53 +00:00
306ac44eaa [ez][TD] Fix request for issue body returns None (#135389)
I assumed it would be an empty string if the body is empty, but it's just None.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135389
Approved by: https://github.com/malfet
2024-09-06 21:02:01 +00:00
a7643baceb Revert expectFailureIf condition on tests with torch.compile on Windows (#134759)
Fixes #134716

This PR reverts some changes introduced in 6eae569546 (#133987)

torch.compile is not available on Windows, tests should be expected to fail.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134759
Approved by: https://github.com/malfet
2024-09-06 20:51:55 +00:00
a4030e37be [dynamo] reland map/zip iterator related changes (#135074)
Differential Revision: [D62211019](https://our.internmc.facebook.com/intern/diff/D62211019)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135074
Approved by: https://github.com/jansel, https://github.com/anijain2305, https://github.com/mlazos
2024-09-06 20:38:02 +00:00
22e1fb6faa [test][easy] Add debug utils for cpu select algorithm test (#135038)
Summary: Add debug utils to debug a flaky test in fbcode ci.

Some context: https://github.com/pytorch/pytorch/pull/126545

Test Plan: ci

Differential Revision: D62005445

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135038
Approved by: https://github.com/jgong5, https://github.com/XuehaiPan
2024-09-06 20:30:49 +00:00
2a4890e315 [ONNX] Clean up the missed lines from previous PRs (#135368)
Some missed deleted lines

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135368
Approved by: https://github.com/justinchuby
2024-09-06 20:27:52 +00:00
3ce433aef2 [TCPStore] use wait counters (#135283)
This replaces the existing TCPStore counters with the new shared wait counters. There are no users of the TCPStore counters, so they should be completely safe to remove.

Test plan:

Existing tests + build

There's no OSS backend for wait counters, so we can't write any tests with them currently.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135283
Approved by: https://github.com/c-p-i-o
2024-09-06 19:54:25 +00:00
7f2d20e687 Run all autograd node post hooks (#134728)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134728
Approved by: https://github.com/albanD, https://github.com/soulitzer
2024-09-06 19:44:28 +00:00
32fd29c1ea [ONNX] Properly handle Attributes in traceable functions (#135367)
Previously, the attributes were sent in as Attr objects even when we call the function as a plain Python function. This change turns them into Python objects.

From https://github.com/justinchuby/torch-onnx/pull/186
Related https://github.com/microsoft/onnxscript/issues/1846

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135367
Approved by: https://github.com/justinchuby
2024-09-06 19:35:22 +00:00
5eebd9315a [ONNX] Refactor exporter errors (#135180)
Refactor exporter errors to combine old errors and new errors for API consistency.

This PR also

1. Removes the `_C._check_onnx_proto(proto)` call in the old exporter. We don't need the ONNX checker because it is limited.
2. Removes the `OnnxExporterError` defined in the dynamo module. This class unnecessarily stores the onnx program object, making it very bulky. Instead, we revert to using the plain OnnxExporterError defined in the `errors` module and use it as the base class for all errors.
3. Continues to expose `OnnxExporterError` in `torch.onnx` and the rest of the errors in `torch.onnx.errors`.
4. Removes the `CheckerError` and `InvalidExportOptionsError` from `torch.onnx`. This is BC breaking but should have low impact.
5. I did not rename existing errors out of compatibility considerations, even though `ExporterError` would have been more succinct.

Fixes https://github.com/pytorch/pytorch/issues/135125
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135180
Approved by: https://github.com/titaiwangms
2024-09-06 19:10:56 +00:00
a15aabc975 Add MaskedTensor passthrough: unfold, F.Unfold, F.Fold, stack (#125262)
Hi,
I noticed the `unfold` operator was missing on MaskedTensor.

I tested that my change works when calling unfold and backward on a `MaskedTensor` but I didn't find the tests for the dispatch of such operation. Where is it?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125262
Approved by: https://github.com/cpuhrsch
2024-09-06 19:06:23 +00:00
b143426db3 [Inductor] Use argument names as the key for the constants dict and the signature dict (#135170)
Referencing how triton constructs these dictionaries

ca3fb5f6fa/python/triton/runtime/jit.py (L639)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135170
Approved by: https://github.com/htyu
2024-09-06 19:05:00 +00:00
13ba0a2e5c Run bypassed graph compile outside the except block to avoid chaining of exceptions (#135175)
Fixes #135172

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135175
Approved by: https://github.com/masnesral, https://github.com/ezyang
2024-09-06 19:03:57 +00:00
8520ce5f78 Fix incorrect trace of post-accumulate grad hook on tensor with zero dims (#135226)
Fix incorrect trace of post-accumulate grad hook on tensor with zero dimensions

Fixes #135207

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135226
Approved by: https://github.com/xmfan
2024-09-06 18:19:54 +00:00
196748d491 [elastic] support local_addr across all rendezvous impls (#135262)
Summary:
There was a regression introduced in https://github.com/pytorch/pytorch/pull/125743 that caused `local_addr` to no longer be used. This fixes that by passing `local_addr` to `RendezvousStoreInfo.build` everywhere it's used.

This also fixes a number of tests allowing them to be run in parallel which hugely sped up the testing cycle as this change touches many different rendezvous implementations. This required a few fixes in unrelated tests.

Test Plan:
Added tests for the common rendezvous implementations that `local_addr` to prevent future regressions.

```
buck2 test @//mode/dev-nosan fbcode//caffe2/test/distributed/elastic/... fbcode//caffe2/torch/distributed/elastic/... -- --stress-runs 3
```

To vet the parallelism changes I also ran with 3 stress runs each to identify flakiness caused by parallelism.

Differential Revision: D62256407

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135262
Approved by: https://github.com/fduwjj, https://github.com/wz337
2024-09-06 17:55:43 +00:00
177e4f4218 remove _check call on item() for torch.istft (#135234)
Fixes #135014

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135234
Approved by: https://github.com/tugsbayasgalan
2024-09-06 17:31:25 +00:00
3988b3468b [aoti][easy] remove breakpoint() in wrapper.py (#134807)
Differential Revision: D61687146

Remove an unintended breakpoint in code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134807
Approved by: https://github.com/YUNQIUGUO
2024-09-06 17:25:05 +00:00
04118d8617 [export] Record the global torch version in serialization. (#135243)
Summary: In general I think it will be useful to also record the global torch version in the EP, so that we can track them in the logging in addition to the schema version.

Test Plan: CI

Reviewed By: henryoier

Differential Revision: D62252626

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135243
Approved by: https://github.com/yushangdi
2024-09-06 17:02:06 +00:00
24482e5c68 [torch][fx] Set maximum warning count during fx.Graph.lint (#135069)
Summary:
resnet152 spent about 15 minutes writing warning messages in _unlift
during `to_executorch` because they're all written to unbuffered stderr
by the `warnings` module.

These warnings are almost always about get_attr nodes referencing a
non-existent name:
```lang=py
warnings.warn(f'Node {node} target {node.target} {atom} of {seen_qualname} does '
  'not reference an nn.Module, nn.Parameter, or buffer, which is '
  'what \'get_attr\' Nodes typically target'
)
```
I'm not aware of a way to configure the warnings module to write this out
at most once, so I'm just going to disable the lint for now.

Test Plan:
Re-ran resnet152 with Executorch and the XNNPackBackend, it is much faster now

Differential Revision: D62156090

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135069
Approved by: https://github.com/yushangdi
2024-09-06 16:41:59 +00:00
c0ec599f27 Update submodule ideep to include aarch64 change (#134897)
This PR is per ARM request, which is in https://github.com/intel/ideep/issues/334.

Context for the request: the Arm team has upstreamed the dynamic quantization changes, and all the PRs (torch, ideep, oneDNN) were merged, but without this ideep submodule update the feature will not work. The change is isolated to the matmul operator and the quantization path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134897
Approved by: https://github.com/jgong5, https://github.com/atalman, https://github.com/snadampal
2024-09-06 16:40:26 +00:00
7074de43c0 Porting to GCC 15 (#135188)
uint8_t is declared in the <cstdint> header

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135188
Approved by: https://github.com/Skylion007
2024-09-06 16:16:53 +00:00
771dcce11d [AOTI][Tooling][6/n] Fix long dtype input tensors calling mean() in aoti_torch_print_tensor_handle (#135072)
Differential Revision: D61635232

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135072
Approved by: https://github.com/hl475, https://github.com/ColinPeppler
2024-09-06 15:59:32 +00:00
de74aafff4 error on exporting ScriptModule (#135302)
Test Plan: added test

Differential Revision: D62279179

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135302
Approved by: https://github.com/yushangdi
2024-09-06 15:12:40 +00:00
ad29a2c0dc Add Inductor config for default stride behavior (#135238)
By default, Inductor is allowed to manipulate the layout
(strides+storage offset) of input tensors to custom operators.

We want to change it so that the default is that Inductor should respect
the stride order of input tensors to custom operators.

This PR adds a config to toggle the behavior, in the next PR up we'll
change the default. We also make the following changes:
- We add a new operator Tag (flexible_layout), which means that
inductor is allowed to manipulate the layout. When we flip the default,
users can specify they want the old behavior by using this tag.
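A rough sketch (not from the PR itself) of how a custom op author might opt back into layout flexibility, assuming the tag is exposed as `torch.Tag.flexible_layout` and that `torch.library.define` accepts a `tags` argument; the `mylib::my_op` namespace and op name are made up for illustration:
```python
import torch

# Hypothetical custom op that opts into letting Inductor manipulate
# the layout (strides + storage offset) of its inputs.
torch.library.define(
    "mylib::my_op",
    "(Tensor x) -> Tensor",
    tags=(torch.Tag.flexible_layout,),
)

@torch.library.impl("mylib::my_op", "CompositeExplicitAutograd")
def my_op(x):
    return x.clone()
```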

This is a reland of https://github.com/pytorch/pytorch/pull/126986,
which was previously reverted due to silent incorrectness. We've since
fixed the silent incorrectness
(https://github.com/pytorch/pytorch/pull/133639)

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135238
Approved by: https://github.com/albanD
2024-09-06 14:48:24 +00:00
3a9e33dca8 [torchelastic] Don't do signal handling when off the main thread (#135088)
Summary:
In multiprocessing, signal handling is not possible if the thread is not the main thread. This resulted in the following error:
> "ValueError('signal only works in main thread of the main interpreter')"

To address this issue, the diff checks whether the thread is the main thread and, if not, skips signal handling.
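A minimal sketch (not the actual torchelastic code) of the main-thread check being described:
```python
import signal
import threading

def register_handlers_if_main_thread(handler):
    # signal.signal() raises "ValueError: signal only works in main thread
    # of the main interpreter" when called off the main thread, so skip
    # signal handling entirely in that case.
    if threading.current_thread() is not threading.main_thread():
        return
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, handler)
```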

Test Plan:
Before this change, MAST job failed:
https://fburl.com/mlhub/iq2m10v8

With this change, MAST job succeeded:
https://fburl.com/mlhub/q6kb8343

Differential Revision: D62166943

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135088
Approved by: https://github.com/d4l3k
2024-09-06 14:47:03 +00:00
a086882d72 [inductor][triton] mark workspace args as mutated (#134648)
SplitScan makes use of a workspace arg that needs to be zeroed before it is used; it is then used to communicate between thread blocks in the triton kernel implementation. It is mutated during the execution of the kernel, so it should be marked as such.

Before this PR, it is not marked as mutated; AFAIK this is fine during normal execution, but during autotuning it causes problems. The workspace starts off zeroed (as expected), but during autotuning the kernel will be executed multiple times and the workspace does not get re-set between executions, resulting in incorrect data. If the data is used for indexing, then you can fail device-side asserts (and the results after the initial run (with autotuning) could be wrong). The test added in this PR repros the issue when the fix is removed.

When we mark the arg as mutated, then the arg gets cloned before autotuning, so that the arg passed to the kernel during autotuning will always be zeroed as expected.
804852c1f9/torch/_inductor/runtime/triton_heuristics.py (L685-L689)
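A simplified illustration (hypothetical helper, not Inductor's actual autotuning code) of why cloning mutated args matters when a kernel is benchmarked repeatedly:
```python
import torch

def benchmark_kernel(kernel, args, mutated_idxs, n_iters=3):
    # Each benchmarking run must see the workspace in its original
    # (zero-initialized) state, so clone any argument the kernel mutates
    # instead of reusing the already-mutated buffer.
    for _ in range(n_iters):
        run_args = [
            a.clone() if i in mutated_idxs and isinstance(a, torch.Tensor) else a
            for i, a in enumerate(args)
        ]
        kernel(*run_args)
```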

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134648
Approved by: https://github.com/peterbell10, https://github.com/jansel
2024-09-06 14:23:37 +00:00
84ae6b7d6b AOTDispatcher: limit cases when we detach() graph inputs to non-leaves (#134193)
This PR is slightly a revival / update to the discussion from https://github.com/pytorch/pytorch/pull/98960:

Part of FSDP2's tracing strategy right now is that:

(1) it is painful/difficult to handle the case where we have multiple graph input tensors that are aliased to each other and at least one of them is duplicated

(2) we already have longstanding logic to remove duplicate input tensors from the graph in dynamo. Morally, FSDP2 gives us duplicate input tensors in the backward graph for every `unsharded_param`, because we have (a) the `unsharded_param` being closed over by the backward hook to resize/allgather, and (b) the same `unsharded_param` being saved for backward by autograd (we now guarantee in the partitioner that we will always save the base tensor for backward and recompute views)

(3) However, we were still seeing cases where the `unsharded_param` showed up twice in the backward graph inputs, as distinct tensor objects (with different python ids) instead of being true duplicates that dynamo can de-dup.

It turns out that this was because we were `.detach()`ing the `unsharded_param` in AOTDispatcher before plumbing it through the compiled forward (and so autograd would save a detach'd version of the `unsharded_param`). This is precisely because of the logic from https://github.com/pytorch/pytorch/pull/98960.

However, re-reading the detailed comments, it seems unnecessary to do a detach() on a graph input that is a (leaf) `nn.Parameter`, even if it happens to get no gradients in the backward. Since it is a leaf, we don't have to worry about the autograd engine "continuing to backprop through the graph beyond the current tensor" (the leaf has no other grad_fn for autograd to backprop through).

So this PR makes us a bit less aggressive about calling detach() on inputs: we only do it when:

(1) our graph input statically will get a `None` gradient (and also has no metadata mutations, the existing state)

(2) **and** our graph input is a non-leaf tensor (so detach()ing is actually required to prevent autograd from incorrectly backpropping past the non-leaf).
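A toy illustration of the leaf vs. non-leaf distinction this relies on (plain autograd, not AOTDispatcher internals):
```python
import torch

leaf = torch.nn.Parameter(torch.randn(3))         # leaf: no grad_fn behind it
nonleaf = torch.randn(3, requires_grad=True) * 2  # non-leaf: has a grad_fn

print(leaf.is_leaf, nonleaf.is_leaf)  # True False
# If a graph input gets no gradient, detach() only matters for the non-leaf:
# autograd could otherwise keep backpropagating through nonleaf.grad_fn,
# while the leaf has no grad_fn for autograd to continue through.
```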

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134193
Approved by: https://github.com/yf225

Co-authored-by: Will Feng <yf225@cornell.edu>
2024-09-06 14:06:48 +00:00
60a097a071 [CD] Update binary_linux_test.sh to include calling builder smoke test (#133869)
Run smoke test

Fixes #1969

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133869
Approved by: https://github.com/atalman

Co-authored-by: Andrey Talman <atalman@fb.com>
2024-09-06 13:27:24 +00:00
13bae39e22 [inductor] [cpp] improve cache blocking for is_dynamic_M (#131306)
## Performance
Models with >= 3% performance speedup are listed below:

### AMP single-thread dynamic shape (measured on CPU with AMX support)
No regressions

| Model Family | Model Name | Speedup |
|--------------|------------|---------|
| torchbench | soft_actor_critic | 3% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131306
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel
ghstack dependencies: #135275

Co-authored-by: Jiong Gong <jiong.gong@intel.com>
2024-09-06 13:21:24 +00:00
4ef6c05f65 [inductor][cpp][gemm] fix autotune runtime error from linear_binary fusion (#135275)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135275
Approved by: https://github.com/leslie-fang-intel
2024-09-06 13:21:23 +00:00
d6b9bd3e60 Also handle compiler collective when input variable doesn't exist on all ranks (#135147)
Internal xref:
https://fb.workplace.com/groups/3095840833991792/permalink/3810738595835342/

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135147
Approved by: https://github.com/jansel
2024-09-06 13:18:36 +00:00
d0591f4658 Ignore fresh unbacked when doing recursive make_fx inside HOPs (#135053)
Internal xref: https://fb.workplace.com/groups/6829516587176185/posts/7705964779531357/

This now also incorporates a test from https://github.com/pytorch/pytorch/pull/133585 (which it fixes) and the prep PR https://github.com/pytorch/pytorch/pull/134407 Including the PR desc from that:

I am trying to fix a problem reported by user in [fb.workplace.com/groups/6829516587176185/permalink/7705964779531357](https://fb.workplace.com/groups/6829516587176185/permalink/7705964779531357/) The summary of this problem is that when we do collect metadata analysis in AOTAutograd, we accumulate pending unbacked symbols which are going to be discarded at the end of the trace. However, if we do a recursive make_fx inside tracing, as occurs with torch.cond, we end up seeing that there are pending unbacked symbols that aren't associated with a binding, even though it's spurious (they've leaked into the inner make_fx call from the outer AOTAutograd analysis).

In https://github.com/pytorch/pytorch/pull/133588 I tried to just prevent adding the symbols to the pending list at all in the first place. But this itself caused some problems which were fixed in https://github.com/pytorch/pytorch/pull/124785 . The problem fixed in that PR is that when we allocate tangents that have unbacked size, something prevented them from having correct unbacked SymInts when ignore fresh unbacked SymInts was enabled. So I had patched it at the time by just not suppressing pending symbols and clearing them out some other way.

I think... I was wrong in that PR? That is to say, it was OK to avoid putting the fresh unbacked symbols in the pending list; the real problem was suppressing unbacked renamings. But there doesn't seem to be a good reason to suppress these; this PR shows that it doesn't actually fail any tests if you do these anyway. Intuitively, this makes sense, because you can't trigger renamings unless you're actually adding unbacked symbols to the pending set.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135053
Approved by: https://github.com/ydwu4
2024-09-06 13:13:15 +00:00
b5dea061c8 check compilation status before query cudnn version in conv (#135332)
This PR fixes https://github.com/pytorch/pytorch/issues/135322. The cuDNN compilation status should be checked first, before querying the version; otherwise, conv may trigger a RuntimeError before any check runs on other, non-CUDA backends.
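The actual fix is in the C++ convolution path; a Python-level analogue of the same check-before-query pattern, for illustration only:
```python
import torch

def cudnn_version_or_none():
    # Only query the cuDNN version if PyTorch was built with cuDNN support;
    # querying unconditionally can raise on non-CUDA backends/builds.
    if torch.backends.cudnn.is_available():
        return torch.backends.cudnn.version()
    return None

print(cudnn_version_or_none())
```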

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135332
Approved by: https://github.com/EikanWang, https://github.com/atalman
2024-09-06 12:50:04 +00:00
041960a1ce [Dynamo] Automatically in-graph traceable tensor subclass ctors (#135151)
Fixes https://github.com/pytorch/pytorch/issues/114389

Previously, dynamo would attempt to trace through the `__init__` of traceable tensor subclasses, since their constructors are AOT dispatcher traceable by definition, dynamo should automatically put these in the graph like we do for any other tensors. Not doing this is difficult because dynamo would need to apply mutations post tensor subclass creation in the graph.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135151
Approved by: https://github.com/bdhirsh
2024-09-06 12:23:38 +00:00
67c7924ea1 [inductor] Fix gen_transposed_tile_load_store (#135307)
The recent PR https://github.com/pytorch/pytorch/pull/131745 brought new VLA (variable-length array) logic into the cpp codegen, which raises a build failure on MSVC with error code `Compiler Error C2131`: https://learn.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2131?view=msvc-170

reproduce UT:
```cmd
pytest test\inductor\test_torchinductor_dynamic_shapes.py -v -k test_large_block_sizes_dynamic_shapes_cpu
```

Original generated code:
```c++
alignas(16) float tmp1[static_cast<int64_t>(((-256LL)*(c10::div_floor_integer(static_cast<int64_t>(ks1), static_cast<int64_t>(16LL)))) + (16LL*ks1))];
```

Changes:
allocate a large-enough fixed-sized buffer.

New generated code:
```c++
alignas(16) float tmp1[16*16];
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135307
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 10:44:08 +00:00
217ba7b2ab [Docs] Update FileCheck doc (#135199)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135199
Approved by: https://github.com/soulitzer
2024-09-06 08:18:38 +00:00
758d515d98 [Inductor][CPP] Select tiling factor for lower precision data types (#133830)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133830
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 08:12:37 +00:00
60d98b4cfb Update torch-xpu-ops pin (ATen XPU implementation) (#135300)
Release cycle for PyTorch 2.5
1. Bugfixing: correct reduction logic in cdist kernel.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135300
Approved by: https://github.com/EikanWang
2024-09-06 07:30:09 +00:00
590a3e9f8a [export][training ir migration] quantized_decomposed.quantize_per_tensor decomposition (#134525)
Summary:
In the graph of the TestXNNPACKQuantizer.test_dynamic_linear_with_conv test, some quantized_decomposed.quantize_per_tensor.default ops become quantized_decomposed.dequantize_per_tensor.tensor ops when using the new training IR.

This is because we lift params/buffers before calling make_fx. Previously, for the graph passed to make_fx, `graph.L__self___linear1.weight` was a tensor;
now, in the training IR, `graph.L__self___linear1.weight` is a FakeTensor. This causes the node overload to be different.

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_dynamic_linear_with_conv
```

Differential Revision: D61364547

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134525
Approved by: https://github.com/tugsbayasgalan, https://github.com/jerryzh168
2024-09-06 07:06:06 +00:00
764ee6e3f9 [FlexAttention] Specify padding_value for boundary checked loads (#134573)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134573
Approved by: https://github.com/Chillee
2024-09-06 06:47:26 +00:00
67f98a99a4 [DeviceMesh][Easy] Make RuntimeError a bit more descriptive by including the actual world_size (#135271)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135271
Approved by: https://github.com/fduwjj
2024-09-06 06:23:20 +00:00
e020a8755a [Fix][FR][ez] Remove debugging logs (#135308)
Removing the print added during debugging process.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135308
Approved by: https://github.com/wz337
2024-09-06 06:14:33 +00:00
7ffb3b201c [inductor] Remove LoopBody.reads,writes,other (#135256)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135256
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084, #135079, #135235
2024-09-06 06:11:55 +00:00
f946bf88c4 [inductor] Skip retracing an existing LoopBody (#135235)
This is roughly a 7% speedup in inductor compile time for hf_Bert_large.  The time spent in `LoopBody.__init__` improves from 15% to 8% of `fx_codegen_and_compile`.

Before
![image](https://github.com/user-attachments/assets/7de0f28e-35bd-472f-b4be-b52733d2a85c)

After
![image](https://github.com/user-attachments/assets/5f0cf11a-43c5-43ae-b13c-f32383a75a7f)

Overall
![image](https://github.com/user-attachments/assets/6a369d8c-fb5e-4ad2-9504-0fc745ad6568)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135235
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084, #135079
2024-09-06 06:11:55 +00:00
66da3b3b2a [fx] Bypass custom __setattr__ in Node.__init__ (#135079)
Before:
![image](https://github.com/user-attachments/assets/5f0a6ae6-6049-44d0-b5f2-a549a23ad97f)

After:
![image](https://github.com/user-attachments/assets/51c9f91b-f8a0-4043-8362-65813feec823)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135079
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084
2024-09-06 06:11:46 +00:00
41e653456e [RDP] Fix "No module named 'libfb’" (#135244)
Summary:
D62215095 introduced an import error in arvr pipelines, as the is_fbcode() function does not work as intended.

This changes is_fbcode() to be a much stricter check.

Test Plan:
```
buck2 run arvr/mode/platform010/opt-stripped //arvr/libraries/depthlink/clients/mr_replay:pipeline_runner -c bolt.use_eva3_sim=True -- --config_file arvr/libraries/depthlink/clients/mr_replay/configs/runner_config.yaml --features DEPTH
```

Differential Revision: D62237502

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135244
Approved by: https://github.com/aorenste
2024-09-06 04:52:31 +00:00
e40a0a9359 Add randomness checking for sdpa vmap (#135176)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135176
Approved by: https://github.com/zou3519
2024-09-06 04:50:49 +00:00
c05a7adb36 [inductor][debug] fix draw_buffers (#135266)
**Before:**
![image](https://github.com/user-attachments/assets/aac756f3-1349-4647-9da3-87cf105cf647)

**After:**
<img width="791" alt="image" src="https://github.com/user-attachments/assets/d72c663c-e598-42fa-ac40-9e58956f1ec1">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135266
Approved by: https://github.com/yf225
2024-09-06 04:12:41 +00:00
5f57be7571 [Distributed] Change function call in test to non-deprecated to eliminate warning (#134938)
Migrate function calls in the test to eliminate the warning messages shown below and reduce the chance of test failures when the deprecated methods are removed; a migration sketch follows the warning message.

-  from deprecated `save_state_dict` change to `save`
-  from deprecated `load_state_dict` change to `load`

Warning message:
```bash
pytorch/test/distributed/checkpoint/test_fsdp_model_state.py:37: FutureWarning: `save_state_dict` is deprecated and will be removed in future versions.Please use `save` instead.

```
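A minimal sketch of the rename, assuming a single-process run and a local filesystem checkpoint directory (the real tests exercise FSDP model state):
```python
import torch
import torch.distributed.checkpoint as dcp

state_dict = {"weight": torch.randn(4, 4)}

# Deprecated calls being replaced:
#   dcp.save_state_dict(state_dict, storage_writer=dcp.FileSystemWriter("ckpt"))
#   dcp.load_state_dict(state_dict, storage_reader=dcp.FileSystemReader("ckpt"))

# Non-deprecated equivalents used by the migrated tests:
dcp.save(state_dict, storage_writer=dcp.FileSystemWriter("ckpt"))
dcp.load(state_dict, storage_reader=dcp.FileSystemReader("ckpt"))
```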

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134938
Approved by: https://github.com/wz337, https://github.com/fegin
2024-09-06 03:25:09 +00:00
29d72c1100 [inductor] check intel compiler minimal version (#135209)
On Windows, early versions of icx have a `-print-file-name` issue and cannot preload correctly for inductor. Add a minimal version check for the Intel compiler.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135209
Approved by: https://github.com/ezyang
2024-09-06 03:21:07 +00:00
3b1a334c0f [Inductor][CPP] Avoid mistake wgt tensor delete (#135100)
**Summary**
Fix issue https://github.com/pytorch/pytorch/issues/134998: previously, we only checked whether the `get_attr` FX node for the weight had a single user node. However, two `get_attr` nodes may share the same tensor, and the tensor should not be deleted in such cases. In this PR, we count the users of the tensor, in addition to the number of users of the nodes, to decide whether the tensor can be deleted.

**TestPlan**
```
 python test/inductor/test_cpu_select_algorithm.py -k test_linear_wgt_multi_users
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135100
Approved by: https://github.com/jgong5
2024-09-06 03:13:36 +00:00
07689a38bf [Inductor] Fix AOT weight alignment issue on CPU (#135205)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/135027. On CPU, the `consts_size` used to generate `_binary_constants_bin_start` is not padded to `ALIGN_BYTES`, while `serialized_weights` is, causing a failure in the 16K alignment check.
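The gist of such a padding fix, as a hedged sketch (the real constant and code live in the AOTInductor C++ wrapper codegen, not here):
```python
ALIGN_BYTES = 64  # illustrative value only

def pad_to_align(consts_size: int, align: int = ALIGN_BYTES) -> int:
    # Round the serialized-constants size up to the next multiple of `align`
    # so that it matches the padded serialized_weights blob.
    return (consts_size + align - 1) // align * align

assert pad_to_align(100) == 128
assert pad_to_align(128) == 128
```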

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135205
Approved by: https://github.com/jgong5, https://github.com/desertfire
2024-09-06 03:06:51 +00:00
06a7dc21c1 Remove dead expect_rational (#135105)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135105
Approved by: https://github.com/malfet
2024-09-06 02:57:27 +00:00
d9a18173fa Report qualname of exception type rather than <class 'RuntimeError'> (#135146)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
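A hypothetical snippet showing the formatting difference the title refers to (not the PR's code):
```python
class CustomError(RuntimeError):
    pass

try:
    raise CustomError("boom")
except Exception as e:
    print(type(e))                # <class '__main__.CustomError'>
    print(type(e).__qualname__)   # CustomError
```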

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135146
Approved by: https://github.com/Skylion007, https://github.com/albanD, https://github.com/yanboliang
ghstack dependencies: #135148, #135145
2024-09-06 02:56:50 +00:00
d8543e3162 Include exception type qualname when rewrapping InternalTorchDynamoError (#135145)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135145
Approved by: https://github.com/drisspg, https://github.com/anijain2305
ghstack dependencies: #135148
2024-09-06 02:56:50 +00:00
ad01fc194d Consolidate raise and rewrap raise error branches (#135148)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135148
Approved by: https://github.com/anijain2305, https://github.com/albanD, https://github.com/yanboliang, https://github.com/malfet
2024-09-06 02:56:46 +00:00
e162414963 add instrumentation of CCA stats for reserved and allocated memory size (#135231)
As titled
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135231
Approved by: https://github.com/c-p-i-o
2024-09-06 02:48:56 +00:00
9e5a797771 Improve test_public_bindings import module error reporting (#135258)
Error was hard to understand without message. Render it now. See https://github.com/pytorch/pytorch/pull/135259 for it in action.

Example failure:

```
2024-09-05T20:04:45.3022000Z FAILED [5.9524s] test_public_bindings.py::TestPublicBindings::test_modules_can_be_imported - AssertionError: String comparison failed: '' != "torch._logging.scribe failed to import w[112 chars].py)"
2024-09-05T20:04:45.3025413Z + torch._logging.scribe failed to import with error ImportError: cannot import name 'TypeAlias' from 'typing' (/opt/conda/envs/py_3.9/lib/python3.9/typing.py)
2024-09-05T20:04:45.3026990Z
```

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135258
Approved by: https://github.com/albanD
2024-09-06 02:40:03 +00:00
b46a1b9e2d Use Python 3.9 on all libtorch jobs (#135245)
Part of the migration py3.8->3.9

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135245
Approved by: https://github.com/izaitsevfb
2024-09-06 02:27:22 +00:00
9688014820 aarch64: extend matmul heuristic checks to all neoverse platforms (#134548)
for aarch64 neoverse platforms there are two gemm backends available
for matmul operator on PyTorch: (1) Arm Compute Library and (2) OpenBLAS.
While Arm Compute Library provides better performance over OpenBLAS,
it has overhead for the kernel launch time, and hence we use OpenBLAS
for smaller tensor compute. The heuristic was originally implemented for
neoverse_v1. This commit extends the heuristic to other neoverse platforms

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134548
Approved by: https://github.com/malfet
2024-09-06 01:40:50 +00:00
8f6e73f068 [ONNX] Enable experimental exporter logic to dynamo_export and support refine dynamic_shapes (#134976)
(1) Enable experimental exporter logic to dynamo_export
(2) Refine dynamic shapes and retry export in export strategies
(3) Delete `torch_export_graph_extractor` and use the new export logic
(4) Disable ExportedProgram test in `test_fx_onnx_with_onnxruntime.py`, as ONNXProgram is different now.

Fixes https://github.com/pytorch/pytorch/issues/126479
Fixes #135183
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134976
Approved by: https://github.com/justinchuby
2024-09-06 01:29:56 +00:00
1e57ef08fa [AOTI] Support MKLDNN qconv ops in cpp wrapper (#134795)
Summary: Similar to https://github.com/pytorch/pytorch/pull/134475, support qconv in the ABI-compatible mode for cpp-wrapper Inductor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134795
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
ghstack dependencies: #134475, #134783
2024-09-06 01:01:53 +00:00
614b86d602 [AOTI] Support MKLDNN qlinear ops in cpp wrapper (#134783)
Summary: Similar to https://github.com/pytorch/pytorch/pull/134475, support qlinear in the ABI-compatible mode for cpp-wrapper Inductor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134783
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
ghstack dependencies: #134475
2024-09-06 01:01:53 +00:00
0b96dfb736 [AOTI] Support MKLDNN conv ops in cpp wrapper (#134475)
Summary: Partially fix https://github.com/pytorch/pytorch/issues/123040. In the ABI-compatible mode, MKLDNN fallback ops do not have C shim implementations and thus need to go through the custom ops launch path. Other MLKDNN ops will be fixed in following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134475
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
2024-09-06 01:01:53 +00:00
62b221d5cc Add Percentages to Function Events (#135155)
Summary: Users have recently asked that the profiler add self/total CPU and device percentages to FunctionEvents so that teams can process the data procedurally. Some of it could be computed mathematically via subroutines, but since we already have the information in _build_table, let's build it there.
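For context, the table in question is the one produced by key_averages().table(); a minimal snippet that generates it (the new percentage columns come from _build_table):
```python
import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    a = torch.randn(256, 256)
    b = torch.randn(256, 256)
    (a @ b).relu_()

print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))
```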

Test Plan: Check that we produce the same table as before, and also that the new fields have the expected values.

Differential Revision: D62210351

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135155
Approved by: https://github.com/shanw-meta, https://github.com/kit1980
2024-09-06 00:39:11 +00:00
66dd4577b1 Track base of FunctionalTensor in inference mode. (#135141)
The idea behind the tracking is the following: whenever we see a tensor, if the tensor is a root tensor (it does not have any view metas), we consider it as the base of all the tensors that share its storage.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135141
Approved by: https://github.com/zou3519
2024-09-06 00:10:25 +00:00
cyy
cc28634172 [Submodule] Bump pybind11 to v2.13.5 (#135202)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135202
Approved by: https://github.com/Skylion007
2024-09-06 00:09:00 +00:00
c83cdf068b [DTensor] Fix view op replicating on tensor dim when the size of the tensor dim = 1 (#135054)
We found a corner case that when a tensor dimension is 1, calling `view(1)` would result in an unexpected replication (see case 1 below). When the tensor dimension to shard is not 1, no matter whether the tensor dimension is evenly-shardable across the mesh dimension, it won't cause an implicit replication behind the scenes if view doesn't change the size of the given tensor dimension (see case 2 and 3).

When the tensor dimension to shard is of size 1, it is not being added to shardable_dims here:
https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/ops/_view_ops.py#L518

```
# uneven case where the size of the tensor dimension to shard is 1
p = torch.randn(1,2)
mesh = init_device_mesh("cuda", (2,))
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(1, 2)
# this would result in replication, meaning t is now replicated across all ranks.

# uneven case where the size of the tensor dimension to shard is not 1
p = torch.randn(3, 2)
mesh = init_device_mesh("cuda", (2,))
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(3, 2) # this would not result in replication.
# this would not result in replication, meaning t stays as sharded.

# even case
p = torch.randn(2,2)
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(2, 2)
# this would not result in replication, meaning t stays as sharded.
```

Differential Revision: [D62155606](https://our.internmc.facebook.com/intern/diff/D62155606)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135054
Approved by: https://github.com/tianyu-l, https://github.com/wanchaol
2024-09-06 00:03:54 +00:00
28ccfba248 [ONNX] Delete ONNXProgramSerializer (#135261)
Fixes #135182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135261
Approved by: https://github.com/justinchuby
2024-09-05 23:52:51 +00:00
b2386bdca1 [debug] Add helper to run cProfile on a function (#135084)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135084
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082
2024-09-05 23:41:30 +00:00
bdfc8d9f96 [fx] Don't use generators in map_aggregate (#135082)
While the generators avoid a copy, they are slow.

Before:
![image](https://github.com/user-attachments/assets/70a55a9a-0595-4105-b0ab-22cf77c7409c)

After:
![image](https://github.com/user-attachments/assets/cecb9c59-ae36-47de-8b08-cab2c7cb3d57)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135082
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076
2024-09-05 23:41:30 +00:00
70779dded8 [fx] Compile time optimization in Node.__update_args_kwargs (#135076)
Before this we took two passes over all of the args.

Before:
![image](https://github.com/user-attachments/assets/24ce5628-03f4-4983-9f2d-5ddf0ca5816e)

After:
![image](https://github.com/user-attachments/assets/c9681aa2-32f0-4f6b-a598-fc6f90ffafb5)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135076
Approved by: https://github.com/Chillee
ghstack dependencies: #135070
2024-09-05 23:41:30 +00:00
ea231300d1 [inductor] Improve compile time regression from MemoryDep.normalize (#135070)
Possible fix for #135056

Before
![image](https://github.com/user-attachments/assets/3962cb85-e808-4fd4-991f-471ff5ef7eae)

After
![image](https://github.com/user-attachments/assets/2322d48d-6518-4518-baca-336027b5cda8)

Measured based on:
```
python benchmarks/dynamo/torchbench.py --ci --accuracy --timing --explain --inductor --device cuda --training --only hf_Bert_large --stats -n1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135070
Approved by: https://github.com/Chillee
2024-09-05 23:41:30 +00:00
8f66995459 Revert "Support rolling over a percentage of workflows (#134816)"
This reverts commit fc890b55b51098437b6149abf1026a8b2aaee389.

Reverted https://github.com/pytorch/pytorch/pull/134816 on behalf of https://github.com/malfet due to Causes lint to intermittently fail ([comment](https://github.com/pytorch/pytorch/pull/134816#issuecomment-2332902609))
2024-09-05 23:39:41 +00:00
144fde4fd2 [MPS] Add support for autocast in MPS (#99272)
Fixes https://github.com/pytorch/pytorch/issues/88415

Need to run inductor/test_cpu_select_algorithm

Pull Request resolved: https://github.com/pytorch/pytorch/pull/99272
Approved by: https://github.com/malfet

Co-authored-by: Siddharth Kotapati <skotapati@apple.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Co-authored-by: Roy Hvaara <roy@lightyear.no>
2024-09-05 23:23:17 +00:00
43f4947d44 fix fake tensor tolist implementation (#135131)
Summary:
When exporting for training with `tolist`, we do not hit `FunctionalTensor.tolist` since we do not functionalize. Unfortunately, this means we hit `FakeTensor.tolist`, which creates unbacked symints that are not backed by proxies.

Rather than trying to patch up this low-level implementation, we replace it with essentially what `FunctionalTensor.tolist` does, which is higher-level: we essentially desugar to `item()` calls and let it take care of unbacked symints.
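Conceptually, the desugaring looks like the following (an illustrative sketch, not the actual implementation):
```python
import torch

def tolist_via_item(t: torch.Tensor):
    # Recursively reduce tolist() to item() calls, which already know how to
    # produce properly tracked unbacked symints under fake tensors.
    if t.dim() == 0:
        return t.item()
    return [tolist_via_item(t[i]) for i in range(t.shape[0])]

print(tolist_via_item(torch.arange(6).reshape(2, 3)))  # [[0, 1, 2], [3, 4, 5]]
```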

Test Plan:
Some expected failures are gone now.
Also found a test for `tolist` that was written when `FunctionalTensor.tolist` was implemented but not really doing much; repurposed it now to exercise more modes.

Differential Revision: D62197742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135131
Approved by: https://github.com/ezyang
2024-09-05 23:20:31 +00:00
65e1c34061 [rfc] scuba for flight recorder (#134794)
Summary: Record flight recorder status in a scuba table.

Test Plan: Testing with timing out a job. Will post results soon.

Differential Revision: D61729221

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134794
Approved by: https://github.com/fduwjj
2024-09-05 23:18:10 +00:00
830247c355 [Intel Triton] Update Intel Triton to release/2.5.0 (#134074)
This PR relands https://github.com/pytorch/pytorch/pull/134053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134074
Approved by: https://github.com/EikanWang
2024-09-05 22:46:31 +00:00
4262755b5a [cond] fix typo in cond codegen (#134708)
As titled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134708
Approved by: https://github.com/jansel
2024-09-05 22:38:24 +00:00
3825607144 Add torch._logging.scribe (#135224)
See https://github.com/pytorch/pytorch/pull/135138 for a usage example. Meta only, see https://docs.google.com/document/d/1JpbAQvRhTmuxjnKKjT7qq57dsnV84nxSLpWJo1abJuE/edit#heading=h.9wi46k7np6xw for context

fbscribelogger is a library that allows us to write to scribe, which is Meta's logging infrastructure, when you have an appropriate access token (this token is available for jobs running on main, as well as authorized jobs with the ci-scribe label). The resulting data is accessible via Scuba (a real time in-memory database) and Hive (a more traditional SQL persisted database).

Here's the motivating use case. Suppose there is somewhere in PyTorch's codebase where you'd like to log an event, and then you'd like to find all the situations where this log is called. If PyTorch is rolled out to our internal users, we have some FB-oriented APIs (like torch._utils_internal.signpost_event) with which you can do this. But you have to actually land your PR to main, wait for it to be ingested to fbcode, and then wait for us to actually roll out this version, before you get any data. But what if you want the results within the next few hours? Instead, you can use torch._logging.scribe to directly write to our logging infrastructure *from inside CI jobs.* The most convenient approach is to log unstructured JSON blobs to `open_source_signpost` (added in this PR; you can also add your own dedicated table as described in the GDoc above). After adding logging code to your code, you can push your PR to CI, add the 'ci-scribe' label, and in a few hours view the results in Scuba, e.g., (Meta-only) https://fburl.com/scuba/torch_open_source_signpost/z2mq8o4l If you want continuous logging on all commits on master, you can land your PR and it will continuously get logged for all CI runs that happen on main.

Eventually, if your dataset is important enough, you can consider collaborating with PyTorch Dev Infra to get the data collected in our public AWS cloud so that OSS users can view it without access to Meta's internal systems. But this facility is really good for prototyping / one-off experiments. It's entirely self-serve: just add your logging, run your PR CI with ci-scribe, get results, do analysis in Scuba.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135224
Approved by: https://github.com/Skylion007
2024-09-05 22:37:13 +00:00
eqy
3c8f71ff93 [cuDNN][64-bit indexing] cuDNN v9.3+ supports non-batch-splittable convolutions with > 2**31 elements (#134890)
For longstanding issues such as #95024

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134890
Approved by: https://github.com/Skylion007
2024-09-05 22:22:45 +00:00
fc890b55b5 Support rolling over a percentage of workflows (#134816)
In order to support adding a rollover percentage, this ended up being a complete rewrite of runner_determinator.py.

Details of the new format are in the comments up top.

On the plus side, this now includes some unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134816
Approved by: https://github.com/PaliC, https://github.com/zxiiro
2024-09-05 22:21:45 +00:00
058a69d91a [fbcode][dynamo] Turn on guard_nn_modules using justknobs_check (#134928)
As Title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134928
Approved by: https://github.com/ezyang
2024-09-05 22:05:54 +00:00
6c5920d515 Tune int8 AMX WoQ micro-kernel for CPU (#134832)
This patch prevents performance regression against the default ATen implementation for LLaMA 3.1 int8 GPTQ WoQ workload.

Uses AMX micro-kernel only if `M` >= `block_m`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134832
Approved by: https://github.com/jgong5
2024-09-05 22:01:14 +00:00
116fd474da [export] Expand coverage to more copied sym ops for unflattener. (#135119)
Test Plan:
buck2 test 'fbcode//mode/opt' fbcode//torchrec/ir/tests:test_serializer -- --run-disabled

```
File changed: fbcode//caffe2/torch/export/unflatten.py
Buck UI: https://www.internalfb.com/buck2/2e0377e7-e2b6-4bd0-8133-a787245165a0
Test UI: https://www.internalfb.com/intern/testinfra/testrun/5066549824883887
Network: Up: 0B  Down: 0B
Jobs completed: 16. Time elapsed: 10.2s.
Tests finished: Pass 6. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Differential Revision: D62190172

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135119
Approved by: https://github.com/yushangdi
2024-09-05 21:58:20 +00:00
a5d70cf545 [PyTorch] Add isfinite to BFloat16-math.h (#135052)
Missing function from <cmath>.

Differential Revision: [D62148884](https://our.internmc.facebook.com/intern/diff/D62148884/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135052
Approved by: https://github.com/PaliC, https://github.com/albanD
ghstack dependencies: #135031
2024-09-05 21:50:36 +00:00
7fe819d917 [PyTorch] Fix -Wshadow -Werror build in BFloat16-inl.h (#135031)
`float_t` is required to exists in C99 math.h, which causes -Wshadow to fire. We don't need the alias, fortunately.

Differential Revision: [D62135908](https://our.internmc.facebook.com/intern/diff/D62135908/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135031
Approved by: https://github.com/albanD
2024-09-05 21:48:21 +00:00
f63571060c Revert "Use actions/upload-artifact@v4.4.0 for rest of workflows (#135264)"
This reverts commit 9c0b03020b7204ca5d5dbe18174bab005f79c47b.

Reverted https://github.com/pytorch/pytorch/pull/135264 on behalf of https://github.com/atalman due to broke CI ([comment](https://github.com/pytorch/pytorch/pull/135264#issuecomment-2332674607))
2024-09-05 21:43:05 +00:00
38fead8f7c [hop] preserve metadata in re-tracing hop subgraph by running with interpreter (#135159)
In this way, interpreter.run can correctly preserve the current metadata of the subgraphs when tracing them.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135159
Approved by: https://github.com/tugsbayasgalan
2024-09-05 21:36:56 +00:00
24a223c49d Run inductor micro benchmark on x86 metal runner (#135042)
This enables inductor micro benchmark on CPU (x86):

* Running on AWS metal runner for more accurate benchmark
* I add a new `arch` column, which will be either x86_64 or arm64 for CPU or GPU name for GPU.  We can use this later to differentiate between different setup, i.e. cuda (a100) vs cuda (a10g) or cpu (x86_64) vs cpu (arm64)

The next step would be to run this one cpu arm64, and cuda (a10g).

### Testing
Here is the CSV results from my test run https://github.com/pytorch/pytorch/actions/runs/10709344180

```
name,metric,target,actual,dtype,device,arch,is_model
mlp_layer_norm_gelu,flops_utilization,0.8,17.36,bfloat16,cpu,x86_64,False
gather_gemv,memory_bandwidth(GB/s),990,170.80,int8,cpu,x86_64,False
gather_gemv,memory_bandwidth(GB/s),1060,204.78,bfloat16,cpu,x86_64,False
Mixtral-8x7B-v0.1,token_per_sec,175,26.68,int8,cpu,x86_64,True
Mixtral-8x7B-v0.1,memory_bandwidth(GB/s),1130,171.91,int8,cpu,x86_64,True
Mixtral-8x7B-v0.1,compilation_time(s),162,47.36,int8,cpu,x86_64,True
gemv,memory_bandwidth(GB/s),870,236.36,int8,cpu,x86_64,False
gemv,memory_bandwidth(GB/s),990,305.71,bfloat16,cpu,x86_64,False
Llama-2-7b-chat-hf,token_per_sec,94,14.01,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,memory_bandwidth(GB/s),1253,185.18,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,compilation_time(s),162,74.99,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,token_per_sec,144,25.09,int8,cpu,x86_64,True
Llama-2-7b-chat-hf,memory_bandwidth(GB/s),957,165.83,int8,cpu,x86_64,True
Llama-2-7b-chat-hf,compilation_time(s),172,70.69,int8,cpu,x86_64,True
layer_norm,memory_bandwidth(GB/s),950,172.03,bfloat16,cpu,x86_64,False
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135042
Approved by: https://github.com/yanboliang
2024-09-05 21:31:36 +00:00
e4920a1364 [Traceable FSDP2][Dynamo] allow tracing through auto_functionalized HOP (#135169)
If an `auto_functionalized` HOP is included in the backward graph due to activation checkpointing, we run into a scenario where Compiled Autograd's Dynamo tracing needs to trace through the `auto_functionalized` HOP. This PR adds support for that.

Test commands:
- `pytest -rA test/inductor/test_compiled_autograd.py::TestCompiledAutograd::test_trace_auto_functionalized`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135169
Approved by: https://github.com/zou3519
2024-09-05 21:22:45 +00:00
bc5ecf83d7 [training ir migration] Fix quantization tests (#135184)
Summary:
Fixed some quantization tests for new training ir:

Fix batch norm node pattern matcher. In training ir, we have `aten.batch_norm` node instead of `aten._native_batch_norm_legit` and `aten._native_batch_norm_legit_no_training`.

Test Plan:
```
buck run fbcode//mode/dev-nosan fbcode//caffe2/test:quantization_pt2e
```

Reviewed By: tugsbayasgalan

Differential Revision: D62209819

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135184
Approved by: https://github.com/tugsbayasgalan
2024-09-05 21:19:28 +00:00
e55c0f59e5 Revert "[Reland] Refactor caching device allocator utils (#130923)"
This reverts commit 9809080b9ed657a8c0ea0383be7cbdce3a26e05e.

Reverted https://github.com/pytorch/pytorch/pull/130923 on behalf of https://github.com/kit1980 due to breaking internal builds - Error: Relocation overflow has occured ([comment](https://github.com/pytorch/pytorch/pull/130923#issuecomment-2332640961))
2024-09-05 21:16:14 +00:00
a4cf9653ee Revert "Remove Caffe2 code from tool scripts (#134941)"
This reverts commit c818ecd1698a28d9fadf4a81453a89914b18374a.

Reverted https://github.com/pytorch/pytorch/pull/134941 on behalf of https://github.com/kit1980 due to breaking internal builds - The path `caffe2/operators/hip/gather_op.cuh` does not exist ([comment](https://github.com/pytorch/pytorch/pull/134941#issuecomment-2332636624))
2024-09-05 21:12:54 +00:00
9c0b03020b Use actions/upload-artifact@v4.4.0 for rest of workflows (#135264)
To be consistent with https://github.com/pytorch/pytorch/pull/135263 and rest of workflows. Use v4.4.0.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135264
Approved by: https://github.com/kit1980, https://github.com/malfet
2024-09-05 21:05:06 +00:00
034717a029 [ROCm] remove triton-rocm commit pin and merge pins with triton.txt (#133438)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133438
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2024-09-05 20:36:45 +00:00
9c38b00999 [export] Add ability to run eagerly on UnflattenedModule (#133996)
Summary:
Added the contextmanager `_disable_interpreter`, which is meant to be put around a call to `unflatten`. This will generate an UnflattenedModule and sub-InterpreterModules which will not use torch.fx.Interpreter to run eagerly. We want to have this as a state of the module instead of a contextmanager around running the module, because it's not clear where we are calling the unflattened module.

This seems to improve the performance: https://fb.workplace.com/groups/1075192433118967/posts/1473590629945810/?comment_id=1473621763276030

Test Plan: CI

Differential Revision: D60939034

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133996
Approved by: https://github.com/pianpwk
2024-09-05 20:28:42 +00:00
8efe547046 Use actions/upload-artifact@v4.4.0 for triton builds (#135263)
Same as: https://github.com/pytorch/pytorch/pull/135139
Fixes upload failure: https://github.com/pytorch/pytorch/actions/runs/10722567217/job/29748125015
fix regression introduced by https://github.com/pytorch/pytorch/pull/135068

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135263
Approved by: https://github.com/kit1980, https://github.com/huydhn
2024-09-05 20:03:39 +00:00
82d00acfee Allow cross-device copies for cpu scalars in refs (#135140)
This copies our eager-mode behavior where someone can do torch.add(a, b, out=c)
where a and b are CPU scalar tensors and c is a CUDA tensor.
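Concretely, the behavior being preserved (requires a CUDA device to run):
```python
import torch

a = torch.tensor(1.0)               # CPU scalar tensor
b = torch.tensor(2.0)               # CPU scalar tensor
c = torch.empty((), device="cuda")  # CUDA output tensor
torch.add(a, b, out=c)              # eager allows this cross-device copy
print(c)  # tensor(3., device='cuda:0')
```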

Fixes https://github.com/pytorch/pytorch/issues/121619 by side effect (we get into a situation where we're writing a CPU scalar into a FakeTensor that is actually a meta tensor)

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135140
Approved by: https://github.com/williamwen42, https://github.com/yanboliang
2024-09-05 19:08:48 +00:00
098431a29d Update Resize.cpp with new device type (#135117)
Update Resize.cpp with new device type

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135117
Approved by: https://github.com/egienvalue
2024-09-05 18:53:13 +00:00
be660ea2d3 [PT2] Directly set meta.val in group_batch_fusion_aten (#135078)
Summary: instead of using FakeTensorProp after the pass

Differential Revision: D62162640

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135078
Approved by: https://github.com/frank-wei
2024-09-05 18:17:06 +00:00
52c7c89ea4 [Inductor][CPP] Leverage full bits for BF16/FP16 vectorization (#126502)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126502
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-05 17:17:46 +00:00
1efd341d15 [fake_tensor] Move unrecognized_type NotImplemented before ConstProp (#135033)
We should not try to do ConstProp on the unrecognized types (e.g. Subclasses).
For those types, throwing NotImplemented will jump to the next torch_dispatch.

Test:
```
 python test/functorch/test_aotdispatch.py -k test_aot_test_subclasses_with_tensor_factories
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135033
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
2024-09-05 17:09:41 +00:00
a096f2899d Add torch.serialization.skip_data context manager (#134504)
## Semantic

The semantic is
(1) By default `torch.serialization.skip_data(materialize_fake_tensors=False)` will make `torch.save` skip writing storages (but reserve space for them in the checkpoint).

```python
import torch
import torch.nn as nn

sd = nn.Linear(3, 5).state_dict()
with torch.serialization.skip_data():
    torch.save(sd, 'foo.pt')
print(torch.load('foo.pt', weights_only=True))
```

(2) With `torch.serialization.skip_data(materialize_fake_tensors=True)`, if a FakeTensor is passed to `torch.save`, the pickler will treat these FakeTensors as being "materialized": space will be reserved in the checkpoint for the associated storage bytes, and when loading, the type will be Tensor instead of FakeTensor.

```python
import torch
import torch.nn as nn
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    m = nn.Linear(3, 5, dtype=torch.float16, device='cuda')

sd = m.state_dict()
with torch.serialization.skip_data(materialize_fake_tensors=True):
    torch.save(sd, 'bla.pt')
print(torch.load('bla.pt', weights_only=True))
# OrderedDict([('weight', tensor([[0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.]], device='cuda:0', dtype=torch.float16)), ('bias', tensor([0., 0., 0., 0., 0.], device='cuda:0', dtype=torch.float16))])

```

## Follow Ups

- [ ] `torch.load` semantic for skip_data context manager
- [ ] Mechanism for getting offsets of storages saved via this method (for writing in a separate pass)

Differential Revision: [D62238610](https://our.internmc.facebook.com/intern/diff/D62238610)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134504
Approved by: https://github.com/albanD
2024-09-05 16:53:39 +00:00
dbeb8a1691 Render log filepaths that are not anchored in torch's directory in a reasonable way (#135165)
For example, if I do TORCH_LOGS=fbscribelogger I'll get:

```
I0904 17:59:07.567000 3672513 fbscribelogger/__init__.py:161] stop
```

instead of

```
I0904 12:46:15.332000 2930287 ../../../../../home/ezyang/local/a/pytorch-env/lib/python3.10/site-packages/fbscribelogger/__init__.py:161] stop
```

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135165
Approved by: https://github.com/Skylion007
2024-09-05 16:48:09 +00:00
b1f72e2984 Gradient scaler for DTensor (#132816)
Solve the request [here](https://github.com/pytorch/pytorch/issues/120003#issuecomment-2248805798).
Enable DTensor input in the gradient scaler's APIs, especially `.unscale_()`.
A related dispatch strategy is added to accept DTensor input.

To allow `found_inf` to be reduced across devices, we add an allreduce over the args at dispatch, after the dispatch strategy and kernel.
Since `aten._amp_foreach_non_finite_check_and_unscale_.default` is an in-place op, `grad_scale` (arg[0]) will be modified in place, so redesigning the strategy or refactoring the kernel would not help.

The test files cover the following, under both 1-d (dp) and 2-d (dp, tp) cases:
1. whether the non-inf values are unscaled
2. whether all DTensors on each device can find inf, even when it is not on their device
3. if inf is not found, whether new parameters are generated
4. if inf is found, whether the scale is updated

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132816
Approved by: https://github.com/XilunWu, https://github.com/weifengpy, https://github.com/wanchaol
2024-09-05 16:44:32 +00:00
bb3c2408f4 [inductor][test] in test_unbacked_symints, replace inductor's skipCUDAIf with common device type's skipcudaif (#133936)
Differential Revision: D61506212

Use `skipCUDAIf` from `torch.testing._internal.common_device_type` if we create the test class with `instantiate_device_type_tests`.

`instantiate_device_type_tests` would make sure the class has attr device_type, which works with`skipCUDAIf` from `torch.testing._internal.common_device_type`.

Also skipping test_vertical_pointwise_reduction_fusion for cpu test class, since the test expects cuda.

FAILED [0.0026s] test/inductor/test_unbacked_symints.py::TestUnbackedSymintsCPU::test_vertical_pointwise_reduction_fusion_cpu - AttributeError: 'TestUnbackedSymintsCPU' object has no attribute 'device'

repro:
```
CUDA_VISIBLE_DEVICES="" pytest test/inductor/test_unbacked_symints.py -k cpu -v
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133936
Approved by: https://github.com/ColinPeppler, https://github.com/desertfire
2024-09-05 16:40:14 +00:00
2c99f17a32 Implement VariableTracker.python_type() (#134215)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134215
Approved by: https://github.com/amjames, https://github.com/jansel
2024-09-05 16:35:47 +00:00
0043dcd79e Switch torch pt2e xnnpack tests to use export_for_training (#134788)
Migrate all the callsites inside the pt2e XNNPACK tests to use export_for_training.

Differential Revision: D61994553

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134788
Approved by: https://github.com/mergennachin
2024-09-05 16:11:18 +00:00
2e2fb668fa Upgrade expecttest to 0.2.1 (#135136)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135136
Approved by: https://github.com/albanD, https://github.com/atalman, https://github.com/Skylion007
2024-09-05 16:05:35 +00:00
9d24f945ba [CI] Use larger instance for building triton whl (#135201)
"Build Triton Wheels" CI jobs were failing due to a lack of resources. This PR uses a larger runner to avoid these issues.

The failure message is like:

```
Process completed with exit code 137.
```

Related running actions:
Failed actions: https://github.com/pytorch/pytorch/actions/runs/10714445036
Success actions: https://github.com/pytorch/pytorch/actions/runs/10716710830

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135201
Approved by: https://github.com/chuanqi129, https://github.com/atalman
2024-09-05 14:36:23 +00:00
ecbd715363 [Intel GPU][Windows] Fix overriding default CMAKE_CXX_FLAGS (#135093)
The root cause is that `/EHsc` is part of the default `CMAKE_CXX_FLAGS` in CMake.
Fix to not override the default `CMAKE_CXX_FLAGS`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135093
Approved by: https://github.com/EikanWang, https://github.com/atalman
2024-09-05 12:52:43 +00:00
58f2477a26 [Dynamo] Support builtin function frozenset (#134563)
Support builtin function frozenset in dynamo
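A minimal example of the kind of code this lets dynamo trace (a hedged sketch; the constants are arbitrary, and the assumption is that the builtin `frozenset` call no longer causes a graph break):
```python
import torch

@torch.compile(fullgraph=True)
def f(x):
    s = frozenset({1, 2, 3})  # builtin frozenset call traced by dynamo
    return x + len(s)

print(f(torch.randn(4)))
```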

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134563
Approved by: https://github.com/anijain2305, https://github.com/EikanWang, https://github.com/jansel
2024-09-05 12:15:10 +00:00
43dcb4bb61 Revise CPU vectorization ISA support API (#135075)
Revising (mostly renaming) CPU vectorization ISA support API (non-frontend-user-facing). Also added AVX512_BF16 ISA detection API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135075
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/ezyang
2024-09-05 12:14:56 +00:00
50d1e37079 [AOTI] Fix a unbacked symint retrieve bug (#134670)
Summary: Fix https://github.com/pytorch/pytorch/issues/134081. When a unbacked symint is computed as the shape of a tensor from a tuple, generated C++ code needs to use std::get<> to extract the tensor.

Differential Revision: [D62142113](https://our.internmc.facebook.com/intern/diff/D62142113)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134670
Approved by: https://github.com/angelayi, https://github.com/22quinn, https://github.com/chenyang78
2024-09-05 11:34:14 +00:00
b99ef1a02e Update torch-xpu-ops pin (ATen XPU implementation) (#135185)
Release cycle for PyTorch 2.5
1. Update specific AOT targets for Windows. On Windows, AOT target list prefers Intel client GPUs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135185
Approved by: https://github.com/EikanWang
2024-09-05 10:05:23 +00:00
8a5c8e5db9 Update unbacked symints in masked_select more precisely (#134899)
## Summary
At the moment, the fake impl for `masked_select` simply sets the upper bound of its size-like SymInt's range to `sys.maxsize` (9223372036854775807, the maximum value of a signed int64) if there are any SymInts in the original input tensor shape. This PR constrains the range more intelligently by using the upper bounds of each SymInt in the input tensor shape.

This solves an issue where a model being lowered to Executorch errors during memory planning, because the memory allocated for `masked_select` ended up exceeding the 64-bit address space (`INT_MAX * size(dtype)`).
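The bound computation amounts to something like the following sketch (illustrative only; the real code works with the symbolic shape environment):
```python
import math

def masked_select_numel_upper_bound(dim_upper_bounds):
    # masked_select returns at most numel(input) elements, so cap the
    # unbacked output size by the product of each dimension's upper bound
    # instead of falling back to sys.maxsize.
    return math.prod(dim_upper_bounds)

assert masked_select_numel_upper_bound([8, 128, 1024]) == 8 * 128 * 1024
```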

## Test plan
- Passes existing unit tests (tests the case where the upper bound is inf)
- Added unit test to verify upper bound reduction calculation
- Tested end-to-end by exporting with TORCH_LOGS="export" and ensuring that the range for `masked_select`'s SymInt size has the correct upper bound
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134899
Approved by: https://github.com/ezyang
2024-09-05 09:01:06 +00:00
c7328dff7f Enhance the stability of the complex divide code (#134647)
In C++, when a floating-point literal (e.g., 3.14) is compared with a variable of type float, the literal is interpreted as a double by default, so the float operand is promoted to double for the comparison.
```c++
float f = 3.14f;
if (f == 3.14) {  // 3.14 is a double literal, so f is promoted to double for this comparison
    // Do something
}
```
If the device does not support double, this promotion causes an error.
This PR addresses complex64 division errors on machines that do not support double operations.
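For reference, a minimal sketch (hypothetical, assuming a backend without float64 support, e.g. some integrated GPUs) of the operation this affects; on CPU it always works, and the fix makes the same complex64 division robust on double-less devices:

```python
import torch

# complex64 division; per this PR's description, the divide code previously
# relied on double-typed literals, which can fail on devices without float64.
a = torch.tensor([1 + 2j, 3 - 1j], dtype=torch.complex64)
b = torch.tensor([2 - 1j, 1 + 1j], dtype=torch.complex64)
print(a / b)
```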

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134647
Approved by: https://github.com/EikanWang, https://github.com/albanD
2024-09-05 08:36:37 +00:00
700 changed files with 21679 additions and 10067 deletions

View File

@ -1,5 +1,5 @@
0.6b
0.7b
manylinux_2_17
rocm6.2
7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
9be04068c3c0857a4cfd17d7e39e71d0423ebac2
3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291

View File

@ -108,10 +108,10 @@ ENV CMAKE_C_COMPILER cc
ENV CMAKE_CXX_COMPILER c++
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
# Install AOTriton (Early fail)
COPY ./aotriton_version.txt aotriton_version.txt

View File

@ -1 +0,0 @@
21eae954efa5bf584da70324b640288c3ee7aede

View File

@ -1 +1 @@
1b2f15840e0d70eec50d84c7a0575cb835524def
91b14bf5593cf58a8541f3e6b9125600a867d4ef

View File

@ -1 +1 @@
dedb7bdf339a3546896d4820366ca562c586bfa0
5fe38ffd73c2ac6ed6323b554205186696631c6f

View File

@ -4,12 +4,12 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.bz2'
TARBALL='aotriton.tar.gz'
# This read command alwasy returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects

View File

@ -12,10 +12,7 @@ conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
}
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${XPU_VERSION}" ]; then
if [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else

View File

@ -30,9 +30,14 @@ dill==0.3.7
#Pinned versions: 0.3.7
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
expecttest==0.1.6
expecttest==0.2.1
#Description: method for writing tests where test framework auto populates
# the expected output based on previous runs
#Pinned versions: 0.2.1
#test that import:
fbscribelogger==0.1.6
#Description: write to scribe from authenticated jobs on CI
#Pinned versions: 0.1.6
#test that import:
@ -332,3 +337,8 @@ onnxscript==0.1.0.dev20240817
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:
parameterized==0.8.1
#Description: Parameterizes unittests, both the tests themselves and the entire testing class
#Pinned versions:
#test that import:

View File

@ -1 +1 @@
3.0.0
3.1.0

View File

@ -100,10 +100,10 @@ ARG TRITON
# try to reach out to S3, which docker build runners don't have access
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt

View File

@ -596,6 +596,9 @@ test_single_dynamo_benchmark() {
test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
test_inductor_set_cpu_affinity
fi
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}

View File

@ -43,6 +43,9 @@ python -m pip install z3-solver==4.12.2.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.25
# Install parameterized
python -m pip install parameterized==0.8.1
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -119,6 +119,11 @@ fi
# Test the package
/builder/check_binary.sh
if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then
# Exclude s390, xpu, rocm and libtorch builds from smoke testing
python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled
fi
# Clean temp files
cd /builder && git clean -ffdx

View File

@ -90,7 +90,7 @@ fi
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then

View File

@ -86,6 +86,18 @@
- pull
- inductor
- name: OSS CI / pytorchbot / slow tests
patterns:
- test/slow_tests.json
approved_by:
- pytorchbot
ignore_flaky_failures: false
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- slow
- name: OSS CI /pytorchbot / Executorch
patterns:
- .ci/docker/ci_commit_pins/executorch.txt

View File

@ -9,6 +9,7 @@ ciflow_push_tags:
- ciflow/inductor-rocm
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps

View File

@ -1,6 +1,7 @@
boto3==1.19.12
hypothesis==6.56.4
expecttest==0.1.6
expecttest==0.2.1
fbscribelogger==0.1.6
librosa>=0.6.2
mpmath==1.3.0
networkx==2.8.7
@ -30,3 +31,4 @@ optree==0.12.1
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
# which the stringify metadata is wrong when escaping double quote
protobuf==3.20.2
parameterized==0.8.1

View File

@ -15,9 +15,7 @@ REPO_DIR = SCRIPT_DIR.parent.parent
def read_triton_pin(device: str = "cuda") -> str:
triton_file = "triton.txt"
if device == "rocm":
triton_file = "triton-rocm.txt"
elif device == "xpu":
if device == "xpu":
triton_file = "triton-xpu.txt"
with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
return f.read().strip()

View File

@ -325,6 +325,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[List[str]] = None,
python_versions: Optional[List[str]] = None,
use_split_build: bool = False,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -371,7 +372,17 @@ def generate_wheels_matrix(
) and python_version == "3.13":
continue
if use_split_build and (
arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux"
):
raise RuntimeError(
"Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.4", "12.1", "11.8"]
and os == "linux"
@ -385,6 +396,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
@ -400,7 +412,8 @@ def generate_wheels_matrix(
),
}
)
if arch_version != "cuda-aarch64":
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
@ -409,40 +422,16 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"use_split_build": "True" if use_split_build else "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -452,6 +441,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": (
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
),
@ -467,6 +457,7 @@ def generate_wheels_matrix(
),
}
)
return ret

View File

@ -61,6 +61,7 @@ class BinaryBuildWorkflow:
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
def __post_init__(self) -> None:
if self.abi_version:
@ -69,12 +70,20 @@ class BinaryBuildWorkflow:
)
else:
self.build_environment = f"{self.os}-binary-{self.package_type}"
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}.yml"
)
if self.use_split_build:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}"
)
with open(output_file_path, "w") as output_file:
GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
@ -110,6 +119,20 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
use_split_build=True,
arches=["11.8", "12.1", "12.4", "cpu"],
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
),
use_split_build=True,
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="conda",
@ -162,6 +185,21 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
),
branches="main",
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["11.8", "12.1", "12.4"],
python_versions=["3.9"],
use_split_build=True,
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_PERIODIC},
),
branches="main",
use_split_build=True,
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",

View File

@ -3,49 +3,94 @@
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
which users will get their jobs to run on experimental runners. This user list
is also a comma separated list of additional features or experiments which the
user could be opted in to.
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
of which runners should be used to run which job.
The configuration has two parts, the settings and a list of opted-in users,
separated by a line containing "---". If the line is not present, the
settings are considered to be empty with only the second part, the user
list, defined.
The first part is a YAML block that defines the rollout settings. This can be
used to define any settings that are needed to determine which runners to use.
It's fields are defined by the RolloutSettings class below.
The second part is a list of users who are explicitly opted in to the LF fleet.
The user list is also a comma separated list of additional features or
experiments which the user could be opted in to.
The user list has the following rules:
- Users are GitHub usernames with the @ prefix
- If the first line is a "*" then all users will use the new runners
- If the first line is a "!" then all users will use the old runners
- Users are GitHub usernames, which must start with the @ prefix
- Each user is also a comma-separated list of features/experiments to enable
- A "#" prefix indicates the user is opted out of the new runners but is opting
into features/experiments.
- A "#" prefix opts the user out of all experiments
Example user list:
Example config:
# A list of experiments that can be opted into.
# This defines the behavior they'll induce when opted into.
# Expected syntax is:
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
@User1
@User2,amz2023
#@UserOptOutOfNewRunner,amz2023
experiments:
lf:
rollout_percent: 25
---
# Opt-ins:
# Users can opt into the LF fleet by adding their GitHub username to this list
# and specifying experiments to enable in a comma-separated list.
# Experiments should be from the above list.
@User1,lf,split_build
@User2,lf
@User3,split_build
"""
import logging
import os
import random
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
import yaml
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
DEFAULT_LABEL_PREFIX = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
RUNNER_AMI_LEGACY = ""
RUNNER_AMI_AMZ2023 = "amz2023"
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_AMI = "runner-ami"
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
SETTING_EXPERIMENTS = "experiments"
LF_FLEET_EXPERIMENT = "lf"
CANARY_FLEET_SUFFIX = ".c"
class Experiment(NamedTuple):
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
# Add more fields as needed
class Settings(NamedTuple):
"""
Settings for the experiments that can be opted into.
"""
experiments: Dict[str, Experiment] = {}
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
@ -172,85 +217,180 @@ def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
"""
Determines if the job should run on the LF fleet or the Meta fleet
Returns:
The appropriate label prefix for the runner, corresponding to the fleet to use.
This gets prefixed to the very start of the runner label.
"""
def load_yaml(yaml_text: str) -> Any:
try:
if rollout_state[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif rollout_state[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ").split(",")[0]
for usr_raw in rollout_state.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
data = yaml.safe_load(yaml_text)
return data
except yaml.YAMLError as exc:
log.exception("Error loading YAML")
raise
def get_optin_feature(
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
"""
Extracts the text with settings, if any, and the opted in users from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the rest is the users.
"""
rollout_state_parts = rollout_state.split("---")
if len(rollout_state_parts) >= 2:
return rollout_state_parts[0], rollout_state_parts[1]
else:
return "", rollout_state
class UserOptins(Dict[str, List[str]]):
"""
Dictionary of users with a list of features they have opted into
"""
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
"""
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
- Example line: "@User1,lf,split_build"
- A "#" prefix indicates the user is opted out of all experiments
"""
optins = UserOptins()
for user in user_optin_text.split("\n"):
user = user.strip("\r\n\t -")
if not user or not user.startswith("@"):
# Not a valid user. Skip
continue
if user:
usr_name = user.split(",")[0].strip("@")
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
return optins
def parse_settings_from_text(settings_text: str) -> Settings:
"""
Parse the experiments from the issue body into a list of ExperimentSettings
"""
try:
if settings_text:
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
# for easy reading
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
# the backtick character in shell commands.
backtick = chr(96) # backtick character
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
settings = load_yaml(settings_text)
# For now we just load experiments. We can expand this if/when we add more settings
experiments = {}
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
valid_settings = {}
for setting in exp_settings:
if setting not in Experiment._fields:
log.warning(
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
)
else:
valid_settings[setting] = exp_settings[setting]
experiments[exp_name] = Experiment(**valid_settings)
return Settings(experiments)
except Exception:
log.exception("Failed to parse settings")
return Settings()
def parse_settings(rollout_state: str) -> Settings:
"""
Parse settings, if any, from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the default values are used.
"""
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
return parse_settings_from_text(settings_text)
def parse_users(rollout_state: str) -> UserOptins:
"""
Parse users from the rollout state.
"""
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
return parse_user_opt_in_from_text(users_text)
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
"""
Check if a user is opted into an experiment
"""
return experiment_name in user_optins.get(user, [])
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
) -> str:
"""
Used to dynamically opt in jobs to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variant name is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
except Exception as e:
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
@ -268,9 +408,10 @@ def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
@ -285,35 +426,18 @@ def main() -> None:
args.github_branch,
)
label_type = get_fleet(
rollout_state,
(
args.github_issue_owner,
username,
),
)
runner_ami = get_optin_feature(
rollout_state=rollout_state,
workflow_requestors=(
args.github_issue_owner,
username,
),
feature=RUNNER_AMI_AMZ2023,
fallback=RUNNER_AMI_LEGACY,
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
# For Canary builds use canary runners
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
label_type = WORKFLOW_LABEL_LF_CANARY
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
if __name__ == "__main__":

View File

@ -51,6 +51,8 @@ def main() -> None:
for platform_image in platform_images: # type: ignore[attr-defined]
for arch in platform_image.keys(): # type: ignore[attr-defined]
if arch == "cpu-s390x":
continue
tag_image(
platform_image[arch], # type: ignore[index]
default_tag,

View File

@ -0,0 +1,237 @@
from unittest import main, TestCase
from unittest.mock import Mock, patch
import runner_determinator as rd
class TestRunnerDeterminatorIssueParser(TestCase):
def test_parse_settings(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
settings = rd.parse_settings(settings_text)
self.assertTupleEqual(
rd.Experiment(rollout_perc=25),
settings.experiments["lf"],
"lf settings not parsed correctly",
)
self.assertTupleEqual(
rd.Experiment(rollout_perc=0),
settings.experiments["otherExp"],
"otherExp settings not parsed correctly",
)
def test_parse_settings_in_code_block(self) -> None:
settings_text = """
```
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 0
```
---
Users:
@User1,lf
@User2,lf,otherExp
"""
settings = rd.parse_settings(settings_text)
self.assertTupleEqual(
rd.Experiment(rollout_perc=25),
settings.experiments["lf"],
"lf settings not parsed correctly",
)
self.assertTupleEqual(
rd.Experiment(rollout_perc=0),
settings.experiments["otherExp"],
"otherExp settings not parsed correctly",
)
def test_parse_users(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
users = rd.parse_users(settings_text)
self.assertDictEqual(
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
users,
"Users not parsed correctly",
)
def test_parse_users_without_settings(self) -> None:
settings_text = """
@User1,lf
@User2,lf,otherExp
"""
users = rd.parse_users(settings_text)
self.assertDictEqual(
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
users,
"Users not parsed correctly",
)
class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
def test_opted_in_user(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")
def test_opted_in_user_two_experiments(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
@patch("random.uniform", return_value=50)
def test_opted_out_user(self, mock_uniform: Mock) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 25
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User3"])
self.assertEqual("", prefix, "Runner prefix not correct for user")
@patch("random.uniform", return_value=10)
def test_opted_out_user_was_pulled_in_by_rollout(self, mock_uniform: Mock) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 25
---
Users:
@User1,lf
@User2,lf,otherExp
"""
# User3 is opted out, but is pulled into both experiments by the 10% rollout
prefix = rd.get_runner_prefix(settings_text, ["User3"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_lf_prefix_always_comes_first(self) -> None:
settings_text = """
experiments:
otherExp:
rollout_perc: 0
lf:
rollout_perc: 0
---
Users:
@User1,lf
@User2,otherExp,lf
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_ignores_commented_users(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
#@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("", prefix, "Runner prefix not correct for user")
def test_ignores_extra_experiments(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
foo:
rollout_perc: 0
---
Users:
@User1,lf,otherExp,foo
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
if __name__ == "__main__":
main()

View File

@ -45,7 +45,7 @@
{%- if is_windows %}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
{%- endif %}
{%- else %}

View File

@ -62,49 +62,94 @@ jobs:
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
which users will get their jobs to run on experimental runners. This user list
is also a comma separated list of additional features or experiments which the
user could be opted in to.
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
of which runners should be used to run which job.
The configuration has two parts, the settings and a list of opted-in users,
separated by a line containing "---". If the line is not present, the
settings are considered to be empty with only the second part, the user
list, defined.
The first part is a YAML block that defines the rollout settings. This can be
used to define any settings that are needed to determine which runners to use.
It's fields are defined by the RolloutSettings class below.
The second part is a list of users who are explicitly opted in to the LF fleet.
The user list is also a comma separated list of additional features or
experiments which the user could be opted in to.
The user list has the following rules:
- Users are GitHub usernames with the @ prefix
- If the first line is a "*" then all users will use the new runners
- If the first line is a "!" then all users will use the old runners
- Users are GitHub usernames, which must start with the @ prefix
- Each user is also a comma-separated list of features/experiments to enable
- A "#" prefix indicates the user is opted out of the new runners but is opting
into features/experiments.
- A "#" prefix opts the user out of all experiments
Example user list:
Example config:
# A list of experiments that can be opted into.
# This defines the behavior they'll induce when opted into.
# Expected syntax is:
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
@User1
@User2,amz2023
#@UserOptOutOfNewRunner,amz2023
experiments:
lf:
rollout_percent: 25
---
# Opt-ins:
# Users can opt into the LF fleet by adding their GitHub username to this list
# and specifying experiments to enable in a comma-separated list.
# Experiments should be from the above list.
@User1,lf,split_build
@User2,lf
@User3,split_build
"""
import logging
import os
import random
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
import yaml
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
DEFAULT_LABEL_PREFIX = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
RUNNER_AMI_LEGACY = ""
RUNNER_AMI_AMZ2023 = "amz2023"
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_AMI = "runner-ami"
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
SETTING_EXPERIMENTS = "experiments"
LF_FLEET_EXPERIMENT = "lf"
CANARY_FLEET_SUFFIX = ".c"
class Experiment(NamedTuple):
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
# Add more fields as needed
class Settings(NamedTuple):
"""
Settings for the experiments that can be opted into.
"""
experiments: Dict[str, Experiment] = {}
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
@ -231,85 +276,180 @@ jobs:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
"""
Determines if the job should run on the LF fleet or the Meta fleet
Returns:
The appropriate label prefix for the runner, corresponding to the fleet to use.
This gets prefixed to the very start of the runner label.
"""
def load_yaml(yaml_text: str) -> Any:
try:
if rollout_state[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif rollout_state[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ").split(",")[0]
for usr_raw in rollout_state.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
data = yaml.safe_load(yaml_text)
return data
except yaml.YAMLError as exc:
log.exception("Error loading YAML")
raise
def get_optin_feature(
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
"""
Extracts the text with settings, if any, and the opted in users from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the rest is the users.
"""
rollout_state_parts = rollout_state.split("---")
if len(rollout_state_parts) >= 2:
return rollout_state_parts[0], rollout_state_parts[1]
else:
return "", rollout_state
class UserOptins(Dict[str, List[str]]):
"""
Dictionary of users with a list of features they have opted into
"""
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
"""
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
- Example line: "@User1,lf,split_build"
- A "#" prefix indicates the user is opted out of all experiments
"""
optins = UserOptins()
for user in user_optin_text.split("\n"):
user = user.strip("\r\n\t -")
if not user or not user.startswith("@"):
# Not a valid user. Skip
continue
if user:
usr_name = user.split(",")[0].strip("@")
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
return optins
def parse_settings_from_text(settings_text: str) -> Settings:
"""
Parse the experiments from the issue body into a list of ExperimentSettings
"""
try:
if settings_text:
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
# for easy reading
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
# the backtick character in shell commands.
backtick = chr(96) # backtick character
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
settings = load_yaml(settings_text)
# For now we just load experiments. We can expand this if/when we add more settings
experiments = {}
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
valid_settings = {}
for setting in exp_settings:
if setting not in Experiment._fields:
log.warning(
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
)
else:
valid_settings[setting] = exp_settings[setting]
experiments[exp_name] = Experiment(**valid_settings)
return Settings(experiments)
except Exception:
log.exception("Failed to parse settings")
return Settings()
def parse_settings(rollout_state: str) -> Settings:
"""
Parse settings, if any, from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the default values are used.
"""
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
return parse_settings_from_text(settings_text)
def parse_users(rollout_state: str) -> UserOptins:
"""
Parse users from the rollout state.
"""
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
return parse_user_opt_in_from_text(users_text)
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
"""
Check if a user is opted into an experiment
"""
return experiment_name in user_optins.get(user, [])
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
) -> str:
"""
Used to dynamically opt in jobs to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variant name is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
except Exception as e:
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
@ -327,9 +467,10 @@ jobs:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
@ -344,35 +485,18 @@ jobs:
args.github_branch,
)
label_type = get_fleet(
rollout_state,
(
args.github_issue_owner,
username,
),
)
runner_ami = get_optin_feature(
rollout_state=rollout_state,
workflow_requestors=(
args.github_issue_owner,
username,
),
feature=RUNNER_AMI_AMZ2023,
fallback=RUNNER_AMI_LEGACY,
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
# For Canary builds use canary runners
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
label_type = WORKFLOW_LABEL_LF_CANARY
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
if __name__ == "__main__":

View File

@ -29,9 +29,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -66,7 +76,8 @@ jobs:
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@ -101,7 +112,8 @@ jobs:
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -33,9 +33,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -73,7 +83,8 @@ jobs:
# NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
build-docker-cuda-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -110,7 +121,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
build-docker-cuda-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4"]
@ -143,7 +155,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@ -178,7 +191,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -207,7 +221,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:cpu
build-docker-cpu-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-manylinux_2_28
steps:
@ -238,7 +253,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
build-docker-cpu-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64
steps:
@ -269,7 +285,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
build-docker-cpu-aarch64-2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64-2_28
steps:
@ -303,7 +320,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
build-docker-cpu-cxx11-abi:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-cxx11-abi
steps:
@ -334,7 +352,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
build-docker-xpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: xpu
steps:

View File

@ -13,7 +13,6 @@ on:
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-rocm.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
pull_request:
paths:
@ -21,7 +20,6 @@ on:
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-rocm.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
concurrency:
@ -29,9 +27,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-wheel:
name: "Build Triton Wheel"
runs-on: [self-hosted, linux.2xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
strategy:
fail-fast: false
matrix:
@ -120,7 +128,7 @@ jobs:
fi
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
if-no-files-found: error
@ -201,7 +209,8 @@ jobs:
build-conda:
name: "Build Triton Conda"
runs-on: [self-hosted, linux.2xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
strategy:
fail-fast: false
matrix:
@ -253,7 +262,7 @@ jobs:
docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}" $RELEASE
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-conda-${{ matrix.py_vers }}
if-no-files-found: error

View File

@ -16,6 +16,15 @@ on:
paths: [.github/workflows/create_release.yml]
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
release:
if: ${{ github.repository == 'pytorch/pytorch' }}
name: Create Release
@ -63,7 +72,7 @@ jobs:
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
@ -73,12 +82,14 @@ jobs:
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
runs-on: linux.2xlarge
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: sourcecode-upload
name: Upload source code to S3 for release tags
permissions:
id-token: write
needs: release
needs:
- get-label-type
- release
steps:
- uses: actions/download-artifact@v4.1.7
with:

View File

@ -30,8 +30,18 @@ env:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
docker-build:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
timeout-minutes: 240
strategy:
fail-fast: false
@ -68,7 +78,7 @@ jobs:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
runs-on: [self-hosted, "${{ matrix.runner }}"]
runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
env:
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
steps:

View File

@ -34,9 +34,19 @@ env:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
generate-matrix:
if: github.repository_owner == 'pytorch'
runs-on: [self-hosted, linux.large]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.large"
outputs:
matrix: ${{ steps.generate-matrix.outputs.matrix }}
steps:
@ -54,10 +64,12 @@ jobs:
build:
if: ${{ github.repository == 'pytorch/pytorch' }}
runs-on: [self-hosted, linux.2xlarge]
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: ${{ (github.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
timeout-minutes: 240
needs: generate-matrix
needs:
- generate-matrix
- get-label-type
strategy:
matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
fail-fast: false

View File

@ -58,6 +58,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -81,6 +82,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -103,6 +105,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -125,6 +128,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -149,6 +153,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64
secrets:
@ -170,6 +175,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -193,6 +199,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -215,6 +222,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -237,6 +245,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -261,6 +270,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64
secrets:
@ -282,6 +292,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -305,6 +316,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -327,6 +339,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -349,6 +362,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -373,6 +387,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64
secrets:
@ -394,6 +409,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -417,6 +433,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -439,6 +456,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -461,6 +479,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -485,6 +504,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64
secrets:


@ -54,6 +54,7 @@ jobs:
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
@ -77,6 +78,7 @@ jobs:
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
@ -85,53 +87,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda11_8-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -146,6 +101,7 @@ jobs:
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
@ -169,6 +125,7 @@ jobs:
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
@ -177,53 +134,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_1-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -238,6 +148,7 @@ jobs:
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
@ -261,6 +172,7 @@ jobs:
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
@ -268,50 +180,3 @@ jobs:
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_4-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large


@ -0,0 +1,182 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel-split
on:
push:
branches:
- main
tags:
- 'ciflow/periodic/*'
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
ANACONDA_USER: pytorch
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel-split
BUILDER_ROOT: /builder
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda11_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_1-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_4-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large


@ -58,6 +58,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -81,6 +82,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -103,6 +105,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -124,6 +127,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -147,6 +151,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -169,6 +174,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -190,6 +196,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -213,6 +220,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -235,6 +243,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -256,6 +265,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -279,6 +289,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -301,6 +312,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -322,6 +334,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -345,6 +358,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -367,6 +381,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:


@ -49,7 +49,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the


@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -169,7 +169,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash


@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -176,7 +176,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cpu-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -316,7 +316,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -435,7 +435,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -550,7 +550,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda11_8-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -695,7 +695,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -810,7 +810,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_1-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -836,7 +836,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -955,7 +955,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -1070,7 +1070,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_4-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}


@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -169,7 +169,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash


@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -176,7 +176,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -316,7 +316,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -435,7 +435,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -550,7 +550,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda11_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -695,7 +695,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -810,7 +810,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -836,7 +836,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -955,7 +955,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -1070,7 +1070,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}


@ -18,11 +18,22 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks


@ -0,0 +1,40 @@
name: inductor-micro-benchmark-x86
on:
schedule:
- cron: 0 7 * * *
push:
tags:
- ciflow/inductor-micro-benchmark-cpu-x86/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
timeout-minutes: 720


@ -16,10 +16,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -13,10 +13,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -68,10 +68,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -50,10 +50,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-aarch64-py3_10-inductor-build:
name: linux-jammy-aarch64-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.arm64.m7g.4xlarge
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks


@ -48,10 +48,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |


@ -66,10 +66,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -18,10 +18,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -60,7 +71,9 @@ jobs:
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -22,10 +22,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm6_1-py3_8-inductor-build:
name: rocm6.1-py3.8-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |


@ -223,7 +223,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |


@ -57,8 +57,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
@ -87,8 +89,10 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
@ -333,8 +337,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}


@ -3,18 +3,12 @@ name: rocm
on:
push:
branches:
# - main
- main
- release/*
tags:
- ciflow/rocm/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT
concurrency:


@ -56,12 +56,14 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:
@ -87,8 +89,9 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:


@ -10,8 +10,18 @@ permissions:
contents: read
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
index:
runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" # 1 GPU A10G 24GB each
environment: target-determinator-env
steps:
- name: Clone PyTorch


@ -11,10 +11,21 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -266,8 +266,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },


@ -2,7 +2,7 @@ name: Upload test stats
on:
workflow_run:
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-cu124, inductor-rocm]
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm]
types:
- completed
@ -96,7 +96,7 @@ jobs:
python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
- name: Upload gpt-fast benchmark results to Rockset
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains('inductor-micro-benchmark', github.event.workflow_run.name)
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}


@ -138,7 +138,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'numpy==1.24.3 ; python_version == "3.8"',
'numpy==1.26.0 ; python_version >= "3.9"',
'expecttest==0.1.6',
'expecttest==0.2.1',
'mypy==1.10.0',
'sympy==1.12.1 ; python_version == "3.8"',
'sympy==1.13.0 ; python_version >= "3.9"',
@ -210,6 +210,8 @@ include_patterns = [
'aten/src/ATen/native/nested/*.h',
'c10/**/*.cpp',
'c10/**/*.h',
'caffe2/**/*.cc',
'caffe2/**/*.h',
'torch/*.h',
'torch/csrc/*.h',
'torch/csrc/*.cpp',


@ -65,7 +65,6 @@ cxx_library(
"caffe2/serialize/file_adapter.cc",
"caffe2/serialize/inline_container.cc",
"caffe2/serialize/istream_adapter.cc",
"caffe2/serialize/read_adapter_interface.cc",
],
visibility = ["PUBLIC"],
deps = [


@ -332,6 +332,7 @@ intern_build_aten_ops(
"@fbgemm",
"@mkl",
"@sleef",
"@mkl_dnn//:mkl-dnn",
],
)
@ -472,7 +473,6 @@ filegroup(
"caffe2/serialize/file_adapter.cc",
"caffe2/serialize/inline_container.cc",
"caffe2/serialize/istream_adapter.cc",
"caffe2/serialize/read_adapter_interface.cc",
],
)


@ -57,7 +57,6 @@ nn/qat/ @jerryzh168
# Docker
/.ci/docker/ @jeffdaily
/.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui
# Github Actions


@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |


@ -299,6 +299,15 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
#define AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
#define AT_DISPATCH_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
@ -307,6 +316,26 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
#define AT_DISPATCH_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
TYPE, \
NAME, \
...) \
AT_DISPATCH_SWITCH( \
TYPE, \
NAME, \
AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
__VA_ARGS__))
#define AT_DISPATCH_CASE_COMPLEX_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
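The new AND5 variants follow the same pattern as the existing AND2/AND3/AND4 macros: they extend the floating-point case list with five extra scalar types and bind scalar_t inside the lambda. A hedged usage sketch follows; the five extra dtypes, the kernel name, and the TensorIterator-based iter/cpu_kernel context are illustrative assumptions, not taken from this diff.

// Hypothetical kernel body; the extra dtypes are example choices only.
AT_DISPATCH_FLOATING_TYPES_AND5(
    at::ScalarType::Half,
    at::ScalarType::BFloat16,
    at::ScalarType::Float8_e4m3fn,
    at::ScalarType::Float8_e5m2,
    at::ScalarType::Float8_e4m3fnuz,
    iter.dtype(),
    "my_unary_op_cpu",
    [&]() {
      // scalar_t is the concrete type matched by the dispatch switch
      cpu_kernel(iter, [](scalar_t a) -> scalar_t { return a; });
    });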

View File

@ -707,7 +707,12 @@ bool are_all_mutations_under_no_grad_or_inference_mode(const Tensor& functional_
}
bool isFunctionalTensor(const at::Tensor& tensor) {
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}
bool isBaseTensor(const at::Tensor& tensor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
return unsafeGetFunctionalWrapper(tensor)->isBaseTensor();
}
bool isFunctionalTensor(const std::optional<Tensor>& t) {

View File

@ -165,6 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
was_storage_changed_ = true;
}
// A FunctionalTensor is considered a base if it's not a view of another
// tensor.
bool isBaseTensor() const {
return view_metas_.empty();
}
c10::SymInt get_storage_size(bool before) {
return functional_storage_impl()->get_storage_size(before);
}
@ -290,6 +296,8 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
return functional_impl;
}
TORCH_API bool isBaseTensor(const at::Tensor& tensor);
TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
TORCH_API bool isFunctionalTensor(const std::optional<Tensor>& t);
TORCH_API bool isFunctionalTensor(

View File

@ -69,7 +69,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at::ScalarType::Undefined, // Vulkan
at::ScalarType::Undefined, // Metal
at::kHalf, // XPU
at::ScalarType::Undefined, // MPS
at::kHalf, // MPS
at::ScalarType::Undefined, // Meta (tensors with no data)
at::kBFloat16, // HPU / HABANA
at::ScalarType::Undefined, // SX-Aurora / NEC
@ -206,6 +206,118 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}
TORCH_LIBRARY_IMPL(_, AutocastMPS, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
// lower_precision_fp
KERNEL_MPS2(_convolution, deprecated, lower_precision_fp)
KERNEL_MPS(_convolution, lower_precision_fp)
KERNEL_MPS(conv1d, lower_precision_fp)
KERNEL_MPS(conv2d, lower_precision_fp)
KERNEL_MPS(conv_tbc, lower_precision_fp)
KERNEL_MPS(conv_transpose1d, lower_precision_fp)
KERNEL_MPS2(conv_transpose2d, input, lower_precision_fp)
KERNEL_MPS(convolution, lower_precision_fp)
KERNEL_MPS(_mps_convolution, lower_precision_fp)
KERNEL_MPS(prelu, lower_precision_fp)
KERNEL_MPS(addmm, lower_precision_fp)
KERNEL_MPS(addmv, lower_precision_fp)
KERNEL_MPS(addr, lower_precision_fp)
KERNEL_MPS(matmul, lower_precision_fp)
KERNEL_MPS(einsum, lower_precision_fp)
KERNEL_MPS(mm, lower_precision_fp)
KERNEL_MPS(mv, lower_precision_fp)
KERNEL_MPS(linear, lower_precision_fp)
KERNEL_MPS(addbmm, lower_precision_fp)
KERNEL_MPS(baddbmm, lower_precision_fp)
KERNEL_MPS(bmm, lower_precision_fp)
KERNEL_MPS(chain_matmul, lower_precision_fp)
KERNEL_MPS(linalg_multi_dot, lower_precision_fp)
KERNEL_MPS(lstm_cell, lower_precision_fp)
// fp32
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)
KERNEL_MPS(erfinv, fp32)
KERNEL_MPS(exp, fp32)
KERNEL_MPS(expm1, fp32)
KERNEL_MPS(log, fp32)
KERNEL_MPS(log10, fp32)
KERNEL_MPS(log2, fp32)
KERNEL_MPS(log1p, fp32)
KERNEL_MPS(reciprocal, fp32)
KERNEL_MPS(rsqrt, fp32)
KERNEL_MPS(sinh, fp32)
KERNEL_MPS(tan, fp32)
KERNEL_MPS2(pow, Tensor_Scalar, fp32)
KERNEL_MPS2(pow, Tensor_Tensor, fp32)
KERNEL_MPS2(pow, Scalar, fp32)
KERNEL_MPS(softplus, fp32)
KERNEL_MPS(layer_norm, fp32)
KERNEL_MPS(native_layer_norm, fp32)
KERNEL_MPS(group_norm, fp32)
KERNEL_MPS2(frobenius_norm, dim, fp32)
KERNEL_MPS(nuclear_norm, fp32)
KERNEL_MPS2(nuclear_norm, dim, fp32)
KERNEL_MPS(batch_norm, fp32)
KERNEL_MPS(cosine_similarity, fp32)
KERNEL_MPS(poisson_nll_loss, fp32)
KERNEL_MPS(cosine_embedding_loss, fp32)
KERNEL_MPS(nll_loss, fp32)
KERNEL_MPS(nll_loss2d, fp32)
KERNEL_MPS(hinge_embedding_loss, fp32)
KERNEL_MPS(kl_div, fp32)
KERNEL_MPS(l1_loss, fp32)
KERNEL_MPS(smooth_l1_loss, fp32)
KERNEL_MPS(huber_loss, fp32)
KERNEL_MPS(mse_loss, fp32)
KERNEL_MPS(margin_ranking_loss, fp32)
KERNEL_MPS(multilabel_margin_loss, fp32)
KERNEL_MPS(soft_margin_loss, fp32)
KERNEL_MPS(triplet_margin_loss, fp32)
KERNEL_MPS(multi_margin_loss, fp32)
KERNEL_MPS(binary_cross_entropy_with_logits, fp32)
KERNEL_MPS(dist, fp32)
KERNEL_MPS(pdist, fp32)
KERNEL_MPS(cdist, fp32)
KERNEL_MPS(renorm, fp32)
KERNEL_MPS(logsumexp, fp32)
// fp32_set_opt_dtype
KERNEL_MPS(prod, fp32)
KERNEL_MPS2(prod, dim_int, fp32)
KERNEL_MPS2(prod, dim_Dimname, fp32)
KERNEL_MPS2(softmax, int, fp32)
KERNEL_MPS2(softmax, Dimname, fp32)
KERNEL_MPS2(log_softmax, int, fp32)
KERNEL_MPS2(log_softmax, Dimname, fp32)
KERNEL_MPS(cumprod, fp32)
KERNEL_MPS2(cumprod, dimname, fp32)
KERNEL_MPS(cumsum, fp32)
KERNEL_MPS2(cumsum, dimname, fp32)
KERNEL_MPS(linalg_vector_norm, fp32)
KERNEL_MPS(linalg_matrix_norm, fp32)
KERNEL_MPS2(linalg_matrix_norm, str_ord, fp32)
KERNEL_MPS(sum, fp32)
KERNEL_MPS2(sum, dim_IntList, fp32)
KERNEL_MPS2(sum, dim_DimnameList, fp32)
//
// promote
KERNEL_MPS(addcdiv, promote)
KERNEL_MPS(addcmul, promote)
KERNEL_MPS(atan2, promote)
KERNEL_MPS(bilinear, promote)
KERNEL_MPS(cross, promote)
KERNEL_MPS(dot, promote)
KERNEL_MPS(grid_sampler, promote)
KERNEL_MPS(index_put, promote)
KERNEL_MPS(tensordot, promote)
KERNEL_MPS(scatter_add, promote)
}
TORCH_LIBRARY_IMPL(_, AutocastCPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
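Taken together, the registrations above give MPS the same three-tier policy that CUDA and CPU autocast use: matmul-like ops drop to the autocast dtype (kHalf per the thread-local default change above), numerically sensitive ops are pinned to fp32, and mixed-dtype ops promote. A minimal sketch of exercising it, assuming the device-generic autocast helpers (set_autocast_enabled, clear_cache) are available as in recent ATen; this is not part of the diff.

#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>

// Hedged sketch: run two MPS ops under AutocastMPS.
void mps_autocast_demo(const at::Tensor& a, const at::Tensor& b) {
  at::autocast::set_autocast_enabled(at::kMPS, true);  // assumed device-generic API
  auto fast = at::matmul(a, b);          // lower_precision_fp policy -> runs in kHalf
  auto stable = at::logsumexp(a, {-1});  // fp32 policy -> stays in float
  at::autocast::clear_cache();
  at::autocast::set_autocast_enabled(at::kMPS, false);
}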

View File

@ -145,6 +145,8 @@ inline bool is_autocast_eligible(
return tensor.is_xla() && tensor.is_floating_point();
case c10::DeviceType::PrivateUse1:
return tensor.is_privateuseone() && tensor.is_floating_point();
case c10::DeviceType::MPS:
return tensor.is_mps() && tensor.is_floating_point();
default:
return false;
}
@ -168,6 +170,8 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
return DispatchKey::AutocastXLA;
case c10::DeviceType::PrivateUse1:
return DispatchKey::AutocastPrivateUse1;
case c10::DeviceType::MPS:
return DispatchKey::AutocastMPS;
default:
throw std::runtime_error(
"unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
@ -178,7 +182,7 @@ inline bool is_autocast_available(c10::DeviceType device_type) {
if (device_type == at::kCPU || device_type == at::kCUDA ||
device_type == at::kXPU || device_type == at::kIPU ||
device_type == at::kHPU || device_type == at::kXLA ||
device_type == at::kPrivateUse1) {
device_type == at::kPrivateUse1 || device_type == at::kMPS) {
return true;
} else {
return false;
@ -745,6 +749,27 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
REDISPATCH_SIGNATURE, \
POLICY)
// KERNEL_MPS registration for AutocastMPS
#define KERNEL_MPS(OP, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN(OP)), \
decltype(ATEN_FN(OP)), \
&ATEN_FN(OP)>::type::call);
#define KERNEL_MPS2(OP, OVERLOAD, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP "." #OVERLOAD), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN2(OP, OVERLOAD)), \
decltype(ATEN_FN2(OP, OVERLOAD)), \
&ATEN_FN2(OP, OVERLOAD)>::type::call);
// Op lists for different policies.
// To make sure other backends can reuse the policy op list.
#define AT_FORALL_LOWER_PRECISION_FP(_) \

View File

@ -228,6 +228,7 @@ namespace c10 {
_(aten, is_autocast_cpu_enabled) \
_(aten, is_autocast_xla_enabled) \
_(aten, get_autocast_dtype) \
_(aten, is_autocast_mps_enabled) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \

View File

@ -9,7 +9,7 @@
#endif
namespace at::cpu {
bool is_cpu_support_avx2() {
bool is_avx2_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
#else
@ -17,7 +17,7 @@ bool is_cpu_support_avx2() {
#endif
}
bool is_cpu_support_avx512() {
bool is_avx512_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
#else
@ -25,7 +25,7 @@ bool is_cpu_support_avx512() {
#endif
}
bool is_cpu_support_avx512_vnni() {
bool is_avx512_vnni_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();
#else
@ -33,7 +33,15 @@ bool is_cpu_support_avx512_vnni() {
#endif
}
bool is_cpu_support_amx_tile() {
bool is_avx512_bf16_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
#else
return false;
#endif
}
bool is_amx_tile_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_amx_tile();
#else
@ -42,7 +50,7 @@ bool is_cpu_support_amx_tile() {
}
bool init_amx() {
if (!is_cpu_support_amx_tile()) {
if (!is_amx_tile_supported()) {
return false;
}

View File

@ -6,14 +6,17 @@
namespace at::cpu {
TORCH_API bool is_cpu_support_avx2();
TORCH_API bool is_cpu_support_avx512();
TORCH_API bool is_avx2_supported();
TORCH_API bool is_avx512_supported();
// Detect if CPU support Vector Neural Network Instruction.
TORCH_API bool is_cpu_support_avx512_vnni();
TORCH_API bool is_avx512_vnni_supported();
// Detect if CPU supports AVX512_BF16 ISA
TORCH_API bool is_avx512_bf16_supported();
// Detect if CPU support Advanced Matrix Extension.
TORCH_API bool is_cpu_support_amx_tile();
TORCH_API bool is_amx_tile_supported();
// Enable the system to use AMX instructions.
TORCH_API bool init_amx();
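The renames turn these helpers into plain capability predicates. A small hedged sketch of how a caller might branch on them; the include path and the caller itself are assumptions, not part of this diff.

#include <ATen/cpu/Utils.h>  // assumed header for at::cpu::is_*_supported()

// Pick the widest ISA path available; on non-x86 these all return false,
// so the scalar fallback (0) is selected.
int select_isa_level() {
  if (at::cpu::is_avx512_bf16_supported()) return 3;
  if (at::cpu::is_avx512_vnni_supported()) return 2;
  if (at::cpu::is_avx2_supported())        return 1;
  return 0;
}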

View File

@ -636,6 +636,21 @@ inline void transpose_mxn<float, 8, 8>(
_mm256_storeu_ps(&dst[7 * ld_dst], th);
}
template<>
inline void transpose_mxn<float, 16, 16>(
const float* src,
int64_t ld_src,
float* dst,
int64_t ld_dst) {
transpose_mxn<float, 8, 8>(
src, ld_src, dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8, ld_src, dst + 8 * ld_dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src, ld_src, dst + 8, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst);
}
#endif
}} // namespace at::vec::CPU_CAPABILITY

View File

@ -582,8 +582,7 @@ Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<floa
// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
// kernel for transposing mxn where m, n <= 16
// M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions
template <>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn<float> expects M, N <= 16.");
// load from src to registers
__m512 input[16];
@ -667,8 +666,39 @@ inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, i
}
}
template<>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
int64_t i = 0;
for (; i < M / 16 * 16; i += 16) {
int64_t j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16);
}
// handle remainder j
int nrem = N - j;
if (nrem > 0) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem);
}
}
// handle remainder i
int mrem = M - i;
if (mrem > 0) {
int j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16);
}
// handle remainder j
int nrem = N - j;
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem);
}
}
template <typename T, int M, int N,
typename std::enable_if_t<std::is_same<T, float>::value && M <= 16 && N <= 16, int> = 0>
typename std::enable_if_t<std::is_same<T, float>::value, int> = 0>
inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
transpose_mxn<float>(src, ld_src, dst, ld_dst, M, N);
}
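The new float specialization handles arbitrary M and N by walking full 16x16 tiles and then sweeping the right edge (nrem) and bottom edge (mrem) with partial tiles. A scalar reference of the same traversal, for illustration only; it is not the vectorized kernel above.

#include <algorithm>
#include <cstdint>

// Scalar illustration of the tiling: full 16x16 tiles first, then the
// nrem-wide right edge and mrem-tall bottom edge, mirroring the loop
// structure of the AVX-512 version above.
inline void transpose_blocked_ref(const float* src, int64_t ld_src,
                                  float* dst, int64_t ld_dst, int M, int N) {
  constexpr int B = 16;
  for (int i = 0; i < M; i += B) {
    const int mrem = std::min(B, M - i);
    for (int j = 0; j < N; j += B) {
      const int nrem = std::min(B, N - j);
      for (int ii = 0; ii < mrem; ++ii) {
        for (int jj = 0; jj < nrem; ++jj) {
          dst[(j + jj) * ld_dst + (i + ii)] = src[(i + ii) * ld_src + (j + jj)];
        }
      }
    }
  }
}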

View File

@ -1408,7 +1408,6 @@ void scaled_gemm(
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum) {
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
@ -1421,13 +1420,9 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)
// Amax support in ROCm as of 6.2
if (isFloat8Type(result_dtype)) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
if (result_scale_ptr != nullptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
}
#endif
#ifndef USE_ROCM
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
#endif

View File

@ -140,7 +140,6 @@ void scaled_gemm(
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \

View File

@ -188,7 +188,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCM_VERSION",
[rocm_version]() { return rocm_version; },
[rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
[rocm_version](auto&& k) {
TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version);
return rocm_version == k ? OK : FAIL;
});
}
// gfx arch
{
@ -196,7 +199,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"GCN_ARCH_NAME",
[gcn_arch_name]() { return gcn_arch_name; },
[gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
[gcn_arch_name](auto&& k) {
TUNABLE_LOG1("GCN_ARCH_NAME validation: expect ", k, " to match ", gcn_arch_name);
return gcn_arch_name == k ? OK : FAIL;
});
}
// rocblas
{
@ -212,7 +218,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCBLAS_VERSION",
[rocblas_version]() { return rocblas_version; },
[rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
[rocblas_version](auto&& k) {
TUNABLE_LOG1("ROCBLAS_VERSION validation: expect ", k, " to match ", rocblas_version);
return rocblas_version == k ? OK : FAIL;
});
}
// hipblaslt
{
@ -226,7 +235,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"HIPBLASLT_VERSION",
[hipblaslt_version]() { return hipblaslt_version; },
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
[hipblaslt_version](auto&& k) {
TUNABLE_LOG1("HIPBLASLT_VERSION validation: expect ", k, " to match ", hipblaslt_version);
return hipblaslt_version == k ? OK : FAIL;
});
}
#endif
}

View File

@ -104,7 +104,6 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
params->c_scale_ptr,
params->ldc,
params->c_dtype,
params->amax_ptr,
params->use_fast_accum);
return OK;
}

View File

@ -23,6 +23,9 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
OP_DECOMPOSE(dropout_);
OP_DECOMPOSE(feature_alpha_dropout_);
OP_DECOMPOSE(feature_dropout_);
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}
static void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@ -235,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(relu6_);
OP_DECOMPOSE(prelu);
OP_DECOMPOSE2(softmax, int);
OP_DECOMPOSE(scaled_dot_product_attention);
OP_DECOMPOSE(special_gammainc);
OP_DECOMPOSE(special_gammaincc);
OP_DECOMPOSE(special_logit);
@ -261,7 +263,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(special_xlogy);
OP_DECOMPOSE2(special_xlogy, other_scalar);
OP_DECOMPOSE2(special_xlogy, self_scalar);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
m.impl("split.sizes", native::split_symint);
@ -386,6 +387,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE2(to, dtype);
OP_DECOMPOSE2(to, dtype_layout);
OP_DECOMPOSE2(to, other);
// Random ops that are also registered here
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}
} // namespace at::functorch

View File

@ -496,6 +496,11 @@ _scaled_dot_product_flash_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@ -540,6 +545,11 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
bool is_causal,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@ -577,6 +587,11 @@ _scaled_dot_product_cudnn_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);

View File

@ -28,6 +28,7 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
_executionDescriptor.enableCommitAndContinue = _enableCommitAndContinue;
// Choose level which optimizes for GPU
[_compilationDescriptor disableTypeInference];
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
}

View File

@ -41,6 +41,17 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
#include <fbgemm/FbgemmI64.h>
#endif // USE_FBGEMM
#if AT_MKLDNN_ENABLED()
#include <oneapi/dnnl/dnnl_version.h>
#endif // oneDNN
#define ONEDNN_UKERNEL_ENABLED (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=5)
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
#include <oneapi/dnnl/dnnl_ukernel.hpp>
#include <oneapi/dnnl/dnnl.hpp>
#endif // oneDNN BRGEMM
namespace at::native::cpublas {
namespace internal {
@ -822,4 +833,350 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
n, x, incx, y, incy);
}
} // namespace at::native::cpublas
// oneDNN BRGEMM
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
struct BrgemmKey {
int64_t M;
int64_t N;
int64_t K;
int64_t batch_size;
int64_t lda;
int64_t ldb;
int64_t ldc;
ScalarType dt_a;
ScalarType dt_b;
ScalarType dt_c;
float alpha;
float beta;
BrgemmKey(
int64_t M,
int64_t N,
int64_t K,
int64_t batch_size,
int64_t lda,
int64_t ldb,
int64_t ldc,
ScalarType dt_a,
ScalarType dt_b,
ScalarType dt_c,
float alpha,
float beta)
: M(M),
N(N),
K(K),
batch_size(batch_size),
lda(lda),
ldb(ldb),
ldc(ldc),
dt_a(dt_a),
dt_b(dt_b),
dt_c(dt_c),
alpha(alpha),
beta(beta) {}
bool operator==(const BrgemmKey& other) const {
return M == other.M && N == other.N && K == other.K &&
batch_size == other.batch_size && lda == other.lda &&
ldb == other.ldb && ldc == other.ldc && dt_a == other.dt_a &&
dt_b == other.dt_b && dt_c == other.dt_c && alpha == other.alpha &&
beta == other.beta;
}
};
struct PackKey {
int64_t K;
int64_t N;
int64_t ld_in;
int64_t ld_out;
ScalarType dt_in;
ScalarType dt_out;
PackKey(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out)
: K(K),
N(N),
ld_in(ld_in),
ld_out(ld_out),
dt_in(dt_in),
dt_out(dt_out) {}
bool operator==(const PackKey& other) const {
return N == other.N && K == other.K && ld_in == other.ld_in &&
ld_out == other.ld_out && dt_in == other.dt_in &&
dt_out == other.dt_out;
}
};
inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
if (dtype == ScalarType::Float) {
return dnnl::memory::data_type::f32;
} else if (dtype == ScalarType::BFloat16) {
return dnnl::memory::data_type::bf16;
} else if (dtype == ScalarType::Half) {
return dnnl::memory::data_type::f16;
} else if (dtype == ScalarType::Byte) {
return dnnl::memory::data_type::u8;
} else if (dtype == ScalarType::Char) {
return dnnl::memory::data_type::s8;
} else {
TORCH_CHECK(false, "get_dnnl_dtype expects float/bfloat16/half/int8 tensor input");
}
}
template<typename key_t>
struct UnsafeUkernelKeyHasher {
std::size_t operator()(const key_t& key) const;
};
template<>
std::size_t UnsafeUkernelKeyHasher<BrgemmKey>::operator()(const BrgemmKey& key) const {
// Use beta, M, N, and K to compute hash to reduce the overhead as
// batch size, alpha, and data types are unlikely to change within the same kernel and
// leading dimensions are likely to be related to M, K, N or use fixed values.
std::size_t h = std::hash<float>()(key.beta + 1);
h = std::hash<int64_t>()(key.M) ^ (h << 1);
h = std::hash<int64_t>()(key.N) ^ (h << 1);
h = std::hash<int64_t>()(key.K) ^ (h << 1);
h = std::hash<int64_t>()(key.ldc) ^ (h << 1);
return h;
}
template<>
std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) const {
// Use K and N to compute hash to reduce the overhead as
// data types are unlikely to change and
// ld_in/ld_out is likely to be related to K, N or use fixed values
std::size_t h = std::hash<int64_t>()(key.K);
h = std::hash<int64_t>()(key.N) ^ (h << 1);
return h;
}
template <typename key_t, typename value_t>
struct KernelCache {
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
static inline std::shared_ptr<value_t>&& fetch_or_create(
const key_t& key,
const std::function<std::shared_ptr<value_t>()>& callback) {
auto&& search = get_store().find(key);
if (search != get_store().end()) {
return std::move(search->second);
} else {
get_store().insert({key, callback()});
return std::move(get_store()[key]);
}
}
static inline kstore_t& get_store() {
static thread_local kstore_t cache_kernels;
return cache_kernels;
}
};
// Helper struct for convenient brgemm configuration
struct GemmHelper {
GemmHelper(
int64_t M,
int64_t N,
int64_t K,
int64_t bs,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
ScalarType dt_a,
ScalarType dt_b,
ScalarType dt_c,
const float alpha,
const float beta) {
// Create brgemm
brg = dnnl::ukernel::brgemm(
M,
N,
K,
bs,
ld_a,
ld_b,
ld_c,
get_dnnl_dtype(dt_a),
get_dnnl_dtype(dt_b),
get_dnnl_dtype(dt_c),
alpha,
beta);
// Create a scratchpad buffer for the brgemm execution
scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
// Prepare default vector of pairs of tensors A and B offsets for each batch.
A_B_offsets.reserve(1);
A_B_offsets[0] = std::make_pair(0, 0);
}
dnnl::ukernel::brgemm brg;
std::vector<uint8_t> scratchpad;
std::vector<std::pair<int64_t, int64_t>> A_B_offsets;
};
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
static inline void call(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const scalar_t_a* A,
const scalar_t_b* B,
scalar_t_c* C) {
auto&& key = BrgemmKey(
M,
N,
K,
int64_t(1),
ld_a,
ld_b,
ld_c,
c10::CppTypeToScalarType<scalar_t_a>::value,
c10::CppTypeToScalarType<scalar_t_b>::value,
c10::CppTypeToScalarType<scalar_t_c>::value,
alpha,
beta);
// Fetch/create GemmHelper object
auto&& value = fetch_or_create(key, [&]() {
auto&& v = std::make_shared<GemmHelper>(
M,
N,
K,
1,
ld_a,
ld_b,
ld_c,
c10::CppTypeToScalarType<scalar_t_a>::value,
c10::CppTypeToScalarType<scalar_t_b>::value,
c10::CppTypeToScalarType<scalar_t_c>::value,
alpha,
beta);
(*v).brg.generate();
return std::move(v);
});
if (get_current() != value) {
dnnl::ukernel::brgemm::release_hw_context();
((*value).brg).set_hw_context();
get_current() = value;
}
((*value).brg)
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
}
static inline std::shared_ptr<GemmHelper>& get_current() {
static thread_local std::shared_ptr<GemmHelper> current;
return current;
}
static inline bool device_check(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
if (dtype == ScalarType::Half) {
static bool fp16_support = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_fp16;
return fp16_support;
}
return false;
}
};
using pack_t = dnnl::ukernel::brgemm_pack_B;
struct Pack : public KernelCache <PackKey, pack_t> {
static inline void call(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out) {
auto&& key = PackKey(K, N, ld_in, ld_out, dt_in, dt_out);
auto&& pack = fetch_or_create(key, [&]() {
auto&& p = std::make_shared<pack_t>(
K, N, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
if (need_pack(dt_in)) {
(*p).generate();
}
return std::move(p);
});
if (need_pack(dt_in)) {
(*pack).execute(in, out);
} else {
TORCH_CHECK(false, "No need to pack");
}
}
static inline bool need_pack(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
if (dtype == ScalarType::Half) {
static bool fp16_pack = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_amx_fp16;
return fp16_pack;
}
return false;
}
};
#endif
void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::Half* A,
const at::Half* B,
float* C) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
if (Brgemm::device_check(ScalarType::Half)) {
Brgemm::call<at::Half, at::Half, float>(
M, N, K, ld_a, ld_b, ld_c, alpha, beta, A, B, C);
return;
}
#endif
TORCH_CHECK(false,
"Half Brgemm is only supported on X64 when oneDNN ukernel is enabled and avx512_fp16 is supported");
}
void brgemm_release() {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
dnnl::ukernel::brgemm::release_hw_context();
#endif
}
void pack(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
Pack::call(K, N, ld_in, ld_out, dt_in, dt_out, in, out);
#else
TORCH_CHECK(false, "pack is only supported on X64 with oneDNN ukernel enabled");
#endif
}
bool need_pack(ScalarType dt_in) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
return Pack::need_pack(dt_in);
#else
return false;
#endif
}
} // namespace at::native::cpublas

View File

@ -7,6 +7,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/Scalar.h>
namespace at::native::cpublas {
namespace internal {
@ -186,4 +187,40 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
} // namespace at::native::cpublas
// Batch-reduce GEMM
// Operates by the following formula:
// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
// A Base pointer to a tensor A.
// B Base pointer to a tensor B.
// C Pointer to a tensor C (accumulation buffer).
TORCH_API void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::Half* A,
const at::Half* B,
float* C);
// Release brgemm hardware context
void brgemm_release();
// Pack B matrix to get better performance if needed
void pack(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out);
// Whether pack is needed on the platform.
bool need_pack(ScalarType dt_in);
} // namespace at::native::cpublas
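The comment above defines the batch-reduce GEMM contract: C = alpha * SUM_i(A[i] x B[i]) + beta * C. For clarity, a plain scalar reference of that formula; this is an illustration only, not the oneDNN-ukernel-backed implementation, and the exported brgemm above is its batch-size-1, Half-input specialization.

#include <cstdint>

// Scalar reference of the brgemm formula, generalized to a batch of
// A/B panels with row-major leading dimensions ld_a/ld_b/ld_c.
void brgemm_reference(int64_t M, int64_t N, int64_t K, int64_t batch,
                      int64_t ld_a, int64_t ld_b, int64_t ld_c,
                      float alpha, float beta,
                      const float* const* A, const float* const* B, float* C) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t i = 0; i < batch; ++i) {
        for (int64_t k = 0; k < K; ++k) {
          acc += A[i][m * ld_a + k] * B[i][k * ld_b + n];
        }
      }
      C[m * ld_c + n] = alpha * acc + beta * C[m * ld_c + n];
    }
  }
}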

View File

@ -144,7 +144,7 @@ static void col2im_out_cpu_template(
output.resize_({batch_size, n_output_plane, output_height, output_width});
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
input.scalar_type(), "col2im_out_cpu", [&] {
Tensor input_n = Tensor();
Tensor output_n = Tensor();

View File

@ -421,12 +421,18 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
if (needs_64bit_indexing_no_split(input, weight)) {
return false;
}
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false;
}
if (needs_64bit_indexing_no_split(input, weight)) {
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+."
" Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
return false;
}
}
if (!input.is_cuda() || !cudnn_enabled) {
return false;
}

View File

@ -94,7 +94,7 @@ static void im2col_out_cpu_template(
output.resize_({batch_size, n_output_plane, output_length});
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
input.scalar_type(), "im2col_out_cpu", [&] {
Tensor input_n;
Tensor output_n;

View File

@ -19,6 +19,7 @@
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Utils.h>
#include <c10/core/GradMode.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
@ -1358,13 +1359,8 @@ static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
// Minimum dimension requirement for MKLDNN; derived based on experiments.
// By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
return 8;
}
#endif
return 0;
// it's enabled on all Neoverse CPUs.
return is_arm_neoverse() ? 8 : 0;
}();
const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM");
return ptr != nullptr ? std::atoi(ptr) : default_min_dim;
@ -1377,13 +1373,8 @@ static inline int64_t get_mkldnn_matmul_min_size() {
static auto value = [&] {
const int64_t default_min_size = [&] {
// Minimum size requirement for MKLDNN; derived based on experiments.
// By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
return 8 * 1024;
}
#endif
return 0;
// it's enabled on all Neoverse CPUs.
return is_arm_neoverse() ? 8 * 1024 : 0;
}();
const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE");
return ptr != nullptr ? std::atoi(ptr) : default_min_size;
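With this change the thresholds default to 8 (dims) and 8 * 1024 (elements) on every Neoverse core rather than only Neoverse V1, and the TORCH_MKLDNN_MATMUL_MIN_DIM / TORCH_MKLDNN_MATMUL_MIN_SIZE environment variables still override them. A hedged sketch of the kind of gate the surrounding matmul code applies; the helper name and exact comparisons below are illustrative.

// Illustration only: skip the MKLDNN path for tiny matmuls.
bool should_try_mkldnn_matmul(int64_t m, int64_t n, int64_t k) {
  const int64_t min_dim = get_mkldnn_matmul_min_dim();    // 8 on Neoverse, 0 otherwise
  const int64_t min_size = get_mkldnn_matmul_min_size();  // 8 * 1024 on Neoverse, 0 otherwise
  return m > min_dim && n > min_dim && k > min_dim && m * n * k > min_size;
}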

View File

@ -209,7 +209,13 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
bool all_contiguous = is_contiguous(input);
constexpr bool mixed_type = !std::is_same_v<scalar_t, param_t>;
const auto dtype = mixed_type ? kFloat : input.scalar_type();
// Using float data type for Half _var_sum in batchnorm stats updating on CPU
// to avoid _var_sum overflow since the representation range of Half is small.
using opmath_t = std::conditional_t<std::is_same_v<param_t, at::Half>, at::opmath_type<param_t>, param_t>;
auto dtype = mixed_type ? kFloat : input.scalar_type();
if (dtype == kHalf) {
dtype = kFloat;
}
auto save_mean_a = save_mean.accessor<param_t, 1>();
auto save_var_transform_a = save_var_transform.accessor<param_t, 1>();
@ -220,9 +226,9 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
if (all_contiguous) {
auto _mean = at::empty({n_input}, input.options().dtype(dtype));
auto _var_sum = at::empty({n_input}, input.options().dtype(dtype));
auto _mean_a = _mean.accessor<param_t, 1>();
auto _var_sum_a = _var_sum.accessor<param_t, 1>();
auto momentum_ = static_cast<param_t>(momentum);
auto _mean_a = _mean.accessor<opmath_t, 1>();
auto _var_sum_a = _var_sum.accessor<opmath_t, 1>();
auto momentum_ = static_cast<opmath_t>(momentum);
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);
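The fix accumulates the Half variance sum in float: at::opmath_type<at::Half> is float, so the running sum can exceed Half's ~65504 maximum without saturating. A hedged standalone sketch of the failure mode; the function below is illustrative and not part of this diff.

#include <ATen/OpMathType.h>
#include <c10/util/Half.h>
#include <cstdint>

// Illustration: summing squared deviations of many Half values.
// Accumulating in Half saturates at 65504; accumulating in
// at::opmath_type<at::Half> (== float) does not.
float half_var_sum(const at::Half* x, int64_t n, float mean) {
  using acc_t = at::opmath_type<at::Half>;  // float
  acc_t var_sum = 0;
  for (int64_t i = 0; i < n; ++i) {
    acc_t d = static_cast<acc_t>(x[i]) - mean;
    var_sum += d * d;
  }
  return var_sum;
}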

View File

@ -284,7 +284,7 @@ void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& newsize) {
} else if (device_type == at::kPrivateUse1) {
at::detail::getPrivateUse1Hooks().resizePrivateUse1Bytes(
storage, newsize.expect_int());
} else if (device_type == at::kXPU || device_type == at::kHPU) {
} else if (device_type == at::kXPU || device_type == at::kHPU || device_type == at::kMTIA) {
ptrdiff_t size_bytes_i = newsize.expect_int();
TORCH_CHECK(
!c10::overflows<int64_t>(size_bytes_i),

View File

@ -11,18 +11,18 @@ namespace ao {
namespace sparse {
namespace {
constexpr int64_t serialization_version_index = 0;
constexpr int64_t bias_index = 1;
constexpr int64_t out_features_block_size_index = 2;
constexpr int64_t in_features_block_size_index = 3;
constexpr int64_t weight_scales_index = 4;
constexpr int64_t weight_zero_point_index = 5;
constexpr int64_t quantization_scheme_index = 6;
constexpr int64_t row_block_indices_index = 7;
constexpr int64_t col_block_indices_index = 8;
constexpr int64_t weight_values_index = 9;
constexpr int64_t num_output_channels_index = 10;
constexpr int64_t num_input_channels_index = 11;
constexpr int64_t serialization_version_index [[maybe_unused]] = 0;
constexpr int64_t bias_index [[maybe_unused]] = 1;
constexpr int64_t out_features_block_size_index [[maybe_unused]] = 2;
constexpr int64_t in_features_block_size_index [[maybe_unused]] = 3;
constexpr int64_t weight_scales_index [[maybe_unused]] = 4;
constexpr int64_t weight_zero_point_index [[maybe_unused]] = 5;
constexpr int64_t quantization_scheme_index [[maybe_unused]] = 6;
constexpr int64_t row_block_indices_index [[maybe_unused]] = 7;
constexpr int64_t col_block_indices_index [[maybe_unused]] = 8;
constexpr int64_t weight_values_index [[maybe_unused]] = 9;
constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;
template <typename TENSOR_DTYPE, typename VEC_DTYPE>
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {

View File

@ -81,6 +81,12 @@ void atan2_kernel(TensorIteratorBase& iter) {
}
#if !defined(C10_MOBILE)
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
AT_DISPATCH_V2( \
TYPE, \
NAME, \
AT_WRAP(__VA_ARGS__), \
AT_EXPAND(AT_INTEGRAL_TYPES_V2))
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
AT_DISPATCH_V2( \
TYPE, \
@ -104,6 +110,8 @@ void atan2_kernel(TensorIteratorBase& iter) {
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#else
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__)
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
kComplexHalf, kHalf, kBool, kBFloat16, TYPE, NAME, __VA_ARGS__)
@ -382,7 +390,7 @@ void bitwise_and_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
cpu_kernel(iter, [](bool a, bool b) { return a && b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_and_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a & b; },
@ -395,7 +403,7 @@ void bitwise_or_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
cpu_kernel(iter, [](bool a, bool b) { return a || b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_or_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a | b; },
@ -410,7 +418,7 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) {
// this operation for both Boolean and integral types.
cpu_kernel(iter, [](bool a, bool b) { return a != b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_xor_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a ^ b; },

View File

@ -16,7 +16,6 @@
#else
#include <ATen/ops/empty.h>
#endif
namespace at::native {
namespace {
@ -202,7 +201,97 @@ void reshape_attn_mask_to_4d(
.expand({attn_mask_size_0, attn_mask_size_1, qSize, kvSize});
}
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size>
template <typename scalar_t>
inline void copy_value_with_pad(
const scalar_t* value_ptr,
scalar_t* dst_ptr,
int64_t rows,
int64_t cols,
int64_t prows,
int64_t pcols,
int64_t ldi) {
auto vec_size = at::vec::Vectorized<scalar_t>::size();
int64_t i = 0;
for (; i < rows; i++) {
int64_t j = 0;
for (; j < cols - (cols % vec_size); j += vec_size) {
auto vec_v =
at::vec::Vectorized<scalar_t>::loadu(value_ptr + i * ldi + j);
vec_v.store(dst_ptr + i * pcols + j);
}
if (j < cols) {
auto vec_v = at::vec::Vectorized<scalar_t>::loadu(
value_ptr + i * ldi + j, cols - j);
vec_v.store(dst_ptr + i * pcols + j, cols - j);
}
// col padding
auto psize = pcols - cols;
if (psize > 0) {
auto zero_vec = at::vec::Vectorized<scalar_t>(0);
int64_t pj = 0;
for (; pj < psize - (psize % vec_size); pj += vec_size) {
zero_vec.store(dst_ptr + i * pcols + cols + pj);
}
if (pj < psize) {
zero_vec.store(dst_ptr + i * pcols + cols + pj, psize - pj);
}
}
}
// row padding
for (; i < prows; i++) {
auto zero_vec = at::vec::Vectorized<scalar_t>(0);
int64_t j = 0;
for (; j < pcols - (pcols % vec_size); j += vec_size) {
zero_vec.store(dst_ptr + i * pcols + j);
}
if (j < pcols) {
zero_vec.store(dst_ptr + i * pcols + j, pcols - j);
}
}
}
template <typename scalar_t>
inline void pad_remain_row_col_zero(
scalar_t* value_ptr,
int rows,
int cols,
int prows,
int pcols,
int ldi) {
auto psize = pcols - cols;
if (psize == 0 && prows == rows) {
return;
}
auto vec_size = at::vec::Vectorized<scalar_t>::size();
auto zero = at::vec::Vectorized<scalar_t>(0);
if (psize > 0) {
for (int i = 0; i < rows; i++) {
int j = 0;
for (; j < psize - (psize % vec_size); j += vec_size) {
zero.store(value_ptr + i * ldi + cols + j);
}
if (j < psize) {
zero.store(value_ptr + i * ldi + cols + j, psize - j);
}
}
}
for (int i = rows; i < prows; i++) {
int j = 0;
for (; j < pcols - (pcols % vec_size); j += vec_size) {
zero.store(value_ptr + i * ldi + j);
}
if (j < pcols) {
zero.store(value_ptr + i * ldi + j, pcols - j);
}
}
}
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size, bool with_pack=false>
void cpu_flash_attention(
const Tensor& output,
const Tensor& logsumexp,
@ -278,21 +367,70 @@ void cpu_flash_attention(
int64_t qSplitSize = q_split_size > qSize ? qSize : q_split_size;
int64_t kvSplitSize = kv_split_size > kvSize ? kvSize : kv_split_size;
int64_t qSlice = (qSize - 1) / qSplitSize + 1;
int64_t qSlice = (qSize + qSplitSize - 1) / qSplitSize;
int64_t kvSlice = (kvSize + kvSplitSize - 1) / kvSplitSize;
int64_t kvTail = (kvSize - 1) % kvSplitSize + 1;
int64_t num_thread = at::get_num_threads();
const auto dtype = query.scalar_type();
const auto accumulate_dtype = toOpMathType(dtype);
// Whether pack is needed
bool need_pack = false;
// Block size of packing B matrix
int64_t packb_size = 64;
// Use packb_size due to the limitation:
// oneDNN pack only supports output leading dimension being one of (16, 32, 48, 64)
// For instance,
// for q @ k.T [qSplitSize, headSize] * [headSize, kvSplitSize] = [qSplitSize, kvSplitSize],
// we need to split kvSplitSize with packb_size for packing k.T,
// for (q @ k.T) @ v [qSplitSize, kvSplitSize] x [kvSplitSize, headSize] -> [qSplitSize, headSize],
// we need to split headSize with packb_size for packing v
// TODO Simplify the check when oneDNN supports fused pack with transpose and has better performance
if (with_pack) {
need_pack = num_head >= 4 && headSize % packb_size == 0 && kvSize >= packb_size;
if (need_pack) {
float pack_size = batchSize * num_head * kvSize * headSize / 1024;
float gemm_size_per_thread =
(batchSize * num_head * qSlice + num_thread - 1) / num_thread *
qSplitSize * (is_causal ? qSize : kvSize) * headSize / 1024;
float gsize = gemm_size_per_thread / pack_size;
// When the number of gemms is much greater than the number of packs,
// the pack and padding overhead can be overlapped.
if (pack_size < 2688) {
need_pack = gsize >= 36 || (gsize >= 24 && headSize > packb_size);
} else if (pack_size < 16384) {
need_pack = gsize >= (is_causal ? 54 : 52);
} else {
need_pack = gsize >= (is_causal ? 54 : 40);
}
}
}
int64_t rHeadSize = need_pack ? (headSize + packb_size - 1) / packb_size * packb_size : headSize;
int64_t rkvSplitSize = need_pack ? (kvSplitSize + packb_size - 1) / packb_size * packb_size : kvSplitSize;
int64_t rkvTail = need_pack ? (kvTail + packb_size - 1) / packb_size * packb_size : kvTail;
int64_t rkvSize = kv_split_size > kvSize ? rkvTail : rkvSplitSize * kvSlice + rkvTail;
// oneDNN pack does not support odd K for now, so we also need to pad odd K
bool headSize_even = headSize % 2 == 0;
int64_t eheadSize = need_pack && !headSize_even ? headSize + 1: headSize;
int64_t ekvSplitSize = need_pack && (kvSplitSize % 2 != 0) ? kvSplitSize + 1 : kvSplitSize;
int64_t ekvTail = need_pack && (kvTail % 2 != 0) ? kvTail + 1 : kvTail;
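// Worked example of the rounding above (illustration only, packb_size = 64):
//   kvSize = 1000, kvSplitSize = 512  ->  kvTail = (1000 - 1) % 512 + 1 = 488
//   rkvSplitSize = div_up(512, 64) * 64 = 512
//   rkvTail      = div_up(488, 64) * 64 = 512
//   ekvTail      = 488 (already even, so no +1 padding needed)
// headSize must already be a multiple of 64 for need_pack to be true,
// so rHeadSize == headSize and eheadSize == headSize in the packed path.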
// allocate per thread temp buf (accumulate type)
int64_t size_per_thread =
/* qk */ qSplitSize * kvSplitSize +
/* qk */ qSplitSize * rkvSplitSize +
/* qk_max */ qSplitSize +
/* qk_sum */ qSplitSize +
/* dst */ qSplitSize * headSize;
/* dst */ qSplitSize * rHeadSize;
at::Tensor buf = at::empty({num_thread, size_per_thread}, query.options().dtype(accumulate_dtype));
at::Tensor buf_reduced = at::empty({num_thread, qSplitSize, is_reduced_type ? kvSplitSize : 0}, query.options());
at::Tensor buf_reduced = at::empty(
{num_thread,
qSplitSize,
is_reduced_type ? ekvSplitSize : 0},
query.options());
// Data ptrs
const scalar_t* q_data = query.const_data_ptr<scalar_t>();
@ -306,16 +444,128 @@ void cpu_flash_attention(
accum_t* buf_data = buf.data_ptr<accum_t>();
scalar_t* buf_reduced_data = is_reduced_type ? buf_reduced.data_ptr<scalar_t>() : nullptr;
// Buffer to store padding query
scalar_t* query_padding_ptr = nullptr;
std::unique_ptr<scalar_t[]> query_padding_data;
if (!headSize_even && need_pack) {
query_padding_data = std::make_unique<scalar_t[]>(num_thread * qSplitSize * eheadSize);
query_padding_ptr = query_padding_data.get();
}
// Buffer to store Key and Value after transforms
scalar_t* key_reorder_ptr = nullptr;
std::unique_ptr<scalar_t[]> key_reorder_data;
scalar_t* value_reorder_ptr = nullptr;
std::unique_ptr<scalar_t[]> value_reorder_data;
int kv_padding_size = (kvSize - 1) / kvSplitSize * ekvSplitSize + ekvTail;
if (need_pack) {
key_reorder_data = std::make_unique<scalar_t[]>(batchSize * num_head * eheadSize * rkvSize);
key_reorder_ptr = key_reorder_data.get();
value_reorder_data = std::make_unique<scalar_t[]>(batchSize * num_head * kv_padding_size * rHeadSize);
value_reorder_ptr = value_reorder_data.get();
}
// Reorder K, V
if (need_pack) {
at::parallel_for(0, batchSize * num_head * kvSlice, 1, [&](int64_t begin, int64_t end) {
int64_t i = 0, j = 0, l = 0, n = 0;
at::native::data_index_init(begin, i, batchSize, j, num_head, l, kvSlice);
std::unique_ptr<scalar_t[]> transpose_buffer = std::make_unique<scalar_t[]>(eheadSize * packb_size);
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
for (const auto z : c10::irange(begin, end)) {
(void)z; // Suppress unused variable
n = l * kvSplitSize;
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
// Split kvSplitSize with packb_size
// [kvSplitSize, headSize] -> [div_up(kvSplitSize, packb_size), packb_size, headSize]
// Transpose [packb_size, headSize] -> [headSize, packb_size]
// Pack transposed buffer
for (int64_t b = 0; b < kvBlockSize; b += packb_size) {
bool tail = kvBlockSize - b < packb_size;
// TODO Use fused pack with transpose support when oneDNN supports such usage
utils::transpose<uint16_t>(
tail ? kvBlockSize - b : packb_size,
headSize,
/* src_ptr */
reinterpret_cast<const uint16_t*>(
k_data + i * kStrideB + j * kStrideH + n * kStrideN +
b * kStrideN),
/* ld_src */ kStrideN,
/* dst */ reinterpret_cast<uint16_t*>(transpose_buffer_ptr),
/* ld_dst */ packb_size);
// Pad [headSize, x] -> [eheadSize, x]
if (!headSize_even) {
pad_remain_row_col_zero<scalar_t>(
transpose_buffer_ptr,
headSize,
packb_size,
eheadSize,
packb_size,
packb_size);
}
// Pack
cpublas::pack(
/* K */ eheadSize,
/* N */ packb_size,
/* ld_in */ packb_size,
/* ld_out */ packb_size,
/* dt_in */ dtype,
/* dt_out */ dtype,
transpose_buffer_ptr,
key_reorder_ptr + i * num_head * eheadSize * rkvSize +
j * eheadSize * rkvSize + n * eheadSize + b * eheadSize);
}
// Split headSize with packb_size
// [kvSplitSize, headSize] -> [kvSplitSize, div_up(headSize, packb_size), packb_size]
for (int64_t b = 0; b < headSize; b += packb_size) {
// Do copy due to the limitation of input_ld of oneDNN pack:
// Regarding packing [K, N], only input_ld == N is supported
// TODO: remove the copy when pack supports input_ld >= N
copy_value_with_pad<scalar_t>(
v_data + i * vStrideB + j * vStrideH + n * vStrideN + b,
v_copy_buffer_ptr,
kvBlockSize,
(headSize - b < packb_size) ? headSize - b : packb_size,
ekvBlockSize,
packb_size,
vStrideN);
cpublas::pack(
ekvBlockSize,
packb_size,
packb_size,
packb_size,
dtype,
dtype,
v_copy_buffer_ptr,
value_reorder_ptr +
i * num_head * kv_padding_size * rHeadSize +
j * kv_padding_size * rHeadSize + n * rHeadSize +
ekvBlockSize * b);
}
// Move to the next (batch, head, kv block) index
at::native::data_index_step(i, batchSize, j, num_head, l, kvSlice);
}
});
}
at::parallel_for(0, batchSize * num_head * qSlice, 1, [&](int64_t begin, int64_t end) {
int64_t i = 0, j = 0, k = 0;
data_index_init(begin, i, batchSize, j, num_head, k, qSlice);
int ompIdx = at::get_thread_num();
accum_t* buf_ptr = buf_data + ompIdx * size_per_thread;
accum_t* qk_data = buf_ptr;
accum_t* qk_max_data = qk_data + qSplitSize * kvSplitSize;
accum_t* qk_max_data = qk_data + qSplitSize * rkvSplitSize;
accum_t* qk_sum_data = qk_max_data + qSplitSize;
accum_t* dst_data = qk_sum_data + qSplitSize;
scalar_t* qk_reduced_data = is_reduced_type ? buf_reduced_data + ompIdx * qSplitSize * kvSplitSize : nullptr;
scalar_t* qk_reduced_data = is_reduced_type ? buf_reduced_data + ompIdx * qSplitSize * ekvSplitSize : nullptr;
scalar_t* query_t_padding_ptr = (!headSize_even && need_pack)
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
: nullptr;
for (const auto z : c10::irange(begin, end)) {
(void)z; // Suppress unused variable
@ -327,10 +577,46 @@ void cpu_flash_attention(
fill_stub(qk_sum_data,
static_cast<accum_t>(0), qBlockSize);
int64_t num_keys = is_causal ? std::min(m + qBlockSize, kvSize) : kvSize;
if (!headSize_even && need_pack) {
// Pad query if headSize is not even
// [qBlockSize, headSize] -> [qBlockSize, eheadSize]
copy_value_with_pad<scalar_t>(
q_data + i * qStrideB + j * qStrideH + m * qStrideM,
query_t_padding_ptr,
qBlockSize,
headSize,
qBlockSize,
eheadSize,
qStrideM
);
}
for (int64_t n = 0; n < num_keys; n += kvSplitSize) {
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
int64_t ekvBlockSize = (need_pack && kvBlockSize % 2 != 0) ? kvBlockSize + 1 : kvBlockSize;
int64_t rkvBlockSize = kvBlockSize == kvSplitSize ? rkvSplitSize : rkvTail;
// Calculate scale * q @ k.T
cpublas::gemm(
if (need_pack) {
if constexpr (std::is_same_v<scalar_t, at::Half>) {
for (int64_t b = 0; b < kvBlockSize; b += packb_size) {
cpublas::brgemm(
qBlockSize,
packb_size,
eheadSize,
headSize_even ? qStrideM : eheadSize,
packb_size,
rkvBlockSize,
1.f,
0.f,
!headSize_even
? query_t_padding_ptr
: q_data + i * qStrideB + j * qStrideH + m * qStrideM,
key_reorder_ptr + i * num_head * eheadSize * rkvSize +
j * eheadSize * rkvSize + n * eheadSize + b * eheadSize,
qk_data + b);
}
}
} else {
cpublas::gemm(
TransposeType::Transpose,
TransposeType::NoTranspose,
kvBlockSize,
@ -346,11 +632,12 @@ void cpu_flash_attention(
static_cast<accum_t>(0),
qk_data,
kvBlockSize);
}
// Apply causal mask, fill unused with -inf
if (is_causal && num_keys - n <= kvSplitSize) {
for (const auto row : c10::irange(qBlockSize)) {
int64_t last_col = m + row - n;
accum_t* row_ptr = qk_data + row * kvBlockSize;
accum_t* row_ptr = qk_data + row * rkvBlockSize;
fill_stub(row_ptr + last_col + 1,
-std::numeric_limits<accum_t>::infinity(),
kvBlockSize - last_col - 1);
@ -363,29 +650,29 @@ void cpu_flash_attention(
for (int64_t row = 0; row < qBlockSize; ++row) {
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
_scale_attn_mask_fusion_kernel(
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
mask_data + i * mStrideB + j * mStrideH +
(m + row) * mStrideM + (mStrideN == 0 ? 0 : n),
kvBlockSize,
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
scaling_factor,
mStrideN == 0);
#else
if (mStrideN == 0) {
_scale_attn_mask_fusion_kernel</*is_stride_0*/ true>(
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
mask_data + i * mStrideB + j * mStrideH +
(m + row) * mStrideM,
kvBlockSize,
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
scaling_factor);
} else {
_scale_attn_mask_fusion_kernel</*is_stride_0*/ false>(
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
mask_data + i * mStrideB + j * mStrideH +
(m + row) * mStrideM + n,
kvBlockSize,
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
scaling_factor);
}
#endif
@ -398,28 +685,28 @@ void cpu_flash_attention(
// max per row
tmp_max = at::vec::reduce_all<accum_t>(
[](Vec& x, Vec& y) { return at::vec::maximum(x, y); },
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
kvBlockSize);
} else {
// apply scaling factor and max per row in fusion
_mul_reduce_max_fusion_kernel(
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
scaling_factor,
kvBlockSize,
qk_data + row * kvBlockSize,
qk_data + row * rkvBlockSize,
tmp_max);
}
tmp_max = qk_max_data[row] > tmp_max ? qk_max_data[row] : tmp_max;
if (tmp_max == -std::numeric_limits<accum_t>::infinity()) {
// to avoid `nan = exp2f(-inf - (-inf))`
fill_stub(conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize,
fill_stub(conditional_data_ptr(qk_data, qk_reduced_data) + row * ekvBlockSize,
static_cast<scalar_t>(0), kvBlockSize);
} else {
tmp_sum = tmp_max;
// qk <- exp(qk - max) and sum per row
_exp_reduce_sum_fusion_kernel(
qk_data + row * kvBlockSize, kvBlockSize,
conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize,
qk_data + row * rkvBlockSize, kvBlockSize,
conditional_data_ptr(qk_data, qk_reduced_data) + row * ekvBlockSize,
tmp_sum);
// exp_tmp <- exp(max[row] - max)
exp_tmp = std::exp(qk_max_data[row] - tmp_max);
@ -431,12 +718,40 @@ void cpu_flash_attention(
if (n > 0) {
vec::map<accum_t>(
[exp_tmp](Vec x) { return x * Vec(exp_tmp); },
dst_data + row * headSize, dst_data + row * headSize, headSize);
dst_data + row * rHeadSize,
dst_data + row * rHeadSize,
headSize);
}
}
if (need_pack && kvBlockSize % 2 != 0) {
// Pad: [qSplitSize,kvSplitSize] -> [qSplitSize,kvSplitSize + 1]
*(qk_reduced_data + row * (1 + kvBlockSize) + kvBlockSize) = scalar_t(0);
}
}
// Calculate Softmax(q @ k.T) @ v
cpublas::gemm(
if (need_pack) {
int64_t psize = n / kvSplitSize * ekvSplitSize;
if constexpr (std::is_same_v<scalar_t, at::Half>) {
for (int64_t b = 0; b < headSize; b += packb_size) {
cpublas::brgemm(
qBlockSize,
packb_size,
ekvBlockSize,
ekvBlockSize,
packb_size,
rHeadSize,
1.0,
n == 0 ? 0.f : 1.f,
qk_reduced_data,
value_reorder_ptr +
i * num_head * kv_padding_size * rHeadSize +
j * kv_padding_size * rHeadSize + psize * rHeadSize +
b * ekvBlockSize,
dst_data + b);
}
}
} else {
cpublas::gemm(
TransposeType::NoTranspose,
TransposeType::NoTranspose,
headSize,
@ -451,6 +766,7 @@ void cpu_flash_attention(
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
dst_data,
headSize);
}
}
// dst <- dst / sum[row]
@ -465,7 +781,7 @@ void cpu_flash_attention(
vec::map<scalar_t>(
[sum_reciprocal](Vec x) { return x * Vec(sum_reciprocal); },
out_data + i * oStrideB + j * oStrideH + m * oStrideM + row * oStrideM,
dst_data + row * headSize,
dst_data + row * rHeadSize,
headSize);
}
// Store logsumexp for backward
@ -478,7 +794,9 @@ void cpu_flash_attention(
data_index_step(i, batchSize, j, num_head, k, qSlice);
}
});
if (need_pack) {
cpublas::brgemm_release();
}
}
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size>
@ -826,6 +1144,13 @@ void cpu_flash_attention_backward(
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Half, mask_t, __VA_ARGS__))
#define FLASH_ATTENTION_KERNEL(FNAME, PACK, TYPE1, TYPE2, SEQ1, SEQ2, ...) \
if (PACK) { \
FNAME<TYPE1, TYPE2, SEQ1, SEQ2, true>(__VA_ARGS__); \
} else { \
FNAME<TYPE1, TYPE2, SEQ1, SEQ2>(__VA_ARGS__); \
}
void flash_attention_kernel_impl(
const Tensor& output,
const Tensor& logsumexp,
@ -838,33 +1163,37 @@ void flash_attention_kernel_impl(
std::optional<double> scale) {
auto q_seq_len = query.size(2);
// When q_seq_len and k_seq_len are long enough,
// cpu_flash_attention with pack has better performance.
bool could_pack = (query.scalar_type() == kHalf && cpublas::need_pack(kHalf));
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] {
if (!attn_mask.has_value()) {
if (q_seq_len >= 768) {
cpu_flash_attention<scalar_t, scalar_t, 256, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 256, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
} else if (q_seq_len >= 192) {
cpu_flash_attention<scalar_t, scalar_t, 64, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 64, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
} else {
cpu_flash_attention<scalar_t, scalar_t, 32, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 32, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
}
} else {
AT_DISPATCH_MASK_TYPES(attn_mask.value().scalar_type(), "flash_attention_mask", [&]() {
if (q_seq_len >= 768) {
cpu_flash_attention<scalar_t, mask_t, 256, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 256, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
} else if (q_seq_len >= 192) {
cpu_flash_attention<scalar_t, mask_t, 64, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 64, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
} else {
cpu_flash_attention<scalar_t, mask_t, 32, 512>(
FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 32, 512,
output, logsumexp, query, key, value,
dropout_p, is_causal, attn_mask, scale);
}
@ -873,6 +1202,8 @@ void flash_attention_kernel_impl(
});
}
#undef FLASH_ATTENTION_KERNEL
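To make the dispatch above concrete, here is a small compilable sketch of the same macro pattern with a stand-in kernel; fake_kernel and its printf output are illustrative only, while the real call sites pass the query/key/value tensors through __VA_ARGS__.

#include <cstdio>

template <typename scalar_t, typename mask_t, int q_split, int kv_split, bool packed = false>
void fake_kernel(int q_seq_len) {
  std::printf("q_split=%d kv_split=%d packed=%d q_seq_len=%d\n",
              q_split, kv_split, packed ? 1 : 0, q_seq_len);
}

// Same shape as FLASH_ATTENTION_KERNEL above: PACK selects the specialization
// with the extra `true` template argument.
#define FLASH_ATTENTION_KERNEL(FNAME, PACK, TYPE1, TYPE2, SEQ1, SEQ2, ...) \
  if (PACK) {                                                              \
    FNAME<TYPE1, TYPE2, SEQ1, SEQ2, true>(__VA_ARGS__);                    \
  } else {                                                                 \
    FNAME<TYPE1, TYPE2, SEQ1, SEQ2>(__VA_ARGS__);                          \
  }

int main() {
  const bool could_pack = true;  // e.g. fp16 inputs and cpublas::need_pack(kHalf)
  const int q_seq_len = 1024;
  if (q_seq_len >= 768) {
    FLASH_ATTENTION_KERNEL(fake_kernel, could_pack, float, float, 256, 512, q_seq_len);
  } else if (q_seq_len >= 192) {
    FLASH_ATTENTION_KERNEL(fake_kernel, could_pack, float, float, 64, 512, q_seq_len);
  } else {
    FLASH_ATTENTION_KERNEL(fake_kernel, could_pack, float, float, 32, 512, q_seq_len);
  }
  return 0;
}
#undef FLASH_ATTENTION_KERNEL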
void flash_attention_backward_kernel_impl(
const at::Tensor& grad_q,
const at::Tensor& grad_k,

View File

@ -159,6 +159,12 @@ inline void transpose<float>(int64_t M, int64_t N, const float* src, int64_t ld_
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<float>(M, N, src, ld_src, dst, ld_dst);
}
template <>
inline void transpose<uint16_t>(int64_t M, int64_t N, const uint16_t* src, int64_t ld_src, uint16_t* dst, int64_t ld_dst) {
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<uint16_t>(M, N, src, ld_src, dst, ld_dst);
}
#endif
template <typename index_t, typename F>
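For readers unfamiliar with the (M, N, src, ld_src, dst, ld_dst) convention used by the new uint16_t specialization, a naive reference with the same signature; this is a sketch only, since the specialization above delegates to fbgemm::transpose_simd.

#include <cstdint>
#include <vector>

// src is an M x N row-major matrix with leading dimension ld_src;
// dst receives the N x M transpose with leading dimension ld_dst.
template <typename T>
void transpose_ref(int64_t M, int64_t N, const T* src, int64_t ld_src, T* dst, int64_t ld_dst) {
  for (int64_t i = 0; i < M; ++i) {
    for (int64_t j = 0; j < N; ++j) {
      dst[j * ld_dst + i] = src[i * ld_src + j];
    }
  }
}

int main() {
  std::vector<uint16_t> a = {1, 2, 3, 4, 5, 6};  // 2 x 3, ld_src = 3
  std::vector<uint16_t> b(6);                    // 3 x 2, ld_dst = 2
  transpose_ref<uint16_t>(2, 3, a.data(), 3, b.data(), 2);
  return 0;
}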

View File

@ -964,9 +964,9 @@ ScalingType get_scaling_type(
} // namespace
// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
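A scalar-level sketch of the scaling semantics described in the comment above, assuming per-tensor scales; the real kernel operates on whole Float8 matrices through cuBLASLt, and scaled_mm_element is purely illustrative.

// Inputs are quantized (Float8) values; scale_a / scale_b dequantize them,
// and scale_result is only meaningful when the output itself is Float8.
float scaled_mm_element(float a_q, float b_q,
                        float scale_a, float scale_b,
                        bool out_is_fp8, float scale_result) {
  float acc = (a_q * scale_a) * (b_q * scale_b);  // dequantize, then multiply
  return out_is_fp8 ? acc * scale_result : acc;   // 16/32-bit outputs skip scale_result
}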
@ -1068,9 +1068,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
// Some scaled_gemms require an amax to populate lets create one here
Tensor amax = at::empty({0}, mat1.options().dtype(ScalarType::Float));
#ifdef USE_ROCM
auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -1126,7 +1123,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr;
params.ldc = args.result_ld;
params.c_dtype = out_dtype_;
params.amax_ptr = amax.data_ptr();
params.use_fast_accum = use_fast_accum;
if (transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
@ -1150,11 +1146,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
else
#endif
{
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
// hipBlasLT requires scaleD to be set to something in order to use AMAX
auto dummy_options = TensorOptions().dtype(kFloat).device(kCUDA);
auto dummy_scale = at::ones(1, dummy_options);
#endif
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
@ -1172,14 +1163,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
scale_result ? scale_result->data_ptr() : dummy_scale.data_ptr(),
#else
scale_result ? scale_result->data_ptr() : nullptr,
#endif
args.result_ld,
out_dtype_,
amax.data_ptr(),
use_fast_accum);
}

View File

@ -102,7 +102,7 @@ void col2im_out_cuda_template(
output.resize_({batch_size, n_output_plane, output_height, output_width});
int64_t output_batch_stride = output.stride(0);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
input.scalar_type(), "col2im_out_cuda", [&] {
int64_t height_col = (output_height + 2 * pad_height -
(dilation_height * (kernel_height - 1) + 1)) /

View File

@ -103,7 +103,7 @@ static void im2col_out_cuda_template(
output.resize_({batch_size, n_output_plane, output_length});
// Launch kernel
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
input.scalar_type(), "im2col_out_cuda", [&] {
Tensor input_n;
Tensor output_n;
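The col2im/im2col changes above only extend the dtype dispatch: the AT_DISPATCH_..._AND3 macro instantiates the lambda for one more scalar type (bool). A stripped-down sketch of that mechanism follows, with a stand-in enum and kernel; the real macro expands to a switch over at::ScalarType.

#include <cstdio>

enum class ScalarType { Float, Double, Bool /* Half, BFloat16, ... omitted */ };

template <typename scalar_t>
void im2col_stub() {
  std::printf("kernel instantiated for a %zu-byte element type\n", sizeof(scalar_t));
}

void dispatch(ScalarType t) {
  switch (t) {
    case ScalarType::Float:  im2col_stub<float>();  break;
    case ScalarType::Double: im2col_stub<double>(); break;
    case ScalarType::Bool:   im2col_stub<bool>();   break;  // the newly covered case
  }
}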

View File

@ -1092,7 +1092,11 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
}
constexpr int min_values_per_thread = 16;
#ifndef USE_ROCM
constexpr int max_values_per_thread = 256;
#else
constexpr int max_values_per_thread = 1024;
#endif
if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
// Divide the input across warps in a thread-block, if that leaves at least

View File

@ -22,6 +22,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -43,6 +44,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -86,9 +88,9 @@ using graph_and_tensors = std::tuple<
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
// TODO(eqy): additional options
// std::shared_ptr<fe::graph::Tensor_attributes>, // Bias,
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_Q,
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_KV,
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
@ -104,7 +106,8 @@ using graph_and_tensors_backward = std::tuple<
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias,
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
std::shared_ptr<fe::graph::Tensor_attributes>, // Offset,
std::shared_ptr<fe::graph::Tensor_attributes>, // O,
@ -126,6 +129,8 @@ struct MHAParams {
std::array<int, MAX_MHA_DIM> q_stride;
std::array<int, MAX_MHA_DIM> k_stride;
std::array<int, MAX_MHA_DIM> v_stride;
std::array<int, MAX_MHA_DIM> bias_dim;
std::array<int, MAX_MHA_DIM> bias_stride;
int64_t b;
int64_t h;
int64_t s_q;
@ -135,6 +140,9 @@ struct MHAParams {
double dropout_probability;
bool is_causal;
bool return_softmaxstats;
// might be redundant if we take 0 dim/stride
// as signaling no-bias
bool has_attn_bias;
};
void setMHAParams(
@ -148,6 +156,7 @@ void setMHAParams(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
double dropout_probability,
bool is_causal,
bool return_softmaxstats) {
@ -166,6 +175,7 @@ void setMHAParams(
params.dropout_probability = dropout_probability;
params.is_causal = is_causal;
params.return_softmaxstats = return_softmaxstats;
params.has_attn_bias = attn_bias.has_value();
TORCH_INTERNAL_ASSERT(
q.sizes().size() == MAX_MHA_DIM,
"Q tensor has unexpected number of dims, please report a bug to PyTorch.");
@ -190,6 +200,17 @@ void setMHAParams(
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
// uninit is OK as the struct is memset 0'd
if (params.has_attn_bias) {
std::copy(
attn_bias.value().sizes().begin(),
attn_bias.value().sizes().end(),
params.bias_dim.begin());
std::copy(
attn_bias.value().strides().begin(),
attn_bias.value().strides().end(),
params.bias_stride.begin());
}
}
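Because MHAParams doubles as the cache key, the "uninit is OK as the struct is memset 0'd" note above matters: when no bias is passed, bias_dim and bias_stride must still hold a deterministic byte pattern so that identical bias-free calls hash and compare equal. A reduced sketch of that idea; ParamsSketch and the example shape are hypothetical.

#include <array>
#include <cstring>

struct ParamsSketch {
  std::array<int, 4> bias_dim;
  std::array<int, 4> bias_stride;
  bool has_attn_bias;
};

ParamsSketch make_key(bool has_bias) {
  ParamsSketch p;
  std::memset(&p, 0, sizeof(ParamsSketch));  // zero everything first
  p.has_attn_bias = has_bias;
  if (has_bias) {
    p.bias_dim = {2, 1, 128, 128};           // hypothetical [b, 1, s_q, s_kv] bias
    p.bias_stride = {16384, 16384, 128, 1};
  }
  return p;                                  // byte-wise stable key either way
}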
struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
@ -203,6 +224,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
double dropout_probability,
bool is_causal,
bool return_softmaxstats) {
@ -217,6 +239,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
return_softmaxstats);
@ -285,6 +308,7 @@ auto build_graph_and_tensors(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -301,36 +325,6 @@ auto build_graph_and_tensors(
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(std::vector<int64_t>(
q.sizes().data(), q.sizes().data() + q.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
q.sizes(),
std::vector<int64_t>(
q.strides().data(),
q.strides().data() + q.strides().size()))));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(std::vector<int64_t>(
k.sizes().data(), k.sizes().data() + k.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
k.sizes(),
std::vector<int64_t>(
k.strides().data(),
k.strides().data() + k.strides().size()))));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(std::vector<int64_t>(
v.sizes().data(), v.sizes().data() + v.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
v.sizes(),
std::vector<int64_t>(
v.strides().data(),
v.strides().data() + v.strides().size()))));
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
@ -338,11 +332,6 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
// TODO(eqy): support bias in the future in a follow-up PR
// auto bias = mha_graph->tensor(fe::graph::Tensor_attributes()
// .set_name("bias")
// .set_dim({b, 1, s_q, s_kv})
// .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@ -360,11 +349,30 @@ auto build_graph_and_tensors(
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_dropout(dropout_probability, seed, offset);
// Optional bias in flash attention is only supported from 8.9.3 onwards
if (cudnnGetVersion() >= 8904) {
// scaled_dot_product_flash_attention_options.set_alibi_mask(true);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
}
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
@ -376,20 +384,9 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
// if (cudnnGetVersion() >= 8903) {
// scaled_dot_product_flash_attention_options.set_bias(bias)
// .set_padding_mask(true)
// .set_seq_len_q(seq_q)
// .set_seq_len_kv(seq_kv);
// }
auto [O, Stats] =
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
O->set_output(true)
.set_dim(std::vector<int64_t>(
o.sizes().data(), o.sizes().data() + o.sizes().size()))
.set_stride(std::vector<int64_t>(
o.strides().data(), o.strides().data() + o.strides().size()));
O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
if (Stats) {
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
@ -407,6 +404,7 @@ auto build_graph_and_tensors(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(seed),
std::move(offset),
@ -427,6 +425,7 @@ auto build_graph_and_tensors_backward(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -447,24 +446,6 @@ auto build_graph_and_tensors_backward(
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(std::vector<int64_t>(q.sizes().begin(), q.sizes().end()))
.set_stride(
std::vector<int64_t>(q.strides().begin(), q.strides().end())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(std::vector<int64_t>(k.sizes().begin(), k.sizes().end()))
.set_stride(
std::vector<int64_t>(k.strides().begin(), k.strides().end())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(std::vector<int64_t>(v.sizes().begin(), v.sizes().end()))
.set_stride(
std::vector<int64_t>(v.strides().begin(), v.strides().end())));
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
@ -472,6 +453,31 @@ auto build_graph_and_tensors_backward(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
sdpa_backward_options.set_bias(bias.value());
}
auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@ -482,47 +488,27 @@ auto build_graph_and_tensors_backward(
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto O = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(std::vector<int64_t>(o.sizes().begin(), o.sizes().end()))
.set_stride(
std::vector<int64_t>(o.strides().begin(), o.strides().end())));
auto STATS = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(std::vector<int64_t>(
softmaxstats.sizes().begin(), softmaxstats.sizes().end()))
.set_stride(std::vector<int64_t>(
softmaxstats.strides().begin(), softmaxstats.strides().end()))
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(std::vector<int64_t>(dO.sizes().begin(), dO.sizes().end()))
.set_stride(
std::vector<int64_t>(dO.strides().begin(), dO.strides().end())));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto O = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
if (dropout_probability != 0.0f) {
sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset);
}
auto [DQ, DK, DV] =
mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options);
DQ->set_output(true)
.set_dim(std::vector<int64_t>(dQ.sizes().begin(), dQ.sizes().end()))
.set_stride(
std::vector<int64_t>(dQ.strides().begin(), dQ.strides().end()));
DK->set_output(true)
.set_dim(std::vector<int64_t>(dK.sizes().begin(), dK.sizes().end()))
.set_stride(
std::vector<int64_t>(dK.strides().begin(), dK.strides().end()));
DV->set_output(true)
.set_dim(std::vector<int64_t>(dV.sizes().begin(), dV.sizes().end()))
.set_stride(
std::vector<int64_t>(dV.strides().begin(), dV.strides().end()));
DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@ -534,6 +520,7 @@ auto build_graph_and_tensors_backward(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(Seed),
std::move(Offset),
@ -559,6 +546,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -573,6 +561,11 @@ void run_cudnn_SDP_fprop(
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
auto key = MHACacheKeyWrapper(
b,
h,
@ -583,6 +576,7 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
return_softmaxstats);
@ -605,13 +599,14 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
softmaxstats,
o,
dropoutseed,
dropoutoffset,
handle);
}
auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] =
auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] =
graph_and_tensors_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {
@ -619,13 +614,15 @@ void run_cudnn_SDP_fprop(
{K, k.data_ptr()},
{V, v.data_ptr()},
{attn_scale, &scaling_factor},
//{bias, bias.data_ptr()},
{seed, dropoutseed.data_ptr()},
{offset, dropoutoffset.data_ptr()},
{O, o.data_ptr()}};
if (return_softmaxstats) {
variant_pack[Stats] = softmaxstats.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
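The bias plumbing above follows one pattern end to end: the graph node exists only when attn_bias is present, it travels through the cached tuple as a std::optional, and its device pointer is bound into the variant pack just before execution. A self-contained sketch of that shape; GraphTensor and run_sketch are stand-ins, not cudnn_frontend types.

#include <memory>
#include <optional>
#include <unordered_map>

struct GraphTensor {};  // stand-in for fe::graph::Tensor_attributes
using TensorHandle = std::shared_ptr<GraphTensor>;

void run_sketch(const std::optional<void*>& attn_bias_data) {
  // Build time: only materialize a graph node when a bias was supplied.
  std::optional<TensorHandle> bias;
  if (attn_bias_data.has_value()) {
    bias = std::make_shared<GraphTensor>();
  }
  // Execute time: bind the device pointer only for the node that exists.
  std::unordered_map<TensorHandle, void*> variant_pack;
  if (attn_bias_data.has_value()) {
    variant_pack[bias.value()] = attn_bias_data.value();
  }
}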
@ -647,6 +644,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -655,6 +653,12 @@ void run_cudnn_SDP_bprop(
Tensor& dV,
const Tensor& dropoutseed,
const Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() ||
!softmaxstats.numel()) {
return;
}
Tensor dO_ = dO;
if (!dO.strides()[dO.strides().size() - 1]) {
TORCH_WARN(
@ -694,6 +698,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
true);
@ -715,6 +720,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
o,
dO_,
softmaxstats,
@ -726,8 +732,20 @@ void run_cudnn_SDP_bprop(
handle);
}
auto
[mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] =
graph_and_tensors_backward_values;
[mha_graph,
Q,
K,
V,
bias,
attn_scale,
Seed,
Offset,
O,
Do,
Stats,
Dq,
Dk,
Dv] = graph_and_tensors_backward_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {// inputs
{Q, q.data_ptr()},
@ -746,6 +764,9 @@ void run_cudnn_SDP_bprop(
variant_pack[Seed] = dropoutseed.data_ptr();
variant_pack[Offset] = dropoutoffset.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

View File

@ -18,6 +18,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -36,6 +37,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,

Some files were not shown because too many files have changed in this diff