Compare commits

...

233 Commits

Author SHA1 Message Date
08d8d957e7 [prototype] Invoke subgraph higher order op 2024-09-16 12:43:28 -07:00
a30d5ba16c Fix bug in split-build workflows codegen (#136043)
By just deleting a few rogue lines left over in https://github.com/pytorch/pytorch/pull/135510
If a file in the workflows folder does not have a `.yml` extension, it will not be launched at all, will it?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136043
Approved by: https://github.com/kit1980, https://github.com/atalman
2024-09-13 21:29:06 +00:00
46935c8241 Reduce default iterations to 5 . (#135773)
Running all benchmarks currently takes around 15 minutes; this is the data:
https://www.internalfb.com/phabricator/paste/view/P1583590240
The data looks mostly stable, and 5 iterations should be good, especially with our 1.5% threshold.
That said, the diff also adds a way to increase the number of iterations for a specific benchmark.

Results after the change:
https://www.internalfb.com/phabricator/paste/view/P1583618969
Time is down to about half (7 minutes).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135773
Approved by: https://github.com/ezyang, https://github.com/anijain2305
2024-09-13 21:16:38 +00:00
4f407c1884 Only measure compile time instruction count for sum_floordiv benchmark (#135785)
There was recent strange noise of +5%/-5%.
Using only compile time:
1) avoids GC time, and
2) avoids other operations that are not what we are trying to measure, so noise is less probable.
```
collecting compile time instruction count for sum_floordiv_regression
compile time instruction count for iteration 0 is 8899290248
compile time instruction count for iteration 1 is 1188830489
compile time instruction count for iteration 2 is 1180579615
compile time instruction count for iteration 3 is 1176263131
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135785
Approved by: https://github.com/avikchaudhuri, https://github.com/anijain2305
2024-09-13 21:14:10 +00:00
2e461e54e8 Add gpu and gpu_dynamic versions of add_loop (#135809)
I am thinking maybe 3 iterations are enough for this one?
- I am keeping eager and inductor, since inductor is 2X the eager time.
- Eager dynamic is 2X eager, so keeping this as well.
- Inductor has three tests (dynamic GPU, GPU, and CPU).
I am unsure if I am over-profiling here; happy to trim if anyone has suggestions.
```
collecting compile time instruction count for add_loop_eager
compile time instruction count for iteration 0 is 8213664211
compile time instruction count for iteration 1 is 2798628246
compile time instruction count for iteration 2 is 2796811362
compile time instruction count for iteration 3 is 2794438188
compile time instruction count for iteration 4 is 2794634117
collecting compile time instruction count for add_loop_eager_dynamic
compile time instruction count for iteration 0 is 5724108021
compile time instruction count for iteration 1 is 5499908609
compile time instruction count for iteration 2 is 5569101366
compile time instruction count for iteration 3 is 5493806364
compile time instruction count for iteration 4 is 5493169851
collecting compile time instruction count for add_loop_inductor
compile time instruction count for iteration 0 is 49789381222
compile time instruction count for iteration 1 is 25769347393
compile time instruction count for iteration 2 is 25772594322
compile time instruction count for iteration 3 is 25768695952
compile time instruction count for iteration 4 is 25768032314
collecting compile time instruction count for add_loop_inductor_gpu
compile time instruction count for iteration 0 is 23966942581
compile time instruction count for iteration 1 is 23771950919
compile time instruction count for iteration 2 is 23770784286
compile time instruction count for iteration 3 is 23780160875
compile time instruction count for iteration 4 is 23774634465
collecting compile time instruction count for add_loop_inductor_dynamic_gpu
compile time instruction count for iteration 0 is 41505055086
compile time instruction count for iteration 1 is 41293654089
compile time instruction count for iteration 2 is 41301016100
compile time instruction count for iteration 3 is 41306056207
compile time instruction count for iteration 4 is 41308171566
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135809
Approved by: https://github.com/ezyang, https://github.com/anijain2305
2024-09-13 20:42:31 +00:00
a3d827a28c Use python 3.11 for Large Wheel build (#136042)
Use Python 3.11 in nightly Large wheel builds. Required for Colab testing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136042
Approved by: https://github.com/kit1980, https://github.com/malfet

Co-authored-by: Sergii Dymchenko <kit1980@gmail.com>
2024-09-13 20:27:11 +00:00
4312794b92 [reland][export] fix re-export custom metadata (#135720)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/134778

The previous D62304294 broke some executorch tests. It has already been reverted.

In this diff, `_collect_param_buffer_metadata()` is modified so that when a `call_function` node is encountered and its input nodes include a `get_attr`, we skip the fields that have been collected previously and only collect the remaining fields. This prevents overwriting.
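
A minimal sketch of the "skip already-collected fields" rule described above (hypothetical names, not the actual `_collect_param_buffer_metadata()` code):

```python
def merge_node_metadata(collected: dict, new_fields: dict) -> dict:
    # only record fields that were not collected previously, so earlier
    # entries are never overwritten
    for key, value in new_fields.items():
        collected.setdefault(key, value)
    return collected
```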

Test Plan:
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//executorch/backends/xnnpack/test:test_xnnpack_ops

buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_re_export_preserve_handle

buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_run_decompositions_preserve_handle
```

Differential Revision: D62514208

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135720
Approved by: https://github.com/zhxchen17, https://github.com/jerryzh168
2024-09-13 20:15:15 +00:00
b856f3539b Fix script name in the comments (#135507)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135507
Approved by: https://github.com/atalman
2024-09-13 19:59:47 +00:00
835e7bb077 fix requirements.txt installation failure issue on Windows (#134567)
Fixes #134564

Root cause:

The `lintrunner` wheel released on [pypi.org](https://pypi.org/project/lintrunner/#files) only supports Windows 32-bit and Linux 64-bit. Since compilation of PyTorch requires a 64-bit environment, on Windows `lintrunner` has to be compiled from its source distribution. `Rust` is its compilation dependency, as indicated in the error message. Meanwhile, a Visual Studio environment is needed for linking libraries.

![image](https://github.com/user-attachments/assets/180cd899-8886-43b5-b42f-031f41e81683)

The issue when performing `pip install lintrunner` without a Visual Studio environment activated is shown below.

```bash
>python -m pip install lintrunner
Collecting lintrunner
  Downloading lintrunner-0.12.5.tar.gz (62 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: lintrunner
  Building wheel for lintrunner (pyproject.toml) ... error
  error: subprocess-exited-with-error

  × Building wheel for lintrunner (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [137 lines of output]
      Running `maturin pep517 build-wheel -i C:\Users\\miniforge3\envs\py310\python.exe --compatibility off`
      📡 Using build options bindings from pyproject.toml
         Compiling proc-macro2 v1.0.79
         Compiling unicode-ident v1.0.12
         Compiling version_check v0.9.4
         Compiling windows_x86_64_msvc v0.52.4
         Compiling winapi v0.3.9
         Compiling serde v1.0.197
         Compiling autocfg v1.2.0
         Compiling syn v1.0.109
         Compiling lazy_static v1.4.0
         Compiling libc v0.2.153
         Compiling equivalent v1.0.1
         Compiling hashbrown v0.14.3
         Compiling memchr v2.7.2
         Compiling yansi v1.0.1
         Compiling unicode-width v0.1.11
         Compiling regex-syntax v0.8.3
         Compiling encode_unicode v0.3.6
         Compiling cfg-if v1.0.0
         Compiling winnow v0.6.5
         Compiling cc v1.0.92
      error: could not compile `windows_x86_64_msvc` (build script) due to 2 previous errors
      warning: build failed, waiting for other jobs to finish...
      error: could not compile `serde` (build script) due to 2 previous errors
      error: could not compile `proc-macro2` (build script) due to 2 previous errors
      error: could not compile `syn` (build script) due to 2 previous errors
      error: could not compile `libc` (build script) due to 2 previous errors
      error: could not compile `winapi` (build script) due to 2 previous errors
      💥 maturin failed
        Caused by: Failed to build a native library through cargo
        Caused by: Cargo build finished with "exit code: 101": `cargo rustc --manifest-path Cargo.toml --message-format json --release --bins --`
      📦 Including license file "LICENSE"
      🔗 Found bin bindings
      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      error: linker `link.exe` not found
        |
        = note: program not found

      note: the msvc targets depend on the msvc linker but `link.exe` was not found

      note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.

      note: VS Code is a different product, and is not sufficient.

      error: aborting due to 1 previous error

      Error: command ['maturin', 'pep517', 'build-wheel', '-i', 'C:\\Users\\\\miniforge3\\envs\\py310\\python.exe', '--compatibility', 'off'] returned non-zero exit status 1
      [end of output]

  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for lintrunner
Failed to build lintrunner
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (lintrunner)
```
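A possible manual workaround (my assumption, not part of this PR): install Rust via rustup and run the install from a shell where the MSVC toolchain is activated, so `link.exe` is on `PATH`:

```bash
# assumes Visual Studio 2019/2022 or Build Tools with the C++ workload are installed
# open an "x64 Native Tools Command Prompt for VS" (this runs vcvars64.bat), then:
rustup default stable            # Rust toolchain from https://rustup.rs
python -m pip install lintrunner
```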
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134567
Approved by: https://github.com/malfet
2024-09-13 18:43:55 +00:00
b6d6aa49b8 Revert "Validate input types for torch.nn.Linear and torch.nn.Bilinear (#135596)"
This reverts commit e157ce3ebbb3f30d008c15914e82eb74217562f0.

Reverted https://github.com/pytorch/pytorch/pull/135596 on behalf of https://github.com/malfet due to It's too restrictive, should allow other int-like types, such as `numpy.int64` ([comment](https://github.com/pytorch/pytorch/pull/135596#issuecomment-2349714104))
2024-09-13 18:06:56 +00:00
deee21cb78 Revert "[Inductor] Rename cpp_wrapper_cuda.py as cpp_wrapper_gpu.py (#135313)"
This reverts commit 16b37b309f64ddd4e498c57a99191e1d9b3dfdac.

Reverted https://github.com/pytorch/pytorch/pull/135313 on behalf of https://github.com/izaitsevfb due to breaks internal builds ([comment](https://github.com/pytorch/pytorch/pull/135313#issuecomment-2349662091))
2024-09-13 17:53:21 +00:00
3f69410976 [gpu-profiler] Expose active and repeat in os env var (#135757)
Summary: https://fb.workplace.com/groups/ai.efficiency.tools.users/permalink/1855136444971825/

Test Plan:
`buck2 test mode/opt caffe2/test:profiler -- -r test_kineto_profiler_api `

eyes

Differential Revision: D62529249

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135757
Approved by: https://github.com/Yuzhen11
2024-09-13 17:48:27 +00:00
18f9331e5d Revert "[aoti] Fix workspace generation for triton (#135552)"
This reverts commit d3833253928f29ed760b2dccac2b730028a868ca.

Reverted https://github.com/pytorch/pytorch/pull/135552 on behalf of https://github.com/izaitsevfb due to blocks revert of #135313, internal failures, see D62511427 ([comment](https://github.com/pytorch/pytorch/pull/135552#issuecomment-2349641372))
2024-09-13 17:47:36 +00:00
bc0f330169 [trymerge] Manually close merged PR when Github fails (#135890)
Manually close merged PR when Github fails to do it.

Consequences of the current design:
Sleeping for 1 minute ties up the machine and might result in race conditions; the merging label gets removed a bit later; and the PR is still left open if this API fails too (i.e., there is no async clean-up job).

Tested in https://github.com/malfet/deleteme/pull/92 by removing the part of the commit message that has "resolved #pr num"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135890
Approved by: https://github.com/malfet, https://github.com/huydhn
2024-09-13 17:29:24 +00:00
7834c0bb2c [AOTI][Tooling] Add stats summary (mean/min/max, etc) for jit inductor tensor value printing (#135887)
Summary:
As title. This is a follow-up to add a stats summary (mean/min/max, etc.) for JIT Inductor tensor value printing as well.

The Inductor Python wrapper code-level printing would look something like this:

 {F1859224287}

Test Plan: CI

Reviewed By: chenyang78

Differential Revision: D62415575

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135887
Approved by: https://github.com/chenyang78
2024-09-13 17:19:25 +00:00
6ef49fe8f1 Revert "Pass ideep:lowp_kind to matmul_forward::compute on cache misses (#135058)"
This reverts commit 3d2431380999252d5401f83d5010b398a32e7597.

Reverted https://github.com/pytorch/pytorch/pull/135058 on behalf of https://github.com/malfet due to It regresses x86 performance ([comment](https://github.com/pytorch/pytorch/pull/135058#issuecomment-2349480861))
2024-09-13 17:09:45 +00:00
a15774563b [ROCm] Enable ROCm support for inductor's dynamic_rblock_scaling (#129663)
As of ROCm 6.1 [hipDeviceProp_t::regsPerMultiprocessor](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/structhip_device_prop__t.html#a7390d5b180d63978c81aa971060270b4) is now available allowing us to enable this attribute on ROCm.
```
>>> torch.cuda.get_device_properties(0)
_CudaDeviceProperties(name='AMD Instinct MI250X/MI250', major=9, minor=0, gcnArchName='gfx90a:sramecc+:xnack-', total_memory=65520MB, multi_processor_count=104)
>>> torch.cuda.get_device_properties(0).regs_per_multiprocessor
65536
```

With https://github.com/triton-lang/triton/pull/3962 we can extract n_regs and n_spills from a Triton binary with the AMD backend, allowing us to enable inductor's dynamic_rblock_scaling on ROCm, initially implemented in https://github.com/pytorch/pytorch/pull/115094
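
For illustration, a rough sketch of the kind of occupancy reasoning this attribute enables (assumed numbers, not the actual inductor heuristic):

```python
import torch

props = torch.cuda.get_device_properties(0)   # works for ROCm devices as well
if hasattr(props, "regs_per_multiprocessor"):
    regs_per_sm = props.regs_per_multiprocessor          # 65536 on MI250X, per the output above
    max_threads = props.max_threads_per_multi_processor
    n_regs = 64   # hypothetical per-thread register count reported by the Triton backend
    # how many threads can be resident given the register budget
    resident_threads = min(max_threads, regs_per_sm // n_regs)
    print(resident_threads)
```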

Leaving this in draft until following PRs have landed:
- https://github.com/pytorch/pytorch/pull/129361 to bump the triton commit pin
- https://github.com/pytorch/pytorch/pull/128449 to allow us to grab warp_size from device properties instead of hard coding 64 on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129663
Approved by: https://github.com/jansel, https://github.com/shunting314
2024-09-13 16:45:39 +00:00
564d00f364 Revert "Fix clang-tidy warnings in Caffe2 code (#134935)"
This reverts commit 7cfd23636c8fa6fcbb8bf3ea34e15b847ec9ad9d.

Reverted https://github.com/pytorch/pytorch/pull/134935 on behalf of https://github.com/izaitsevfb due to breaks internal builds, caffe2 is still used internally ([comment](https://github.com/pytorch/pytorch/pull/134935#issuecomment-2349368152))
2024-09-13 16:42:37 +00:00
ae02d663cd [FlexAttention] Fix output layout (#135882)
We previously only supported the case where the v_head dim equals the qk_head dim. When we allowed different head dims, I accidentally kept the same query strides for the output. This PR fixes this bug and also ensures that we always produce output in the same stride order as the input query.
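
A minimal sketch of "allocate the output in the query's stride order" (illustrative only; not the FlexAttention kernel code, and the helper name is made up):

```python
import torch

def empty_in_stride_order_of(query: torch.Tensor, v_head_dim: int) -> torch.Tensor:
    # output has query's shape with the last dim replaced by the value head dim,
    # laid out physically in the same dim order as query's strides
    shape = list(query.shape[:-1]) + [v_head_dim]
    order = sorted(range(query.dim()), key=lambda d: query.stride(d), reverse=True)
    return torch.empty_permuted(shape, order, dtype=query.dtype, device=query.device)

q = torch.randn(2, 4, 8, 16).transpose(1, 2)   # non-contiguous stride order
out = empty_in_stride_order_of(q, v_head_dim=32)
assert out.shape == (2, 8, 4, 32)
```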

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135882
Approved by: https://github.com/yanboliang, https://github.com/Chillee
2024-09-13 16:36:05 +00:00
ad2f0e9f81 Add remote cache time saved to compilation metrics (#135490)
Summary:
Record remote cache time saved via frame_phase_timing

We add to the "phase" when a remote cache hit saves us time, so that we have a 1:1 correspondence between a frame and the time saved.

Test Plan:
Internally run benchmark, see that it's populated in sandbox table after previous diff lands and logger config is actualized.

Show that column exists in table:

https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/fp2te0ff

Note that an earlier version of D62105258 had the column as a string, so the staging table is a bit messed up. But you can see that the most recent samples have the column populated as a float.

Reviewed By: aorenste

Differential Revision: D62106921

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135490
Approved by: https://github.com/aorenste
2024-09-13 16:35:51 +00:00
21ffa18ad1 Fix "expand: SymIntArrayRef expected to contain only concrete integers" in AOTInductor (#135933)
Internal xref:
https://fb.workplace.com/groups/1075192433118967/permalink/1501860707118802/

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135933
Approved by: https://github.com/angelayi
2024-09-13 15:23:42 +00:00
eqy
2519e5a8de [CUDA][FP8] Skip rowwise scaling test on sm89 (#135718)
Same reason as https://github.com/pytorch/pytorch/pull/133612: the rowwise scaling implementation is sm90+ specific (e.g., it uses TMA).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135718
Approved by: https://github.com/Skylion007
2024-09-13 15:07:20 +00:00
ba6e0f31ab Remove cycle dependency by localizing the import. (#135926)
Summary:
Since https://www.internalfb.com/diff/D62215095 landed, there have been many silent errors due to the dependency between functional_tensor and config.

```
 File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/__init__.py", line 64, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/dynamic_shapes.py", line 23, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/exported_program.py", line 26, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/__init__.py", line 1, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/cond.py", line 6, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_subclasses/functional_tensor.py", line 9, in <module>
  File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_inductor/config.py", line 44, in <module>
```

https://fburl.com/logarithm/ol5kx0ee
complains about a circular dependency.

This fixes it.

Test Plan: buck test multipy/runtime:test_deploy_embedded_cuda_interp_without_cuda_available -- --run-disabled TorchpyTest.AcquireMultipleSessionsInDifferentPackages

Reviewed By: aorenste

Differential Revision: D62616765

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135926
Approved by: https://github.com/aorenste, https://github.com/oulgen, https://github.com/Skylion007
2024-09-13 15:05:41 +00:00
7ed0563cad Revert "[Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)"
This reverts commit e504fb70693d4a3741c3380b6a989d441e84f737.

Reverted https://github.com/pytorch/pytorch/pull/134732 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:58 +00:00
eb7dd91dd1 Revert "[Dynamo] Trace torch function modes entered outside of torch.compile (#133137)"
This reverts commit fafdd588f27e1d56090c6d260d0382c255eaf9eb.

Reverted https://github.com/pytorch/pytorch/pull/133137 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:58 +00:00
3f30360d05 Revert "[Dynamo] Support thread local setattr (#135443)"
This reverts commit 30b007bea329f512af3dc4fd4e6c7d145e807b71.

Reverted https://github.com/pytorch/pytorch/pull/135443 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:58 +00:00
4734e356d6 Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
This reverts commit 0c080cb2c78a85a5320fbeadbbb9a2cc640fd89d.

Reverted https://github.com/pytorch/pytorch/pull/135444 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:57 +00:00
ac169795a9 Revert "[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)"
This reverts commit 2af3b8ffd84e36b91279174e9106f84b2d2a11f2.

Reverted https://github.com/pytorch/pytorch/pull/135422 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:57 +00:00
fca58bfda1 Revert "[Dynamo] Remove ignored modes workaround (#135502)"
This reverts commit 7d5e0dd4b1a8d20fc8624b3085a6f5ddedd89a2e.

Reverted https://github.com/pytorch/pytorch/pull/135502 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:57 +00:00
dc71e7a7d4 Revert "[Dynamo] Remove ignored modes from torch function mode stack guard (#135503)"
This reverts commit c56728b643e2b7d796abd7ec45803319e1c5967d.

Reverted https://github.com/pytorch/pytorch/pull/135503 on behalf of https://github.com/albanD due to Broke tests on main ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2348886378))
2024-09-13 12:52:57 +00:00
1cdf658f4a Revert "[PT2][inductor][Optimus] Add pad_aten_mm_pass pattern to resolve long computation kernel in LCE (#135167)"
This reverts commit eb0fe029337b31bcb3d4b2d1e539895393975d68.

Reverted https://github.com/pytorch/pytorch/pull/135167 on behalf of https://github.com/jithunnair-amd due to Broke ROCm CI eg. https://github.com/pytorch/pytorch/actions/runs/10845542664/job/30097957154 ([comment](https://github.com/pytorch/pytorch/pull/135167#issuecomment-2348847595))
2024-09-13 12:35:05 +00:00
b5c52e96e8 Revert "[dynamo] Fix support for classmethod(property(...)) (#134968)"
This reverts commit bf68e16e94fc05f10d434cdc162a14d02c6ad23c.

Reverted https://github.com/pytorch/pytorch/pull/134968 on behalf of https://github.com/jithunnair-amd due to Broke ROCm CI: eg. https://github.com/pytorch/pytorch/actions/runs/10845542664/job/30097956613 ([comment](https://github.com/pytorch/pytorch/pull/134968#issuecomment-2348837553))
2024-09-13 12:29:03 +00:00
ea2ecab15b [AOTI][reland] Fix assert_function call in cpu autotune template (#135920)
Summary: Reland https://github.com/pytorch/pytorch/pull/135086. In the ABI-compatible mode, assert_function should be AOTI_TORCH_CHECK.

Test Plan: CI

Differential Revision: D62500592

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135920
Approved by: https://github.com/chenyang78
2024-09-13 12:21:57 +00:00
2f53d570fe Update document for autocast on CPU (#135299)
Update the documentation for autocast on CPU, reflecting the added float16 support and changes in the operator list.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135299
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/svekars
2024-09-13 09:11:47 +00:00
31007cf200 [Distributed] add FP8 support to NaN checker (#135891)
Adding support for `torch.float8_e4m3fn` and `torch.float8_e5m2`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135891
Approved by: https://github.com/wconstab
2024-09-13 08:43:54 +00:00
c56728b643 [Dynamo] Remove ignored modes from torch function mode stack guard (#135503)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135503
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137, #135443, #135444, #135422, #135502
2024-09-13 08:41:32 +00:00
7d5e0dd4b1 [Dynamo] Remove ignored modes workaround (#135502)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135502
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137, #135443, #135444, #135422
2024-09-13 08:41:32 +00:00
2af3b8ffd8 [Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
This PR implements tracing of with contexts with TorchFunction modes which have the default enter/exit behavior (ie pushing/popping the mode)

Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call

resume fn structure:
1. enter context
2. jump
...
3. exit context

The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).

So for torch function modes the structure of our output code is this:

1. graph call
2. mutate tf mode stack to replay mutations
3. unsupported code
4. on exception restore stack
5. resume function

Then our resume fn looks like this:

1. no-op enter torch function mode
2. jump
3.  exit tf mode

To implement the no-op enter of the torch function mode, I added a torch function mode polyfill which no-op enters but exits normally. This is needed because we still want to trace the `with` context in the resume function and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
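
A minimal sketch of the "no-op enter, normal exit" idea (illustrative semantics only, not dynamo's actual polyfill):

```python
from torch.overrides import TorchFunctionMode

class NoopEnterMode(TorchFunctionMode):
    """Assumes the mode is already on the torch function mode stack (pushed by
    the side-effects bytecode), so entering must not push it a second time,
    while exiting pops it as usual."""

    def __torch_function__(self, func, types, args=(), kwargs=None):
        return func(*args, **(kwargs or {}))

    def __enter__(self):
        # skip the usual push onto the torch function mode stack
        return self

    # __exit__ is inherited unchanged: it pops the mode from the stack
```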

Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally, at a graph break we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However, once again, in the torch function mode case, in the event of a graph break we do not want to perform any exit side effects, because we want to preserve the state of the mode stack as-is so that we will properly update the stack with the bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack and not update the true Python torch function mode stack with the suffix bytecode.

All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side-effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
2024-09-13 08:41:24 +00:00
0c080cb2c7 [Dynamo] Simplify torch function mode stack guard (#135444)
The semantics of ignored modes previously had edge cases; this eliminates them by, in essence, filtering any ignored modes out of both the reference stack and the current torch function mode stack. This is purely to reduce complexity in #135422. The ignored-modes handling will be removed in a future PR after https://github.com/pytorch/pytorch/pull/135422 lands, since we will then trace through DeviceContexts instead of inserting them into the graph, which needed these extra workarounds for correctness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135444
Approved by: https://github.com/anijain2305, https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443
2024-09-13 08:41:17 +00:00
30b007bea3 [Dynamo] Support thread local setattr (#135443)
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66)), this PR adds support for calling the `setattr` of thread-local objects. These objects have a slots-based implementation, and since this doesn't appear to have any side effects, we call this `setattr` implementation when replaying mutations, because calling `object.__setattr__` on these objects results in a TypeError.
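
For illustration, the TypeError mentioned above can be reproduced with a plain `threading.local` (not dynamo code):

```python
import threading

t = threading.local()
t.x = 1                              # regular setattr works
try:
    object.__setattr__(t, "y", 2)    # the generic setattr cannot be applied here
except TypeError as e:
    print(e)                         # "can't apply this __setattr__ to _thread._local object"

# replaying the mutation therefore goes through the type's own __setattr__:
type(t).__setattr__(t, "y", 2)
assert t.y == 2
```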

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
2024-09-13 08:41:07 +00:00
fafdd588f2 [Dynamo] Trace torch function modes entered outside of torch.compile (#133137)
This PR adds initial tracing for torch function modes.

Details:
In essence, this adds tracing into the torch function of modes entered outside of the torch.compile call.
This does not yet support tracing enter/exit of a torch function mode or tracing set_default_device properly using the new mode infra (this will be a very good stress test for modes). I am adding more PRs to this stack to support these. The overall plan is to support tracing enter/exit and handling graph breaks like we do for other torch.* context managers.

Previously landed:
https://github.com/pytorch/pytorch/pull/133135
https://github.com/pytorch/pytorch/pull/133136
https://github.com/pytorch/pytorch/pull/133134
https://github.com/pytorch/pytorch/pull/133133
https://github.com/pytorch/pytorch/pull/133132
https://github.com/pytorch/pytorch/pull/133131
https://github.com/pytorch/pytorch/pull/133729
https://github.com/pytorch/pytorch/pull/133130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133137
Approved by: https://github.com/jansel, https://github.com/zou3519
ghstack dependencies: #134732
2024-09-13 08:41:00 +00:00
e504fb7069 [Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)
For tracing cond/while in eager, we trace the HOP with the eager backend with the metadata torch function mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it cannot be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
2024-09-13 08:40:50 +00:00
b346e99376 remove fast_flush arguments (#135387)
I've removed them from upstream Triton in https://github.com/triton-lang/triton/pull/4485. It looks like most places in the code use the default value of `fast_flush=True` anyway, though there are two PRs from @pearu that use `False`. To my knowledge, there's no reason to use the `False` value.

Differential Revision: [D62325778](https://our.internmc.facebook.com/intern/diff/D62325778)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135387
Approved by: https://github.com/nmacchioni, https://github.com/jansel
2024-09-13 08:13:46 +00:00
7dc1788396 [inductor] Remove the batch fusion passes from being a default (#135922)
The Ads team does a search internally to figure out which fusion passes to use.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135922
Approved by: https://github.com/eellison, https://github.com/yanboliang
ghstack dependencies: #135819
2024-09-13 06:07:33 +00:00
9fd54d787d [Inductor UT] Generalize device-bias code in test_triton_kernels.py introduced in #135530 (#135656)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135656
Approved by: https://github.com/EikanWang, https://github.com/zou3519
2024-09-13 05:27:56 +00:00
b38be727eb [Inductor UT] Generalize inductor UT for intel GPU (Part 2) (#134556)
[Inductor UT] Reuse Inductor test cases for Intel GPU:
- `test/inductor/test_torchinductor_opinfo.py`
- `test/inductor/test_minifier_isolate.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134556
Approved by: https://github.com/etaf, https://github.com/eellison
2024-09-13 05:16:28 +00:00
e54b559e88 [inductor] More fixes on the keys of constants and signature dictionaries (#135406)
The previous PR forgot to change two other places that also create `constants` and `signature`: https://github.com/pytorch/pytorch/pull/135170

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135406
Approved by: https://github.com/jansel
2024-09-13 04:10:41 +00:00
eea5e6ff0f [DCP][DSD] Add a test case to demonstrate the workaround to load full state dict into a 2D model (#135763)
Fix https://github.com/pytorch/pytorch/issues/134095

This is a workaround for loading a full state dict into an FSDP1+TP 2D model.
Since named_parameters() in FSDP1 does not return DTensor, we don't have the information to shard the full_state_dict and load it directly into the 2D model. In order to load a full state dict into an FSDP1+TP 2D model, we need to:
- load the full state dict into a 1D FSDP model
- dcp.save the full/shard state dict into storage
- initialize a 2D FSDP1+TP model
- get the default sharded state dict for the 2D model (full_state_dict=False)
- dcp.load the state dict from storage
- load the state dict into the 2D model
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135763
Approved by: https://github.com/fegin
ghstack dependencies: #135725
2024-09-13 03:51:14 +00:00
6df91b5917 real tensor prop for composite ops (#135717)
Fixes #135632

Adds real tensor propagation for decompositions, checking any symbols on their outputs

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135717
Approved by: https://github.com/ezyang
2024-09-13 03:35:16 +00:00
0cdc6a8dcd [DSD] Fix distributed state dict full_state_dict option hang during set_state_dict (#135725)
Fix https://github.com/pytorch/pytorch/issues/134095
This fixes the distributed state dict full_state_dict option hanging during set_state_dict. We switch `_distribute_tensors` in _state_dict_utils.py to use `DTensor.from_local` instead of `distribute_tensor` to support the FSDP2+TP 2D strided-sharding use case, as `distribute_tensor` cannot handle strided sharding yet. `distribute_tensor` incurs a scatter behind the scenes, while `DTensor.from_local` takes the local slice from the full tensor on each rank to create the DTensor (no collective). This means it's the user's responsibility to make sure the full_tensor from the full_state_dict is the same across all ranks.
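A hedged sketch of the difference between the two APIs described above (assumes an initialized process group and an evenly divisible tensor; not the code in `_state_dict_utils.py`):

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard, distribute_tensor

mesh = init_device_mesh("cuda", (dist.get_world_size(),))
full = torch.randn(16, 8)  # full_state_dict tensor, assumed identical on every rank

# distribute_tensor: scatters shards from the source rank (a collective)
dt_scatter = distribute_tensor(full, mesh, [Shard(0)])

# DTensor.from_local: each rank wraps its own local slice, no collective,
# so correctness relies on `full` being the same on all ranks
local = full.chunk(dist.get_world_size(), dim=0)[dist.get_rank()]
dt_local = DTensor.from_local(local, mesh, [Shard(0)])
```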
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135725
Approved by: https://github.com/fegin
2024-09-13 03:26:36 +00:00
6cdc70bccd [ROCm] skip test_fp8_cast_and_t on non-MI300 machines (#135917)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135917
Approved by: https://github.com/malfet
2024-09-13 02:46:48 +00:00
e6b68359d7 Fix xpu memory stats error (#135818)
# Motivation
fix https://github.com/pytorch/pytorch/issues/135726
After merging two free blocks, I made the mistake of using the wrong size to decrease the active memory size; it should be the original block size instead of the merged block size.

# Additional Context
Add a UT to guard this scenario.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135818
Approved by: https://github.com/EikanWang
2024-09-13 02:41:21 +00:00
1c04cbfba6 [BE] Use C10_UNUSED (#135914)
Instead of `(void)foo; // Suppress unused variable`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135914
Approved by: https://github.com/huydhn, https://github.com/eqy
2024-09-13 02:27:07 +00:00
062681a0ed [Profiler] Torch Profiler distributed info is not JSON serializable (#135548)
Summary: To fix https://github.com/pytorch/pytorch/issues/133308 we must create an encoder for numpy values so we can serialize the distributed metadata to JSON.

Test Plan: Added unit test to check that numpy values can be serialized

Differential Revision: D62411619

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135548
Approved by: https://github.com/aaronenyeshi, https://github.com/albanD
2024-09-13 02:22:33 +00:00
8c356ce3da Fix lint errors in fbcode (#135614)
Summary: Fixed a bunch of fbcode imports that happened to work but confused autodeps. After this, autodeps still suggests "improvements" to TARGETS (which breaks our builds), but at least it can find all the imports.

Test Plan:
```
fbpython fbcode/tools/build/buck/linters/lint_autoformat.py --linter=autodeps --default-exec-timeout=1800 -- fbcode/caffe2/TARGETS fbcode/caffe2/test/TARGETS
```
Before:
```
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/testing.py:229) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fbur$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export.py:87) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_serdes.py:9) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fb$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_serdes.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_retraceability.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https:$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_retraceability.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See ht$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_nonstrict.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See http$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_nonstrict.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:8) when processing rule "test_export". Please make sure it's listed in the srcs parameter of an$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Found "//python/typeshed_internal:typeshed_internal_library" owner for "cv2" but it is protected by visibility rules: [] (from caffe2/test/test_bundled_images.py:7) when processing rule "test_bundled_$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "caffe2.test.profiler_test_cpp_thread_lib" (from caffe2/test/profiler/test_cpp_thread.py:29) when processing rule "profiler_test_cpp_thread". Please make sure it's listed in t$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_custom_ops.py:23) when processing rule "custom_ops". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_public_bindings.py:13) when processing rule "public_bindings". Please make sure it's listed in the srcs paramete$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.symbolize_tracebacks" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.gather_traceback" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another rule$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for include <torch/csrc/autograd/profiler_kineto.h> (from caffe2/test/profiler/test_cpp_thread.cpp:2) when processing profiler_test_cpp_thread_lib.  Some things to try:
```

Differential Revision: D62049222

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135614
Approved by: https://github.com/oulgen, https://github.com/laithsakka
2024-09-13 02:04:34 +00:00
bf68e16e94 [dynamo] Fix support for classmethod(property(...)) (#134968)
Fixes #134451

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134968
Approved by: https://github.com/yanboliang
2024-09-13 01:14:18 +00:00
eqy
d732df7e56 [Inductor] Disable TF32 in test_slice_scatter_reinplace (#135709)
TF32 linear/matmul numerics seem unrelated to the test's functionality, so disable it here to abate noisy failures.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135709
Approved by: https://github.com/eellison
2024-09-13 00:30:45 +00:00
c9de2efde6 [Docs] fix inconsistent docs in conv1d, conv2d, and conv3d (#135894)
Addresses https://github.com/pytorch/pytorch/issues/135880
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135894
Approved by: https://github.com/mikaylagawarecki, https://github.com/malfet
2024-09-13 00:19:42 +00:00
1f15c0c7a5 [fx] Replace _snake_case with a regexp (#135822)
~2x speedup on this function, though it saves <0.5s overall.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135822
Approved by: https://github.com/oulgen
ghstack dependencies: #135787, #135788, #135820, #135821
2024-09-13 00:18:41 +00:00
a72124add9 [fx] Minor optimization in create_arg (#135821)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135821
Approved by: https://github.com/oulgen
ghstack dependencies: #135787, #135788, #135820
2024-09-13 00:18:41 +00:00
10ca4c0564 [inductor] Use TracerBase directly in LoopBody (#135820)
This skips some unneeded work in the subclass.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135820
Approved by: https://github.com/oulgen
ghstack dependencies: #135787, #135788
2024-09-13 00:18:41 +00:00
d3aab9642b [inductor] Optimize can_fuse_vertical() (#135788)
An O(n^2) to O(n) improvement by not comparing all pairs of deps.

Before:
![image](https://github.com/user-attachments/assets/797cd1bd-5d53-4374-8e76-ffce4232d7f9)

After:
![image](https://github.com/user-attachments/assets/1e61bf29-adba-41a4-839e-f028130fa979)
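
A minimal sketch of the bucketing idea behind this change (illustrative only, not the actual `can_fuse_vertical()` code):

```python
from collections import defaultdict

def matching_dep_pairs(writes, reads):
    # index the writes by name once, then look each read up directly:
    # O(len(writes) + len(reads)) instead of comparing every (read, write) pair
    writes_by_name = defaultdict(list)
    for w in writes:
        writes_by_name[w.name].append(w)
    return [(r, w) for r in reads for w in writes_by_name.get(r.name, ())]
```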

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135788
Approved by: https://github.com/oulgen
ghstack dependencies: #135787
2024-09-13 00:18:41 +00:00
67a929eea8 [inductor] Remove unused check (#135787)
I think this is unreachable code because mode is always None on reads.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135787
Approved by: https://github.com/oulgen
2024-09-13 00:18:41 +00:00
f576960bbc do not expand in replace/simplify if no changes (#135863)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135863
Approved by: https://github.com/ezyang
2024-09-13 00:12:01 +00:00
1aba224cfd Update nightly PyTorch version to 2.6.0 (#135916)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135916
Approved by: https://github.com/kit1980
2024-09-13 00:08:52 +00:00
d383325392 [aoti] Fix workspace generation for triton (#135552)
Fixes #131337

- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
    workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
    workspace.zero_()
    .....
    triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
    del buf2, arg0_1, arg1_1, workspace
```
-  add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.

The generated cpp has lines like the ones below, so we also implement a `zero_()` for `AtenTensorHandle`.

```cpp
    static constexpr int64_t int_array_0[] = {1280L, };
    static constexpr int64_t int_array_1[] = {1L, };
    AtenTensorHandle workspace_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda,  0, &workspace_handle));

        RAIIAtenTensorHandle workspace(workspace_handle);
        workspace.zero_();
```

- Fix handling of grid_fn for grid computation. Pass in "RBLOCK" to `split_scan_grid`.
- Fix dynamic shapes:
Without the fix, we generate code that looks like `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)` when doing triton autotune, where `s0` is not defined.

The solution approach is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotune. Note that we only realize it for triton autotune code, but not for the cpp cuda code.

- We also generate slightly different cpp code depending on whether `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs

```cpp
    at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
    workspace.zero_();
```

Test Plan:

```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1  python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1  python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
2024-09-12 23:53:09 +00:00
00dc7d4356 fix compiled_autograd deadlock throw (#135795)
Fixes #135298

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135795
Approved by: https://github.com/xmfan
2024-09-12 23:24:57 +00:00
1760bbc259 [FlexAttention] Ensure q/k/v and block_mask on excact the same device (#135823)
Fixes #134739

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135823
Approved by: https://github.com/BoyuanFeng
2024-09-12 23:11:01 +00:00
fb9d8e3248 [ROCm] Use ieee precision for fp32 in flex attention (#135702)
3bebc09be9 brought in a change to flex_attention to allow TF32 precision. TF32 largely lacks support on the ROCm side, so we should use ieee.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135702
Approved by: https://github.com/jeffdaily, https://github.com/drisspg
2024-09-12 23:00:48 +00:00
aaabfc8930 [Easy] Check if quant registered in constant folding (#135875)
Belated fix for https://github.com/pytorch/pytorch/issues/110904

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135875
Approved by: https://github.com/shunting314
2024-09-12 22:16:39 +00:00
63d6cd351a [dynamo] support torch.nn.attention.sdpa_kernel context manager (#135404)
Fixes https://github.com/pytorch/pytorch/issues/134608

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135404
Approved by: https://github.com/jansel, https://github.com/drisspg
2024-09-12 22:04:48 +00:00
3de9e474df Revert "Check function declarations of Core ML code (#135467)"
This reverts commit bc1b8f094d24de27432f4c29f0729e85a6b5ba63.

Reverted https://github.com/pytorch/pytorch/pull/135467 on behalf of https://github.com/malfet due to This breaks ios periodic jobs, see https://github.com/pytorch/pytorch/actions/runs/10797026668/job/29947377532 ([comment](https://github.com/pytorch/pytorch/pull/135467#issuecomment-2347322784))
2024-09-12 22:04:35 +00:00
3e1a4ea132 Revert "[DSD] Fix distributed state dict full_state_dict option hang during set_state_dict (#135725)"
This reverts commit 83c594ebd6dfa517fdd67ae23929cc60d5fa325d.

Reverted https://github.com/pytorch/pytorch/pull/135725 on behalf of https://github.com/ZainRizvi due to This is breaking lint. See [GH job link](https://github.com/pytorch/pytorch/actions/runs/10835983999/job/30068709508) [HUD commit link](83c594ebd6) ([comment](https://github.com/pytorch/pytorch/pull/135725#issuecomment-2347303272))
2024-09-12 21:47:38 +00:00
e157ce3ebb Validate input types for torch.nn.Linear and torch.nn.Bilinear (#135596)
Adding validation checks to check the input types and display better error messages for the same.
Fixes https://github.com/pytorch/pytorch/issues/135463

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135596
Approved by: https://github.com/malfet
2024-09-12 21:28:37 +00:00
b897ab0540 [export] ignore mark_dynamic() in export (#135536)
Previously we were accommodating `torch._dynamo.mark_dynamic()` for export's dynamic shapes. Here we clean things up and ignore it, requiring users to specify an export input for `dynamic_shapes`.
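
With this change, dynamism has to be declared through export's own `dynamic_shapes` argument rather than `mark_dynamic()`; a small sketch (module name assumed):

```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

batch = Dim("batch")                      # symbolic batch dimension
ep = export(M(), (torch.randn(4, 3),),
            dynamic_shapes={"x": {0: batch}})   # instead of torch._dynamo.mark_dynamic(x, 0)
```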

Note: there are 4 decorators relevant to export: `mark_dynamic`, `maybe_mark_dynamic`, `mark_static`, and `mark_unbacked`. User calls that involve export have only used `mark_dynamic()`, and we use `maybe_mark_dynamic` under the hood for `Dim.AUTO`, but we could start using the others. One reason I decided not to warn and just silently ignore is that these decorators cause the tensors to carry dynamic info, and it'll be hard to tell whether the markers are from export or user calls when re-exporting with the same inputs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135536
Approved by: https://github.com/avikchaudhuri
2024-09-12 21:22:19 +00:00
3d24313809 Pass ideep:lowp_kind to matmul_forward::compute on cache misses (#135058)
Optimized dynamic quantization for aarch64 was enabled by #126687 and #134897

This PR fixes an issue for aarch64 where, on a [cache miss](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp#L592) (e.g. if input dimensions change), [ideep::matmul_forward::compute](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L160) (wrongly) runs with the [default lowp_kind (u8s8)](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L174), which is not supported by oneDNN+ACL (Arm Compute Library), causing the workload to fall back to a much slower oneDNN gemm:jit kernel.

Example:
```python
import torch

DIM = 4096
INPUT_SIZE1 = 32
INPUT_SIZE2 = 16

class LinearNet(torch.nn.Module):
   def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(DIM, DIM, bias=False)

   def forward(self, x):
        x = self.fc1(x)
        return x

input1 = torch.randn(size=(INPUT_SIZE1, DIM))
input2 = torch.randn(size=(INPUT_SIZE2, DIM))

with torch.no_grad():
    model = LinearNet()
    model =  torch.ao.quantization.quantize_dynamic(model,{torch.nn.Linear})

    model(input1)   # this goes to ACL lowp_gemm
    print("="*50)
    model(input2)   # this goes to gemm:jit without this PR, and to ACL with this PR
```
In the code snippet above:
- The matmul from `model(input1)` goes to oneDNN+ACL (in both cases, with and without the PR)
- The matmul from `model(input2)`: **Without this PR**: there's a cache miss (different input shapes) and matmul_forward::compute is run with the default lowp_kind (u8s8). Hence the matmul falls back to gemm:jit in oneDNN. However, **With this PR** the matmul goes to oneDNN+ACL which is around 10x faster than oneDNN+jit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135058
Approved by: https://github.com/jondea, https://github.com/malfet
2024-09-12 20:30:20 +00:00
cd472bb1e3 [torch][fx] Add new replacement_callback to materialize a replacement just in time (#135553)
Summary:
Sometimes we only want to generate a replacement for a matched pattern
once we know some information about the nodes in the pattern.

So far, we have found this the most useful to do matches based on specific
shapes of tensors flowing into functions.
Use a callback function similar to `match_filters`. By default this isn't used.

Had to make `replacement` a None-able parameter because Callable was
already used to detect a case where a graph needed to be traced.

Differential Revision: D62412628

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135553
Approved by: https://github.com/SherlockNoMad
2024-09-12 18:52:14 +00:00
f032135bbf Add batching rule for torch.scatter_reduce (#135547)
Fixes #134797

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135547
Approved by: https://github.com/zou3519
2024-09-12 18:51:21 +00:00
525bec804c NJT <-> padded dense conversions (#125947)
This PR:
* Implements the pre-existing `nt.to_padded_tensor(padding_val)` ATen op via the FBGEMM kernel + appropriate view gymnastics (since that kernel only handles 2D values)
* Introduces a new `_nested_from_padded_tensor` op for the reverse conversion, implemented via the reverse FBGEMM kernel + view gymnastics
    * Note: there is currently no public API for this; design booted to a future PR

TODO:
* ~~Propagate min / max sequence length via the new factory function `_nested_from_padded_tensor`~~
* ~~Verify that Inductor does computation fusion via test logic~~

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125947
Approved by: https://github.com/soulitzer
2024-09-12 17:54:25 +00:00
83c594ebd6 [DSD] Fix distributed state dict full_state_dict option hang during set_state_dict (#135725)
Fix https://github.com/pytorch/pytorch/issues/134095
This fixes the distributed state dict full_state_dict option hanging during set_state_dict. We switch `_distribute_tensors` in _state_dict_utils.py to use `DTensor.from_local` instead of `distribute_tensor` to support the FSDP2+TP 2D strided-sharding use case, as `distribute_tensor` cannot handle strided sharding yet. `distribute_tensor` incurs a scatter behind the scenes, while `DTensor.from_local` takes the local slice from the full tensor on each rank to create the DTensor (no collective). This means it's the user's responsibility to make sure the full_tensor from the full_state_dict is the same across all ranks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135725
Approved by: https://github.com/fegin
2024-09-12 17:43:57 +00:00
c1277945d3 [AOTI][Tooling] Support debug printing for inductor level extern kernel call such as externkernel.addmm, bmm, etc. (#135731)
Summary:
As title.

Effect after merging this diff would look something like this:

```
        print('inductor: before_launch - triton_poi_fused_0 - buf0', buf0)
        triton_poi_fused_0.run(buf0, 6, grid=grid(6), stream=stream0)
        print('inductor: after_launch - triton_poi_fused_0 - buf0', buf0)
        buf1 = empty_strided_cuda((16, 6), (6, 1), torch.float32)
        # Topologically Sorted Source Nodes: [linear], Original ATen: [aten.addmm]
        print('inductor: before_launch - extern_kernels.addmm - buf0', buf0)
        extern_kernels.addmm(buf0, reinterpret_tensor(arg2_1, (16, 16), (16, 1), 0), reinterpret_tensor(L__self___weight, (16, 6), (1, 16), 0), alpha=1, beta=1, out=buf1)
        print('inductor: after_launch - extern_kernels.addmm - buf0', buf0)
```

Context: D62272588 only support major triton kernel jit inductor debug printing codegen

Test Plan: CI & OSS CI

Reviewed By: chenyang78, ColinPeppler

Differential Revision: D62397017

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135731
Approved by: https://github.com/ColinPeppler
2024-09-12 17:31:10 +00:00
dab7d646d5 Use a better decomposition for split_with_sizes (#135728)
This decomposition has less checks and improves the performance
of torch.compile.
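For reference, a small example of the op this decomposition targets (sizes are arbitrary):

```python
import torch

x = torch.arange(10)
a, b, c = x.split_with_sizes([3, 3, 4])  # the aten op being decomposed
# torch.split(x, [3, 3, 4]) is the equivalent documented API
print(a.tolist(), b.tolist(), c.tolist())
```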
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135728
Approved by: https://github.com/ezyang
2024-09-12 16:38:51 +00:00
7647c398ff Allow optional positional arguments for torch.func.functional_call (#134643)
This PR resolves #134408. It adds an additional test, which passes locally.

Do you think we should add a post-check to ensure `args` and `kwargs` are not both `None`? It seems to be possible to have modules without inputs.

This PR does not include any such post-check.
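A hedged sketch of the newly allowed call shape (the module and parameter names are made up for illustration):

```python
import torch
from torch.func import functional_call

class NoInput(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bias = torch.nn.Parameter(torch.zeros(3))

    def forward(self):
        return self.bias + 1.0

m = NoInput()
# With this change, the positional `args` can be omitted for modules without inputs.
out = functional_call(m, {"bias": torch.ones(3)})
print(out)  # tensor([2., 2., 2.])
```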

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134643
Approved by: https://github.com/zou3519
2024-09-12 15:22:06 +00:00
d67cc58181 [ONNX] Fix symbolic values and numpy implementation (#135786)
1. Remove `__eq__` to make `SymbolicTensor` hashable and test for that
2. Update the `__array__` method so that it works for tensor on GPU

Fixes https://github.com/pytorch/pytorch/issues/135700
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135786
Approved by: https://github.com/titaiwangms
2024-09-12 14:24:43 +00:00
dddaadac6c [dynamo] Dont graph break on inner torch.compile (#135819)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135819
Approved by: https://github.com/jansel
2024-09-12 11:39:09 +00:00
02169364e1 [inductor] Split reduction loops when there is no shared reads (#134307)
Fixes #129102

![image](https://github.com/user-attachments/assets/0d00f75b-2bb9-4ce6-a0d9-2daceaff539c)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134307
Approved by: https://github.com/shunting314
2024-09-12 09:45:08 +00:00
c30042fbeb [GPT-fast] Update compilation time target for Llama & Mixtral (#135817)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135817
Approved by: https://github.com/xmfan, https://github.com/huydhn
2024-09-12 07:13:44 +00:00
6700175531 [Inductor] simplify indexing_exprs in LoopBody._init_with_copy (#135574)
This PR uses `var_ranges` information to simplify `indexing_exprs` in `LoopBody._init_with_copy` to reduce occurrences of `FloorDiv` and `ModularIndexing` in the `indexing_exprs`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135574
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-12 06:56:34 +00:00
de8a8653c0 [dtensor][BE] replace compute_local_shape with compute_local_shape_and_global_offset (#135554)
**Summary**
1. This PR removes the public API `compute_local_shape` and replaces its use with the more general API `compute_local_shape_and_global_offset`.
2. To keep `compute_local_shape_and_global_offset` consistent with `compute_local_shape` on empty shards, it now returns local tensor shape `(0,)` for empty shards which is more aligned with DTensor's semantics on non-participating ranks.

**Test**
`pytest test/distributed/_tensor/test_dtensor.py`
`pytest test/distributed/_tensor/test_init.py`
`pytest test/distributed/_tensor/test_tensor_ops.py`

Differential Revision: [D62415591](https://our.internmc.facebook.com/intern/diff/D62415591)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135554
Approved by: https://github.com/tianyu-l, https://github.com/wz337
2024-09-12 06:30:09 +00:00
86335e9135 [reland 3/3][fx] Bypass custom __setattr__ in Node.__init__ (#135735)
Relands #135079 which was reverted by #135562

I broke this up into three parts to test internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135735
Approved by: https://github.com/oulgen
2024-09-12 05:50:39 +00:00
14e3f3c062 [aoti] Remove nlohmann/json.hpp from header (#135765)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135765
Approved by: https://github.com/malfet
2024-09-12 05:38:51 +00:00
9852c6d236 xpu: fix 3rd party builds on systems with cmake<3.25 (#135767)
The CMake LINUX variable is only available starting from CMake 3.25. It is better to use CMAKE_SYSTEM_NAME instead to relax the CMake version requirement.

See: https://cmake.org/cmake/help/v3.25/variable/LINUX.html
Fixes: #135766
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135767
Approved by: https://github.com/malfet, https://github.com/guangyey
2024-09-12 05:31:01 +00:00
6354271178 [inductor] Skip unused call to get_estimated_runtime() (#135776)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135776
Approved by: https://github.com/oulgen
ghstack dependencies: #135445, #135446
2024-09-12 05:22:23 +00:00
12902f6ecf [inductor] Cache get_operation_names/get_buffer_names (#135446)
Before:
![image](https://github.com/user-attachments/assets/db5b6fce-d849-4512-a21d-7a09efc72311)

After:
![image](https://github.com/user-attachments/assets/097e340c-03b2-491e-ad36-132350b37892)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135446
Approved by: https://github.com/oulgen
ghstack dependencies: #135445
2024-09-12 05:22:23 +00:00
3decb676aa [inductor] Optimize cache_on_self (#135445)
This is a small compile time win, but also makes profiles more readable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135445
Approved by: https://github.com/oulgen
2024-09-12 05:22:23 +00:00
8d68a02905 OpenReg: Split the daemon into driver/executor (#135646)
Split the daemon into a proper user-process driver vs device-process executor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135646
Approved by: https://github.com/albanD
2024-09-12 05:03:46 +00:00
28330a8a39 [reland 1/3][fx] Bypass custom __setattr__ in Node.__init__ (#135733)
Relands #135079 which was reverted by #135562

I broke this up into three parts to test internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135733
Approved by: https://github.com/oulgen
2024-09-12 04:29:37 +00:00
eaba287adb [dynamo] Bug fix for _torchdynamo_inline source handling (#135612)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135612
Approved by: https://github.com/drisspg
2024-09-12 04:05:08 +00:00
cyy
f5f1d0a753 Fix build warnings for torch_python (#134981)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134981
Approved by: https://github.com/ezyang
2024-09-12 03:59:34 +00:00
5bc238c73e torch.hub: add get_dir/set_dir type hints (#134906)
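For context, the two now-annotated functions in use (the path below is an arbitrary example, not from the PR):

```python
import torch

torch.hub.set_dir("/tmp/torch_hub_cache")  # path is an arbitrary example
print(torch.hub.get_dir())                 # -> /tmp/torch_hub_cache
```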
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134906
Approved by: https://github.com/Skylion007
2024-09-12 03:53:29 +00:00
79223114db Avoid inserting extra transpose when the input to group norm is NHWC (#135575)
When the input format for group norm is NHWC and the device is privateuseone, it introduces an additional transpose operation. To avoid this issue, a check for the privateuseone device needs to be added here.
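A small illustration of the NHWC (channels_last) input case in question; it runs on CPU here, while the fix itself targets the privateuseone device path:

```python
import torch

gn = torch.nn.GroupNorm(num_groups=2, num_channels=8)
x = torch.randn(1, 8, 16, 16).to(memory_format=torch.channels_last)  # NHWC layout
y = gn(x)
print(y.shape)  # torch.Size([1, 8, 16, 16])
```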

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135575
Approved by: https://github.com/ezyang
2024-09-12 03:36:05 +00:00
cyy
7cfd23636c Fix clang-tidy warnings in Caffe2 code (#134935)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134935
Approved by: https://github.com/ezyang
2024-09-12 03:27:09 +00:00
0d1d69fd25 Update torch-xpu-ops pin (ATen XPU implementation) (#135647)
Release cycle for PyTorch 2.5
1. Fixes a runtime error on Windows: torch_xpu_ops_unary_binary_kernels.dll fails to load because the binary size is too large.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135647
Approved by: https://github.com/EikanWang
2024-09-12 03:16:08 +00:00
21a64d57b1 [BE] typing for decorators - masked/_ops (#135108)
Differential Revision: D62184735

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135108
Approved by: https://github.com/Skylion007
2024-09-12 01:34:09 +00:00
1a74952925 "Remove BLOCK_LIST" (#135729)
Summary:
Skip test_prepare_qat_conv_bn_fusion_getitem_placeholder when we use training IR, since it only targets the bn-getitem pattern, which doesn't exist in training IR.

Remove BLOCK_LIST since it's empty.
Now all internal unittests will use training ir.

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan'  caffe2/test/quantization:test_quantization -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
buck2 run 'fbcode//mode/dev-nosan'  caffe2/test:quantization_pt2e_qat -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
```

Differential Revision: D62387987

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135729
Approved by: https://github.com/tugsbayasgalan
2024-09-12 01:22:06 +00:00
a130ed828a Fix the upload of x86 micro benchmark results (#135780)
The upload-stats workflow currently skips this: https://github.com/pytorch/pytorch/actions/runs/10807251335/job/29977650639. This is a miss from https://github.com/pytorch/pytorch/pull/135042, so the workflow is running but nothing has been uploaded yet.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135780
Approved by: https://github.com/atalman
2024-09-12 01:16:38 +00:00
eb0fe02933 [PT2][inductor][Optimus] Add pad_aten_mm_pass pattern to resolve long computation kernel in LCE (#135167)
Summary:
We observed another long-computation issue for the OBA_AFOC pyper model, so we add a pattern to avoid the perf regression.

- Only happens on A100
- We do not want to use force_shape_pad since it pads all GEMMs, which may not be optimal. The Optimus pass has more flexibility to target specific GEMM shapes and do the corresponding padding
- To enable it, we pass the pattern to the config, where `k_threshold_to_pad` can be customized:

inductor_config.patch(post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad" : 8388608}})

Test Plan:
# unit test

```
buck2 test mode/opt //caffe2/test/inductor:pad_mm
```
Buck UI: https://www.internalfb.com/buck2/58b0f272-f405-45be-bc8d-aec2dc4d5841
Test UI: https://www.internalfb.com/intern/testinfra/testrun/10133099209954651
Network: Up: 9.0KiB  Down: 142B  (reSessionID-8eb71a37-a5ca-4aff-a4f1-93ade3e47e4e)
Jobs completed: 9. Time elapsed: 3:18.0s.
Cache hits: 0%. Commands: 3 (cached: 0, remote: 0, local: 3)
Tests finished: Pass 17. Fail 0. Fatal 0. Skip 0. Build failure 0

# e2e test
see [D62388582](https://www.internalfb.com/diff/D62388582)

Differential Revision: D62220158

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135167
Approved by: https://github.com/jackiexu1992
2024-09-12 00:51:34 +00:00
d270e2d240 [FSDP2] better error msg for cpu offloading (#135156)
When CPU offloading is enabled, if the user loads a GPU state dict, FSDP2 throws a less obvious error during backward:
```
RuntimeError: attempting to assign a gradient with device type 'cpu' to a tensor with device type 'cuda'. Please ensure that the gradient and the tensor are on the same device
```

This PR throws the error more explicitly by specifying which parameters should be moved because of CPU offloading:

```
FSDP parameters should be materialized on cpu when enabling cpu offloading. For example, load cpu state dict or call module.to_empty(device="cpu"). Found following parameters on non-cpu device: ['0.weight']
```

`pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_cpu_offload`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135156
Approved by: https://github.com/awgu
2024-09-12 00:05:07 +00:00
16b37b309f [Inductor] Rename cpp_wrapper_cuda.py as cpp_wrapper_gpu.py (#135313)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135313
Approved by: https://github.com/jansel, https://github.com/desertfire
ghstack dependencies: #135312
2024-09-11 23:59:54 +00:00
13ee85ca5e [Inductor] Generalize cuda cpp wrapper as common triton based GPU cpp wrapper, will be reused by xpu in next PR. (#135312)
[Inductor] Generalize cuda cpp wrapper as common triton based GPU cpp wrapper, will be reused by xpu in next PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135312
Approved by: https://github.com/jansel, https://github.com/desertfire, https://github.com/eellison
2024-09-11 23:59:54 +00:00
94d2471d1f [Traceable FSDP2] Use .copy_ instead of .set_ for unsharded_param inplace update; Replace unsharded_param graph input usage with graph intermediate; Support FSDP2+LoRA (#133730)
Using `fsdp.set_` for the unsharded_param inplace update causes difficult-to-debug errors when enabling Traceable FSDP2 on TorchTune models. In this PR, we change it to use `fsdp.copy_`, which fixes the error and also strictly follows eager semantics (i.e. if the user explicitly stores an alias of the unsharded_param during execution of the user's module code, that alias is updated correctly when the unsharded_param is copy_'d into; whereas if we just swap out the unsharded_param storage via set_, that user-saved alias is not updated, which is not good).

This PR also implements the graph pass to remove the resizes and copy if there is a resize_(full) -> copy_ -> resize_(0) pattern.

------

Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_trace_fsdp_copy_`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_partitioner_cse_respects_mutation_boundaries`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_fsdp_set_input_mutation_applied_when_input_gets_no_gradients`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_mutation_op_matching`
- `python test/inductor/test_distributed_patterns.py DistributedPatternTests.test_fake_distributed_aot_eager`
- `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=1 PYTORCH_TEST_WITH_CROSSREF=1 python test/functorch/test_aotdispatch.py TestEagerFusionOpInfoCPU.test_aot_autograd_exhaustive_norm_cpu_float32`
- `python test/distributed/test_inductor_collectives.py TestCollectivesInductor.test_backwards`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133730
Approved by: https://github.com/bdhirsh
2024-09-11 23:01:05 +00:00
5ca46be15e Fix/torch cat doc attr (#135698)
The `torch.cat` argument name for the input tensors differs between the docs and the method signature, unlike other methods.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135698
Approved by: https://github.com/albanD

Co-authored-by: Alexander Jipa <azzhipa@amazon.com>
2024-09-11 22:32:55 +00:00
9a04cfbeff fix for fp16 (#134106)
This PR is a replacement for https://github.com/pytorch/pytorch/pull/133085 for pushing a quick fix for RMSNorm.
The original author is @kkontny

Previous PR summary:
Since FP16 has quite a small dynamic range, it is very easy to overflow while computing `at::pow(input, 2)`, and this happens in real-world computation.

I've tried to use the `nn.RMSNorm` fused implementation instead of `LlamaRMSNorm` inside the `transformers` implementation of Llama (`src/transformers/models/llama/modeling_llama.py`). It started to give wrong answers in FP16 while still giving good results in FP32. I figured out this happens due to overflow while computing the square of the input tensor.

Original `LLamaRMSNorm` implementation upcasts input to fp32 to prevent this and give better numerical stability.

```
class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```

The proposed commit fixes the issue. FP16 in RMSNorm has to be treated in a special way to be usable in real-world implementations.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134106
Approved by: https://github.com/mikaylagawarecki, https://github.com/eqy
2024-09-11 22:02:07 +00:00
66db61f0d1 [ONNX] Update fake mode usage in onnx docs (#135512)
Update fake mode usage in onnx docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135512
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-11 21:29:04 +00:00
c025f7becc Revert "[Partitioner] Reuse partition to check whether nodes exist (#135317)"
This reverts commit e004d539da3335d97a8134c9081245628f18eb67.

Reverted https://github.com/pytorch/pytorch/pull/135317 on behalf of https://github.com/izaitsevfb due to BC-breaking, breaks executorch and internal meta builds ([comment](https://github.com/pytorch/pytorch/pull/135317#issuecomment-2344730294))
2024-09-11 21:27:53 +00:00
8c4e1148b8 Refactoring byte_order (#135558)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135558
Approved by: https://github.com/mikaylagawarecki
2024-09-11 21:06:43 +00:00
e20ee39558 Expand bitwise ops to unsigned types (#135525)
Fixes https://github.com/pytorch/pytorch/issues/135436
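A hedged sketch of what this enables, assuming the expanded unsigned dtypes (e.g. `torch.uint16`) accept bitwise ops after this PR:

```python
import torch

a = torch.tensor([0b0011, 0b0101], dtype=torch.uint16)
b = torch.tensor([0b0110, 0b1100], dtype=torch.uint16)

print(a & b)  # bitwise AND
print(a | b)  # bitwise OR
print(a ^ b)  # bitwise XOR
```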

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135525
Approved by: https://github.com/ezyang
2024-09-11 20:48:52 +00:00
74fd1bf965 [ROCm] Update to AOTriton 0.7b (#134498)
Notable changes:
1. Enable CudaGraph related tests
2. Fix UT problems
3. EXPERIMENTAL Navi31 support. User should enable Navi31 support with Env Var `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`

Known problems:
1. `test/test_transformers.py` will have massive failures and/or NaN outputs with `--use-pytest`
    + Update: Confirmed that skipping `class TestSDPAPrivateUse1Only` fixes the problem with `--use-pytest`

Note:
AOTriton 0.7b adds support for nested tensors + SDPA but needs more work (and consequently a separate PR) to enable it.

Fixes #133540

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134498
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily, https://github.com/malfet
2024-09-11 20:34:01 +00:00
5d964a5eb7 [Export] Fix SDPA decomposition (#135297)
Summary: Update the SDPA decomposition to match the updated strides from D62009189, which aligns strides with `aten._scaled_dot_product_attention_math.default`, making `t.permute().contiguous().permute()` no longer necessary.

Test Plan: CI

Differential Revision: D62278378

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135297
Approved by: https://github.com/drisspg
2024-09-11 20:21:59 +00:00
118d7e1480 [Inductor] add _dynamo.reset to test_cat_slice_cat_cuda (#135694)
Summary: test_cat_slice_cat_cuda runs inductor multiple times and checks counters["inductor"] in between, and thus we need to reset properly.

Differential Revision: D62500331

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135694
Approved by: https://github.com/masnesral
2024-09-11 20:07:11 +00:00
dd47f6f623 Simplify expr before getting implications in _maybe_evaluate_static (#135499)
Fixes #134268

Previously we weren't simplifying these expressions before calling get_implications, resulting in inconsistent application of FloorDiv/CleanDiv. See #134268  for more details.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135499
Approved by: https://github.com/ezyang
2024-09-11 19:48:29 +00:00
e05ea2b179 Add decomposition for transpose_copy (#130943)
* Extracted from #128416
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130943
Approved by: https://github.com/amjames, https://github.com/eellison
2024-09-11 19:45:22 +00:00
ad75b09d89 Replace capture_pre_autograd_graph with export_for_training in torch tests (#135623)
Summary: as title

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:test_export -- -r test_conv_dynamic
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r matcher
 buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r x86
```

CI

Differential Revision: D62448302

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135623
Approved by: https://github.com/tugsbayasgalan
2024-09-11 19:23:08 +00:00
a2cb9b7331 Flip triton kernel default layout constraint to "needs_fixed_stride_order" (#135581)
This is to match the default layout constraint for custom operators. By
default, Inductor should match the stride order of inputs to a triton
kernel.

Test Plan:
- existing tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135581
Approved by: https://github.com/eellison
ghstack dependencies: #135530
2024-09-11 18:43:18 +00:00
451eaf0ff2 Log full exception trace when error raised in Dynamo (#135697)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135697
Approved by: https://github.com/Skylion007
2024-09-11 18:14:33 +00:00
09519eb195 Support rolling over a percentage of workflows (#134816)
In order to support adding a rollover percentage, this ended up being a complete rewrite of runner_determinator.py.

Details of the new format are in the comments up top.

On the plus side, this now includes some unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134816
Approved by: https://github.com/PaliC, https://github.com/zxiiro
2024-09-11 18:01:26 +00:00
5314ae2660 Don't use exception chaining for BackendCompilerFailed (#135545)
Commandeered from https://github.com/pytorch/pytorch/pull/135496 as I'm now helping @ezyang ship dynamic float arguments in PT2.
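For illustration, the general Python mechanism involved; this is a generic sketch, not the PR's actual code:

```python
# Suppressing implicit exception chaining in plain Python: `raise ... from None`
# drops the "During handling of the above exception..." context from the traceback.
def compile_backend():
    raise ValueError("original backend error")

try:
    compile_backend()
except ValueError:
    raise RuntimeError("BackendCompilerFailed-style summary") from None
```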

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135545
Approved by: https://github.com/ezyang
2024-09-11 17:49:18 +00:00
da587de9cb [ROCm] [BUGFIX] Re-enable rocm-specific tuning parameters v2 (#133852)
Small bug fix - https://github.com/pytorch/pytorch/pull/124592 replaced the torch.version.hip with device_props but made a mistake in porting the original logic.

The original code was:
`if torch.version.hip is not None:`

Which was incorrectly replaced by:
`if self.device_props.type != "hip":`

Another occurrence of https://github.com/pytorch/pytorch/pull/130617

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133852
Approved by: https://github.com/masnesral, https://github.com/malfet
2024-09-11 17:21:40 +00:00
82a4df2d5f [CI] [ROCm] Run rocm workflow on every push to main branch (#135644)
Dial the frequency back up from https://github.com/pytorch/pytorch/pull/131637

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135644
Approved by: https://github.com/huydhn
2024-09-11 17:21:05 +00:00
18a9030952 [CI] Fix update slow tests (#135390)
* Add pytorchbot to list of approvers for file
* Add labels to the auto created PR

The auto-generated PR is currently not merging due to some failing tests on the slow workflow that were supposed to be moved back to normal.

I don't know if this has much value; clearly we've been managing without the update.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135390
Approved by: https://github.com/ZainRizvi
2024-09-11 17:02:17 +00:00
03f23d07b4 Optimize ShapeEnv.replace (#135652)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135652
Approved by: https://github.com/ezyang
ghstack dependencies: #135621, #135622
2024-09-11 16:50:59 +00:00
8c738c9270 Improve performance of sympy_generic_le (#135622)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135622
Approved by: https://github.com/ezyang
ghstack dependencies: #135621
2024-09-11 16:20:03 +00:00
7ddacaf40a Improve performance of canonicalize_bool_expr (#135621)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135621
Approved by: https://github.com/ezyang
2024-09-11 16:20:03 +00:00
183c32fd3b Revert "[Dynamo] Trace torch function modes entered outside of torch.compile (#133137)"
This reverts commit 0d15122092c27fec1143b800bab7c996d126b547.

Reverted https://github.com/pytorch/pytorch/pull/133137 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/133137#issuecomment-2344054339))
2024-09-11 15:57:00 +00:00
3ab12e2596 Revert "[Dynamo] Support thread local setattr (#135443)"
This reverts commit 160c228a4bd60ceffa62b045a6b0a6f9413835c5.

Reverted https://github.com/pytorch/pytorch/pull/135443 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/135443#issuecomment-2344042800))
2024-09-11 15:53:55 +00:00
596e93b506 Revert "[dynamo] Bug fix for _torchdynamo_inline source handling (#135612)"
This reverts commit 5c3d0a2dedbc0e85f3b256ce56ac674078a5fae1.

Reverted https://github.com/pytorch/pytorch/pull/135612 on behalf of https://github.com/clee2000 due to broke inductor/test_cpu_select_algorithm.py::TestSelectAlgorithmCPU::test_linear_input_transpose_bias_True_cpu_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/10805518363/job/29982386304) [HUD commit link](5c3d0a2ded), bad TD ([comment](https://github.com/pytorch/pytorch/pull/135612#issuecomment-2344039370))
2024-09-11 15:51:12 +00:00
f96e8041b1 Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
This reverts commit 444b52ff40cf4afce7bc3fdcf021a88eab3b954c.

Reverted https://github.com/pytorch/pytorch/pull/135444 on behalf of https://github.com/clee2000 due to something in this stack broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/135444#issuecomment-2344036843))
2024-09-11 15:48:27 +00:00
7cf9c81918 Revert "[Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)"
This reverts commit 6a3edfcc1e474e6ebd0c06624000a6d6bf1a0dee.

Reverted https://github.com/pytorch/pytorch/pull/134732 on behalf of https://github.com/clee2000 due to broke functorch/test_control_flow.py::TestControlFlow::test_scan_simple_graph [GH job link](https://github.com/pytorch/pytorch/actions/runs/10804912306/job/29980571390) [HUD commit link](444b52ff40), newly added test yesterday ([comment](https://github.com/pytorch/pytorch/pull/134732#issuecomment-2344016694))
2024-09-11 15:39:21 +00:00
49e0b88aab Fix test_triton_kernel_float64_constant (#135583)
Summary: Landed https://github.com/pytorch/pytorch/pull/135260 too soon and the test in that PR doesn't do exactly what I tested (actually test different dtypes).

Test Plan: `python test/inductor/test_triton_kernels.py -k float64_constant`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135583
Approved by: https://github.com/isuruf, https://github.com/eellison, https://github.com/Skylion007
2024-09-11 15:16:23 +00:00
ee8c5cc1cc For S444023: Back out "deprecate search_autotune_cache (#133628)" (#135186)
Summary: For S444023

Test Plan:
Revert prevented the NaN errors - f639391901
Training job ran for 7767 iterations. NaN errors show up within the first 1k.

Reviewed By: nmacchioni

Differential Revision: D62224747

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135186
Approved by: https://github.com/kit1980
2024-09-11 14:08:40 +00:00
ce4d146f56 ATen | Fix MPSCNNNeuron creation on Mac Catalyst. (#135595)
Summary:
These are still utilized when using relu/sigmoid/tanh tensors directly from here: https://fburl.com/code/k6n7ofzd
However, on Mac Catalyst we were always returning `nil`, which in most cases rendered the entire graph useless and left stray `MPSTemporaryImage` references that were never written into.

This fixes the issue completely by making sure that we always return the valid kernels back, so they can be executed.

Test Plan: Test with segmentation net that uses a combination of relu and other tensors together - run this via Mac Catalyst build - it works! {F1858576745}

Reviewed By: MichaelTay

Differential Revision: D62430010

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135595
Approved by: https://github.com/MichaelTay
2024-09-11 11:12:23 +00:00
0226fcaacf Disable cuda specific restrictions in _scaled_mm for other devices (#135579)
Fixes #135576

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135579
Approved by: https://github.com/drisspg
2024-09-11 11:05:38 +00:00
4cde5096c4 [Inductor][FlexAttention] Supports dynamic shapes with block mask (#135629)
Fixes #134560 and #135206

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135629
Approved by: https://github.com/drisspg
2024-09-11 08:10:50 +00:00
443c015393 [Distributed] Improve efficiency of NaN checker (#135414)
Some customers would like to run the NaN checks on the fly, so we are improving its efficiency.

## Benchmarking
Allreduce 2G floats. `TORCH_NCCL_NAN_CHECK=1`
Red kernel: ncclAllreduce
Blue kernel: Nan check

<img width="1093" alt="Screenshot 2024-09-06 at 10 00 05 PM" src="https://github.com/user-attachments/assets/5501bc31-024f-4115-adb2-dd66eb4025d3">

## Comparison with torch ops:
Let's say a user manually check for NaNs with the following torch ops before all-reduce:
```
torch.any(torch.isnan(x))
```
<img width="1091" alt="Screenshot 2024-09-06 at 10 14 53 PM" src="https://github.com/user-attachments/assets/1f8b5f63-c955-4612-bb96-241b6c69959b">

So our perf is on-par with torch ops.

## Changes
- Load from vidmem using "big packs" of 16 bytes
- Bump `blockDim.x` from 256 to 512
- Separate loads and checks into two loops, each of 8 iterations
- Unroll the loops
- Templated functions for checking NaN in a "big pack" based on dtype

Special thanks to @jbachan from NCCL!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135414
Approved by: https://github.com/wconstab
2024-09-11 07:53:42 +00:00
4ae6d7c18f Back out "[pytorch][PR] [export] fix re-export custom metadata" (#135634)
Summary: Broke some tests. Revert this diff

Test Plan: CI

Differential Revision: D62474337

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135634
Approved by: https://github.com/tugsbayasgalan
2024-09-11 06:16:26 +00:00
3084b7b5c0 [cuDNN][SDPA] Support attn_bias in cuDNN (#130482)
CC @drisspg
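A hedged usage sketch (forcing the backend via `sdpa_kernel` is an assumption for illustration; shapes and dtype are arbitrary):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
bias = torch.randn(2, 8, 128, 128, device="cuda", dtype=torch.float16)

# With this PR, an explicit attn_mask/bias no longer forces a fallback away
# from the cuDNN SDPA backend.
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)
```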

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130482
Approved by: https://github.com/drisspg, https://github.com/Skylion007, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-11 05:59:25 +00:00
5c3d0a2ded [dynamo] Bug fix for _torchdynamo_inline source handling (#135612)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135612
Approved by: https://github.com/drisspg
ghstack dependencies: #135588
2024-09-11 05:23:42 +00:00
c608b17f60 [PTD][BE][c10d] Add some code documents for TCPStore code and cosmetic changes to libUVStore code (#130496)
While designing something else that needs TCPStore, I spent some time digging into the TCPStore codebase and found that the code is a little bit challenging to understand without proper documentation. Although people from the OSS community must be smarter than me, I still want to document my findings in the code so that devs and users can use them as a reference down the road.

Also, for libuv, we need to prefix private variables with a "_", so this is a pure renaming of private variables such as `tcpServer`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130496
Approved by: https://github.com/wconstab
2024-09-11 04:42:25 +00:00
444b52ff40 [Dynamo] Simplify torch function mode stack guard (#135444)
The semantics of ignored modes previously had edge cases; this eliminates them by, in essence, filtering any ignored modes out of both the ref stack and the current torch function mode stack. This is purely to fix complexity in #135422. The ignored-modes handling will be removed in a future PR after https://github.com/pytorch/pytorch/pull/135422 lands, since we will then trace through DeviceContexts instead of inserting them into the graph, which needed these extra workarounds for correctness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135444
Approved by: https://github.com/anijain2305, https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443
2024-09-11 04:18:22 +00:00
160c228a4b [Dynamo] Support thread local setattr (#135443)
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the `setattr` of thread-local objects. These objects have a slots-based impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations, because calling `object.__setattr__` on these objects results in a type error.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
2024-09-11 04:18:22 +00:00
0d15122092 [Dynamo] Trace torch function modes entered outside of torch.compile (#133137)
This PR adds initial tracing for torch function modes.

Details:
In essence, this adds tracing into the `__torch_function__` of modes entered outside of the torch.compile call.
This does not yet support tracing enter/exit of a torch function mode or tracing set_default_device properly using the new mode infra (this will be a very good stress test for modes). I am adding more PRs to this stack to support these. The overall plan is to support tracing enter/exit and handling graph breaks like we do for other torch.* context managers.
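A minimal sketch of the scenario now supported (the mode and function below are made-up examples):

```python
import torch
from torch.overrides import TorchFunctionMode

class PassthroughMode(TorchFunctionMode):
    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        return func(*args, **kwargs)

@torch.compile
def f(x):
    return torch.sin(x) + 1

# The mode is entered *outside* torch.compile; with this PR, Dynamo traces
# into its __torch_function__ instead of ignoring or graph-breaking on it.
with PassthroughMode():
    f(torch.randn(4))
```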

Previously landed:
https://github.com/pytorch/pytorch/pull/133135
https://github.com/pytorch/pytorch/pull/133136
https://github.com/pytorch/pytorch/pull/133134
https://github.com/pytorch/pytorch/pull/133133
https://github.com/pytorch/pytorch/pull/133132
https://github.com/pytorch/pytorch/pull/133131
https://github.com/pytorch/pytorch/pull/133729
https://github.com/pytorch/pytorch/pull/133130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133137
Approved by: https://github.com/jansel, https://github.com/zou3519
ghstack dependencies: #134732
2024-09-11 04:18:22 +00:00
6a3edfcc1e [Dynamo] Use custom backend to reenter metadata tf mode when tracing while/cond (#134732)
For tracing cond/while in eager, we trace the HOP with the eager backend with the metadata torch function mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it cannot be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
2024-09-11 04:18:22 +00:00
356f14e7b7 Fix the output of FileCheck when not run and add unit tests (#135345)
When FileCheck is destructed without execution, it should output all rules.
For example:
```
>>> fc = FileCheck().check("test")
>>> del fc
You have not run this instance of FileCheck!
FileCheck checks:
        CHECK: test
```

Additionally, unit tests for the Python interface of FileCheck will be added.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135345
Approved by: https://github.com/eellison
2024-09-11 04:13:24 +00:00
34dc8f69a1 Adding entry-point based support for out-of-tree rendezvous plugins (#132633)
Fixes #127519

Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their out-of-tree rendezvous backend implementations as Python packages.

#### AUTHORING NEW PLUGIN
Any new plugin is a Python package exposing entry points. For example, the structure of the redis plugin is as follows:

```
plugin_root
|_ pyproject.toml
|_ src
   |_ redis
      |_ __init__.py
      |_ redis_store.py
      |_ redis_backend.py
```

The contents of the `pyproject.toml` should indicate that this package exposes a torchrun entry point by using the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:

```
[project]
name = "redis"
version = "0.0.1"

[project.entry-points.'torchrun.plugins']
redis = 'redis'
```

The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:

```
# Imports assumed from torchrun's elastic rendezvous APIs.
from torch.distributed.elastic.rendezvous import RendezvousParameters
from torch.distributed.elastic.rendezvous.dynamic_rendezvous import create_handler

def getPluginHandler():
    def _create_redis_handler(params: RendezvousParameters):
        from .redis_backend import create_backend  # redis_backend.py in the layout above
        backend, store = create_backend(params)
        return create_handler(store, backend, params)
    return _create_redis_handler
```

The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.

#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin package. For example, a published package can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.

Once installed, the new backend can be used in torchrun as follows:

```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/fduwjj
2024-09-11 03:35:02 +00:00
cd9ee49a69 [aoti] Add cpp loader (#135374)
* Added a cpp loader, AOTIModelPackageLoader, which can load the .pt2, build the .so, and create a runner. The python-facing API is that users can directly call the `run` function, whereas in cpp users can directly access the `runner_` if they are more familiar with that. I couldn't figure out how to bind the `get_runner()` function to python...
* Added a new config, `aot_inductor.package_cpp_only` which will **not** package the so. This means that whenever the package is loaded, we will need to build the so. This is turned off by default so that new environments do not need to rebuild their so. The `package_cpp_only` is a feature which torchchat intends to use to provide flexibility to users.
* Added a new config, `aot_inductor.metadata` which stores user-provided metadata, serialized to the pt2 as a json file. It also stores the device used when exporting, "cuda" or "cpu", so that during load time, we can use that data to determine which AOTIModelContainerRunner to use. The metadata can be accessed through `loader.get_metadata()`. TODO is to move this metadata to the toplevel `package_aoti` function so that we can remove the metadata as a config.
* Separated out `package_aoti` as a standalone function, instead of it automatically being called in inductor. This is to prepare for the case where users will compile multiple models, and want to bundle it in one package. The specific use case is in torchchat, where we want to package the separately-exported encoder and decoder layers. An example of how to use this is in `test_multiple_methods`.
* `load_package` will load a singular model, given the model name.
* The loader doesn't support Windows for now; I think I need to add some more special-casing to make the build commands work on Windows.

Differential Revision: [D62329906](https://our.internmc.facebook.com/intern/diff/D62329906)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135374
Approved by: https://github.com/desertfire, https://github.com/malfet
2024-09-11 03:00:01 +00:00
26e5572dd2 Bump triton xpu pin and release version (#135638)
Similar with https://github.com/pytorch/pytorch/pull/135627

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135638
Approved by: https://github.com/atalman
2024-09-11 00:56:15 +00:00
693897df42 [dynamo] Missing guard source keys for corner case of NNModuleVariabl… (#135041)
Potentially fixes - https://fb.workplace.com/groups/1286739428954016/permalink/1319662695661689/

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135041
Approved by: https://github.com/ezyang
2024-09-11 00:43:26 +00:00
3bf6be457d [MPS] Add missing dispatch to rshift.Tensor (#135607)
Missed it while working on https://github.com/pytorch/pytorch/pull/131813
Test plan: `python -c "import torch;print(torch.randint(100, 500, (64,), device='mps') >> torch.tensor([3,], device='mps'))"`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135607
Approved by: https://github.com/manuelcandales
2024-09-11 00:20:53 +00:00
492f064f15 [ONNX] Add assertion nodes to ignoring list (#135591)
Fixes #135419

PS: there are 104 empty output nodes, I suggest we add them one by one when we run into them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135591
Approved by: https://github.com/justinchuby
2024-09-11 00:18:17 +00:00
29408ea81a Add option to tweak inductor stride settings for user-defined triton kernels (#135530)
Previously, Inductor was allowed to modify the stride/storage_offset
(layout) for inputs to user-defined triton kernels. This can cause
silent incorrectness because most triton kernels are written for a
specific striding pattern (usually contiguous).

This PR adds a config to allow the user to choose Inductor's behavior on
this. The options are:
- "flexible_layout" (default): Inductor can modify the layout for inputs
  to user-defined triton kernels as much as it wants.
- "needs_fixed_stride_order": Inductor must preserve the stride order
  (when compared to tracing) for inputs to user-defined triton kernels.

This matches our handling for custom operators. In the future, we'll
want a "needs_exact_strides" option (this is the safest option).

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135530
Approved by: https://github.com/FindHao, https://github.com/oulgen
2024-09-11 00:11:17 +00:00
02dcb07765 Add boolean support in pack segments ops for both cpu and cuda impls (#132897) (#135620)
Summary:

Same as int types, forward only.

bypass-github-export-checks diff has been synced to github

Test Plan:
buck test mode/dev-nosan //caffe2/torch/fb/sparsenn:test -- test_pack_segments
https://www.internalfb.com/intern/testinfra/testconsole/testrun/16888498646804437/

Reviewed By: garroud

Differential Revision: D60785563

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135620
Approved by: https://github.com/kit1980

Co-authored-by: Haoming Lu <haominglu@meta.com>
2024-09-11 00:03:17 +00:00
5c38aa72c0 [dynamo][dicts][nv-embed] Support update with kwargs (#135588)
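A hedged example of the newly supported pattern (the function is made up for illustration):

```python
import torch

@torch.compile(fullgraph=True)
def f(x):
    d = {"a": x}
    d.update(b=x + 1)  # dict.update via keyword arguments
    return d["a"] * d["b"]

print(f(torch.ones(2)))
```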
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135588
Approved by: https://github.com/yanboliang
2024-09-10 23:50:23 +00:00
5134ba7458 Bump triton pin and release version (#135627)
Update the pin and release version to sync with https://github.com/triton-lang/triton/tree/release/3.1.x

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135627
Approved by: https://github.com/Chillee, https://github.com/drisspg, https://github.com/malfet
2024-09-10 23:46:36 +00:00
e48ee2cf50 [ONNX] Fix scaled_dot_product_attention with float scale (#135594)
Fixes #125158

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135594
Approved by: https://github.com/justinchuby
2024-09-10 23:04:02 +00:00
eb38ee21ba [ROCm] slow torch.sum optimization by increasing max_values_per_thread in reduce config (#135397)
Fixes #132964

This change optimizes torch.sum() performance by increasing max_values_per_thread in setReduceConfig() for the ROCm platform.
Increasing this parameter uses fewer thread blocks and improves performance.

Test:
Tested on MI300x and H100, and now the MI300x perf improved to 3205GByte/s from ~1690GByte/s for the test case and is slightly better than H100 (3136GByte/s).

Also tested with other tensor sizes and saw perf improvements as well.

```python
import torch
from triton.testing import do_bench

x = torch.randn(2**30, device='cuda')

ms = do_bench(lambda: x.sum(dim=-1))

bandwidth_gbyte = x.numel() * x.dtype.itemsize / (10**9)

time_s = ms / 1000

bw_per_second = bandwidth_gbyte / time_s

print(bw_per_second)
```

Co-author: @carlobertolli

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135397
Approved by: https://github.com/eqy, https://github.com/malfet
2024-09-10 21:03:01 +00:00
8057b72763 [ez][inductor] don't benchmark cloning if there are no mutated args (#135533)
When a kernel does not have mutated args (this is quite common?), benchmarking the cost of cloning actually benchmarks a no-op. This still takes >100ms since triton.testing.do_bench will allocate 100 ms budget to run the kernel.
Skipping this benchmarking can save quite some compilation time if the code path is hit multiple times. Let's say, if the code path is hit 100 times when the graph is large, we would save >10s.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135533
Approved by: https://github.com/jansel
ghstack dependencies: #135531
2024-09-10 20:54:31 +00:00
7b17918dc9 [inductor] fix a device sync issue for benchmarking fusion (#135531)
Fix https://github.com/pytorch/pytorch/issues/134768 .

When we benchmark the latency for a fused node set, we do benchmarking twice:
1. benchmark the latency of the kernel including cloning mutated args
2. benchmark the latency of cloning mutated args without running the kernel

We subtract result 2 from result 1 to get the latency of the kernel itself.

But when the tensors are not on cuda device 0, we get equal numbers for result 1 and result 2 no matter how much work the kernel does. The root cause is that in `triton.testing.do_bench` the `torch.cuda.synchronize` call syncs the current cuda device (which is device 0 if not overridden). But since the tensors and kernels are located on another device, the sync actually does nothing (unless there happen to be other kernels on device 0).

The fix is to set the correct current device in our benchmarking code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135531
Approved by: https://github.com/jansel
2024-09-10 20:54:31 +00:00
66c45f3ed9 [export] fix re-export custom metadata (#135282)
Fixes #134778

When a model is exported and debug handles are added to the "custom" field of non-placeholder and non-output nodes in the graph, re-exporting it will change the metadata of placeholder nodes (the "custom" field will be added or copied to these nodes, depending on whether `ExportedProgram` or `ExportedProgram.module()` is passed to `generate_numeric_debug_handle()`).

This occurs because when we re-export the model, `placeholder` nodes are unlifted to `get_attr` nodes. These nodes remain as `get_attr` after being exported to `gm_torch_level`.  Their metadata are modified [here](https://github.com/pytorch/pytorch/blob/main/torch/export/_trace.py#L1347) based on `params_buffers_to_node_meta` which is collected [here](https://github.com/pytorch/pytorch/blob/main/torch/export/_trace.py#L1312).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135282
Approved by: https://github.com/jerryzh168, https://github.com/zhxchen17, https://github.com/tugsbayasgalan
2024-09-10 20:15:02 +00:00
0a9d55d2ee Revert "[AOTI] Fix assert_function call in cpu autotune template (#135086)"
This reverts commit 16c3b8f87cfa9cb5acee8104820baa389e7ee2bd.

Reverted https://github.com/pytorch/pytorch/pull/135086 on behalf of https://github.com/izaitsevfb due to breaks internal tests, see D62405818 ([comment](https://github.com/pytorch/pytorch/pull/135086#issuecomment-2341889428))
2024-09-10 19:51:16 +00:00
4ca65d3323 [CI] Increase sharding for jobs that are timing out (#135582)
Increase sharding for
* slow grad check
* slow cuda tests slow / linux-focal-cuda12.1-py3.10-gcc9-sm86 / test
* avx

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135582
Approved by: https://github.com/huydhn, https://github.com/malfet
2024-09-10 19:45:13 +00:00
c932b39739 [FSDP2] Added _set_unshard_async_op (#135523)
This PR adds a private API `_set_unshard_async_op` that allows for running pre-forward and pre-backward all-gathers using the `async_op=True` path so that all-gather allocations happen in the default stream to avoid inter-stream fragmentation.

If using this option, forward requires explicit prefetching e.g. via the `unshard(async_op=True)` API for overlap. fp32 -> bf16 casts and the all-gather copy-in will not overlap with compute.

Differential Revision: [D62401551](https://our.internmc.facebook.com/intern/diff/D62401551)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135523
Approved by: https://github.com/weifengpy
2024-09-10 19:28:02 +00:00
1f15973657 [AOTI][Tooling][7/n] Add debug printing support for JIT inductor codegen path as well (#135285)
Summary:
1.  Add the debug printer call to a level lower for triton kernel python wrapper codegen path
2. Add `torch.save()` for jit inductor as well
3. This also fixes the issue introduced in D61949020 (at python wrapper code level for triton kernel not printing)

Test Plan:
```
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1  TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+graph, inductor, +schedule, output_code" buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100 @//mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_addmm_abi_compatible_cuda
```

Differential Revision: D62272588

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135285
Approved by: https://github.com/chenyang78
2024-09-10 19:24:58 +00:00
fc88ba260f [amdsmi][torch] Update amdsmi API usages (#135504)
Summary: In ROCm 6.2.0 there were API name changes-- we check if the new APIs exist and use them in this diff; see 7b2463abe0 for the changes

Test Plan: CI

Differential Revision: D62325661

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135504
Approved by: https://github.com/eqy, https://github.com/houseroad
2024-09-10 19:15:39 +00:00
bf8d0e3107 [inductor] Enable subprocess parallel compile internally with killswitch (#132467)
Differential Revision: [D60629630](https://our.internmc.facebook.com/intern/diff/D60629630)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132467
Approved by: https://github.com/eellison
2024-09-10 19:05:46 +00:00
3a1239a248 [Profiler] Harden Record Function Kwargs (#135365)
Summary:
In S445839, we had HTA break because of the "stream" parameter that was added to gpu traces. This brought up discussions regarding hardening our post-processing of said inputs so as not to break the JSON schema or downstream tools. For this reason, this diff does the following.

1. Only allow int, double, bool and string values to be processed as kwinputs for JSON output. We can handle lists if needed in the future.
2. Make sure that any boolean is lowercased when serialized as a string so that the JSON does not break when parsing it
3. Force stream parameter to be an int

Test Plan: Added unit tests to ensure that the list of requirements above is true for kwargs only.

Differential Revision: D62304843

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135365
Approved by: https://github.com/aaronenyeshi
2024-09-10 18:44:05 +00:00
4f9f1775d8 Fix flaky TestCudaWrapper.test_randint_cuda_cuda_wrapper (#135370)
Summary: This test is flaky when run after `test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper` because the TestCase sets config options globally in its setUp() that stick around for subsequent tests. For test isolation, we use a contextlib.ExitStack pattern in other tests to patch the config options and restore them in tearDown(). Update all TestCases in `test/inductor/test_combo_kernels.py` to use that pattern.

Test Plan:
```
python test/inductor/test_combo_kernels.py
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper TestCudaWrapper.test_randint_cuda_cuda_wrapper
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135370
Approved by: https://github.com/jansel
2024-09-10 18:43:14 +00:00
5e0788befb Migrate remaining jobs to use runner determinator (#134867)
At this point all self-hosted runner jobs should be using the runner determinator to switch between LF and Meta runners. This change updates the remaining jobs that have not yet been migrated over.

Issue: https://lf-pytorch.atlassian.net/browse/PC-25

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134867
Approved by: https://github.com/ZainRizvi
2024-09-10 18:14:00 +00:00
440f8f57af Revert "[fx] Bypass custom __setattr__ in Node.__init__ (#135079)" (#135562)
This reverts commit 66da3b3b2acacb116a9b23e91b24934830eaf6b8.

#135079 breaks internal tests and needs to be reverted. Revert with mergebot doesn't work as this PR is technically part of the stack, but, according to @jansel, it should be possible to revert it individually.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135562
Approved by: https://github.com/jansel, https://github.com/seemethere
2024-09-10 18:07:11 +00:00
e004d539da [Partitioner] Reuse partition to check whether nodes exist (#135317)
The time complexity of checking whether a node is in a NodeList is O(n). Reuse the partition to speed this up, since partition.nodes is a hash table with the same elements.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135317
Approved by: https://github.com/ezyang
2024-09-10 17:45:29 +00:00
c4b84a46a9 Add more logging to TunableOp validators (#135396)
Summary: Add more logging to TunableOp validators

Test Plan:
Verified additional logging when loading kernel selections:
```
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
PT_VERSION validation: expect 2.5.0 to match 2.5.0
```

```
[qizixi@devgpu039.atn3 /data/users/qizixi/fbsource/fbcode (f9305317d|remote/master)]$ PYTORCH_TUNABLEOP_VERBOSE=1 buck2 run mode/{opt,amd-gpu} -c fbcode.e
nable_gpu_sections=true //scripts/xdwang/example:fc_llama -- --enable-tuning
File changed: fbcode//hipblas_tuning_pt_llama0.csv
Buck UI: https://www.internalfb.com/buck2/1ed2fac4-743e-49ef-805f-7fb6b9300022
Network: Up: 0B  Down: 0B
Jobs completed: 4189. Time elapsed: 0.2s.
BUILD SUCCEEDED
Enabled tuning
- Run Linear (matmul) 2 x 1280 x 8192, dtype = torch.bfloat16
INFO:2024-09-06 14:38:07 2834864:2835138 CuptiActivityProfiler.cpp:260] HIP versions. Roctracer: 4.1; Runtime: 60032830; Driver: 60032830
INFO:2024-09-06 14:38:07 2834864:2836083 DynoConfigLoader.cpp:61] Setting communication fabric enabled = 0
reading tuning results from hipblas_tuning_pt_llama0.csv
Validator PT_VERSION=2.5.0
Validator ROCM_VERSION=6.0.0.0-12969-1544e39
Validator HIPBLASLT_VERSION=800-a15e4178
Validator GCN_ARCH_NAME=gfx942:sramecc+:xnack-
Validator ROCBLAS_VERSION=4.0.0-72e57364-dirty
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
PT_VERSION validation: expect 2.5.0 to match 2.5.0
Loading results
Avg time: 13.165860176086426 us, Achieved 3.19 TFLOPS, 1598.24 GB/s

- Run Linear (matmul) 2 x 8192 x 1024, dtype = torch.bfloat16
Avg time: 13.230760097503662 us, Achieved 2.54 TFLOPS, 1271.14 GB/s

- Run Linear (matmul) 2 x 7168 x 8192, dtype = torch.bfloat16
Avg time: 26.804399490356445 us, Achieved 8.76 TFLOPS, 4384.90 GB/s

- Run Linear (matmul) 2 x 8192 x 3584, dtype = torch.bfloat16
Avg time: 13.407809734344482 us, Achieved 8.76 TFLOPS, 4384.14 GB/s

2x1280x8192-torch.bfloat16,13.165860176086426,3.18574247630113,1598.237845349412
2x8192x1024-torch.bfloat16,13.230760097503662,2.536092541374924,1271.1420867780075
2x7168x8192-torch.bfloat16,26.804399490356445,8.762778814892096,4384.9040543618985
2x8192x3584-torch.bfloat16,13.407809734344482,8.759112362638383,4384.138585247748
```
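
For context, a hedged example (not part of this PR) of how the validator/loading logs above are typically produced: TunableOp is driven by environment variables, set here before importing torch so they are in place before the first GEMM runs. The CSV filename is only an assumption matching the log above.

```
import os

# Assumed usage of the TunableOp environment variables.
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"    # enable TunableOp
os.environ["PYTORCH_TUNABLEOP_VERBOSE"] = "1"    # print validator/loading logs
os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "hipblas_tuning_pt_llama%d.csv"

import torch

x = torch.randn(2, 8192, device="cuda", dtype=torch.bfloat16)
w = torch.randn(1280, 8192, device="cuda", dtype=torch.bfloat16)
y = torch.nn.functional.linear(x, w)  # GEMM dispatched through TunableOp
```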

Reviewed By: leitian

Differential Revision: D62322830

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135396
Approved by: https://github.com/eqy
2024-09-10 17:20:59 +00:00
bc1b8f094d Check function declarations of Core ML code (#135467)
Relax the restrictions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135467
Approved by: https://github.com/ezyang
2024-09-10 16:05:22 +00:00
f65a564fa2 [inductor] Flip custom_op_default_layout_constraint (#135239)
By default, Inductor should respect the stride order of input Tensors to
custom operators.
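
A hedged sketch of the knob this PR flips; the value strings are assumptions inferred from the PR description rather than verified constants.

```
import torch._inductor.config as inductor_config

# After this change the default is assumed to respect the input stride order
# of custom ops ("needs_fixed_stride_order"). The old behaviour can still be
# requested for a compilation via a scoped patch (assumed value name):
print(inductor_config.custom_op_default_layout_constraint)

with inductor_config.patch(custom_op_default_layout_constraint="flexible_layout"):
    pass  # compile and run a model containing custom ops here
```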

Test Plan:
- new tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135239
Approved by: https://github.com/albanD
ghstack dependencies: #135391
2024-09-10 14:27:43 +00:00
386b313028 Handle KeyError for compiler collective in scalars too (#135385)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135385
Approved by: https://github.com/jansel
2024-09-10 12:33:04 +00:00
6d7cbc20d2 Add dynamo itertools.pairwise support (#135416)
Fixes #133766
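
A hedged illustration (not taken from the PR) of what the new support enables: `itertools.pairwise` used inside a compiled function, assuming it now traces without a graph break.

```
import itertools

import torch


@torch.compile(fullgraph=True)
def sum_adjacent_products(xs):
    # Pair each tensor with its successor and accumulate the dot products.
    out = torch.zeros(())
    for a, b in itertools.pairwise(xs):
        out = out + (a * b).sum()
    return out


print(sum_adjacent_products([torch.ones(3), 2 * torch.ones(3), 3 * torch.ones(3)]))
```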

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135416
Approved by: https://github.com/XuehaiPan, https://github.com/jansel

Co-authored-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
2024-09-10 11:37:59 +00:00
ca16956b20 [Inductor] Generalize device guard codegen for cpp_wrapper mode. (#134761)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134761
Approved by: https://github.com/jansel, https://github.com/EikanWang
ghstack dependencies: #134693
2024-09-10 10:11:52 +00:00
67735d1ee8 [Inductor] Generalize is_cuda to specific device_type to make cpp_wrapper mode be extensible (#134693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134693
Approved by: https://github.com/ezyang, https://github.com/EikanWang, https://github.com/jansel
2024-09-10 10:11:13 +00:00
6e13f5eb38 [FlexAttention] Add broadcast support for kv batch dimension (#135505)
This PR adds broadcast support for the KV batch dimension.

## Details
Consider Q of shape `[Bq, Hq, Q_LEN, D]`, and K, V of shape `[Bkv, Hkv, KV_LEN, D]`. Prior to this PR, we required `Bq == Bkv`. However, some use cases have `Bkv < Bq`. For example, in paged attention, we provide K, V of shape `[1, Hkv, MAX_LEN, D]` while still providing Q of shape `[Bq, Hq, Q_LEN, D]`, where MAX_LEN is the maximal number of tokens supported by paged attention.

This PR relaxes the requirement to `Bq == Bkv or (Bq > 1 and Bkv == 1)`. The support covers flex decoding as well as the flex attention forward and backward passes.
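
A hedged sketch of the broadcast case described above (shapes and device are illustrative only): K and V carry a batch of 1 that is broadcast against a larger query batch.

```
import torch
from torch.nn.attention.flex_attention import flex_attention

Bq, Hq, Q_LEN, D = 8, 16, 512, 64
Bkv, Hkv, KV_LEN = 1, 16, 512   # Bkv == 1 broadcasts against Bq

q = torch.randn(Bq, Hq, Q_LEN, D, device="cuda", dtype=torch.bfloat16)
k = torch.randn(Bkv, Hkv, KV_LEN, D, device="cuda", dtype=torch.bfloat16)
v = torch.randn(Bkv, Hkv, KV_LEN, D, device="cuda", dtype=torch.bfloat16)

out = torch.compile(flex_attention)(q, k, v)
print(out.shape)  # expected: torch.Size([8, 16, 512, 64])
```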

## Benchmark
GPU: H100

We see negligible (1%~2%) performance change from this PR when `Bq == Bkv`.

```
python benchmarks/transformer/score_mod.py --calculate-bwd
```
### Perf before this PR

**FWD**

| Type    |   Speedup | score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)        |
|---------|-----------|---------------|------------|----------------|------------------------------|
| Average |     0.743 |               |            |                |                              |
| Max     |     0.955 | head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)   |
| Min     |     0.548 | relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128) |

**BWD**

| Type    |   Speedup | score_mod   | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)       |
|---------|-----------|-------------|------------|----------------|-----------------------------|
| Average |     0.834 |             |            |                |                             |
| Max     |     1.261 | head_bias   | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)   |
| Min     |     0.456 | None        | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128) |

<details>
<summary> Full performance sweep </summary>

| score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)         |   fwd_eager_time |   fwd_compiled_time |   bwd_eager_time |   bwd_compiled_time |   fwd_speedup |   bwd_speedup |
|---------------|------------|----------------|-------------------------------|------------------|---------------------|------------------|---------------------|---------------|---------------|
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.264 |              17.184 |          107.040 |             140.800 |         0.888 |         0.760 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.840 |              19.744 |          112.576 |             140.064 |         0.802 |         0.804 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.232 |              17.344 |           87.744 |             142.496 |         0.878 |         0.616 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.264 |              17.184 |          108.192 |             143.328 |         0.888 |         0.755 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.904 |              22.400 |          106.432 |             136.512 |         0.889 |         0.780 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.424 |              26.752 |           91.712 |             106.688 |         0.726 |         0.860 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.808 |              22.432 |           89.024 |             101.920 |         0.883 |         0.873 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.840 |              22.272 |           88.896 |             102.592 |         0.891 |         0.867 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.240 |              32.416 |          116.768 |             112.256 |         0.933 |         1.040 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           29.536 |              37.024 |          113.664 |             102.688 |         0.798 |         1.107 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.656 |              32.800 |          116.992 |             127.008 |         0.935 |         0.921 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.592 |              32.480 |          116.928 |             112.160 |         0.942 |         1.043 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.448 |              61.920 |          198.656 |             204.512 |         0.653 |         0.971 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           37.760 |              62.528 |          189.536 |             170.624 |         0.604 |         1.111 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.896 |              62.368 |          198.304 |             205.824 |         0.656 |         0.963 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           40.448 |              61.952 |          198.432 |             203.648 |         0.653 |         0.974 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          318.528 |             355.904 |          947.232 |            1162.496 |         0.895 |         0.815 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          199.776 |             252.128 |          677.792 |             813.184 |         0.792 |         0.834 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          316.512 |             363.328 |          947.712 |            1361.984 |         0.871 |         0.696 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          317.984 |             356.864 |          947.264 |            1165.024 |         0.891 |         0.813 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          446.656 |             734.656 |         1664.288 |            2172.960 |         0.608 |         0.766 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          278.688 |             467.648 |         1182.624 |            1339.296 |         0.596 |         0.883 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          447.872 |             744.096 |         1662.944 |            2196.544 |         0.602 |         0.757 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          448.128 |             732.928 |         1663.072 |            2156.800 |         0.611 |         0.771 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.648 |              16.640 |          107.520 |             143.008 |         0.940 |         0.752 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.776 |              18.240 |          129.056 |             141.920 |         0.865 |         0.909 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.168 |              16.640 |          103.616 |             139.648 |         0.912 |         0.742 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.616 |              16.640 |          128.608 |             164.448 |         0.938 |         0.782 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.776 |              21.952 |          125.344 |             170.304 |         0.901 |         0.736 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.776 |              23.712 |          104.288 |             196.896 |         0.834 |         0.530 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.072 |              21.952 |          102.080 |             177.056 |         0.869 |         0.577 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.648 |              21.920 |          109.920 |             170.848 |         0.896 |         0.643 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.464 |              31.936 |          127.808 |             228.832 |         0.954 |         0.559 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           29.472 |              33.856 |          113.152 |             215.072 |         0.871 |         0.526 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.496 |              32.160 |          116.576 |             231.744 |         0.948 |         0.503 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.464 |              31.904 |          116.320 |             229.824 |         0.955 |         0.506 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.480 |              61.440 |          176.448 |             345.312 |         0.659 |         0.511 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           38.304 |              59.424 |          169.312 |             371.360 |         0.645 |         0.456 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.960 |              61.760 |          176.512 |             358.912 |         0.663 |         0.492 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           40.352 |              61.696 |          176.512 |             344.928 |         0.654 |         0.512 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          316.224 |             357.728 |          905.728 |            1668.448 |         0.884 |         0.543 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          199.904 |             248.416 |          636.544 |            1109.088 |         0.805 |         0.574 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          314.880 |             363.616 |          906.304 |            1658.176 |         0.866 |         0.547 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          316.160 |             354.368 |          906.080 |            1649.024 |         0.892 |         0.549 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.912 |             739.840 |         1555.808 |            2521.952 |         0.604 |         0.617 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          279.776 |             463.904 |         1068.928 |            1849.888 |         0.603 |         0.578 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.080 |             748.960 |         1553.504 |            2629.888 |         0.596 |         0.591 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          446.208 |             740.608 |         1558.880 |            2524.960 |         0.602 |         0.617 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           33.568 |              41.280 |          170.016 |             147.584 |         0.813 |         1.152 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           30.688 |              43.040 |          159.552 |             146.720 |         0.713 |         1.087 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           34.112 |              41.504 |          170.112 |             152.672 |         0.822 |         1.114 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           34.240 |              41.152 |          170.272 |             134.976 |         0.832 |         1.261 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.672 |              76.416 |          295.296 |             263.648 |         0.637 |         1.120 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           45.088 |              72.576 |          281.920 |             237.664 |         0.621 |         1.186 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.032 |              76.672 |          295.520 |             265.248 |         0.626 |         1.114 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.096 |              76.096 |          295.456 |             262.112 |         0.632 |         1.127 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           93.920 |             111.232 |          401.568 |             382.944 |         0.844 |         1.049 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           68.192 |              95.232 |          338.752 |             326.816 |         0.716 |         1.037 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           93.984 |             111.840 |          401.856 |             444.224 |         0.840 |         0.905 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           94.176 |             110.496 |          401.600 |             383.136 |         0.852 |         1.048 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.488 |             227.040 |          727.424 |             739.712 |         0.579 |         0.983 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |           95.616 |             169.760 |          616.864 |             574.112 |         0.563 |         1.074 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.680 |             228.672 |          727.616 |             746.048 |         0.576 |         0.975 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          131.104 |             225.696 |          727.904 |             735.392 |         0.581 |         0.990 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1227.296 |            1386.656 |         3720.192 |            4539.904 |         0.885 |         0.819 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |          691.360 |             831.712 |         2515.872 |            3067.808 |         0.831 |         0.820 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1228.192 |            1403.136 |         3715.520 |            5309.280 |         0.875 |         0.700 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1229.024 |            1384.992 |         3715.904 |            4550.368 |         0.887 |         0.817 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1784.832 |            2865.888 |         6539.840 |            8460.224 |         0.623 |         0.773 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1017.408 |            1660.480 |         4369.824 |            5056.992 |         0.613 |         0.864 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1792.448 |            2904.864 |         6546.080 |            8537.024 |         0.617 |         0.767 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1795.552 |            2856.864 |         6544.672 |            8400.160 |         0.629 |         0.779 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              38.880 |          148.832 |             179.936 |         0.881 |         0.827 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           31.168 |              38.080 |          138.528 |             167.552 |         0.818 |         0.827 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              39.168 |          148.512 |             181.248 |         0.874 |         0.819 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           34.240 |              38.784 |          148.864 |             180.224 |         0.883 |         0.826 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.832 |              76.352 |          253.632 |             295.968 |         0.640 |         0.857 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           45.760 |              65.792 |          239.040 |             290.752 |         0.696 |         0.822 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.768 |              76.576 |          253.312 |             304.032 |         0.637 |         0.833 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           48.768 |              76.192 |          253.600 |             296.096 |         0.640 |         0.856 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.728 |             109.728 |          357.696 |             498.912 |         0.854 |         0.717 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           68.704 |              92.288 |          295.616 |             386.240 |         0.744 |         0.765 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.632 |             111.392 |          357.408 |             512.448 |         0.841 |         0.697 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           93.280 |             109.952 |          357.696 |             501.440 |         0.848 |         0.713 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.392 |             230.496 |          612.224 |             807.552 |         0.570 |         0.758 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |           96.512 |             165.184 |          502.624 |             672.384 |         0.584 |         0.748 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.360 |             232.608 |          612.064 |             832.320 |         0.565 |         0.735 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          131.008 |             230.528 |          612.640 |             804.320 |         0.568 |         0.762 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1227.968 |            1377.408 |         3477.920 |            5324.384 |         0.892 |         0.653 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |          695.264 |             824.544 |         2268.224 |            3210.208 |         0.843 |         0.707 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1228.640 |            1404.576 |         3476.832 |            5463.456 |         0.875 |         0.636 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1228.416 |            1378.752 |         3478.048 |            5367.712 |         0.891 |         0.648 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1788.736 |            2867.712 |         6039.520 |            8616.256 |         0.624 |         0.701 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1021.952 |            1653.824 |         3866.208 |            5306.848 |         0.618 |         0.729 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1786.752 |            2896.352 |         6044.128 |            8871.360 |         0.617 |         0.681 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1786.080 |            2868.672 |         6040.160 |            8550.144 |         0.623 |         0.706 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           57.504 |              71.552 |          312.768 |             255.040 |         0.804 |         1.226 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           49.472 |              71.104 |          285.696 |             243.520 |         0.696 |         1.173 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           58.112 |              72.896 |          312.768 |             288.256 |         0.797 |         1.085 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           57.952 |              71.680 |          312.768 |             255.552 |         0.808 |         1.224 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.336 |             144.256 |          580.128 |             500.160 |         0.571 |         1.160 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           76.160 |             123.712 |          552.544 |             447.648 |         0.616 |         1.234 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.400 |             145.184 |          580.032 |             504.032 |         0.568 |         1.151 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           82.368 |             143.904 |          580.192 |             499.936 |         0.572 |         1.161 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.216 |             209.568 |          787.872 |             747.712 |         0.846 |         1.054 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          121.984 |             168.256 |          651.968 |             628.256 |         0.725 |         1.038 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.088 |             211.488 |          788.320 |             864.352 |         0.837 |         0.912 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          177.440 |             208.576 |          787.424 |             749.120 |         0.851 |         1.051 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          249.472 |             441.376 |         1405.440 |            1431.648 |         0.565 |         0.982 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          172.960 |             312.064 |         1172.064 |            1096.448 |         0.554 |         1.069 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          249.632 |             446.336 |         1405.408 |            1448.480 |         0.559 |         0.970 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          250.944 |             440.128 |         1406.624 |            1421.952 |         0.570 |         0.989 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2418.720 |            2747.936 |         7330.432 |            9023.712 |         0.880 |         0.812 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         1353.696 |            1608.480 |         4941.696 |            6078.752 |         0.842 |         0.813 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2427.456 |            2746.816 |         7329.792 |           10539.968 |         0.884 |         0.695 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2426.688 |            2763.168 |         7336.256 |            9057.536 |         0.878 |         0.810 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3554.240 |            5634.400 |        12919.872 |           16843.489 |         0.631 |         0.767 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         2003.648 |            3250.784 |         8610.144 |           10015.424 |         0.616 |         0.860 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3582.080 |            5710.944 |        12923.328 |           17011.871 |         0.627 |         0.760 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3581.920 |            5618.144 |        12934.528 |           16745.888 |         0.638 |         0.772 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.120 |              71.232 |          269.760 |             295.680 |         0.802 |         0.912 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           49.408 |              65.312 |          242.304 |             253.952 |         0.756 |         0.954 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.504 |              72.544 |          269.632 |             298.976 |         0.793 |         0.902 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           57.760 |              71.040 |          269.600 |             296.640 |         0.813 |         0.909 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           82.336 |             147.168 |          466.080 |             487.456 |         0.559 |         0.956 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           76.704 |             115.040 |          435.392 |             453.248 |         0.667 |         0.961 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           81.856 |             147.424 |          465.920 |             499.552 |         0.555 |         0.933 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           81.760 |             146.656 |          466.176 |             485.984 |         0.557 |         0.959 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          176.608 |             206.976 |          678.080 |             866.976 |         0.853 |         0.782 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          121.664 |             164.768 |          538.240 |             636.160 |         0.738 |         0.846 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          176.608 |             209.664 |          677.696 |             883.424 |         0.842 |         0.767 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          177.440 |             207.840 |          677.248 |             868.288 |         0.854 |         0.780 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          250.272 |             449.536 |         1163.424 |            1420.832 |         0.557 |         0.819 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          173.472 |             305.376 |          929.408 |            1104.544 |         0.568 |         0.841 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          249.376 |             454.976 |         1163.648 |            1455.296 |         0.548 |         0.800 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          250.368 |             450.144 |         1163.520 |            1409.984 |         0.556 |         0.825 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2416.576 |            2726.208 |         6835.520 |           10442.784 |         0.886 |         0.655 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         1357.440 |            1590.752 |         4433.664 |            5975.296 |         0.853 |         0.742 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2427.360 |            2747.040 |         6853.056 |           10670.784 |         0.884 |         0.642 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2441.120 |            2718.944 |         6836.640 |           10433.792 |         0.898 |         0.655 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3555.392 |            5620.960 |        11944.000 |           16504.801 |         0.633 |         0.724 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         2010.848 |            3241.152 |         7636.064 |            9870.464 |         0.620 |         0.774 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3557.440 |            5688.352 |        11935.744 |           17090.496 |         0.625 |         0.698 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3562.720 |            5630.432 |        11939.168 |           16392.033 |         0.633 |         0.728 |

</details>

### Perf after this PR

**FWD**

| Type    |   Speedup | score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)      |
|---------|-----------|---------------|------------|----------------|----------------------------|
| Average |     0.776 |               |            |                |                            |
| Max     |     1.006 | None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64) |
| Min     |     0.566 | relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128) |

**BWD**

| Type    |   Speedup | score_mod   | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)       |
|---------|-----------|-------------|------------|----------------|-----------------------------|
| Average |     0.817 |             |            |                |                             |
| Max     |     1.150 | None        | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128) |
| Min     |     0.454 | None        | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128) |

<details>
<summary> Full performance sweep </summary>

| score_mod     | mask_mod   | dtype          | shape(B,Hq,M,Hkv,N,D)         |   fwd_eager_time |   fwd_compiled_time |   bwd_eager_time |   bwd_compiled_time |   fwd_speedup |   bwd_speedup |
|---------------|------------|----------------|-------------------------------|------------------|---------------------|------------------|---------------------|---------------|---------------|
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.680 |              17.056 |           64.544 |              73.376 |         0.919 |         0.880 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           15.712 |              19.872 |           65.408 |              72.864 |         0.791 |         0.898 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           16.160 |              17.280 |           64.896 |              73.888 |         0.935 |         0.878 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 64)     |           16.192 |              17.120 |           64.896 |              75.424 |         0.946 |         0.860 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.648 |              22.496 |           89.184 |              82.592 |         0.873 |         1.080 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           20.320 |              26.816 |           91.264 |              82.880 |         0.758 |         1.101 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           20.096 |              22.528 |           89.184 |              83.776 |         0.892 |         1.065 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 16, 512, 128)    |           19.680 |              22.432 |           89.184 |             120.096 |         0.877 |         0.743 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.384 |              32.512 |          119.232 |             128.960 |         0.996 |         0.925 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           30.176 |              37.248 |          113.664 |             119.520 |         0.810 |         0.951 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.512 |              32.928 |          119.264 |             131.456 |         0.987 |         0.907 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64)   |           32.448 |              32.704 |          119.200 |             128.352 |         0.992 |         0.929 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           41.952 |              62.176 |          199.040 |             214.304 |         0.675 |         0.929 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           39.744 |              62.880 |          189.504 |             179.968 |         0.632 |         1.053 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           41.472 |              62.784 |          199.136 |             217.664 |         0.661 |         0.915 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128)  |           42.048 |              61.952 |          199.168 |             214.496 |         0.679 |         0.929 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          341.184 |             357.632 |          980.256 |            1328.896 |         0.954 |         0.738 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          212.576 |             252.960 |          673.888 |             824.864 |         0.840 |         0.817 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          340.000 |             363.296 |          980.768 |            1375.808 |         0.936 |         0.713 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64)   |          340.768 |             356.832 |          980.960 |            1326.272 |         0.955 |         0.740 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          459.392 |             737.120 |         1678.240 |            2205.248 |         0.623 |         0.761 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          292.672 |             468.096 |         1178.016 |            1371.584 |         0.625 |         0.859 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          462.144 |             745.312 |         1680.000 |            2252.512 |         0.620 |         0.746 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128)  |          462.112 |             736.576 |         1679.008 |            2216.480 |         0.627 |         0.758 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.064 |              16.704 |          105.120 |             120.768 |         0.962 |         0.870 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           15.552 |              18.144 |          107.136 |             121.696 |         0.857 |         0.880 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.096 |              16.768 |          102.688 |             120.864 |         0.960 |         0.850 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 64)      |           16.032 |              16.576 |          104.736 |             124.672 |         0.967 |         0.840 |
| None          | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.392 |              21.952 |          104.736 |             174.656 |         0.883 |         0.600 |
| None          | causal     | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           20.128 |              23.712 |          105.216 |             199.008 |         0.849 |         0.529 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.904 |              21.888 |          103.744 |             179.520 |         0.909 |         0.578 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 512, 2, 512, 128)     |           19.968 |              21.952 |          104.640 |             177.312 |         0.910 |         0.590 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.096 |              31.904 |          118.720 |             231.968 |         1.006 |         0.512 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           30.528 |              33.952 |          112.480 |             218.304 |         0.899 |         0.515 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.160 |              32.224 |          118.752 |             237.312 |         0.998 |         0.500 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 64)    |           32.128 |              32.032 |          118.240 |             233.120 |         1.003 |         0.507 |
| None          | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.312 |              61.280 |          177.408 |             350.688 |         0.674 |         0.506 |
| None          | causal     | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           39.552 |              59.360 |          168.832 |             371.488 |         0.666 |         0.454 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.984 |              61.696 |          177.376 |             360.416 |         0.680 |         0.492 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 1024, 2, 1024, 128)   |           41.312 |              61.760 |          177.184 |             355.744 |         0.669 |         0.498 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          339.744 |             357.888 |          939.712 |            1665.376 |         0.949 |         0.564 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          212.608 |             248.832 |          633.280 |            1122.848 |         0.854 |         0.564 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          339.712 |             363.232 |          940.448 |            1689.440 |         0.935 |         0.557 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 64)    |          341.056 |             355.264 |          940.128 |            1641.152 |         0.960 |         0.573 |
| None          | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.736 |             741.024 |         1569.824 |            2559.552 |         0.622 |         0.613 |
| None          | causal     | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          293.856 |             464.192 |         1066.240 |            1840.416 |         0.633 |         0.579 |
| relative_bias | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.704 |             753.152 |         1570.112 |            2641.088 |         0.612 |         0.594 |
| head_bias     | None       | torch.bfloat16 | (2, 16, 4096, 2, 4096, 128)   |          460.832 |             745.536 |         1570.144 |            2602.560 |         0.618 |         0.603 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.680 |              41.280 |          171.840 |             158.176 |         0.864 |         1.086 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           31.360 |              42.976 |          158.912 |             139.264 |         0.730 |         1.141 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.168 |              41.600 |          171.648 |             161.344 |         0.845 |         1.064 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 64)     |           35.136 |              41.152 |          171.808 |             158.336 |         0.854 |         1.085 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.832 |              76.384 |          295.680 |             277.696 |         0.639 |         1.065 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           45.632 |              72.512 |          281.760 |             250.752 |         0.629 |         1.124 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           49.504 |              76.608 |          295.584 |             279.712 |         0.646 |         1.057 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 16, 512, 128)    |           48.864 |              75.904 |          295.456 |             277.568 |         0.644 |         1.064 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           99.392 |             111.232 |          408.640 |             442.656 |         0.894 |         0.923 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           71.392 |              95.168 |          338.784 |             341.760 |         0.750 |         0.991 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |           99.808 |             112.256 |          408.608 |             456.160 |         0.889 |         0.896 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 64)   |          100.032 |             110.816 |          408.512 |             444.192 |         0.903 |         0.920 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.040 |             226.112 |          726.880 |             774.176 |         0.597 |         0.939 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |           99.904 |             169.696 |          616.448 |             607.104 |         0.589 |         1.015 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.488 |             228.384 |          727.776 |             782.368 |         0.593 |         0.930 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 16, 1024, 128)  |          135.744 |             225.664 |          728.000 |             773.600 |         0.602 |         0.941 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1324.192 |            1387.808 |         3866.944 |            5217.184 |         0.954 |         0.741 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |          738.464 |             832.608 |         2507.392 |            3146.688 |         0.887 |         0.797 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1326.016 |            1404.256 |         3867.872 |            5382.624 |         0.944 |         0.719 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 64)   |         1326.144 |            1386.688 |         3867.552 |            5203.264 |         0.956 |         0.743 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1847.488 |            2866.336 |         6612.704 |            8597.696 |         0.645 |         0.769 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1066.592 |            1660.640 |         4357.696 |            5174.016 |         0.642 |         0.842 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1850.464 |            2905.408 |         6616.928 |            8793.280 |         0.637 |         0.752 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 16, 4096, 128)  |         1848.896 |            2834.720 |         6623.872 |            8637.920 |         0.652 |         0.767 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.384 |              38.656 |          150.336 |             182.624 |         0.941 |         0.823 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           31.360 |              38.112 |          137.664 |             171.840 |         0.823 |         0.801 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.608 |              39.040 |          150.528 |             183.872 |         0.938 |         0.819 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 64)      |           36.064 |              38.656 |          150.560 |             183.520 |         0.933 |         0.820 |
| None          | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.344 |              76.352 |          253.920 |             301.440 |         0.646 |         0.842 |
| None          | causal     | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           46.720 |              65.824 |          239.424 |             296.384 |         0.710 |         0.808 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.248 |              76.416 |          253.728 |             307.808 |         0.644 |         0.824 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 512, 2, 512, 128)     |           49.376 |              76.288 |          253.728 |             304.736 |         0.647 |         0.833 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.264 |             110.144 |          364.960 |             503.072 |         0.901 |         0.725 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           71.136 |              92.384 |          294.432 |             393.056 |         0.770 |         0.749 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.200 |             111.360 |          365.152 |             512.640 |         0.891 |         0.712 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 64)    |           99.264 |             110.240 |          365.088 |             504.224 |         0.900 |         0.724 |
| None          | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.680 |             230.336 |          613.472 |             816.896 |         0.589 |         0.751 |
| None          | causal     | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          100.256 |             165.088 |          502.144 |             676.480 |         0.607 |         0.742 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.008 |             232.480 |          613.184 |             836.672 |         0.581 |         0.733 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 1024, 2, 1024, 128)   |          135.232 |             230.624 |          613.536 |             827.136 |         0.586 |         0.742 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1324.064 |            1378.688 |         3631.808 |            5308.384 |         0.960 |         0.684 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |          731.776 |             826.688 |         2263.168 |            3241.344 |         0.885 |         0.698 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1316.128 |            1403.200 |         3625.088 |            5550.688 |         0.938 |         0.653 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 64)    |         1311.904 |            1378.880 |         3616.320 |            5353.696 |         0.951 |         0.675 |
| None          | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1837.856 |            2887.392 |         6121.632 |            8586.656 |         0.637 |         0.713 |
| None          | causal     | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1066.976 |            1654.368 |         3843.136 |            5291.040 |         0.645 |         0.726 |
| relative_bias | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1854.208 |            2896.832 |         6130.112 |            8745.984 |         0.640 |         0.701 |
| head_bias     | None       | torch.bfloat16 | (8, 16, 4096, 2, 4096, 128)   |         1860.512 |            2889.344 |         6135.648 |            8750.592 |         0.644 |         0.701 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           60.640 |              71.552 |          315.968 |             296.512 |         0.847 |         1.066 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           50.784 |              71.040 |          284.288 |             258.880 |         0.715 |         1.098 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           61.312 |              72.704 |          315.680 |             302.016 |         0.843 |         1.045 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 64)    |           60.800 |              71.776 |          316.320 |             297.152 |         0.847 |         1.065 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.576 |             144.416 |          580.576 |             535.936 |         0.586 |         1.083 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           76.064 |             123.648 |          553.344 |             481.376 |         0.615 |         1.150 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.160 |             145.248 |          581.024 |             540.000 |         0.579 |         1.076 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 16, 512, 128)   |           84.512 |             143.552 |          581.088 |             535.776 |         0.589 |         1.085 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.152 |             209.408 |          798.400 |             868.704 |         0.903 |         0.919 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          127.552 |             168.800 |          650.816 |             663.328 |         0.756 |         0.981 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.376 |             211.360 |          798.080 |             895.552 |         0.896 |         0.891 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 64)  |          189.440 |             208.576 |          797.888 |             873.152 |         0.908 |         0.914 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          257.536 |             441.760 |         1408.960 |            1514.720 |         0.583 |         0.930 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          179.328 |             312.096 |         1170.368 |            1177.472 |         0.575 |         0.994 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          259.264 |             446.944 |         1408.768 |            1530.400 |         0.580 |         0.921 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 16, 1024, 128) |          258.080 |             440.480 |         1408.864 |            1514.144 |         0.586 |         0.930 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2595.808 |            2771.456 |         7616.704 |           10405.248 |         0.937 |         0.732 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         1435.744 |            1610.336 |         4927.520 |            6220.000 |         0.892 |         0.792 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2595.264 |            2745.056 |         7611.232 |           10631.392 |         0.945 |         0.716 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 64)  |         2576.256 |            2735.456 |         7626.400 |           10346.976 |         0.942 |         0.737 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3679.744 |            5634.816 |        13077.056 |           17182.528 |         0.653 |         0.761 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         2099.360 |            3250.176 |         8589.664 |           10236.672 |         0.646 |         0.839 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3676.800 |            5716.288 |        13073.088 |           17311.071 |         0.643 |         0.755 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 16, 4096, 128) |         3679.136 |            5570.496 |        13070.720 |           17192.863 |         0.660 |         0.760 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.600 |              71.008 |          272.320 |             300.000 |         0.868 |         0.908 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           50.176 |              65.344 |          241.568 |             258.912 |         0.768 |         0.933 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.120 |              72.512 |          272.672 |             305.408 |         0.843 |         0.893 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 64)     |           61.248 |              71.136 |          272.640 |             301.120 |         0.861 |         0.905 |
| None          | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.872 |             146.784 |          466.912 |             496.832 |         0.571 |         0.940 |
| None          | causal     | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           76.704 |             115.072 |          435.584 |             462.112 |         0.667 |         0.943 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.392 |             147.392 |          466.656 |             504.448 |         0.566 |         0.925 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 512, 2, 512, 128)    |           83.360 |             146.688 |          466.656 |             499.040 |         0.568 |         0.935 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          189.024 |             207.584 |          684.768 |             873.568 |         0.911 |         0.784 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          126.944 |             164.288 |          536.192 |             645.984 |         0.773 |         0.830 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          188.768 |             209.760 |          684.096 |             897.504 |         0.900 |         0.762 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 64)   |          189.408 |             207.776 |          685.024 |             876.384 |         0.912 |         0.782 |
| None          | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          259.168 |             449.536 |         1167.936 |            1433.280 |         0.577 |         0.815 |
| None          | causal     | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          180.000 |             305.312 |          928.000 |            1113.920 |         0.590 |         0.833 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          258.464 |             455.136 |         1167.808 |            1462.848 |         0.568 |         0.798 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 1024, 2, 1024, 128)  |          257.824 |             450.208 |         1167.744 |            1448.000 |         0.573 |         0.806 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2598.368 |            2729.120 |         7134.400 |           10381.632 |         0.952 |         0.687 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         1435.456 |            1591.040 |         4424.768 |            6035.808 |         0.902 |         0.733 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2594.752 |            2725.952 |         7128.384 |           10822.496 |         0.952 |         0.659 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 64)   |         2597.888 |            2716.960 |         7101.568 |           10385.440 |         0.956 |         0.684 |
| None          | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3647.648 |            5581.632 |        12089.952 |           16667.233 |         0.654 |         0.725 |
| None          | causal     | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         2093.952 |            3241.440 |         7579.392 |            9847.936 |         0.646 |         0.770 |
| relative_bias | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3650.528 |            5650.688 |        12105.568 |           16963.680 |         0.646 |         0.714 |
| head_bias     | None       | torch.bfloat16 | (16, 16, 4096, 2, 4096, 128)  |         3680.064 |            5585.312 |        12117.504 |           16935.040 |         0.659 |         0.716 |

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135505
Approved by: https://github.com/Chillee
2024-09-10 09:30:02 +00:00
23b1486185 [MPS] Allow nan mean reduction in nll_loss (#135434)
This PR allows results from `nll_loss` to be `nan`, which matches the behavior on CUDA and CPU https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162.

Fixes #134431

Ref #64572 #119108
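
A minimal repro sketch of the behavior this aligns (assumes an available MPS device; not taken from the PR's test suite):

```
import torch
import torch.nn.functional as F

# All targets equal the default ignore_index (-100), so the mean reduction
# divides by a zero weight sum and the loss is nan -- matching CPU/CUDA.
logits = torch.randn(4, 3, device="mps")          # assumes an MPS device is available
targets = torch.full((4,), -100, device="mps")    # int64 targets, all ignored
loss = F.nll_loss(F.log_softmax(logits, dim=-1), targets)
print(loss)  # expected: tensor(nan, device='mps:0')
```
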
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135434
Approved by: https://github.com/malfet
2024-09-10 08:37:59 +00:00
9902b349cb [Inductor] Make static_input_idxs a set for faster lookup (#135314)
`static_input_idxs` is only used for lookups. With large models, this is a large list. This takes over a millisecond in some cases.

Profile before change:
<img width="824" alt="image" src="https://github.com/user-attachments/assets/002a0775-fd2f-4d27-8cf2-812b502d7d5e">

Profile after change: gaps are smaller, 1ms speedup before launching the cuda graph
<img width="794" alt="image" src="https://github.com/user-attachments/assets/12a0a0b9-2cc1-4d53-ac87-9bd5140a46f5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135314
Approved by: https://github.com/oulgen
2024-09-10 07:27:55 +00:00
5a9ac83e94 Fix doc (#135551)
Differential Revision: [D62412667](https://our.internmc.facebook.com/intern/diff/D62412667/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135551
Approved by: https://github.com/yushangdi
ghstack dependencies: #135549
2024-09-10 07:18:44 +00:00
1adf28a5c0 [inductor] print triton float64 constants correctly (#135260)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135260
Approved by: https://github.com/jansel
2024-09-10 07:05:02 +00:00
c18052da0e Add some minor doc improvement and ban using training IR for unflattener (#135549)
Title

Differential Revision: [D62412490](https://our.internmc.facebook.com/intern/diff/D62412490/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135549
Approved by: https://github.com/yushangdi
2024-09-10 06:48:42 +00:00
c0d2f991b1 Increase TRITON_MAX_BLOCK['X'] (#135181)
Fixes #135028

As title, increase `TRITON_MAX_BLOCK['X']` to 4096 and fix an error, thanks to @Chillee: https://github.com/pytorch/pytorch/pull/133300/files#r1744706189

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135181
Approved by: https://github.com/jansel
2024-09-10 05:54:37 +00:00
e889252493 Implementation of scan (#134102)
This operation is meant to be the counterpart to `associative_scan`, but it can operate with non-associative functions.

@ydwu4
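
For intuition only, here is the behavior a (possibly non-associative) scan targets; this is a plain-Python sketch, not the higher-order op's actual API:

```
from typing import Callable, List, TypeVar

S = TypeVar("S")
T = TypeVar("T")

def scan(combine: Callable[[S, T], S], init: S, xs: List[T]) -> List[S]:
    # Thread a carry through the inputs and record every intermediate result.
    # Unlike associative_scan, combine need not be associative.
    carry, outs = init, []
    for x in xs:
        carry = combine(carry, x)
        outs.append(carry)
    return outs

print(scan(lambda c, x: c - x, 0, [1, 2, 3]))  # [-1, -3, -6]; order matters
```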

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134102
Approved by: https://github.com/ydwu4
2024-09-10 04:51:16 +00:00
6546c6186d do not raise when flatten_fn_with_keys not found when suggesting fixes (#135518)
Test Plan: added test

Differential Revision: D62395371

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135518
Approved by: https://github.com/zhxchen17
2024-09-10 03:47:36 +00:00
1d9fefff19 [DCP] Fixes the stateless optimizer issue of distributed state_dict (#135535)
Some optimizers don't have state, which can cause get_state_dict/set_state_dict to behave incorrectly. This PR fixes the issue.

fixes: https://github.com/pytorch/pytorch/issues/133415
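
A sketch of the failure mode, assuming the trigger is the empty optimizer state (not the exact repro from the linked issue):

```
import torch

# SGD without momentum keeps no per-parameter state, so its state dict has an
# empty "state" section -- the case the distributed state_dict helpers
# previously mishandled.
model = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
model(torch.randn(2, 4)).sum().backward()
opt.step()
print(opt.state_dict()["state"])  # {} -- nothing to save or restore
```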

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135535
Approved by: https://github.com/wz337
2024-09-10 03:10:00 +00:00
7ec17b49cf Fix dynamo benchmark skip logic for cpu device (#135193)
Fixes #132380, adjust torchbench and huggingface skip models list, then we can remove `--no-skip` when running benchmarks on 3 suites.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135193
Approved by: https://github.com/chuanqi129, https://github.com/jansel
2024-09-10 03:02:19 +00:00
146921007a [inductor] [cpp] fix the input contiguous check in max-autotune (#134982)
## Description
Fixes the FP32 accuracy failure of `resmlp_12_224` and BF16 accuracy failure of `volo_d1_224` in timm.

In this PR, we check whether the input is contiguous in the following way:
If it has a `FixedLayout`, we know the exact strides. For a `FlexibleLayout`, if its data is a `ComputedBuffer`, we can use the fill order of the buffer to decide whether it's contiguous. In all other cases, we don't use the GEMM template, since we can't infer whether the input is contiguous.
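
A loose analogue of the stride condition being checked (plain tensor code, not Inductor's layout logic):

```
import torch

# The CPP GEMM template requires the input's last-dimension stride to be 1.
# A (3072, 196) buffer with stride (1, 3072), as in this model, violates that.
x = torch.empty_strided((3072, 196), (1, 3072))
print(x.stride())             # (1, 3072): last-dim stride is 3072, not 1
print(x.stride(-1) == 1)      # False -> the GEMM template must not be used
print(x.t().is_contiguous())  # True: it is the transpose of a contiguous buffer
```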

## Additional context
The current GEMM template only supports this case: `input.get_stride()[-1] == 1`. In `resmlp_12_224`, when we run into this check, the layout of `input` is a `FlexibleLayout`. The reason is that when realizing the input which is a `View` IR, the `convert_to_reinterpret_view` call fails:
d14fe3ffed/torch/_inductor/ir.py (L4712-L4715)

And it finally runs into this `copy_input` and returns a `FlexibleLayout`.
d14fe3ffed/torch/_inductor/ir.py (L4722)

When checking its stride, this `FlexibleLayout` indeed satisfies `input.get_stride()[-1] == 1` but it is later decided as a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`, which is not supported by the GEMM template, thus causing accuracy issue in this model.
The `FlexibleLayout` is converted to `FixedLayout` during [CppPackedGemmTemplate.add_choices](d14fe3ffed/torch/_inductor/mkldnn_lowerings.py (L1051)) which calls [slice_nd](d14fe3ffed/torch/_inductor/codegen/cpp_template_kernel.py (L150)) when rendering the kernel (`slice_nd(X)`). When creating the `SliceView` IR, [as_storage_and_layout](d14fe3ffed/torch/_inductor/ir.py (L2288)) invokes
[decide_layout](d14fe3ffed/torch/_inductor/ir.py (L2135)) and converts it to a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134982
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-10 02:47:38 +00:00
a71e5509bc [inductor]Add profiler to operatorbench (#135515)
Add profiling to operatorbench. A new `--profile` argument is added, and the profiling trace looks like the following figure.
<img width="954" alt="image" src="https://github.com/user-attachments/assets/5b00d6e3-4905-4a77-a5e9-9f62620a5fd5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135515
Approved by: https://github.com/shunting314
2024-09-10 02:33:30 +00:00
136e28f616 Enable forward AD in functional.affine_grid (#135494)
Fixes #121411
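
A minimal sketch of what this enables (assumes a build that includes this change; shapes and values are illustrative):

```
import torch
import torch.nn.functional as F
import torch.autograd.forward_ad as fwAD

# With forward AD supported, a tangent on theta propagates to the sampling grid.
theta = torch.eye(2, 3).unsqueeze(0)      # (N, 2, 3) batch of affine matrices
tangent = torch.randn_like(theta)
with fwAD.dual_level():
    dual_theta = fwAD.make_dual(theta, tangent)
    grid = F.affine_grid(dual_theta, size=(1, 1, 4, 4), align_corners=False)
    primal, jvp = fwAD.unpack_dual(grid)
print(jvp.shape)  # torch.Size([1, 4, 4, 2])
```
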
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135494
Approved by: https://github.com/zou3519, https://github.com/soulitzer
2024-09-10 00:07:07 +00:00
39a61795e3 remove amax_ptr from scaled_gemm (#135421)
amax was removed from _scaled_mm by #128683. Remove it from the internal at::cuda::blas::scaled_gemm, as well.  This allows hipBLASLt to find additional solutions rather than forcing amax to be used and then discarding the result.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135421
Approved by: https://github.com/drisspg, https://github.com/eqy
2024-09-09 23:04:36 +00:00
b4feec9782 [xplat][XNNPACK] don't prefer static linkage in xplat for main target (#135529)
Building XNNPACK as a static library has some issues because of multiple global params floating around.

Let's try to get rid of it in xplat and see how it fares.

Differential Revision: [D60776152](https://our.internmc.facebook.com/intern/diff/D60776152/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D60776152/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135529
Approved by: https://github.com/kimishpatel, https://github.com/mcr229, https://github.com/kirklandsign
2024-09-09 22:47:01 +00:00
d81731615f [Dynamo] Adding CallFunctionNoArgsSource and (#135425)
CallFunctionNoArgsGuardAccessor to support torch.cuda.current_device()

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135425
Approved by: https://github.com/anijain2305
2024-09-09 22:46:00 +00:00
e2f9a83b85 [ONNX] Drop final None values as inputs for nodes in exporter graph (#135520)
When a value for an optional input is not provided, it defaults to `None`, which gets translated to "" in the ONNX graph. To avoid this, if we have a list of inputs and the final few are all `None`, we strip them from the graph.
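
A hypothetical helper (not the exporter's actual function) showing the described trimming rule:

```
def strip_trailing_nones(inputs):
    # Drop trailing Nones so they don't become "" inputs in the ONNX graph;
    # interior Nones are kept because later inputs still occupy positions.
    while inputs and inputs[-1] is None:
        inputs = inputs[:-1]
    return inputs

print(strip_trailing_nones(["x", None, "scale", None, None]))  # ['x', None, 'scale']
```
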
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135520
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-09 22:28:41 +00:00
70a65a8bd5 Revert "NJT <-> padded dense conversions (#125947)"
This reverts commit 09a5e88bef04d5485b70d8f65f46a675aaa52942.

Reverted https://github.com/pytorch/pytorch/pull/125947 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing dynamo test 09a5e88bef, maybe a landrace ([comment](https://github.com/pytorch/pytorch/pull/125947#issuecomment-2339228570))
2024-09-09 22:01:09 +00:00
689d278543 Revert "Add __init__.py to shape inference folder. (#135461)"
This reverts commit dced0d6d9f05f0962f74a3c6227f774111c15715.

Reverted https://github.com/pytorch/pytorch/pull/135461 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it exposes some public function without appropriate doc. I will reopen the issue with hi-prio so that it can be fixed properly ([comment](https://github.com/pytorch/pytorch/pull/135461#issuecomment-2339218382))
2024-09-09 21:55:13 +00:00
9b764491e3 Use upload-artifact@v4.4.0 for create_release.yml (#135528)
Fixes failure: https://github.com/pytorch/pytorch/actions/runs/10780281005/job/29895846007

Due to a broken sync between:
```
actions/upload-artifact@v2
and
actions/download-artifact@v4.1.7
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135528
Approved by: https://github.com/kit1980, https://github.com/malfet
2024-09-09 20:48:52 +00:00
cbc6b30a24 Fix broken E2E tests on Linux machines (#135394)
Summary:
I'm not entirely sure why this is failing with an `ImportError` (which, according to lastnameye, is a superclass of `ModuleNotFoundError`), but in our E2E tests on Linux machines (though not on Macs?), the import failure is not getting caught --
`ImportError: cannot import name 'parutil' from 'libfb.py' (/data/sandcastle/boxes/eden-trunk-hg-full-fbsource/buck-out/v2/gen/fbsource/d0c916ec8d40ce11/arvr/libraries/ctrl/studies/replay/__ctrl-r__/ctrl-r#link-tree/libfb/py/__init__.py)` from this test run https://www.internalfb.com/sandcastle/workflow/2522015791331601269, an instance of this job:  https://www.internalfb.com/intern/test/844425085172858?ref_report_id=0 is the overall job

Test Plan:
`arc skycastle schedule tools/skycastle/workflows2/ctrl/js_tests.sky:test_js_e2e_replay_tests --sandcastle-spec-overrides '{"type": "fbcode", "unicastle_size": "I1_MEDIUM"}'`
->
https://www.internalfb.com/sandcastle/workflow/256705178764255769

Differential Revision: D62321167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135394
Approved by: https://github.com/laithsakka
2024-09-09 20:18:08 +00:00
5b368de7f7 Revert "[ONNX] Update fake mode usage in onnx docs (#135512)"
This reverts commit a13c118994b4f118388d97a35abcb91a396cd437.

Reverted https://github.com/pytorch/pytorch/pull/135512 on behalf of https://github.com/davidberard98 due to failing test  https://github.com/pytorch/pytorch/actions/runs/10778813316/job/29891679127 ([comment](https://github.com/pytorch/pytorch/pull/135512#issuecomment-2338999090))
2024-09-09 20:15:12 +00:00
09a5e88bef NJT <-> padded dense conversions (#125947)
This PR:
* Implements the pre-existing `nt.to_padded_tensor(padding_val)` ATen op via the FBGEMM kernel + appropriate view gymnastics (since that kernel only handles 2D values)
* Introduces a new `_nested_from_padded_tensor` op for the reverse conversion, implemented via the reverse FBGEMM kernel + view gymnastics
    * Note: there is currently no public API for this; design booted to a future PR

TODO:
* ~~Propagate min / max sequence length via the new factory function `_nested_from_padded_tensor`~~
* ~~Verify that Inductor does computation fusion via test logic~~

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125947
Approved by: https://github.com/soulitzer
2024-09-09 19:37:32 +00:00
a4e6a0b240 [split build] move periodic split builds into own concurrency group (#135510)
To avoid nightly workflows cancelling each other
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135510
Approved by: https://github.com/clee2000, https://github.com/huydhn, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-09 19:35:57 +00:00
4ab232d0c4 Fix symbolic number's type and tensor's dtype mismatch bug in Tensor ctor (#135433)
Fixes #135432

In the current implementation, if we try to store a symbolic number in Tensor's constructor, it assumes that the tensor's dtype and the symbolic number's type are matched, which is not the case.

In other words, if we try to store a `SymInt`, current implementation assumes tensor's dtype is `torch.int32`, `torch.int64` or something. And if we try to store a `SymFloat`, it assumes tensor's dtype is `torch.float32` or `torch.float64`. However, the tensor's dtype could also be `torch.float32` or something else when we try to store `SymInt`, which would be wrong.

This PR stores symbolic numbers according to the tensor's scalar type by wrapping the guarded number of `SymInt` and `SymFloat` into a PyObject.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135433
Approved by: https://github.com/ezyang
2024-09-09 19:32:18 +00:00
2032f107d7 Don't try to tag s390x docker images (#135509)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135509
Approved by: https://github.com/atalman
2024-09-09 19:07:48 +00:00
5f7d956362 Fix bugs blocking flipping the default layout constraint for custom ops (#135391)
Fixes two things:
- For regular PyTorch ops, the default layout constraint tag is always
flexible_layout. This was a bug with #135238
- Mark the new quantized _wrapped_linear_prepack ops as flexible_layout.
  The metas for these are incorrect; I didn't want to fix them (and
  changing the default requires that the metas actually be correct).

Test Plan:
- The next PR up in the stack. The PRs are split because the next one is
  riskier.

foo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135391
Approved by: https://github.com/albanD
2024-09-09 18:24:21 +00:00
a13c118994 [ONNX] Update fake mode usage in onnx docs (#135512)
Update fake mode usage in onnx docs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135512
Approved by: https://github.com/justinchuby
2024-09-09 18:10:37 +00:00
21241bfeee [CP] Extend CP to support load-balancing shards (#132442)
This PR extends the current ring attention to support load-balanced shards -- the context/sequence is divided into `2 * world_size` shards and each rank gets shards `rank` and `(world_size * 2 - rank - 1)`. The data re-shuffling is done in the `context_parallel` API.
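
A small sketch of the shard assignment described above (an illustrative helper, not the `context_parallel` implementation):

```
def shard_ids(rank: int, world_size: int) -> tuple:
    # The sequence is cut into 2 * world_size shards; rank r takes shards
    # r and (2 * world_size - r - 1), pairing an early (cheap under causal
    # masking) shard with a late (expensive) one.
    return (rank, 2 * world_size - rank - 1)

world_size = 4
print([shard_ids(r, world_size) for r in range(world_size)])
# [(0, 7), (1, 6), (2, 5), (3, 4)]
```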

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132442
Approved by: https://github.com/wconstab
2024-09-09 18:04:38 +00:00
73a6fc6e30 Revert "[Inductor] Make static_input_idxs a set for faster lookup (#135314)"
This reverts commit 011cae9570fb3c44b7f6f0c8004c470579ed21da.

Reverted https://github.com/pytorch/pytorch/pull/135314 on behalf of https://github.com/ZainRizvi due to Lint is failing on this file in trunk. See [GH job link](https://github.com/pytorch/pytorch/actions/runs/10777258770/job/29885960050) [HUD commit link](011cae9570) ([comment](https://github.com/pytorch/pytorch/pull/135314#issuecomment-2338678219))
2024-09-09 17:33:01 +00:00
09287e3af4 [MPS] Add regression test for fft.fftfreq (#135440)
The issue reported in #135223 was already solved in #128393. This PR adds a regression test for it.

Fixes #135223

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135440
Approved by: https://github.com/ezyang
2024-09-09 17:12:36 +00:00
16c3b8f87c [AOTI] Fix assert_function call in cpu autotune template (#135086)
Summary: In the ABI-compatible mode, assert_function should be AOTI_TORCH_CHECK.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135086
Approved by: https://github.com/chenyang78, https://github.com/angelayi
ghstack dependencies: #134857
2024-09-09 16:54:12 +00:00
9c6dff4941 [AOTI] Add C shim for aten.mkldnn_rnn_layer in cpp wrapper (#134857)
Summary: Support aten.mkldnn_rnn_layer in the ABI-compatible mode. Because aten.mkldnn_rnn_layer is an aten op, it is easier to add a C shim function for it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134857
Approved by: https://github.com/angelayi
2024-09-09 16:54:12 +00:00
0eb425a563 [Release] Apply Release changes scripts after release 2.4 (#135495)
Based on additional changes required for https://github.com/pytorch/pytorch/pull/128347
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135495
Approved by: https://github.com/kit1980
2024-09-09 16:49:04 +00:00
011cae9570 [Inductor] Make static_input_idxs a set for faster lookup (#135314)
`static_input_idxs` is only used for lookups. With large models, this is a large list. This takes over a millisecond in some cases.

Profile before change:
<img width="824" alt="image" src="https://github.com/user-attachments/assets/002a0775-fd2f-4d27-8cf2-812b502d7d5e">

Profile after change: gaps are smaller, 1ms speedup before launching the cuda graph
<img width="794" alt="image" src="https://github.com/user-attachments/assets/12a0a0b9-2cc1-4d53-ac87-9bd5140a46f5">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135314
Approved by: https://github.com/oulgen
2024-09-09 16:24:58 +00:00
dfb2b661f7 Use float data type for Half var_sum in batchnorm stats updating on CPU (#126525)
Use the float data type for Half `var_sum` when updating batchnorm stats on CPU, to avoid `var_sum` overflow, since the representation range of Half is small.
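
A quick illustration of why the accumulation dtype matters (not the kernel code):

```
import torch

# float16 tops out at 65504, so a plausible running var_sum overflows to inf
# if kept in Half, while float32 represents it without trouble.
var_sum = torch.tensor(1.0e6)   # float32
print(var_sum.half())           # tensor(inf, dtype=torch.float16)
print(var_sum)                  # tensor(1000000.)
```
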
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126525
Approved by: https://github.com/jgong5, https://github.com/peterbell10
2024-09-09 15:31:38 +00:00
5a69e0ebbe [MPS] Update decorator comments with issue ref (#135448)
Updating the comments with references to better places for context now that the bugs have been identified.

xref #135442 #135447 #134184

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135448
Approved by: https://github.com/ezyang
2024-09-09 15:18:52 +00:00
5e145861f2 [ONNX] Improves documentation of ONNX exporter (#135372)
The PR updates the documentation to reflect the changes introduced in PyTorch 2.5 related to the ONNX exporter.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135372
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2024-09-09 15:09:01 +00:00
c35b953531 Fix wrong error msg (#135423)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135423
Approved by: https://github.com/ezyang
2024-09-09 13:28:31 +00:00
dced0d6d9f Add __init__.py to shape inference folder. (#135461)
Fixes #135196

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135461
Approved by: https://github.com/ezyang
2024-09-09 13:27:58 +00:00
c0436c5701 [inductor][cpp][gemm] fix perf regression xcit_large_24_p8_224 (#134686) (#135438)
Fix #134686.

PR https://github.com/pytorch/pytorch/pull/132729 makes GEMM template faster for one of the GEMMs in xcit_large_24_p8_224:
SingleProcess AUTOTUNE benchmarking takes 1.7088 seconds and 1.9207 seconds precompiling
AUTOTUNE linear_unary(12544x3072, 768x3072, 768)
  cpp_packed_gemm_2 2.9371 ms 100.0%
  _linear_pointwise 3.1584 ms 93.0%

But it is slower than ATen in the e2e run due to different cache behavior. Access to the input data (12544x3072) is LLC latency bound, and bottlenecks are seen due to memory synchronization (data transfers and coherence updates across processors). This PR tries to mitigate the problem by having the processors that share the input data cooperatively load different chunks of it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135438
Approved by: https://github.com/leslie-fang-intel
2024-09-09 05:16:02 +00:00
cyy
60e8dc4374 Check function declarations in Caffe2 code (#134925)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134925
Approved by: https://github.com/ezyang
2024-09-09 05:03:29 +00:00
e6c3f58584 Fix example: Address broadcasting error in the addition of `attn_bias… (#135427)
…` and `attn_mask`, and correct device assignment for newly created variables in the method.

Fix example: Address broadcasting error in the addition of `attn_bias` and `attn_mask`, and correct device assignment for newly created variables in the method.

1. Adding `attn_bias += attn_mask` results in a broadcasting error. The expected shape of `attn_bias` is (L, S), so the output should also have the shape (L, S). However, when the input shape is (N, num_heads, L, S), broadcasting occurs, leading to an output shape of (N, num_heads, L, S), which is not desired (see the sketch after this list).
2. `attn_bias` is a newly created variable within the method, but it is not assigned to the correct device.
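
A repro sketch for point 1 (illustrative shapes, not the tutorial's exact code):

```
import torch

L, S = 4, 6
attn_bias = torch.zeros(L, S)
attn_mask = torch.zeros(2, 8, L, S)
# Out of place, the sum silently broadcasts up to (N, num_heads, L, S).
print((attn_bias + attn_mask).shape)   # torch.Size([2, 8, 4, 6])
try:
    attn_bias += attn_mask             # in place, the output can't grow, so it errors
except RuntimeError as e:
    print("in-place add failed:", e)
```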

**This is my retry of PR #130209 . The PR has been merged into commit `d4a79d4a7c746068d25fe5cf9333495561f4ce1f`, but the modifications were overwritten by subsequent commits.**

Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
@mikaylagawarecki  provided a more elegant implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135427
Approved by: https://github.com/ezyang
2024-09-09 03:47:34 +00:00
90e12cf63d Fix return type of nansum example. (#135435)
One of the examples in the documentation of `torch.nansum` contains a wrong return type. This fixes it.
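
For reference, a quick check of the documented behavior (not the doc example itself):

```
import torch

x = torch.tensor([1.0, float("nan"), 2.0])
print(torch.nansum(x))  # tensor(3.) -- nan entries are treated as zero
```
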
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135435
Approved by: https://github.com/ezyang
2024-09-09 03:34:52 +00:00
44c08f4984 [Partitioner] Query whether nodes exist in graph faster (#135316)
Finding whether a node exists in graph.nodes (a linked list) takes too long. Use graph._find_nodes_lookup_table (a hash table) instead to speed it up.
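
An illustration of the cost being avoided (not the partitioner's code; the lookup-table attribute is as named in the message above):

```
import torch.fx

def f(x):
    for _ in range(200):   # unrolled at trace time into 200 add nodes
        x = x + 1
    return x

gm = torch.fx.symbolic_trace(f)
last = list(gm.graph.nodes)[-1]
# "in graph.nodes" walks a linked list, so repeated checks scale with graph
# size; a hash-based lookup table answers the same question in O(1).
print(last in gm.graph.nodes)  # True, but found by a linear scan
```
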
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135316
Approved by: https://github.com/ezyang
2024-09-09 03:34:02 +00:00
b6186353c6 enable lazy_init for hpu (#135203)
Enables lazy_init for the HPU device.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135203
Approved by: https://github.com/ezyang
2024-09-09 03:32:20 +00:00
397 changed files with 11837 additions and 4068 deletions

View File

@ -1,5 +1,5 @@
0.6b
0.7b
manylinux_2_17
rocm6.2
7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
9be04068c3c0857a4cfd17d7e39e71d0423ebac2
3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291

View File

@ -1 +1 @@
cc981feba10a3f4c2e46f3fe368e8fcf5f5643df
91b14bf5593cf58a8541f3e6b9125600a867d4ef

View File

@ -1 +1 @@
757b6a61e7df814ba806f498f8bb3160f84b120c
5fe38ffd73c2ac6ed6323b554205186696631c6f

View File

@ -4,12 +4,12 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.bz2'
TARBALL='aotriton.tar.gz'
# This read command always returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects

View File

@ -337,3 +337,8 @@ onnxscript==0.1.0.dev20240817
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:
parameterized==0.8.1
#Description: Parameterizes unittests, both the tests themselves and the entire testing class
#Pinned versions:
#test that import:

View File

@ -1 +1 @@
3.0.0
3.1.0

View File

@ -43,6 +43,9 @@ python -m pip install z3-solver==4.12.2.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.25
# Install parameterized
python -m pip install parameterized==0.8.1
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -86,6 +86,18 @@
- pull
- inductor
- name: OSS CI / pytorchbot / slow tests
patterns:
- test/slow_tests.json
approved_by:
- pytorchbot
ignore_flaky_failures: false
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- slow
- name: OSS CI /pytorchbot / Executorch
patterns:
- .ci/docker/ci_commit_pins/executorch.txt

View File

@ -31,3 +31,4 @@ optree==0.12.1
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
# which the stringify metadata is wrong when escaping double quote
protobuf==3.20.2
parameterized==0.8.1

View File

@ -412,8 +412,8 @@ def generate_wheels_matrix(
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
# Special build building to use on Colab. Python 3.11 for 12.1 CUDA
if python_version == "3.11" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,

View File

@ -70,17 +70,15 @@ class BinaryBuildWorkflow:
)
else:
self.build_environment = f"{self.os}-binary-{self.package_type}"
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}.yml"
)
if self.use_split_build:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}-split.yml"
)
with open(output_file_path, "w") as output_file:
GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])

View File

@ -168,6 +168,14 @@ def gh_post_commit_comment(
)
def gh_close_pr(org: str, repo: str, pr_num: int, dry_run: bool = False) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}"
if dry_run:
print(f"Dry run closing PR {pr_num}")
else:
gh_fetch_url(url, method="PATCH", data={"state": "closed"})
def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
gh_fetch_url(url, method="DELETE")

View File

@ -3,49 +3,94 @@
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
which users will get their jobs to run on experimental runners. This user list
is also a comma separated list of additional features or experiments which the
user could be opted in to.
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
of which runners should be used to run which job.
The configuration has two parts, the settings and a list of opted-in users,
separated by a line containing "---". If the line is not present, the
settings are considered to be empty with only the second part, the user
list, defined.
The first part is a YAML block that defines the rollout settings. This can be
used to define any settings that are needed to determine which runners to use.
Its fields are defined by the RolloutSettings class below.
The second part is a list of users who are explicitly opted in to the LF fleet.
The user list is also a comma separated list of additional features or
experiments which the user could be opted in to.
The user list has the following rules:
- Users are GitHub usernames with the @ prefix
- If the first line is a "*" then all users will use the new runners
- If the first line is a "!" then all users will use the old runners
- Users are GitHub usernames, which must start with the @ prefix
- Each user is also a comma-separated list of features/experiments to enable
- A "#" prefix indicates the user is opted out of the new runners but is opting
into features/experiments.
- A "#" prefix opts the user out of all experiments
Example user list:
Example config:
# A list of experiments that can be opted into.
# This defines the behavior they'll induce when opted into.
# Expected syntax is:
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
@User1
@User2,amz2023
#@UserOptOutOfNewRunner,amz2023
experiments:
lf:
rollout_percent: 25
---
# Opt-ins:
# Users can opt into the LF fleet by adding their GitHub username to this list
# and specifying experiments to enable in a comma-separated list.
# Experiments should be from the above list.
@User1,lf,split_build
@User2,lf
@User3,split_build
"""
import logging
import os
import random
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
import yaml
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
DEFAULT_LABEL_PREFIX = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
RUNNER_AMI_LEGACY = ""
RUNNER_AMI_AMZ2023 = "amz2023"
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_AMI = "runner-ami"
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
SETTING_EXPERIMENTS = "experiments"
LF_FLEET_EXPERIMENT = "lf"
CANARY_FLEET_SUFFIX = ".c"
class Experiment(NamedTuple):
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
# Add more fields as needed
class Settings(NamedTuple):
"""
Settings for the experiments that can be opted into.
"""
experiments: Dict[str, Experiment] = {}
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
@ -172,85 +217,180 @@ def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
"""
Determines if the job should run on the LF fleet or the Meta fleet
Returns:
The appropriate label prefix for the runner, corresponding to the fleet to use.
This gets prefixed to the very start of the runner label.
"""
def load_yaml(yaml_text: str) -> Any:
try:
if rollout_state[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif rollout_state[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ").split(",")[0]
for usr_raw in rollout_state.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
data = yaml.safe_load(yaml_text)
return data
except yaml.YAMLError as exc:
log.exception("Error loading YAML")
raise
def get_optin_feature(
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
"""
Extracts the text with settings, if any, and the opted in users from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the rest is the users.
"""
rollout_state_parts = rollout_state.split("---")
if len(rollout_state_parts) >= 2:
return rollout_state_parts[0], rollout_state_parts[1]
else:
return "", rollout_state
class UserOptins(Dict[str, List[str]]):
"""
Dictionary of users with a list of features they have opted into
"""
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
"""
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
- Example line: "@User1,lf,split_build"
- A "#" prefix indicates the user is opted out of all experiments
"""
optins = UserOptins()
for user in user_optin_text.split("\n"):
user = user.strip("\r\n\t -")
if not user or not user.startswith("@"):
# Not a valid user. Skip
continue
if user:
usr_name = user.split(",")[0].strip("@")
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
return optins
def parse_settings_from_text(settings_text: str) -> Settings:
"""
Parse the experiments from the issue body into a list of ExperimentSettings
"""
try:
if settings_text:
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
# for easy reading
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
# the backtick character in shell commands.
backtick = chr(96) # backtick character
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
settings = load_yaml(settings_text)
# For now we just load experiments. We can expand this if/when we add more settings
experiments = {}
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
valid_settings = {}
for setting in exp_settings:
if setting not in Experiment._fields:
log.warning(
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
)
else:
valid_settings[setting] = exp_settings[setting]
experiments[exp_name] = Experiment(**valid_settings)
return Settings(experiments)
except Exception:
log.exception("Failed to parse settings")
return Settings()
def parse_settings(rollout_state: str) -> Settings:
"""
Parse settings, if any, from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the default values are used.
"""
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
return parse_settings_from_text(settings_text)
def parse_users(rollout_state: str) -> UserOptins:
"""
Parse users from the rollout state.
"""
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
return parse_user_opt_in_from_text(users_text)
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
"""
Check if a user is opted into an experiment
"""
return experiment_name in user_optins.get(user, [])
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
) -> str:
"""
Used to dynamically opt in jobs to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variant name is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
except Exception as e:
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since it determines the fleet we use
# - If it's enabled, then we always list its prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
@ -268,9 +408,10 @@ def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
@ -285,35 +426,18 @@ def main() -> None:
args.github_branch,
)
label_type = get_fleet(
rollout_state,
(
args.github_issue_owner,
username,
),
)
runner_ami = get_optin_feature(
rollout_state=rollout_state,
workflow_requestors=(
args.github_issue_owner,
username,
),
feature=RUNNER_AMI_AMZ2023,
fallback=RUNNER_AMI_LEGACY,
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
# For Canary builds use canary runners
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
label_type = WORKFLOW_LABEL_LF_CANARY
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
if __name__ == "__main__":

View File

@ -51,6 +51,8 @@ def main() -> None:
for platform_image in platform_images: # type: ignore[attr-defined]
for arch in platform_image.keys(): # type: ignore[attr-defined]
if arch == "cpu-s390x":
continue
tag_image(
platform_image[arch], # type: ignore[index]
default_tag,

View File

@ -0,0 +1,237 @@
from unittest import main, TestCase
from unittest.mock import Mock, patch
import runner_determinator as rd
class TestRunnerDeterminatorIssueParser(TestCase):
def test_parse_settings(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
settings = rd.parse_settings(settings_text)
self.assertTupleEqual(
rd.Experiment(rollout_perc=25),
settings.experiments["lf"],
"lf settings not parsed correctly",
)
self.assertTupleEqual(
rd.Experiment(rollout_perc=0),
settings.experiments["otherExp"],
"otherExp settings not parsed correctly",
)
def test_parse_settings_in_code_block(self) -> None:
settings_text = """
```
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 0
```
---
Users:
@User1,lf
@User2,lf,otherExp
"""
settings = rd.parse_settings(settings_text)
self.assertTupleEqual(
rd.Experiment(rollout_perc=25),
settings.experiments["lf"],
"lf settings not parsed correctly",
)
self.assertTupleEqual(
rd.Experiment(rollout_perc=0),
settings.experiments["otherExp"],
"otherExp settings not parsed correctly",
)
def test_parse_users(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
users = rd.parse_users(settings_text)
self.assertDictEqual(
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
users,
"Users not parsed correctly",
)
def test_parse_users_without_settings(self) -> None:
settings_text = """
@User1,lf
@User2,lf,otherExp
"""
users = rd.parse_users(settings_text)
self.assertDictEqual(
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
users,
"Users not parsed correctly",
)
class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
def test_opted_in_user(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")
def test_opted_in_user_two_experiments(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
@patch("random.uniform", return_value=50)
def test_opted_out_user(self, mock_uniform: Mock) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 25
---
Users:
@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User3"])
self.assertEqual("", prefix, "Runner prefix not correct for user")
@patch("random.uniform", return_value=10)
def test_opted_out_user_was_pulled_in_by_rollout(self, mock_uniform: Mock) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 25
otherExp:
rollout_perc: 25
---
Users:
@User1,lf
@User2,lf,otherExp
"""
# User3 is opted out, but is pulled into both experiments by the 10% rollout
prefix = rd.get_runner_prefix(settings_text, ["User3"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_lf_prefix_always_comes_first(self) -> None:
settings_text = """
experiments:
otherExp:
rollout_perc: 0
lf:
rollout_perc: 0
---
Users:
@User1,lf
@User2,otherExp,lf
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_ignores_commented_users(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
---
Users:
#@User1,lf
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("", prefix, "Runner prefix not correct for user")
def test_ignores_extra_experiments(self) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 0
otherExp:
rollout_perc: 0
foo:
rollout_perc: 0
---
Users:
@User1,lf,otherExp,foo
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
if __name__ == "__main__":
main()

View File

@ -36,6 +36,7 @@ from warnings import warn
import yaml
from github_utils import (
gh_close_pr,
gh_fetch_json_list,
gh_fetch_merge_base,
gh_fetch_url,
@ -1174,11 +1175,11 @@ class GitHubPR:
for pr in additional_merged_prs:
pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run)
if comment_id and self.pr_num:
# When the merge process reaches this part, we can assume that the commit
# has been successfully pushed to trunk
merge_commit_sha = repo.rev_parse(name=REMOTE_MAIN_BRANCH)
# When the merge process reaches this part, we can assume that the commit
# has been successfully pushed to trunk
merge_commit_sha = repo.rev_parse(name=self.default_branch())
if comment_id and self.pr_num:
# Finally, upload the record to Rockset. The list of pending and failed
# checks are at the time of the merge
save_merge_record(
@ -1203,6 +1204,17 @@ class GitHubPR:
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
# Usually Github will see that the commit has "resolves <pr_num>" in the
# commit message and close the PR, but sometimes it doesn't, leading to
# confusion. When it doesn't, we close it manually.
time.sleep(60) # Give Github some time to close the PR
manually_close_merged_pr(
pr=self,
additional_merged_prs=additional_merged_prs,
merge_commit_sha=merge_commit_sha,
dry_run=dry_run,
)
def merge_changes(
self,
repo: GitRepo,
@ -1503,6 +1515,34 @@ def checks_to_markdown_bullets(
]
def manually_close_merged_pr(
pr: GitHubPR,
additional_merged_prs: List[GitHubPR],
merge_commit_sha: str,
dry_run: bool,
) -> None:
def _comment_and_close(pr: GitHubPR, comment: str) -> None:
pr = GitHubPR(pr.org, pr.project, pr.pr_num) # Refresh the PR
if not pr.is_closed():
gh_post_pr_comment(pr.org, pr.project, pr.pr_num, comment, dry_run)
gh_close_pr(pr.org, pr.project, pr.pr_num, dry_run)
message = (
f"This PR (#{pr.pr_num}) was merged in {merge_commit_sha} but it is still open, likely due to a Github bug, "
"so mergebot is closing it manually. If you think this is a mistake, please feel free to reopen and contact Dev Infra."
)
_comment_and_close(pr, message)
for additional_pr in additional_merged_prs:
message = (
f"This PR (#{additional_pr.pr_num}) was merged as part of PR #{pr.pr_num} in the stack under {merge_commit_sha} "
"but it is still open, likely due to a Github bug, so mergebot is closing it manually. "
"If you think this is a mistake, please feel free to reopen and contact Dev Infra."
)
_comment_and_close(additional_pr, message)
print(f"PR {pr.pr_num} and all additional PRs in the stack have been closed.")
@retries_decorator()
def save_merge_record(
comment_id: int,

View File

@ -62,49 +62,94 @@ jobs:
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
which users will get their jobs to run on experimental runners. This user list
is also a comma separated list of additional features or experiments which the
user could be opted in to.
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
of which runners should be used to run which job.
The configuration has two parts, the settings and a list of opted-in users,
separated by a line containing "---". If the line is not present, the
settings are considered to be empty with only the second part, the user
list, defined.
The first part is a YAML block that defines the rollout settings. This can be
used to define any settings that are needed to determine which runners to use.
Its fields are defined by the RolloutSettings class below.
The second part is a list of users who are explicitly opted in to the LF fleet.
The user list is also a comma separated list of additional features or
experiments which the user could be opted in to.
The user list has the following rules:
- Users are GitHub usernames with the @ prefix
- If the first line is a "*" then all users will use the new runners
- If the first line is a "!" then all users will use the old runners
- Users are GitHub usernames, which must start with the @ prefix
- Each user is also a comma-separated list of features/experiments to enable
- A "#" prefix indicates the user is opted out of the new runners but is opting
into features/experiments.
- A "#" prefix opts the user out of all experiments
Example user list:
Example config:
# A list of experiments that can be opted into.
# This defines the behavior they'll induce when opted into.
# Expected syntax is:
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
@User1
@User2,amz2023
#@UserOptOutOfNewRunner,amz2023
experiments:
lf:
rollout_percent: 25
---
# Opt-ins:
# Users can opt into the LF fleet by adding their GitHub username to this list
# and specifying experiments to enable in a comma-separated list.
# Experiments should be from the above list.
@User1,lf,split_build
@User2,lf
@User3,split_build
"""
import logging
import os
import random
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
import yaml
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
DEFAULT_LABEL_PREFIX = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
RUNNER_AMI_LEGACY = ""
RUNNER_AMI_AMZ2023 = "amz2023"
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_AMI = "runner-ami"
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
SETTING_EXPERIMENTS = "experiments"
LF_FLEET_EXPERIMENT = "lf"
CANARY_FLEET_SUFFIX = ".c"
class Experiment(NamedTuple):
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
# Add more fields as needed
class Settings(NamedTuple):
"""
Settings for the experiments that can be opted into.
"""
experiments: Dict[str, Experiment] = {}
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
@ -231,85 +276,180 @@ jobs:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
"""
Determines if the job should run on the LF fleet or the Meta fleet
Returns:
The appropriate label prefix for the runner, corresponding to the fleet to use.
This gets prefixed to the very start of the runner label.
"""
def load_yaml(yaml_text: str) -> Any:
try:
if rollout_state[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif rollout_state[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ").split(",")[0]
for usr_raw in rollout_state.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
data = yaml.safe_load(yaml_text)
return data
except yaml.YAMLError as exc:
log.exception("Error loading YAML")
raise
def get_optin_feature(
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
"""
Extracts the text with settings, if any, and the opted in users from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the rest is the users.
"""
rollout_state_parts = rollout_state.split("---")
if len(rollout_state_parts) >= 2:
return rollout_state_parts[0], rollout_state_parts[1]
else:
return "", rollout_state
class UserOptins(Dict[str, List[str]]):
"""
Dictionary of users with a list of features they have opted into
"""
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
"""
Parse the user opt-in text into a mapping from username to the list of features they have opted into
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
- Example line: "@User1,lf,split_build"
- A "#" prefix indicates the user is opted out of all experiments
"""
optins = UserOptins()
for user in user_optin_text.split("\n"):
user = user.strip("\r\n\t -")
if not user or not user.startswith("@"):
# Not a valid user. Skip
continue
if user:
usr_name = user.split(",")[0].strip("@")
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
return optins
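# Illustrative usage sketch: two opt-in lines parsed into a UserOptins mapping.
#     >>> parse_user_opt_in_from_text("@User1,lf,split_build\n@User2,lf\n#@User3,lf")
#     {'User1': ['lf', 'split_build'], 'User2': ['lf']}
# The "#"-prefixed line is skipped because it does not start with "@".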
def parse_settings_from_text(settings_text: str) -> Settings:
"""
Parse the experiments from the issue body into a Settings object
"""
try:
if settings_text:
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
# for easy reading
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
# the backtick character in shell commands.
backtick = chr(96) # backtick character
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
settings = load_yaml(settings_text)
# For now we just load experiments. We can expand this if/when we add more settings
experiments = {}
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
valid_settings = {}
for setting in exp_settings:
if setting not in Experiment._fields:
log.warning(
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
)
else:
valid_settings[setting] = exp_settings[setting]
experiments[exp_name] = Experiment(**valid_settings)
return Settings(experiments)
except Exception:
log.exception("Failed to parse settings")
return Settings()
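# Illustrative usage sketch (plain YAML shown; a backtick-fenced block on the GH
# issue also works, since the surrounding backticks are stripped above):
#     >>> parse_settings_from_text("experiments:\n  lf:\n    rollout_perc: 25")
#     Settings(experiments={'lf': Experiment(rollout_perc=25)})
# An unknown key such as "rollout_percent" would be logged as unexpected and dropped.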
def parse_settings(rollout_state: str) -> Settings:
"""
Parse settings, if any, from the rollout state.
If the issue body contains "---" then the text above that is the settings
and the text below is the list of opted in users.
If it doesn't contain "---" then the settings are empty and the default values are used.
"""
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
return parse_settings_from_text(settings_text)
def parse_users(rollout_state: str) -> UserOptins:
"""
Parse users from the rollout state.
"""
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
return parse_user_opt_in_from_text(users_text)
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
"""
Check if a user is opted into an experiment
"""
return experiment_name in user_optins.get(user, [])
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
) -> str:
"""
Used to dynamically opt in jobs to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variant name is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
except Exception as e:
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since it determines the fleet we use
# - If it's enabled, then we always list its prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
@ -327,9 +467,10 @@ jobs:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
@ -344,35 +485,18 @@ jobs:
args.github_branch,
)
label_type = get_fleet(
rollout_state,
(
args.github_issue_owner,
username,
),
)
runner_ami = get_optin_feature(
rollout_state=rollout_state,
workflow_requestors=(
args.github_issue_owner,
username,
),
feature=RUNNER_AMI_AMZ2023,
fallback=RUNNER_AMI_LEGACY,
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
runner_ami = RUNNER_AMI_LEGACY
# For Canary builds use canary runners
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
label_type = WORKFLOW_LABEL_LF_CANARY
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
if __name__ == "__main__":

View File

@ -29,9 +29,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -66,7 +76,8 @@ jobs:
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@ -101,7 +112,8 @@ jobs:
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -33,9 +33,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -73,7 +83,8 @@ jobs:
# NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
build-docker-cuda-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -110,7 +121,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
build-docker-cuda-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4"]
@ -143,7 +155,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@ -178,7 +191,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -207,7 +221,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:cpu
build-docker-cpu-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-manylinux_2_28
steps:
@ -238,7 +253,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
build-docker-cpu-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64
steps:
@ -269,7 +285,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
build-docker-cpu-aarch64-2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64-2_28
steps:
@ -303,7 +320,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
build-docker-cpu-cxx11-abi:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-cxx11-abi
steps:
@ -334,7 +352,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
build-docker-xpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: xpu
steps:

View File

@ -27,9 +27,19 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-wheel:
name: "Build Triton Wheel"
runs-on: [self-hosted, linux.4xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
strategy:
fail-fast: false
matrix:
@ -199,7 +209,8 @@ jobs:
build-conda:
name: "Build Triton Conda"
runs-on: [self-hosted, linux.2xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
strategy:
fail-fast: false
matrix:

View File

@ -16,6 +16,15 @@ on:
paths: [.github/workflows/create_release.yml]
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
release:
if: ${{ github.repository == 'pytorch/pytorch' }}
name: Create Release
@ -63,7 +72,7 @@ jobs:
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
@ -73,12 +82,14 @@ jobs:
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
runs-on: linux.2xlarge
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: sourcecode-upload
name: Upload source code to S3 for release tags
permissions:
id-token: write
needs: release
needs:
- get-label-type
- release
steps:
- uses: actions/download-artifact@v4.1.7
with:

View File

@ -30,8 +30,18 @@ env:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
docker-build:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
timeout-minutes: 240
strategy:
fail-fast: false
@ -68,7 +78,7 @@ jobs:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
runs-on: [self-hosted, "${{ matrix.runner }}"]
runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
env:
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
steps:

View File

@ -34,9 +34,19 @@ env:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
generate-matrix:
if: github.repository_owner == 'pytorch'
runs-on: [self-hosted, linux.large]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.large"
outputs:
matrix: ${{ steps.generate-matrix.outputs.matrix }}
steps:
@ -54,10 +64,12 @@ jobs:
build:
if: ${{ github.repository == 'pytorch/pytorch' }}
runs-on: [self-hosted, linux.2xlarge]
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: ${{ (github.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
timeout-minutes: 240
needs: generate-matrix
needs:
- generate-matrix
- get-label-type
strategy:
matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
fail-fast: false

View File

@ -1010,76 +1010,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_1-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1766,6 +1696,76 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_1-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_1-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -2,7 +2,7 @@
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel
name: linux-binary-manywheel-split
on:
@ -19,7 +19,7 @@ env:
ANACONDA_USER: pytorch
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel
BUILD_ENVIRONMENT: linux-binary-manywheel-split
BUILDER_ROOT: /builder
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
@ -28,7 +28,7 @@ env:
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
@ -58,7 +58,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -81,7 +81,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -105,7 +105,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -128,7 +128,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -152,7 +152,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -175,7 +175,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:

View File

@ -2,7 +2,7 @@
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel
name: linux-binary-manywheel-split
on:
@ -24,7 +24,7 @@ env:
ANACONDA_USER: pytorch
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel
BUILD_ENVIRONMENT: linux-binary-manywheel-split
BUILDER_ROOT: /builder
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
@ -33,7 +33,7 @@ env:
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
@ -63,7 +63,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -86,7 +86,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -134,7 +134,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -157,7 +157,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -205,7 +205,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -228,7 +228,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -275,7 +275,7 @@ jobs:
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-test: # Testing
@ -296,7 +296,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
@ -343,7 +343,7 @@ jobs:
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -366,7 +366,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -414,7 +414,7 @@ jobs:
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -437,7 +437,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -467,76 +467,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_1-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -555,7 +485,7 @@ jobs:
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -578,7 +508,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -625,7 +555,7 @@ jobs:
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-test: # Testing
@ -646,7 +576,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
@ -693,7 +623,7 @@ jobs:
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -716,7 +646,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -764,7 +694,7 @@ jobs:
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +717,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -817,6 +747,76 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_1-full
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_1-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-full
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -835,7 +835,7 @@ jobs:
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -858,7 +858,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -905,7 +905,7 @@ jobs:
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-test: # Testing
@ -926,7 +926,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
@ -973,7 +973,7 @@ jobs:
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -996,7 +996,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1044,7 +1044,7 @@ jobs:
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1067,7 +1067,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1115,7 +1115,7 @@ jobs:
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1138,7 +1138,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1185,7 +1185,7 @@ jobs:
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-test: # Testing
@ -1206,7 +1206,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
@ -1253,7 +1253,7 @@ jobs:
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1276,7 +1276,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda11_8
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1324,7 +1324,7 @@ jobs:
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1347,7 +1347,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_1
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1395,7 +1395,7 @@ jobs:
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1418,7 +1418,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1465,7 +1465,7 @@ jobs:
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cpu-test: # Testing
@ -1486,7 +1486,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:


@ -18,11 +18,22 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks


@ -16,10 +16,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -13,10 +13,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -68,10 +68,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -50,10 +50,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-aarch64-py3_10-inductor-build:
name: linux-jammy-aarch64-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.arm64.m7g.4xlarge
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks


@ -48,10 +48,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |


@ -66,10 +66,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -18,10 +18,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -60,7 +71,9 @@ jobs:
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -22,10 +22,21 @@ concurrency:
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm6_1-py3_8-inductor-build:
name: rocm6.1-py3.8-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |


@ -57,8 +57,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
@ -87,8 +89,10 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
@ -333,8 +337,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}


@ -3,18 +3,12 @@ name: rocm
on:
push:
branches:
# - main
- main
- release/*
tags:
- ciflow/rocm/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT
concurrency:


@ -56,12 +56,14 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:
@ -87,8 +89,9 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:


@ -10,8 +10,18 @@ permissions:
contents: read
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
index:
runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" # 1 GPU A10G 24GB each
environment: target-determinator-env
steps:
- name: Clone PyTorch


@ -11,10 +11,21 @@ concurrency:
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'


@ -266,8 +266,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },


@ -96,7 +96,7 @@ jobs:
python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
- name: Upload gpt-fast benchmark results to Rockset
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains('inductor-micro-benchmark', github.event.workflow_run.name)
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}


@ -27,8 +27,8 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
- [NVIDIA CUDA Support](#nvidia-cuda-support)
- [AMD ROCm Support](#amd-rocm-support)
- [Intel GPU Support](#intel-gpu-support)
- [Install Dependencies](#install-dependencies)
- [Get the PyTorch Source](#get-the-pytorch-source)
- [Install Dependencies](#install-dependencies)
- [Install PyTorch](#install-pytorch)
- [Adjust Build Options (Optional)](#adjust-build-options-optional)
- [Docker Image](#docker-image)
@ -161,9 +161,34 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool on Windows
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.
\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
An example of environment setup is shown below:
* Linux:
```bash
$ source <CONDA_INSTALL_DIR>/bin/activate
$ conda create -y -n <CONDA_NAME>
$ conda activate <CONDA_NAME>
```
* Windows:
```bash
$ source <CONDA_INSTALL_DIR>\Scripts\activate.bat
$ conda create -y -n <CONDA_NAME>
$ conda activate <CONDA_NAME>
$ call "C:\Program Files\Microsoft Visual Studio\<VERSION>\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
```
##### NVIDIA CUDA Support
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
@ -194,12 +219,23 @@ If you want to compile with Intel GPU support, follow these
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
```
#### Install Dependencies
**Common**
```bash
conda install cmake ninja
# Run this command on native Windows
conda install rust
# Run this command from the PyTorch directory after cloning the source code using the "Get the PyTorch Source" section above
pip install -r requirements.txt
```
@ -235,15 +271,6 @@ pip install mkl-static mkl-include
conda install -c conda-forge libuv=1.39
```
#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
```
#### Install PyTorch
**On Linux**
@ -284,13 +311,6 @@ python3 setup.py develop
**On Windows**
Choose Correct Visual Studio Version.
PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.
If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
**CPU-only builds**
@ -298,7 +318,6 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU
```cmd
conda activate
python setup.py develop
```
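
A quick post-build sanity check (illustrative only, not part of this diff) is to confirm that the freshly built package imports and reports CUDA as unavailable:

```python
import torch

# Minimal check for a CPU-only source build (hypothetical session).
print(torch.__version__)          # the locally built version string
print(torch.cuda.is_available())  # expected: False for a CPU-only build
x = torch.randn(2, 3)
print(x @ x.t())                  # small op to confirm the CPU path works
```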


@ -299,6 +299,15 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
#define AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
#define AT_DISPATCH_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
@ -307,6 +316,26 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
#define AT_DISPATCH_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
TYPE, \
NAME, \
...) \
AT_DISPATCH_SWITCH( \
TYPE, \
NAME, \
AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
__VA_ARGS__))
#define AT_DISPATCH_CASE_COMPLEX_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)


@ -1408,7 +1408,6 @@ void scaled_gemm(
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum) {
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
@ -1421,13 +1420,9 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)
// Amax support in ROCm as of 6.2
if (isFloat8Type(result_dtype)) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
if (result_scale_ptr != nullptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
}
#endif
#ifndef USE_ROCM
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
#endif


@ -140,7 +140,6 @@ void scaled_gemm(
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \


@ -188,7 +188,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCM_VERSION",
[rocm_version]() { return rocm_version; },
[rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
[rocm_version](auto&& k) {
TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version);
return rocm_version == k ? OK : FAIL;
});
}
// gfx arch
{
@ -196,7 +199,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"GCN_ARCH_NAME",
[gcn_arch_name]() { return gcn_arch_name; },
[gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
[gcn_arch_name](auto&& k) {
TUNABLE_LOG1("GCN_ARCH_NAME validation: expect ", k, " to match ", gcn_arch_name);
return gcn_arch_name == k ? OK : FAIL;
});
}
// rocblas
{
@ -212,7 +218,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCBLAS_VERSION",
[rocblas_version]() { return rocblas_version; },
[rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
[rocblas_version](auto&& k) {
TUNABLE_LOG1("ROCBLAS_VERSION validation: expect ", k, " to match ", rocblas_version);
return rocblas_version == k ? OK : FAIL;
});
}
// hipblaslt
{
@ -226,7 +235,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"HIPBLASLT_VERSION",
[hipblaslt_version]() { return hipblaslt_version; },
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
[hipblaslt_version](auto&& k) {
TUNABLE_LOG1("HIPBLASLT_VERSION validation: expect ", k, " to match ", hipblaslt_version);
return hipblaslt_version == k ? OK : FAIL;
});
}
#endif
}
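
The validators above run when previously saved TunableOp results are loaded, and the added TUNABLE_LOG1 calls make a version mismatch visible in the log. A rough sketch of surfacing that output from Python follows; the environment-variable names are taken from the TunableOp README and should be treated as assumptions for other builds:

```python
import os

# Assumed TunableOp knobs; adjust if your build documents different names.
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"   # turn TunableOp on
os.environ["PYTORCH_TUNABLEOP_VERBOSE"] = "1"   # print messages like the
                                                # "ROCM_VERSION validation: ..." lines above

import torch

if torch.cuda.is_available():
    a = torch.randn(512, 512, device="cuda")
    b = torch.randn(512, 512, device="cuda")
    torch.mm(a, b)  # GEMMs route through TunableOp; loading prior results re-runs the validators
```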


@ -104,7 +104,6 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
params->c_scale_ptr,
params->ldc,
params->c_dtype,
params->amax_ptr,
params->use_fast_accum);
return OK;
}


@ -779,6 +779,28 @@ std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_batch_rule(
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce);
}
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_two_batch_rule(
const Tensor& self, std::optional<int64_t> self_bdim,
int64_t dim,
const Tensor& index, std::optional<int64_t> index_bdim,
const Tensor& src, std::optional<int64_t> src_bdim,
const c10::string_view reduce,
bool include_self) {
return scatter_batch_rule(ATEN_FN2(scatter_reduce, two),
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
}
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce__two_batch_rule(
const Tensor& self, std::optional<int64_t> self_bdim,
int64_t dim,
const Tensor& index, std::optional<int64_t> index_bdim,
const Tensor& src, std::optional<int64_t> src_bdim,
const c10::string_view reduce,
bool include_self) {
return scatter_batch_rule(ATEN_FN2(scatter_reduce_, two),
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
}
std::tuple<Tensor, std::optional<int64_t>> scatter_value_reduce_batch_rule(
const Tensor& self, std::optional<int64_t> self_bdim,
int64_t dim,
@ -1250,6 +1272,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule);
VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule);
VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule);
VMAP_SUPPORT2(scatter_reduce_, two, scatter_reduce__two_batch_rule);
// as_strided_scatter does not work with the for-loop fallback today,
// because as_strided_scatter will return an output that matches
// the strides/storage_offset of its input.
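
For context, a small usage sketch (shapes and values are illustrative, assuming a build that includes this change) of what the two batch rules registered above enable: calling the "two" overload of scatter_reduce under torch.vmap instead of hitting the for-loop fallback.

```python
import torch

# Batched scatter_reduce via vmap; each sample scatters into its own row.
dest = torch.zeros(8, 4)              # batch of 8 destinations, each of length 4
index = torch.randint(0, 4, (8, 6))   # per-sample scatter indices
src = torch.ones(8, 6)                # per-sample values to reduce in

out = torch.vmap(
    lambda d, i, s: d.scatter_reduce(0, i, s, reduce="sum", include_self=True)
)(dest, index, src)

print(out.shape)  # torch.Size([8, 4]); each row holds counts of its indices
```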


@ -209,7 +209,13 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
bool all_contiguous = is_contiguous(input);
constexpr bool mixed_type = !std::is_same_v<scalar_t, param_t>;
const auto dtype = mixed_type ? kFloat : input.scalar_type();
// Using float data type for Half _var_sum in batchnorm stats updating on CPU
// to avoid _var_sum overflow since the representation range of Half is small.
using opmath_t = std::conditional_t<std::is_same_v<param_t, at::Half>, at::opmath_type<param_t>, param_t>;
auto dtype = mixed_type ? kFloat : input.scalar_type();
if (dtype == kHalf) {
dtype = kFloat;
}
auto save_mean_a = save_mean.accessor<param_t, 1>();
auto save_var_transform_a = save_var_transform.accessor<param_t, 1>();
@ -220,9 +226,9 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
if (all_contiguous) {
auto _mean = at::empty({n_input}, input.options().dtype(dtype));
auto _var_sum = at::empty({n_input}, input.options().dtype(dtype));
auto _mean_a = _mean.accessor<param_t, 1>();
auto _var_sum_a = _var_sum.accessor<param_t, 1>();
auto momentum_ = static_cast<param_t>(momentum);
auto _mean_a = _mean.accessor<opmath_t, 1>();
auto _var_sum_a = _var_sum.accessor<opmath_t, 1>();
auto momentum_ = static_cast<opmath_t>(momentum);
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);
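
The overflow that the comment above guards against is easy to reproduce in isolation; a minimal sketch with made-up numbers, independent of the kernel code:

```python
import torch

# float16's largest finite value is about 65504, so a running sum of squares
# for the variance blows up quickly unless it is accumulated in float32.
acc_half = torch.tensor(0.0, dtype=torch.float16)
acc_float = torch.tensor(0.0, dtype=torch.float32)
for _ in range(4096):
    acc_half = acc_half + torch.tensor(8.0, dtype=torch.float16) ** 2
    acc_float = acc_float + torch.tensor(8.0, dtype=torch.float32) ** 2

print(acc_half)   # tensor(inf, dtype=torch.float16): 4096 * 64 = 262144 overflows
print(acc_float)  # tensor(262144.)
```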


@ -11,18 +11,18 @@ namespace ao {
namespace sparse {
namespace {
constexpr int64_t serialization_version_index = 0;
constexpr int64_t bias_index = 1;
constexpr int64_t out_features_block_size_index = 2;
constexpr int64_t in_features_block_size_index = 3;
constexpr int64_t weight_scales_index = 4;
constexpr int64_t weight_zero_point_index = 5;
constexpr int64_t quantization_scheme_index = 6;
constexpr int64_t row_block_indices_index = 7;
constexpr int64_t col_block_indices_index = 8;
constexpr int64_t weight_values_index = 9;
constexpr int64_t num_output_channels_index = 10;
constexpr int64_t num_input_channels_index = 11;
constexpr int64_t serialization_version_index [[maybe_unused]] = 0;
constexpr int64_t bias_index [[maybe_unused]] = 1;
constexpr int64_t out_features_block_size_index [[maybe_unused]] = 2;
constexpr int64_t in_features_block_size_index [[maybe_unused]] = 3;
constexpr int64_t weight_scales_index [[maybe_unused]] = 4;
constexpr int64_t weight_zero_point_index [[maybe_unused]] = 5;
constexpr int64_t quantization_scheme_index [[maybe_unused]] = 6;
constexpr int64_t row_block_indices_index [[maybe_unused]] = 7;
constexpr int64_t col_block_indices_index [[maybe_unused]] = 8;
constexpr int64_t weight_values_index [[maybe_unused]] = 9;
constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;
template <typename TENSOR_DTYPE, typename VEC_DTYPE>
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {


@ -81,6 +81,12 @@ void atan2_kernel(TensorIteratorBase& iter) {
}
#if !defined(C10_MOBILE)
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
AT_DISPATCH_V2( \
TYPE, \
NAME, \
AT_WRAP(__VA_ARGS__), \
AT_EXPAND(AT_INTEGRAL_TYPES_V2))
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
AT_DISPATCH_V2( \
TYPE, \
@ -104,6 +110,8 @@ void atan2_kernel(TensorIteratorBase& iter) {
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#else
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__)
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
kComplexHalf, kHalf, kBool, kBFloat16, TYPE, NAME, __VA_ARGS__)
@ -382,7 +390,7 @@ void bitwise_and_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
cpu_kernel(iter, [](bool a, bool b) { return a && b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_and_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a & b; },
@ -395,7 +403,7 @@ void bitwise_or_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
cpu_kernel(iter, [](bool a, bool b) { return a || b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_or_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a | b; },
@ -410,7 +418,7 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) {
// this operation for both Boolean and integral types.
cpu_kernel(iter, [](bool a, bool b) { return a != b; });
} else {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cpu", [&]() {
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_xor_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return a ^ b; },


@ -473,8 +473,7 @@ void cpu_flash_attention(
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
for (const auto z : c10::irange(begin, end)) {
(void)z; // Suppress unused variable
for (C10_UNUSED auto z : c10::irange(begin, end)) {
n = l * kvSplitSize;
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
@ -567,8 +566,7 @@ void cpu_flash_attention(
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
: nullptr;
for (const auto z : c10::irange(begin, end)) {
(void)z; // Suppress unused variable
for (C10_UNUSED auto z : c10::irange(begin, end)) {
int64_t m = k * qSplitSize;
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
// Initialize max and sum
@ -933,8 +931,7 @@ void cpu_flash_attention_backward(
at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype));
accum_t* dsum_data = dsum.data_ptr<accum_t>();
for (const auto z : c10::irange(begin, end)) {
(void)z; // Suppress unused variable
for (C10_UNUSED auto z : c10::irange(begin, end)) {
// rowsum of grad_out * out
for (int64_t m = 0; m < qSize; m += qSplitSize) {
int64_t qBlockSize = std::min(qSplitSize, qSize - m);


@ -964,9 +964,9 @@ ScalingType get_scaling_type(
} // namespace
// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
@ -1068,9 +1068,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
// Some scaled_gemms require an amax to populate lets create one here
Tensor amax = at::empty({0}, mat1.options().dtype(ScalarType::Float));
#ifdef USE_ROCM
auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -1126,7 +1123,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr;
params.ldc = args.result_ld;
params.c_dtype = out_dtype_;
params.amax_ptr = amax.data_ptr();
params.use_fast_accum = use_fast_accum;
if (transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
@ -1150,11 +1146,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
else
#endif
{
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
// hipBlasLT requires scaleD to be set to something in order to use AMAX
auto dummy_options = TensorOptions().dtype(kFloat).device(kCUDA);
auto dummy_scale = at::ones(1, dummy_options);
#endif
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
@ -1172,14 +1163,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
scale_result ? scale_result->data_ptr() : dummy_scale.data_ptr(),
#else
scale_result ? scale_result->data_ptr() : nullptr,
#endif
args.result_ld,
out_dtype_,
amax.data_ptr(),
use_fast_accum);
}
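
The Float8 scaling convention described in the comment block above can be sketched with plain tensor ops. This mimics the semantics only: it does not call the fused cuBLASLt path, and the per-tensor scale choice here is a simplification for illustration.

```python
import torch

# Conceptual stand-in for a scaled matmul: quantize both operands to float8
# with per-tensor scales, then dequantize and multiply in float32.
a = torch.randn(32, 64)
b = torch.randn(64, 16)

fp8_max = torch.finfo(torch.float8_e4m3fn).max   # ~448 for e4m3
scale_a = a.abs().max() / fp8_max
scale_b = b.abs().max() / fp8_max

a_fp8 = (a / scale_a).to(torch.float8_e4m3fn)
b_fp8 = (b / scale_b).to(torch.float8_e4m3fn)

# The fused kernel applies the scales inside the GEMM; here we dequantize first.
out = (a_fp8.float() * scale_a) @ (b_fp8.float() * scale_b)
print(out.shape)  # torch.Size([32, 16])
```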


@ -1092,7 +1092,11 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
}
constexpr int min_values_per_thread = 16;
#ifndef USE_ROCM
constexpr int max_values_per_thread = 256;
#else
constexpr int max_values_per_thread = 1024;
#endif
if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
// Divide the input across warps in a thread-block, if that leaves at least


@ -177,12 +177,11 @@ struct KthValueLauncher {
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
int collapse_values_dim,
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
int collapse_indices_dim,
C10_UNUSED int collapse_indices_dim,
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
int collapse_self_dim,
int64_t num_slices,
int64_t slice_size) {
(void)collapse_indices_dim; // Suppress unused variable warning
dim3 grid;
if (!getGridFromTiles(num_slices, grid)) {
AT_ERROR("slices are too many");
@ -213,15 +212,13 @@ struct MedianLauncher {
template <typename scalar_t, typename index_t, int all_dims>
inline void launch(
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
int collapse_values_dim,
C10_UNUSED int collapse_values_dim,
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
int collapse_indices_dim,
C10_UNUSED int collapse_indices_dim,
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
int collapse_self_dim,
int64_t num_slices,
int64_t slice_size) {
(void)collapse_values_dim; // Suppress unused variable warning
(void)collapse_indices_dim; // Suppress unused variable warning
dim3 grid;
if (!getGridFromTiles(num_slices, grid)) {
AT_ERROR("slices are too many");


@ -22,6 +22,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -43,6 +44,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -86,9 +88,9 @@ using graph_and_tensors = std::tuple<
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
// TODO(eqy): additional options
// std::shared_ptr<fe::graph::Tensor_attributes>, // Bias,
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_Q,
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_KV,
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
@ -104,7 +106,8 @@ using graph_and_tensors_backward = std::tuple<
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias,
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
std::shared_ptr<fe::graph::Tensor_attributes>, // Offset,
std::shared_ptr<fe::graph::Tensor_attributes>, // O,
@ -126,6 +129,8 @@ struct MHAParams {
std::array<int, MAX_MHA_DIM> q_stride;
std::array<int, MAX_MHA_DIM> k_stride;
std::array<int, MAX_MHA_DIM> v_stride;
std::array<int, MAX_MHA_DIM> bias_dim;
std::array<int, MAX_MHA_DIM> bias_stride;
int64_t b;
int64_t h;
int64_t s_q;
@ -135,6 +140,9 @@ struct MHAParams {
double dropout_probability;
bool is_causal;
bool return_softmaxstats;
// might be redundant if we take 0 dim/stride
// as signaling no-bias
bool has_attn_bias;
};
void setMHAParams(
@ -148,6 +156,7 @@ void setMHAParams(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
double dropout_probability,
bool is_causal,
bool return_softmaxstats) {
@ -166,6 +175,7 @@ void setMHAParams(
params.dropout_probability = dropout_probability;
params.is_causal = is_causal;
params.return_softmaxstats = return_softmaxstats;
params.has_attn_bias = attn_bias.has_value();
TORCH_INTERNAL_ASSERT(
q.sizes().size() == MAX_MHA_DIM,
"Q tensor has unexpected number of dims, please report a bug to PyTorch.");
@ -190,6 +200,17 @@ void setMHAParams(
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
// uninit is OK as the struct is memset 0'd
if (params.has_attn_bias) {
std::copy(
attn_bias.value().sizes().begin(),
attn_bias.value().sizes().end(),
params.bias_dim.begin());
std::copy(
attn_bias.value().strides().begin(),
attn_bias.value().strides().end(),
params.bias_stride.begin());
}
}
struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
@ -203,6 +224,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
double dropout_probability,
bool is_causal,
bool return_softmaxstats) {
@ -217,6 +239,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
return_softmaxstats);
@ -285,6 +308,7 @@ auto build_graph_and_tensors(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -301,36 +325,6 @@ auto build_graph_and_tensors(
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(std::vector<int64_t>(
q.sizes().data(), q.sizes().data() + q.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
q.sizes(),
std::vector<int64_t>(
q.strides().data(),
q.strides().data() + q.strides().size()))));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(std::vector<int64_t>(
k.sizes().data(), k.sizes().data() + k.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
k.sizes(),
std::vector<int64_t>(
k.strides().data(),
k.strides().data() + k.strides().size()))));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(std::vector<int64_t>(
v.sizes().data(), v.sizes().data() + v.sizes().size()))
.set_stride(fixSizeOneDimStrideSDPA(
v.sizes(),
std::vector<int64_t>(
v.strides().data(),
v.strides().data() + v.strides().size()))));
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
@ -338,11 +332,6 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
// TODO(eqy): support bias in the future in a follow-up PR
// auto bias = mha_graph->tensor(fe::graph::Tensor_attributes()
// .set_name("bias")
// .set_dim({b, 1, s_q, s_kv})
// .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@ -360,11 +349,30 @@ auto build_graph_and_tensors(
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_dropout(dropout_probability, seed, offset);
// Optional bias in flash attention is only supported 8.9.3 onwards
if (cudnnGetVersion() >= 8904) {
// scaled_dot_product_flash_attention_options.set_alibi_mask(true);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
}
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
@ -376,20 +384,9 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
// if (cudnnGetVersion() >= 8903) {
// scaled_dot_product_flash_attention_options.set_bias(bias)
// .set_padding_mask(true)
// .set_seq_len_q(seq_q)
// .set_seq_len_kv(seq_kv);
// }
auto [O, Stats] =
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
O->set_output(true)
.set_dim(std::vector<int64_t>(
o.sizes().data(), o.sizes().data() + o.sizes().size()))
.set_stride(std::vector<int64_t>(
o.strides().data(), o.strides().data() + o.strides().size()));
O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
if (Stats) {
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
@ -407,6 +404,7 @@ auto build_graph_and_tensors(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(seed),
std::move(offset),
@ -427,6 +425,7 @@ auto build_graph_and_tensors_backward(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -447,24 +446,6 @@ auto build_graph_and_tensors_backward(
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(std::vector<int64_t>(q.sizes().begin(), q.sizes().end()))
.set_stride(
std::vector<int64_t>(q.strides().begin(), q.strides().end())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(std::vector<int64_t>(k.sizes().begin(), k.sizes().end()))
.set_stride(
std::vector<int64_t>(k.strides().begin(), k.strides().end())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(std::vector<int64_t>(v.sizes().begin(), v.sizes().end()))
.set_stride(
std::vector<int64_t>(v.strides().begin(), v.strides().end())));
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
@ -472,6 +453,31 @@ auto build_graph_and_tensors_backward(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
sdpa_backward_options.set_bias(bias.value());
}
auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@ -482,47 +488,27 @@ auto build_graph_and_tensors_backward(
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto O = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(std::vector<int64_t>(o.sizes().begin(), o.sizes().end()))
.set_stride(
std::vector<int64_t>(o.strides().begin(), o.strides().end())));
auto STATS = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(std::vector<int64_t>(
softmaxstats.sizes().begin(), softmaxstats.sizes().end()))
.set_stride(std::vector<int64_t>(
softmaxstats.strides().begin(), softmaxstats.strides().end()))
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(std::vector<int64_t>(dO.sizes().begin(), dO.sizes().end()))
.set_stride(
std::vector<int64_t>(dO.strides().begin(), dO.strides().end())));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto O = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
if (dropout_probability != 0.0f) {
sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset);
}
auto [DQ, DK, DV] =
mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options);
DQ->set_output(true)
.set_dim(std::vector<int64_t>(dQ.sizes().begin(), dQ.sizes().end()))
.set_stride(
std::vector<int64_t>(dQ.strides().begin(), dQ.strides().end()));
DK->set_output(true)
.set_dim(std::vector<int64_t>(dK.sizes().begin(), dK.sizes().end()))
.set_stride(
std::vector<int64_t>(dK.strides().begin(), dK.strides().end()));
DV->set_output(true)
.set_dim(std::vector<int64_t>(dV.sizes().begin(), dV.sizes().end()))
.set_stride(
std::vector<int64_t>(dV.strides().begin(), dV.strides().end()));
DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@ -534,6 +520,7 @@ auto build_graph_and_tensors_backward(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(Seed),
std::move(Offset),
@ -559,6 +546,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -573,6 +561,11 @@ void run_cudnn_SDP_fprop(
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
auto key = MHACacheKeyWrapper(
b,
h,
@ -583,6 +576,7 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
return_softmaxstats);
@ -605,13 +599,14 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
softmaxstats,
o,
dropoutseed,
dropoutoffset,
handle);
}
auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] =
auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] =
graph_and_tensors_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {
@ -619,13 +614,15 @@ void run_cudnn_SDP_fprop(
{K, k.data_ptr()},
{V, v.data_ptr()},
{attn_scale, &scaling_factor},
//{bias, bias.data_ptr()},
{seed, dropoutseed.data_ptr()},
{offset, dropoutoffset.data_ptr()},
{O, o.data_ptr()}};
if (return_softmaxstats) {
variant_pack[Stats] = softmaxstats.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
@ -647,6 +644,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@ -655,6 +653,12 @@ void run_cudnn_SDP_bprop(
Tensor& dV,
const Tensor& dropoutseed,
const Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() ||
!softmaxstats.numel()) {
return;
}
Tensor dO_ = dO;
if (!dO.strides()[dO.strides().size() - 1]) {
TORCH_WARN(
@ -694,6 +698,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
true);
@ -715,6 +720,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
o,
dO_,
softmaxstats,
@ -726,8 +732,20 @@ void run_cudnn_SDP_bprop(
handle);
}
auto
[mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] =
graph_and_tensors_backward_values;
[mha_graph,
Q,
K,
V,
bias,
attn_scale,
Seed,
Offset,
O,
Do,
Stats,
Dq,
Dk,
Dv] = graph_and_tensors_backward_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {// inputs
{Q, q.data_ptr()},
@ -746,6 +764,9 @@ void run_cudnn_SDP_bprop(
variant_pack[Seed] = dropoutseed.data_ptr();
variant_pack[Offset] = dropoutoffset.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

View File

@ -18,6 +18,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@ -36,6 +37,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,

View File

@ -197,7 +197,8 @@ Tensor group_norm(
const Tensor kEmpty;
auto memory_format = input.suggest_memory_format();
const auto& X = input.device().is_cpu() ? input.contiguous(memory_format) : input.contiguous();
const auto& X = input.device().is_cpu() || input.is_privateuseone() ?
input.contiguous(memory_format) : input.contiguous();
const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C);

View File

@ -6,6 +6,7 @@
#include <ATen/Parallel.h>
#include <ATen/native/cpu/mixed_data_type.h>
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -295,7 +296,12 @@ Tensor rms_norm(
eps_val = eps.value();
}
auto result = input.mul(at::rsqrt(at::pow(input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val)));
// upcast is needed for fp16 and bf16
c10::ScalarType opmath_t = toOpMathType(input.scalar_type());
Tensor upcasted_input = input.to(opmath_t);
Tensor rqrst_input = rsqrt(at::pow(upcasted_input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val));
Tensor result = upcasted_input.mul(rqrst_input).type_as(input);
if (weight_opt.has_value()) {
result = result.mul(weight_opt.value());
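A rough Python sketch of the upcast path added above (illustrative only — the real change lives in the ATen composite, and reducing over the last dim stands in for dims_to_reduce):

```
import torch

def rms_norm_ref(x, weight=None, eps=1e-6):
    # Upcast fp16/bf16 to fp32 before squaring and reducing, then cast back,
    # mirroring the opmath upcast in the hunk above.
    x_up = x.float()
    rsqrt_in = torch.rsqrt(x_up.pow(2).mean(dim=-1, keepdim=True).add_(eps))
    out = x_up.mul(rsqrt_in).type_as(x)
    return out.mul(weight) if weight is not None else out

x = torch.randn(4, 8, dtype=torch.bfloat16)
print(rms_norm_ref(x).dtype)  # stays torch.bfloat16, but the reduction ran in fp32
```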

View File

@ -2,10 +2,10 @@
@interface MPSCNNNeuronOp : NSObject
+ (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13));
+ (MPSCNNNeuronReLU*)relu;
+ (MPSCNNNeuronSigmoid*)sigmoid;
+ (MPSCNNNeuronTanH*)tanh;
+ (MPSCNNNeuron*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13));
+ (MPSCNNNeuron*)relu;
+ (MPSCNNNeuron*)sigmoid;
+ (MPSCNNNeuron*)tanh;
@end

View File

@ -8,69 +8,67 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations")
@implementation MPSCNNNeuronOp
+ (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) {
// Remove this once we support iOS 11.3
#if TARGET_OS_MACCATALYST
return nil;
#else
+ (MPSCNNNeuron*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) {
static MPSCNNNeuron* neuron = nil;
static dispatch_once_t onceToken;
static MPSCNNNeuronHardSigmoid* neuron = nil;
dispatch_once(&onceToken, ^{
#if TARGET_OS_MACCATALYST
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor hardSigmoidDescriptor]];
#else
neuron = [[MPSCNNNeuronHardSigmoid alloc]
initWithDevice:[MetalContext sharedInstance].device
a:1.0 / 6.0
b:0.5];
initWithDevice:[MetalContext sharedInstance].device
a:1.0 / 6.0
b:0.5];
#endif
});
return neuron;
#endif
}
+ (MPSCNNNeuronReLU*)relu {
// Remove this once we support iOS 11.3
#if TARGET_OS_MACCATALYST
return nil;
#else
static MPSCNNNeuronReLU* relu = nil;
+ (MPSCNNNeuron*)relu {
static MPSCNNNeuron* neuron = nil;
static dispatch_once_t onceToken;
dispatch_once(&onceToken, ^{
relu = [[MPSCNNNeuronReLU alloc]
initWithDevice:[MetalContext sharedInstance].device
a:0];
});
return relu;
#if TARGET_OS_MACCATALYST
neuron = [[MPSCNNNeuron alloc]
initWithDevice:[MetalContext sharedInstance].device
neuronDescriptor:[MPSCNNNeuronOpDescriptor reluDescriptor]];
#else
neuron = [[MPSCNNNeuronReLU alloc]
initWithDevice:[MetalContext sharedInstance].device
a:0];
#endif
});
return neuron;
}
+ (MPSCNNNeuronSigmoid*)sigmoid {
// Remove this once we support iOS 11.3
#if TARGET_OS_MACCATALYST
return nil;
#else
+ (MPSCNNNeuron*)sigmoid {
static MPSCNNNeuron* neuron = nil;
static dispatch_once_t onceToken;
static MPSCNNNeuronSigmoid* sigmoid = nil;
dispatch_once(&onceToken, ^{
sigmoid = [[MPSCNNNeuronSigmoid alloc]
initWithDevice:[MetalContext sharedInstance].device];
});
return sigmoid;
#if TARGET_OS_MACCATALYST
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor sigmoidDescriptor]];
#else
neuron = [[MPSCNNNeuronSigmoid alloc]
initWithDevice:[MetalContext sharedInstance].device];
#endif
});
return neuron;
}
+ (MPSCNNNeuronTanH*)tanh {
// Remove this once we support iOS 11.3
#if TARGET_OS_MACCATALYST
return nil;
#else
+ (MPSCNNNeuron*)tanh {
static MPSCNNNeuron* neuron = nil;
static dispatch_once_t onceToken;
static MPSCNNNeuronTanH* tanh = nil;
dispatch_once(&onceToken, ^{
tanh = [[MPSCNNNeuronTanH alloc]
initWithDevice:[MetalContext sharedInstance].device
a:1
b:1];
});
return tanh;
#if TARGET_OS_MACCATALYST
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor tanhDescriptor]];
#else
neuron = [[MPSCNNNeuronTanH alloc]
initWithDevice:[MetalContext sharedInstance].device
a:1
b:1];
#endif
});
return neuron;
}
@end
@ -85,9 +83,9 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
static MPSNNNeuronDescriptor* neuronDesc = nil;
dispatch_once(&onceToken, ^{
neuronDesc = [MPSNNNeuronDescriptor
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid
a:1.0 / 6.0
b:0.5];
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid
a:1.0 / 6.0
b:0.5];
});
return neuronDesc;
}
@ -97,8 +95,8 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
static MPSNNNeuronDescriptor* neuronDesc = nil;
dispatch_once(&onceToken, ^{
neuronDesc =
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU
a:0];
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU
a:0];
});
return neuronDesc;
}
@ -108,7 +106,7 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
static MPSNNNeuronDescriptor* neuronDesc = nil;
dispatch_once(&onceToken, ^{
neuronDesc = [MPSNNNeuronDescriptor
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid];
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid];
});
return neuronDesc;
}
@ -117,10 +115,9 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
static dispatch_once_t onceToken;
static MPSNNNeuronDescriptor* neuronDesc = nil;
dispatch_once(&onceToken, ^{
neuronDesc =
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH
a:1.0
b:1.0];
neuronDesc = [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH
a:1.0
b:1.0];
});
return neuronDesc;
}

View File

@ -437,6 +437,20 @@ static void nllnd_loss_forward_impl(Tensor& output,
if (output.numel() == 0)
return;
// https://github.com/pytorch/pytorch/blob/042f2f7746a064f1527d95d1f1d712b4f0b34186/aten/src/ATen/native/cuda/Loss.cu#L335-L346
if (target_arg.numel() == 0) {
// Here target (and input) have zero elements
// Mean reduction on empty tensors produces NaN. See the discussion in
// https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162
if (reduction == Reduction::Mean) {
output.fill_(std::numeric_limits<double>::quiet_NaN());
} else {
output.zero_();
}
total_weight.zero_();
return;
}
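The early return above mirrors the CUDA behaviour linked in the comment: with zero-element targets, mean reduction produces NaN and the other reductions produce zero. A small CPU check of the intended semantics (expected values, not a claim about the MPS kernel itself):

```
import torch
import torch.nn.functional as F

inp = torch.empty(0, 3)                        # zero samples, three classes
tgt = torch.empty(0, dtype=torch.long)
print(F.nll_loss(inp, tgt, reduction="mean"))  # expected: tensor(nan)
print(F.nll_loss(inp, tgt, reduction="sum"))   # expected: tensor(0.)
```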
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
MPSGraphTensor* inputTensor_ = nil;
@ -537,7 +551,9 @@ static void nllnd_loss_forward_impl(Tensor& output,
mpsGraphBatchSizeTensor = [mpsGraph reductionSumWithTensor:mpsSelectOneTensor
axes:nil
name:@"batchSizeReductionTensor"];
mpsGraphReducedTensor = divisionNoNaN(mpsGraph, mpsGraphReducedTensor, mpsGraphBatchSizeTensor);
mpsGraphReducedTensor = [mpsGraph divisionWithPrimaryTensor:mpsGraphReducedTensor
secondaryTensor:mpsGraphBatchSizeTensor
name:@"divisionTensor"];
}
}

View File

@ -904,8 +904,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
for (const auto idx : c10::irange(axis)) {
stat_shape.push_back(input_shape[idx]);
}
for (const auto idx : c10::irange(axis, input.dim())) {
(void)idx; // Suppress unused variable
for (C10_UNUSED auto idx : c10::irange(axis, input.dim())) {
stat_shape.push_back(1);
}
mean = mean.view(stat_shape);

View File

@ -8554,7 +8554,7 @@
device_check: NoCheck # TensorIterator
variants: method, function
dispatch:
CPU, CUDA: __rshift__
CPU, CUDA, MPS: __rshift__
tags: pointwise
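The dispatch change above extends the right-shift operator to MPS tensors; a minimal sanity check, assuming an MPS-capable machine:

```
import torch

if torch.backends.mps.is_available():
    x = torch.tensor([8, 16, 32], device="mps")
    print(x >> 1)  # now routes through the MPS __rshift__ kernel
else:
    print("MPS not available; >> continues to dispatch to CPU/CUDA")
```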
- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@ -14706,6 +14706,11 @@
CUDA: _fbgemm_dense_to_jagged_forward_symint
CPU: _padded_dense_to_jagged_forward_cpu
- func: _nested_from_padded_tensor(Tensor padded, Tensor offsets, Tensor dummy, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None, SymInt? sum_S=None) -> Tensor
variants: function
device_check: NoCheck
dispatch: {}
- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
dispatch:
NestedTensorCPU: NestedTensor_softmax_dropout

View File

@ -313,9 +313,9 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
output_padding.emplace_back(config_vals.at(idx));
idx++;
}
int64_t groups = config_vals.at(idx);
int64_t groups [[maybe_unused]] = config_vals.at(idx);
idx++;
int64_t flags = config_vals.at(idx);
int64_t flags [[maybe_unused]] = config_vals.at(idx);
idx++;
TORCH_INTERNAL_ASSERT(idx == static_cast<int64_t>(config_vals.size()),
"Unexpected length of config_vals, expected ",
@ -323,7 +323,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
" got ",
config_vals.size());
bool transpose = flags & (1 << 0);
bool transpose [[maybe_unused]] = flags & (1 << 0);
int64_t other_flags = flags & ~(1 << 0);
TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, ".");

View File

@ -251,8 +251,8 @@ TORCH_LIBRARY(_quantized, m) {
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"), {at::Tag::flexible_layout});
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"), {at::Tag::flexible_layout});
}
TORCH_LIBRARY(onednn, m) {

View File

@ -536,6 +536,24 @@ std::optional<Tensor> convert_boolean_attn_mask(const std::optional<Tensor>& att
// Otherwise, attn_mask represents an additive attention tensor
return attn_mask;
}
// alternate version to workaround -inf issue with cuDNN
// TODO(eqy): delete this when cuDNN -inf issue is resolved
std::optional<Tensor> convert_boolean_attn_mask_cudnn(const std::optional<Tensor>& attn_mask, caffe2::TypeMeta dtype) {
// Pass through
if(!attn_mask.has_value()){
return std::nullopt;
}
// Convert boolean mask to additive mask; need to invert mask to indicate what
// to mask *out*.
if (attn_mask->dtype() == at::kBool) {
// TODO Use the max type of the input and output
return at::where(attn_mask->logical_not(), -65504.0, at::scalar_tensor(0.0, at::TensorOptions().dtype(dtype)));
}
// Otherwise, attn_mask represents an additive attention tensor
return attn_mask;
}
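A hedged Python equivalent of the helper above (-65504 is the most negative finite fp16 value, used in place of -inf to sidestep the cuDNN issue):

```
import torch

def convert_boolean_attn_mask_cudnn_ref(attn_mask, dtype=torch.float16):
    # Sketch of the C++ helper: boolean mask -> additive mask without -inf.
    if attn_mask is None:
        return None
    if attn_mask.dtype == torch.bool:
        zero = torch.zeros((), dtype=dtype)
        return torch.where(attn_mask.logical_not(), -65504.0, zero)
    return attn_mask  # already an additive mask

mask = torch.tensor([[True, False], [True, True]])
print(convert_boolean_attn_mask_cudnn_ref(mask))
```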
// Memory Efficient Attention requires a padded attn mask bias
// This function pads the attn_mask bias to be a multiple of 16
// Then slices the padded bias to the original size
@ -698,15 +716,16 @@ Tensor scaled_dot_product_attention(
query_, key, value, attn_mask_, dropout_p, is_causal, scale, enable_gqa);
}
sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
switch (backend) {
case sdp::SDPBackend::cudnn_attention: {
std::optional<Tensor> attn_mask = convert_boolean_attn_mask_cudnn(attn_mask_, query_.dtype());
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
auto out_lse_softmax = at::_scaled_dot_product_cudnn_attention(
query_, key, value, attn_mask_, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
query_, key, value, attn_mask, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
return std::get<0>(out_lse_softmax);
}
case sdp::SDPBackend::flash_attention: {
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
if(query_.device().type() == DeviceType::CUDA){
c10::SymInt og_size = query_.sym_size(-1);
Tensor query_padded = pad_last_dim<8, false>(query_);
@ -723,6 +742,7 @@ Tensor scaled_dot_product_attention(
query_, key, value, dropout_p, is_causal, attn_mask, scale));
}
case sdp::SDPBackend::efficient_attention: {
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
if (attn_mask.has_value()) {
attn_mask.value() = preprocess_mask(attn_mask.value(), query_, key, value);;
@ -732,11 +752,13 @@ Tensor scaled_dot_product_attention(
return std::get<0>(out_and_lse);
}
case sdp::SDPBackend::overrideable: {
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
return std::get<0>(out_lse_softmax);
}
case sdp::SDPBackend::math:
case sdp::SDPBackend::math: {
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
if ((!GradMode::is_enabled() || (!query_.requires_grad() && !key.requires_grad() && !value.requires_grad()))
&& query_.device().type() == DeviceType::MPS && dropout_p == 0.0
&& query_.is_contiguous() && key.is_contiguous() && value.is_contiguous()
@ -761,6 +783,7 @@ Tensor scaled_dot_product_attention(
std::nullopt, /*dropout_mask*/
scale,
enable_gqa));
}
default:
TORCH_CHECK(
false,

View File

@ -774,6 +774,18 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
TORCH_CHECK(
max_seqlen_batch_k == max_seqlen_batch_v,
"Key and Value must have the same sequence length");
auto attn_bias_ = attn_bias;
if (attn_bias_.has_value()) {
const auto bias_dim = attn_bias_.value().dim();
if (bias_dim == 2) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else if (bias_dim == 3) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else {
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
}
}
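A rough Python sketch of the expansion above (the cuDNN graph expects a bias broadcastable to [B, H, S_q, S_kv]; 2-D and 3-D biases get a singleton head dimension, 4-D biases keep their own):

```
import torch

def expand_attn_bias(bias, B, S_q, S_kv):
    if bias.dim() in (2, 3):
        return bias.expand(B, 1, S_q, S_kv)
    if bias.dim() == 4:
        return bias.expand(B, bias.size(1), S_q, S_kv)
    raise ValueError(f"cuDNN SDPA expects a 2D, 3D, or 4D attn_bias, got {bias.dim()}D")

bias2d = torch.zeros(5, 7)        # [S_q, S_kv]
bias4d = torch.zeros(2, 4, 5, 7)  # [B, H, S_q, S_kv]
print(expand_attn_bias(bias2d, 2, 5, 7).shape)  # torch.Size([2, 1, 5, 7])
print(expand_attn_bias(bias4d, 2, 5, 7).shape)  # torch.Size([2, 4, 5, 7])
```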
Tensor attention, log_sumexp;
@ -818,13 +830,14 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
query/* Tensor q*/,
key/* Tensor k*/,
value/* Tensor v*/,
attn_bias_ /* std::optional<Tensor> */,
log_sumexp/*Tensor softmaxstats*/,
attention/*Tensor o*/,
cudnn_seed/*Tensor dropoutseed*/,
cudnn_offset/*Tensor dropoutoffset*/);
// TODO(eqy): support debug_attn_mask
return std::make_tuple(attention, log_sumexp, Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, cudnn_seed, cudnn_offset, Tensor());
return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
@ -1102,10 +1115,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
offset_t = at::empty({}, at::dtype(at::kLong).device(device));
} else {
auto [seed, offset] = at::cuda::philox::unpack(philox_state);
seed_t = at::scalar_tensor(
at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
offset_t = at::scalar_tensor(
at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
#ifdef USE_ROCM
const auto options = at::dtype(at::kLong).device(at::kCUDA);
#else
const auto options = at::dtype(at::kLong);
#endif
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), options);
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), options);
}
} else {
// Not using dropout
@ -1118,7 +1134,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
auto ret = aotriton::v2::flash::check_gpu(stream);
if (hipSuccess != ret) {
TORCH_CHECK(false,
"[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx94a:sramecc+:xnack-)")
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
}
// AOTriton may accept aligned on logsumexp tensor in the future for better
@ -1147,8 +1164,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
using aotriton::v2::flash::attn_fwd;
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::mk_philoxtensor;
aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, aotriton::DType::kFloat16);
at::Tensor softmax_fa_t = at::empty({ 0, 0, 0, 0 }, query.options());
const bool use_philox_state = in_capture_stream;
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
hipError_t err; // TODO: Error handling
err = attn_fwd(mk_aotensor(q_t, "q"),
mk_aotensor(k_t, "k"),
@ -1158,8 +1183,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
mk_aotensor<2>(softmax_lse, "M"),
mk_aotensor(output_t, "Out"),
dropout_p,
use_dropout ? *seed_t.data_ptr<int64_t>() : 0,
use_dropout ? *offset_t.data_ptr<int64_t>() : 0,
seed,
offset1,
offset2,
seed_output,
offset_output,
mk_aotensor(softmax_fa_t, "encoded_softmax"),
is_causal,
stream);

View File

@ -195,6 +195,27 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
const int64_t num_heads = query.size(1);
const int64_t head_dim_qk = query.size(3);
const int64_t head_dim_v = value.size(3);
const int64_t max_seqlen_batch_q = query.size(2);
const int64_t max_seqlen_batch_k = key.size(2);
// This is needed because SaveVariable automatically converts
// std::optional to undefined tensor
std::optional<Tensor> attn_bias_;
if (attn_bias.defined()) {
attn_bias_ = attn_bias;
}
if (attn_bias_.has_value()) {
const auto bias_dim = attn_bias_.value().dim();
if (bias_dim == 2) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else if (bias_dim == 3) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else {
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
}
}
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
auto dq = at::empty_like(query);
auto dk = at::empty_like(key);
@ -211,6 +232,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
query /*const Tensor& q*/,
key /*const Tensor& k*/,
value /*const Tensor& v*/,
attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
out /*const Tensor& o*/,
grad_out/*const Tensor& dO*/,
logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
@ -219,7 +241,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
dv/*Tensor& dV*/,
philox_seed/*Tensor& dropoutseed*/,
philox_offset/*Tensor& dropoutoffset*/);
return std::make_tuple(dq, dk, dv);
return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
}
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@ -394,7 +416,8 @@ _efficient_attention_backward(
auto ret = aotriton::v2::flash::check_gpu(stream);
if (hipSuccess != ret) {
TORCH_CHECK(false,
"[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
}
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
bool is_causal;
@ -419,6 +442,7 @@ _efficient_attention_backward(
hipError_t err;
using aotriton::v2::flash::attn_bwd;
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::cast_dtype;
aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype()));
err = attn_bwd(mk_aotensor(q_t, "q"),
@ -435,8 +459,9 @@ _efficient_attention_backward(
mk_aotensor<2>(softmax_lse, "L"),
mk_aotensor<2>(delta, "delta"),
float(dropout_p),
rng_engine_inputs.seed_.val,
rng_engine_inputs.offset_.val,
mk_aoscalartensor(philox_seed),
mk_aoscalartensor(philox_offset),
0,
is_causal,
stream);
#else

View File

@ -210,6 +210,7 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
// Check that the gpu is capable of running flash attention
using sm80 = SMVersion<8, 0>;
using sm90 = SMVersion<9, 0>;
auto dprops = at::cuda::getCurrentDeviceProperties();
#if USE_ROCM
#if USE_AOTRITON
auto stream = at::cuda::getCurrentCUDAStream().stream();
@ -221,11 +222,19 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
}
return false;
}
c10::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
if (!enable_navi3x) {
TORCH_WARN_ONCE("Flash attention support on Navi31 GPU is still experimental."
" Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
return false;
}
}
#else
return false;
#endif
#else
auto dprops = at::cuda::getCurrentDeviceProperties();
if (!check_sm_version<sm80, sm90>(dprops)) {
if (debug) {
TORCH_WARN(
@ -245,6 +254,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
// Mem Efficient attention supports hardware in the range [sm_50, sm_90]
using sm50 = SMVersion<5, 0>;
using sm90 = SMVersion<9, 0>;
auto dprops = at::cuda::getCurrentDeviceProperties();
#if USE_ROCM
#if USE_AOTRITON
auto stream = at::cuda::getCurrentCUDAStream().stream();
@ -256,11 +266,19 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
}
return false;
}
c10::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
if (!enable_navi3x) {
TORCH_WARN_ONCE("Memory Efficient attention on Navi31 GPU is still experimental."
" Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
return false;
}
}
#else
return false;
#endif
#else
auto dprops = at::cuda::getCurrentDeviceProperties();
if (!check_sm_version<sm50, sm90>(dprops)) {
if (debug) {
TORCH_WARN(
@ -561,7 +579,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
check_cudnn_deterministic,
// check_is_causal,
check_dtypes_low_precision,
check_for_attn_mask_cudnn,
check_attn_mask_shape,
check_cudnn_hardware_support
);
for (auto& constraint : general_constraints) {
@ -616,9 +634,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
}
}
}
#if USE_ROCM
constexpr bool backend_supports_grouped_query_attention = false;
#else
constexpr bool backend_supports_grouped_query_attention = true;
#endif
if (has_only_dense_inputs(params)) {
constexpr auto dense_constraints = array_of<bool (*)(sdp_params const&, bool)>(
check_batch_size_and_num_heads_dense<true /*supports_grouped_query_attention=*/>,
check_batch_size_and_num_heads_dense<backend_supports_grouped_query_attention>,
check_nonzero_sequence_lengths_dense,
check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim=*/>);
for (auto& constraint : dense_constraints) {

View File

@ -115,6 +115,18 @@ aotriton::TensorView<Rank> mk_aotensor(const at::Tensor& q, c10::string_view ten
cast_dtype(q.dtype()));
}
inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q)
{
return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(q.data_ptr()),
cast_dtype(q.dtype()));
}
inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr)
{
return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(ptr),
aotriton::DType::kUInt64); // AOTriton expects unsigned int64
}
} // namespace aotriton_adapter
} // namespace sdp

View File

@ -72,7 +72,8 @@ void check_gpu_arch(hipStream_t stream) {
auto ret = aotriton::v2::flash::check_gpu(stream);
if (hipSuccess != ret) {
TORCH_CHECK(false,
"FlashAttention only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
}
}
@ -164,6 +165,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
at::Tensor seed_t, offset_t;
at::PhiloxCudaState philox_state;
bool use_philox_state = false;
if (p_dropout > 0.0) {
// number of times random will be generated per thread, to offset philox counter in thc random
// state
@ -171,12 +174,14 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
int64_t counter_offset = batch_size * num_heads * 32;
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
at::PhiloxCudaState philox_state = gen->philox_cuda_state(counter_offset);
philox_state = gen->philox_cuda_state(counter_offset);
if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) {
auto [seed, offset] = at::cuda::philox::unpack(philox_state);
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong).device(at::kCUDA));
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong).device(at::kCUDA));
} else {
// See Note [CUDA Graph-safe RNG states] about the design
use_philox_state = true;
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
}
@ -185,19 +190,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
} else {
seed_t = at::empty({}, at::dtype(at::kLong));
offset_t = at::empty({}, at::dtype(at::kLong));
}
}
at::PhiloxCudaState philox_args;
if (p_dropout > 0.0) {
if (at::cuda::currentStreamCaptureStatus() ==
at::cuda::CaptureStatus::None)
{
philox_args = at::PhiloxCudaState(*seed_t.data_ptr<int64_t>(), *offset_t.data_ptr<int64_t>());
} else { // dropout + capture
philox_args = at::PhiloxCudaState(seed_t.data_ptr<int64_t>(), offset_t.data_ptr<int64_t>(), 0);
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
}
}
@ -219,9 +213,17 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
hipError_t err; // TODO: Error handling
using aotriton::v2::flash::attn_fwd;
using aotriton::TensorView;
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::mk_philoxtensor;
using sdp::aotriton_adapter::cast_dtype;
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
err = attn_fwd(mk_aotensor(q_t, "q"),
mk_aotensor(k_t, "k"),
mk_aotensor(v_t, "v"),
@ -230,8 +232,11 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
mk_aotensor<2>(M, "M"),
mk_aotensor(output_t, "Out"),
p_dropout,
philox_args.seed_.val,
philox_args.offset_.val,
seed,
offset1,
offset2,
seed_output,
offset_output,
mk_aotensor(softmax_fa_t, "encoded_softmax"),
is_causal,
stream);
@ -392,17 +397,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
dv_expanded = dv;
}
at::PhiloxCudaState philox_args;
if (p_dropout > 0.0) {
if (at::cuda::currentStreamCaptureStatus() ==
at::cuda::CaptureStatus::None)
{
philox_args = at::PhiloxCudaState(*philox_seed.data_ptr<int64_t>(), *philox_offset.data_ptr<int64_t>());
} else { // dropout + capture
philox_args = at::PhiloxCudaState(philox_seed.data_ptr<int64_t>(), philox_offset.data_ptr<int64_t>(), 0);
}
}
at::Tensor q_t = q.permute({0,2,1,3});
at::Tensor k_t = k.permute({0,2,1,3});
at::Tensor v_t = v.permute({0,2,1,3});
@ -420,6 +414,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
{
using aotriton::v2::flash::attn_bwd;
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::cast_dtype;
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
err = attn_bwd(mk_aotensor(q_t, "q"),
@ -436,8 +431,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
mk_aotensor<2>(softmax_lse_cont, "L"),
mk_aotensor<2>(delta, "delta"),
p_dropout,
philox_args.seed_.val,
philox_args.offset_.val,
mk_aoscalartensor(philox_seed),
mk_aoscalartensor(philox_offset),
0,
is_causal,
stream);
}

View File

@ -275,17 +275,6 @@ inline bool check_for_attn_mask(sdp_params const& params, bool debug) {
return true;
}
// TODO(eqy): remove this once support is added
inline bool check_for_attn_mask_cudnn(sdp_params const& params, bool debug) {
if (params.attn_mask.has_value()) {
if (debug) {
TORCH_WARN("cuDNN Attention does not support non-null attn_mask.");
}
return false;
}
return true;
}
inline bool check_attn_mask_shape(sdp_params const& params, bool debug) {
auto attn_mask = params.attn_mask;
if (!attn_mask.has_value()) {

View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Reformer,pass,23



View File

@ -130,6 +130,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@ -170,6 +178,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0


View File

@ -126,6 +126,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0


View File

@ -130,6 +130,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@ -170,6 +178,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0


View File

@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0


View File

@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,19
hf_DistilBert,pass,0
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0
@ -178,6 +190,10 @@ maml,pass_due_to_skip,0
mnasnet1_0,pass,0
maml_omniglot,pass,0
@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,pass,0


View File

@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0


View File

@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,19
hf_DistilBert,pass,0
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0
@ -182,6 +194,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,pass,0


View File

@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0


View File

@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,13
hf_DistilBert,pass,0
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Reformer,pass,23


View File

@ -114,6 +114,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@ -158,6 +166,10 @@ mobilenet_v2,pass,0
mnasnet1_0,pass,0
mobilenet_v2_quantized_qat,fail_to_run,0
@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0


View File

@ -114,6 +114,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@ -154,6 +162,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0


View File

@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0


View File

@ -122,6 +122,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,13
hf_DistilBert,pass,0
@ -134,10 +138,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Reformer,pass,23


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Reformer,pass,23


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Reformer,pass,23


View File

@ -1338,6 +1338,16 @@ def try_script(model, example_inputs):
return None
def _produce_dynamic_shapes_for_export(path, x):
# mark_dynamic() is ignored for export.
# use this to produce dynamic_shapes spec instead.
from torch.export.dynamic_shapes import Dim
if not isinstance(x, torch.Tensor):
return None
return {i: Dim.AUTO for i in getattr(x, "_dynamo_dynamic_indices", {})}
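A hedged illustration of how the new helper feeds torch.export (assuming a build where Dim.AUTO is available, as used above; module and shapes are made up):

```
import torch
from torch.export.dynamic_shapes import Dim

x = torch.randn(4, 16)
torch._dynamo.mark_dynamic(x, 0)  # records index 0 in x._dynamo_dynamic_indices

# what _produce_dynamic_shapes_for_export computes for this tensor
spec = {i: Dim.AUTO for i in getattr(x, "_dynamo_dynamic_indices", {})}

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

ep = torch.export.export(M(), (x,), dynamic_shapes=(spec,))
print(ep)
```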
class AOTInductorModelCache:
cache = {}
@ -1345,6 +1355,7 @@ class AOTInductorModelCache:
def load(cls, model, example_inputs, device):
import torch._inductor
import torch.export._trace
from torch.export.dynamic_shapes import _tree_map_with_path
key = weakref.ref(model)
if key not in cls.cache:
@ -1364,10 +1375,16 @@ class AOTInductorModelCache:
else:
_register_dataclass_output_as_pytree(example_outputs)
combined_args = tuple(example_args) + tuple(example_kwargs.values())
dynamic_shapes = _tree_map_with_path(
_produce_dynamic_shapes_for_export, combined_args
)
gm = torch.export._trace._export(
model,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
pre_dispatch=True,
strict=False,
).module()
@ -1382,11 +1399,20 @@ class AOTInductorModelCache:
def export(model, example_inputs):
from torch.export.dynamic_shapes import _tree_map_with_path
example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
example_outputs = model(*example_args, **example_kwargs)
_register_dataclass_output_as_pytree(example_outputs)
ep = torch.export.export(model, example_args, example_kwargs)
combined_args = tuple(example_args) + tuple(example_kwargs.values())
dynamic_shapes = _tree_map_with_path(
_produce_dynamic_shapes_for_export, combined_args
)
ep = torch.export.export(
model, example_args, example_kwargs, dynamic_shapes=dynamic_shapes
)
def opt_export(_, example_inputs):
example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
@ -2370,7 +2396,11 @@ class BenchmarkRunner:
return set()
@property
def skip_models_for_freezing(self):
def skip_models_for_freezing_cpu(self):
return set()
@property
def skip_models_for_freezing_cuda(self):
return set()
@property
@ -4275,7 +4305,6 @@ def run(runner, args, original_dir=None):
runner.skip_models.update(runner.slow_models)
if args.devices == ["cpu"]:
runner.skip_models.update(runner.very_slow_models)
runner.skip_models.update(runner.skip_models_for_cpu)
elif args.devices == ["cuda"]:
runner.skip_models.update(runner.skip_models_for_cuda)
@ -4284,7 +4313,10 @@ def run(runner, args, original_dir=None):
runner.skip_models.update(runner.skip_multiprocess_models)
if args.freezing:
runner.skip_models.update(runner.skip_models_for_freezing)
if args.devices == ["cpu"]:
runner.skip_models.update(runner.skip_models_for_freezing_cpu)
elif args.devices == ["cuda"]:
runner.skip_models.update(runner.skip_models_for_freezing_cuda)
if args.no_skip:
runner.skip_models.clear()

View File

@ -505,7 +505,7 @@ class HuggingfaceRunner(BenchmarkRunner):
return 4e-3, cosine
if (
current_device == "cpu"
and name in self._config["tolerance"]["higher_inference"]
and name in self._config["tolerance"]["higher_inference_cpu"]
):
return 4e-3, cosine
return 1e-3, cosine

View File

@ -11,9 +11,7 @@ skip:
- GPTJForQuestionAnswering
device:
cpu:
# OOMs
- OPTForCausalLM
cpu: []
control_flow:
- AllenaiLongformerBase
@ -71,6 +69,7 @@ batch_size:
TrOCRForCausalLM: 2
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
tolerance:

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python3
from contextlib import nullcontext
import click
import numpy as np
from operator_inp_utils import OperatorInputsLoader
@ -16,11 +18,13 @@ from torch.utils._pytree import tree_map_only
aten = torch.ops.aten
profile_enabled = False
def compute_speedups(
operator, models, example_inputs, repeats, accuracy_checking=False, device="cuda"
):
global profile_enabled
expected = models[0](*example_inputs)
if accuracy_checking:
for model in models[1:]:
@ -35,20 +39,32 @@ def compute_speedups(
timings = np.zeros((repeats, len(models)), np.float64)
for rep in range(repeats):
# interleave the runs to handle frequency scaling and load changes
for m, model in enumerate(models):
if device == "cuda":
model(*example_inputs)
# benchmarker.benchmark_gpu() clears L2 cache to hide the latency of CPU launch time
# along with cuda synchronization
timings[rep, m] = benchmarker.benchmark_gpu(
lambda: model(*example_inputs)
record_rep_context = (
torch.profiler.record_function(f"rep_{rep}")
if profile_enabled
else nullcontext()
)
with record_rep_context:
# interleave the runs to handle frequency scaling and load changes
for m, model in enumerate(models):
record_model_context = (
torch.profiler.record_function(f"model_{m}")
if profile_enabled
else nullcontext()
)
else:
from torch._inductor.utils import timed
with record_model_context:
if device == "cuda":
model(*example_inputs)
timings[rep, m] = timed(model, example_inputs)
# benchmarker.benchmark_gpu() clears L2 cache to hide the latency of CPU launch time
# along with cuda synchronization
timings[rep, m] = benchmarker.benchmark_gpu(
lambda: model(*example_inputs)
)
else:
from torch._inductor.utils import timed
timings[rep, m] = timed(model, example_inputs)
return np.median(timings, axis=0)
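The profiling hooks above follow one pattern: wrap each phase in torch.profiler.record_function when --profile is set, otherwise fall back to nullcontext. A standalone sketch of that pattern (names are illustrative):

```
from contextlib import nullcontext

import torch

profile_enabled = False  # would be flipped by the --profile flag

def run_phase(name, fn, *args):
    # record_function ranges only show up when a profiler is active
    ctx = torch.profiler.record_function(name) if profile_enabled else nullcontext()
    with ctx:
        return fn(*args)

print(run_phase("rep_0", torch.add, torch.ones(2), torch.ones(2)))
```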
@ -171,6 +187,7 @@ def skip_operator(operator):
@click.option(
"--channels-last", help="force inputs to channels last", is_flag=True, default=False
)
@click.option("--profile", help="profile the benchmark", is_flag=True, default=False)
def benchmark(
suite,
op,
@ -183,7 +200,9 @@ def benchmark(
inp_file,
start_idx,
channels_last,
profile,
):
global profile_enabled
if inp_file is not None:
loader = OperatorInputsLoader(inp_file)
else:
@ -209,6 +228,8 @@ def benchmark(
ops = [eval(op)]
max_samples = max_samples + start_idx
profile_enabled = profile
for operator in ops:
if skip_operator(operator):
continue
@ -216,10 +237,31 @@ def benchmark(
print(f"Running {operator}")
inp_gen = loader.get_inputs_for_operator(operator, dtype=dtype, device=device)
timings = []
for i in range(min(max_samples, 1000000)):
inputs_list = []
for _ in range(min(max_samples, 1000000)):
try:
inps = next(inp_gen)
inputs_list.append(inps)
except StopIteration:
break
profiler_context = (
torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
record_shapes=False,
profile_memory=False,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"./log/operator_{operator}", use_gzip=True
),
)
if profile_enabled
else nullcontext()
)
with profiler_context as prof:
for i, inps in enumerate(inputs_list):
if inps is None:
break
if i < start_idx:
@ -230,28 +272,32 @@ def benchmark(
args, kwargs = tree_map_only(
torch.Tensor, to_channels_last, (args, kwargs)
)
except StopIteration:
break
try:
# aten, nvfuser, inductor
timings.append(
microbenchmark(
operator,
args,
kwargs,
dtype,
accuracy_checking,
repeats,
measure_nvfuser,
device,
try:
iter_context = (
torch.profiler.record_function(f"iter_{i}")
if profile_enabled
else nullcontext()
)
)
except Exception as e:
print(f"error {operator}")
print(e)
# comment out this line to avoid blocking other tests
# raise e
with iter_context:
# aten, nvfuser, inductor
timings.append(
microbenchmark(
operator,
args,
kwargs,
dtype,
accuracy_checking,
repeats,
measure_nvfuser,
device,
)
)
except Exception as e:
print(f"error {operator}")
print(e)
# comment out this line to avoid blocking other tests
# raise e
if not timings:
continue

View File

@ -60,6 +60,13 @@ class BenchmarkBase(ABC):
# TODO is there other parts we need to add ?
_enable_compile_time_instruction_count = False
# number of iterations used to run when collecting instruction_count or compile_time_instruction_count.
_num_iterations = 5
def with_iterations(self, value):
self._num_iterations = value
return self
def enable_instruction_count(self):
self._enable_instruction_count = True
return self
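The new _num_iterations knob is a plain fluent setter; a self-contained sketch of the pattern (not the real BenchmarkBase, which also wires up the instruction counters):

```
class FluentIterations:
    _num_iterations = 5  # default, matching the diff above

    def with_iterations(self, value):
        self._num_iterations = value
        return self

    def collect(self):
        results = []
        for i in range(self._num_iterations):
            results.append(i)  # _prepare()/_work()/counter readout in the real class
        return results

# e.g. a heavier benchmark can opt into more samples:
print(len(FluentIterations().with_iterations(10).collect()))  # 10
```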
@ -88,7 +95,7 @@ class BenchmarkBase(ABC):
def _count_instructions(self):
print(f"collecting instruction count for {self.name()}")
results = []
for i in range(10):
for i in range(self._num_iterations):
self._prepare()
id = i_counter.start()
self._work()
@ -102,7 +109,7 @@ class BenchmarkBase(ABC):
config.record_compile_time_instruction_count = True
results = []
for i in range(10):
for i in range(self._num_iterations):
self._prepare()
# CompileTimeInstructionCounter.record is only called on convert_frame._compile_inner
# hence this will only count instruction count spent in compile_inner.

View File

@ -4,21 +4,29 @@ import sys
from benchmark_base import BenchmarkBase
import torch
from torch._inductor.utils import fresh_inductor_cache
class Benchmark(BenchmarkBase):
def __init__(self, backend):
self.backend = backend
def __init__(self, backend, dynamic=False, is_gpu=False):
self._backend = backend
self._dynamic = dynamic
self._device = "cuda" if is_gpu else "cpu"
def name(self):
return f"add_loop_{self.backend}"
prefix = f"add_loop_{self._backend}"
if self._dynamic:
prefix += "_dynamic"
if self._device == "cuda":
prefix += "_gpu"
return prefix
def description(self):
return "a loop over 100 add node"
def _prepare_once(self, dynamic=True):
self.a = torch.ones(1000)
self.b = torch.torch.ones(1000)
def _prepare_once(self):
self.a = torch.ones(1000, device=self._device)
self.b = torch.torch.ones(1000, device=self._device)
def _prepare(self):
torch._dynamo.reset()
@ -26,7 +34,7 @@ class Benchmark(BenchmarkBase):
gc.disable()
def _work(self):
@torch.compile(backend=self.backend, fullgraph=True)
@torch.compile(backend=self._backend, fullgraph=True, dynamic=self._dynamic)
def f(a, b):
result = a.clone()
for i in range(1000):
@ -38,17 +46,24 @@ class Benchmark(BenchmarkBase):
result = result.sin()
return result
f(self.a, self.b)
with fresh_inductor_cache():
f(self.a, self.b)
def main():
result_path = sys.argv[1]
Benchmark(
"eager"
).enable_compile_time_instruction_count().collect_all().append_results(result_path)
Benchmark(
"inductor"
).enable_compile_time_instruction_count().collect_all().append_results(result_path)
all = [
Benchmark("eager"),
Benchmark("eager", dynamic=True),
Benchmark("inductor"),
Benchmark("inductor", is_gpu=True),
Benchmark("inductor", is_gpu=True, dynamic=True),
]
for benchmark in all:
benchmark.enable_compile_time_instruction_count().collect_all().append_results(
result_path
)
if __name__ == "__main__":

View File

@ -32,7 +32,9 @@ class Benchmark(BenchmarkBase):
def main():
result_path = sys.argv[1]
Benchmark().enable_instruction_count().collect_all().append_results(result_path)
Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
result_path
)
if __name__ == "__main__":

View File

@ -387,11 +387,6 @@ def get_skip_tests(suite, device, is_training: bool):
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu)
elif device == "cuda":
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda)
else:
if hasattr(module, "SKIP"):
skip_tests.update(module.SKIP)
if is_training and hasattr(module, "SKIP_TRAIN"):
skip_tests.update(module.SKIP_TRAIN)
skip_tests = (f"-x {name}" for name in skip_tests)
skip_str = " ".join(skip_tests)
@ -438,7 +433,7 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
if args.enable_cpu_launcher:
launcher_cmd = f"python -m torch.backends.xeon.run_cpu {args.cpu_launcher_args}"
cmd = f"{launcher_cmd} benchmarks/dynamo/{suite}.py --{testing} --{dtype} -d{device} --output={output_filename}"
cmd = f"{cmd} {base_cmd} {args.extra_args} --no-skip --dashboard"
cmd = f"{cmd} {base_cmd} {args.extra_args} --dashboard"
skip_tests_str = get_skip_tests(suite, device, args.training)
cmd = f"{cmd} {skip_tests_str}"

Some files were not shown because too many files have changed in this diff.