[CI] Adds support for selecting experiments for workflows on runner determinator (#137614)

Adds a `default` tag to experiment configurations, allowing some experiments to be excluded by default from the random draw:

```
experiments:
  lf:
    rollout_perc: 25
  otherExp:
    rollout_perc: 25
    default: false
---
```

and includes the configuration to filter which experiments are of interest for a particular workflow (comma-separated):

```
get-test-label-type:
  name: get-test-label-type
  uses: ./.github/workflows/_runner-determinator.yml
  with:
    ...
    check_experiments: "awsa100"
```

The end goal is to enable us to run multiple experiments that are independent of one another. For example, while we are still running the LF infra experiment, we want to migrate other runners leveraging the current solution. An immediate use case is the A100 instances, which we want to migrate to AWS. During the migration period, those new instances will be labeled both `awsa100.linux.gcp.a100` and `linux.aws.a100`. Once the experiment ends, we will remove the first, confusing one.

```
jobs:
  get-build-label-type:
    name: get-build-label-type
    uses: ./.github/workflows/_runner-determinator.yml
    with:
      ...

  get-test-label-type:
    name: get-test-label-type
    uses: ./.github/workflows/_runner-determinator.yml
    with:
      ...
      check_experiments: "awsa100"

  linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
    name: cuda12.1-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs:
      - get-build-label-type
      - get-test-label-type
    with:
      runner_prefix: "${{ needs.get-build-label-type.outputs.label-type }}"
      ...
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
          ...
        ]}
      ...
```

```
experiments:
  lf:
    rollout_perc: 50
  awsa100:
    rollout_perc: 50
    default: false
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137614
Approved by: https://github.com/malfet
2025-11-03 23:45:05 +08:00 · 2024-10-11 19:19:59 +00:00
parent 709021143d
commit 2cb983ab97
3 changed files with 213 additions and 12 deletions
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@ -39,7 +39,8 @@ Example config:
    experiments:
      lf:
        rollout_percent: 25
-
+        all_branches: false
+        default: true
    ---

    # Opt-ins:
@ -57,7 +58,7 @@ import os
 import random
 from argparse import ArgumentParser
 from logging import LogRecord
-from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
+from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Tuple

 import yaml
 from github import Auth, Github
@ -86,6 +87,9 @@ class Experiment(NamedTuple):
    all_branches: bool = (
        False  # If True, the experiment is also enabled on the exception branches
    )
+    default: bool = (
+        True  # If True, the experiment is enabled by default for all queries
+    )

    # Add more fields as needed

@ -140,6 +144,12 @@ def set_github_output(key: str, value: str) -> None:
        f.write(f"{key}={value}\n")


+def _str_comma_separated_to_set(value: str) -> FrozenSet[str]:
+    return frozenset(
+        filter(lambda itm: itm != "", map(str.strip, value.strip(" \n\t").split(",")))
+    )
+
+
 def parse_args() -> Any:
    parser = ArgumentParser("Get dynamic rollout settings")
    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
@ -174,6 +184,13 @@ def parse_args() -> Any:
        required=True,
        help="Current GitHub ref type, branch or tag",
    )
+    parser.add_argument(
+        "--eligible-experiments",
+        type=_str_comma_separated_to_set,
+        required=False,
+        default="",
+        help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked",
+    )

    return parser.parse_args()

@ -348,6 +365,7 @@ def get_runner_prefix(
    rollout_state: str,
    workflow_requestors: Iterable[str],
    branch: str,
+    eligible_experiments: FrozenSet[str] = frozenset(),
    is_canary: bool = False,
 ) -> str:
    settings = parse_settings(rollout_state)
@ -356,14 +374,25 @@ def get_runner_prefix(
    fleet_prefix = ""
    prefixes = []
    for experiment_name, experiment_settings in settings.experiments.items():
-        enabled = False
-
        if not experiment_settings.all_branches and is_exception_branch(branch):
            log.info(
                f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
            )
            continue

+        if eligible_experiments:
+            if experiment_name not in eligible_experiments:
+                exp_list = ", ".join(eligible_experiments)
+                log.info(
+                    f"Skipping experiment '{experiment_name}', as it is not in the eligible_experiments list: {exp_list}"
+                )
+                continue
+        elif not experiment_settings.default:
+            log.info(
+                f"Skipping experiment '{experiment_name}', as it is not a default experiment"
+            )
+            continue
+
        # Is any workflow_requestor opted in to this experiment?
        opted_in_users = [
            requestor
@ -371,11 +400,13 @@ def get_runner_prefix(
            if is_user_opted_in(requestor, user_optins, experiment_name)
        ]

+        enabled = False
        if opted_in_users:
            log.info(
                f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
            )
            enabled = True
+
        elif experiment_settings.rollout_perc:
            # If no user is opted in, then we randomly enable the experiment based on the rollout percentage
            if random.uniform(0, 100) <= experiment_settings.rollout_perc:
@ -444,6 +475,7 @@ def main() -> None:
            rollout_state,
            (args.github_issue_owner, username),
            args.github_branch,
+            args.eligible_experiments,
            is_canary,
        )