[TD] Use label to configure td on distributed for rollout (#122976)

Gate TD on distributed behind label

TODO:
auto add label to certain people's prs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122976
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi
This commit is contained in:
Catherine Lee
2024-04-08 15:53:55 +00:00
committed by PyTorch MergeBot
parent 4f66db80ca
commit 61be8843c9
6 changed files with 23 additions and 1 deletions

View File

@ -40,6 +40,9 @@ outputs:
ci-no-td:
description: True if ci-no-td label was on PR or [ci-no-td] in PR body.
value: ${{ steps.filter.outputs.ci-no-td }}
ci-td-distributed:
description: True if ci-td-distributed label was on PR or [ci-td-distributed] in PR body.
value: ${{ steps.filter.outputs.ci-td-distributed }}
runs:
using: composite

View File

@ -1,5 +1,6 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
TD_rollout_issue: 123120
ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_conda

View File

@ -480,6 +480,11 @@ def perform_misc_tasks(
"ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
)
set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td"))
# Only relevant for the one linux distributed cuda job, delete this when TD
# is rolled out completely
set_output(
"ci-td-distributed", check_for_setting(labels, pr_body, "ci-td-distributed")
)
# Obviously, if the job name includes unstable, then this is an unstable job
is_unstable = job_name and IssueType.UNSTABLE.value in job_name

View File

@ -639,6 +639,7 @@ class TestConfigFilter(TestCase):
ci_verbose_test_logs: bool = False,
ci_no_test_timeout: bool = False,
ci_no_td: bool = False,
ci_td_distributed: bool = False,
is_unstable: bool = False,
reenabled_issues: str = "",
) -> str:
@ -647,6 +648,7 @@ class TestConfigFilter(TestCase):
f"ci-verbose-test-logs={ci_verbose_test_logs}\n"
f"ci-no-test-timeout={ci_no_test_timeout}\n"
f"ci-no-td={ci_no_td}\n"
f"ci-td-distributed={ci_td_distributed}\n"
f"is-unstable={is_unstable}\n"
f"reenabled-issues={reenabled_issues}\n"
)

View File

@ -200,6 +200,7 @@ jobs:
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
@ -252,6 +253,7 @@ jobs:
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e TD_DISTRIBUTED \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \

View File

@ -32,6 +32,7 @@ from torch.testing._internal.common_utils import (
retry_shell,
set_cwd,
shell,
TEST_CUDA,
TEST_WITH_ASAN,
TEST_WITH_CROSSREF,
TEST_WITH_ROCM,
@ -1180,7 +1181,15 @@ def parse_args():
action="store_true",
help="Enables removing tests based on TD",
default=IS_CI
and (TEST_WITH_CROSSREF or TEST_WITH_ASAN)
and (
TEST_WITH_CROSSREF
or TEST_WITH_ASAN
or (
strtobool(os.environ.get("TD_DISTRIBUTED", "False"))
and os.getenv("TEST_CONFIG") == "distributed"
and TEST_CUDA
)
)
and os.getenv("BRANCH", "") != "main"
and not strtobool(os.environ.get("NO_TD", "False")),
)