[CI][testing] Use 3 processes for testing on sm89 and sm90 jobs (#158691)

3 processes were used for sm86 jobs, but after we switched to sm89 the build-environment check no longer matched, so the process count fell back to 2.

sm90 is H100; I'm not sure which unit tests run there, but I assume those runners also have plenty of GPU memory.

These jobs use larger runners with more GPU memory, so three processes are usually fine. With roughly 22GB total, that works out to about 10GB per process with 2 processes and about 6GB per process with 3 (each CUDA context takes maybe 1GB).
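
As a rough sanity check on that arithmetic, here is a minimal sketch (the 22GB total and 1GB-per-context figures are the approximations above, not measured values):

# Rough per-process GPU memory budget on the larger runners; all numbers are approximate.
TOTAL_GPU_MEM_GB = 22  # assumed total GPU memory on the larger runner
CUDA_CONTEXT_GB = 1    # assumed per-process CUDA context overhead

def per_proc_budget_gb(num_procs: int) -> float:
    # Memory left for each test process after its own CUDA context.
    return TOTAL_GPU_MEM_GB / num_procs - CUDA_CONTEXT_GB

print(per_proc_budget_gb(2))  # ~10.0 GB per process with 2 processes
print(per_proc_budget_gb(3))  # ~6.3 GB per process with 3 processes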

I've applied skips to the tests that OOMed, and marked a few memory-hungry ones to run serially (see the diffs below).
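
For the memory-hungry cases, the diffs below use the serialTest() decorator from torch.testing._internal.common_utils, which marks a test to run in the serial pass rather than alongside the other parallel test processes. A minimal sketch of the pattern, with a made-up test class and body:

import torch
from torch._inductor.test_case import TestCase, run_tests
from torch.testing._internal.common_utils import serialTest

class ExampleMemoryHeavyTest(TestCase):  # hypothetical test class for illustration
    @serialTest()  # run serially so other test processes aren't competing for GPU memory
    def test_big_matmul(self):  # hypothetical test
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        x = torch.randn(8192, 8192, device="cuda")
        y = x @ x
        self.assertEqual(y.shape, (8192, 8192))

if __name__ == "__main__":
    run_tests()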

Time decreases from ~2.7 hours per test job to ~2 hours.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158691
Approved by: https://github.com/huydhn
Author: Catherine Lee
Date: 2025-07-25 15:26:26 +00:00
Committed by: PyTorch MergeBot
Parent: 9535995bbc
Commit: 561193e5f2
4 changed files with 14 additions and 2 deletions

View File

@@ -1212,6 +1212,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
     @supported_platform
     @common_utils.parametrize("head_dim", [17, 24, 94, 121])
     @common_utils.parametrize("dtype", test_dtypes_fast)
+    @common_utils.serialTest()
     def test_non_pow_2_headdim(self, device, dtype, head_dim):
         self.run_test(
             _rel_bias, dtype, B, Hq, S, head_dim, B, Hkv, S, head_dim, device=device

View File

@@ -9,6 +9,7 @@ from torch._dynamo.utils import same
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
 from torch.testing import FileCheck
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
@@ -211,6 +212,7 @@ class InplacePaddingTest(TestCase):
     @requires_cuda_with_enough_memory(2e10)
     @inductor_config.patch(force_shape_pad=True)
+    @serialTest()
     def test_linear_and_cel(self):
         # Use nan for torch.empty
         torch.use_deterministic_algorithms(True)

View File

@@ -4,6 +4,7 @@ import torch
 import torch._inductor
 from torch._dynamo.utils import counters
 from torch._inductor.test_case import run_tests, TestCase
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu
@@ -62,6 +63,7 @@ class TestKernelOptimization(TestCase):
         },
         post_grad_fusion_options={},
     )
+    @serialTest()  # Needs slightly more memory on GPUs
     def test_einsum_to_pointwise(self):
         counters.clear()
         module = TestEinsumtoPointwise().to(GPU_TYPE)

View File

@@ -10,6 +10,14 @@ from tools.stats.import_test_stats import get_disabled_tests
 from tools.testing.test_run import ShardedTest, TestRun
 
+try:
+    from torch.testing._internal.common_cuda import SM80OrLater
+    from torch.testing._internal.common_utils import TEST_CUDA
+except ImportError:
+    TEST_CUDA = False
+    SM80OrLater = False
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -18,14 +26,13 @@ REPO_ROOT = Path(__file__).resolve().parents[2]
 IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
 BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "")
 
-USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT
 # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job
 # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs
 # used to run tests. If they are not equal, the only consequence should be
 # unequal shards.
 IS_ROCM = os.path.exists("/opt/rocm")
-NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if USE_3_PROCS else 2
+NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if not TEST_CUDA or SM80OrLater else 2
 NUM_PROCS_FOR_SHARDING_CALC = NUM_PROCS if not IS_ROCM or IS_MEM_LEAK_CHECK else 2
 THRESHOLD = 60 * 10  # 10 minutes
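
To make the new NUM_PROCS condition concrete, here is a small standalone sketch of how the process count comes out for a few configurations (the flags are passed in directly here rather than imported, so this is illustrative, not the CI code itself):

def num_procs(is_mem_leak_check: bool, test_cuda: bool, sm80_or_later: bool) -> int:
    # Mirrors the updated NUM_PROCS expression in the diff above.
    return 1 if is_mem_leak_check else 3 if not test_cuda or sm80_or_later else 2

assert num_procs(True, True, True) == 1     # mem-leak check jobs always use a single process
assert num_procs(False, False, False) == 3  # non-CUDA jobs keep using 3 processes
assert num_procs(False, True, True) == 3    # sm80-or-later GPUs (sm86/sm89/sm90) now get 3
assert num_procs(False, True, False) == 2   # older CUDA GPUs stay at 2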