[CI][testing] Use 3 processes for testing on sm89 and sm90 jobs (#158691)

3 processes were used for sm86 jobs, but after we switched to sm89 the build-environment check no longer matched, so the process count fell back to 2.

sm90 is H100; I'm not sure which unit tests run there, but I assume those runners also have plenty of GPU memory.

These jobs use larger runners with more GPU memory, so three processes are usually fine. With roughly 22GB total, that works out to about 10GB per process with 2 processes and about 6GB per process with 3 (each CUDA context takes maybe 1GB).
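
As a rough sanity check on that arithmetic, here is a minimal sketch (the 22GB total and 1GB-per-context figures are the approximations above, not measured values):

# Rough per-process GPU memory budget on the larger runners; all numbers are approximate.
TOTAL_GPU_MEM_GB = 22  # assumed total GPU memory on the larger runner
CUDA_CONTEXT_GB = 1    # assumed per-process CUDA context overhead

def per_proc_budget_gb(num_procs: int) -> float:
    # Memory left for each test process after its own CUDA context.
    return TOTAL_GPU_MEM_GB / num_procs - CUDA_CONTEXT_GB

print(per_proc_budget_gb(2))  # ~10.0 GB per process with 2 processes
print(per_proc_budget_gb(3))  # ~6.3 GB per process with 3 processes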

I've applied skips to the tests that OOMed, and marked a few memory-hungry ones to run serially (see the diffs below).
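
For the memory-hungry cases, the diffs below use the serialTest() decorator from torch.testing._internal.common_utils, which marks a test to run in the serial pass rather than alongside the other parallel test processes. A minimal sketch of the pattern, with a made-up test class and body:

import torch
from torch._inductor.test_case import TestCase, run_tests
from torch.testing._internal.common_utils import serialTest

class ExampleMemoryHeavyTest(TestCase):  # hypothetical test class for illustration
    @serialTest()  # run serially so other test processes aren't competing for GPU memory
    def test_big_matmul(self):  # hypothetical test
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        x = torch.randn(8192, 8192, device="cuda")
        y = x @ x
        self.assertEqual(y.shape, (8192, 8192))

if __name__ == "__main__":
    run_tests()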

Time decreases from ~2.7 hours per test job to ~2 hours.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158691
Approved by: https://github.com/huydhn
Author: Catherine Lee
Date: 2025-07-25 15:26:26 +00:00
Committed by: PyTorch MergeBot
Parent: 9535995bbc
Commit: 561193e5f2
4 changed files with 14 additions and 2 deletions

View File

@@ -1212,6 +1212,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
     @supported_platform
     @common_utils.parametrize("head_dim", [17, 24, 94, 121])
     @common_utils.parametrize("dtype", test_dtypes_fast)
+    @common_utils.serialTest()
     def test_non_pow_2_headdim(self, device, dtype, head_dim):
         self.run_test(
             _rel_bias, dtype, B, Hq, S, head_dim, B, Hkv, S, head_dim, device=device

View File

@@ -9,6 +9,7 @@ from torch._dynamo.utils import same
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
 from torch.testing import FileCheck
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
@@ -211,6 +212,7 @@ class InplacePaddingTest(TestCase):
     @requires_cuda_with_enough_memory(2e10)
     @inductor_config.patch(force_shape_pad=True)
+    @serialTest()
     def test_linear_and_cel(self):
         # Use nan for torch.empty
         torch.use_deterministic_algorithms(True)

View File

@@ -4,6 +4,7 @@ import torch
 import torch._inductor
 from torch._dynamo.utils import counters
 from torch._inductor.test_case import run_tests, TestCase
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu
@@ -62,6 +63,7 @@ class TestKernelOptimization(TestCase):
         },
         post_grad_fusion_options={},
     )
+    @serialTest()  # Needs slightly more memory on GPUs
     def test_einsum_to_pointwise(self):
         counters.clear()
         module = TestEinsumtoPointwise().to(GPU_TYPE)

View File

@@ -10,6 +10,14 @@ from tools.stats.import_test_stats import get_disabled_tests
 from tools.testing.test_run import ShardedTest, TestRun
 
+try:
+    from torch.testing._internal.common_cuda import SM80OrLater
+    from torch.testing._internal.common_utils import TEST_CUDA
+except ImportError:
+    TEST_CUDA = False
+    SM80OrLater = False
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -18,14 +26,13 @@ REPO_ROOT = Path(__file__).resolve().parents[2]
 IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
 BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "")
 
-USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT
 # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job
 # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs
 # used to run tests. If they are not equal, the only consequence should be
 # unequal shards.
 IS_ROCM = os.path.exists("/opt/rocm")
-NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if USE_3_PROCS else 2
+NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if not TEST_CUDA or SM80OrLater else 2
 NUM_PROCS_FOR_SHARDING_CALC = NUM_PROCS if not IS_ROCM or IS_MEM_LEAK_CHECK else 2
 THRESHOLD = 60 * 10  # 10 minutes
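
To make the new NUM_PROCS condition concrete, here is a small standalone sketch of how the process count comes out for a few configurations (the flags are passed in directly here rather than imported, so this is illustrative, not the CI code itself):

def num_procs(is_mem_leak_check: bool, test_cuda: bool, sm80_or_later: bool) -> int:
    # Mirrors the updated NUM_PROCS expression in the diff above.
    return 1 if is_mem_leak_check else 3 if not test_cuda or sm80_or_later else 2

assert num_procs(True, True, True) == 1     # mem-leak check jobs always use a single process
assert num_procs(False, False, False) == 3  # non-CUDA jobs keep using 3 processes
assert num_procs(False, True, True) == 3    # sm80-or-later GPUs (sm86/sm89/sm90) now get 3
assert num_procs(False, True, False) == 2   # older CUDA GPUs stay at 2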