Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
[CI][testing] Use 3 processes for testing on sm89 and sm90 jobs (#158691)
3 procs were used for sm86 jobs, but after the switch to sm89 the sm86 check no longer matched, so the runner fell back to 2 procs. sm90 is H100; I'm not sure which unittests run there, but I assume those GPUs also have plenty of memory. These jobs use larger runners with more GPU memory, so 3 procs is usually fine: with roughly 22GB total, that's about 10GB per proc at 2 procs and about 6GB per proc at 3 (the CUDA context takes maybe 1GB each). I've applied skips to the tests that OOMed. Test job time decreases from ~2.7hr to ~2hr.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158691
Approved by: https://github.com/huydhn
Commit 561193e5f2 (parent 9535995bbc), committed by PyTorch MergeBot
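As a rough check of the memory math above (a back-of-the-envelope sketch; the ~22GB total and the ~1GB-per-process CUDA context are the estimates quoted in the commit message, not measurements):

# Split total GPU memory across N test processes, reserving an
# estimated 1GB CUDA context per process (numbers from the commit message).
TOTAL_GPU_MEM_GB = 22.0
CUDA_CONTEXT_GB = 1.0

for num_procs in (2, 3):
    usable_per_proc = TOTAL_GPU_MEM_GB / num_procs - CUDA_CONTEXT_GB
    print(f"{num_procs} procs -> ~{usable_per_proc:.1f} GB usable per proc")

# Prints roughly:
#   2 procs -> ~10.0 GB usable per proc
#   3 procs -> ~6.3 GB usable per proc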
@@ -1212,6 +1212,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
     @supported_platform
     @common_utils.parametrize("head_dim", [17, 24, 94, 121])
     @common_utils.parametrize("dtype", test_dtypes_fast)
+    @common_utils.serialTest()
     def test_non_pow_2_headdim(self, device, dtype, head_dim):
         self.run_test(
             _rel_bias, dtype, B, Hq, S, head_dim, B, Hkv, S, head_dim, device=device
@@ -9,6 +9,7 @@ from torch._dynamo.utils import same
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
 from torch.testing import FileCheck
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
@@ -211,6 +212,7 @@ class InplacePaddingTest(TestCase):

     @requires_cuda_with_enough_memory(2e10)
     @inductor_config.patch(force_shape_pad=True)
+    @serialTest()
     def test_linear_and_cel(self):
         # Use nan for torch.empty
         torch.use_deterministic_algorithms(True)
@@ -4,6 +4,7 @@ import torch
 import torch._inductor
 from torch._dynamo.utils import counters
 from torch._inductor.test_case import run_tests, TestCase
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu


@@ -62,6 +63,7 @@ class TestKernelOptimization(TestCase):
             },
             post_grad_fusion_options={},
         )
+    @serialTest()  # Needs slightly more memory on GPUs
     def test_einsum_to_pointwise(self):
         counters.clear()
         module = TestEinsumtoPointwise().to(GPU_TYPE)
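The serialTest() / common_utils.serialTest() decorator added in the hunks above marks a test so the runner pulls it out of the parallel pool and runs it on its own, letting it use (nearly) the whole GPU instead of a per-process share. A minimal sketch of how such a marker can work, with hypothetical names; the real decorator lives in torch.testing._internal.common_utils and is honored by the CI test runner:

import functools


def serial_test(enabled=True):
    """Hypothetical stand-in for serialTest(): tag a test function so a
    runner can pull it out of the parallel pool and run it by itself."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            return fn(*args, **kwargs)
        wrapper._run_serially = enabled
        return wrapper
    return decorator


def partition_tests(tests):
    """Split collected test callables into parallel and serial buckets."""
    serial = [t for t in tests if getattr(t, "_run_serially", False)]
    parallel = [t for t in tests if not getattr(t, "_run_serially", False)]
    return parallel, serial


@serial_test()
def test_needs_most_of_the_gpu():  # hypothetical example test
    pass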
@@ -10,6 +10,14 @@ from tools.stats.import_test_stats import get_disabled_tests
 from tools.testing.test_run import ShardedTest, TestRun


+try:
+    from torch.testing._internal.common_cuda import SM80OrLater
+    from torch.testing._internal.common_utils import TEST_CUDA
+except ImportError:
+    TEST_CUDA = False
+    SM80OrLater = False
+
+
 if TYPE_CHECKING:
     from collections.abc import Sequence

@@ -18,14 +26,13 @@ REPO_ROOT = Path(__file__).resolve().parents[2]

 IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
 BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "")
-USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT

 # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job
 # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs
 # used to run tests. If they are not equal, the only consequence should be
 # unequal shards.
 IS_ROCM = os.path.exists("/opt/rocm")
-NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if USE_3_PROCS else 2
+NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if not TEST_CUDA or SM80OrLater else 2
 NUM_PROCS_FOR_SHARDING_CALC = NUM_PROCS if not IS_ROCM or IS_MEM_LEAK_CHECK else 2
 THRESHOLD = 60 * 10  # 10 minutes

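To make the new NUM_PROCS expression concrete, here is how it resolves for a few configurations (a sketch with the three inputs passed explicitly; in the real script IS_MEM_LEAK_CHECK comes from the environment and TEST_CUDA / SM80OrLater from the torch.testing._internal imports added above):

def resolve_num_procs(is_mem_leak_check, test_cuda, sm80_or_later):
    # Mirrors the NUM_PROCS expression from the diff above.
    return 1 if is_mem_leak_check else 3 if not test_cuda or sm80_or_later else 2


assert resolve_num_procs(True, True, True) == 1     # mem-leak-check jobs: single process
assert resolve_num_procs(False, False, False) == 3  # non-CUDA jobs: 3 procs
assert resolve_num_procs(False, True, True) == 3    # CUDA on sm80+ (sm86/sm89/sm90): 3 procs
assert resolve_num_procs(False, True, False) == 2   # older CUDA GPUs: 2 procs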