mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[CI][testing] Use 3 processes for testing on sm89 and sm90 jobs (#158691)
3 procs were used for sm86, but when we switched to sm89 the build-environment check no longer matched, so it fell back to 2 procs. sm90 is H100; I'm not sure which unit tests run there, but I assume those GPUs also have plenty of memory. These jobs use larger runners with more GPU memory, so this is usually fine: roughly ~22GB total, which works out to ~10GB per proc with 2 procs or ~6GB per proc with 3 (the CUDA context takes maybe 1GB). I've applied skips to the tests that OOMed. Time decreases from ~2.7hr per test job to ~2hr. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158691 Approved by: https://github.com/huydhn
committed by PyTorch MergeBot
parent 9535995bbc
commit 561193e5f2
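A rough sketch of the memory arithmetic quoted in the commit message (Python, illustrative only; the ~22GB total and ~1GB CUDA-context figures are the approximations given above, not measured values):

TOTAL_GPU_MEM_GB = 22   # approximate memory on the larger runners, per the commit message
CUDA_CONTEXT_GB = 1     # approximate per-process CUDA context overhead

def mem_per_proc_gb(num_procs: int) -> float:
    # Usable memory each test process can roughly count on.
    return TOTAL_GPU_MEM_GB / num_procs - CUDA_CONTEXT_GB

print(mem_per_proc_gb(2))  # ~10 GB per proc with 2 processes
print(mem_per_proc_gb(3))  # ~6.3 GB per proc with 3 processes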
@@ -1212,6 +1212,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
     @supported_platform
     @common_utils.parametrize("head_dim", [17, 24, 94, 121])
     @common_utils.parametrize("dtype", test_dtypes_fast)
+    @common_utils.serialTest()
     def test_non_pow_2_headdim(self, device, dtype, head_dim):
         self.run_test(
             _rel_bias, dtype, B, Hq, S, head_dim, B, Hkv, S, head_dim, device=device
@@ -9,6 +9,7 @@ from torch._dynamo.utils import same
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
 from torch.testing import FileCheck
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
@@ -211,6 +212,7 @@ class InplacePaddingTest(TestCase):

     @requires_cuda_with_enough_memory(2e10)
     @inductor_config.patch(force_shape_pad=True)
+    @serialTest()
     def test_linear_and_cel(self):
         # Use nan for torch.empty
         torch.use_deterministic_algorithms(True)
@@ -4,6 +4,7 @@ import torch
 import torch._inductor
 from torch._dynamo.utils import counters
 from torch._inductor.test_case import run_tests, TestCase
+from torch.testing._internal.common_utils import serialTest
 from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu


@@ -62,6 +63,7 @@ class TestKernelOptimization(TestCase):
         },
         post_grad_fusion_options={},
     )
+    @serialTest()  # Needs slightly more memory on GPUs
     def test_einsum_to_pointwise(self):
         counters.clear()
         module = TestEinsumtoPointwise().to(GPU_TYPE)
@@ -10,6 +10,14 @@ from tools.stats.import_test_stats import get_disabled_tests
 from tools.testing.test_run import ShardedTest, TestRun


+try:
+    from torch.testing._internal.common_cuda import SM80OrLater
+    from torch.testing._internal.common_utils import TEST_CUDA
+except ImportError:
+    TEST_CUDA = False
+    SM80OrLater = False
+
+
 if TYPE_CHECKING:
     from collections.abc import Sequence

@@ -18,14 +26,13 @@ REPO_ROOT = Path(__file__).resolve().parents[2]

 IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
 BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "")
-USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT

 # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job
 # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs
 # used to run tests. If they are not equal, the only consequence should be
 # unequal shards.
 IS_ROCM = os.path.exists("/opt/rocm")
-NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if USE_3_PROCS else 2
+NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 3 if not TEST_CUDA or SM80OrLater else 2
 NUM_PROCS_FOR_SHARDING_CALC = NUM_PROCS if not IS_ROCM or IS_MEM_LEAK_CHECK else 2
 THRESHOLD = 60 * 10  # 10 minutes

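For reference, a small sketch of how the new NUM_PROCS expression resolves (plain Python re-implementing the chained conditional from the diff above; the scenarios are illustrative):

def num_procs(is_mem_leak_check: bool, test_cuda: bool, sm80_or_later: bool) -> int:
    # Same chained conditional as the NUM_PROCS line above.
    return 1 if is_mem_leak_check else 3 if not test_cuda or sm80_or_later else 2

assert num_procs(True, True, True) == 1     # mem-leak-check jobs stay single-process
assert num_procs(False, True, True) == 3    # CUDA jobs on sm80+ (sm86/sm89/sm90) get 3 procs
assert num_procs(False, True, False) == 2   # older CUDA GPUs keep 2 procs
assert num_procs(False, False, False) == 3  # non-CUDA jobs (TEST_CUDA falls back to False) get 3

Note that the try/except fallback added earlier preserves the effect of the removed '"cuda" not in BUILD_ENVIRONMENT' clause: if torch cannot be imported, TEST_CUDA is False and the job still gets 3 processes.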