Disable test_fs family for dynamo (#91459)

This should help address https://github.com/pytorch/pytorch/issues/67002.  At the end of these tests, any temp files `/dev/shm/torch_*` are cleaned up, but sometimes the cleanup takes longer than 0.5s, causing the test to fail.  This PR increases the maximum waiting time to 5s while keeping the 0.5s polling interval.  Because the cleanup still occasionally fails under TorchDynamo, the `test_fs` family is additionally skipped when `TEST_WITH_TORCHDYNAMO` is set (tracked in https://github.com/pytorch/pytorch/issues/91467).
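
Roughly, the new wait behaves like the sketch below (illustrative only; `shm_files_linger` and `POLL_INTERVAL_IN_SECONDS` are made-up names, not the identifiers used in the test file):

```python
import glob
import time

# Assumed constants mirroring the PR's intent: poll every 0.5s, give up after 5s.
MAX_WAITING_TIME_IN_SECONDS = 5
POLL_INTERVAL_IN_SECONDS = 0.5

def shm_files_linger(pattern='/dev/shm/torch_*'):
    """Return True if any matching temp file is still present after the wait budget."""
    remaining = glob.glob(pattern)
    waited = 0.0
    while remaining and waited <= MAX_WAITING_TIME_IN_SECONDS:
        time.sleep(POLL_INTERVAL_IN_SECONDS)
        waited += POLL_INTERVAL_IN_SECONDS
        remaining = glob.glob(pattern)
    return bool(remaining)
```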

### Testing
`pytest test_multiprocessing.py -k test_fs --verbose --flake-finder` to run `test_fs`, `test_fs_is_shared`, `test_fs_pool`, `test_fs_preserve_sharing`, and `test_fs_sharing` 50 times on a dynamo shard.  All pass.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/91459
Approved by: https://github.com/kit1980, https://github.com/ZainRizvi, https://github.com/atalman
Authored by Huy Do on 2022-12-29 00:26:57 +00:00, committed by PyTorch MergeBot
parent f012d0ea5b
commit dbd0d76515


@@ -15,7 +15,7 @@ import torch.multiprocessing as mp
 import torch.utils.hooks
 from torch.nn import Parameter
 from torch.testing._internal.common_utils import (TestCase, run_tests, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN,
-                                                  load_tests, slowTest, TEST_WITH_TSAN)
+                                                  load_tests, slowTest, TEST_WITH_TSAN, TEST_WITH_TORCHDYNAMO)
 
 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -23,6 +23,7 @@ load_tests = load_tests
 
 TEST_REPEATS = 30
 HAS_SHM_FILES = os.path.isdir('/dev/shm')
+MAX_WAITING_TIME_IN_SECONDS = 5
 TEST_CUDA_IPC = torch.cuda.is_available() and \
     sys.platform != 'darwin' and \
     sys.platform != 'win32'
@@ -219,10 +220,19 @@ class leak_checker(object):
     def has_shm_files(self, wait=True):
         if not HAS_SHM_FILES:
             return False
+
         result = self._has_shm_files()
-        if result and mp.get_sharing_strategy() == 'file_system' and wait:
-            time.sleep(0.5)
-            return self._has_shm_files()
+        if not result or mp.get_sharing_strategy() != 'file_system' or not wait:
+            return result
+
+        total_waiting_time = 0
+        waiting_time = 0.5
+
+        while total_waiting_time <= MAX_WAITING_TIME_IN_SECONDS and result:
+            time.sleep(waiting_time)
+            total_waiting_time += waiting_time
+            result = self._has_shm_files()
+
         return result
 
     def _has_shm_files(self):
@@ -342,19 +352,27 @@ class TestMultiprocessing(TestCase):
     @unittest.skipIf(TEST_WITH_ASAN,
                      "seems to hang with ASAN, see https://github.com/pytorch/pytorch/issues/5326")
+    @unittest.skipIf(TEST_WITH_TORCHDYNAMO,
+                     "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
     def test_fs_sharing(self):
         with fs_sharing():
             self._test_sharing(repeat=TEST_REPEATS)
 
+    @unittest.skipIf(TEST_WITH_TORCHDYNAMO,
+                     "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
     def test_fs_preserve_sharing(self):
         with fs_sharing():
             self._test_preserve_sharing(repeat=TEST_REPEATS)
 
+    @unittest.skipIf(TEST_WITH_TORCHDYNAMO,
+                     "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
     def test_fs_pool(self):
         with fs_sharing():
             self._test_pool(repeat=TEST_REPEATS)
 
     @unittest.skipIf(not HAS_SHM_FILES, "don't not how to check if shm files exist")
+    @unittest.skipIf(TEST_WITH_TORCHDYNAMO,
+                     "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
     def test_fs(self):
         def queue_put():
             x = torch.DoubleStorage(4)