mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-11 22:34:53 +08:00
Use filelock to build extension_device backend one at a time (#137930)
Fixes https://github.com/pytorch/pytorch/issues/136125 Fixes https://github.com/pytorch/pytorch/issues/137026 Fixes https://github.com/pytorch/pytorch/issues/137027 The compilation fails during `setUpClass`, so disabling the test doesn't do anything. My theory for this flaky issue is that `test_open_device_registration` from both `TritonExtensionBackendTests` and `ExtensionBackendTests` are run in parallel, and one is cleaned up while the other is still in flight, causing flaky failures. Here is an example failure: https://github.com/pytorch/pytorch/actions/runs/11331105492/job/31512603585#step:22:1710 Pull Request resolved: https://github.com/pytorch/pytorch/pull/137930 Approved by: https://github.com/malfet
This commit is contained in:
@ -22,6 +22,8 @@ except ImportError:
|
||||
ExtensionWrapperCodegen,
|
||||
)
|
||||
|
||||
from filelock import FileLock, Timeout
|
||||
|
||||
import torch._inductor.config as config
|
||||
from torch._inductor import cpu_vec_isa, metrics
|
||||
from torch._inductor.codegen import cpp_utils
|
||||
@ -51,10 +53,20 @@ TestCase = test_torchinductor.TestCase
|
||||
class BaseExtensionBackendTests(TestCase):
|
||||
module = None
|
||||
|
||||
# Use a lock file so that only one test can build this extension at a time
|
||||
lock_file = "extension_device.lock"
|
||||
lock = FileLock(lock_file)
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
super().setUpClass()
|
||||
|
||||
try:
|
||||
cls.lock.acquire(timeout=600)
|
||||
except Timeout:
|
||||
# This shouldn't happen, still attempt to build the extension anyway
|
||||
pass
|
||||
|
||||
# Build Extension
|
||||
torch.testing._internal.common_utils.remove_cpp_extensions_build_root()
|
||||
source_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||
@ -77,6 +89,10 @@ class BaseExtensionBackendTests(TestCase):
|
||||
|
||||
torch.testing._internal.common_utils.remove_cpp_extensions_build_root()
|
||||
|
||||
if os.path.exists(cls.lock_file):
|
||||
os.remove(cls.lock_file)
|
||||
cls.lock.release()
|
||||
|
||||
def setUp(self):
|
||||
torch._dynamo.reset()
|
||||
super().setUp()
|
||||
|
||||
Reference in New Issue
Block a user