Use filelock to build extension_device backend one at a time (#137930)

Fixes https://github.com/pytorch/pytorch/issues/136125
Fixes https://github.com/pytorch/pytorch/issues/137026
Fixes https://github.com/pytorch/pytorch/issues/137027

The compilation fails during `setUpClass`, so disabling the test doesn't do anything.  The theory I have for this flaky issue is that `test_open_device_registration` from both `TritonExtensionBackendTests` and `ExtensionBackendTests` are run in parallel, and one is cleaned up while the other is still in flight, causing flaky failures.

Here is an example failure https://github.com/pytorch/pytorch/actions/runs/11331105492/job/31512603585#step:22:1710

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137930
Approved by: https://github.com/malfet
This commit is contained in:
Huy Do
2024-10-15 17:46:26 +00:00
committed by PyTorch MergeBot
parent 60eb3fccfa
commit 0b7ef196cd

View File

@ -22,6 +22,8 @@ except ImportError:
ExtensionWrapperCodegen,
)
from filelock import FileLock, Timeout
import torch._inductor.config as config
from torch._inductor import cpu_vec_isa, metrics
from torch._inductor.codegen import cpp_utils
@ -51,10 +53,20 @@ TestCase = test_torchinductor.TestCase
class BaseExtensionBackendTests(TestCase):
module = None
# Use a lock file so that only one test can build this extension at a time
lock_file = "extension_device.lock"
lock = FileLock(lock_file)
@classmethod
def setUpClass(cls):
super().setUpClass()
try:
cls.lock.acquire(timeout=600)
except Timeout:
# This shouldn't happen, still attempt to build the extension anyway
pass
# Build Extension
torch.testing._internal.common_utils.remove_cpp_extensions_build_root()
source_file_path = os.path.dirname(os.path.abspath(__file__))
@ -77,6 +89,10 @@ class BaseExtensionBackendTests(TestCase):
torch.testing._internal.common_utils.remove_cpp_extensions_build_root()
if os.path.exists(cls.lock_file):
os.remove(cls.lock_file)
cls.lock.release()
def setUp(self):
torch._dynamo.reset()
super().setUp()