[ROCm] HIP Lazy Streams (#119996)

For ROCm/HIP, each stream is lazily initialized instead of all streams being created when the first stream is requested. HIP streams are not as lightweight as CUDA streams, so the pooling strategy can affect performance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/119996 Approved by: https://github.com/ezyang
2025-10-20 21:14:14 +08:00 · 2024-02-20 16:24:04 +00:00
parent 26fbbc3e84
commit d3839b624b
2 changed files with 49 additions and 12 deletions
--- a/test/test_cuda_trace.py
+++ b/test/test_cuda_trace.py
@@ -74,7 +74,14 @@ class TestCudaTrace(TestCase):
    def test_stream_creation_callback(self):
        cuda_trace.register_callback_for_cuda_stream_creation(self.mock)

-        torch.cuda.Stream()
+        # see Note [HIP Lazy Streams]
+        if torch.version.hip:
+            user_stream = torch.cuda.Stream()
+            with torch.cuda.stream(user_stream):
+                tensor = torch.ones(5, device="cuda")
+        else:
+            torch.cuda.Stream()
+
        self.mock.assert_called()

    def test_device_synchronization_callback(self):