Add a deterministic mode that skips the on-device benchmarking we know can affect numerics. This includes:
- pad-mm
- dynamic rblock scaling
- template autotuning
- coordinate descent tuning for reductions
- reduction config autotuning in CachingAutotuner. For reductions, both RBLOCK and num_warps affect numerics; XBLOCK does not, so XBLOCK can still be autotuned for reductions.
- benchmarking for the computation/communication reordering pass

The mode definitely comes with a performance hit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163589
Approved by: https://github.com/v0i0
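For context, the mode is driven by the `deterministic` flag on `torch._inductor.config`, which the tests below patch. A minimal usage sketch (the `"cuda"` device string and the `matmul` name are illustrative; the tests use `GPU_TYPE`):

```python
import torch
import torch._inductor.config as inductor_config

# Sketch: compile under deterministic mode so inductor skips the
# numerics-affecting on-device benchmarking listed above (pad-mm,
# template autotuning, reduction coordesc tuning, ...), at some perf cost.
with inductor_config.patch(deterministic=True):

    @torch.compile()
    def matmul(x, y):
        return x @ y

    a = torch.rand(2048, 2048, device="cuda")  # assumes a CUDA device
    b = torch.rand(2048, 2048, device="cuda")
    out = matmul(a, b)
```

The test file below exercises each of these knobs.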
# Owner(s): ["module: inductor"]
import contextlib
import unittest

import torch
import torch._inductor.config as inductor_config
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import fresh_cache
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)
from torch.testing._internal.inductor_utils import (
    GPU_TYPE,
    HAS_CUDA_AND_TRITON,
    IS_BIG_GPU,
)


@instantiate_parametrized_tests
class DeterministicTest(TestCase):
    def setUp(self) -> None:
        super().setUp()
        # Run each test under a fresh inductor cache so compilation actually
        # happens and the benchmarking counters are meaningful.
        self._exit_stack = contextlib.ExitStack()
        self._exit_stack.enter_context(fresh_cache())

    def tearDown(self) -> None:
        self._exit_stack.close()
        super().tearDown()

    @parametrize("deterministic", [False, True])
    def test_mm_padding(self, deterministic):
        with inductor_config.patch(deterministic=deterministic):

            @torch.compile()
            def foo(x, y):
                return x @ y

            # 2049x2049 is deliberately unaligned, so pad-mm would normally
            # benchmark whether padding pays off; deterministic mode skips it.
            inps = [torch.rand([2049, 2049], device=GPU_TYPE) for _ in range(2)]
            out = foo(*inps)
            self.assertEqual(out, inps[0] @ inps[1])

            if deterministic:
                self.assertTrue(counters["inductor"]["pad_mm_bench"] == 0)
            else:
                self.assertTrue(counters["inductor"]["pad_mm_bench"] > 0)

    @parametrize("deterministic", [False, True])
    @inductor_config.patch(max_autotune=True)
    @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
    def test_max_autotune(self, deterministic):
        with inductor_config.patch(deterministic=deterministic):

            @torch.compile()
            def foo(x, y):
                return x @ y

            inps = [torch.rand([2048, 2048], device=GPU_TYPE) for _ in range(2)]
            out = foo(*inps)
            self.assertEqual(out, inps[0] @ inps[1])

            if deterministic:
                self.assertTrue(counters["inductor"]["select_algorithm_autotune"] == 0)
            else:
                self.assertTrue(counters["inductor"]["select_algorithm_autotune"] > 0)

    def test_pointwise_coordesc_tuning(self):
        # Pointwise kernels have no reduction dimension, so their coordinate
        # descent tuning does not affect numerics and is expected to benchmark.
        @torch.compile(mode="max-autotune")
        def f(x):
            return x + 1

        x = torch.randn(2048, device=GPU_TYPE)
        self.assertEqual(f(x), x + 1)

        self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0)

    @parametrize("deterministic", [False, True])
    def test_reduction_coordesc_tuning(self, deterministic):
        with inductor_config.patch(
            deterministic=deterministic, coordinate_descent_tuning=True
        ):

            @torch.compile()
            def foo(x):
                return x.sum(dim=-1)

            inp = torch.rand([2048, 2048], device=GPU_TYPE)

            out = foo(inp)
            self.assertEqual(out, inp.sum(dim=-1))

            if deterministic:
                self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] == 0)
            else:
                self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0)


if __name__ == "__main__":
    if HAS_CUDA_AND_TRITON:
        run_tests()