pytorch/test/inductor/test_deterministic.py
Shunting Zhang 40b25578e4 [Inductor] deterministic mode (#163589)
Add a deterministic mode that skips the on-device benchmarking that we know can affect numerics. This includes:
- pad-mm
- dynamic rblock scaling
- template autotuning
- coordinate descent tuning for reductions
- reduction config autotuning in CachingAutotuner. For reductions, both RBLOCK and num_warps can affect numerics; XBLOCK does not, so we can still autotune XBLOCK for reductions.
- benchmarking for the computation-communication reordering pass

The mode definitely has a perf hit.
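
For reference, a minimal sketch of how the mode can be toggled, using the same inductor_config.patch(deterministic=...) knob that the test below exercises; the surrounding code (function, shapes, device) is illustrative only:

    import torch
    import torch._inductor.config as inductor_config

    def matmul(x, y):
        return x @ y

    compiled = torch.compile(matmul)

    # With deterministic=True, inductor skips the benchmarking-driven
    # choices listed above (pad-mm, template autotuning, reduction
    # coordesc tuning, ...), trading some performance for run-to-run
    # stable numerics.
    with inductor_config.patch(deterministic=True):
        x = torch.rand(2049, 2049, device="cuda")
        y = torch.rand(2049, 2049, device="cuda")
        out = compiled(x, y)  # compilation happens at this first call, under the patch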

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163589
Approved by: https://github.com/v0i0
2025-10-04 01:05:08 +00:00


# Owner(s): ["module: inductor"]
import contextlib
import unittest

import torch
import torch._inductor.config as inductor_config
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import fresh_cache
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)
from torch.testing._internal.inductor_utils import (
    GPU_TYPE,
    HAS_CUDA_AND_TRITON,
    IS_BIG_GPU,
)


@instantiate_parametrized_tests
class DeterministicTest(TestCase):
    def setUp(self) -> None:
        super().setUp()
        self._exit_stack = contextlib.ExitStack()
        self._exit_stack.enter_context(fresh_cache())

    def tearDown(self) -> None:
        self._exit_stack.close()
        super().tearDown()

    @parametrize("deterministic", [False, True])
    def test_mm_padding(self, deterministic):
        with inductor_config.patch(deterministic=deterministic):

            @torch.compile()
            def foo(x, y):
                return x @ y

            inps = [torch.rand([2049, 2049], device=GPU_TYPE) for _ in range(2)]
            out = foo(*inps)
            self.assertEqual(out, inps[0] @ inps[1])

            # Deterministic mode skips the pad-mm benchmarking entirely.
            if deterministic:
                self.assertTrue(counters["inductor"]["pad_mm_bench"] == 0)
            else:
                self.assertTrue(counters["inductor"]["pad_mm_bench"] > 0)

    @parametrize("deterministic", [False, True])
    @inductor_config.patch(max_autotune=True)
    @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
    def test_max_autotune(self, deterministic):
        with inductor_config.patch(deterministic=deterministic):

            @torch.compile()
            def foo(x, y):
                return x @ y

            inps = [torch.rand([2048, 2048], device=GPU_TYPE) for _ in range(2)]
            out = foo(*inps)
            self.assertEqual(out, inps[0] @ inps[1])

            # Template autotuning is skipped under deterministic mode.
            if deterministic:
                self.assertTrue(counters["inductor"]["select_algorithm_autotune"] == 0)
            else:
                self.assertTrue(counters["inductor"]["select_algorithm_autotune"] > 0)

    def test_pointwise_coordesc_tuning(self):
        # Without deterministic mode, coordinate descent tuning benchmarks
        # pointwise kernels as usual.
        @torch.compile(mode="max-autotune")
        def f(x):
            return x + 1

        x = torch.randn(2048, device=GPU_TYPE)
        self.assertEqual(f(x), x + 1)
        self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0)

    @parametrize("deterministic", [False, True])
    def test_reduction_coordesc_tuning(self, deterministic):
        with inductor_config.patch(
            deterministic=deterministic, coordinate_descent_tuning=True
        ):

            @torch.compile()
            def foo(x):
                return x.sum(dim=-1)

            inp = torch.rand([2048, 2048], device=GPU_TYPE)
            out = foo(inp)
            self.assertEqual(out, inp.sum(dim=-1))

            # Coordinate descent tuning for reductions is skipped in
            # deterministic mode.
            if deterministic:
                self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] == 0)
            else:
                self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0)


if __name__ == "__main__":
    if HAS_CUDA_AND_TRITON:
        run_tests()