remove fast_flush arguments (#135387)

I've removed the `fast_flush` argument from upstream Triton in https://github.com/triton-lang/triton/pull/4485. It looks like most call sites in our code already use the default value of `fast_flush=True`; two PRs from @pearu pass `False`, but to my knowledge there is no reason to use the `False` value.
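
For out-of-tree callers that must run against both old and new Triton, one option is to probe the signature before passing the keyword. A minimal sketch, assuming `triton.testing.do_bench` is available (the `do_bench_compat` helper is hypothetical, not part of this PR):

    import inspect

    import triton.testing


    def do_bench_compat(fn, warmup=25, rep=100):
        # Pass fast_flush only on Triton builds that still accept it; the
        # argument was removed upstream in triton-lang/triton#4485.
        kwargs = {}
        if "fast_flush" in inspect.signature(triton.testing.do_bench).parameters:
            kwargs["fast_flush"] = True  # matches the old default
        return triton.testing.do_bench(fn, warmup=warmup, rep=rep, **kwargs)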

Differential Revision: [D62325778](https://our.internmc.facebook.com/intern/diff/D62325778)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135387
Approved by: https://github.com/nmacchioni, https://github.com/jansel
Author:    Jez Ng
Date:      2024-09-12 18:52:20 -07:00
Committer: PyTorch MergeBot
Parent:    7dc1788396
Commit:    b346e99376

7 changed files with 9 additions and 17 deletions


@@ -28,9 +28,7 @@ def create_blocked_tensor(B, M, N, blocksize, sparsity, dtype, device):
 def _test_worker(test_func):
-    ms, ms_min, ms_max = benchmarker.benchmark_gpu(
-        test_func, warmup=500, rep=100, fast_flush=False
-    )
+    ms, ms_min, ms_max = benchmarker.benchmark_gpu(test_func, warmup=500, rep=100)
     tflops = 2 * m * k * n * 1e-12 / (ms * 1e-3)
     return ms, tflops


@@ -309,7 +309,7 @@ class MultiKernelCall:
             return inner
 
         return [
-            benchmarker.benchmark_gpu(wrap_fn(kernel), rep=40, fast_flush=True)
+            benchmarker.benchmark_gpu(wrap_fn(kernel), rep=40)
             for kernel in self.kernels
         ]


@@ -2503,7 +2503,7 @@ class TritonKernel(SIMDKernel):
         result.writeline("args = get_args()")
         result.writeline(
-            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40, fast_flush=True)"
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
         )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")


@@ -994,7 +994,7 @@ class ComboKernel(Kernel):
         result.writeline("args = get_args()")
         result.writeline(
-            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40, fast_flush=True)"
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
        )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")


@@ -672,7 +672,7 @@ class CachingAutotuner(KernelInterface):
             return do_bench_using_profiling(kernel_call, warmup=10, rep=40)
-        return benchmarker.benchmark_gpu(kernel_call, rep=40, fast_flush=True)
+        return benchmarker.benchmark_gpu(kernel_call, rep=40)
 
     def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]:
         from ..compile_fx import clone_preserve_strides


@@ -120,9 +120,7 @@ def benchmark_all_kernels(benchmark_name, benchmark_all_configs):
                 f" {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}"
             )
         else:
-            ms = benchmarker.benchmark_gpu(
-                lambda: kernel_mod.call(args), rep=40, fast_flush=True
-            )
+            ms = benchmarker.benchmark_gpu(lambda: kernel_mod.call(args), rep=40)
             assert (
                 len(triton_kernel.launchers) == 1
             ), "Autotuner should have selected the best config"


@@ -507,9 +507,7 @@ def optimize_scatter_mm(
     def test_func():
         return bsr_scatter_mm(bsr, dense, indices_data=indices_data)
 
-    ms_min = triton.testing.do_bench(
-        test_func, warmup=500, rep=100, fast_flush=False
-    )
+    ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100)
 
     return ms_min
@@ -663,7 +661,7 @@ def tune_bsr_dense_addmm(
             input, bsr, dense, beta=beta, alpha=alpha, meta=meta, out=out
         )
 
-        return triton.testing.do_bench(test_func, warmup=500, rep=100, fast_flush=False)
+        return triton.testing.do_bench(test_func, warmup=500, rep=100)
 
     # The step function that increments a specified meta parameter:
     def step_meta_parameter(name, value, direction, meta, M=M, N=N, K=K, BM=BM, BK=BK):
@@ -866,9 +864,7 @@ def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True):
         else:
             raise NotImplementedError(op)
 
-        ms_min = triton.testing.do_bench(
-            test_func, warmup=500, rep=100, fast_flush=False
-        )
+        ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100)
 
         return ms_min