remove fast_flush arguments (#135387)

I've removed the `fast_flush` argument from upstream Triton in https://github.com/triton-lang/triton/pull/4485. It looks like most call sites in our code already use the default value of `fast_flush=True`; two PRs from @pearu pass `False`, but to my knowledge there is no reason to use the `False` value.
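
For out-of-tree callers that must run against both old and new Triton, one option is to probe the signature before passing the keyword. A minimal sketch, assuming `triton.testing.do_bench` is available (the `do_bench_compat` helper is hypothetical, not part of this PR):

    import inspect

    import triton.testing


    def do_bench_compat(fn, warmup=25, rep=100):
        # Pass fast_flush only on Triton builds that still accept it; the
        # argument was removed upstream in triton-lang/triton#4485.
        kwargs = {}
        if "fast_flush" in inspect.signature(triton.testing.do_bench).parameters:
            kwargs["fast_flush"] = True  # matches the old default
        return triton.testing.do_bench(fn, warmup=warmup, rep=rep, **kwargs)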

Differential Revision: [D62325778](https://our.internmc.facebook.com/intern/diff/D62325778)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135387
Approved by: https://github.com/nmacchioni, https://github.com/jansel
Author:    Jez Ng
Date:      2024-09-12 18:52:20 -07:00
Committer: PyTorch MergeBot
Parent:    7dc1788396
Commit:    b346e99376

7 changed files with 9 additions and 17 deletions


@@ -28,9 +28,7 @@ def create_blocked_tensor(B, M, N, blocksize, sparsity, dtype, device):
 def _test_worker(test_func):
-    ms, ms_min, ms_max = benchmarker.benchmark_gpu(
-        test_func, warmup=500, rep=100, fast_flush=False
-    )
+    ms, ms_min, ms_max = benchmarker.benchmark_gpu(test_func, warmup=500, rep=100)
     tflops = 2 * m * k * n * 1e-12 / (ms * 1e-3)
     return ms, tflops


@@ -309,7 +309,7 @@ class MultiKernelCall:
             return inner
 
         return [
-            benchmarker.benchmark_gpu(wrap_fn(kernel), rep=40, fast_flush=True)
+            benchmarker.benchmark_gpu(wrap_fn(kernel), rep=40)
             for kernel in self.kernels
         ]


@@ -2503,7 +2503,7 @@ class TritonKernel(SIMDKernel):
         result.writeline("args = get_args()")
         result.writeline(
-            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40, fast_flush=True)"
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
         )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")


@@ -994,7 +994,7 @@ class ComboKernel(Kernel):
         result.writeline("args = get_args()")
         result.writeline(
-            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40, fast_flush=True)"
+            "ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)"
        )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")


@@ -672,7 +672,7 @@ class CachingAutotuner(KernelInterface):
             return do_bench_using_profiling(kernel_call, warmup=10, rep=40)
-        return benchmarker.benchmark_gpu(kernel_call, rep=40, fast_flush=True)
+        return benchmarker.benchmark_gpu(kernel_call, rep=40)
 
     def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]:
         from ..compile_fx import clone_preserve_strides


@@ -120,9 +120,7 @@ def benchmark_all_kernels(benchmark_name, benchmark_all_configs):
                 f" {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}"
             )
         else:
-            ms = benchmarker.benchmark_gpu(
-                lambda: kernel_mod.call(args), rep=40, fast_flush=True
-            )
+            ms = benchmarker.benchmark_gpu(lambda: kernel_mod.call(args), rep=40)
             assert (
                 len(triton_kernel.launchers) == 1
             ), "Autotuner should have selected the best config"


@@ -507,9 +507,7 @@ def optimize_scatter_mm(
     def test_func():
         return bsr_scatter_mm(bsr, dense, indices_data=indices_data)
 
-    ms_min = triton.testing.do_bench(
-        test_func, warmup=500, rep=100, fast_flush=False
-    )
+    ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100)
 
     return ms_min
@@ -663,7 +661,7 @@ def tune_bsr_dense_addmm(
             input, bsr, dense, beta=beta, alpha=alpha, meta=meta, out=out
         )
 
-        return triton.testing.do_bench(test_func, warmup=500, rep=100, fast_flush=False)
+        return triton.testing.do_bench(test_func, warmup=500, rep=100)
 
     # The step function that increments a specified meta parameter:
     def step_meta_parameter(name, value, direction, meta, M=M, N=N, K=K, BM=BM, BK=BK):
@@ -866,9 +864,7 @@ def main(op="scatter_mm", force=False, dtype=torch.float16, verbose=True):
         else:
             raise NotImplementedError(op)
 
-        ms_min = triton.testing.do_bench(
-            test_func, warmup=500, rep=100, fast_flush=False
-        )
+        ms_min = triton.testing.do_bench(test_func, warmup=500, rep=100)
 
         return ms_min