mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-06 00:54:56 +08:00
Summary: Adding NEON specializations of Vectorized<T> for int8, int16, int32 and int64. Correctness has been checked using test_ops.py and the comprehensive torch test operator_benchmark_test.py has been enhanced by adding cases of bitwise operations, boolean ops and integer ops. The benchmark, which uses the PyTorch API, shows significant enhancements in a wide variety of operations: Before: bitwise xor: 779.882us boolean any: 636.209us boolean all: 538.621us integer mul: 304.457us integer asr: 447.997us After: bitwise xor: 680.221us ---> 15% higher throughput boolean any: 391.468us ---> 63% higher throughput boolean all: 390.189us ---> 38% higher throughput integer mul: 193.532us ---> 57% higher throughput integer asr: 179.929us ---> 149% higher throughput Test Plan: Correctness: buck2 test @mode/opt //caffe2/test:test_ops buck2 test @mode/opt //caffe2/test:torch buck2 test @mode/opt //caffe2/test/distributed/launcher/fb:fb_run_test Performance: buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test Differential Revision: D84424638 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165273 Approved by: https://github.com/malfet
74 lines
1.8 KiB
Python
74 lines
1.8 KiB
Python
import operator_benchmark as op_bench
|
|
|
|
import torch
|
|
|
|
|
|
"""Microbenchmarks for boolean operators. Supports both Caffe2 and PyTorch."""
|
|
|
|
# Configs for the PT "all" operator: full cross-product of every listed
# M/N/K size with every device, run only under the "long" tag.
all_long_configs = op_bench.cross_product_configs(
    M=[8, 128], N=[32, 64], K=[256, 512], device=["cpu", "cuda"], tags=["long"]
)
|
|
|
|
|
|
# Quick "short" run: a hand-picked list of (M, N, K) shapes, each crossed
# with every device in cross_product_configs.
all_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 1, 1],  # trivial single-element edge case
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        "device": ["cpu", "cuda"],
    },
    tags=["short"],
)
|
|
|
|
|
|
class AllBenchmark(op_bench.TorchBenchmarkBase):
    """Microbenchmark for ``torch.all`` reducing a 3-D boolean tensor."""

    def init(self, M, N, K, device):
        # Random 0/1 values cast to bool so the reduction sees a mix of
        # True and False across runs.
        bool_input = torch.randint(0, 2, (M, N, K), device=device, dtype=torch.bool)
        self.inputs = {"input_one": bool_input}
        self.set_module_name("all")

    def forward(self, input_one):
        return torch.all(input_one)
|
|
|
|
|
|
# The generated test names based on all_short_configs will be in the following
# pattern (M/N/K come from the attrs above, e.g. [1, 1, 1]):
# all_M1_N1_K1_devicecpu
# all_M1_N1_K1_devicecpu_bwdall
# all_M1_N1_K1_devicecpu_bwd1
# all_M1_N1_K1_devicecpu_bwd2
# ...
# Those names can be used to filter tests.
op_bench.generate_pt_test(all_long_configs + all_short_configs, AllBenchmark)
|
|
|
|
"""Microbenchmark for the any operator."""
|
|
|
|
|
|
class AnyBenchmark(op_bench.TorchBenchmarkBase):
    """Microbenchmark for ``torch.any`` reducing a 2-D boolean tensor."""

    def init(self, M, N, device):
        # 2-D input here, versus the 3-D input used for the "all" benchmark.
        rand_bools = torch.randint(0, 2, (M, N), device=device, dtype=torch.bool)
        self.inputs = {"input_one": rand_bools}
        self.set_module_name("any")

    def forward(self, input_one):
        return torch.any(input_one)
|
|
|
|
|
|
# Sizes for the "any" benchmark; every (M, N) pair is crossed with each device.
any_configs = op_bench.cross_product_configs(
    M=[8, 256],
    N=[256, 16],
    device=["cpu", "cuda"],
    tags=["any"],
)

op_bench.generate_pt_test(any_configs, AnyBenchmark)
|
|
|
|
if __name__ == "__main__":
    # Entry point: dispatch to the shared operator_benchmark runner.
    op_bench.benchmark_runner.main()
|