mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-11-04 16:04:58 +08:00 
			
		
		
		
	Summary: Adding NEON specializations of Vectorized<double> Correcness has been checked using test_ops.py and running torch test Test Plan: Correctness: buck2 test mode/opt //caffe2/test:test_ops buck2 test mode/opt //caffe2/test:torch Performance: Added torch.float64 as data type to test within binary_test.py Reviewed By: mcfi Differential Revision: D84924406 Pull Request resolved: https://github.com/pytorch/pytorch/pull/166092 Approved by: https://github.com/malfet
		
			
				
	
	
		
			209 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			209 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import operator_benchmark as op_bench
 | 
						|
 | 
						|
import torch
 | 
						|
 | 
						|
 | 
						|
"""Microbenchmarks for binary operators."""
 | 
						|
 | 
						|
 | 
						|
# Benchmark ops performance with broadcast
 | 
						|
binary_ops_bcast_list = op_bench.op_list(
 | 
						|
    attr_names=["op_name", "op_func"],
 | 
						|
    attrs=[
 | 
						|
        ["add", torch.add],
 | 
						|
        ["sub", torch.sub],
 | 
						|
        ["div", torch.div],
 | 
						|
        ["mul", torch.mul],
 | 
						|
    ],
 | 
						|
)
 | 
						|
 | 
						|
# Configs with broadcast
 | 
						|
binary_configs_broadcast = op_bench.config_list(
 | 
						|
    attr_names=["in_one", "in_two"],
 | 
						|
    attrs=[
 | 
						|
        [[64, 1, 64], [1, 64, 1]],
 | 
						|
    ],
 | 
						|
    cross_product_configs={
 | 
						|
        "device": ["cpu"],
 | 
						|
        "dtype": [torch.float, torch.bfloat16, torch.float64],
 | 
						|
    },
 | 
						|
    tags=["short"],
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
 | 
						|
    def init(self, in_one, in_two, dtype, device, op_func):
 | 
						|
        self.inputs = {
 | 
						|
            "in_one": torch.randn(in_one, device=device).to(dtype=dtype),
 | 
						|
            "in_two": torch.randn(in_two, device=device).to(dtype=dtype),
 | 
						|
        }
 | 
						|
        self.op_func = op_func
 | 
						|
 | 
						|
    def forward(self, in_one, in_two):
 | 
						|
        return self.op_func(in_one, in_two)
 | 
						|
 | 
						|
 | 
						|
op_bench.generate_pt_tests_from_op_list(
 | 
						|
    binary_ops_bcast_list, binary_configs_broadcast, BinaryOpBcastBenchmark
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
# Benchmark ops performance without broadcast
 | 
						|
binary_ops_list = op_bench.op_list(
 | 
						|
    attr_names=["op_name", "op_func"],
 | 
						|
    attrs=[
 | 
						|
        ["add", torch.add],
 | 
						|
        ["sub", torch.sub],
 | 
						|
        ["div", torch.div],
 | 
						|
        ["mul", torch.mul],
 | 
						|
        ["asr", torch.bitwise_right_shift],
 | 
						|
        ["lsl", torch.bitwise_left_shift],
 | 
						|
        ["xor", torch.bitwise_xor],
 | 
						|
    ],
 | 
						|
)
 | 
						|
 | 
						|
binary_short_configs = op_bench.config_list(
 | 
						|
    attr_names=["M", "N", "K"],
 | 
						|
    attrs=[
 | 
						|
        [1, 1, 1],
 | 
						|
        [64, 64, 64],
 | 
						|
        [64, 64, 128],
 | 
						|
    ],
 | 
						|
    cross_product_configs={
 | 
						|
        "device": ["cpu", "cuda"],
 | 
						|
        "dtype_one": [torch.int32, torch.uint8],
 | 
						|
        "dtype_two": [torch.int32, torch.uint8],
 | 
						|
    },
 | 
						|
    tags=["short"],
 | 
						|
)
 | 
						|
 | 
						|
binary_long_configs = op_bench.cross_product_configs(
 | 
						|
    M=[8, 128],
 | 
						|
    N=[32, 64],
 | 
						|
    K=[256, 512],
 | 
						|
    device=["cpu", "cuda"],
 | 
						|
    dtype_one=[torch.int8, torch.int32, torch.uint8],
 | 
						|
    dtype_two=[torch.int8, torch.int32, torch.uint8],
 | 
						|
    tags=["long"],
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
 | 
						|
    def init(self, M, N, K, device, dtype_one, dtype_two, op_func):
 | 
						|
        self.inputs = {
 | 
						|
            "input_one": torch.randn(M, N, K, device=device).to(dtype=dtype_one),
 | 
						|
            "input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two),
 | 
						|
        }
 | 
						|
        self.op_func = op_func
 | 
						|
 | 
						|
    def forward(self, input_one, input_two):
 | 
						|
        return self.op_func(input_one, input_two)
 | 
						|
 | 
						|
 | 
						|
op_bench.generate_pt_tests_from_op_list(
 | 
						|
    binary_ops_list, binary_short_configs + binary_long_configs, BinaryOpBenchmark
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
######
 | 
						|
# Benchmark ops performance for boolean dtype
 | 
						|
######
 | 
						|
 | 
						|
 | 
						|
# Benchmark ops performance with broadcast
 | 
						|
binary_ops_bcast_list = op_bench.op_list(
 | 
						|
    attr_names=["op_name", "op_func"],
 | 
						|
    attrs=[["logical_and", torch.logical_and]],
 | 
						|
)
 | 
						|
 | 
						|
# Configs with broadcast
 | 
						|
binary_configs_broadcast = op_bench.config_list(
 | 
						|
    attr_names=["in_one", "in_two"],
 | 
						|
    attrs=[
 | 
						|
        [[64, 1, 64], [1, 64, 1]],
 | 
						|
    ],
 | 
						|
    cross_product_configs={
 | 
						|
        "device": ["cpu"],
 | 
						|
        "dtype": [torch.bool],
 | 
						|
    },
 | 
						|
    tags=["short"],
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
 | 
						|
    def init(self, in_one, in_two, dtype, device, op_func):
 | 
						|
        self.inputs = {
 | 
						|
            "in_one": torch.bernoulli(0.5 * torch.ones(in_one, device=device)).to(
 | 
						|
                dtype=dtype
 | 
						|
            ),
 | 
						|
            "in_two": torch.bernoulli(0.5 * torch.ones(in_two, device=device)).to(
 | 
						|
                dtype=dtype
 | 
						|
            ),
 | 
						|
        }
 | 
						|
        self.op_func = op_func
 | 
						|
 | 
						|
    def forward(self, in_one, in_two):
 | 
						|
        return self.op_func(in_one, in_two)
 | 
						|
 | 
						|
 | 
						|
op_bench.generate_pt_tests_from_op_list(
 | 
						|
    binary_ops_bcast_list, binary_configs_broadcast, BinaryOpBcastBenchmark
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
# Benchmark ops performance without broadcast
 | 
						|
binary_ops_list = op_bench.op_list(
 | 
						|
    attr_names=["op_name", "op_func"],
 | 
						|
    attrs=[["logical_and", torch.logical_and]],
 | 
						|
)
 | 
						|
 | 
						|
binary_short_configs = op_bench.config_list(
 | 
						|
    attr_names=["M", "N", "K"],
 | 
						|
    attrs=[
 | 
						|
        [1, 1, 1],
 | 
						|
        [64, 64, 64],
 | 
						|
        [64, 64, 128],
 | 
						|
    ],
 | 
						|
    cross_product_configs={
 | 
						|
        "device": ["cpu", "cuda"],
 | 
						|
        "dtype_one": [torch.bool],
 | 
						|
        "dtype_two": [torch.bool],
 | 
						|
    },
 | 
						|
    tags=["short"],
 | 
						|
)
 | 
						|
 | 
						|
binary_long_configs = op_bench.cross_product_configs(
 | 
						|
    M=[8, 128],
 | 
						|
    N=[32, 64],
 | 
						|
    K=[256, 512],
 | 
						|
    device=["cpu", "cuda"],
 | 
						|
    dtype_one=[torch.bool, torch.bool],
 | 
						|
    dtype_two=[torch.bool, torch.bool],
 | 
						|
    tags=["long"],
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
 | 
						|
    def init(self, M, N, K, device, dtype_one, dtype_two, op_func):
 | 
						|
        self.inputs = {
 | 
						|
            "input_one": torch.bernoulli(0.5 * torch.ones(M, N, K, device=device)).to(
 | 
						|
                dtype=dtype_one
 | 
						|
            ),
 | 
						|
            "input_two": torch.bernoulli(0.5 * torch.ones(M, N, K, device=device)).to(
 | 
						|
                dtype=dtype_two
 | 
						|
            ),
 | 
						|
        }
 | 
						|
        self.op_func = op_func
 | 
						|
 | 
						|
    def forward(self, input_one, input_two):
 | 
						|
        return self.op_func(input_one, input_two)
 | 
						|
 | 
						|
 | 
						|
op_bench.generate_pt_tests_from_op_list(
 | 
						|
    binary_ops_list, binary_short_configs + binary_long_configs, BinaryOpBenchmark
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
if __name__ == "__main__":
 | 
						|
    op_bench.benchmark_runner.main()
 |