import argparse
import itertools
import os

# from . import conv  # noqa: F401
# from . import normalization  # noqa: F401
# from . import pooling  # noqa: F401
from . import (  # noqa: F401
    attention,
    benchmark,
    broadcast,
    concat,
    elementwise,
    matmul,
    reduction,
    rnn_eltwise,
    softmax,
    swish,
    tensor_engine,
)


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Benchmark operators in specific shapes.
Works only with Python3.\n A few examples:
  * benchmark.py: runs all the default configs with all the benchmarks.
  * benchmark.py reduce: runs all the default configs with all benchmarks with a prefix 'reduce'
  * benchmark.py layernorm_fwd_cpu_128_32_128_128: run a particular benchmark in that config""",
    )
    parser.add_argument(
        "benchmark_names",
        type=str,
        default=None,
        nargs="*",
        help="name of the benchmark to run",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu,cuda",
        help="a comma separated list of device names",
    )
    parser.add_argument(
        "--mode",
        type=str,
        default="fwd,both",
        help="a comma separated list of running modes",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float32",
        help="a comma separated list of data types: {float32[default], float16}",
    )
    parser.add_argument(
        "--input-iter",
        type=str,
        default=None,
        help="a comma separated list of tensor dimensions that includes a start, "
        "stop, and increment that can be constant or a power of 2 "
        "{start:stop:inc,start:stop:pow2}",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="pt",
        help="the underlying tensor engine. only pt for now",
    )
    parser.add_argument(
        "--jit-mode",
        "--jit_mode",
        type=str,
        default="trace",
        help="the jit mode to use: one of {trace, none}",
    )
    parser.add_argument(
        "--cuda-pointwise-loop-levels",
        "--cuda_pointwise_loop_levels",
        type=int,
        default=None,
        help="num of loop levels for Cuda pointwise operations: 2 or 3",
    )
    parser.add_argument(
        "--cuda-pointwise-block-count",
        "--cuda_pointwise_block_count",
        type=int,
        default=None,
        help="num of blocks for Cuda pointwise operations",
    )
    parser.add_argument(
        "--cuda-pointwise-block-size",
        "--cuda_pointwise_block_size",
        type=int,
        default=None,
        help="block size for Cuda pointwise operations",
    )
    parser.add_argument(
        "--cuda-fuser",
        "--cuda_fuser",
        type=str,
        default="te",
        help="The Cuda fuser backend to use: one of {te, nvf, old, none}",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="stdout",
        help="The output format of the benchmark run {stdout[default], json}",
    )
    parser.add_argument(
        "--print-ir",
        action="store_true",
        help="Print the IR graph of the Fusion.",
    )
    parser.add_argument(
        "--print-kernel",
        action="store_true",
        help="Print generated kernel(s).",
    )
    parser.add_argument(
        "--no-dynamic-shape",
        action="store_true",
        help="Disable shape randomization in dynamic benchmarks.",
    )
    parser.add_argument(
        "--cpu-fusion",
        "--cpu_fusion",
        default=False,
        action="store_true",
        help="Enable CPU fusion.",
    )
    parser.add_argument(
        "--cat-wo-conditionals",
        "--cat_wo_conditionals",
        default=False,
        action="store_true",
        help="Enable CAT wo conditionals.",
    )
    args = parser.parse_args()
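
    # Map the --cuda-fuser choice onto the TorchScript JIT flags below:
    # "te" enables the NNC/TensorExpr fuser, "old" the legacy GPU fuser,
    # and "nvf" nvFuser; any other value raises a ValueError.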
    if args.cuda_fuser == "te":
        import torch

        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_texpr_fuser_enabled(True)
        torch._C._jit_override_can_fuse_on_gpu(True)
        torch._C._get_graph_executor_optimize(True)
    elif args.cuda_fuser == "old":
        import torch

        torch._C._jit_set_profiling_executor(False)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_override_can_fuse_on_gpu(True)
    elif args.cuda_fuser == "nvf":
        import torch

        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_set_nvfuser_enabled(True)
        torch._C._get_graph_executor_optimize(True)
    else:
        raise ValueError(f"Undefined fuser: {args.cuda_fuser}")

    if args.cpu_fusion:
        import torch

        torch._C._jit_override_can_fuse_on_cpu(True)
    else:
        import torch

        torch._C._jit_override_can_fuse_on_cpu(False)

    if args.cat_wo_conditionals:
        import torch

        torch._C._jit_cat_wo_conditionals(True)
    else:
        import torch

        torch._C._jit_cat_wo_conditionals(False)

    def set_global_threads(num_threads):
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        os.environ["MKL_NUM_THREADS"] = str(num_threads)
        os.environ["TVM_NUM_THREADS"] = str(num_threads)
        os.environ["NNC_NUM_THREADS"] = str(num_threads)

    devices = args.device.split(",")
    # accept 'gpu' as an alias for the 'cuda' device
    devices = ["cuda" if device == "gpu" else device for device in devices]
    cpu_count = 0
    for index, device in enumerate(devices):
        if device.startswith("cpu"):
            cpu_count += 1
            if cpu_count > 1:
                raise ValueError(
                    f"more than one CPU device is not allowed: {cpu_count:d}"
                )
            if device == "cpu":
                continue
            num_threads_str = device[3:]
            try:
                # see if the device is in 'cpu1' or 'cpu4' format
                num_threads = int(num_threads_str)
                set_global_threads(num_threads)
                devices[index] = "cpu"
            except ValueError:
                continue

    modes = args.mode.split(",")

    datatypes = args.dtype.split(",")
    for index, dtype in enumerate(datatypes):
        # use a None default so an unknown dtype name hits the error below
        datatypes[index] = getattr(torch, dtype, None)
        if not datatypes[index]:
            raise AttributeError(f"DataType: {dtype} is not valid!")

    tensor_engine.set_engine_mode(args.engine)

    def run_default_configs(bench_cls, allow_skip=True):
        for mode, device, dtype, config in itertools.product(
            modes, devices, datatypes, bench_cls.default_configs()
        ):
            bench = bench_cls(mode, device, dtype, *config)
            bench.output_type = args.output
            bench.jit_mode = args.jit_mode
            if not bench.is_supported():
                if allow_skip:
                    continue
                else:
                    raise ValueError(
                        f"attempted to run an unsupported benchmark: {bench.desc()}"
                    )
            bench.run(args)
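
    # --input-iter grammar: "start:stop:inc[,start:stop:inc,...]", one spec per
    # tensor dimension; inc may be an integer step, "pow2", or "pow2+1".
    # Example (illustrative, not a default): "--input-iter 64:256:pow2,1:3:1"
    # expands to dims [64, 128, 256] x [1, 2, 3], and every combination is run.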
    def run_with_input_iter(bench_cls, input_iter, allow_skip=True):
        tensor_dim_specs = input_iter.split(",")
        tensor_dim_specs = [dim.split(":") for dim in tensor_dim_specs]

        configs = []
        for start, stop, inc in tensor_dim_specs:
            dim_list = []
            if inc == "pow2":
                curr = int(start)
                while curr <= int(stop):
                    dim_list.append(curr)
                    curr <<= 1
            elif inc == "pow2+1":
                curr = int(start)
                while curr <= int(stop):
                    dim_list.append(curr)
                    curr -= 1
                    curr <<= 1
                    curr += 1
            else:
                dim_list = list(range(int(start), int(stop) + int(inc), int(inc)))
            configs.append(dim_list)
        configs = itertools.product(*configs)

        for mode, device, dtype, config in itertools.product(
            modes, devices, datatypes, list(configs)
        ):
            bench = bench_cls(mode, device, dtype, *config)
            bench.output_type = args.output
            bench.jit_mode = args.jit_mode
            if not bench.is_supported():
                if allow_skip:
                    continue
                else:
                    raise ValueError(
                        f"attempted to run an unsupported benchmark: {bench.desc()}"
                    )
            bench.run(args)

    benchmark_classes = benchmark.benchmark_classes

    if not args.benchmark_names:
        # by default, run all the benchmarks
        for benchmark_cls in benchmark_classes:
            run_default_configs(benchmark_cls, allow_skip=True)
    else:
        for name in args.benchmark_names:
            # if the name is the prefix of a benchmark class, run all the benchmarks for that class
            match_class_name = False
            for bench_cls in benchmark_classes:
                if name in bench_cls.module():
                    match_class_name = True
                    if (args.input_iter is not None) and bench_cls.input_iterable():
                        run_with_input_iter(
                            bench_cls, args.input_iter, allow_skip=True
                        )
                    else:
                        if args.input_iter is not None:
                            print(
                                f"WARNING: Incompatible benchmark class called with input_iter arg: {name}"
                            )
                        run_default_configs(bench_cls, allow_skip=True)

            if match_class_name:
                continue

            # if not a class module, parse the config and call it that way
            match_class_name = False
            for bench_cls in benchmark_classes:
                cls_module = bench_cls.module()
                if name.startswith(cls_module):
                    match_class_name = True
                    if name[len(cls_module)] != "_":
                        raise ValueError(f"invalid name: {name}")
                    config_str = name[(len(cls_module) + 1) :]
                    config = config_str.split("_")
                    if len(config) < 2:
                        raise ValueError(f"invalid config: {config}")
                    mode, device = config[0:2]
                    # TODO: make sure virtual devices such as 'cpu1' and 'cpu4' are supported.
                    if mode not in ["fwd", "both"]:
                        raise ValueError(f"invalid mode: {mode}")
                    for i, entry in enumerate(config):
                        try:
                            value = int(entry)
                            config[i] = value
                        except ValueError:
                            pass
                    # TODO: output dtype in the config and parse it back from the str
                    bench = bench_cls(config[0], config[1], torch.float32, *config[2:])
                    bench.jit_mode = args.jit_mode
                    bench.output_type = args.output
                    bench.run(args)

            if not match_class_name:
                available_classes = ", ".join(
                    [bench_cls.module() for bench_cls in benchmark_classes]
                )
                raise ValueError(
                    f"invalid name: {name}\nAvailable benchmark classes:\n{available_classes}"
                )


if __name__ == "__main__":
    main()