From 521dbbfaffcc45a907b0a8d816f4af7d588ffcb5 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 5 Jan 2024 21:23:30 +0000 Subject: [PATCH] Remove cpp/tensorexpr benchmarks (#116868) Summary: These refer to a deprecated backend of torchscript which is no longer built in releases, and require llvm to be built. Test Plan: ``` python setup.py develop ``` Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/116868 Approved by: https://github.com/hl475, https://github.com/chenyang78, https://github.com/eellison, https://github.com/mikekgfb --- CMakeLists.txt | 1 - benchmarks/cpp/CMakeLists.txt | 2 - benchmarks/cpp/convolution.cpp | 313 --------- benchmarks/cpp/tensorexpr/CMakeLists.txt | 24 - benchmarks/cpp/tensorexpr/bench_approx.cpp | 433 ------------ benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 216 ------ benchmarks/cpp/tensorexpr/bench_compile.cpp | 71 -- benchmarks/cpp/tensorexpr/bench_concat.cpp | 293 --------- .../cpp/tensorexpr/bench_fuser_overhead.cpp | 59 -- benchmarks/cpp/tensorexpr/bench_gemm.cpp | 313 --------- benchmarks/cpp/tensorexpr/bench_kernels.cpp | 101 --- benchmarks/cpp/tensorexpr/bench_ops.py | 113 ---- benchmarks/cpp/tensorexpr/bench_parallel.cpp | 71 -- .../cpp/tensorexpr/bench_prefix_sum.cpp | 395 ----------- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 621 ------------------ .../cpp/tensorexpr/bench_signed_log1p.cpp | 166 ----- benchmarks/cpp/tensorexpr/main.cpp | 3 - caffe2/CMakeLists.txt | 8 - cmake/Summary.cmake | 1 - 19 files changed, 3204 deletions(-) delete mode 100644 benchmarks/cpp/CMakeLists.txt delete mode 100644 benchmarks/cpp/convolution.cpp delete mode 100644 benchmarks/cpp/tensorexpr/CMakeLists.txt delete mode 100644 benchmarks/cpp/tensorexpr/bench_approx.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_compile.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_concat.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_gemm.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_kernels.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_ops.py delete mode 100644 benchmarks/cpp/tensorexpr/bench_parallel.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_reduce.cpp delete mode 100644 benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp delete mode 100644 benchmarks/cpp/tensorexpr/main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ae21c51296dc..0be84cd241ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,7 +187,6 @@ cmake_dependent_option( option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF) option(BUILD_AOT_INDUCTOR_TEST "Build C++ test binaries for aot-inductor" OFF) option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF) -option(BUILD_TENSOREXPR_BENCHMARK "Build C++ binaries for tensorexpr benchmarks (need gbenchmark)" OFF) option(BUILD_MOBILE_BENCHMARK "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF) option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF) option(BUILD_JNI "Build JNI bindings" OFF) diff --git a/benchmarks/cpp/CMakeLists.txt b/benchmarks/cpp/CMakeLists.txt deleted file mode 100644 index d4a6cdec54da..000000000000 --- a/benchmarks/cpp/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(convolution_bench convolution.cpp) -target_link_libraries(convolution_bench PRIVATE torch_library benchmark) diff --git a/benchmarks/cpp/convolution.cpp b/benchmarks/cpp/convolution.cpp deleted file mode 100644 index 998bad2e46b1..000000000000 --- a/benchmarks/cpp/convolution.cpp +++ /dev/null @@ -1,313 +0,0 @@ -#include -#include -#include -#include -#include - -struct ConvParams { - std::vector input; - std::vector weight; - std::vector bias; - std::vector stride; - std::vector padding; - std::vector dilation; - int64_t groups; -}; - -struct xs { - explicit xs(const std::vector& v_) : v(v_) {} - const std::vector& v; -}; - -std::ostream& operator<<(std::ostream& os, const xs& x) { - bool first = true; - for (auto const& xx : x.v) { - if (!first) { - os << "x"; - } - first = false; - os << xx; - } - return os; -} - -std::ostream& operator<<(std::ostream& os, const ConvParams& params) { - os << "I" << xs(params.input) << "_W" << xs(params.weight) << "_B" - << xs(params.bias) << "_S" << xs(params.stride) << "_P" - << xs(params.padding) << "_D" << xs(params.dilation) << "_G" - << params.groups; - return os; -} - -std::vector MobileNetV3Params = { - {{1, 3, 224, 224}, {16, 3, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 16, 112, 112}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 16, 112, 112}, {16, 1, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 16}, - {{1, 16, 56, 56}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 16, 56, 56}, {72, 16, 1, 1}, {72}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 72, 56, 56}, {72, 1, 3, 3}, {72}, {2, 2}, {1, 1}, {1, 1}, 72}, - {{1, 72, 28, 28}, {24, 72, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 24, 28, 28}, {88, 24, 1, 1}, {88}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 88, 28, 28}, {88, 1, 3, 3}, {88}, {1, 1}, {1, 1}, {1, 1}, 88}, - {{1, 88, 28, 28}, {24, 88, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 24, 28, 28}, {96, 24, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 96, 28, 28}, {96, 1, 5, 5}, {96}, {2, 2}, {2, 2}, {1, 1}, 96}, - {{1, 96, 14, 14}, {40, 96, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240}, - {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240}, - {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 40, 14, 14}, {120, 40, 1, 1}, {120}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 120, 14, 14}, {120, 1, 5, 5}, {120}, {1, 1}, {2, 2}, {1, 1}, 120}, - {{1, 120, 14, 14}, {48, 120, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 48, 14, 14}, {144, 48, 1, 1}, {144}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 144, 14, 14}, {144, 1, 5, 5}, {144}, {1, 1}, {2, 2}, {1, 1}, 144}, - {{1, 144, 14, 14}, {48, 144, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 48, 14, 14}, {288, 48, 1, 1}, {288}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 288, 14, 14}, {288, 1, 5, 5}, {288}, {2, 2}, {2, 2}, {1, 1}, 288}, - {{1, 288, 7, 7}, {96, 288, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576}, - {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576}, - {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 576, 1, 1}, {1280, 576, 1, 1}, {1280}, {1, 1}, {0, 0}, {1, 1}, 1}, -}; - -std::vector ResNet18Params = { - {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {128, 64, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {128, 64, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {256, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {256, 128, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {512, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, -}; - -std::vector ResNet50Params = { - {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 56, 56}, {128, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 128, 56, 56}, {128, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 56, 56}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 28, 28}, {256, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 28, 28}, {256, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 28, 28}, {1024, 512, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {512, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 14, 14}, {512, 512, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1}, - {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 1024, 14, 14}, {2048, 1024, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1}, - {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, - {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1}, - {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1}, -}; - -struct EnableMklDnn { - explicit EnableMklDnn(bool enable) - : prev_(at::globalContext().userEnabledMkldnn()) { - at::globalContext().setUserEnabledMkldnn(enable); - } - - ~EnableMklDnn() { - at::globalContext().setUserEnabledMkldnn(prev_); - } - - bool prev_; -}; - -template -static void BM_conv2d_native( - benchmark::State& state, - const ConvParams& params) { - EnableMklDnn mkl(WithMklDnn); - auto input = at::randn(params.input); - auto weight = at::randn(params.weight); - auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{}; - auto output = at::conv2d( - input, - weight, - bias, - params.stride, - params.padding, - params.dilation, - params.groups); - for (auto _ : state) { - output = at::conv2d( - input, - weight, - bias, - params.stride, - params.padding, - params.dilation, - params.groups); - } - state.counters["GFLOPS/s"] = benchmark::Counter( - 2.0f * output.numel() * weight.numel() / weight.size(0) * - state.iterations(), - benchmark::Counter::kIsRate); - state.counters["GB/s"] = benchmark::Counter( - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()), - benchmark::Counter::kIsRate); -} - -enum MklDnnReorder { - None, - WeightOnly, - WeightAndInput, -}; - -template -static void BM_conv2d_mkldnn( - benchmark::State& state, - const ConvParams& params) { - auto input = at::randn(params.input); - auto weight = at::randn(params.weight); - auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{}; - - if (Reorder == WeightAndInput) { - auto it_input = at::native::itensor_from_mkldnn(input.to_mkldnn()); - auto r = ideep::tensor( - params.input, ideep::data_type::f32, ideep::format_tag::aBcd16b); - it_input.reorder_to(r); - input = at::native::new_with_itensor_mkldnn( - std::move(r), at::kFloat, at::Device(at::kCPU)); - } - - if (Reorder == WeightOnly || Reorder == WeightAndInput) { - weight = at::mkldnn_reorder_conv2d_weight( - weight.to_mkldnn(), - params.padding, - params.stride, - params.dilation, - params.groups); - - bias = params.bias.size() > 0 ? bias.to_mkldnn() : bias; - } - - auto output = at::mkldnn_convolution( - input, - weight, - bias, - params.padding, - params.stride, - params.dilation, - params.groups); - for (auto _ : state) { - output = at::mkldnn_convolution( - input, - weight, - bias, - params.padding, - params.stride, - params.dilation, - params.groups); - } - state.counters["GFLOPS/s"] = benchmark::Counter( - 2.0f * output.numel() * weight.numel() / weight.size(0) * - state.iterations(), - benchmark::Counter::kIsRate); - state.counters["GB/s"] = benchmark::Counter( - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()), - benchmark::Counter::kIsRate); -} - -std::string name( - const char* base, - const char* suffix, - const ConvParams& params) { - std::ostringstream os; - os << base << "_" << suffix << "_" << params; - return os.str(); -} - -void registerOne(const char* base, const ConvParams& params) { - benchmark::RegisterBenchmark( - name(base, "native", params).data(), BM_conv2d_native, params); - benchmark::RegisterBenchmark( - name(base, "native_nomkl", params).data(), - BM_conv2d_native, - params); - benchmark::RegisterBenchmark( - name(base, "mkldnn_none", params).data(), BM_conv2d_mkldnn, params); - benchmark::RegisterBenchmark( - name(base, "mkldnn_weight", params).data(), - BM_conv2d_mkldnn, - params); - benchmark::RegisterBenchmark( - name(base, "mkldnn_input", params).data(), - BM_conv2d_mkldnn, - params); -} - -int main(int argc, char** argv) { - c10::InferenceMode guard; - -#define BENCH(x) \ - for (auto const& params : x##Params) { \ - registerOne(#x, params); \ - } - BENCH(MobileNetV3); - BENCH(ResNet18); - BENCH(ResNet50); -#undef BENCH - - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/benchmarks/cpp/tensorexpr/CMakeLists.txt b/benchmarks/cpp/tensorexpr/CMakeLists.txt deleted file mode 100644 index fbe657bdf2ee..000000000000 --- a/benchmarks/cpp/tensorexpr/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -find_package(AVX) - -add_executable( - tensorexpr_bench - bench_approx.cpp - bench_batchnorm.cpp - bench_concat.cpp - bench_compile.cpp - bench_signed_log1p.cpp - bench_fuser_overhead.cpp - bench_gemm.cpp - bench_kernels.cpp - bench_parallel.cpp - bench_prefix_sum.cpp - bench_reduce.cpp - main.cpp) - -if(C_AVX2_FOUND) - message(STATUS "AVX2 compiler support found") - target_compile_options(tensorexpr_bench PUBLIC -mavx2) - target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2) -endif() - -target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark) diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp deleted file mode 100644 index e3276abf99b8..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_approx.cpp +++ /dev/null @@ -1,433 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "caffe2/operators/logit_op.h" -#include "caffe2/operators/tanh_op.h" - -using namespace torch::jit; -using namespace torch::jit::tensorexpr; - -void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) { - auto loops = ln->getLoopStmtsFor(target); - ForPtr inner, tail; - ln->splitWithTail(loops[0], width, &inner, &tail); - ln->vectorize(inner); -} - -void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) { - std::vector loops = ln->getLoopStmtsFor(target); - ForPtr inner, tail; - ln->splitWithTail(loops[0], 16 * 8, &inner, &tail); - ForPtr outer = loops[0]; - ln->vectorize(inner); - ln->splitWithTail(outer, 8, &inner, &tail); - StmtPtr unrolled; - LoopNest::fullUnroll(inner, &unrolled); -} - -static void relu_nnc(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - auto clamp = 0; - torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto A_elem = [&]() { - auto elem = A.load(i); - auto min = FloatImm::make(clamp); - return CompareSelect::make(elem, min, min, elem, kLT); - }(); - return A_elem; - }); - LoopNest ln({B}); - optimizePointwise(&ln, B); - ln.prepareForCodegen(); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::randn({state.range(0)}); - at::Tensor B_t = torch::randn(state.range(0)); - auto B_ref = at::relu(A_t); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(B_t, B_ref)); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["log/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void log_nnc_sleef(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - torch::jit::tensorexpr::Tensor B = - Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); }); - LoopNest ln({B}); - ln.prepareForCodegen(); - vectorize(&ln, B, 8); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::log(A_t); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(B_t, B_ref)); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["log/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void log_nnc_fast(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - torch::jit::tensorexpr::Tensor B = Compute( - "B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); }); - LoopNest ln({B}); - optimizePointwise(&ln, B); - ln.prepareForCodegen(); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::log(A_t); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(B_t, B_ref)); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["log/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void log_nnc_vml(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - torch::jit::tensorexpr::Tensor B = - Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); }); - LoopNest ln({B}); - vectorize(&ln, B, 8); - ln.prepareForCodegen(); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::log(A_t); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(B_t, B_ref)); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["log/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void log_aten(benchmark::State& state) { - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - for (auto _ : state) { - at::log_out(B_t, A_t); - } - state.counters["log/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void logit_nnc_sleef(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - auto clamp = 1e-6f; - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto A_elem = [&]() { - auto elem = A.load(i); - auto min = FloatImm::make(clamp); - auto max = FloatImm::make(1.0f - clamp); - elem = CompareSelect::make(elem, min, min, elem, kLT); - return CompareSelect::make(elem, max, max, elem, kGT); - }(); - return log(A_elem / (FloatImm::make(1.0f) - A_elem)); - }); - LoopNest ln({B}); - ln.prepareForCodegen(); - optimizePointwise(&ln, B); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::logit(A_t, clamp); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref))); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["logit/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void logit_nnc_fast(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - auto clamp = 1e-6f; - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto A_elem = [&]() { - auto elem = A.load(i); - auto min = FloatImm::make(clamp); - auto max = FloatImm::make(1.0f - clamp); - elem = CompareSelect::make(elem, min, min, elem, kLT); - return CompareSelect::make(elem, max, max, elem, kGT); - }(); - return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem)); - }); - LoopNest ln({B}); - ln.prepareForCodegen(); - optimizePointwise(&ln, B); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::logit(A_t, clamp); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref))); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["logit/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void logit_nnc_vml(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - auto clamp = 1e-6f; - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto A_elem = [&]() { - auto elem = A.load(i); - auto min = FloatImm::make(clamp); - auto max = FloatImm::make(1.0f - clamp); - elem = CompareSelect::make(elem, min, min, elem, kLT); - return CompareSelect::make(elem, max, max, elem, kGT); - }(); - return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem)); - }); - LoopNest ln({B}); - ln.prepareForCodegen(); - vectorize(&ln, B, 16); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::logit(A_t, clamp); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref))); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["logit/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void logit_aten(benchmark::State& state) { - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto clamp = 1e-6f; - for (auto _ : state) { - at::native::logit_out(A_t, clamp, B_t); - } - state.counters["logit/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -template -void logit_caffe2_impl(int size, const T* X, T* Y, float eps_ = 1e-6f) { - using namespace caffe2; - ConstEigenVectorMap X_vec(X, size); - EigenVectorMap Y_vec(Y, size); - Y_vec = X_vec.array().min(static_cast(1.0f - eps_)); - Y_vec = Y_vec.array().max(eps_); - Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log(); -} - -static void logit_caffe2(benchmark::State& state) { - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - at::Tensor B_ref = torch::randn({state.range(0)}); - auto N = state.range(0); - auto X = A_t.data_ptr(); - auto Y = B_t.data_ptr(); - auto clamp = 1e-6f; - at::native::logit_out(A_t, clamp, B_ref); - logit_caffe2_impl(N, X, Y, clamp); - TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref))); - - for (auto _ : state) { - logit_caffe2_impl(N, X, Y, clamp); - } - - state.counters["logit/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void tanh_nnc_fast(benchmark::State& state) { - auto N = VarHandle("N", kInt); - BufHandle A("A", {N}, kFloat); - torch::jit::tensorexpr::Tensor B = Compute( - "B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); }); - LoopNest ln({B}); - optimizePointwise(&ln, B); - ln.prepareForCodegen(); - StmtPtr s = ln.root_stmt(); - s = torch::jit::tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(B); - args.emplace_back(A); - args.emplace_back(N); - LLVMCodeGen cg(s, args); - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - auto B_ref = at::tanh(A_t); - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - TORCH_CHECK(at::allclose(B_t, B_ref, 1e-3f, 1e-6f)); - for (auto _ : state) { - cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)}); - } - state.counters["tanh/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void tanh_aten(benchmark::State& state) { - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - for (auto _ : state) { - at::tanh_out(A_t, B_t); - } - state.counters["tanh/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -} - -static void tanh_caffe2(benchmark::State& state) { -#ifdef FBCODE_CAFFE2 - at::Tensor A_t = torch::abs(torch::randn({state.range(0)})); - at::Tensor B_t = torch::randn({state.range(0)}); - at::Tensor B_ref = torch::randn({state.range(0)}); - - auto N = state.range(0); - auto X = A_t.data_ptr(); - auto Y = B_t.data_ptr(); - caffe2::CPUContext c; - auto tanh = caffe2::TanhFunctor(); - at::tanh_out(A_t, B_ref); - tanh(N, X, Y, &c); - TORCH_CHECK(at::native::allclose(B_t, B_ref, 1e-3f, 1e-6f)); - - for (auto _ : state) { - tanh(N, X, Y, &c); - } - state.counters["tanh/s"] = benchmark::Counter( - uint64_t(state.range(0) * state.iterations()), - benchmark::Counter::kIsRate); -#endif -} - -BENCHMARK(relu_nnc)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args( - {2 << 14}); -BENCHMARK(log_nnc_sleef) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(log_nnc_fast) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(log_nnc_vml) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(log_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args( - {2 << 14}); -BENCHMARK(logit_nnc_sleef) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(logit_nnc_fast) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(logit_nnc_vml) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(logit_aten) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(logit_caffe2) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(tanh_nnc_fast) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); -BENCHMARK(tanh_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args( - {2 << 14}); -BENCHMARK(tanh_caffe2) - ->Args({2 << 5}) - ->Args({2 << 8}) - ->Args({2 << 12}) - ->Args({2 << 14}); diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp deleted file mode 100644 index 77e86020f28a..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include -#include -#include -#include -#include -#include - -using namespace torch::jit::tensorexpr; - -namespace { -class BatchNorm : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - N_ = state.range(0); - C_ = state.range(1); - H_ = state.range(2); - W_ = state.range(3); - input_ = torch::ones({N_, C_, H_, W_}); - weight_ = torch::ones({C_}); - bias_ = torch::ones({C_}); - mean_ = torch::ones({C_}) * 0.5f; - var_ = torch::ones({C_}) * 0.1f; - ref_ = at::batch_norm( - input_, - weight_, - bias_, - mean_, - var_, - training_, - momentum_, - eps_, - cudnn_enabled_); - output_ = at::empty_like(ref_); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(ref_, output_)); - state.counters["GB/s"] = benchmark::Counter( - uint64_t(state.iterations()) * (input_.nbytes() + ref_.nbytes()), - benchmark::Counter::kIsRate); - } - - int N_; - int C_; - int H_; - int W_; - at::Tensor input_; - at::Tensor weight_; - at::Tensor bias_; - at::Tensor mean_; - at::Tensor var_; - at::Tensor output_; - at::Tensor ref_; - bool training_{false}; - float momentum_{0.1}; - float eps_{1.0e-5f}; - bool cudnn_enabled_{false}; -}; -} // namespace - -BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) { - for (auto _ : state) { - output_ = at::batch_norm( - input_, - weight_, - bias_, - mean_, - var_, - training_, - momentum_, - eps_, - cudnn_enabled_); - } -} - -BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { - BufHandle input("input", {N_, C_, H_, W_}, kFloat); - BufHandle weight("weight", {C_}, kFloat); - BufHandle bias("bias", {C_}, kFloat); - BufHandle mean("mean", {C_}, kFloat); - BufHandle var("var", {C_}, kFloat); - VarHandle eps("eps", kFloat); - - using axis = const VarHandle&; - Tensor output = - Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { - // Compute affine terms. - auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); - auto weight_v = weight.load(c); - auto bias_v = bias.load(c); - auto alpha = inv_var * weight_v; - auto beta = bias_v - mean.load(c) * alpha; - - return input.load(n, c, h, w) * alpha + beta; - }); - LoopNest nest({output}); - auto loops = nest.getLoopStmtsFor(output); - LoopNest::flatten({loops[2], loops[3]}); - loops = nest.getLoopStmtsFor(output); - LoopNest::flatten({loops[0], loops[1]}); - loops = nest.getLoopStmtsFor(output); - loops[0]->set_parallel(); - nest.prepareForCodegen(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps}); - - std::vector args; - for (auto _ : state) { - args.clear(); - output_ = at::empty_like(input_); - for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) { - args.push_back(t.data_ptr()); - } - args.push_back(eps_); - cg.call(args); - } -} - -BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) { - for (auto _ : state) { - output_ = at::batch_norm( - input_, - weight_, - bias_, - mean_, - var_, - training_, - momentum_, - eps_, - cudnn_enabled_); - output_.relu_(); - } -} - -BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { - BufHandle input("input", {N_, C_, H_, W_}, kFloat); - BufHandle weight("weight", {C_}, kFloat); - BufHandle bias("bias", {C_}, kFloat); - BufHandle mean("mean", {C_}, kFloat); - BufHandle var("var", {C_}, kFloat); - VarHandle eps("eps", kFloat); - - using axis = const VarHandle&; - Tensor output = - Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { - // Compute affine terms. - auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); - auto weight_v = weight.load(c); - auto bias_v = bias.load(c); - auto alpha = inv_var * weight_v; - auto beta = bias_v - mean.load(c) * alpha; - - auto bn = input.load(n, c, h, w) * alpha + beta; - return CompareSelect::make(bn, 0.f, 0.f, bn, kLT); - }); - LoopNest nest({output}); - nest.prepareForCodegen(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps}); - - std::vector args; - for (auto _ : state) { - args.clear(); - output_ = at::empty_like(input_); - for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) { - args.push_back(t.data_ptr()); - } - args.push_back(eps_); - cg.call(args); - } -} - -BENCHMARK_REGISTER_F(BatchNorm, ATen) - ->Args({1, 64, 112, 112}) - ->Args({1, 256, 14, 14}) - ->Args({1, 128, 28, 28}) - ->Args({1, 64, 56, 56}) - ->Args({1, 512, 7, 7}) - ->Args({5, 64, 112, 112}) - ->Args({5, 256, 14, 14}) - ->Args({5, 128, 28, 28}) - ->Args({5, 64, 56, 56}) - ->Args({5, 512, 7, 7}); -BENCHMARK_REGISTER_F(BatchNorm, NNC) - ->Args({1, 64, 112, 112}) - ->Args({1, 256, 14, 14}) - ->Args({1, 128, 28, 28}) - ->Args({1, 64, 56, 56}) - ->Args({1, 512, 7, 7}) - ->Args({5, 64, 112, 112}) - ->Args({5, 256, 14, 14}) - ->Args({5, 128, 28, 28}) - ->Args({5, 64, 56, 56}) - ->Args({5, 512, 7, 7}); -BENCHMARK_REGISTER_F(BatchNorm, ATenRelu) - ->Args({1, 64, 112, 112}) - ->Args({1, 256, 14, 14}) - ->Args({1, 128, 28, 28}) - ->Args({1, 64, 56, 56}) - ->Args({1, 512, 7, 7}) - ->Args({5, 64, 112, 112}) - ->Args({5, 256, 14, 14}) - ->Args({5, 128, 28, 28}) - ->Args({5, 64, 56, 56}) - ->Args({5, 512, 7, 7}); -BENCHMARK_REGISTER_F(BatchNorm, NNCRelu) - ->Args({1, 64, 112, 112}) - ->Args({1, 256, 14, 14}) - ->Args({1, 128, 28, 28}) - ->Args({1, 64, 56, 56}) - ->Args({1, 512, 7, 7}) - ->Args({5, 64, 112, 112}) - ->Args({5, 256, 14, 14}) - ->Args({5, 128, 28, 28}) - ->Args({5, 64, 56, 56}) - ->Args({5, 512, 7, 7}); diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp deleted file mode 100644 index be60f9cd599b..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -#ifdef TORCH_ENABLE_LLVM -namespace te = torch::jit::tensorexpr; - -static void BM_CompileSwish(benchmark::State& state) { - for (auto _ : state) { - constexpr int N = 512; - te::VarHandle n("n", te::kInt); - te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) { - return min6.load(i) + 3.f; - }); - te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); - te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); - for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor.buf()); - } - nest.prepareForCodegen(); - te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); - te::LLVMCodeGen cg(s, {A, sixth, n}); - } -} - -static void BM_CompileSwishLLVMOnly(benchmark::State& state) { - constexpr int N = 512; - te::VarHandle n("n", te::kInt); - te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = te::Compute( - "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; }); - te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); - te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); - for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor.buf()); - } - nest.prepareForCodegen(); - te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); - for (auto _ : state) { - te::LLVMCodeGen cg(s, {A, sixth, n}); - } -} - -BENCHMARK(BM_CompileSwish); -BENCHMARK(BM_CompileSwishLLVMOnly); -#endif // TORCH_ENABLE_LLVM diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp deleted file mode 100644 index b7b97d02e3a8..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -using namespace torch::jit::tensorexpr; - -namespace { - -class ConcatBench : public benchmark::Fixture { - public: - void init(const std::vector> input_sizes, int concat_dim) { - input_sizes_ = std::move(input_sizes); - concat_dim_ = concat_dim; - inputs_.resize(input_sizes_.size()); - for (const auto i : c10::irange(input_sizes_.size())) { - inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]}); - } - output_size_.resize(input_sizes_.front().size()); - for (const auto i : c10::irange(output_size_.size())) { - if (i == static_cast(concat_dim_)) { - output_size_[i] = 0; - for (const auto j : c10::irange(input_sizes_.size())) { - output_size_[i] += input_sizes_[j][i]; - } - } else { - output_size_[i] = input_sizes_.front()[i]; - } - } - ref_ = at::cat(inputs_, concat_dim_); - output_ = at::empty_like(ref_); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(ref_, output_)); - state.counters["GB/s"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_.nbytes(), - benchmark::Counter::kIsRate); - } - - void runATen(benchmark::State& state) { - for (auto _ : state) { - output_ = at::cat(inputs_, concat_dim_); - } - } - - void runNNC(benchmark::State& state) { - size_t num_inputs = inputs_.size(); - size_t num_dims = 2; - - std::vector inputs; - for (size_t i = 0; i < num_inputs; ++i) { - inputs.emplace_back(BufHandle( - "input" + std::to_string(i), - {input_sizes_[i][0], input_sizes_[i][1]}, - kFloat)); - } - - Tensor output = Compute( - "aten_cat", - {output_size_[0], output_size_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - int d = 0; - std::vector cumulative_concat_dim_sizes(num_inputs); - for (const auto i : c10::irange(num_inputs)) { - cumulative_concat_dim_sizes[i] = d; - d += input_sizes_[i][concat_dim_]; - } - auto load = - inputs.back().load(m, n - cumulative_concat_dim_sizes.back()); - for (size_t i = num_inputs - 1; i > 0; --i) { - load = ifThenElse( - CompareSelect::make( - n, IntImm::make(cumulative_concat_dim_sizes[i]), kLT), - inputs[i - 1].load(m, n - cumulative_concat_dim_sizes[i - 1]), - load); - } - return load; - }); - LoopNest nest({output}); - nest.prepareForCodegen(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - std::vector buf_args(inputs.begin(), inputs.end()); - buf_args.push_back(output); - LLVMCodeGen cg(s, buf_args); - - std::vector call_args; - for (auto _ : state) { - output_ = at::empty_like(ref_); - call_args.clear(); - for (const auto& inp : inputs_) { - call_args.push_back(inp.data_ptr()); - } - call_args.push_back(output_.data_ptr()); - cg.call(call_args); - } - } - - void runNNCLoop(benchmark::State& state) { - size_t num_inputs = inputs_.size(); - size_t num_dims = 2; - - TORCH_INTERNAL_ASSERT(concat_dim_ == 1); - - auto output_buf = alloc( - alloc("aten_cat", kHandle), - std::vector( - {alloc(output_size_[0]), alloc(output_size_[1])}), - kFloat); - - std::vector inputs; - std::vector for_stmts(num_inputs); - int cumulative_input_sizes = 0; - for (size_t i = 0; i < num_inputs; ++i) { - inputs.emplace_back(BufHandle( - "input" + std::to_string(i), - {input_sizes_[i][0], input_sizes_[i][1]}, - kFloat)); - std::vector for_vars(num_inputs); - for (const auto d : c10::irange(num_dims)) { - for_vars[d] = - alloc("i" + std::to_string(i) + "_" + std::to_string(d), kInt); - } - auto store = alloc( - output_buf, - std::vector( - {for_vars[0], - alloc(for_vars[1], alloc(cumulative_input_sizes))}), - alloc( - inputs[i].node(), - std::vector({for_vars[0], for_vars[1]}))); - auto for_st = alloc( - for_vars[0], - alloc(0), - alloc(input_sizes_[i][0]), - alloc( - for_vars[1], - alloc(0), - alloc(input_sizes_[i][1]), - store)); - for_stmts[i] = for_st; - cumulative_input_sizes += input_sizes_[i][1]; - } - auto output = Tensor(output_buf, alloc(for_stmts)); - - LoopNest nest({output}); - nest.prepareForCodegen(); - nest.vectorizeInnerLoops(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - std::vector buf_args(inputs.begin(), inputs.end()); - buf_args.push_back(output); - LLVMCodeGen cg(s, buf_args); - - std::vector call_args; - for (auto _ : state) { - output_ = at::empty_like(ref_); - call_args.clear(); - for (const auto& inp : inputs_) { - call_args.push_back(inp.data_ptr()); - } - call_args.push_back(output_.data_ptr()); - cg.call(call_args); - } - } - - std::vector> input_sizes_; - int concat_dim_; - std::vector inputs_; - std::vector output_size_; - at::Tensor output_; - at::Tensor ref_; -}; - -class Concat2D2Input : public ConcatBench { - public: - void SetUp(const benchmark::State& state) override { - init( - {{state.range(0), state.range(1)}, {state.range(2), state.range(3)}}, - state.range(4)); - } -}; - -} // namespace - -BENCHMARK_DEFINE_F(Concat2D2Input, ATen)(benchmark::State& state) { - runATen(state); -} - -BENCHMARK_DEFINE_F(Concat2D2Input, NNC)(benchmark::State& state) { - runNNC(state); -} - -BENCHMARK_DEFINE_F(Concat2D2Input, NNCLoop)(benchmark::State& state) { - runNNCLoop(state); -} - -BENCHMARK_REGISTER_F(Concat2D2Input, ATen) - ->Args({1, 160, 1, 14, 1}) - ->Args({1, 580, 1, 174, 1}) - ->Args({20, 160, 20, 14, 1}) - ->Args({20, 580, 20, 174, 1}) - ->Args({8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D2Input, NNC) - ->Args({1, 160, 1, 14, 1}) - ->Args({1, 580, 1, 174, 1}) - ->Args({20, 160, 20, 14, 1}) - ->Args({20, 580, 20, 174, 1}) - ->Args({8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D2Input, NNCLoop) - ->Args({1, 160, 1, 14, 1}) - ->Args({1, 580, 1, 174, 1}) - ->Args({20, 160, 20, 14, 1}) - ->Args({20, 580, 20, 174, 1}) - ->Args({8, 512, 8, 512, 1}); - -namespace { - -class Concat2D3Input : public ConcatBench { - public: - void SetUp(const benchmark::State& state) override { - init( - {{state.range(0), state.range(1)}, - {state.range(2), state.range(3)}, - {state.range(4), state.range(5)}}, - state.range(6)); - } -}; - -} // namespace - -BENCHMARK_DEFINE_F(Concat2D3Input, ATen)(benchmark::State& state) { - runATen(state); -} - -BENCHMARK_DEFINE_F(Concat2D3Input, NNC)(benchmark::State& state) { - runNNC(state); -} - -BENCHMARK_DEFINE_F(Concat2D3Input, NNCLoop)(benchmark::State& state) { - runNNCLoop(state); -} - -BENCHMARK_REGISTER_F(Concat2D3Input, ATen)->Args({8, 512, 8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D3Input, NNC)->Args({8, 512, 8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D3Input, NNCLoop) - ->Args({8, 512, 8, 512, 8, 512, 1}); - -namespace { - -class Concat2D7Input : public ConcatBench { - public: - void SetUp(const benchmark::State& state) override { - init( - {{state.range(0), state.range(1)}, - {state.range(2), state.range(3)}, - {state.range(4), state.range(5)}, - {state.range(6), state.range(7)}, - {state.range(8), state.range(9)}, - {state.range(10), state.range(11)}, - {state.range(12), state.range(13)}}, - state.range(14)); - } -}; - -} // namespace - -BENCHMARK_DEFINE_F(Concat2D7Input, ATen)(benchmark::State& state) { - runATen(state); -} - -BENCHMARK_DEFINE_F(Concat2D7Input, NNC)(benchmark::State& state) { - runNNC(state); -} - -BENCHMARK_DEFINE_F(Concat2D7Input, NNCLoop)(benchmark::State& state) { - runNNCLoop(state); -} - -BENCHMARK_REGISTER_F(Concat2D7Input, ATen) - ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D7Input, NNC) - ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1}); - -BENCHMARK_REGISTER_F(Concat2D7Input, NNCLoop) - ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1}); diff --git a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp b/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp deleted file mode 100644 index e0da3a38544f..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include - -using namespace torch::jit; - -static const std::string two_adds = R"JIT( -def two_adds(self, x: Tensor, y: Tensor, z: Tensor) -> Tensor: - return x + y + z -)JIT"; - -static void FusedOverhead(benchmark::State& state) { - c10::InferenceMode mode; - overrideCanFuseOnCPU(true); - - Module m("m"); - m.define(two_adds); - - auto x = torch::ones({1}); - auto y = torch::ones({1}); - auto z = torch::ones({1}); - - // Warmup. - for (const auto i : c10::irange(8)) { - (void)i; // Suppress unused variable warning - m.run_method("two_adds", x, y, z); - } - - for (auto _ : state) { - m.run_method("two_adds", x, y, z); - } -} - -static void UnfusedOverhead(benchmark::State& state) { - c10::InferenceMode guard; - overrideCanFuseOnCPU(false); - - Module m("m"); - m.define(two_adds); - - auto x = torch::ones({1}); - auto y = torch::ones({1}); - auto z = torch::ones({1}); - - // Warmup. - for (const auto i : c10::irange(8)) { - (void)i; // Suppress unused variable warning - m.run_method("two_adds", x, y, z); - } - - for (auto _ : state) { - m.run_method("two_adds", x, y, z); - } -} - -BENCHMARK(FusedOverhead); -BENCHMARK(UnfusedOverhead); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp deleted file mode 100644 index 403746578dff..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ /dev/null @@ -1,313 +0,0 @@ -#include -#include -#include -#include -#include - -namespace te = torch::jit::tensorexpr; - -namespace { -class Gemm : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - M = state.range(0); - N = state.range(1); - K = state.range(2); - A = torch::randn({M, K}); - B = torch::randn({K, N}); - C = torch::mm(A, B); - } - - void TearDown(benchmark::State& state) override { - state.counters["GFLOPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * M * N * K, - benchmark::Counter::kIsRate); - } - - int M; - int N; - int K; - at::Tensor A; - at::Tensor B; - at::Tensor C; -}; -} // namespace - -BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) { - for (auto _ : state) { - torch::mm_out(C, A, B); - } -} - -BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { - te::BufHandle AP("A", {M, K}, te::kFloat); - te::BufHandle BP("B", {K, N}, te::kFloat); - te::Tensor CT = te::Reduce( - "gemm", - {M, N}, - te::Sum(), - [&](const te::ExprHandle& m, - const te::ExprHandle& n, - const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {K}); - te::LoopNest loop({CT}); - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); - - for (auto _ : state) { - cg->call({A.data_ptr(), B.data_ptr(), C.data_ptr()}); - } -} - -BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { - te::BufHandle AP("A", {M, K}, te::kFloat); - te::BufHandle BP("B", {K, N}, te::kFloat); - te::Tensor CT = te::Reduce( - "gemm", - {M, N}, - te::Sum(), - [&](const te::ExprHandle& m, - const te::ExprHandle& n, - const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {K}); - te::LoopNest loop({CT}); - - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr m = loops[0]; - loop.splitWithMask(m, 32); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr n = loops[2]; - loop.splitWithMask(n, 32); - } - // mo, mi, no, ni, k -> - // mo, no, mi, ni, k - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[1]; - te::ForPtr no = loops[2]; - loop.reorderAxis(mi, no); - } - // mo, no, mi, ni, k -> - // mo, no, mi, k, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr ni = loops[3]; - te::ForPtr k = loops[4]; - loop.reorderAxis(ni, k); - } - // mo, no, mi, k, ni -> - // mo, no, k, mi, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[2]; - te::ForPtr k = loops[3]; - loop.reorderAxis(mi, k); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); - - for (auto _ : state) { - cg->call({A.data_ptr(), B.data_ptr(), C.data_ptr()}); - } -} - -BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { - te::BufHandle AP("A", {M, K}, te::kFloat); - te::BufHandle BP("B", {K, N}, te::kFloat); - te::Tensor CT = te::Reduce( - "gemm", - {M, N}, - te::Sum(), - [&](const te::ExprHandle& m, - const te::ExprHandle& n, - const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {K}); - te::LoopNest loop({CT}); - - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr m = loops[0]; - loop.splitWithMask(m, 4); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr n = loops[2]; - loop.splitWithMask(n, 16); - } - // mo, mi, no, ni, k -> - // mo, no, mi, ni, k - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[1]; - te::ForPtr no = loops[2]; - loop.reorderAxis(mi, no); - } - // mo, no, mi, ni, k -> - // mo, no, mi, k, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr ni = loops[3]; - te::ForPtr k = loops[4]; - loop.reorderAxis(ni, k); - } - // mo, no, mi, k, ni -> - // mo, no, k, mi, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[2]; - te::ForPtr k = loops[3]; - loop.reorderAxis(mi, k); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); - - for (auto _ : state) { - cg->call({A.data_ptr(), B.data_ptr(), C.data_ptr()}); - } -} - -BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { - te::BufHandle AP("A", {M, K}, te::kFloat); - te::BufHandle BP("B", {K, N}, te::kFloat); - te::Tensor CT = te::Reduce( - "gemm", - {M, N}, - te::Sum(), - [&](const te::ExprHandle& m, - const te::ExprHandle& n, - const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {K}); - te::LoopNest loop({CT}); - - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr m = loops[0]; - loop.splitWithMask(m, 4); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr n = loops[2]; - loop.splitWithMask(n, 16); - } - // mo, mi, no, ni, k -> - // mo, no, mi, ni, k - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[1]; - te::ForPtr no = loops[2]; - loop.reorderAxis(mi, no); - } - // mo, no, mi, ni, k -> - // mo, no, mi, k, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr ni = loops[3]; - te::ForPtr k = loops[4]; - loop.reorderAxis(ni, k); - } - // mo, no, mi, k, ni -> - // mo, no, k, mi, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[2]; - te::ForPtr k = loops[3]; - loop.reorderAxis(mi, k); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[3]; - te::ForPtr ni = loops[4]; - te::StmtPtr unrolled; - loop.vectorize(ni); - loop.fullUnroll(mi, &unrolled); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); - - for (auto _ : state) { - cg->call({A.data_ptr(), B.data_ptr(), C.data_ptr()}); - } -} - -BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { - te::BufHandle AP("A", {M, K}, te::kFloat); - te::BufHandle BP("B", {K, N}, te::kFloat); - te::Tensor CT = te::Reduce( - "gemm", - {M, N}, - te::Sum(), - [&](const te::ExprHandle& m, - const te::ExprHandle& n, - const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {K}); - te::LoopNest loop({CT}); - - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr m = loops[0]; - loop.splitWithMask(m, 4); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr n = loops[2]; - loop.splitWithMask(n, 16); - } - // mo, mi, no, ni, k -> - // mo, no, mi, ni, k - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[1]; - te::ForPtr no = loops[2]; - loop.reorderAxis(mi, no); - } - // mo, no, mi, ni, k -> - // mo, no, mi, k, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr ni = loops[3]; - te::ForPtr k = loops[4]; - loop.reorderAxis(ni, k); - } - // mo, no, mi, k, ni -> - // mo, no, k, mi, ni - { - auto const& loops = loop.getLoopStmtsFor(CT); - te::ForPtr mi = loops[2]; - te::ForPtr k = loops[3]; - loop.reorderAxis(mi, k); - } - { - auto const& loops = loop.getLoopStmtsFor(CT); - loop.cacheAccesses(CT.buf(), "C_regs", loops[2]); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); - - for (auto _ : state) { - cg->call({A.data_ptr(), B.data_ptr(), C.data_ptr()}); - } -} - -BENCHMARK_REGISTER_F(Gemm, Torch)->Args({128, 128, 128}); -BENCHMARK_REGISTER_F(Gemm, TensorExprNoopt)->Args({128, 128, 128}); -BENCHMARK_REGISTER_F(Gemm, TensorExprTile32x32)->Args({128, 128, 128}); -BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16)->Args({128, 128, 128}); -BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16VecUnroll)->Args({128, 128, 128}); -BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16Cache)->Args({128, 128, 128}); diff --git a/benchmarks/cpp/tensorexpr/bench_kernels.cpp b/benchmarks/cpp/tensorexpr/bench_kernels.cpp deleted file mode 100644 index 9a57547984b0..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_kernels.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include - -#include -#include -#include -#include - -using namespace torch::jit; -using namespace torch::jit::tensorexpr; - -static const std::string kernel_static_shapes_template = R"IR( - graph(%0 : Float(${dim}, strides=[1], device=cpu), - %1 : Float(${dim}, strides=[1], device=cpu)): - %2 : Float(${dim}, strides=[1]) = aten::mul(%0, %1) - %4 : Float(${dim}, strides=[1]) = aten::mul(%0, %2) - return (%4))IR"; - -static const std::string kernel_symbolic_shapes = R"IR( - graph(%0 : Float(SS(-2), strides=[1], device=cpu), - %1 : Float(SS(-2), strides=[1], device=cpu), - %SS_2 : int): - %2 : Float(SS(-2), strides=[1]) = aten::mul(%0, %1) - %4 : Float(SS(-2), strides=[1]) = aten::mul(%0, %2) - return (%4))IR"; - -class KernelBench : public benchmark::Fixture { - public: - void Eager(benchmark::State& state) { - auto dim = state.range(0); - auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - - for (auto _ : state) { - auto o = at::mul(a, at::mul(a, b)); - } - } - - void GraphWithStaticShapes(benchmark::State& state) { - auto dim = state.range(0); - auto graph = std::make_shared(); - at::jit::TemplateEnv env; - env.d("dim", dim); - const auto kernel_static_shapes = - format(kernel_static_shapes_template, env); - parseIR(kernel_static_shapes, &*graph); - TensorExprKernel k(graph); - - auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - std::vector inputs = {a, b}; - - for (auto _ : state) { - std::vector stack = at::fmap(inputs); - k.run(stack); - } - } - - void GraphWithSymbolicShapes(benchmark::State& state) { - auto dim = state.range(0); - auto graph = std::make_shared(); - parseIR(kernel_symbolic_shapes, &*graph); - - std::vector input_desc = { - torch::jit::StrideInput::TENSOR_CONT}; - std::unordered_map< - const torch::jit::Value*, - std::vector> - symbolic_strides; - symbolic_strides[graph->inputs().at(0)] = input_desc; - symbolic_strides[graph->inputs().at(1)] = input_desc; - symbolic_strides[graph->outputs().at(0)] = input_desc; - std::vector symbolic_shape_inputs = {-2}; - TensorExprKernel k( - graph, {}, symbolic_shape_inputs, false, symbolic_strides); - - auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - std::vector inputs = {a, b}; - - for (auto _ : state) { - std::vector stack = at::fmap(inputs); - stack.push_back(dim); - k.run(stack); - } - } -}; - -BENCHMARK_DEFINE_F(KernelBench, Eager)(benchmark::State& state) { - Eager(state); -} - -BENCHMARK_DEFINE_F(KernelBench, StaticShapes)(benchmark::State& state) { - GraphWithStaticShapes(state); -} -BENCHMARK_DEFINE_F(KernelBench, SymbolicShapes)(benchmark::State& state) { - GraphWithSymbolicShapes(state); -} - -BENCHMARK_REGISTER_F(KernelBench, Eager)->Range(32, 2048); -BENCHMARK_REGISTER_F(KernelBench, StaticShapes)->Range(32, 2048); -BENCHMARK_REGISTER_F(KernelBench, SymbolicShapes)->Range(32, 2048); diff --git a/benchmarks/cpp/tensorexpr/bench_ops.py b/benchmarks/cpp/tensorexpr/bench_ops.py deleted file mode 100644 index 3956d7a02a28..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_ops.py +++ /dev/null @@ -1,113 +0,0 @@ -import timeit - -import torch -import torch.nn.functional as F - -torch._C._jit_override_can_fuse_on_cpu(True) -torch._C._debug_set_fusion_group_inlining(False) -torch.set_num_threads(1) - - -def hardswish(x): - return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0 - - -unary_ops = [ - hardswish, - torch._C._nn.hardswish, - torch.sigmoid, - torch.reciprocal, - torch.neg, - torch.relu, - torch.isnan, - torch.log, - torch.log10, - torch.log1p, - torch.log2, - torch.exp, - torch.expm1, - torch.erf, - torch.erfc, - torch.cos, - torch.sin, - torch.tan, - torch.acos, - torch.asin, - torch.cosh, - torch.sinh, - torch.atan, - torch.tanh, - torch.sqrt, - torch.rsqrt, - torch.abs, - torch.ceil, - torch.floor, - torch.round, - torch.trunc, - torch.lgamma, -] - -print(f"{'op':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}") - -for op in unary_ops: - x = torch.rand((1024, 1024)) - traced = torch.jit.trace(op, (x)) - - # Warmup. - warmup_iters = 8 - for _ in range(warmup_iters): - op(x) - traced(x) - - # Validate result. - torch.testing.assert_close(op(x), traced(x)) - - # Benchmark. - bench_iters = 100 - teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters) - tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters) - print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}") - - -def test_batch_norm(): - op = F.batch_norm - print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}") - batch_norm_shapes = [ - [1, 64, 112, 112], - [1, 256, 14, 14], - [1, 128, 28, 28], - [1, 64, 56, 56], - [1, 512, 7, 7], - [5, 64, 112, 112], - [5, 256, 14, 14], - [5, 128, 28, 28], - [5, 64, 56, 56], - [5, 512, 7, 7], - ] - for n, c, h, w in batch_norm_shapes: - x = torch.rand((n, c, h, w)) - y = torch.rand(c) - z = torch.rand(c) - traced = torch.jit.trace(op, (x, y, z)) - - # Warmup. - warmup_iters = 8 - for _ in range(warmup_iters): - op(x, y, z) - traced(x, y, z) - - # Validate result. - torch.testing.assert_close(op(x, y, z), traced(x, y, z)) - - # Benchmark. - bench_iters = 100 - teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters) - tjit = timeit.timeit( - stmt="traced(x, y, z)", globals=locals(), number=bench_iters - ) - print( - f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}" - ) - - -test_batch_norm() diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp deleted file mode 100644 index 8d77a459c603..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace torch { -namespace jit { -namespace tensorexpr { - -class ParallelAdd : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - at::set_num_threads(4); - torch::manual_seed(0x12345678); - M = state.range(0); - A = torch::randn({M}); - B = torch::randn({M}); - C = torch::zeros({M}); - } - - void TearDown(benchmark::State& state) override { - state.counters["tasks"] = benchmark::Counter( - uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate); - } - - int M; - at::Tensor A; - at::Tensor B; - at::Tensor C; -}; - -BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { - BufHandle a_buf("a", {M}, kFloat); - BufHandle b_buf("b", {M}, kFloat); - Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) { - return a_buf.load(m) + b_buf.load(m); - }); - LoopNest loop_nest({c_tensor}); - auto const& loops = loop_nest.getLoopStmtsFor(c_tensor); - ForPtr m = loops[0]; - m->set_parallel(); - loop_nest.prepareForCodegen(); - StmtPtr stmt = loop_nest.root_stmt(); - LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf}); - - float* a_ptr = A.data_ptr(); - float* b_ptr = B.data_ptr(); - float* c_ptr = C.data_ptr(); - std::vector args({c_ptr, a_ptr, b_ptr}); - cg.value(args); - for (const auto i : c10::irange(M)) { - float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]); - TORCH_CHECK(diff < 1e-5); - } - - for (auto _ : state) { - cg.value(args); - } -} - -BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16}); - -} // namespace tensorexpr -} // namespace jit -} // namespace torch diff --git a/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp b/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp deleted file mode 100644 index f1fe120c14ad..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp +++ /dev/null @@ -1,395 +0,0 @@ -#include -#include "ATen/Functions.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace torch::jit::tensorexpr; - -namespace { -#ifdef __AVX2__ - -#define _mm256_slli_si1(x) \ - _mm256_blend_epi32( \ - _mm256_permutevar8x32_ps(x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)), \ - _mm256_setzero_si256(), \ - 1) -#define _mm256_slli_si2(x) \ - _mm256_blend_epi32( \ - _mm256_permutevar8x32_ps(x, _mm256_set_epi32(5, 4, 3, 2, 1, 0, 7, 6)), \ - _mm256_setzero_si256(), \ - 3) -#define _mm256_slli_si4(x) \ - _mm256_blend_epi32( \ - _mm256_permutevar8x32_ps(x, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4)), \ - _mm256_setzero_si256(), \ - 15) - -__m256 PrefixSum(__m256 x) { - x = _mm256_add_ps(x, _mm256_slli_si1(x)); - x = _mm256_add_ps(x, _mm256_slli_si2(x)); - x = _mm256_add_ps(x, _mm256_slli_si4(x)); - return x; // local prefix sums -} - -__m256i PrefixSumInt(__m256i x) { - x = _mm256_add_epi32(x, _mm256_slli_si1(x)); - x = _mm256_add_epi32(x, _mm256_slli_si2(x)); - x = _mm256_add_epi32(x, _mm256_slli_si4(x)); - return x; // local prefix sums -} - -// Util function to log the given value. Not used during benchmarking. -template -inline void Log(const __m256i& value) { - const size_t n = sizeof(__m256i) / sizeof(T); - T buffer[n]; - _mm256_storeu_si256((__m256i*)buffer, value); - for (int i = 0; i < n; i++) - std::cout << buffer[n - i - 1] << " "; - std::cout << std::endl; -} -#endif - -#ifdef __AVX512F__ - -#define _mm512_slli_si512(x, k) \ - _mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k) - -__m512 PrefixSum(__m512 x) { - x = _mm512_add_ps(x, _mm512_slli_si512(x, 1)); - x = _mm512_add_ps(x, _mm512_slli_si512(x, 2)); - x = _mm512_add_ps(x, _mm512_slli_si512(x, 4)); - x = _mm512_add_ps(x, _mm512_slli_si512(x, 8)); - return x; // local prefix sums -} - -__m512i PrefixSumInt(__m512i x) { - x = _mm512_add_epi32(x, _mm512_slli_si512(x, 1)); - x = _mm512_add_epi32(x, _mm512_slli_si512(x, 2)); - x = _mm512_add_epi32(x, _mm512_slli_si512(x, 4)); - x = _mm512_add_epi32(x, _mm512_slli_si512(x, 8)); - return x; // local prefix sums -} - -template -float _mm512_extract_f32(__m512 target) { - return _mm512_cvtss_f32(_mm512_alignr_epi32(target, target, index)); -} - -// extract the last i32 from target -int _mm512_extract_epi32(__m512i target) { - __m256i x = _mm512_extracti32x8_epi32(target, 1); - return _mm256_extract_epi32(x, 7); -} - -void PrefixSum(float* output_data, float* input_data, size_t input_size) { - float carry = 0.0f; - for (int i = 0; i < input_size / 16; i++) { - __m512 x = _mm512_loadu_ps(input_data + i * 16); - x = PrefixSum(x); - x = _mm512_add_ps(x, _mm512_set1_ps(carry)); - carry = _mm512_extract_f32<15>(x); - _mm512_storeu_ps((__m512*)(output_data + i * 16), x); - } -} - -void PrefixSum(int* output_data, int* input_data, size_t input_size) { - int carry = 0; - for (int i = 0; i < input_size / 16; i++) { - __m512i x = _mm512_loadu_epi32(input_data + i * 16); - x = PrefixSumInt(x); - x = _mm512_add_epi32(x, _mm512_set1_epi32(carry)); - carry = _mm512_extract_epi32(x); - _mm512_storeu_epi32((__m512i*)(output_data + i * 16), x); - } -} -#endif - -// PrefixSum: the same as inclusive scan -class PrefixSumBench : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - input_size_ = state.range(0); - input_ = torch::rand(input_size_); - ref_ = prefixSum(input_); - - // no type promotion. Default is int->long. - input_int_ = torch::randint(1000, {input_size_}, at::kInt); - ref_int_ = at::cumsum(input_int_, 0, at::kInt); - } - - void TearDown(benchmark::State& state) override { - if (output_.numel() > 0) { - if (output_.numel() == ref_.numel()) { - TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3)); - } - state.counters["GB/s"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_.nbytes(), - benchmark::Counter::kIsRate); - } else { - if (output_int_.numel() == ref_int_.numel()) { - TORCH_CHECK(ref_int_.equal(output_int_)); - } - state.counters["GB/s"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_int_.nbytes(), - benchmark::Counter::kIsRate); - } - } - - at::Tensor prefixSum(const at::Tensor& inp) { - return at::cumsum(inp, 0); - } - - void runATen(benchmark::State& state) { - output_ = prefixSum(input_); - for (auto _ : state) { - at::cumsum_out(output_, input_, 0); - } - } - - void runLocal(benchmark::State& state) { - output_ = at::empty_like(ref_); - for (auto _ : state) { - auto input_data = input_.data_ptr(); - auto output_data = output_.data_ptr(); - float sum = 0.0f; - for (int i = 0; i < input_size_; ++i) { - sum = sum + input_data[i]; - output_data[i] = sum; - } - } - } - - // no type promotion - void runLocalInt(benchmark::State& state) { - output_int_ = at::empty_like(input_int_); - for (auto _ : state) { - auto input_data = input_int_.data_ptr(); - auto output_data = output_int_.data_ptr(); - int sum = 0; - for (int i = 0; i < input_size_; ++i) { - sum = sum + input_data[i]; - output_data[i] = sum; - } - } - } - - void runNNC(benchmark::State& state) { - BufHandle input("input", {input_size_}, kFloat); - BufHandle output("output", {input_size_}, kFloat); - BufHandle s("s", {1}, kFloat); - VarHandle i("i", kInt); - auto allocS = Allocate::make(s); - auto initS = Store::make(s, {0}, 0.0f); - auto accumS = Store::make( - s, {0}, Add::make(Load::make(s, {0}), Load::make(input, {i}))); - auto store = Store::make(output, {i}, Load::make(s, {0})); - auto forI = For::make(i, 0, input_size_, Block::make({accumS, store})); - auto freeS = Free::make(s); - auto par = Block::make({allocS, initS, forI, freeS}); - LoopNest nest(par, {output.node()}); - - std::vector buf_args; - buf_args.emplace_back(input); - buf_args.emplace_back(output); - LLVMCodeGen cg(nest.root_stmt(), buf_args); - - std::vector call_args; - output_ = at::empty_like(ref_); - for (auto _ : state) { - call_args.clear(); - call_args.emplace_back(input_.data_ptr()); - call_args.emplace_back(output_.data_ptr()); - cg.call(call_args); - } - } - -#ifdef __AVX2__ - void runLocalAVX2(benchmark::State& state) { - output_ = at::empty_like(ref_); - for (auto _ : state) { - float* input_data = input_.data_ptr(); - float* output_data = output_.data_ptr(); - - float carry = 0.0f; - for (int i = 0; i < input_size_ / 8; i++) { - __m256 x = _mm256_loadu_ps(input_data + i * 8); - x = PrefixSum(x); - x = _mm256_add_ps(x, _mm256_set1_ps(carry)); - (reinterpret_cast<__m256*>(output_data))[i] = x; - carry = _mm256_cvtss_f32(_mm256_permutevar8x32_ps( - x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7))); - } - } - } - - void runLocalIntAVX2(benchmark::State& state) { - output_int_ = at::empty_like(input_int_); - for (auto _ : state) { - auto input_data = input_int_.data_ptr(); - auto output_data = output_int_.data_ptr(); - - int carry = 0; - for (size_t i = 0; i < input_size_ / 8; i++) { - __m256i x = _mm256_loadu_si256((__m256i*)(input_data + i * 8)); - x = PrefixSumInt(x); - x = _mm256_add_epi32(x, _mm256_set1_epi32(carry)); - _mm256_storeu_si256((__m256i*)(output_data + i * 8), x); - carry = _mm256_extract_epi32(x, 7); - } - } - } -#endif - -#ifdef __AVX512F__ - void runLocalAVX512(benchmark::State& state) { - output_ = at::empty_like(ref_); - for (auto _ : state) { - auto input_data = input_.data_ptr(); - auto output_data = output_.data_ptr(); - PrefixSum(output_data, input_data, input_size_); - } - } - - void runLocalIntAVX512(benchmark::State& state) { - output_int_ = at::empty_like(input_int_); - for (auto _ : state) { - auto input_data = input_int_.data_ptr(); - auto output_data = output_int_.data_ptr(); - PrefixSum(output_data, input_data, input_size_); - } - } - - void runExclusiveScanAVX512(benchmark::State& state) { - output_ = at::empty({input_size_ + 1}, at::kFloat); - for (auto _ : state) { - auto input_data = input_.data_ptr(); - auto output_data = output_.data_ptr(); - output_data[0] = 0.0f; - PrefixSum(output_data + 1, input_data, input_size_); - } - } - - void runExclusiveScanIntAVX512(benchmark::State& state) { - output_int_ = at::empty({input_size_ + 1}, at::kInt); - for (auto _ : state) { - auto input_data = input_int_.data_ptr(); - auto output_data = output_int_.data_ptr(); - output_data[0] = 0; - PrefixSum(output_data + 1, input_data, input_size_); - } - } - -#endif - - private: - int input_size_; - at::Tensor input_; - at::Tensor output_; - at::Tensor ref_; - at::Tensor input_int_; - at::Tensor output_int_; - at::Tensor ref_int_; // no type promotion -}; - -} // namespace - -BENCHMARK_DEFINE_F(PrefixSumBench, ATen)(benchmark::State& state) { - runATen(state); -} - -BENCHMARK_DEFINE_F(PrefixSumBench, Local)(benchmark::State& state) { - runLocal(state); -} - -BENCHMARK_DEFINE_F(PrefixSumBench, LocalInt)(benchmark::State& state) { - runLocalInt(state); -} - -BENCHMARK_DEFINE_F(PrefixSumBench, NNC)(benchmark::State& state) { - runNNC(state); -} - -#ifdef __AVX2__ -BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX2)(benchmark::State& state) { - runLocalAVX2(state); -} -BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX2)(benchmark::State& state) { - runLocalIntAVX2(state); -} -#endif - -#ifdef __AVX512F__ -BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX512)(benchmark::State& state) { - runLocalAVX512(state); -} -BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX512)(benchmark::State& state) { - runLocalIntAVX512(state); -} - -BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanAVX512) -(benchmark::State& state) { - runExclusiveScanAVX512(state); -} -BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanIntAVX512) -(benchmark::State& state) { - runExclusiveScanIntAVX512(state); -} -#endif - -//---------- float benchmarks ----------// -BENCHMARK_REGISTER_F(PrefixSumBench, ATen) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); - -BENCHMARK_REGISTER_F(PrefixSumBench, NNC) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); - -BENCHMARK_REGISTER_F(PrefixSumBench, Local) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); - -#ifdef __AVX2__ -BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX2) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -#endif - -#ifdef __AVX512F__ -BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX512) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanAVX512) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -#endif - -//---------- int benchmarks ----------// -BENCHMARK_REGISTER_F(PrefixSumBench, LocalInt) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); - -#ifdef __AVX2__ -BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX2) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -#endif - -#ifdef __AVX512F__ -BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX512) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanIntAVX512) - ->RangeMultiplier(4) - ->Ranges({{1 << 6, 1 << 20}}); -#endif diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp deleted file mode 100644 index bf0fe21ca0b1..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ /dev/null @@ -1,621 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace te = torch::jit::tensorexpr; - -namespace { -class Reduce1D : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - at::set_num_threads(1); - torch::manual_seed(0x12345678); - M = state.range(0); - A = torch::randn({M}); - B = torch::zeros({}); - ref = torch::sum(A, {0}); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-7)); - state.counters["BYTES"] = benchmark::Counter( - uint64_t(state.iterations()) * M * sizeof(float), - benchmark::Counter::kIsRate); - } - - int M; - at::Tensor A; - at::Tensor B; - at::Tensor ref; -}; - -} // namespace - -BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) { - for (auto _ : state) { - B = torch::sum(A, {0}); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24}); - -#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B)) - -template -void ValidateFunc( - Func func, - const std::string& func_name, - at::Tensor& A, - at::Tensor& B) { - func(A, B); - float* pB = B.data_ptr(); - at::Tensor B2 = torch::sum(A, {0}); - float* pB2 = B2.data_ptr(); - int size = A.numel(); - float size_sqrt = std::sqrt(size); - float natural_noise = size_sqrt * 1e-7; - if (!torch::allclose(B, B2, natural_noise)) { - std::ostringstream oss; - oss << func_name << " failed check: " << std::endl; - oss << "value: " << B << std::endl; - ; - oss << "reference: " << B2 << std::endl; - oss << "threshold: " << natural_noise << std::endl; - throw std::runtime_error(oss.str()); - } -} - -static void reduce1d_naive(at::Tensor& A, at::Tensor& B) { - float* pA = A.data_ptr(); - float* pB = B.data_ptr(); - int size = A.numel(); - TORCH_CHECK(B.numel() == 1); - *pB = 0.; - for (const auto i : c10::irange(size)) { - *pB += pA[i]; - } -} - -BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) { - VALIDATE(reduce1d_naive, A, B); - for (auto _ : state) { - reduce1d_naive(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24}); - -static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) { - float* pA = A.data_ptr(); - float* pB = B.data_ptr(); - int size = A.numel(); - constexpr int kChunkSize = 16; - TORCH_CHECK(B.numel() == 1); - TORCH_CHECK(size % kChunkSize == 0); - *pB = 0.; - float temp[kChunkSize]; - for (const auto j : c10::irange(kChunkSize)) { - temp[j] = 0; - } - - int chunk_count = size / kChunkSize; - for (const auto i : c10::irange(chunk_count)) { - for (const auto j : c10::irange(kChunkSize)) { - temp[j] += pA[i * kChunkSize + j]; - } - } - - for (const auto j : c10::irange(kChunkSize)) { - *pB += temp[j]; - } -} - -BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) { - VALIDATE(reduce1d_native_rfactor, A, B); - for (auto _ : state) { - reduce1d_native_rfactor(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24}); - -#ifdef USE_AVX2 - -// x = ( x7, x6, x5, x4, x3, x2, x1, x0 ) -inline float sum_f32x8(__m256 x) { - // hiQuad = ( x7, x6, x5, x4 ) - const __m128 hiQuad = _mm256_extractf128_ps(x, 1); - // loQuad = ( x3, x2, x1, x0 ) - const __m128 loQuad = _mm256_castps256_ps128(x); - // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 ) - const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad); - // loDual = ( -, -, x1 + x5, x0 + x4 ) - const __m128 loDual = sumQuad; - // hiDual = ( -, -, x3 + x7, x2 + x6 ) - const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad); - // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 ) - const __m128 sumDual = _mm_add_ps(loDual, hiDual); - // lo = ( -, -, -, x0 + x2 + x4 + x6 ) - const __m128 lo = sumDual; - // hi = ( -, -, -, x1 + x3 + x5 + x7 ) - const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1); - // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 ) - const __m128 sum = _mm_add_ss(lo, hi); - return _mm_cvtss_f32(sum); -} - -static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) { - float* pA = A.data_ptr(); - float* pB = B.data_ptr(); - int size = A.numel(); - constexpr int kChunkSize = sizeof(__m256) / sizeof(float); - TORCH_CHECK(B.numel() == 1); - TORCH_CHECK(size % kChunkSize == 0); - *pB = 0.; - __m256 temp; - temp = _mm256_setzero_ps(); - - int tile_count = size / kChunkSize; - for (const auto i : c10::irange(tile_count)) { - __m256 data = _mm256_load_ps(pA + i * kChunkSize); - temp = _mm256_add_ps(temp, data); - } - - float result = sum_f32x8(temp); - *pB = result; -} - -BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) { - VALIDATE(reduce1d_native_vector, A, B); - for (auto _ : state) { - reduce1d_native_vector(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24}); - -static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) { - static constexpr int kTileSize = 4; - float* pA = A.data_ptr(); - float* pB = B.data_ptr(); - int size = A.numel(); - constexpr int kChunkSize = sizeof(__m256) / sizeof(float); - TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1"); - TORCH_CHECK( - size % kChunkSize == 0, - "Invalid size: ", - size, - " % ", - kChunkSize, - " ! = 0"); - __m256 t[kTileSize]; - for (const auto j : c10::irange(kTileSize)) { - t[j] = _mm256_setzero_ps(); - } - - int tile_count = size / kChunkSize / kTileSize; - for (int i = 0; i < tile_count; i++) { -#pragma unroll - for (int j = 0; j < kTileSize; j++) { - float* p = pA + (i * kTileSize + j) * kChunkSize; - __m256 data = _mm256_loadu_ps(p); - t[j] = _mm256_add_ps(t[j], data); - } - } - - float result = sum_f32x8(t[0]); - for (const auto j : c10::irange(1, kTileSize)) { - result += sum_f32x8(t[j]); - } - *pB = result; -} - -BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) { - VALIDATE(reduce1d_native_tiled, A, B); - for (auto _ : state) { - reduce1d_native_tiled(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24}); - -#endif // USE_AVX2 - -BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { - int M = A.numel(); - - te::BufHandle AP("A", {M}, te::kFloat); - te::Tensor BT = te::Reduce( - "reduce_full", - {1}, - te::Sum(), - [&](const te::ExprHandle& n, const te::ExprHandle& m) { - return AP.load(m); - }, - {M}); - - te::LoopNest loop({BT}); - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); - - auto func = [&](at::Tensor& A, at::Tensor& B) { - cg->call({A.data_ptr(), B.data_ptr()}); - }; - - ValidateFunc(func, "reduce1d_te_naive", A, B); - for (auto _ : state) { - func(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24}); - -BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { - int M = A.numel(); - - te::BufHandle AP("A", {M}, te::kFloat); - te::Tensor BT = te::Reduce( - "reduce_full", - {1}, - te::Sum(), - [&](const te::ExprHandle& n, const te::ExprHandle& m) { - return AP.load(m); - }, - {M}); - - te::LoopNest loop({BT}); - const int kChunkSize = 8; - - { - auto const& loops = loop.getLoopStmtsFor(BT); - te::ForPtr m = loops[1]; - loop.splitWithTail(m, kChunkSize); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); - - auto func = [&](at::Tensor& A, at::Tensor& B) { - cg->call({A.data_ptr(), B.data_ptr()}); - }; - - ValidateFunc(func, "reduce1d_te_naive", A, B); - for (auto _ : state) { - func(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24}); - -BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { - int M = A.numel(); - - te::BufHandle AP("A", {M}, te::kFloat); - te::Tensor BT = te::Reduce( - "reduce_full", - {1}, - te::Sum(), - [&](const te::ExprHandle& n, const te::ExprHandle& m) { - return AP.load(m); - }, - {M}); - - te::LoopNest loop({BT}); - const int kChunkSize = 8; - - { - auto const& loops = loop.getLoopStmtsFor(BT); - te::ForPtr m = loops[1]; - loop.splitWithMask(m, kChunkSize); - } - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); - - auto func = [&](at::Tensor& A, at::Tensor& B) { - cg->call({A.data_ptr(), B.data_ptr()}); - }; - - ValidateFunc(func, "reduce1d_te_naive", A, B); - for (auto _ : state) { - func(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24}); - -BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { - int M = A.numel(); - const int kChunkSize = 8; - TORCH_CHECK(M % kChunkSize == 0); - - te::BufHandle AP("A", {M}, te::kFloat); - te::Tensor BT = te::Reduce( - "reduce_full", - {}, - te::Sum(), - [&](const te::ExprHandle& m) { return AP.load(m); }, - {M}); - - te::LoopNest loop({BT}); - te::BufPtr rfac_buf; - - auto loops = loop.getLoopStmtsFor(BT); - TORCH_CHECK(loops.size() == 1); - te::ForPtr mi; - loop.splitWithMask(loops.at(0), kChunkSize, &mi); - te::ForPtr mo = loops.at(0); - - loop.reorderAxis(mo, mi); - loops = loop.getLoopStmtsFor(BT); - auto bt_body = loop.getAllWritesToBuf(BT.buf())[1]; - TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf)); - loop.reorderAxis(loops.at(0), loops.at(1)); - - loops = loop.getAllInnermostLoopsWritingToBuf(rfac_buf); - TORCH_CHECK(loops.size() == 2); - loop.vectorize(loops.at(1)); - - loop.prepareForCodegen(); - te::StmtPtr s = loop.root_stmt(); - s = te::IRSimplifier::simplify(s); - auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); - - auto func = [&](at::Tensor& A, at::Tensor& B) { - cg->call({A.data_ptr(), B.data_ptr()}); - }; - - ValidateFunc(func, "reduce1d_te_naive", A, B); - for (auto _ : state) { - func(A, B); - } -} - -BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24}); - -BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { - const int M = A.numel(); - const int kChunkSize = 8; - - te::BufHandle a("A", {M}, te::kFloat); - te::Tensor b = te::computeSum( - {a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU); - te::LoopNest nest({b}); - - auto loops = nest.getLoopStmtsFor(b); - te::ForPtr mi, mo; - te::BufPtr rf; - nest.splitWithMask(loops[0], kChunkSize, &mi); - loops = nest.reorder({loops[0], mi}, {1, 0}); - nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf); - nest.reorderAxis(loops[0], loops[1]); - for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) { - nest.vectorize(loop); - } - - nest.prepareForCodegen(); - nest.simplify(); - te::LLVMCodeGen cg(nest.root_stmt(), {a, b}); - - for (auto _ : state) { - cg.call({A.data_ptr(), B.data_ptr()}); - } -} -BENCHMARK_REGISTER_F(Reduce1D, Op)->Args({1 << 24}); - -class Reduce2DCol : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - at::set_num_threads(1); - torch::manual_seed(0x12345678); - M = state.range(0); - N = state.range(1); - A = torch::randn({M, N}); - ref = torch::sum(A, {0}); - B = torch::zeros_like(ref); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-5)); - state.counters["BYTES"] = benchmark::Counter( - uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()), - benchmark::Counter::kIsRate); - } - - int M; - int N; - at::Tensor A; - at::Tensor B; - at::Tensor ref; -}; - -BENCHMARK_DEFINE_F(Reduce2DCol, Torch)(benchmark::State& state) { - for (auto _ : state) { - B = torch::sum(A, {0}); - } -} -BENCHMARK_REGISTER_F(Reduce2DCol, Torch) - ->Args({1 << 3, 1 << 21}) - ->Args({1 << 6, 1 << 18}) - ->Args({1 << 12, 1 << 12}); - -BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { - constexpr int kCacheSize = 1 << 12; - te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = te::computeSum( - {a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU); - te::LoopNest nest({b}); - - auto sch = state.range(2); - if (sch == 0) { - } else if (sch == 1) { - auto loops = nest.getLoopStmtsFor(b); - nest.reorderAxis(loops[0], loops[1]); - } else if (sch == 2) { - auto loops = nest.getLoopStmtsFor(b); - nest.splitWithTail(loops[0], kCacheSize); - loops = nest.getLoopStmtsFor(b); - nest.reorderAxis(loops[1], loops[2]); - } else if (sch == 3) { - auto loops = nest.getLoopStmtsFor(b); - nest.splitWithTail(loops[1], 8); - loops = nest.getLoopStmtsFor(b); - nest.reorderAxis(loops[0], loops[1]); - } - - nest.prepareForCodegen(); - nest.simplify(); - te::LLVMCodeGen cg(nest.root_stmt(), {a, b}); - for (auto _ : state) { - cg.call({A.data_ptr(), B.data_ptr()}); - } -} -BENCHMARK_REGISTER_F(Reduce2DCol, OpSchedule) - ->Apply( // CustomArgs); - [](benchmark::internal::Benchmark* b) { - for (auto sch : {0, 1, 2, 3}) { - for (auto rows : {3, 6, 12}) { - auto cols = 24 - rows; - b->Args({1 << rows, 1 << cols, sch}); - } - } - }); - -class Reduce2DRow : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - at::set_num_threads(1); - torch::manual_seed(0x12345678); - M = state.range(0); - N = state.range(1); - A = torch::randn({M, N}); - ref = torch::sum(A, {1}); - B = torch::zeros_like(ref); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-4)); - state.counters["BYTES"] = benchmark::Counter( - uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()), - benchmark::Counter::kIsRate); - } - - int M; - int N; - at::Tensor A; - at::Tensor B; - at::Tensor ref; -}; - -BENCHMARK_DEFINE_F(Reduce2DRow, Torch)(benchmark::State& state) { - for (auto _ : state) { - B = torch::sum(A, {1}); - } -} -BENCHMARK_REGISTER_F(Reduce2DRow, Torch) - ->Args({1 << 3, 1 << 21}) - ->Args({1 << 6, 1 << 18}) - ->Args({1 << 12, 1 << 12}) - ->Args({1 << 18, 1 << 6}); - -BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) { - auto a = A.data_ptr(); - auto b = B.data_ptr(); - constexpr int Mb = 4; - constexpr int Nb = 4; - auto fn = [&] { - for (int m_outer = 0; m_outer < M; m_outer += Mb) { - float bregs[Mb][Nb] = {0.0f}; - for (int n_outer = 0; n_outer < N; n_outer += Nb) { - for (const auto m_inner : c10::irange(Mb)) { - for (const auto n_inner : c10::irange(Nb)) { - bregs[m_inner][n_inner] += - a[(m_outer + m_inner) * N + n_outer + n_inner]; - } - } - } - for (const auto m_inner : c10::irange(Mb)) { - b[m_outer + m_inner] = 0.f; - for (const auto n_inner : c10::irange(Nb)) { - b[m_outer + m_inner] += bregs[m_inner][n_inner]; - } - } - } - }; - for (auto _ : state) { - fn(); - } -} -BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6}); - -BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { - constexpr int kChunkSize = 8; - te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = te::computeSum( - {a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU); - te::LoopNest nest({b}); - - auto sch = state.range(2); - if (sch == 1) { - auto loops = nest.getLoopStmtsFor(b); - te::ForPtr mi, mo; - te::BufPtr rf; - nest.splitWithMask(loops[1], kChunkSize, &mi); - loops = nest.reorder({loops[1], mi}, {1, 0}); - TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf)); - nest.reorderAxis(loops[0], loops[1]); - for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) { - nest.vectorize(loop); - } - } else if (sch == 2) { - auto loops = nest.getLoopStmtsFor(b); - nest.splitWithMask(loops[1], 8); - nest.splitWithMask(loops[0], 4); - loops = nest.getLoopStmtsFor(b); - nest.reorderAxis(loops[1], loops[2]); - } else if (sch == 3) { - auto loops = nest.getLoopStmtsFor(b); - te::ForPtr mi, mo; - te::BufPtr rf; - nest.splitWithMask(loops[1], kChunkSize, &mi); - loops = nest.reorder({loops[1], mi}, {1, 0}); - TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf)); - nest.reorderAxis(loops[0], loops[1]); - te::LoopNest::compressBuffer(rf, nest.root_stmt()); - for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) { - nest.vectorize(loop); - } - } - - nest.prepareForCodegen(); - nest.simplify(); - te::LLVMCodeGen cg(nest.root_stmt(), {a, b}); - - for (auto _ : state) { - cg.call({A.data_ptr(), B.data_ptr()}); - } -} -BENCHMARK_REGISTER_F(Reduce2DRow, OpSchedule) - ->Apply( // CustomArgs); - [](benchmark::internal::Benchmark* b) { - for (auto sch : {0, 1, 2, 3}) { - for (auto rows : {3, 6, 12, 18}) { - auto cols = 24 - rows; - b->Args({1 << rows, 1 << cols, sch}); - } - } - }); diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp deleted file mode 100644 index 568905acd7c4..000000000000 --- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp +++ /dev/null @@ -1,166 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace torch::jit::tensorexpr; - -namespace { - -class SignedLog1pBench : public benchmark::Fixture { - public: - void SetUp(const benchmark::State& state) override { - input_size_ = {state.range(0), state.range(1)}; - input_size_int_ = {state.range(0), state.range(1)}; - input_ = torch::rand(input_size_); - ref_ = signedLog1p(input_); - } - - void TearDown(benchmark::State& state) override { - TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3)); - state.counters["GB/s"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_.nbytes(), - benchmark::Counter::kIsRate); - } - - at::Tensor signedLog1p(const at::Tensor& inp) { - auto sign = at::sign(inp); - auto log1p = at::log1p(at::abs(inp)); - return sign * log1p; - } - - void runATen(benchmark::State& state) { - for (auto _ : state) { - output_ = signedLog1p(input_); - } - } - - void runNNC(benchmark::State& state) { - BufHandle input_ph( - "input", {input_size_int_[0], input_size_int_[1]}, kFloat); - Tensor abs_result = Compute( - "aten_abs", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return abs(input_ph.load(m, n)); - }); - Tensor log1p_result = Compute( - "aten_log1p", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return log1p(abs_result.load(m, n)); - }); - Tensor sign_result = - computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); - Tensor output = Compute( - "aten_mul", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return sign_result.load(m, n) * log1p_result.load(m, n); - }); - LoopNest nest({output}, {abs_result, log1p_result, sign_result, output}); - GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt()); - nest.inlineIntermediateBufs(true); - nest.prepareForCodegen(); - nest.simplify(); - nest.vectorizeInnerLoops(); - nest.simplify(); - GRAPH_DEBUG("Final stmt: ", *nest.root_stmt()); - - // StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - std::vector buf_args; - buf_args.emplace_back(input_ph); - buf_args.emplace_back(output); - LLVMCodeGen cg(nest.root_stmt(), buf_args); - - std::vector call_args; - for (auto _ : state) { - output_ = at::empty_like(ref_); - call_args.clear(); - call_args.emplace_back(input_.data_ptr()); - call_args.emplace_back(output_.data_ptr()); - cg.call(call_args); - } - } - - void runNNCLogVml(benchmark::State& state) { - BufHandle input_ph( - "input", {input_size_int_[0], input_size_int_[1]}, kFloat); - Tensor abs_result = Compute( - "aten_abs", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return abs(input_ph.load(m, n)); - }); - Tensor log_vml_result = Compute( - "aten_log1p", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return log_vml(abs_result.load(m, n) + ExprHandle(1)); - }); - Tensor sign_result = - computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); - Tensor output = Compute( - "aten_mul", - {input_size_int_[0], input_size_int_[1]}, - [&](const VarHandle& m, const VarHandle& n) { - return sign_result.load(m, n) * log_vml_result.load(m, n); - }); - LoopNest nest({output}, {abs_result, log_vml_result, sign_result, output}); - GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt()); - nest.inlineIntermediateBufs(true); - nest.prepareForCodegen(); - nest.simplify(); - nest.vectorizeInnerLoops(); - nest.simplify(); - GRAPH_DEBUG("Final stmt: ", *nest.root_stmt()); - - // StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); - std::vector buf_args; - buf_args.emplace_back(input_ph); - buf_args.emplace_back(output); - LLVMCodeGen cg(nest.root_stmt(), buf_args); - - std::vector call_args; - for (auto _ : state) { - output_ = at::empty_like(ref_); - call_args.clear(); - call_args.emplace_back(input_.data_ptr()); - call_args.emplace_back(output_.data_ptr()); - cg.call(call_args); - } - } - - private: - std::vector input_size_; - std::vector input_size_int_; - at::Tensor input_; - at::Tensor output_; - at::Tensor ref_; -}; - -} // namespace - -BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) { - runATen(state); -} - -BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) { - runNNC(state); -} - -BENCHMARK_DEFINE_F(SignedLog1pBench, NNCLogVml)(benchmark::State& state) { - runNNCLogVml(state); -} - -BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467}); - -BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467}); - -BENCHMARK_REGISTER_F(SignedLog1pBench, NNCLogVml)->Args({10, 1467}); diff --git a/benchmarks/cpp/tensorexpr/main.cpp b/benchmarks/cpp/tensorexpr/main.cpp deleted file mode 100644 index 71fefa047228..000000000000 --- a/benchmarks/cpp/tensorexpr/main.cpp +++ /dev/null @@ -1,3 +0,0 @@ -#include - -BENCHMARK_MAIN(); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 166cae5f6e25..f1394b62b825 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1660,14 +1660,6 @@ if(BUILD_STATIC_RUNTIME_BENCHMARK) target_link_libraries(static_runtime_test torch_library gtest_main) endif() -if(BUILD_TENSOREXPR_BENCHMARK) - add_subdirectory(${TORCH_ROOT}/benchmarks/cpp/tensorexpr ${CMAKE_BINARY_DIR}/tensorexpr_bench) -endif() - -if(BUILD_CPP_BENCHMARKS) - add_subdirectory(${TORCH_ROOT}/benchmarks/cpp ${PROJECT_BINARY_DIR}/bin) -endif() - if(BUILD_MOBILE_BENCHMARK) foreach(benchmark_src ${ATen_MOBILE_BENCHMARK_SRCS}) get_filename_component(benchmark_name ${benchmark_src} NAME_WE) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index f9dcb5e02f86..d8fe95805ab3 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -26,7 +26,6 @@ function(caffe2_print_configuration_summary) message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}") - message(STATUS " BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") if(${CAFFE2_LINK_LOCAL_PROTOBUF})