Remove cpp/tensorexpr benchmarks (#116868)

Summary: These refer to a deprecated backend of torchscript which is no longer built in releases, and require llvm to be built. Test Plan: ``` python setup.py develop ``` Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/116868 Approved by: https://github.com/hl475, https://github.com/chenyang78, https://github.com/eellison, https://github.com/mikekgfb
2025-10-21 05:34:18 +08:00 · 2024-01-05 21:23:30 +00:00
parent 99ef47098d
commit 521dbbfaff
19 changed files with 0 additions and 3204 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -187,7 +187,6 @@ cmake_dependent_option(
 option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
 option(BUILD_AOT_INDUCTOR_TEST "Build C++ test binaries for aot-inductor" OFF)
 option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
 option(BUILD_TENSOREXPR_BENCHMARK "Build C++ binaries for tensorexpr benchmarks (need gbenchmark)" OFF)
 option(BUILD_MOBILE_BENCHMARK "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
 option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
 option(BUILD_JNI "Build JNI bindings" OFF)
--- a/benchmarks/cpp/CMakeLists.txt
+++ b/benchmarks/cpp/CMakeLists.txt
@ -1,2 +0,0 @@
 add_executable(convolution_bench convolution.cpp)
 target_link_libraries(convolution_bench PRIVATE torch_library benchmark)
--- a/benchmarks/cpp/convolution.cpp
+++ b/benchmarks/cpp/convolution.cpp
@ -1,313 +0,0 @@
 #include <ATen/ATen.h>
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <benchmark/benchmark.h>
 #include <c10/core/InferenceMode.h>
 #include <sstream>
 struct ConvParams {
  std::vector<int64_t> input;
  std::vector<int64_t> weight;
  std::vector<int64_t> bias;
  std::vector<int64_t> stride;
  std::vector<int64_t> padding;
  std::vector<int64_t> dilation;
  int64_t groups;
 };
 struct xs {
  explicit xs(const std::vector<int64_t>& v_) : v(v_) {}
  const std::vector<int64_t>& v;
 };
 std::ostream& operator<<(std::ostream& os, const xs& x) {
  bool first = true;
  for (auto const& xx : x.v) {
    if (!first) {
      os << "x";
    }
    first = false;
    os << xx;
  }
  return os;
 }
 std::ostream& operator<<(std::ostream& os, const ConvParams& params) {
  os << "I" << xs(params.input) << "_W" << xs(params.weight) << "_B"
     << xs(params.bias) << "_S" << xs(params.stride) << "_P"
     << xs(params.padding) << "_D" << xs(params.dilation) << "_G"
     << params.groups;
  return os;
 }
 std::vector<ConvParams> MobileNetV3Params = {
    {{1, 3, 224, 224}, {16, 3, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 16, 112, 112}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 16, 112, 112}, {16, 1, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 16},
    {{1, 16, 56, 56}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 16, 56, 56}, {72, 16, 1, 1}, {72}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 72, 56, 56}, {72, 1, 3, 3}, {72}, {2, 2}, {1, 1}, {1, 1}, 72},
    {{1, 72, 28, 28}, {24, 72, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 24, 28, 28}, {88, 24, 1, 1}, {88}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 88, 28, 28}, {88, 1, 3, 3}, {88}, {1, 1}, {1, 1}, {1, 1}, 88},
    {{1, 88, 28, 28}, {24, 88, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 24, 28, 28}, {96, 24, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 96, 28, 28}, {96, 1, 5, 5}, {96}, {2, 2}, {2, 2}, {1, 1}, 96},
    {{1, 96, 14, 14}, {40, 96, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
    {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
    {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 40, 14, 14}, {120, 40, 1, 1}, {120}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 120, 14, 14}, {120, 1, 5, 5}, {120}, {1, 1}, {2, 2}, {1, 1}, 120},
    {{1, 120, 14, 14}, {48, 120, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 48, 14, 14}, {144, 48, 1, 1}, {144}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 144, 14, 14}, {144, 1, 5, 5}, {144}, {1, 1}, {2, 2}, {1, 1}, 144},
    {{1, 144, 14, 14}, {48, 144, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 48, 14, 14}, {288, 48, 1, 1}, {288}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 288, 14, 14}, {288, 1, 5, 5}, {288}, {2, 2}, {2, 2}, {1, 1}, 288},
    {{1, 288, 7, 7}, {96, 288, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
    {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
    {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 576, 1, 1}, {1280, 576, 1, 1}, {1280}, {1, 1}, {0, 0}, {1, 1}, 1},
 };
 std::vector<ConvParams> ResNet18Params = {
    {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {128, 64, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {128, 64, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {256, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {256, 128, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {512, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
 };
 std::vector<ConvParams> ResNet50Params = {
    {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 56, 56}, {128, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 128, 56, 56}, {128, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 56, 56}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 28, 28}, {256, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 28, 28}, {256, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 28, 28}, {1024, 512, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {512, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 14, 14}, {512, 512, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 1024, 14, 14}, {2048, 1024, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
    {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
 };
 struct EnableMklDnn {
  explicit EnableMklDnn(bool enable)
      : prev_(at::globalContext().userEnabledMkldnn()) {
    at::globalContext().setUserEnabledMkldnn(enable);
  }
  ~EnableMklDnn() {
    at::globalContext().setUserEnabledMkldnn(prev_);
  }
  bool prev_;
 };
 template <bool WithMklDnn>
 static void BM_conv2d_native(
    benchmark::State& state,
    const ConvParams& params) {
  EnableMklDnn mkl(WithMklDnn);
  auto input = at::randn(params.input);
  auto weight = at::randn(params.weight);
  auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
  auto output = at::conv2d(
      input,
      weight,
      bias,
      params.stride,
      params.padding,
      params.dilation,
      params.groups);
  for (auto _ : state) {
    output = at::conv2d(
        input,
        weight,
        bias,
        params.stride,
        params.padding,
        params.dilation,
        params.groups);
  }
  state.counters["GFLOPS/s"] = benchmark::Counter(
      2.0f * output.numel() * weight.numel() / weight.size(0) *
          state.iterations(),
      benchmark::Counter::kIsRate);
  state.counters["GB/s"] = benchmark::Counter(
      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
      state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
      benchmark::Counter::kIsRate);
 }
 enum MklDnnReorder {
  None,
  WeightOnly,
  WeightAndInput,
 };
 template <MklDnnReorder Reorder>
 static void BM_conv2d_mkldnn(
    benchmark::State& state,
    const ConvParams& params) {
  auto input = at::randn(params.input);
  auto weight = at::randn(params.weight);
  auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
  if (Reorder == WeightAndInput) {
    auto it_input = at::native::itensor_from_mkldnn(input.to_mkldnn());
    auto r = ideep::tensor(
        params.input, ideep::data_type::f32, ideep::format_tag::aBcd16b);
    it_input.reorder_to(r);
    input = at::native::new_with_itensor_mkldnn(
        std::move(r), at::kFloat, at::Device(at::kCPU));
  }
  if (Reorder == WeightOnly || Reorder == WeightAndInput) {
    weight = at::mkldnn_reorder_conv2d_weight(
        weight.to_mkldnn(),
        params.padding,
        params.stride,
        params.dilation,
        params.groups);
    bias = params.bias.size() > 0 ? bias.to_mkldnn() : bias;
  }
  auto output = at::mkldnn_convolution(
      input,
      weight,
      bias,
      params.padding,
      params.stride,
      params.dilation,
      params.groups);
  for (auto _ : state) {
    output = at::mkldnn_convolution(
        input,
        weight,
        bias,
        params.padding,
        params.stride,
        params.dilation,
        params.groups);
  }
  state.counters["GFLOPS/s"] = benchmark::Counter(
      2.0f * output.numel() * weight.numel() / weight.size(0) *
          state.iterations(),
      benchmark::Counter::kIsRate);
  state.counters["GB/s"] = benchmark::Counter(
      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
      state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
      benchmark::Counter::kIsRate);
 }
 std::string name(
    const char* base,
    const char* suffix,
    const ConvParams& params) {
  std::ostringstream os;
  os << base << "_" << suffix << "_" << params;
  return os.str();
 }
 void registerOne(const char* base, const ConvParams& params) {
  benchmark::RegisterBenchmark(
      name(base, "native", params).data(), BM_conv2d_native<true>, params);
  benchmark::RegisterBenchmark(
      name(base, "native_nomkl", params).data(),
      BM_conv2d_native<false>,
      params);
  benchmark::RegisterBenchmark(
      name(base, "mkldnn_none", params).data(), BM_conv2d_mkldnn<None>, params);
  benchmark::RegisterBenchmark(
      name(base, "mkldnn_weight", params).data(),
      BM_conv2d_mkldnn<WeightOnly>,
      params);
  benchmark::RegisterBenchmark(
      name(base, "mkldnn_input", params).data(),
      BM_conv2d_mkldnn<WeightAndInput>,
      params);
 }
 int main(int argc, char** argv) {
  c10::InferenceMode guard;
 #define BENCH(x)                         \
  for (auto const& params : x##Params) { \
    registerOne(#x, params);             \
  }
  BENCH(MobileNetV3);
  BENCH(ResNet18);
  BENCH(ResNet50);
 #undef BENCH
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
 }
--- a/benchmarks/cpp/tensorexpr/CMakeLists.txt
+++ b/benchmarks/cpp/tensorexpr/CMakeLists.txt
@ -1,24 +0,0 @@
 find_package(AVX)
 add_executable(
  tensorexpr_bench
  bench_approx.cpp
  bench_batchnorm.cpp
  bench_concat.cpp
  bench_compile.cpp
  bench_signed_log1p.cpp
  bench_fuser_overhead.cpp
  bench_gemm.cpp
  bench_kernels.cpp
  bench_parallel.cpp
  bench_prefix_sum.cpp
  bench_reduce.cpp
  main.cpp)
 if(C_AVX2_FOUND)
  message(STATUS "AVX2 compiler support found")
  target_compile_options(tensorexpr_bench PUBLIC -mavx2)
  target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
 endif()
 target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp
@ -1,433 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 #include "caffe2/operators/logit_op.h"
 #include "caffe2/operators/tanh_op.h"
 using namespace torch::jit;
 using namespace torch::jit::tensorexpr;
 void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) {
  auto loops = ln->getLoopStmtsFor(target);
  ForPtr inner, tail;
  ln->splitWithTail(loops[0], width, &inner, &tail);
  ln->vectorize(inner);
 }
 void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
  std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
  ForPtr inner, tail;
  ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
  ForPtr outer = loops[0];
  ln->vectorize(inner);
  ln->splitWithTail(outer, 8, &inner, &tail);
  StmtPtr unrolled;
  LoopNest::fullUnroll(inner, &unrolled);
 }
 static void relu_nnc(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  auto clamp = 0;
  torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      return CompareSelect::make(elem, min, min, elem, kLT);
    }();
    return A_elem;
  });
  LoopNest ln({B});
  optimizePointwise(&ln, B);
  ln.prepareForCodegen();
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::randn({state.range(0)});
  at::Tensor B_t = torch::randn(state.range(0));
  auto B_ref = at::relu(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void log_nnc_sleef(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  torch::jit::tensorexpr::Tensor B =
      Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); });
  LoopNest ln({B});
  ln.prepareForCodegen();
  vectorize(&ln, B, 8);
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::log(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void log_nnc_fast(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  torch::jit::tensorexpr::Tensor B = Compute(
      "B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); });
  LoopNest ln({B});
  optimizePointwise(&ln, B);
  ln.prepareForCodegen();
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::log(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void log_nnc_vml(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  torch::jit::tensorexpr::Tensor B =
      Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); });
  LoopNest ln({B});
  vectorize(&ln, B, 8);
  ln.prepareForCodegen();
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::log(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void log_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  for (auto _ : state) {
    at::log_out(B_t, A_t);
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void logit_nnc_sleef(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  auto clamp = 1e-6f;
  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      auto max = FloatImm::make(1.0f - clamp);
      elem = CompareSelect::make(elem, min, min, elem, kLT);
      return CompareSelect::make(elem, max, max, elem, kGT);
    }();
    return log(A_elem / (FloatImm::make(1.0f) - A_elem));
  });
  LoopNest ln({B});
  ln.prepareForCodegen();
  optimizePointwise(&ln, B);
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::logit(A_t, clamp);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void logit_nnc_fast(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  auto clamp = 1e-6f;
  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      auto max = FloatImm::make(1.0f - clamp);
      elem = CompareSelect::make(elem, min, min, elem, kLT);
      return CompareSelect::make(elem, max, max, elem, kGT);
    }();
    return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem));
  });
  LoopNest ln({B});
  ln.prepareForCodegen();
  optimizePointwise(&ln, B);
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::logit(A_t, clamp);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void logit_nnc_vml(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  auto clamp = 1e-6f;
  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      auto max = FloatImm::make(1.0f - clamp);
      elem = CompareSelect::make(elem, min, min, elem, kLT);
      return CompareSelect::make(elem, max, max, elem, kGT);
    }();
    return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
  });
  LoopNest ln({B});
  ln.prepareForCodegen();
  vectorize(&ln, B, 16);
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::logit(A_t, clamp);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void logit_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto clamp = 1e-6f;
  for (auto _ : state) {
    at::native::logit_out(A_t, clamp, B_t);
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 template <typename T>
 void logit_caffe2_impl(int size, const T* X, T* Y, float eps_ = 1e-6f) {
  using namespace caffe2;
  ConstEigenVectorMap<T> X_vec(X, size);
  EigenVectorMap<T> Y_vec(Y, size);
  Y_vec = X_vec.array().min(static_cast<T>(1.0f - eps_));
  Y_vec = Y_vec.array().max(eps_);
  Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log();
 }
 static void logit_caffe2(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  at::Tensor B_ref = torch::randn({state.range(0)});
  auto N = state.range(0);
  auto X = A_t.data_ptr<float>();
  auto Y = B_t.data_ptr<float>();
  auto clamp = 1e-6f;
  at::native::logit_out(A_t, clamp, B_ref);
  logit_caffe2_impl(N, X, Y, clamp);
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    logit_caffe2_impl(N, X, Y, clamp);
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void tanh_nnc_fast(benchmark::State& state) {
  auto N = VarHandle("N", kInt);
  BufHandle A("A", {N}, kFloat);
  torch::jit::tensorexpr::Tensor B = Compute(
      "B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); });
  LoopNest ln({B});
  optimizePointwise(&ln, B);
  ln.prepareForCodegen();
  StmtPtr s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::tanh(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref, 1e-3f, 1e-6f));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void tanh_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  for (auto _ : state) {
    at::tanh_out(A_t, B_t);
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 }
 static void tanh_caffe2(benchmark::State& state) {
 #ifdef FBCODE_CAFFE2
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  at::Tensor B_ref = torch::randn({state.range(0)});
  auto N = state.range(0);
  auto X = A_t.data_ptr<float>();
  auto Y = B_t.data_ptr<float>();
  caffe2::CPUContext c;
  auto tanh = caffe2::TanhFunctor<caffe2::CPUContext>();
  at::tanh_out(A_t, B_ref);
  tanh(N, X, Y, &c);
  TORCH_CHECK(at::native::allclose(B_t, B_ref, 1e-3f, 1e-6f));
  for (auto _ : state) {
    tanh(N, X, Y, &c);
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()),
      benchmark::Counter::kIsRate);
 #endif
 }
 BENCHMARK(relu_nnc)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
    {2 << 14});
 BENCHMARK(log_nnc_sleef)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(log_nnc_fast)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(log_nnc_vml)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(log_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
    {2 << 14});
 BENCHMARK(logit_nnc_sleef)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(logit_nnc_fast)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(logit_nnc_vml)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(logit_aten)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(logit_caffe2)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(tanh_nnc_fast)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
 BENCHMARK(tanh_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
    {2 << 14});
 BENCHMARK(tanh_caffe2)
    ->Args({2 << 5})
    ->Args({2 << 8})
    ->Args({2 << 12})
    ->Args({2 << 14});
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
@ -1,216 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 using namespace torch::jit::tensorexpr;
 namespace {
 class BatchNorm : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    N_ = state.range(0);
    C_ = state.range(1);
    H_ = state.range(2);
    W_ = state.range(3);
    input_ = torch::ones({N_, C_, H_, W_});
    weight_ = torch::ones({C_});
    bias_ = torch::ones({C_});
    mean_ = torch::ones({C_}) * 0.5f;
    var_ = torch::ones({C_}) * 0.1f;
    ref_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
    output_ = at::empty_like(ref_);
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(ref_, output_));
    state.counters["GB/s"] = benchmark::Counter(
        uint64_t(state.iterations()) * (input_.nbytes() + ref_.nbytes()),
        benchmark::Counter::kIsRate);
  }
  int N_;
  int C_;
  int H_;
  int W_;
  at::Tensor input_;
  at::Tensor weight_;
  at::Tensor bias_;
  at::Tensor mean_;
  at::Tensor var_;
  at::Tensor output_;
  at::Tensor ref_;
  bool training_{false};
  float momentum_{0.1};
  float eps_{1.0e-5f};
  bool cudnn_enabled_{false};
 };
 } // namespace
 BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
  for (auto _ : state) {
    output_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
  }
 }
 BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
  BufHandle input("input", {N_, C_, H_, W_}, kFloat);
  BufHandle weight("weight", {C_}, kFloat);
  BufHandle bias("bias", {C_}, kFloat);
  BufHandle mean("mean", {C_}, kFloat);
  BufHandle var("var", {C_}, kFloat);
  VarHandle eps("eps", kFloat);
  using axis = const VarHandle&;
  Tensor output =
      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
        // Compute affine terms.
        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
        auto weight_v = weight.load(c);
        auto bias_v = bias.load(c);
        auto alpha = inv_var * weight_v;
        auto beta = bias_v - mean.load(c) * alpha;
        return input.load(n, c, h, w) * alpha + beta;
      });
  LoopNest nest({output});
  auto loops = nest.getLoopStmtsFor(output);
  LoopNest::flatten({loops[2], loops[3]});
  loops = nest.getLoopStmtsFor(output);
  LoopNest::flatten({loops[0], loops[1]});
  loops = nest.getLoopStmtsFor(output);
  loops[0]->set_parallel();
  nest.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
  std::vector<CodeGen::CallArg> args;
  for (auto _ : state) {
    args.clear();
    output_ = at::empty_like(input_);
    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
      args.push_back(t.data_ptr<float>());
    }
    args.push_back(eps_);
    cg.call(args);
  }
 }
 BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
  for (auto _ : state) {
    output_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
    output_.relu_();
  }
 }
 BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
  BufHandle input("input", {N_, C_, H_, W_}, kFloat);
  BufHandle weight("weight", {C_}, kFloat);
  BufHandle bias("bias", {C_}, kFloat);
  BufHandle mean("mean", {C_}, kFloat);
  BufHandle var("var", {C_}, kFloat);
  VarHandle eps("eps", kFloat);
  using axis = const VarHandle&;
  Tensor output =
      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
        // Compute affine terms.
        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
        auto weight_v = weight.load(c);
        auto bias_v = bias.load(c);
        auto alpha = inv_var * weight_v;
        auto beta = bias_v - mean.load(c) * alpha;
        auto bn = input.load(n, c, h, w) * alpha + beta;
        return CompareSelect::make(bn, 0.f, 0.f, bn, kLT);
      });
  LoopNest nest({output});
  nest.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
  std::vector<CodeGen::CallArg> args;
  for (auto _ : state) {
    args.clear();
    output_ = at::empty_like(input_);
    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
      args.push_back(t.data_ptr<float>());
    }
    args.push_back(eps_);
    cg.call(args);
  }
 }
 BENCHMARK_REGISTER_F(BatchNorm, ATen)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});
 BENCHMARK_REGISTER_F(BatchNorm, NNC)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});
 BENCHMARK_REGISTER_F(BatchNorm, ATenRelu)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});
 BENCHMARK_REGISTER_F(BatchNorm, NNCRelu)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp
@ -1,71 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #ifdef TORCH_ENABLE_LLVM
 namespace te = torch::jit::tensorexpr;
 static void BM_CompileSwish(benchmark::State& state) {
  for (auto _ : state) {
    constexpr int N = 512;
    te::VarHandle n("n", te::kInt);
    te::BufHandle A("A", {N}, te::kFloat);
    te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
      return te::Max::make(A.load(i), 0.f, false);
    });
    te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
      return te::Min::make(relu.load(i), 6.f, false);
    });
    te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) {
      return min6.load(i) + 3.f;
    });
    te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
      return A.load(i) * plus3.load(i);
    });
    te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
      return times.load(i) * 1.f / 6.f;
    });
    te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
    for (auto tensor : {relu, min6, plus3, times}) {
      nest.computeInline(tensor.buf());
    }
    nest.prepareForCodegen();
    te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
    te::LLVMCodeGen cg(s, {A, sixth, n});
  }
 }
 static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
  constexpr int N = 512;
  te::VarHandle n("n", te::kInt);
  te::BufHandle A("A", {N}, te::kFloat);
  te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
    return te::Max::make(A.load(i), 0.f, false);
  });
  te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
    return te::Min::make(relu.load(i), 6.f, false);
  });
  te::Tensor plus3 = te::Compute(
      "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; });
  te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
    return A.load(i) * plus3.load(i);
  });
  te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
    return times.load(i) * 1.f / 6.f;
  });
  te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
  for (auto tensor : {relu, min6, plus3, times}) {
    nest.computeInline(tensor.buf());
  }
  nest.prepareForCodegen();
  te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
  for (auto _ : state) {
    te::LLVMCodeGen cg(s, {A, sixth, n});
  }
 }
 BENCHMARK(BM_CompileSwish);
 BENCHMARK(BM_CompileSwishLLVMOnly);
 #endif // TORCH_ENABLE_LLVM
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp
@ -1,293 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 using namespace torch::jit::tensorexpr;
 namespace {
 class ConcatBench : public benchmark::Fixture {
 public:
  void init(const std::vector<std::vector<int>> input_sizes, int concat_dim) {
    input_sizes_ = std::move(input_sizes);
    concat_dim_ = concat_dim;
    inputs_.resize(input_sizes_.size());
    for (const auto i : c10::irange(input_sizes_.size())) {
      inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
    }
    output_size_.resize(input_sizes_.front().size());
    for (const auto i : c10::irange(output_size_.size())) {
      if (i == static_cast<size_t>(concat_dim_)) {
        output_size_[i] = 0;
        for (const auto j : c10::irange(input_sizes_.size())) {
          output_size_[i] += input_sizes_[j][i];
        }
      } else {
        output_size_[i] = input_sizes_.front()[i];
      }
    }
    ref_ = at::cat(inputs_, concat_dim_);
    output_ = at::empty_like(ref_);
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(ref_, output_));
    state.counters["GB/s"] = benchmark::Counter(
        uint64_t(state.iterations()) * 2 * output_.nbytes(),
        benchmark::Counter::kIsRate);
  }
  void runATen(benchmark::State& state) {
    for (auto _ : state) {
      output_ = at::cat(inputs_, concat_dim_);
    }
  }
  void runNNC(benchmark::State& state) {
    size_t num_inputs = inputs_.size();
    size_t num_dims = 2;
    std::vector<BufHandle> inputs;
    for (size_t i = 0; i < num_inputs; ++i) {
      inputs.emplace_back(BufHandle(
          "input" + std::to_string(i),
          {input_sizes_[i][0], input_sizes_[i][1]},
          kFloat));
    }
    Tensor output = Compute(
        "aten_cat",
        {output_size_[0], output_size_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          int d = 0;
          std::vector<int> cumulative_concat_dim_sizes(num_inputs);
          for (const auto i : c10::irange(num_inputs)) {
            cumulative_concat_dim_sizes[i] = d;
            d += input_sizes_[i][concat_dim_];
          }
          auto load =
              inputs.back().load(m, n - cumulative_concat_dim_sizes.back());
          for (size_t i = num_inputs - 1; i > 0; --i) {
            load = ifThenElse(
                CompareSelect::make(
                    n, IntImm::make(cumulative_concat_dim_sizes[i]), kLT),
                inputs[i - 1].load(m, n - cumulative_concat_dim_sizes[i - 1]),
                load);
          }
          return load;
        });
    LoopNest nest({output});
    nest.prepareForCodegen();
    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
    buf_args.push_back(output);
    LLVMCodeGen cg(s, buf_args);
    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      for (const auto& inp : inputs_) {
        call_args.push_back(inp.data_ptr<float>());
      }
      call_args.push_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }
  void runNNCLoop(benchmark::State& state) {
    size_t num_inputs = inputs_.size();
    size_t num_dims = 2;
    TORCH_INTERNAL_ASSERT(concat_dim_ == 1);
    auto output_buf = alloc<Buf>(
        alloc<Var>("aten_cat", kHandle),
        std::vector<ExprPtr>(
            {alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
        kFloat);
    std::vector<BufHandle> inputs;
    std::vector<StmtPtr> for_stmts(num_inputs);
    int cumulative_input_sizes = 0;
    for (size_t i = 0; i < num_inputs; ++i) {
      inputs.emplace_back(BufHandle(
          "input" + std::to_string(i),
          {input_sizes_[i][0], input_sizes_[i][1]},
          kFloat));
      std::vector<VarPtr> for_vars(num_inputs);
      for (const auto d : c10::irange(num_dims)) {
        for_vars[d] =
            alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
      }
      auto store = alloc<Store>(
          output_buf,
          std::vector<ExprPtr>(
              {for_vars[0],
               alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
          alloc<Load>(
              inputs[i].node(),
              std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
      auto for_st = alloc<For>(
          for_vars[0],
          alloc<IntImm>(0),
          alloc<IntImm>(input_sizes_[i][0]),
          alloc<For>(
              for_vars[1],
              alloc<IntImm>(0),
              alloc<IntImm>(input_sizes_[i][1]),
              store));
      for_stmts[i] = for_st;
      cumulative_input_sizes += input_sizes_[i][1];
    }
    auto output = Tensor(output_buf, alloc<Block>(for_stmts));
    LoopNest nest({output});
    nest.prepareForCodegen();
    nest.vectorizeInnerLoops();
    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
    buf_args.push_back(output);
    LLVMCodeGen cg(s, buf_args);
    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      for (const auto& inp : inputs_) {
        call_args.push_back(inp.data_ptr<float>());
      }
      call_args.push_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }
  std::vector<std::vector<int>> input_sizes_;
  int concat_dim_;
  std::vector<at::Tensor> inputs_;
  std::vector<int> output_size_;
  at::Tensor output_;
  at::Tensor ref_;
 };
 class Concat2D2Input : public ConcatBench {
 public:
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)}, {state.range(2), state.range(3)}},
        state.range(4));
  }
 };
 } // namespace
 BENCHMARK_DEFINE_F(Concat2D2Input, ATen)(benchmark::State& state) {
  runATen(state);
 }
 BENCHMARK_DEFINE_F(Concat2D2Input, NNC)(benchmark::State& state) {
  runNNC(state);
 }
 BENCHMARK_DEFINE_F(Concat2D2Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
 }
 BENCHMARK_REGISTER_F(Concat2D2Input, ATen)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D2Input, NNC)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D2Input, NNCLoop)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});
 namespace {
 class Concat2D3Input : public ConcatBench {
 public:
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)},
         {state.range(2), state.range(3)},
         {state.range(4), state.range(5)}},
        state.range(6));
  }
 };
 } // namespace
 BENCHMARK_DEFINE_F(Concat2D3Input, ATen)(benchmark::State& state) {
  runATen(state);
 }
 BENCHMARK_DEFINE_F(Concat2D3Input, NNC)(benchmark::State& state) {
  runNNC(state);
 }
 BENCHMARK_DEFINE_F(Concat2D3Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
 }
 BENCHMARK_REGISTER_F(Concat2D3Input, ATen)->Args({8, 512, 8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D3Input, NNC)->Args({8, 512, 8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D3Input, NNCLoop)
    ->Args({8, 512, 8, 512, 8, 512, 1});
 namespace {
 class Concat2D7Input : public ConcatBench {
 public:
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)},
         {state.range(2), state.range(3)},
         {state.range(4), state.range(5)},
         {state.range(6), state.range(7)},
         {state.range(8), state.range(9)},
         {state.range(10), state.range(11)},
         {state.range(12), state.range(13)}},
        state.range(14));
  }
 };
 } // namespace
 BENCHMARK_DEFINE_F(Concat2D7Input, ATen)(benchmark::State& state) {
  runATen(state);
 }
 BENCHMARK_DEFINE_F(Concat2D7Input, NNC)(benchmark::State& state) {
  runNNC(state);
 }
 BENCHMARK_DEFINE_F(Concat2D7Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
 }
 BENCHMARK_REGISTER_F(Concat2D7Input, ATen)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D7Input, NNC)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
 BENCHMARK_REGISTER_F(Concat2D7Input, NNCLoop)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
--- a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp
@ -1,59 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <c10/core/InferenceMode.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/codegen/fuser/interface.h>
 #include <torch/torch.h>
 using namespace torch::jit;
 static const std::string two_adds = R"JIT(
 def two_adds(self, x: Tensor, y: Tensor, z: Tensor) -> Tensor:
    return x + y + z
 )JIT";
 static void FusedOverhead(benchmark::State& state) {
  c10::InferenceMode mode;
  overrideCanFuseOnCPU(true);
  Module m("m");
  m.define(two_adds);
  auto x = torch::ones({1});
  auto y = torch::ones({1});
  auto z = torch::ones({1});
  // Warmup.
  for (const auto i : c10::irange(8)) {
    (void)i; // Suppress unused variable warning
    m.run_method("two_adds", x, y, z);
  }
  for (auto _ : state) {
    m.run_method("two_adds", x, y, z);
  }
 }
 static void UnfusedOverhead(benchmark::State& state) {
  c10::InferenceMode guard;
  overrideCanFuseOnCPU(false);
  Module m("m");
  m.define(two_adds);
  auto x = torch::ones({1});
  auto y = torch::ones({1});
  auto z = torch::ones({1});
  // Warmup.
  for (const auto i : c10::irange(8)) {
    (void)i; // Suppress unused variable warning
    m.run_method("two_adds", x, y, z);
  }
  for (auto _ : state) {
    m.run_method("two_adds", x, y, z);
  }
 }
 BENCHMARK(FusedOverhead);
 BENCHMARK(UnfusedOverhead);
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
@ -1,313 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 namespace te = torch::jit::tensorexpr;
 namespace {
 class Gemm : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    M = state.range(0);
    N = state.range(1);
    K = state.range(2);
    A = torch::randn({M, K});
    B = torch::randn({K, N});
    C = torch::mm(A, B);
  }
  void TearDown(benchmark::State& state) override {
    state.counters["GFLOPS"] = benchmark::Counter(
        uint64_t(state.iterations()) * 2 * M * N * K,
        benchmark::Counter::kIsRate);
  }
  int M;
  int N;
  int K;
  at::Tensor A;
  at::Tensor B;
  at::Tensor C;
 };
 } // namespace
 BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    torch::mm_out(C, A, B);
  }
 }
 BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
  te::BufHandle AP("A", {M, K}, te::kFloat);
  te::BufHandle BP("B", {K, N}, te::kFloat);
  te::Tensor CT = te::Reduce(
      "gemm",
      {M, N},
      te::Sum(),
      [&](const te::ExprHandle& m,
          const te::ExprHandle& n,
          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
      {K});
  te::LoopNest loop({CT});
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
  for (auto _ : state) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
  }
 }
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
  te::BufHandle AP("A", {M, K}, te::kFloat);
  te::BufHandle BP("B", {K, N}, te::kFloat);
  te::Tensor CT = te::Reduce(
      "gemm",
      {M, N},
      te::Sum(),
      [&](const te::ExprHandle& m,
          const te::ExprHandle& n,
          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
      {K});
  te::LoopNest loop({CT});
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr m = loops[0];
    loop.splitWithMask(m, 32);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr n = loops[2];
    loop.splitWithMask(n, 32);
  }
  // mo, mi, no, ni, k ->
  // mo, no, mi, ni, k
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[1];
    te::ForPtr no = loops[2];
    loop.reorderAxis(mi, no);
  }
  // mo, no, mi, ni, k ->
  // mo, no, mi, k, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr ni = loops[3];
    te::ForPtr k = loops[4];
    loop.reorderAxis(ni, k);
  }
  // mo, no, mi, k, ni ->
  // mo, no, k, mi, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[2];
    te::ForPtr k = loops[3];
    loop.reorderAxis(mi, k);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
  for (auto _ : state) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
  }
 }
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
  te::BufHandle AP("A", {M, K}, te::kFloat);
  te::BufHandle BP("B", {K, N}, te::kFloat);
  te::Tensor CT = te::Reduce(
      "gemm",
      {M, N},
      te::Sum(),
      [&](const te::ExprHandle& m,
          const te::ExprHandle& n,
          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
      {K});
  te::LoopNest loop({CT});
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr m = loops[0];
    loop.splitWithMask(m, 4);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr n = loops[2];
    loop.splitWithMask(n, 16);
  }
  // mo, mi, no, ni, k ->
  // mo, no, mi, ni, k
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[1];
    te::ForPtr no = loops[2];
    loop.reorderAxis(mi, no);
  }
  // mo, no, mi, ni, k ->
  // mo, no, mi, k, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr ni = loops[3];
    te::ForPtr k = loops[4];
    loop.reorderAxis(ni, k);
  }
  // mo, no, mi, k, ni ->
  // mo, no, k, mi, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[2];
    te::ForPtr k = loops[3];
    loop.reorderAxis(mi, k);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
  for (auto _ : state) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
  }
 }
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
  te::BufHandle AP("A", {M, K}, te::kFloat);
  te::BufHandle BP("B", {K, N}, te::kFloat);
  te::Tensor CT = te::Reduce(
      "gemm",
      {M, N},
      te::Sum(),
      [&](const te::ExprHandle& m,
          const te::ExprHandle& n,
          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
      {K});
  te::LoopNest loop({CT});
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr m = loops[0];
    loop.splitWithMask(m, 4);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr n = loops[2];
    loop.splitWithMask(n, 16);
  }
  // mo, mi, no, ni, k ->
  // mo, no, mi, ni, k
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[1];
    te::ForPtr no = loops[2];
    loop.reorderAxis(mi, no);
  }
  // mo, no, mi, ni, k ->
  // mo, no, mi, k, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr ni = loops[3];
    te::ForPtr k = loops[4];
    loop.reorderAxis(ni, k);
  }
  // mo, no, mi, k, ni ->
  // mo, no, k, mi, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[2];
    te::ForPtr k = loops[3];
    loop.reorderAxis(mi, k);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[3];
    te::ForPtr ni = loops[4];
    te::StmtPtr unrolled;
    loop.vectorize(ni);
    loop.fullUnroll(mi, &unrolled);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
  for (auto _ : state) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
  }
 }
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
  te::BufHandle AP("A", {M, K}, te::kFloat);
  te::BufHandle BP("B", {K, N}, te::kFloat);
  te::Tensor CT = te::Reduce(
      "gemm",
      {M, N},
      te::Sum(),
      [&](const te::ExprHandle& m,
          const te::ExprHandle& n,
          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
      {K});
  te::LoopNest loop({CT});
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr m = loops[0];
    loop.splitWithMask(m, 4);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr n = loops[2];
    loop.splitWithMask(n, 16);
  }
  // mo, mi, no, ni, k ->
  // mo, no, mi, ni, k
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[1];
    te::ForPtr no = loops[2];
    loop.reorderAxis(mi, no);
  }
  // mo, no, mi, ni, k ->
  // mo, no, mi, k, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr ni = loops[3];
    te::ForPtr k = loops[4];
    loop.reorderAxis(ni, k);
  }
  // mo, no, mi, k, ni ->
  // mo, no, k, mi, ni
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    te::ForPtr mi = loops[2];
    te::ForPtr k = loops[3];
    loop.reorderAxis(mi, k);
  }
  {
    auto const& loops = loop.getLoopStmtsFor(CT);
    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
  for (auto _ : state) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
  }
 }
 BENCHMARK_REGISTER_F(Gemm, Torch)->Args({128, 128, 128});
 BENCHMARK_REGISTER_F(Gemm, TensorExprNoopt)->Args({128, 128, 128});
 BENCHMARK_REGISTER_F(Gemm, TensorExprTile32x32)->Args({128, 128, 128});
 BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16)->Args({128, 128, 128});
 BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16VecUnroll)->Args({128, 128, 128});
 BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16Cache)->Args({128, 128, 128});
--- a/benchmarks/cpp/tensorexpr/bench_kernels.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_kernels.cpp
@ -1,101 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <ATen/code_template.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/tensorexpr/kernel.h>
 using namespace torch::jit;
 using namespace torch::jit::tensorexpr;
 static const std::string kernel_static_shapes_template = R"IR(
    graph(%0 : Float(${dim}, strides=[1], device=cpu),
          %1 : Float(${dim}, strides=[1], device=cpu)):
        %2 : Float(${dim}, strides=[1]) = aten::mul(%0, %1)
        %4 : Float(${dim}, strides=[1]) = aten::mul(%0, %2)
        return (%4))IR";
 static const std::string kernel_symbolic_shapes = R"IR(
    graph(%0 : Float(SS(-2), strides=[1], device=cpu),
          %1 : Float(SS(-2), strides=[1], device=cpu),
          %SS_2 : int):
        %2 : Float(SS(-2), strides=[1]) = aten::mul(%0, %1)
        %4 : Float(SS(-2), strides=[1]) = aten::mul(%0, %2)
        return (%4))IR";
 class KernelBench : public benchmark::Fixture {
 public:
  void Eager(benchmark::State& state) {
    auto dim = state.range(0);
    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    for (auto _ : state) {
      auto o = at::mul(a, at::mul(a, b));
    }
  }
  void GraphWithStaticShapes(benchmark::State& state) {
    auto dim = state.range(0);
    auto graph = std::make_shared<Graph>();
    at::jit::TemplateEnv env;
    env.d("dim", dim);
    const auto kernel_static_shapes =
        format(kernel_static_shapes_template, env);
    parseIR(kernel_static_shapes, &*graph);
    TensorExprKernel k(graph);
    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    std::vector<at::Tensor> inputs = {a, b};
    for (auto _ : state) {
      std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
      k.run(stack);
    }
  }
  void GraphWithSymbolicShapes(benchmark::State& state) {
    auto dim = state.range(0);
    auto graph = std::make_shared<Graph>();
    parseIR(kernel_symbolic_shapes, &*graph);
    std::vector<torch::jit::StrideInput> input_desc = {
        torch::jit::StrideInput::TENSOR_CONT};
    std::unordered_map<
        const torch::jit::Value*,
        std::vector<torch::jit::StrideInput>>
        symbolic_strides;
    symbolic_strides[graph->inputs().at(0)] = input_desc;
    symbolic_strides[graph->inputs().at(1)] = input_desc;
    symbolic_strides[graph->outputs().at(0)] = input_desc;
    std::vector<int64_t> symbolic_shape_inputs = {-2};
    TensorExprKernel k(
        graph, {}, symbolic_shape_inputs, false, symbolic_strides);
    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    std::vector<at::Tensor> inputs = {a, b};
    for (auto _ : state) {
      std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
      stack.push_back(dim);
      k.run(stack);
    }
  }
 };
 BENCHMARK_DEFINE_F(KernelBench, Eager)(benchmark::State& state) {
  Eager(state);
 }
 BENCHMARK_DEFINE_F(KernelBench, StaticShapes)(benchmark::State& state) {
  GraphWithStaticShapes(state);
 }
 BENCHMARK_DEFINE_F(KernelBench, SymbolicShapes)(benchmark::State& state) {
  GraphWithSymbolicShapes(state);
 }
 BENCHMARK_REGISTER_F(KernelBench, Eager)->Range(32, 2048);
 BENCHMARK_REGISTER_F(KernelBench, StaticShapes)->Range(32, 2048);
 BENCHMARK_REGISTER_F(KernelBench, SymbolicShapes)->Range(32, 2048);
--- a/benchmarks/cpp/tensorexpr/bench_ops.py
+++ b/benchmarks/cpp/tensorexpr/bench_ops.py
@ -1,113 +0,0 @@
 import timeit
 import torch
 import torch.nn.functional as F
 torch._C._jit_override_can_fuse_on_cpu(True)
 torch._C._debug_set_fusion_group_inlining(False)
 torch.set_num_threads(1)
 def hardswish(x):
    return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0
 unary_ops = [
    hardswish,
    torch._C._nn.hardswish,
    torch.sigmoid,
    torch.reciprocal,
    torch.neg,
    torch.relu,
    torch.isnan,
    torch.log,
    torch.log10,
    torch.log1p,
    torch.log2,
    torch.exp,
    torch.expm1,
    torch.erf,
    torch.erfc,
    torch.cos,
    torch.sin,
    torch.tan,
    torch.acos,
    torch.asin,
    torch.cosh,
    torch.sinh,
    torch.atan,
    torch.tanh,
    torch.sqrt,
    torch.rsqrt,
    torch.abs,
    torch.ceil,
    torch.floor,
    torch.round,
    torch.trunc,
    torch.lgamma,
 ]
 print(f"{'op':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
 for op in unary_ops:
    x = torch.rand((1024, 1024))
    traced = torch.jit.trace(op, (x))
    # Warmup.
    warmup_iters = 8
    for _ in range(warmup_iters):
        op(x)
        traced(x)
    # Validate result.
    torch.testing.assert_close(op(x), traced(x))
    # Benchmark.
    bench_iters = 100
    teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
    tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
    print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
 def test_batch_norm():
    op = F.batch_norm
    print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
    batch_norm_shapes = [
        [1, 64, 112, 112],
        [1, 256, 14, 14],
        [1, 128, 28, 28],
        [1, 64, 56, 56],
        [1, 512, 7, 7],
        [5, 64, 112, 112],
        [5, 256, 14, 14],
        [5, 128, 28, 28],
        [5, 64, 56, 56],
        [5, 512, 7, 7],
    ]
    for n, c, h, w in batch_norm_shapes:
        x = torch.rand((n, c, h, w))
        y = torch.rand(c)
        z = torch.rand(c)
        traced = torch.jit.trace(op, (x, y, z))
        # Warmup.
        warmup_iters = 8
        for _ in range(warmup_iters):
            op(x, y, z)
            traced(x, y, z)
        # Validate result.
        torch.testing.assert_close(op(x, y, z), traced(x, y, z))
        # Benchmark.
        bench_iters = 100
        teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters)
        tjit = timeit.timeit(
            stmt="traced(x, y, z)", globals=locals(), number=bench_iters
        )
        print(
            f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}"
        )
 test_batch_norm()
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
@ -1,71 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 #include <immintrin.h>
 namespace torch {
 namespace jit {
 namespace tensorexpr {
 class ParallelAdd : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(4);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    A = torch::randn({M});
    B = torch::randn({M});
    C = torch::zeros({M});
  }
  void TearDown(benchmark::State& state) override {
    state.counters["tasks"] = benchmark::Counter(
        uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate);
  }
  int M;
  at::Tensor A;
  at::Tensor B;
  at::Tensor C;
 };
 BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
  BufHandle a_buf("a", {M}, kFloat);
  BufHandle b_buf("b", {M}, kFloat);
  Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) {
    return a_buf.load(m) + b_buf.load(m);
  });
  LoopNest loop_nest({c_tensor});
  auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
  ForPtr m = loops[0];
  m->set_parallel();
  loop_nest.prepareForCodegen();
  StmtPtr stmt = loop_nest.root_stmt();
  LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});
  float* a_ptr = A.data_ptr<float>();
  float* b_ptr = B.data_ptr<float>();
  float* c_ptr = C.data_ptr<float>();
  std::vector<void*> args({c_ptr, a_ptr, b_ptr});
  cg.value<int>(args);
  for (const auto i : c10::irange(M)) {
    float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
    TORCH_CHECK(diff < 1e-5);
  }
  for (auto _ : state) {
    cg.value<int>(args);
  }
 }
 BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
--- a/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp
@ -1,395 +0,0 @@
 #include <benchmark/benchmark.h>
 #include "ATen/Functions.h"
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/operators/operators.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 #include <immintrin.h>
 using namespace torch::jit::tensorexpr;
 namespace {
 #ifdef __AVX2__
 #define _mm256_slli_si1(x)                                                   \
  _mm256_blend_epi32(                                                        \
      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)), \
      _mm256_setzero_si256(),                                                \
      1)
 #define _mm256_slli_si2(x)                                                   \
  _mm256_blend_epi32(                                                        \
      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(5, 4, 3, 2, 1, 0, 7, 6)), \
      _mm256_setzero_si256(),                                                \
      3)
 #define _mm256_slli_si4(x)                                                   \
  _mm256_blend_epi32(                                                        \
      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4)), \
      _mm256_setzero_si256(),                                                \
      15)
 __m256 PrefixSum(__m256 x) {
  x = _mm256_add_ps(x, _mm256_slli_si1(x));
  x = _mm256_add_ps(x, _mm256_slli_si2(x));
  x = _mm256_add_ps(x, _mm256_slli_si4(x));
  return x; // local prefix sums
 }
 __m256i PrefixSumInt(__m256i x) {
  x = _mm256_add_epi32(x, _mm256_slli_si1(x));
  x = _mm256_add_epi32(x, _mm256_slli_si2(x));
  x = _mm256_add_epi32(x, _mm256_slli_si4(x));
  return x; // local prefix sums
 }
 // Util function to log the given value. Not used during benchmarking.
 template <class T>
 inline void Log(const __m256i& value) {
  const size_t n = sizeof(__m256i) / sizeof(T);
  T buffer[n];
  _mm256_storeu_si256((__m256i*)buffer, value);
  for (int i = 0; i < n; i++)
    std::cout << buffer[n - i - 1] << " ";
  std::cout << std::endl;
 }
 #endif
 #ifdef __AVX512F__
 #define _mm512_slli_si512(x, k) \
  _mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k)
 __m512 PrefixSum(__m512 x) {
  x = _mm512_add_ps(x, _mm512_slli_si512(x, 1));
  x = _mm512_add_ps(x, _mm512_slli_si512(x, 2));
  x = _mm512_add_ps(x, _mm512_slli_si512(x, 4));
  x = _mm512_add_ps(x, _mm512_slli_si512(x, 8));
  return x; // local prefix sums
 }
 __m512i PrefixSumInt(__m512i x) {
  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 1));
  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 2));
  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 4));
  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 8));
  return x; // local prefix sums
 }
 template <int index>
 float _mm512_extract_f32(__m512 target) {
  return _mm512_cvtss_f32(_mm512_alignr_epi32(target, target, index));
 }
 // extract the last i32 from target
 int _mm512_extract_epi32(__m512i target) {
  __m256i x = _mm512_extracti32x8_epi32(target, 1);
  return _mm256_extract_epi32(x, 7);
 }
 void PrefixSum(float* output_data, float* input_data, size_t input_size) {
  float carry = 0.0f;
  for (int i = 0; i < input_size / 16; i++) {
    __m512 x = _mm512_loadu_ps(input_data + i * 16);
    x = PrefixSum(x);
    x = _mm512_add_ps(x, _mm512_set1_ps(carry));
    carry = _mm512_extract_f32<15>(x);
    _mm512_storeu_ps((__m512*)(output_data + i * 16), x);
  }
 }
 void PrefixSum(int* output_data, int* input_data, size_t input_size) {
  int carry = 0;
  for (int i = 0; i < input_size / 16; i++) {
    __m512i x = _mm512_loadu_epi32(input_data + i * 16);
    x = PrefixSumInt(x);
    x = _mm512_add_epi32(x, _mm512_set1_epi32(carry));
    carry = _mm512_extract_epi32(x);
    _mm512_storeu_epi32((__m512i*)(output_data + i * 16), x);
  }
 }
 #endif
 // PrefixSum: the same as inclusive scan
 class PrefixSumBench : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    input_size_ = state.range(0);
    input_ = torch::rand(input_size_);
    ref_ = prefixSum(input_);
    // no type promotion. Default is int->long.
    input_int_ = torch::randint(1000, {input_size_}, at::kInt);
    ref_int_ = at::cumsum(input_int_, 0, at::kInt);
  }
  void TearDown(benchmark::State& state) override {
    if (output_.numel() > 0) {
      if (output_.numel() == ref_.numel()) {
        TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
      }
      state.counters["GB/s"] = benchmark::Counter(
          uint64_t(state.iterations()) * 2 * output_.nbytes(),
          benchmark::Counter::kIsRate);
    } else {
      if (output_int_.numel() == ref_int_.numel()) {
        TORCH_CHECK(ref_int_.equal(output_int_));
      }
      state.counters["GB/s"] = benchmark::Counter(
          uint64_t(state.iterations()) * 2 * output_int_.nbytes(),
          benchmark::Counter::kIsRate);
    }
  }
  at::Tensor prefixSum(const at::Tensor& inp) {
    return at::cumsum(inp, 0);
  }
  void runATen(benchmark::State& state) {
    output_ = prefixSum(input_);
    for (auto _ : state) {
      at::cumsum_out(output_, input_, 0);
    }
  }
  void runLocal(benchmark::State& state) {
    output_ = at::empty_like(ref_);
    for (auto _ : state) {
      auto input_data = input_.data_ptr<float>();
      auto output_data = output_.data_ptr<float>();
      float sum = 0.0f;
      for (int i = 0; i < input_size_; ++i) {
        sum = sum + input_data[i];
        output_data[i] = sum;
      }
    }
  }
  // no type promotion
  void runLocalInt(benchmark::State& state) {
    output_int_ = at::empty_like(input_int_);
    for (auto _ : state) {
      auto input_data = input_int_.data_ptr<int>();
      auto output_data = output_int_.data_ptr<int>();
      int sum = 0;
      for (int i = 0; i < input_size_; ++i) {
        sum = sum + input_data[i];
        output_data[i] = sum;
      }
    }
  }
  void runNNC(benchmark::State& state) {
    BufHandle input("input", {input_size_}, kFloat);
    BufHandle output("output", {input_size_}, kFloat);
    BufHandle s("s", {1}, kFloat);
    VarHandle i("i", kInt);
    auto allocS = Allocate::make(s);
    auto initS = Store::make(s, {0}, 0.0f);
    auto accumS = Store::make(
        s, {0}, Add::make(Load::make(s, {0}), Load::make(input, {i})));
    auto store = Store::make(output, {i}, Load::make(s, {0}));
    auto forI = For::make(i, 0, input_size_, Block::make({accumS, store}));
    auto freeS = Free::make(s);
    auto par = Block::make({allocS, initS, forI, freeS});
    LoopNest nest(par, {output.node()});
    std::vector<CodeGen::BufferArg> buf_args;
    buf_args.emplace_back(input);
    buf_args.emplace_back(output);
    LLVMCodeGen cg(nest.root_stmt(), buf_args);
    std::vector<CodeGen::CallArg> call_args;
    output_ = at::empty_like(ref_);
    for (auto _ : state) {
      call_args.clear();
      call_args.emplace_back(input_.data_ptr<float>());
      call_args.emplace_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }
 #ifdef __AVX2__
  void runLocalAVX2(benchmark::State& state) {
    output_ = at::empty_like(ref_);
    for (auto _ : state) {
      float* input_data = input_.data_ptr<float>();
      float* output_data = output_.data_ptr<float>();
      float carry = 0.0f;
      for (int i = 0; i < input_size_ / 8; i++) {
        __m256 x = _mm256_loadu_ps(input_data + i * 8);
        x = PrefixSum(x);
        x = _mm256_add_ps(x, _mm256_set1_ps(carry));
        (reinterpret_cast<__m256*>(output_data))[i] = x;
        carry = _mm256_cvtss_f32(_mm256_permutevar8x32_ps(
            x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)));
      }
    }
  }
  void runLocalIntAVX2(benchmark::State& state) {
    output_int_ = at::empty_like(input_int_);
    for (auto _ : state) {
      auto input_data = input_int_.data_ptr<int>();
      auto output_data = output_int_.data_ptr<int>();
      int carry = 0;
      for (size_t i = 0; i < input_size_ / 8; i++) {
        __m256i x = _mm256_loadu_si256((__m256i*)(input_data + i * 8));
        x = PrefixSumInt(x);
        x = _mm256_add_epi32(x, _mm256_set1_epi32(carry));
        _mm256_storeu_si256((__m256i*)(output_data + i * 8), x);
        carry = _mm256_extract_epi32(x, 7);
      }
    }
  }
 #endif
 #ifdef __AVX512F__
  void runLocalAVX512(benchmark::State& state) {
    output_ = at::empty_like(ref_);
    for (auto _ : state) {
      auto input_data = input_.data_ptr<float>();
      auto output_data = output_.data_ptr<float>();
      PrefixSum(output_data, input_data, input_size_);
    }
  }
  void runLocalIntAVX512(benchmark::State& state) {
    output_int_ = at::empty_like(input_int_);
    for (auto _ : state) {
      auto input_data = input_int_.data_ptr<int>();
      auto output_data = output_int_.data_ptr<int>();
      PrefixSum(output_data, input_data, input_size_);
    }
  }
  void runExclusiveScanAVX512(benchmark::State& state) {
    output_ = at::empty({input_size_ + 1}, at::kFloat);
    for (auto _ : state) {
      auto input_data = input_.data_ptr<float>();
      auto output_data = output_.data_ptr<float>();
      output_data[0] = 0.0f;
      PrefixSum(output_data + 1, input_data, input_size_);
    }
  }
  void runExclusiveScanIntAVX512(benchmark::State& state) {
    output_int_ = at::empty({input_size_ + 1}, at::kInt);
    for (auto _ : state) {
      auto input_data = input_int_.data_ptr<int>();
      auto output_data = output_int_.data_ptr<int>();
      output_data[0] = 0;
      PrefixSum(output_data + 1, input_data, input_size_);
    }
  }
 #endif
 private:
  int input_size_;
  at::Tensor input_;
  at::Tensor output_;
  at::Tensor ref_;
  at::Tensor input_int_;
  at::Tensor output_int_;
  at::Tensor ref_int_; // no type promotion
 };
 } // namespace
 BENCHMARK_DEFINE_F(PrefixSumBench, ATen)(benchmark::State& state) {
  runATen(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, Local)(benchmark::State& state) {
  runLocal(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, LocalInt)(benchmark::State& state) {
  runLocalInt(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, NNC)(benchmark::State& state) {
  runNNC(state);
 }
 #ifdef __AVX2__
 BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX2)(benchmark::State& state) {
  runLocalAVX2(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX2)(benchmark::State& state) {
  runLocalIntAVX2(state);
 }
 #endif
 #ifdef __AVX512F__
 BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX512)(benchmark::State& state) {
  runLocalAVX512(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX512)(benchmark::State& state) {
  runLocalIntAVX512(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanAVX512)
 (benchmark::State& state) {
  runExclusiveScanAVX512(state);
 }
 BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanIntAVX512)
 (benchmark::State& state) {
  runExclusiveScanIntAVX512(state);
 }
 #endif
 //---------- float benchmarks ----------//
 BENCHMARK_REGISTER_F(PrefixSumBench, ATen)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 BENCHMARK_REGISTER_F(PrefixSumBench, NNC)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 BENCHMARK_REGISTER_F(PrefixSumBench, Local)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #ifdef __AVX2__
 BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX2)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #endif
 #ifdef __AVX512F__
 BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX512)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanAVX512)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #endif
 //---------- int benchmarks ----------//
 BENCHMARK_REGISTER_F(PrefixSumBench, LocalInt)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #ifdef __AVX2__
 BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX2)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #endif
 #ifdef __AVX512F__
 BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX512)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanIntAVX512)
    ->RangeMultiplier(4)
    ->Ranges({{1 << 6, 1 << 20}});
 #endif
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@ -1,621 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/operators/operators.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 #include <immintrin.h>
 namespace te = torch::jit::tensorexpr;
 namespace {
 class Reduce1D : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(1);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    A = torch::randn({M});
    B = torch::zeros({});
    ref = torch::sum(A, {0});
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-7));
    state.counters["BYTES"] = benchmark::Counter(
        uint64_t(state.iterations()) * M * sizeof(float),
        benchmark::Counter::kIsRate);
  }
  int M;
  at::Tensor A;
  at::Tensor B;
  at::Tensor ref;
 };
 } // namespace
 BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    B = torch::sum(A, {0});
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
 #define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))
 template <typename Func>
 void ValidateFunc(
    Func func,
    const std::string& func_name,
    at::Tensor& A,
    at::Tensor& B) {
  func(A, B);
  float* pB = B.data_ptr<float>();
  at::Tensor B2 = torch::sum(A, {0});
  float* pB2 = B2.data_ptr<float>();
  int size = A.numel();
  float size_sqrt = std::sqrt(size);
  float natural_noise = size_sqrt * 1e-7;
  if (!torch::allclose(B, B2, natural_noise)) {
    std::ostringstream oss;
    oss << func_name << " failed check: " << std::endl;
    oss << "value: " << B << std::endl;
    ;
    oss << "reference: " << B2 << std::endl;
    oss << "threshold: " << natural_noise << std::endl;
    throw std::runtime_error(oss.str());
  }
 }
 static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  TORCH_CHECK(B.numel() == 1);
  *pB = 0.;
  for (const auto i : c10::irange(size)) {
    *pB += pA[i];
  }
 }
 BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
  VALIDATE(reduce1d_naive, A, B);
  for (auto _ : state) {
    reduce1d_naive(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
 static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = 16;
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  float temp[kChunkSize];
  for (const auto j : c10::irange(kChunkSize)) {
    temp[j] = 0;
  }
  int chunk_count = size / kChunkSize;
  for (const auto i : c10::irange(chunk_count)) {
    for (const auto j : c10::irange(kChunkSize)) {
      temp[j] += pA[i * kChunkSize + j];
    }
  }
  for (const auto j : c10::irange(kChunkSize)) {
    *pB += temp[j];
  }
 }
 BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
  VALIDATE(reduce1d_native_rfactor, A, B);
  for (auto _ : state) {
    reduce1d_native_rfactor(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});
 #ifdef USE_AVX2
 // x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
 inline float sum_f32x8(__m256 x) {
  // hiQuad = ( x7, x6, x5, x4 )
  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
  // loQuad = ( x3, x2, x1, x0 )
  const __m128 loQuad = _mm256_castps256_ps128(x);
  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
  // loDual = ( -, -, x1 + x5, x0 + x4 )
  const __m128 loDual = sumQuad;
  // hiDual = ( -, -, x3 + x7, x2 + x6 )
  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
  const __m128 lo = sumDual;
  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
 }
 static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  __m256 temp;
  temp = _mm256_setzero_ps();
  int tile_count = size / kChunkSize;
  for (const auto i : c10::irange(tile_count)) {
    __m256 data = _mm256_load_ps(pA + i * kChunkSize);
    temp = _mm256_add_ps(temp, data);
  }
  float result = sum_f32x8(temp);
  *pB = result;
 }
 BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) {
  VALIDATE(reduce1d_native_vector, A, B);
  for (auto _ : state) {
    reduce1d_native_vector(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24});
 static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
  static constexpr int kTileSize = 4;
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1");
  TORCH_CHECK(
      size % kChunkSize == 0,
      "Invalid size: ",
      size,
      " % ",
      kChunkSize,
      " ! = 0");
  __m256 t[kTileSize];
  for (const auto j : c10::irange(kTileSize)) {
    t[j] = _mm256_setzero_ps();
  }
  int tile_count = size / kChunkSize / kTileSize;
  for (int i = 0; i < tile_count; i++) {
 #pragma unroll
    for (int j = 0; j < kTileSize; j++) {
      float* p = pA + (i * kTileSize + j) * kChunkSize;
      __m256 data = _mm256_loadu_ps(p);
      t[j] = _mm256_add_ps(t[j], data);
    }
  }
  float result = sum_f32x8(t[0]);
  for (const auto j : c10::irange(1, kTileSize)) {
    result += sum_f32x8(t[j]);
  }
  *pB = result;
 }
 BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) {
  VALIDATE(reduce1d_native_tiled, A, B);
  for (auto _ : state) {
    reduce1d_native_tiled(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});
 #endif // USE_AVX2
 BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
  int M = A.numel();
  te::BufHandle AP("A", {M}, te::kFloat);
  te::Tensor BT = te::Reduce(
      "reduce_full",
      {1},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {M});
  te::LoopNest loop({BT});
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };
  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});
 BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
  int M = A.numel();
  te::BufHandle AP("A", {M}, te::kFloat);
  te::Tensor BT = te::Reduce(
      "reduce_full",
      {1},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {M});
  te::LoopNest loop({BT});
  const int kChunkSize = 8;
  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::ForPtr m = loops[1];
    loop.splitWithTail(m, kChunkSize);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };
  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
 BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
  int M = A.numel();
  te::BufHandle AP("A", {M}, te::kFloat);
  te::Tensor BT = te::Reduce(
      "reduce_full",
      {1},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {M});
  te::LoopNest loop({BT});
  const int kChunkSize = 8;
  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::ForPtr m = loops[1];
    loop.splitWithMask(m, kChunkSize);
  }
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };
  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});
 BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
  int M = A.numel();
  const int kChunkSize = 8;
  TORCH_CHECK(M % kChunkSize == 0);
  te::BufHandle AP("A", {M}, te::kFloat);
  te::Tensor BT = te::Reduce(
      "reduce_full",
      {},
      te::Sum(),
      [&](const te::ExprHandle& m) { return AP.load(m); },
      {M});
  te::LoopNest loop({BT});
  te::BufPtr rfac_buf;
  auto loops = loop.getLoopStmtsFor(BT);
  TORCH_CHECK(loops.size() == 1);
  te::ForPtr mi;
  loop.splitWithMask(loops.at(0), kChunkSize, &mi);
  te::ForPtr mo = loops.at(0);
  loop.reorderAxis(mo, mi);
  loops = loop.getLoopStmtsFor(BT);
  auto bt_body = loop.getAllWritesToBuf(BT.buf())[1];
  TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
  loop.reorderAxis(loops.at(0), loops.at(1));
  loops = loop.getAllInnermostLoopsWritingToBuf(rfac_buf);
  TORCH_CHECK(loops.size() == 2);
  loop.vectorize(loops.at(1));
  loop.prepareForCodegen();
  te::StmtPtr s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };
  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});
 BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
  const int M = A.numel();
  const int kChunkSize = 8;
  te::BufHandle a("A", {M}, te::kFloat);
  te::Tensor b = te::computeSum(
      {a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU);
  te::LoopNest nest({b});
  auto loops = nest.getLoopStmtsFor(b);
  te::ForPtr mi, mo;
  te::BufPtr rf;
  nest.splitWithMask(loops[0], kChunkSize, &mi);
  loops = nest.reorder({loops[0], mi}, {1, 0});
  nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf);
  nest.reorderAxis(loops[0], loops[1]);
  for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
    nest.vectorize(loop);
  }
  nest.prepareForCodegen();
  nest.simplify();
  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
  for (auto _ : state) {
    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
  }
 }
 BENCHMARK_REGISTER_F(Reduce1D, Op)->Args({1 << 24});
 class Reduce2DCol : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(1);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    N = state.range(1);
    A = torch::randn({M, N});
    ref = torch::sum(A, {0});
    B = torch::zeros_like(ref);
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-5));
    state.counters["BYTES"] = benchmark::Counter(
        uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
        benchmark::Counter::kIsRate);
  }
  int M;
  int N;
  at::Tensor A;
  at::Tensor B;
  at::Tensor ref;
 };
 BENCHMARK_DEFINE_F(Reduce2DCol, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    B = torch::sum(A, {0});
  }
 }
 BENCHMARK_REGISTER_F(Reduce2DCol, Torch)
    ->Args({1 << 3, 1 << 21})
    ->Args({1 << 6, 1 << 18})
    ->Args({1 << 12, 1 << 12});
 BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
  constexpr int kCacheSize = 1 << 12;
  te::BufHandle a("A", {M, N}, te::kFloat);
  te::Tensor b = te::computeSum(
      {a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU);
  te::LoopNest nest({b});
  auto sch = state.range(2);
  if (sch == 0) {
  } else if (sch == 1) {
    auto loops = nest.getLoopStmtsFor(b);
    nest.reorderAxis(loops[0], loops[1]);
  } else if (sch == 2) {
    auto loops = nest.getLoopStmtsFor(b);
    nest.splitWithTail(loops[0], kCacheSize);
    loops = nest.getLoopStmtsFor(b);
    nest.reorderAxis(loops[1], loops[2]);
  } else if (sch == 3) {
    auto loops = nest.getLoopStmtsFor(b);
    nest.splitWithTail(loops[1], 8);
    loops = nest.getLoopStmtsFor(b);
    nest.reorderAxis(loops[0], loops[1]);
  }
  nest.prepareForCodegen();
  nest.simplify();
  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
  for (auto _ : state) {
    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
  }
 }
 BENCHMARK_REGISTER_F(Reduce2DCol, OpSchedule)
    ->Apply( // CustomArgs);
        [](benchmark::internal::Benchmark* b) {
          for (auto sch : {0, 1, 2, 3}) {
            for (auto rows : {3, 6, 12}) {
              auto cols = 24 - rows;
              b->Args({1 << rows, 1 << cols, sch});
            }
          }
        });
 class Reduce2DRow : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(1);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    N = state.range(1);
    A = torch::randn({M, N});
    ref = torch::sum(A, {1});
    B = torch::zeros_like(ref);
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-4));
    state.counters["BYTES"] = benchmark::Counter(
        uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
        benchmark::Counter::kIsRate);
  }
  int M;
  int N;
  at::Tensor A;
  at::Tensor B;
  at::Tensor ref;
 };
 BENCHMARK_DEFINE_F(Reduce2DRow, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    B = torch::sum(A, {1});
  }
 }
 BENCHMARK_REGISTER_F(Reduce2DRow, Torch)
    ->Args({1 << 3, 1 << 21})
    ->Args({1 << 6, 1 << 18})
    ->Args({1 << 12, 1 << 12})
    ->Args({1 << 18, 1 << 6});
 BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) {
  auto a = A.data_ptr<float>();
  auto b = B.data_ptr<float>();
  constexpr int Mb = 4;
  constexpr int Nb = 4;
  auto fn = [&] {
    for (int m_outer = 0; m_outer < M; m_outer += Mb) {
      float bregs[Mb][Nb] = {0.0f};
      for (int n_outer = 0; n_outer < N; n_outer += Nb) {
        for (const auto m_inner : c10::irange(Mb)) {
          for (const auto n_inner : c10::irange(Nb)) {
            bregs[m_inner][n_inner] +=
                a[(m_outer + m_inner) * N + n_outer + n_inner];
          }
        }
      }
      for (const auto m_inner : c10::irange(Mb)) {
        b[m_outer + m_inner] = 0.f;
        for (const auto n_inner : c10::irange(Nb)) {
          b[m_outer + m_inner] += bregs[m_inner][n_inner];
        }
      }
    }
  };
  for (auto _ : state) {
    fn();
  }
 }
 BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6});
 BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
  constexpr int kChunkSize = 8;
  te::BufHandle a("A", {M, N}, te::kFloat);
  te::Tensor b = te::computeSum(
      {a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU);
  te::LoopNest nest({b});
  auto sch = state.range(2);
  if (sch == 1) {
    auto loops = nest.getLoopStmtsFor(b);
    te::ForPtr mi, mo;
    te::BufPtr rf;
    nest.splitWithMask(loops[1], kChunkSize, &mi);
    loops = nest.reorder({loops[1], mi}, {1, 0});
    TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
    nest.reorderAxis(loops[0], loops[1]);
    for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
      nest.vectorize(loop);
    }
  } else if (sch == 2) {
    auto loops = nest.getLoopStmtsFor(b);
    nest.splitWithMask(loops[1], 8);
    nest.splitWithMask(loops[0], 4);
    loops = nest.getLoopStmtsFor(b);
    nest.reorderAxis(loops[1], loops[2]);
  } else if (sch == 3) {
    auto loops = nest.getLoopStmtsFor(b);
    te::ForPtr mi, mo;
    te::BufPtr rf;
    nest.splitWithMask(loops[1], kChunkSize, &mi);
    loops = nest.reorder({loops[1], mi}, {1, 0});
    TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
    nest.reorderAxis(loops[0], loops[1]);
    te::LoopNest::compressBuffer(rf, nest.root_stmt());
    for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
      nest.vectorize(loop);
    }
  }
  nest.prepareForCodegen();
  nest.simplify();
  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
  for (auto _ : state) {
    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
  }
 }
 BENCHMARK_REGISTER_F(Reduce2DRow, OpSchedule)
    ->Apply( // CustomArgs);
        [](benchmark::internal::Benchmark* b) {
          for (auto sch : {0, 1, 2, 3}) {
            for (auto rows : {3, 6, 12, 18}) {
              auto cols = 24 - rows;
              b->Args({1 << rows, 1 << cols, sch});
            }
          }
        });
--- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
@ -1,166 +0,0 @@
 #include <benchmark/benchmark.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/operators/operators.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/torch.h>
 using namespace torch::jit::tensorexpr;
 namespace {
 class SignedLog1pBench : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    input_size_ = {state.range(0), state.range(1)};
    input_size_int_ = {state.range(0), state.range(1)};
    input_ = torch::rand(input_size_);
    ref_ = signedLog1p(input_);
  }
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
    state.counters["GB/s"] = benchmark::Counter(
        uint64_t(state.iterations()) * 2 * output_.nbytes(),
        benchmark::Counter::kIsRate);
  }
  at::Tensor signedLog1p(const at::Tensor& inp) {
    auto sign = at::sign(inp);
    auto log1p = at::log1p(at::abs(inp));
    return sign * log1p;
  }
  void runATen(benchmark::State& state) {
    for (auto _ : state) {
      output_ = signedLog1p(input_);
    }
  }
  void runNNC(benchmark::State& state) {
    BufHandle input_ph(
        "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
    Tensor abs_result = Compute(
        "aten_abs",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return abs(input_ph.load(m, n));
        });
    Tensor log1p_result = Compute(
        "aten_log1p",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return log1p(abs_result.load(m, n));
        });
    Tensor sign_result =
        computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
    Tensor output = Compute(
        "aten_mul",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return sign_result.load(m, n) * log1p_result.load(m, n);
        });
    LoopNest nest({output}, {abs_result, log1p_result, sign_result, output});
    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
    nest.inlineIntermediateBufs(true);
    nest.prepareForCodegen();
    nest.simplify();
    nest.vectorizeInnerLoops();
    nest.simplify();
    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args;
    buf_args.emplace_back(input_ph);
    buf_args.emplace_back(output);
    LLVMCodeGen cg(nest.root_stmt(), buf_args);
    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      call_args.emplace_back(input_.data_ptr<float>());
      call_args.emplace_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }
  void runNNCLogVml(benchmark::State& state) {
    BufHandle input_ph(
        "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
    Tensor abs_result = Compute(
        "aten_abs",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return abs(input_ph.load(m, n));
        });
    Tensor log_vml_result = Compute(
        "aten_log1p",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return log_vml(abs_result.load(m, n) + ExprHandle(1));
        });
    Tensor sign_result =
        computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
    Tensor output = Compute(
        "aten_mul",
        {input_size_int_[0], input_size_int_[1]},
        [&](const VarHandle& m, const VarHandle& n) {
          return sign_result.load(m, n) * log_vml_result.load(m, n);
        });
    LoopNest nest({output}, {abs_result, log_vml_result, sign_result, output});
    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
    nest.inlineIntermediateBufs(true);
    nest.prepareForCodegen();
    nest.simplify();
    nest.vectorizeInnerLoops();
    nest.simplify();
    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args;
    buf_args.emplace_back(input_ph);
    buf_args.emplace_back(output);
    LLVMCodeGen cg(nest.root_stmt(), buf_args);
    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      call_args.emplace_back(input_.data_ptr<float>());
      call_args.emplace_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }
 private:
  std::vector<long> input_size_;
  std::vector<int> input_size_int_;
  at::Tensor input_;
  at::Tensor output_;
  at::Tensor ref_;
 };
 } // namespace
 BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
  runATen(state);
 }
 BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
  runNNC(state);
 }
 BENCHMARK_DEFINE_F(SignedLog1pBench, NNCLogVml)(benchmark::State& state) {
  runNNCLogVml(state);
 }
 BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
 BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
 BENCHMARK_REGISTER_F(SignedLog1pBench, NNCLogVml)->Args({10, 1467});
--- a/benchmarks/cpp/tensorexpr/main.cpp
+++ b/benchmarks/cpp/tensorexpr/main.cpp
@ -1,3 +0,0 @@
 #include <benchmark/benchmark.h>
 BENCHMARK_MAIN();
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1660,14 +1660,6 @@ if(BUILD_STATIC_RUNTIME_BENCHMARK)
  target_link_libraries(static_runtime_test torch_library gtest_main)
 endif()
 if(BUILD_TENSOREXPR_BENCHMARK)
  add_subdirectory(${TORCH_ROOT}/benchmarks/cpp/tensorexpr ${CMAKE_BINARY_DIR}/tensorexpr_bench)
 endif()
 if(BUILD_CPP_BENCHMARKS)
  add_subdirectory(${TORCH_ROOT}/benchmarks/cpp ${PROJECT_BINARY_DIR}/bin)
 endif()
 if(BUILD_MOBILE_BENCHMARK)
  foreach(benchmark_src ${ATen_MOBILE_BENCHMARK_SRCS})
    get_filename_component(benchmark_name ${benchmark_src} NAME_WE)
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@ -26,7 +26,6 @@ function(caffe2_print_configuration_summary)
  message(STATUS "  BUILD_CAFFE2          : ${BUILD_CAFFE2}")
  message(STATUS "  BUILD_CAFFE2_OPS      : ${BUILD_CAFFE2_OPS}")
  message(STATUS "  BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
  message(STATUS "  BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}")
  message(STATUS "  BUILD_BINARY          : ${BUILD_BINARY}")
  message(STATUS "  BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
  if(${CAFFE2_LINK_LOCAL_PROTOBUF})
		`@ -1,2 +0,0 @@`
			`add_executable(convolution_bench convolution.cpp)`
			`target_link_libraries(convolution_bench PRIVATE torch_library benchmark)`
		`@ -1,3 +0,0 @@`
			`#include <benchmark/benchmark.h>`

			`BENCHMARK_MAIN();`