From 521dbbfaffcc45a907b0a8d816f4af7d588ffcb5 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Fri, 5 Jan 2024 21:23:30 +0000
Subject: [PATCH] Remove cpp/tensorexpr benchmarks (#116868)

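Summary: The tensorexpr benchmarks target a deprecated TorchScript backend that is no longer built in releases, and they require LLVM in order to build. This change also deletes benchmarks/cpp/CMakeLists.txt and benchmarks/cpp/convolution.cpp.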

Test Plan:
```
python setup.py develop
```

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116868
Approved by: https://github.com/hl475, https://github.com/chenyang78, https://github.com/eellison, https://github.com/mikekgfb
---
 CMakeLists.txt                                |   1 -
 benchmarks/cpp/CMakeLists.txt                 |   2 -
 benchmarks/cpp/convolution.cpp                | 313 ---------
 benchmarks/cpp/tensorexpr/CMakeLists.txt      |  24 -
 benchmarks/cpp/tensorexpr/bench_approx.cpp    | 433 ------------
 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 216 ------
 benchmarks/cpp/tensorexpr/bench_compile.cpp   |  71 --
 benchmarks/cpp/tensorexpr/bench_concat.cpp    | 293 ---------
 .../cpp/tensorexpr/bench_fuser_overhead.cpp   |  59 --
 benchmarks/cpp/tensorexpr/bench_gemm.cpp      | 313 ---------
 benchmarks/cpp/tensorexpr/bench_kernels.cpp   | 101 ---
 benchmarks/cpp/tensorexpr/bench_ops.py        | 113 ----
 benchmarks/cpp/tensorexpr/bench_parallel.cpp  |  71 --
 .../cpp/tensorexpr/bench_prefix_sum.cpp       | 395 -----------
 benchmarks/cpp/tensorexpr/bench_reduce.cpp    | 621 ------------------
 .../cpp/tensorexpr/bench_signed_log1p.cpp     | 166 -----
 benchmarks/cpp/tensorexpr/main.cpp            |   3 -
 caffe2/CMakeLists.txt                         |   8 -
 cmake/Summary.cmake                           |   1 -
 19 files changed, 3204 deletions(-)
 delete mode 100644 benchmarks/cpp/CMakeLists.txt
 delete mode 100644 benchmarks/cpp/convolution.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/CMakeLists.txt
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_approx.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_compile.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_concat.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_gemm.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_kernels.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_ops.py
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_parallel.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_reduce.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
 delete mode 100644 benchmarks/cpp/tensorexpr/main.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae21c51296dc..0be84cd241ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,7 +187,6 @@ cmake_dependent_option(
 option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
 option(BUILD_AOT_INDUCTOR_TEST "Build C++ test binaries for aot-inductor" OFF)
 option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
-option(BUILD_TENSOREXPR_BENCHMARK "Build C++ binaries for tensorexpr benchmarks (need gbenchmark)" OFF)
 option(BUILD_MOBILE_BENCHMARK "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
 option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
 option(BUILD_JNI "Build JNI bindings" OFF)
diff --git a/benchmarks/cpp/CMakeLists.txt b/benchmarks/cpp/CMakeLists.txt
deleted file mode 100644
index d4a6cdec54da..000000000000
--- a/benchmarks/cpp/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_executable(convolution_bench convolution.cpp)
-target_link_libraries(convolution_bench PRIVATE torch_library benchmark)
diff --git a/benchmarks/cpp/convolution.cpp b/benchmarks/cpp/convolution.cpp
deleted file mode 100644
index 998bad2e46b1..000000000000
--- a/benchmarks/cpp/convolution.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-#include <ATen/ATen.h>
-#include <ATen/native/mkldnn/MKLDNNCommon.h>
-#include <benchmark/benchmark.h>
-#include <c10/core/InferenceMode.h>
-#include <sstream>
-
-struct ConvParams {
-  std::vector<int64_t> input;
-  std::vector<int64_t> weight;
-  std::vector<int64_t> bias;
-  std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
-  std::vector<int64_t> dilation;
-  int64_t groups;
-};
-
-struct xs {
-  explicit xs(const std::vector<int64_t>& v_) : v(v_) {}
-  const std::vector<int64_t>& v;
-};
-
-std::ostream& operator<<(std::ostream& os, const xs& x) {
-  bool first = true;
-  for (auto const& xx : x.v) {
-    if (!first) {
-      os << "x";
-    }
-    first = false;
-    os << xx;
-  }
-  return os;
-}
-
-std::ostream& operator<<(std::ostream& os, const ConvParams& params) {
-  os << "I" << xs(params.input) << "_W" << xs(params.weight) << "_B"
-     << xs(params.bias) << "_S" << xs(params.stride) << "_P"
-     << xs(params.padding) << "_D" << xs(params.dilation) << "_G"
-     << params.groups;
-  return os;
-}
-
-std::vector<ConvParams> MobileNetV3Params = {
-    {{1, 3, 224, 224}, {16, 3, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 16, 112, 112}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 16, 112, 112}, {16, 1, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 16},
-    {{1, 16, 56, 56}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 16, 56, 56}, {72, 16, 1, 1}, {72}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 72, 56, 56}, {72, 1, 3, 3}, {72}, {2, 2}, {1, 1}, {1, 1}, 72},
-    {{1, 72, 28, 28}, {24, 72, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 24, 28, 28}, {88, 24, 1, 1}, {88}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 88, 28, 28}, {88, 1, 3, 3}, {88}, {1, 1}, {1, 1}, {1, 1}, 88},
-    {{1, 88, 28, 28}, {24, 88, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 24, 28, 28}, {96, 24, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 96, 28, 28}, {96, 1, 5, 5}, {96}, {2, 2}, {2, 2}, {1, 1}, 96},
-    {{1, 96, 14, 14}, {40, 96, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
-    {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
-    {{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 40, 14, 14}, {120, 40, 1, 1}, {120}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 120, 14, 14}, {120, 1, 5, 5}, {120}, {1, 1}, {2, 2}, {1, 1}, 120},
-    {{1, 120, 14, 14}, {48, 120, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 48, 14, 14}, {144, 48, 1, 1}, {144}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 144, 14, 14}, {144, 1, 5, 5}, {144}, {1, 1}, {2, 2}, {1, 1}, 144},
-    {{1, 144, 14, 14}, {48, 144, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 48, 14, 14}, {288, 48, 1, 1}, {288}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 288, 14, 14}, {288, 1, 5, 5}, {288}, {2, 2}, {2, 2}, {1, 1}, 288},
-    {{1, 288, 7, 7}, {96, 288, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
-    {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
-    {{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 576, 1, 1}, {1280, 576, 1, 1}, {1280}, {1, 1}, {0, 0}, {1, 1}, 1},
-};
-
-std::vector<ConvParams> ResNet18Params = {
-    {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {128, 64, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {128, 64, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {256, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {256, 128, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {512, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-};
-
-std::vector<ConvParams> ResNet50Params = {
-    {{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 56, 56}, {128, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 128, 56, 56}, {128, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 56, 56}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 28, 28}, {256, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 28, 28}, {256, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 28, 28}, {1024, 512, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {512, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 14, 14}, {512, 512, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 1024, 14, 14}, {2048, 1024, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
-    {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
-    {{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
-};
-
-struct EnableMklDnn {
-  explicit EnableMklDnn(bool enable)
-      : prev_(at::globalContext().userEnabledMkldnn()) {
-    at::globalContext().setUserEnabledMkldnn(enable);
-  }
-
-  ~EnableMklDnn() {
-    at::globalContext().setUserEnabledMkldnn(prev_);
-  }
-
-  bool prev_;
-};
-
-template <bool WithMklDnn>
-static void BM_conv2d_native(
-    benchmark::State& state,
-    const ConvParams& params) {
-  EnableMklDnn mkl(WithMklDnn);
-  auto input = at::randn(params.input);
-  auto weight = at::randn(params.weight);
-  auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
-  auto output = at::conv2d(
-      input,
-      weight,
-      bias,
-      params.stride,
-      params.padding,
-      params.dilation,
-      params.groups);
-  for (auto _ : state) {
-    output = at::conv2d(
-        input,
-        weight,
-        bias,
-        params.stride,
-        params.padding,
-        params.dilation,
-        params.groups);
-  }
-  state.counters["GFLOPS/s"] = benchmark::Counter(
-      2.0f * output.numel() * weight.numel() / weight.size(0) *
-          state.iterations(),
-      benchmark::Counter::kIsRate);
-  state.counters["GB/s"] = benchmark::Counter(
-      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-      state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
-      benchmark::Counter::kIsRate);
-}
-
-enum MklDnnReorder {
-  None,
-  WeightOnly,
-  WeightAndInput,
-};
-
-template <MklDnnReorder Reorder>
-static void BM_conv2d_mkldnn(
-    benchmark::State& state,
-    const ConvParams& params) {
-  auto input = at::randn(params.input);
-  auto weight = at::randn(params.weight);
-  auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
-
-  if (Reorder == WeightAndInput) {
-    auto it_input = at::native::itensor_from_mkldnn(input.to_mkldnn());
-    auto r = ideep::tensor(
-        params.input, ideep::data_type::f32, ideep::format_tag::aBcd16b);
-    it_input.reorder_to(r);
-    input = at::native::new_with_itensor_mkldnn(
-        std::move(r), at::kFloat, at::Device(at::kCPU));
-  }
-
-  if (Reorder == WeightOnly || Reorder == WeightAndInput) {
-    weight = at::mkldnn_reorder_conv2d_weight(
-        weight.to_mkldnn(),
-        params.padding,
-        params.stride,
-        params.dilation,
-        params.groups);
-
-    bias = params.bias.size() > 0 ? bias.to_mkldnn() : bias;
-  }
-
-  auto output = at::mkldnn_convolution(
-      input,
-      weight,
-      bias,
-      params.padding,
-      params.stride,
-      params.dilation,
-      params.groups);
-  for (auto _ : state) {
-    output = at::mkldnn_convolution(
-        input,
-        weight,
-        bias,
-        params.padding,
-        params.stride,
-        params.dilation,
-        params.groups);
-  }
-  state.counters["GFLOPS/s"] = benchmark::Counter(
-      2.0f * output.numel() * weight.numel() / weight.size(0) *
-          state.iterations(),
-      benchmark::Counter::kIsRate);
-  state.counters["GB/s"] = benchmark::Counter(
-      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-      state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
-      benchmark::Counter::kIsRate);
-}
-
-std::string name(
-    const char* base,
-    const char* suffix,
-    const ConvParams& params) {
-  std::ostringstream os;
-  os << base << "_" << suffix << "_" << params;
-  return os.str();
-}
-
-void registerOne(const char* base, const ConvParams& params) {
-  benchmark::RegisterBenchmark(
-      name(base, "native", params).data(), BM_conv2d_native<true>, params);
-  benchmark::RegisterBenchmark(
-      name(base, "native_nomkl", params).data(),
-      BM_conv2d_native<false>,
-      params);
-  benchmark::RegisterBenchmark(
-      name(base, "mkldnn_none", params).data(), BM_conv2d_mkldnn<None>, params);
-  benchmark::RegisterBenchmark(
-      name(base, "mkldnn_weight", params).data(),
-      BM_conv2d_mkldnn<WeightOnly>,
-      params);
-  benchmark::RegisterBenchmark(
-      name(base, "mkldnn_input", params).data(),
-      BM_conv2d_mkldnn<WeightAndInput>,
-      params);
-}
-
-int main(int argc, char** argv) {
-  c10::InferenceMode guard;
-
-#define BENCH(x)                         \
-  for (auto const& params : x##Params) { \
-    registerOne(#x, params);             \
-  }
-  BENCH(MobileNetV3);
-  BENCH(ResNet18);
-  BENCH(ResNet50);
-#undef BENCH
-
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-}
diff --git a/benchmarks/cpp/tensorexpr/CMakeLists.txt b/benchmarks/cpp/tensorexpr/CMakeLists.txt
deleted file mode 100644
index fbe657bdf2ee..000000000000
--- a/benchmarks/cpp/tensorexpr/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-find_package(AVX)
-
-add_executable(
-  tensorexpr_bench
-  bench_approx.cpp
-  bench_batchnorm.cpp
-  bench_concat.cpp
-  bench_compile.cpp
-  bench_signed_log1p.cpp
-  bench_fuser_overhead.cpp
-  bench_gemm.cpp
-  bench_kernels.cpp
-  bench_parallel.cpp
-  bench_prefix_sum.cpp
-  bench_reduce.cpp
-  main.cpp)
-
-if(C_AVX2_FOUND)
-  message(STATUS "AVX2 compiler support found")
-  target_compile_options(tensorexpr_bench PUBLIC -mavx2)
-  target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
-endif()
-
-target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)
diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp
deleted file mode 100644
index e3276abf99b8..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-#include "caffe2/operators/logit_op.h"
-#include "caffe2/operators/tanh_op.h"
-
-using namespace torch::jit;
-using namespace torch::jit::tensorexpr;
-
-void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) {
-  auto loops = ln->getLoopStmtsFor(target);
-  ForPtr inner, tail;
-  ln->splitWithTail(loops[0], width, &inner, &tail);
-  ln->vectorize(inner);
-}
-
-void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
-  std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
-  ForPtr inner, tail;
-  ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
-  ForPtr outer = loops[0];
-  ln->vectorize(inner);
-  ln->splitWithTail(outer, 8, &inner, &tail);
-  StmtPtr unrolled;
-  LoopNest::fullUnroll(inner, &unrolled);
-}
-
-static void relu_nnc(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  auto clamp = 0;
-  torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto A_elem = [&]() {
-      auto elem = A.load(i);
-      auto min = FloatImm::make(clamp);
-      return CompareSelect::make(elem, min, min, elem, kLT);
-    }();
-    return A_elem;
-  });
-  LoopNest ln({B});
-  optimizePointwise(&ln, B);
-  ln.prepareForCodegen();
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::randn({state.range(0)});
-  at::Tensor B_t = torch::randn(state.range(0));
-  auto B_ref = at::relu(A_t);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void log_nnc_sleef(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  torch::jit::tensorexpr::Tensor B =
-      Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  vectorize(&ln, B, 8);
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::log(A_t);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void log_nnc_fast(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  torch::jit::tensorexpr::Tensor B = Compute(
-      "B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); });
-  LoopNest ln({B});
-  optimizePointwise(&ln, B);
-  ln.prepareForCodegen();
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::log(A_t);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void log_nnc_vml(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  torch::jit::tensorexpr::Tensor B =
-      Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); });
-  LoopNest ln({B});
-  vectorize(&ln, B, 8);
-  ln.prepareForCodegen();
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::log(A_t);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void log_aten(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  for (auto _ : state) {
-    at::log_out(B_t, A_t);
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void logit_nnc_sleef(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  auto clamp = 1e-6f;
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto A_elem = [&]() {
-      auto elem = A.load(i);
-      auto min = FloatImm::make(clamp);
-      auto max = FloatImm::make(1.0f - clamp);
-      elem = CompareSelect::make(elem, min, min, elem, kLT);
-      return CompareSelect::make(elem, max, max, elem, kGT);
-    }();
-    return log(A_elem / (FloatImm::make(1.0f) - A_elem));
-  });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  optimizePointwise(&ln, B);
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::logit(A_t, clamp);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void logit_nnc_fast(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  auto clamp = 1e-6f;
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto A_elem = [&]() {
-      auto elem = A.load(i);
-      auto min = FloatImm::make(clamp);
-      auto max = FloatImm::make(1.0f - clamp);
-      elem = CompareSelect::make(elem, min, min, elem, kLT);
-      return CompareSelect::make(elem, max, max, elem, kGT);
-    }();
-    return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem));
-  });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  optimizePointwise(&ln, B);
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::logit(A_t, clamp);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void logit_nnc_vml(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  auto clamp = 1e-6f;
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto A_elem = [&]() {
-      auto elem = A.load(i);
-      auto min = FloatImm::make(clamp);
-      auto max = FloatImm::make(1.0f - clamp);
-      elem = CompareSelect::make(elem, min, min, elem, kLT);
-      return CompareSelect::make(elem, max, max, elem, kGT);
-    }();
-    return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
-  });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  vectorize(&ln, B, 16);
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::logit(A_t, clamp);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void logit_aten(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto clamp = 1e-6f;
-  for (auto _ : state) {
-    at::native::logit_out(A_t, clamp, B_t);
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-template <typename T>
-void logit_caffe2_impl(int size, const T* X, T* Y, float eps_ = 1e-6f) {
-  using namespace caffe2;
-  ConstEigenVectorMap<T> X_vec(X, size);
-  EigenVectorMap<T> Y_vec(Y, size);
-  Y_vec = X_vec.array().min(static_cast<T>(1.0f - eps_));
-  Y_vec = Y_vec.array().max(eps_);
-  Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log();
-}
-
-static void logit_caffe2(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  at::Tensor B_ref = torch::randn({state.range(0)});
-  auto N = state.range(0);
-  auto X = A_t.data_ptr<float>();
-  auto Y = B_t.data_ptr<float>();
-  auto clamp = 1e-6f;
-  at::native::logit_out(A_t, clamp, B_ref);
-  logit_caffe2_impl(N, X, Y, clamp);
-  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
-
-  for (auto _ : state) {
-    logit_caffe2_impl(N, X, Y, clamp);
-  }
-
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void tanh_nnc_fast(benchmark::State& state) {
-  auto N = VarHandle("N", kInt);
-  BufHandle A("A", {N}, kFloat);
-  torch::jit::tensorexpr::Tensor B = Compute(
-      "B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); });
-  LoopNest ln({B});
-  optimizePointwise(&ln, B);
-  ln.prepareForCodegen();
-  StmtPtr s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::tanh(A_t);
-  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  TORCH_CHECK(at::allclose(B_t, B_ref, 1e-3f, 1e-6f));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
-  }
-  state.counters["tanh/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void tanh_aten(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  for (auto _ : state) {
-    at::tanh_out(A_t, B_t);
-  }
-  state.counters["tanh/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-}
-
-static void tanh_caffe2(benchmark::State& state) {
-#ifdef FBCODE_CAFFE2
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  at::Tensor B_ref = torch::randn({state.range(0)});
-
-  auto N = state.range(0);
-  auto X = A_t.data_ptr<float>();
-  auto Y = B_t.data_ptr<float>();
-  caffe2::CPUContext c;
-  auto tanh = caffe2::TanhFunctor<caffe2::CPUContext>();
-  at::tanh_out(A_t, B_ref);
-  tanh(N, X, Y, &c);
-  TORCH_CHECK(at::native::allclose(B_t, B_ref, 1e-3f, 1e-6f));
-
-  for (auto _ : state) {
-    tanh(N, X, Y, &c);
-  }
-  state.counters["tanh/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()),
-      benchmark::Counter::kIsRate);
-#endif
-}
-
-BENCHMARK(relu_nnc)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
-    {2 << 14});
-BENCHMARK(log_nnc_sleef)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(log_nnc_fast)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(log_nnc_vml)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(log_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
-    {2 << 14});
-BENCHMARK(logit_nnc_sleef)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(logit_nnc_fast)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(logit_nnc_vml)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(logit_aten)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(logit_caffe2)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(tanh_nnc_fast)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
-BENCHMARK(tanh_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
-    {2 << 14});
-BENCHMARK(tanh_caffe2)
-    ->Args({2 << 5})
-    ->Args({2 << 8})
-    ->Args({2 << 12})
-    ->Args({2 << 14});
diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
deleted file mode 100644
index 77e86020f28a..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-using namespace torch::jit::tensorexpr;
-
-namespace {
-class BatchNorm : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    N_ = state.range(0);
-    C_ = state.range(1);
-    H_ = state.range(2);
-    W_ = state.range(3);
-    input_ = torch::ones({N_, C_, H_, W_});
-    weight_ = torch::ones({C_});
-    bias_ = torch::ones({C_});
-    mean_ = torch::ones({C_}) * 0.5f;
-    var_ = torch::ones({C_}) * 0.1f;
-    ref_ = at::batch_norm(
-        input_,
-        weight_,
-        bias_,
-        mean_,
-        var_,
-        training_,
-        momentum_,
-        eps_,
-        cudnn_enabled_);
-    output_ = at::empty_like(ref_);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(ref_, output_));
-    state.counters["GB/s"] = benchmark::Counter(
-        uint64_t(state.iterations()) * (input_.nbytes() + ref_.nbytes()),
-        benchmark::Counter::kIsRate);
-  }
-
-  int N_;
-  int C_;
-  int H_;
-  int W_;
-  at::Tensor input_;
-  at::Tensor weight_;
-  at::Tensor bias_;
-  at::Tensor mean_;
-  at::Tensor var_;
-  at::Tensor output_;
-  at::Tensor ref_;
-  bool training_{false};
-  float momentum_{0.1};
-  float eps_{1.0e-5f};
-  bool cudnn_enabled_{false};
-};
-} // namespace
-
-BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
-  for (auto _ : state) {
-    output_ = at::batch_norm(
-        input_,
-        weight_,
-        bias_,
-        mean_,
-        var_,
-        training_,
-        momentum_,
-        eps_,
-        cudnn_enabled_);
-  }
-}
-
-BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
-  BufHandle input("input", {N_, C_, H_, W_}, kFloat);
-  BufHandle weight("weight", {C_}, kFloat);
-  BufHandle bias("bias", {C_}, kFloat);
-  BufHandle mean("mean", {C_}, kFloat);
-  BufHandle var("var", {C_}, kFloat);
-  VarHandle eps("eps", kFloat);
-
-  using axis = const VarHandle&;
-  Tensor output =
-      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
-        // Compute affine terms.
-        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
-        auto weight_v = weight.load(c);
-        auto bias_v = bias.load(c);
-        auto alpha = inv_var * weight_v;
-        auto beta = bias_v - mean.load(c) * alpha;
-
-        return input.load(n, c, h, w) * alpha + beta;
-      });
-  LoopNest nest({output});
-  auto loops = nest.getLoopStmtsFor(output);
-  LoopNest::flatten({loops[2], loops[3]});
-  loops = nest.getLoopStmtsFor(output);
-  LoopNest::flatten({loops[0], loops[1]});
-  loops = nest.getLoopStmtsFor(output);
-  loops[0]->set_parallel();
-  nest.prepareForCodegen();
-  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
-
-  std::vector<CodeGen::CallArg> args;
-  for (auto _ : state) {
-    args.clear();
-    output_ = at::empty_like(input_);
-    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
-      args.push_back(t.data_ptr<float>());
-    }
-    args.push_back(eps_);
-    cg.call(args);
-  }
-}
-
-BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
-  for (auto _ : state) {
-    output_ = at::batch_norm(
-        input_,
-        weight_,
-        bias_,
-        mean_,
-        var_,
-        training_,
-        momentum_,
-        eps_,
-        cudnn_enabled_);
-    output_.relu_();
-  }
-}
-
-BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
-  BufHandle input("input", {N_, C_, H_, W_}, kFloat);
-  BufHandle weight("weight", {C_}, kFloat);
-  BufHandle bias("bias", {C_}, kFloat);
-  BufHandle mean("mean", {C_}, kFloat);
-  BufHandle var("var", {C_}, kFloat);
-  VarHandle eps("eps", kFloat);
-
-  using axis = const VarHandle&;
-  Tensor output =
-      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
-        // Compute affine terms.
-        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
-        auto weight_v = weight.load(c);
-        auto bias_v = bias.load(c);
-        auto alpha = inv_var * weight_v;
-        auto beta = bias_v - mean.load(c) * alpha;
-
-        auto bn = input.load(n, c, h, w) * alpha + beta;
-        return CompareSelect::make(bn, 0.f, 0.f, bn, kLT);
-      });
-  LoopNest nest({output});
-  nest.prepareForCodegen();
-  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
-
-  std::vector<CodeGen::CallArg> args;
-  for (auto _ : state) {
-    args.clear();
-    output_ = at::empty_like(input_);
-    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
-      args.push_back(t.data_ptr<float>());
-    }
-    args.push_back(eps_);
-    cg.call(args);
-  }
-}
-
-BENCHMARK_REGISTER_F(BatchNorm, ATen)
-    ->Args({1, 64, 112, 112})
-    ->Args({1, 256, 14, 14})
-    ->Args({1, 128, 28, 28})
-    ->Args({1, 64, 56, 56})
-    ->Args({1, 512, 7, 7})
-    ->Args({5, 64, 112, 112})
-    ->Args({5, 256, 14, 14})
-    ->Args({5, 128, 28, 28})
-    ->Args({5, 64, 56, 56})
-    ->Args({5, 512, 7, 7});
-BENCHMARK_REGISTER_F(BatchNorm, NNC)
-    ->Args({1, 64, 112, 112})
-    ->Args({1, 256, 14, 14})
-    ->Args({1, 128, 28, 28})
-    ->Args({1, 64, 56, 56})
-    ->Args({1, 512, 7, 7})
-    ->Args({5, 64, 112, 112})
-    ->Args({5, 256, 14, 14})
-    ->Args({5, 128, 28, 28})
-    ->Args({5, 64, 56, 56})
-    ->Args({5, 512, 7, 7});
-BENCHMARK_REGISTER_F(BatchNorm, ATenRelu)
-    ->Args({1, 64, 112, 112})
-    ->Args({1, 256, 14, 14})
-    ->Args({1, 128, 28, 28})
-    ->Args({1, 64, 56, 56})
-    ->Args({1, 512, 7, 7})
-    ->Args({5, 64, 112, 112})
-    ->Args({5, 256, 14, 14})
-    ->Args({5, 128, 28, 28})
-    ->Args({5, 64, 56, 56})
-    ->Args({5, 512, 7, 7});
-BENCHMARK_REGISTER_F(BatchNorm, NNCRelu)
-    ->Args({1, 64, 112, 112})
-    ->Args({1, 256, 14, 14})
-    ->Args({1, 128, 28, 28})
-    ->Args({1, 64, 56, 56})
-    ->Args({1, 512, 7, 7})
-    ->Args({5, 64, 112, 112})
-    ->Args({5, 256, 14, 14})
-    ->Args({5, 128, 28, 28})
-    ->Args({5, 64, 56, 56})
-    ->Args({5, 512, 7, 7});
diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp
deleted file mode 100644
index be60f9cd599b..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-
-#ifdef TORCH_ENABLE_LLVM
-namespace te = torch::jit::tensorexpr;
-
-static void BM_CompileSwish(benchmark::State& state) {
-  for (auto _ : state) {
-    constexpr int N = 512;
-    te::VarHandle n("n", te::kInt);
-    te::BufHandle A("A", {N}, te::kFloat);
-    te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
-      return te::Max::make(A.load(i), 0.f, false);
-    });
-    te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
-      return te::Min::make(relu.load(i), 6.f, false);
-    });
-    te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) {
-      return min6.load(i) + 3.f;
-    });
-    te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
-      return A.load(i) * plus3.load(i);
-    });
-    te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
-      return times.load(i) * 1.f / 6.f;
-    });
-    te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
-    for (auto tensor : {relu, min6, plus3, times}) {
-      nest.computeInline(tensor.buf());
-    }
-    nest.prepareForCodegen();
-    te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
-    te::LLVMCodeGen cg(s, {A, sixth, n});
-  }
-}
-
-static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
-  constexpr int N = 512;
-  te::VarHandle n("n", te::kInt);
-  te::BufHandle A("A", {N}, te::kFloat);
-  te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
-    return te::Max::make(A.load(i), 0.f, false);
-  });
-  te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
-    return te::Min::make(relu.load(i), 6.f, false);
-  });
-  te::Tensor plus3 = te::Compute(
-      "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; });
-  te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
-    return A.load(i) * plus3.load(i);
-  });
-  te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
-    return times.load(i) * 1.f / 6.f;
-  });
-  te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
-  for (auto tensor : {relu, min6, plus3, times}) {
-    nest.computeInline(tensor.buf());
-  }
-  nest.prepareForCodegen();
-  te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
-  for (auto _ : state) {
-    te::LLVMCodeGen cg(s, {A, sixth, n});
-  }
-}
-
-BENCHMARK(BM_CompileSwish);
-BENCHMARK(BM_CompileSwishLLVMOnly);
-#endif // TORCH_ENABLE_LLVM
diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp
deleted file mode 100644
index b7b97d02e3a8..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <c10/util/irange.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-using namespace torch::jit::tensorexpr;
-
-namespace {
-
-class ConcatBench : public benchmark::Fixture {
- public:
-  void init(const std::vector<std::vector<int>> input_sizes, int concat_dim) {
-    input_sizes_ = std::move(input_sizes);
-    concat_dim_ = concat_dim;
-    inputs_.resize(input_sizes_.size());
-    for (const auto i : c10::irange(input_sizes_.size())) {
-      inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
-    }
-    output_size_.resize(input_sizes_.front().size());
-    for (const auto i : c10::irange(output_size_.size())) {
-      if (i == static_cast<size_t>(concat_dim_)) {
-        output_size_[i] = 0;
-        for (const auto j : c10::irange(input_sizes_.size())) {
-          output_size_[i] += input_sizes_[j][i];
-        }
-      } else {
-        output_size_[i] = input_sizes_.front()[i];
-      }
-    }
-    ref_ = at::cat(inputs_, concat_dim_);
-    output_ = at::empty_like(ref_);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(ref_, output_));
-    state.counters["GB/s"] = benchmark::Counter(
-        uint64_t(state.iterations()) * 2 * output_.nbytes(),
-        benchmark::Counter::kIsRate);
-  }
-
-  void runATen(benchmark::State& state) {
-    for (auto _ : state) {
-      output_ = at::cat(inputs_, concat_dim_);
-    }
-  }
-
-  void runNNC(benchmark::State& state) {
-    size_t num_inputs = inputs_.size();
-    size_t num_dims = 2;
-
-    std::vector<BufHandle> inputs;
-    for (size_t i = 0; i < num_inputs; ++i) {
-      inputs.emplace_back(BufHandle(
-          "input" + std::to_string(i),
-          {input_sizes_[i][0], input_sizes_[i][1]},
-          kFloat));
-    }
-
-    Tensor output = Compute(
-        "aten_cat",
-        {output_size_[0], output_size_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          int d = 0;
-          std::vector<int> cumulative_concat_dim_sizes(num_inputs);
-          for (const auto i : c10::irange(num_inputs)) {
-            cumulative_concat_dim_sizes[i] = d;
-            d += input_sizes_[i][concat_dim_];
-          }
-          auto load =
-              inputs.back().load(m, n - cumulative_concat_dim_sizes.back());
-          for (size_t i = num_inputs - 1; i > 0; --i) {
-            load = ifThenElse(
-                CompareSelect::make(
-                    n, IntImm::make(cumulative_concat_dim_sizes[i]), kLT),
-                inputs[i - 1].load(m, n - cumulative_concat_dim_sizes[i - 1]),
-                load);
-          }
-          return load;
-        });
-    LoopNest nest({output});
-    nest.prepareForCodegen();
-    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
-    buf_args.push_back(output);
-    LLVMCodeGen cg(s, buf_args);
-
-    std::vector<CodeGen::CallArg> call_args;
-    for (auto _ : state) {
-      output_ = at::empty_like(ref_);
-      call_args.clear();
-      for (const auto& inp : inputs_) {
-        call_args.push_back(inp.data_ptr<float>());
-      }
-      call_args.push_back(output_.data_ptr<float>());
-      cg.call(call_args);
-    }
-  }
-
-  void runNNCLoop(benchmark::State& state) {
-    size_t num_inputs = inputs_.size();
-    size_t num_dims = 2;
-
-    TORCH_INTERNAL_ASSERT(concat_dim_ == 1);
-
-    auto output_buf = alloc<Buf>(
-        alloc<Var>("aten_cat", kHandle),
-        std::vector<ExprPtr>(
-            {alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
-        kFloat);
-
-    std::vector<BufHandle> inputs;
-    std::vector<StmtPtr> for_stmts(num_inputs);
-    int cumulative_input_sizes = 0;
-    for (size_t i = 0; i < num_inputs; ++i) {
-      inputs.emplace_back(BufHandle(
-          "input" + std::to_string(i),
-          {input_sizes_[i][0], input_sizes_[i][1]},
-          kFloat));
-      std::vector<VarPtr> for_vars(num_inputs);
-      for (const auto d : c10::irange(num_dims)) {
-        for_vars[d] =
-            alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
-      }
-      auto store = alloc<Store>(
-          output_buf,
-          std::vector<ExprPtr>(
-              {for_vars[0],
-               alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
-          alloc<Load>(
-              inputs[i].node(),
-              std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
-      auto for_st = alloc<For>(
-          for_vars[0],
-          alloc<IntImm>(0),
-          alloc<IntImm>(input_sizes_[i][0]),
-          alloc<For>(
-              for_vars[1],
-              alloc<IntImm>(0),
-              alloc<IntImm>(input_sizes_[i][1]),
-              store));
-      for_stmts[i] = for_st;
-      cumulative_input_sizes += input_sizes_[i][1];
-    }
-    auto output = Tensor(output_buf, alloc<Block>(for_stmts));
-
-    LoopNest nest({output});
-    nest.prepareForCodegen();
-    nest.vectorizeInnerLoops();
-    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
-    buf_args.push_back(output);
-    LLVMCodeGen cg(s, buf_args);
-
-    std::vector<CodeGen::CallArg> call_args;
-    for (auto _ : state) {
-      output_ = at::empty_like(ref_);
-      call_args.clear();
-      for (const auto& inp : inputs_) {
-        call_args.push_back(inp.data_ptr<float>());
-      }
-      call_args.push_back(output_.data_ptr<float>());
-      cg.call(call_args);
-    }
-  }
-
-  std::vector<std::vector<int>> input_sizes_;
-  int concat_dim_;
-  std::vector<at::Tensor> inputs_;
-  std::vector<int> output_size_;
-  at::Tensor output_;
-  at::Tensor ref_;
-};
-
-class Concat2D2Input : public ConcatBench {
- public:
-  void SetUp(const benchmark::State& state) override {
-    init(
-        {{state.range(0), state.range(1)}, {state.range(2), state.range(3)}},
-        state.range(4));
-  }
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(Concat2D2Input, ATen)(benchmark::State& state) {
-  runATen(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D2Input, NNC)(benchmark::State& state) {
-  runNNC(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D2Input, NNCLoop)(benchmark::State& state) {
-  runNNCLoop(state);
-}
-
-BENCHMARK_REGISTER_F(Concat2D2Input, ATen)
-    ->Args({1, 160, 1, 14, 1})
-    ->Args({1, 580, 1, 174, 1})
-    ->Args({20, 160, 20, 14, 1})
-    ->Args({20, 580, 20, 174, 1})
-    ->Args({8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D2Input, NNC)
-    ->Args({1, 160, 1, 14, 1})
-    ->Args({1, 580, 1, 174, 1})
-    ->Args({20, 160, 20, 14, 1})
-    ->Args({20, 580, 20, 174, 1})
-    ->Args({8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D2Input, NNCLoop)
-    ->Args({1, 160, 1, 14, 1})
-    ->Args({1, 580, 1, 174, 1})
-    ->Args({20, 160, 20, 14, 1})
-    ->Args({20, 580, 20, 174, 1})
-    ->Args({8, 512, 8, 512, 1});
-
-namespace {
-
-class Concat2D3Input : public ConcatBench {
- public:
-  void SetUp(const benchmark::State& state) override {
-    init(
-        {{state.range(0), state.range(1)},
-         {state.range(2), state.range(3)},
-         {state.range(4), state.range(5)}},
-        state.range(6));
-  }
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(Concat2D3Input, ATen)(benchmark::State& state) {
-  runATen(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D3Input, NNC)(benchmark::State& state) {
-  runNNC(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D3Input, NNCLoop)(benchmark::State& state) {
-  runNNCLoop(state);
-}
-
-BENCHMARK_REGISTER_F(Concat2D3Input, ATen)->Args({8, 512, 8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D3Input, NNC)->Args({8, 512, 8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D3Input, NNCLoop)
-    ->Args({8, 512, 8, 512, 8, 512, 1});
-
-namespace {
-
-class Concat2D7Input : public ConcatBench {
- public:
-  void SetUp(const benchmark::State& state) override {
-    init(
-        {{state.range(0), state.range(1)},
-         {state.range(2), state.range(3)},
-         {state.range(4), state.range(5)},
-         {state.range(6), state.range(7)},
-         {state.range(8), state.range(9)},
-         {state.range(10), state.range(11)},
-         {state.range(12), state.range(13)}},
-        state.range(14));
-  }
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(Concat2D7Input, ATen)(benchmark::State& state) {
-  runATen(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D7Input, NNC)(benchmark::State& state) {
-  runNNC(state);
-}
-
-BENCHMARK_DEFINE_F(Concat2D7Input, NNCLoop)(benchmark::State& state) {
-  runNNCLoop(state);
-}
-
-BENCHMARK_REGISTER_F(Concat2D7Input, ATen)
-    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D7Input, NNC)
-    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
-
-BENCHMARK_REGISTER_F(Concat2D7Input, NNCLoop)
-    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
diff --git a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp b/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp
deleted file mode 100644
index e0da3a38544f..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <c10/core/InferenceMode.h>
-#include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/fuser/interface.h>
-#include <torch/torch.h>
-
-using namespace torch::jit;
-
-static const std::string two_adds = R"JIT(
-def two_adds(self, x: Tensor, y: Tensor, z: Tensor) -> Tensor:
-    return x + y + z
-)JIT";
-
-static void FusedOverhead(benchmark::State& state) {
-  c10::InferenceMode mode;
-  overrideCanFuseOnCPU(true);
-
-  Module m("m");
-  m.define(two_adds);
-
-  auto x = torch::ones({1});
-  auto y = torch::ones({1});
-  auto z = torch::ones({1});
-
-  // Warmup.
-  for (const auto i : c10::irange(8)) {
-    (void)i; // Suppress unused variable warning
-    m.run_method("two_adds", x, y, z);
-  }
-
-  for (auto _ : state) {
-    m.run_method("two_adds", x, y, z);
-  }
-}
-
-static void UnfusedOverhead(benchmark::State& state) {
-  c10::InferenceMode guard;
-  overrideCanFuseOnCPU(false);
-
-  Module m("m");
-  m.define(two_adds);
-
-  auto x = torch::ones({1});
-  auto y = torch::ones({1});
-  auto z = torch::ones({1});
-
-  // Warmup.
-  for (const auto i : c10::irange(8)) {
-    (void)i; // Suppress unused variable warning
-    m.run_method("two_adds", x, y, z);
-  }
-
-  for (auto _ : state) {
-    m.run_method("two_adds", x, y, z);
-  }
-}
-
-BENCHMARK(FusedOverhead);
-BENCHMARK(UnfusedOverhead);
diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
deleted file mode 100644
index 403746578dff..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-namespace te = torch::jit::tensorexpr;
-
-namespace {
-class Gemm : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    M = state.range(0);
-    N = state.range(1);
-    K = state.range(2);
-    A = torch::randn({M, K});
-    B = torch::randn({K, N});
-    C = torch::mm(A, B);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    state.counters["GFLOPS"] = benchmark::Counter(
-        uint64_t(state.iterations()) * 2 * M * N * K,
-        benchmark::Counter::kIsRate);
-  }
-
-  int M;
-  int N;
-  int K;
-  at::Tensor A;
-  at::Tensor B;
-  at::Tensor C;
-};
-} // namespace
-
-BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) {
-  for (auto _ : state) {
-    torch::mm_out(C, A, B);
-  }
-}
-
-BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
-  te::BufHandle AP("A", {M, K}, te::kFloat);
-  te::BufHandle BP("B", {K, N}, te::kFloat);
-  te::Tensor CT = te::Reduce(
-      "gemm",
-      {M, N},
-      te::Sum(),
-      [&](const te::ExprHandle& m,
-          const te::ExprHandle& n,
-          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {K});
-  te::LoopNest loop({CT});
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
-
-  for (auto _ : state) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
-  }
-}
-
-BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
-  te::BufHandle AP("A", {M, K}, te::kFloat);
-  te::BufHandle BP("B", {K, N}, te::kFloat);
-  te::Tensor CT = te::Reduce(
-      "gemm",
-      {M, N},
-      te::Sum(),
-      [&](const te::ExprHandle& m,
-          const te::ExprHandle& n,
-          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {K});
-  te::LoopNest loop({CT});
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr m = loops[0];
-    loop.splitWithMask(m, 32);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr n = loops[2];
-    loop.splitWithMask(n, 32);
-  }
-  // mo, mi, no, ni, k ->
-  // mo, no, mi, ni, k
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[1];
-    te::ForPtr no = loops[2];
-    loop.reorderAxis(mi, no);
-  }
-  // mo, no, mi, ni, k ->
-  // mo, no, mi, k, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr ni = loops[3];
-    te::ForPtr k = loops[4];
-    loop.reorderAxis(ni, k);
-  }
-  // mo, no, mi, k, ni ->
-  // mo, no, k, mi, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[2];
-    te::ForPtr k = loops[3];
-    loop.reorderAxis(mi, k);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
-
-  for (auto _ : state) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
-  }
-}
-
-BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
-  te::BufHandle AP("A", {M, K}, te::kFloat);
-  te::BufHandle BP("B", {K, N}, te::kFloat);
-  te::Tensor CT = te::Reduce(
-      "gemm",
-      {M, N},
-      te::Sum(),
-      [&](const te::ExprHandle& m,
-          const te::ExprHandle& n,
-          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {K});
-  te::LoopNest loop({CT});
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr m = loops[0];
-    loop.splitWithMask(m, 4);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr n = loops[2];
-    loop.splitWithMask(n, 16);
-  }
-  // mo, mi, no, ni, k ->
-  // mo, no, mi, ni, k
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[1];
-    te::ForPtr no = loops[2];
-    loop.reorderAxis(mi, no);
-  }
-  // mo, no, mi, ni, k ->
-  // mo, no, mi, k, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr ni = loops[3];
-    te::ForPtr k = loops[4];
-    loop.reorderAxis(ni, k);
-  }
-  // mo, no, mi, k, ni ->
-  // mo, no, k, mi, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[2];
-    te::ForPtr k = loops[3];
-    loop.reorderAxis(mi, k);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
-
-  for (auto _ : state) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
-  }
-}
-
-BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
-  te::BufHandle AP("A", {M, K}, te::kFloat);
-  te::BufHandle BP("B", {K, N}, te::kFloat);
-  te::Tensor CT = te::Reduce(
-      "gemm",
-      {M, N},
-      te::Sum(),
-      [&](const te::ExprHandle& m,
-          const te::ExprHandle& n,
-          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {K});
-  te::LoopNest loop({CT});
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr m = loops[0];
-    loop.splitWithMask(m, 4);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr n = loops[2];
-    loop.splitWithMask(n, 16);
-  }
-  // mo, mi, no, ni, k ->
-  // mo, no, mi, ni, k
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[1];
-    te::ForPtr no = loops[2];
-    loop.reorderAxis(mi, no);
-  }
-  // mo, no, mi, ni, k ->
-  // mo, no, mi, k, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr ni = loops[3];
-    te::ForPtr k = loops[4];
-    loop.reorderAxis(ni, k);
-  }
-  // mo, no, mi, k, ni ->
-  // mo, no, k, mi, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[2];
-    te::ForPtr k = loops[3];
-    loop.reorderAxis(mi, k);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[3];
-    te::ForPtr ni = loops[4];
-    te::StmtPtr unrolled;
-    loop.vectorize(ni);
-    loop.fullUnroll(mi, &unrolled);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
-
-  for (auto _ : state) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
-  }
-}
-
-BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
-  te::BufHandle AP("A", {M, K}, te::kFloat);
-  te::BufHandle BP("B", {K, N}, te::kFloat);
-  te::Tensor CT = te::Reduce(
-      "gemm",
-      {M, N},
-      te::Sum(),
-      [&](const te::ExprHandle& m,
-          const te::ExprHandle& n,
-          const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {K});
-  te::LoopNest loop({CT});
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr m = loops[0];
-    loop.splitWithMask(m, 4);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr n = loops[2];
-    loop.splitWithMask(n, 16);
-  }
-  // mo, mi, no, ni, k ->
-  // mo, no, mi, ni, k
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[1];
-    te::ForPtr no = loops[2];
-    loop.reorderAxis(mi, no);
-  }
-  // mo, no, mi, ni, k ->
-  // mo, no, mi, k, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr ni = loops[3];
-    te::ForPtr k = loops[4];
-    loop.reorderAxis(ni, k);
-  }
-  // mo, no, mi, k, ni ->
-  // mo, no, k, mi, ni
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    te::ForPtr mi = loops[2];
-    te::ForPtr k = loops[3];
-    loop.reorderAxis(mi, k);
-  }
-  {
-    auto const& loops = loop.getLoopStmtsFor(CT);
-    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
-
-  for (auto _ : state) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
-  }
-}
-
-BENCHMARK_REGISTER_F(Gemm, Torch)->Args({128, 128, 128});
-BENCHMARK_REGISTER_F(Gemm, TensorExprNoopt)->Args({128, 128, 128});
-BENCHMARK_REGISTER_F(Gemm, TensorExprTile32x32)->Args({128, 128, 128});
-BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16)->Args({128, 128, 128});
-BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16VecUnroll)->Args({128, 128, 128});
-BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16Cache)->Args({128, 128, 128});
diff --git a/benchmarks/cpp/tensorexpr/bench_kernels.cpp b/benchmarks/cpp/tensorexpr/bench_kernels.cpp
deleted file mode 100644
index 9a57547984b0..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_kernels.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-#include <benchmark/benchmark.h>
-
-#include <ATen/code_template.h>
-#include <torch/csrc/jit/ir/ir.h>
-#include <torch/csrc/jit/ir/irparser.h>
-#include <torch/csrc/jit/tensorexpr/kernel.h>
-
-using namespace torch::jit;
-using namespace torch::jit::tensorexpr;
-
-static const std::string kernel_static_shapes_template = R"IR(
-    graph(%0 : Float(${dim}, strides=[1], device=cpu),
-          %1 : Float(${dim}, strides=[1], device=cpu)):
-        %2 : Float(${dim}, strides=[1]) = aten::mul(%0, %1)
-        %4 : Float(${dim}, strides=[1]) = aten::mul(%0, %2)
-        return (%4))IR";
-
-static const std::string kernel_symbolic_shapes = R"IR(
-    graph(%0 : Float(SS(-2), strides=[1], device=cpu),
-          %1 : Float(SS(-2), strides=[1], device=cpu),
-          %SS_2 : int):
-        %2 : Float(SS(-2), strides=[1]) = aten::mul(%0, %1)
-        %4 : Float(SS(-2), strides=[1]) = aten::mul(%0, %2)
-        return (%4))IR";
-
-class KernelBench : public benchmark::Fixture {
- public:
-  void Eager(benchmark::State& state) {
-    auto dim = state.range(0);
-    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-
-    for (auto _ : state) {
-      auto o = at::mul(a, at::mul(a, b));
-    }
-  }
-
-  void GraphWithStaticShapes(benchmark::State& state) {
-    auto dim = state.range(0);
-    auto graph = std::make_shared<Graph>();
-    at::jit::TemplateEnv env;
-    env.d("dim", dim);
-    const auto kernel_static_shapes =
-        format(kernel_static_shapes_template, env);
-    parseIR(kernel_static_shapes, &*graph);
-    TensorExprKernel k(graph);
-
-    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-    std::vector<at::Tensor> inputs = {a, b};
-
-    for (auto _ : state) {
-      std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
-      k.run(stack);
-    }
-  }
-
-  void GraphWithSymbolicShapes(benchmark::State& state) {
-    auto dim = state.range(0);
-    auto graph = std::make_shared<Graph>();
-    parseIR(kernel_symbolic_shapes, &*graph);
-
-    std::vector<torch::jit::StrideInput> input_desc = {
-        torch::jit::StrideInput::TENSOR_CONT};
-    std::unordered_map<
-        const torch::jit::Value*,
-        std::vector<torch::jit::StrideInput>>
-        symbolic_strides;
-    symbolic_strides[graph->inputs().at(0)] = input_desc;
-    symbolic_strides[graph->inputs().at(1)] = input_desc;
-    symbolic_strides[graph->outputs().at(0)] = input_desc;
-    std::vector<int64_t> symbolic_shape_inputs = {-2};
-    TensorExprKernel k(
-        graph, {}, symbolic_shape_inputs, false, symbolic_strides);
-
-    auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-    auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
-    std::vector<at::Tensor> inputs = {a, b};
-
-    for (auto _ : state) {
-      std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
-      stack.push_back(dim);
-      k.run(stack);
-    }
-  }
-};
-
-BENCHMARK_DEFINE_F(KernelBench, Eager)(benchmark::State& state) {
-  Eager(state);
-}
-
-BENCHMARK_DEFINE_F(KernelBench, StaticShapes)(benchmark::State& state) {
-  GraphWithStaticShapes(state);
-}
-BENCHMARK_DEFINE_F(KernelBench, SymbolicShapes)(benchmark::State& state) {
-  GraphWithSymbolicShapes(state);
-}
-
-BENCHMARK_REGISTER_F(KernelBench, Eager)->Range(32, 2048);
-BENCHMARK_REGISTER_F(KernelBench, StaticShapes)->Range(32, 2048);
-BENCHMARK_REGISTER_F(KernelBench, SymbolicShapes)->Range(32, 2048);
diff --git a/benchmarks/cpp/tensorexpr/bench_ops.py b/benchmarks/cpp/tensorexpr/bench_ops.py
deleted file mode 100644
index 3956d7a02a28..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_ops.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import timeit
-
-import torch
-import torch.nn.functional as F
-
-torch._C._jit_override_can_fuse_on_cpu(True)
-torch._C._debug_set_fusion_group_inlining(False)
-torch.set_num_threads(1)
-
-
-def hardswish(x):
-    return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0
-
-
-unary_ops = [
-    hardswish,
-    torch._C._nn.hardswish,
-    torch.sigmoid,
-    torch.reciprocal,
-    torch.neg,
-    torch.relu,
-    torch.isnan,
-    torch.log,
-    torch.log10,
-    torch.log1p,
-    torch.log2,
-    torch.exp,
-    torch.expm1,
-    torch.erf,
-    torch.erfc,
-    torch.cos,
-    torch.sin,
-    torch.tan,
-    torch.acos,
-    torch.asin,
-    torch.cosh,
-    torch.sinh,
-    torch.atan,
-    torch.tanh,
-    torch.sqrt,
-    torch.rsqrt,
-    torch.abs,
-    torch.ceil,
-    torch.floor,
-    torch.round,
-    torch.trunc,
-    torch.lgamma,
-]
-
-print(f"{'op':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
-
-for op in unary_ops:
-    x = torch.rand((1024, 1024))
-    traced = torch.jit.trace(op, (x))
-
-    # Warmup.
-    warmup_iters = 8
-    for _ in range(warmup_iters):
-        op(x)
-        traced(x)
-
-    # Validate result.
-    torch.testing.assert_close(op(x), traced(x))
-
-    # Benchmark.
-    bench_iters = 100
-    teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
-    tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
-    print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
-
-
-def test_batch_norm():
-    op = F.batch_norm
-    print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
-    batch_norm_shapes = [
-        [1, 64, 112, 112],
-        [1, 256, 14, 14],
-        [1, 128, 28, 28],
-        [1, 64, 56, 56],
-        [1, 512, 7, 7],
-        [5, 64, 112, 112],
-        [5, 256, 14, 14],
-        [5, 128, 28, 28],
-        [5, 64, 56, 56],
-        [5, 512, 7, 7],
-    ]
-    for n, c, h, w in batch_norm_shapes:
-        x = torch.rand((n, c, h, w))
-        y = torch.rand(c)
-        z = torch.rand(c)
-        traced = torch.jit.trace(op, (x, y, z))
-
-        # Warmup.
-        warmup_iters = 8
-        for _ in range(warmup_iters):
-            op(x, y, z)
-            traced(x, y, z)
-
-        # Validate result.
-        torch.testing.assert_close(op(x, y, z), traced(x, y, z))
-
-        # Benchmark.
-        bench_iters = 100
-        teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters)
-        tjit = timeit.timeit(
-            stmt="traced(x, y, z)", globals=locals(), number=bench_iters
-        )
-        print(
-            f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}"
-        )
-
-
-test_batch_norm()
diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
deleted file mode 100644
index 8d77a459c603..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <c10/util/irange.h>
-#include <torch/csrc/jit/tensorexpr/analysis.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-#include <immintrin.h>
-
-namespace torch {
-namespace jit {
-namespace tensorexpr {
-
-class ParallelAdd : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    at::set_num_threads(4);
-    torch::manual_seed(0x12345678);
-    M = state.range(0);
-    A = torch::randn({M});
-    B = torch::randn({M});
-    C = torch::zeros({M});
-  }
-
-  void TearDown(benchmark::State& state) override {
-    state.counters["tasks"] = benchmark::Counter(
-        uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate);
-  }
-
-  int M;
-  at::Tensor A;
-  at::Tensor B;
-  at::Tensor C;
-};
-
-BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
-  BufHandle a_buf("a", {M}, kFloat);
-  BufHandle b_buf("b", {M}, kFloat);
-  Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) {
-    return a_buf.load(m) + b_buf.load(m);
-  });
-  LoopNest loop_nest({c_tensor});
-  auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
-  ForPtr m = loops[0];
-  m->set_parallel();
-  loop_nest.prepareForCodegen();
-  StmtPtr stmt = loop_nest.root_stmt();
-  LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});
-
-  float* a_ptr = A.data_ptr<float>();
-  float* b_ptr = B.data_ptr<float>();
-  float* c_ptr = C.data_ptr<float>();
-  std::vector<void*> args({c_ptr, a_ptr, b_ptr});
-  cg.value<int>(args);
-  for (const auto i : c10::irange(M)) {
-    float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
-    TORCH_CHECK(diff < 1e-5);
-  }
-
-  for (auto _ : state) {
-    cg.value<int>(args);
-  }
-}
-
-BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});
-
-} // namespace tensorexpr
-} // namespace jit
-} // namespace torch
diff --git a/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp b/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp
deleted file mode 100644
index f1fe120c14ad..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_prefix_sum.cpp
+++ /dev/null
@@ -1,395 +0,0 @@
-#include <benchmark/benchmark.h>
-#include "ATen/Functions.h"
-
-#include <torch/csrc/jit/jit_log.h>
-#include <torch/csrc/jit/tensorexpr/ir.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/operators/operators.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-#include <immintrin.h>
-
-using namespace torch::jit::tensorexpr;
-
-namespace {
-#ifdef __AVX2__
-
-#define _mm256_slli_si1(x)                                                   \
-  _mm256_blend_epi32(                                                        \
-      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)), \
-      _mm256_setzero_si256(),                                                \
-      1)
-#define _mm256_slli_si2(x)                                                   \
-  _mm256_blend_epi32(                                                        \
-      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(5, 4, 3, 2, 1, 0, 7, 6)), \
-      _mm256_setzero_si256(),                                                \
-      3)
-#define _mm256_slli_si4(x)                                                   \
-  _mm256_blend_epi32(                                                        \
-      _mm256_permutevar8x32_ps(x, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4)), \
-      _mm256_setzero_si256(),                                                \
-      15)
-
-__m256 PrefixSum(__m256 x) {
-  x = _mm256_add_ps(x, _mm256_slli_si1(x));
-  x = _mm256_add_ps(x, _mm256_slli_si2(x));
-  x = _mm256_add_ps(x, _mm256_slli_si4(x));
-  return x; // local prefix sums
-}
-
-__m256i PrefixSumInt(__m256i x) {
-  x = _mm256_add_epi32(x, _mm256_slli_si1(x));
-  x = _mm256_add_epi32(x, _mm256_slli_si2(x));
-  x = _mm256_add_epi32(x, _mm256_slli_si4(x));
-  return x; // local prefix sums
-}
-
-// Util function to log the given value. Not used during benchmarking.
-template <class T>
-inline void Log(const __m256i& value) {
-  const size_t n = sizeof(__m256i) / sizeof(T);
-  T buffer[n];
-  _mm256_storeu_si256((__m256i*)buffer, value);
-  for (int i = 0; i < n; i++)
-    std::cout << buffer[n - i - 1] << " ";
-  std::cout << std::endl;
-}
-#endif
-
-#ifdef __AVX512F__
-
-#define _mm512_slli_si512(x, k) \
-  _mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k)
-
-__m512 PrefixSum(__m512 x) {
-  x = _mm512_add_ps(x, _mm512_slli_si512(x, 1));
-  x = _mm512_add_ps(x, _mm512_slli_si512(x, 2));
-  x = _mm512_add_ps(x, _mm512_slli_si512(x, 4));
-  x = _mm512_add_ps(x, _mm512_slli_si512(x, 8));
-  return x; // local prefix sums
-}
-
-__m512i PrefixSumInt(__m512i x) {
-  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 1));
-  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 2));
-  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 4));
-  x = _mm512_add_epi32(x, _mm512_slli_si512(x, 8));
-  return x; // local prefix sums
-}
-
-template <int index>
-float _mm512_extract_f32(__m512 target) {
-  return _mm512_cvtss_f32(_mm512_alignr_epi32(target, target, index));
-}
-
-// extract the last i32 from target
-int _mm512_extract_epi32(__m512i target) {
-  __m256i x = _mm512_extracti32x8_epi32(target, 1);
-  return _mm256_extract_epi32(x, 7);
-}
-
-void PrefixSum(float* output_data, float* input_data, size_t input_size) {
-  float carry = 0.0f;
-  for (int i = 0; i < input_size / 16; i++) {
-    __m512 x = _mm512_loadu_ps(input_data + i * 16);
-    x = PrefixSum(x);
-    x = _mm512_add_ps(x, _mm512_set1_ps(carry));
-    carry = _mm512_extract_f32<15>(x);
-    _mm512_storeu_ps((__m512*)(output_data + i * 16), x);
-  }
-}
-
-void PrefixSum(int* output_data, int* input_data, size_t input_size) {
-  int carry = 0;
-  for (int i = 0; i < input_size / 16; i++) {
-    __m512i x = _mm512_loadu_epi32(input_data + i * 16);
-    x = PrefixSumInt(x);
-    x = _mm512_add_epi32(x, _mm512_set1_epi32(carry));
-    carry = _mm512_extract_epi32(x);
-    _mm512_storeu_epi32((__m512i*)(output_data + i * 16), x);
-  }
-}
-#endif
-
-// PrefixSum: the same as inclusive scan
-class PrefixSumBench : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    input_size_ = state.range(0);
-    input_ = torch::rand(input_size_);
-    ref_ = prefixSum(input_);
-
-    // no type promotion. Default is int->long.
-    input_int_ = torch::randint(1000, {input_size_}, at::kInt);
-    ref_int_ = at::cumsum(input_int_, 0, at::kInt);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    if (output_.numel() > 0) {
-      if (output_.numel() == ref_.numel()) {
-        TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
-      }
-      state.counters["GB/s"] = benchmark::Counter(
-          uint64_t(state.iterations()) * 2 * output_.nbytes(),
-          benchmark::Counter::kIsRate);
-    } else {
-      if (output_int_.numel() == ref_int_.numel()) {
-        TORCH_CHECK(ref_int_.equal(output_int_));
-      }
-      state.counters["GB/s"] = benchmark::Counter(
-          uint64_t(state.iterations()) * 2 * output_int_.nbytes(),
-          benchmark::Counter::kIsRate);
-    }
-  }
-
-  at::Tensor prefixSum(const at::Tensor& inp) {
-    return at::cumsum(inp, 0);
-  }
-
-  void runATen(benchmark::State& state) {
-    output_ = prefixSum(input_);
-    for (auto _ : state) {
-      at::cumsum_out(output_, input_, 0);
-    }
-  }
-
-  void runLocal(benchmark::State& state) {
-    output_ = at::empty_like(ref_);
-    for (auto _ : state) {
-      auto input_data = input_.data_ptr<float>();
-      auto output_data = output_.data_ptr<float>();
-      float sum = 0.0f;
-      for (int i = 0; i < input_size_; ++i) {
-        sum = sum + input_data[i];
-        output_data[i] = sum;
-      }
-    }
-  }
-
-  // no type promotion
-  void runLocalInt(benchmark::State& state) {
-    output_int_ = at::empty_like(input_int_);
-    for (auto _ : state) {
-      auto input_data = input_int_.data_ptr<int>();
-      auto output_data = output_int_.data_ptr<int>();
-      int sum = 0;
-      for (int i = 0; i < input_size_; ++i) {
-        sum = sum + input_data[i];
-        output_data[i] = sum;
-      }
-    }
-  }
-
-  void runNNC(benchmark::State& state) {
-    BufHandle input("input", {input_size_}, kFloat);
-    BufHandle output("output", {input_size_}, kFloat);
-    BufHandle s("s", {1}, kFloat);
-    VarHandle i("i", kInt);
-    auto allocS = Allocate::make(s);
-    auto initS = Store::make(s, {0}, 0.0f);
-    auto accumS = Store::make(
-        s, {0}, Add::make(Load::make(s, {0}), Load::make(input, {i})));
-    auto store = Store::make(output, {i}, Load::make(s, {0}));
-    auto forI = For::make(i, 0, input_size_, Block::make({accumS, store}));
-    auto freeS = Free::make(s);
-    auto par = Block::make({allocS, initS, forI, freeS});
-    LoopNest nest(par, {output.node()});
-
-    std::vector<CodeGen::BufferArg> buf_args;
-    buf_args.emplace_back(input);
-    buf_args.emplace_back(output);
-    LLVMCodeGen cg(nest.root_stmt(), buf_args);
-
-    std::vector<CodeGen::CallArg> call_args;
-    output_ = at::empty_like(ref_);
-    for (auto _ : state) {
-      call_args.clear();
-      call_args.emplace_back(input_.data_ptr<float>());
-      call_args.emplace_back(output_.data_ptr<float>());
-      cg.call(call_args);
-    }
-  }
-
-#ifdef __AVX2__
-  void runLocalAVX2(benchmark::State& state) {
-    output_ = at::empty_like(ref_);
-    for (auto _ : state) {
-      float* input_data = input_.data_ptr<float>();
-      float* output_data = output_.data_ptr<float>();
-
-      float carry = 0.0f;
-      for (int i = 0; i < input_size_ / 8; i++) {
-        __m256 x = _mm256_loadu_ps(input_data + i * 8);
-        x = PrefixSum(x);
-        x = _mm256_add_ps(x, _mm256_set1_ps(carry));
-        (reinterpret_cast<__m256*>(output_data))[i] = x;
-        carry = _mm256_cvtss_f32(_mm256_permutevar8x32_ps(
-            x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)));
-      }
-    }
-  }
-
-  void runLocalIntAVX2(benchmark::State& state) {
-    output_int_ = at::empty_like(input_int_);
-    for (auto _ : state) {
-      auto input_data = input_int_.data_ptr<int>();
-      auto output_data = output_int_.data_ptr<int>();
-
-      int carry = 0;
-      for (size_t i = 0; i < input_size_ / 8; i++) {
-        __m256i x = _mm256_loadu_si256((__m256i*)(input_data + i * 8));
-        x = PrefixSumInt(x);
-        x = _mm256_add_epi32(x, _mm256_set1_epi32(carry));
-        _mm256_storeu_si256((__m256i*)(output_data + i * 8), x);
-        carry = _mm256_extract_epi32(x, 7);
-      }
-    }
-  }
-#endif
-
-#ifdef __AVX512F__
-  void runLocalAVX512(benchmark::State& state) {
-    output_ = at::empty_like(ref_);
-    for (auto _ : state) {
-      auto input_data = input_.data_ptr<float>();
-      auto output_data = output_.data_ptr<float>();
-      PrefixSum(output_data, input_data, input_size_);
-    }
-  }
-
-  void runLocalIntAVX512(benchmark::State& state) {
-    output_int_ = at::empty_like(input_int_);
-    for (auto _ : state) {
-      auto input_data = input_int_.data_ptr<int>();
-      auto output_data = output_int_.data_ptr<int>();
-      PrefixSum(output_data, input_data, input_size_);
-    }
-  }
-
-  void runExclusiveScanAVX512(benchmark::State& state) {
-    output_ = at::empty({input_size_ + 1}, at::kFloat);
-    for (auto _ : state) {
-      auto input_data = input_.data_ptr<float>();
-      auto output_data = output_.data_ptr<float>();
-      output_data[0] = 0.0f;
-      PrefixSum(output_data + 1, input_data, input_size_);
-    }
-  }
-
-  void runExclusiveScanIntAVX512(benchmark::State& state) {
-    output_int_ = at::empty({input_size_ + 1}, at::kInt);
-    for (auto _ : state) {
-      auto input_data = input_int_.data_ptr<int>();
-      auto output_data = output_int_.data_ptr<int>();
-      output_data[0] = 0;
-      PrefixSum(output_data + 1, input_data, input_size_);
-    }
-  }
-
-#endif
-
- private:
-  int input_size_;
-  at::Tensor input_;
-  at::Tensor output_;
-  at::Tensor ref_;
-  at::Tensor input_int_;
-  at::Tensor output_int_;
-  at::Tensor ref_int_; // no type promotion
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(PrefixSumBench, ATen)(benchmark::State& state) {
-  runATen(state);
-}
-
-BENCHMARK_DEFINE_F(PrefixSumBench, Local)(benchmark::State& state) {
-  runLocal(state);
-}
-
-BENCHMARK_DEFINE_F(PrefixSumBench, LocalInt)(benchmark::State& state) {
-  runLocalInt(state);
-}
-
-BENCHMARK_DEFINE_F(PrefixSumBench, NNC)(benchmark::State& state) {
-  runNNC(state);
-}
-
-#ifdef __AVX2__
-BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX2)(benchmark::State& state) {
-  runLocalAVX2(state);
-}
-BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX2)(benchmark::State& state) {
-  runLocalIntAVX2(state);
-}
-#endif
-
-#ifdef __AVX512F__
-BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX512)(benchmark::State& state) {
-  runLocalAVX512(state);
-}
-BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX512)(benchmark::State& state) {
-  runLocalIntAVX512(state);
-}
-
-BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanAVX512)
-(benchmark::State& state) {
-  runExclusiveScanAVX512(state);
-}
-BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanIntAVX512)
-(benchmark::State& state) {
-  runExclusiveScanIntAVX512(state);
-}
-#endif
-
-//---------- float benchmarks ----------//
-BENCHMARK_REGISTER_F(PrefixSumBench, ATen)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-
-BENCHMARK_REGISTER_F(PrefixSumBench, NNC)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-
-BENCHMARK_REGISTER_F(PrefixSumBench, Local)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-
-#ifdef __AVX2__
-BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX2)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-#endif
-
-#ifdef __AVX512F__
-BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX512)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanAVX512)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-#endif
-
-//---------- int benchmarks ----------//
-BENCHMARK_REGISTER_F(PrefixSumBench, LocalInt)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-
-#ifdef __AVX2__
-BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX2)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-#endif
-
-#ifdef __AVX512F__
-BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX512)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanIntAVX512)
-    ->RangeMultiplier(4)
-    ->Ranges({{1 << 6, 1 << 20}});
-#endif
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
deleted file mode 100644
index bf0fe21ca0b1..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ /dev/null
@@ -1,621 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <c10/util/irange.h>
-#include <torch/csrc/jit/tensorexpr/analysis.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/operators/operators.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-#include <immintrin.h>
-
-namespace te = torch::jit::tensorexpr;
-
-namespace {
-class Reduce1D : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    at::set_num_threads(1);
-    torch::manual_seed(0x12345678);
-    M = state.range(0);
-    A = torch::randn({M});
-    B = torch::zeros({});
-    ref = torch::sum(A, {0});
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-7));
-    state.counters["BYTES"] = benchmark::Counter(
-        uint64_t(state.iterations()) * M * sizeof(float),
-        benchmark::Counter::kIsRate);
-  }
-
-  int M;
-  at::Tensor A;
-  at::Tensor B;
-  at::Tensor ref;
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
-  for (auto _ : state) {
-    B = torch::sum(A, {0});
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
-
-#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))
-
-template <typename Func>
-void ValidateFunc(
-    Func func,
-    const std::string& func_name,
-    at::Tensor& A,
-    at::Tensor& B) {
-  func(A, B);
-  float* pB = B.data_ptr<float>();
-  at::Tensor B2 = torch::sum(A, {0});
-  float* pB2 = B2.data_ptr<float>();
-  int size = A.numel();
-  float size_sqrt = std::sqrt(size);
-  float natural_noise = size_sqrt * 1e-7;
-  if (!torch::allclose(B, B2, natural_noise)) {
-    std::ostringstream oss;
-    oss << func_name << " failed check: " << std::endl;
-    oss << "value: " << B << std::endl;
-    ;
-    oss << "reference: " << B2 << std::endl;
-    oss << "threshold: " << natural_noise << std::endl;
-    throw std::runtime_error(oss.str());
-  }
-}
-
-static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
-  float* pA = A.data_ptr<float>();
-  float* pB = B.data_ptr<float>();
-  int size = A.numel();
-  TORCH_CHECK(B.numel() == 1);
-  *pB = 0.;
-  for (const auto i : c10::irange(size)) {
-    *pB += pA[i];
-  }
-}
-
-BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
-  VALIDATE(reduce1d_naive, A, B);
-  for (auto _ : state) {
-    reduce1d_naive(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
-
-static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
-  float* pA = A.data_ptr<float>();
-  float* pB = B.data_ptr<float>();
-  int size = A.numel();
-  constexpr int kChunkSize = 16;
-  TORCH_CHECK(B.numel() == 1);
-  TORCH_CHECK(size % kChunkSize == 0);
-  *pB = 0.;
-  float temp[kChunkSize];
-  for (const auto j : c10::irange(kChunkSize)) {
-    temp[j] = 0;
-  }
-
-  int chunk_count = size / kChunkSize;
-  for (const auto i : c10::irange(chunk_count)) {
-    for (const auto j : c10::irange(kChunkSize)) {
-      temp[j] += pA[i * kChunkSize + j];
-    }
-  }
-
-  for (const auto j : c10::irange(kChunkSize)) {
-    *pB += temp[j];
-  }
-}
-
-BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
-  VALIDATE(reduce1d_native_rfactor, A, B);
-  for (auto _ : state) {
-    reduce1d_native_rfactor(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});
-
-#ifdef USE_AVX2
-
-// x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
-inline float sum_f32x8(__m256 x) {
-  // hiQuad = ( x7, x6, x5, x4 )
-  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
-  // loQuad = ( x3, x2, x1, x0 )
-  const __m128 loQuad = _mm256_castps256_ps128(x);
-  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
-  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
-  // loDual = ( -, -, x1 + x5, x0 + x4 )
-  const __m128 loDual = sumQuad;
-  // hiDual = ( -, -, x3 + x7, x2 + x6 )
-  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
-  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
-  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
-  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
-  const __m128 lo = sumDual;
-  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
-  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
-  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
-  const __m128 sum = _mm_add_ss(lo, hi);
-  return _mm_cvtss_f32(sum);
-}
-
-static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
-  float* pA = A.data_ptr<float>();
-  float* pB = B.data_ptr<float>();
-  int size = A.numel();
-  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
-  TORCH_CHECK(B.numel() == 1);
-  TORCH_CHECK(size % kChunkSize == 0);
-  *pB = 0.;
-  __m256 temp;
-  temp = _mm256_setzero_ps();
-
-  int tile_count = size / kChunkSize;
-  for (const auto i : c10::irange(tile_count)) {
-    __m256 data = _mm256_load_ps(pA + i * kChunkSize);
-    temp = _mm256_add_ps(temp, data);
-  }
-
-  float result = sum_f32x8(temp);
-  *pB = result;
-}
-
-BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) {
-  VALIDATE(reduce1d_native_vector, A, B);
-  for (auto _ : state) {
-    reduce1d_native_vector(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24});
-
-static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
-  static constexpr int kTileSize = 4;
-  float* pA = A.data_ptr<float>();
-  float* pB = B.data_ptr<float>();
-  int size = A.numel();
-  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
-  TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1");
-  TORCH_CHECK(
-      size % kChunkSize == 0,
-      "Invalid size: ",
-      size,
-      " % ",
-      kChunkSize,
-      " ! = 0");
-  __m256 t[kTileSize];
-  for (const auto j : c10::irange(kTileSize)) {
-    t[j] = _mm256_setzero_ps();
-  }
-
-  int tile_count = size / kChunkSize / kTileSize;
-  for (int i = 0; i < tile_count; i++) {
-#pragma unroll
-    for (int j = 0; j < kTileSize; j++) {
-      float* p = pA + (i * kTileSize + j) * kChunkSize;
-      __m256 data = _mm256_loadu_ps(p);
-      t[j] = _mm256_add_ps(t[j], data);
-    }
-  }
-
-  float result = sum_f32x8(t[0]);
-  for (const auto j : c10::irange(1, kTileSize)) {
-    result += sum_f32x8(t[j]);
-  }
-  *pB = result;
-}
-
-BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) {
-  VALIDATE(reduce1d_native_tiled, A, B);
-  for (auto _ : state) {
-    reduce1d_native_tiled(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});
-
-#endif // USE_AVX2
-
-BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
-  int M = A.numel();
-
-  te::BufHandle AP("A", {M}, te::kFloat);
-  te::Tensor BT = te::Reduce(
-      "reduce_full",
-      {1},
-      te::Sum(),
-      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
-        return AP.load(m);
-      },
-      {M});
-
-  te::LoopNest loop({BT});
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
-
-  auto func = [&](at::Tensor& A, at::Tensor& B) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
-  };
-
-  ValidateFunc(func, "reduce1d_te_naive", A, B);
-  for (auto _ : state) {
-    func(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});
-
-BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
-  int M = A.numel();
-
-  te::BufHandle AP("A", {M}, te::kFloat);
-  te::Tensor BT = te::Reduce(
-      "reduce_full",
-      {1},
-      te::Sum(),
-      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
-        return AP.load(m);
-      },
-      {M});
-
-  te::LoopNest loop({BT});
-  const int kChunkSize = 8;
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(BT);
-    te::ForPtr m = loops[1];
-    loop.splitWithTail(m, kChunkSize);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
-
-  auto func = [&](at::Tensor& A, at::Tensor& B) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
-  };
-
-  ValidateFunc(func, "reduce1d_te_naive", A, B);
-  for (auto _ : state) {
-    func(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
-
-BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
-  int M = A.numel();
-
-  te::BufHandle AP("A", {M}, te::kFloat);
-  te::Tensor BT = te::Reduce(
-      "reduce_full",
-      {1},
-      te::Sum(),
-      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
-        return AP.load(m);
-      },
-      {M});
-
-  te::LoopNest loop({BT});
-  const int kChunkSize = 8;
-
-  {
-    auto const& loops = loop.getLoopStmtsFor(BT);
-    te::ForPtr m = loops[1];
-    loop.splitWithMask(m, kChunkSize);
-  }
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
-
-  auto func = [&](at::Tensor& A, at::Tensor& B) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
-  };
-
-  ValidateFunc(func, "reduce1d_te_naive", A, B);
-  for (auto _ : state) {
-    func(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});
-
-BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
-  int M = A.numel();
-  const int kChunkSize = 8;
-  TORCH_CHECK(M % kChunkSize == 0);
-
-  te::BufHandle AP("A", {M}, te::kFloat);
-  te::Tensor BT = te::Reduce(
-      "reduce_full",
-      {},
-      te::Sum(),
-      [&](const te::ExprHandle& m) { return AP.load(m); },
-      {M});
-
-  te::LoopNest loop({BT});
-  te::BufPtr rfac_buf;
-
-  auto loops = loop.getLoopStmtsFor(BT);
-  TORCH_CHECK(loops.size() == 1);
-  te::ForPtr mi;
-  loop.splitWithMask(loops.at(0), kChunkSize, &mi);
-  te::ForPtr mo = loops.at(0);
-
-  loop.reorderAxis(mo, mi);
-  loops = loop.getLoopStmtsFor(BT);
-  auto bt_body = loop.getAllWritesToBuf(BT.buf())[1];
-  TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
-  loop.reorderAxis(loops.at(0), loops.at(1));
-
-  loops = loop.getAllInnermostLoopsWritingToBuf(rfac_buf);
-  TORCH_CHECK(loops.size() == 2);
-  loop.vectorize(loops.at(1));
-
-  loop.prepareForCodegen();
-  te::StmtPtr s = loop.root_stmt();
-  s = te::IRSimplifier::simplify(s);
-  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
-
-  auto func = [&](at::Tensor& A, at::Tensor& B) {
-    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
-  };
-
-  ValidateFunc(func, "reduce1d_te_naive", A, B);
-  for (auto _ : state) {
-    func(A, B);
-  }
-}
-
-BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});
-
-BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
-  const int M = A.numel();
-  const int kChunkSize = 8;
-
-  te::BufHandle a("A", {M}, te::kFloat);
-  te::Tensor b = te::computeSum(
-      {a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU);
-  te::LoopNest nest({b});
-
-  auto loops = nest.getLoopStmtsFor(b);
-  te::ForPtr mi, mo;
-  te::BufPtr rf;
-  nest.splitWithMask(loops[0], kChunkSize, &mi);
-  loops = nest.reorder({loops[0], mi}, {1, 0});
-  nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf);
-  nest.reorderAxis(loops[0], loops[1]);
-  for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
-    nest.vectorize(loop);
-  }
-
-  nest.prepareForCodegen();
-  nest.simplify();
-  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
-
-  for (auto _ : state) {
-    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
-  }
-}
-BENCHMARK_REGISTER_F(Reduce1D, Op)->Args({1 << 24});
-
-class Reduce2DCol : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    at::set_num_threads(1);
-    torch::manual_seed(0x12345678);
-    M = state.range(0);
-    N = state.range(1);
-    A = torch::randn({M, N});
-    ref = torch::sum(A, {0});
-    B = torch::zeros_like(ref);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-5));
-    state.counters["BYTES"] = benchmark::Counter(
-        uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
-        benchmark::Counter::kIsRate);
-  }
-
-  int M;
-  int N;
-  at::Tensor A;
-  at::Tensor B;
-  at::Tensor ref;
-};
-
-BENCHMARK_DEFINE_F(Reduce2DCol, Torch)(benchmark::State& state) {
-  for (auto _ : state) {
-    B = torch::sum(A, {0});
-  }
-}
-BENCHMARK_REGISTER_F(Reduce2DCol, Torch)
-    ->Args({1 << 3, 1 << 21})
-    ->Args({1 << 6, 1 << 18})
-    ->Args({1 << 12, 1 << 12});
-
-BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
-  constexpr int kCacheSize = 1 << 12;
-  te::BufHandle a("A", {M, N}, te::kFloat);
-  te::Tensor b = te::computeSum(
-      {a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU);
-  te::LoopNest nest({b});
-
-  auto sch = state.range(2);
-  if (sch == 0) {
-  } else if (sch == 1) {
-    auto loops = nest.getLoopStmtsFor(b);
-    nest.reorderAxis(loops[0], loops[1]);
-  } else if (sch == 2) {
-    auto loops = nest.getLoopStmtsFor(b);
-    nest.splitWithTail(loops[0], kCacheSize);
-    loops = nest.getLoopStmtsFor(b);
-    nest.reorderAxis(loops[1], loops[2]);
-  } else if (sch == 3) {
-    auto loops = nest.getLoopStmtsFor(b);
-    nest.splitWithTail(loops[1], 8);
-    loops = nest.getLoopStmtsFor(b);
-    nest.reorderAxis(loops[0], loops[1]);
-  }
-
-  nest.prepareForCodegen();
-  nest.simplify();
-  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
-  for (auto _ : state) {
-    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
-  }
-}
-BENCHMARK_REGISTER_F(Reduce2DCol, OpSchedule)
-    ->Apply( // CustomArgs);
-        [](benchmark::internal::Benchmark* b) {
-          for (auto sch : {0, 1, 2, 3}) {
-            for (auto rows : {3, 6, 12}) {
-              auto cols = 24 - rows;
-              b->Args({1 << rows, 1 << cols, sch});
-            }
-          }
-        });
-
-class Reduce2DRow : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    at::set_num_threads(1);
-    torch::manual_seed(0x12345678);
-    M = state.range(0);
-    N = state.range(1);
-    A = torch::randn({M, N});
-    ref = torch::sum(A, {1});
-    B = torch::zeros_like(ref);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-4));
-    state.counters["BYTES"] = benchmark::Counter(
-        uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
-        benchmark::Counter::kIsRate);
-  }
-
-  int M;
-  int N;
-  at::Tensor A;
-  at::Tensor B;
-  at::Tensor ref;
-};
-
-BENCHMARK_DEFINE_F(Reduce2DRow, Torch)(benchmark::State& state) {
-  for (auto _ : state) {
-    B = torch::sum(A, {1});
-  }
-}
-BENCHMARK_REGISTER_F(Reduce2DRow, Torch)
-    ->Args({1 << 3, 1 << 21})
-    ->Args({1 << 6, 1 << 18})
-    ->Args({1 << 12, 1 << 12})
-    ->Args({1 << 18, 1 << 6});
-
-BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) {
-  auto a = A.data_ptr<float>();
-  auto b = B.data_ptr<float>();
-  constexpr int Mb = 4;
-  constexpr int Nb = 4;
-  auto fn = [&] {
-    for (int m_outer = 0; m_outer < M; m_outer += Mb) {
-      float bregs[Mb][Nb] = {0.0f};
-      for (int n_outer = 0; n_outer < N; n_outer += Nb) {
-        for (const auto m_inner : c10::irange(Mb)) {
-          for (const auto n_inner : c10::irange(Nb)) {
-            bregs[m_inner][n_inner] +=
-                a[(m_outer + m_inner) * N + n_outer + n_inner];
-          }
-        }
-      }
-      for (const auto m_inner : c10::irange(Mb)) {
-        b[m_outer + m_inner] = 0.f;
-        for (const auto n_inner : c10::irange(Nb)) {
-          b[m_outer + m_inner] += bregs[m_inner][n_inner];
-        }
-      }
-    }
-  };
-  for (auto _ : state) {
-    fn();
-  }
-}
-BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6});
-
-BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
-  constexpr int kChunkSize = 8;
-  te::BufHandle a("A", {M, N}, te::kFloat);
-  te::Tensor b = te::computeSum(
-      {a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU);
-  te::LoopNest nest({b});
-
-  auto sch = state.range(2);
-  if (sch == 1) {
-    auto loops = nest.getLoopStmtsFor(b);
-    te::ForPtr mi, mo;
-    te::BufPtr rf;
-    nest.splitWithMask(loops[1], kChunkSize, &mi);
-    loops = nest.reorder({loops[1], mi}, {1, 0});
-    TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
-    nest.reorderAxis(loops[0], loops[1]);
-    for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
-      nest.vectorize(loop);
-    }
-  } else if (sch == 2) {
-    auto loops = nest.getLoopStmtsFor(b);
-    nest.splitWithMask(loops[1], 8);
-    nest.splitWithMask(loops[0], 4);
-    loops = nest.getLoopStmtsFor(b);
-    nest.reorderAxis(loops[1], loops[2]);
-  } else if (sch == 3) {
-    auto loops = nest.getLoopStmtsFor(b);
-    te::ForPtr mi, mo;
-    te::BufPtr rf;
-    nest.splitWithMask(loops[1], kChunkSize, &mi);
-    loops = nest.reorder({loops[1], mi}, {1, 0});
-    TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
-    nest.reorderAxis(loops[0], loops[1]);
-    te::LoopNest::compressBuffer(rf, nest.root_stmt());
-    for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
-      nest.vectorize(loop);
-    }
-  }
-
-  nest.prepareForCodegen();
-  nest.simplify();
-  te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
-
-  for (auto _ : state) {
-    cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
-  }
-}
-BENCHMARK_REGISTER_F(Reduce2DRow, OpSchedule)
-    ->Apply( // CustomArgs);
-        [](benchmark::internal::Benchmark* b) {
-          for (auto sch : {0, 1, 2, 3}) {
-            for (auto rows : {3, 6, 12, 18}) {
-              auto cols = 24 - rows;
-              b->Args({1 << rows, 1 << cols, sch});
-            }
-          }
-        });
diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
deleted file mode 100644
index 568905acd7c4..000000000000
--- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-#include <benchmark/benchmark.h>
-
-#include <torch/csrc/jit/jit_log.h>
-#include <torch/csrc/jit/tensorexpr/ir.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/operators/operators.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-using namespace torch::jit::tensorexpr;
-
-namespace {
-
-class SignedLog1pBench : public benchmark::Fixture {
- public:
-  void SetUp(const benchmark::State& state) override {
-    input_size_ = {state.range(0), state.range(1)};
-    input_size_int_ = {state.range(0), state.range(1)};
-    input_ = torch::rand(input_size_);
-    ref_ = signedLog1p(input_);
-  }
-
-  void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
-    state.counters["GB/s"] = benchmark::Counter(
-        uint64_t(state.iterations()) * 2 * output_.nbytes(),
-        benchmark::Counter::kIsRate);
-  }
-
-  at::Tensor signedLog1p(const at::Tensor& inp) {
-    auto sign = at::sign(inp);
-    auto log1p = at::log1p(at::abs(inp));
-    return sign * log1p;
-  }
-
-  void runATen(benchmark::State& state) {
-    for (auto _ : state) {
-      output_ = signedLog1p(input_);
-    }
-  }
-
-  void runNNC(benchmark::State& state) {
-    BufHandle input_ph(
-        "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
-    Tensor abs_result = Compute(
-        "aten_abs",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return abs(input_ph.load(m, n));
-        });
-    Tensor log1p_result = Compute(
-        "aten_log1p",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return log1p(abs_result.load(m, n));
-        });
-    Tensor sign_result =
-        computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
-    Tensor output = Compute(
-        "aten_mul",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return sign_result.load(m, n) * log1p_result.load(m, n);
-        });
-    LoopNest nest({output}, {abs_result, log1p_result, sign_result, output});
-    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
-    nest.inlineIntermediateBufs(true);
-    nest.prepareForCodegen();
-    nest.simplify();
-    nest.vectorizeInnerLoops();
-    nest.simplify();
-    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
-
-    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-    std::vector<CodeGen::BufferArg> buf_args;
-    buf_args.emplace_back(input_ph);
-    buf_args.emplace_back(output);
-    LLVMCodeGen cg(nest.root_stmt(), buf_args);
-
-    std::vector<CodeGen::CallArg> call_args;
-    for (auto _ : state) {
-      output_ = at::empty_like(ref_);
-      call_args.clear();
-      call_args.emplace_back(input_.data_ptr<float>());
-      call_args.emplace_back(output_.data_ptr<float>());
-      cg.call(call_args);
-    }
-  }
-
-  void runNNCLogVml(benchmark::State& state) {
-    BufHandle input_ph(
-        "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
-    Tensor abs_result = Compute(
-        "aten_abs",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return abs(input_ph.load(m, n));
-        });
-    Tensor log_vml_result = Compute(
-        "aten_log1p",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return log_vml(abs_result.load(m, n) + ExprHandle(1));
-        });
-    Tensor sign_result =
-        computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
-    Tensor output = Compute(
-        "aten_mul",
-        {input_size_int_[0], input_size_int_[1]},
-        [&](const VarHandle& m, const VarHandle& n) {
-          return sign_result.load(m, n) * log_vml_result.load(m, n);
-        });
-    LoopNest nest({output}, {abs_result, log_vml_result, sign_result, output});
-    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
-    nest.inlineIntermediateBufs(true);
-    nest.prepareForCodegen();
-    nest.simplify();
-    nest.vectorizeInnerLoops();
-    nest.simplify();
-    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
-
-    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
-    std::vector<CodeGen::BufferArg> buf_args;
-    buf_args.emplace_back(input_ph);
-    buf_args.emplace_back(output);
-    LLVMCodeGen cg(nest.root_stmt(), buf_args);
-
-    std::vector<CodeGen::CallArg> call_args;
-    for (auto _ : state) {
-      output_ = at::empty_like(ref_);
-      call_args.clear();
-      call_args.emplace_back(input_.data_ptr<float>());
-      call_args.emplace_back(output_.data_ptr<float>());
-      cg.call(call_args);
-    }
-  }
-
- private:
-  std::vector<long> input_size_;
-  std::vector<int> input_size_int_;
-  at::Tensor input_;
-  at::Tensor output_;
-  at::Tensor ref_;
-};
-
-} // namespace
-
-BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
-  runATen(state);
-}
-
-BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
-  runNNC(state);
-}
-
-BENCHMARK_DEFINE_F(SignedLog1pBench, NNCLogVml)(benchmark::State& state) {
-  runNNCLogVml(state);
-}
-
-BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
-
-BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
-
-BENCHMARK_REGISTER_F(SignedLog1pBench, NNCLogVml)->Args({10, 1467});
diff --git a/benchmarks/cpp/tensorexpr/main.cpp b/benchmarks/cpp/tensorexpr/main.cpp
deleted file mode 100644
index 71fefa047228..000000000000
--- a/benchmarks/cpp/tensorexpr/main.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <benchmark/benchmark.h>
-
-BENCHMARK_MAIN();
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 166cae5f6e25..f1394b62b825 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1660,14 +1660,6 @@ if(BUILD_STATIC_RUNTIME_BENCHMARK)
   target_link_libraries(static_runtime_test torch_library gtest_main)
 endif()
 
-if(BUILD_TENSOREXPR_BENCHMARK)
-  add_subdirectory(${TORCH_ROOT}/benchmarks/cpp/tensorexpr ${CMAKE_BINARY_DIR}/tensorexpr_bench)
-endif()
-
-if(BUILD_CPP_BENCHMARKS)
-  add_subdirectory(${TORCH_ROOT}/benchmarks/cpp ${PROJECT_BINARY_DIR}/bin)
-endif()
-
 if(BUILD_MOBILE_BENCHMARK)
   foreach(benchmark_src ${ATen_MOBILE_BENCHMARK_SRCS})
     get_filename_component(benchmark_name ${benchmark_src} NAME_WE)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index f9dcb5e02f86..d8fe95805ab3 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -26,7 +26,6 @@ function(caffe2_print_configuration_summary)
   message(STATUS "  BUILD_CAFFE2          : ${BUILD_CAFFE2}")
   message(STATUS "  BUILD_CAFFE2_OPS      : ${BUILD_CAFFE2_OPS}")
   message(STATUS "  BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
-  message(STATUS "  BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}")
   message(STATUS "  BUILD_BINARY          : ${BUILD_BINARY}")
   message(STATUS "  BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
   if(${CAFFE2_LINK_LOCAL_PROTOBUF})