Remove tensorexpr tests (#158928)

The tests are not maintained.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158928
Approved by: https://github.com/albanD, https://github.com/malfet
cyy
2025-08-09 02:21:22 +00:00
committed by PyTorch MergeBot
parent 11a3565f18
commit 10e3514c96
39 changed files with 10 additions and 36802 deletions


@@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
   export ATEN_THREADING=NATIVE
 fi
-# Enable LLVM dependency for TensorExpr testing
-export USE_LLVM=/opt/llvm
-export LLVM_DIR=/opt/llvm/lib/cmake/llvm
 if ! which conda; then
   # In ROCm CIs, we are doing cross compilation on build machines with
@@ -192,7 +189,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
   export USE_ASAN=1
   export REL_WITH_DEB_INFO=1
   export UBSAN_FLAGS="-fno-sanitize-recover=all"
-  unset USE_LLVM
 fi
 if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then


@@ -1051,20 +1051,10 @@ test_libtorch_api() {
     mkdir -p $TEST_REPORTS_DIR
     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
-    "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
   else
     # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
-    # On s390x, pytorch is built without llvm.
-    # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
-    # test fails with errors like:
-    # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
-    # unknown file: Failure
-    # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
-    if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
-      python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
-    fi
   fi
   # quantization is not fully supported on s390x yet


@@ -1,7 +1,8 @@
+#include <gtest/gtest.h>
 #include <ATen/ATen.h>
 #include <ATen/Parallel.h>
 #include <c10/util/irange.h>
-#include <test/cpp/tensorexpr/test_base.h>
 #include <thread>
@@ -9,7 +10,7 @@
 // numbers of threads set and also whether the scheduler
 // will throw an exception when multiple threads call
 // their first parallel construct.
-void test(int given_num_threads) {
+static void test(int given_num_threads) {
   auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat));
   ASSERT_TRUE(given_num_threads >= 0);
   ASSERT_EQ(at::get_num_threads(), given_num_threads);
@@ -19,7 +20,7 @@ void test(int given_num_threads) {
   }
 }
-int main() {
+TEST(ThreadInitTest, ThreadInit) {
   at::init_num_threads();
   at::set_num_threads(4);
@@ -32,13 +33,11 @@ int main() {
 #if !AT_PARALLEL_NATIVE
   at::set_num_threads(5);
-  ASSERT_TRUE(at::get_num_threads() == 5);
+  ASSERT_EQ(at::get_num_threads(), 5);
 #endif
   // test inter-op settings
   at::set_num_interop_threads(5);
   ASSERT_EQ(at::get_num_interop_threads(), 5);
   ASSERT_ANY_THROW(at::set_num_interop_threads(6));
-  return 0;
 }


@@ -1345,10 +1345,6 @@ if(BUILD_TEST)
   add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
   add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
   add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
-  add_subdirectory(
-    ${TORCH_ROOT}/test/cpp/tensorexpr
-    ${CMAKE_BINARY_DIR}/test_tensorexpr
-  )
   if(USE_DISTRIBUTED)
     add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
     if(NOT WIN32)


@@ -1,83 +0,0 @@
set(TENSOREXPR_TEST_ROOT ${TORCH_ROOT}/test/cpp/tensorexpr)
set(TENSOREXPR_TEST_SRCS
${TENSOREXPR_TEST_ROOT}/test_approx.cpp
${TENSOREXPR_TEST_ROOT}/test_aten.cpp
${TENSOREXPR_TEST_ROOT}/test_boundsinference.cpp
${TENSOREXPR_TEST_ROOT}/test_conv.cpp
${TENSOREXPR_TEST_ROOT}/test_cpp_codegen.cpp
${TENSOREXPR_TEST_ROOT}/test_dynamic_shapes.cpp
${TENSOREXPR_TEST_ROOT}/test_expr.cpp
${TENSOREXPR_TEST_ROOT}/test_external_calls.cpp
${TENSOREXPR_TEST_ROOT}/test_graph_opt.cpp
${TENSOREXPR_TEST_ROOT}/test_ir_printer.cpp
${TENSOREXPR_TEST_ROOT}/test_ir_verifier.cpp
${TENSOREXPR_TEST_ROOT}/test_kernel.cpp
${TENSOREXPR_TEST_ROOT}/test_loopnest.cpp
${TENSOREXPR_TEST_ROOT}/test_memdependency.cpp
${TENSOREXPR_TEST_ROOT}/test_ops.cpp
${TENSOREXPR_TEST_ROOT}/test_quantization.cpp
${TENSOREXPR_TEST_ROOT}/test_memplanning.cpp
${TENSOREXPR_TEST_ROOT}/test_reductions.cpp
${TENSOREXPR_TEST_ROOT}/test_registerizer.cpp
${TENSOREXPR_TEST_ROOT}/test_simplify.cpp
${TENSOREXPR_TEST_ROOT}/test_te_fuser_pass.cpp
${TENSOREXPR_TEST_ROOT}/test_type.cpp
${TENSOREXPR_TEST_ROOT}/test_type_specializations.cpp
)
if(USE_CUDA)
list(APPEND TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_cuda.cpp)
endif()
if(USE_LLVM AND LLVM_FOUND)
list(APPEND TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_llvm.cpp)
endif()
add_executable(test_tensorexpr
${TORCH_ROOT}/test/cpp/common/main.cpp
${TENSOREXPR_TEST_ROOT}/padded_buffer.cpp
${TENSOREXPR_TEST_SRCS})
target_link_libraries(test_tensorexpr PRIVATE torch gtest_main)
target_include_directories(test_tensorexpr PRIVATE ${ATen_CPU_INCLUDE})
target_compile_definitions(test_tensorexpr PRIVATE USE_GTEST)
add_executable(tutorial_tensorexpr ${TENSOREXPR_TEST_ROOT}/tutorial.cpp)
target_link_libraries(tutorial_tensorexpr PRIVATE torch)
target_include_directories(tutorial_tensorexpr PRIVATE ${ATen_CPU_INCLUDE})
# The test case depends on the xnnpack header which in turn depends on the
# pthreadpool header. For some build environment we need add the dependency
# explicitly.
if(USE_PTHREADPOOL)
target_link_libraries(test_tensorexpr PRIVATE pthreadpool_interface)
endif()
if(USE_CUDA)
target_compile_definitions(test_tensorexpr PRIVATE USE_CUDA)
target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_link_libraries(test_tensorexpr PRIVATE
hiprtc::hiprtc
hip::amdhip64
${TORCH_CUDA_LIBRARIES})
target_compile_definitions(test_tensorexpr PRIVATE USE_ROCM)
target_link_libraries(tutorial_tensorexpr PRIVATE
hiprtc::hiprtc
hip::amdhip64
${TORCH_CUDA_LIBRARIES})
target_compile_definitions(tutorial_tensorexpr PRIVATE USE_ROCM)
endif()
if(INSTALL_TEST)
set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib")
install(TARGETS test_tensorexpr DESTINATION bin)
set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib")
install(TARGETS tutorial_tensorexpr DESTINATION bin)
# Install PDB files for MSVC builds
if(MSVC AND BUILD_SHARED_LIBS)
install(FILES $<TARGET_PDB_FILE:test_tensorexpr> DESTINATION bin OPTIONAL)
install(FILES $<TARGET_PDB_FILE:tutorial_tensorexpr> DESTINATION bin OPTIONAL)
endif()
endif()


@@ -1,55 +0,0 @@
# TensorExpr C++ Tests
## How to add a new test
First, create a new test file. Test files should have be placed in this
directory, with a name that starts with `test_`, like `test_foo.cpp`.
Here is an example test file you can copy-paste.
```cpp
#include <test/cpp/tensorexpr/test_base.h>
// Tests go in torch::jit
namespace torch {
namespace jit {
// 1. Test cases are void() functions.
// 2. They start with the prefix `test`
void testCaseOne() {
// ...
}
void testCaseTwo() {
// ...
}
}
}
```
Then, register your test in `tests.h`:
```cpp
// Add to TH_FORALL_TESTS_CUDA instead for CUDA-requiring tests
#define TH_FORALL_TESTS(_) \
_(ADFormulas) \
_(Attributes) \
...
_(CaseOne) // note that the `test` prefix is omitted.
_(CaseTwo)
```
We glob all the test files together in `CMakeLists.txt` so that you don't
have to edit it every time you add a test. Unfortunately, this means that in
order to get the build to pick up your new test file, you need to re-run
cmake:
```bash
CMAKE_FRESH=1 python setup.py build
```
## How do I run the tests?
The following commands assume you are in PyTorch root.
```bash
# (re)build the test binary
ninja build/bin/test_tensorexpr
# run
build/bin/test_tensorexpr --gtest_filter='glob_style_filter*'
```


@@ -1,119 +0,0 @@
#pragma once
#include <cmath>
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
using Bits = uint32_t;
// this avoids the "dereferencing type-punned pointer
// will break strict-aliasing rules" error
union Float {
float float_;
Bits bits_;
};
// # of bits in a number.
static const size_t kBitCount = 8 * sizeof(Bits);
// The mask for the sign bit.
static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
// GOOGLETEST_CM0001 DO NOT DELETE
// Converts an integer from the sign-and-magnitude representation to
// the biased representation. More precisely, let N be 2 to the
// power of (kBitCount - 1), an integer x is represented by the
// unsigned number x + N.
//
// For instance,
//
// -N + 1 (the most negative number representable using
// sign-and-magnitude) is represented by 1;
// 0 is represented by N; and
// N - 1 (the biggest number representable using
// sign-and-magnitude) is represented by 2N - 1.
//
// Read http://en.wikipedia.org/wiki/Signed_number_representations
// for more details on signed number representations.
static Bits SignAndMagnitudeToBiased(const Bits& sam) {
if (kSignBitMask & sam) {
// sam represents a negative number.
return ~sam + 1;
} else {
// sam represents a positive number.
return kSignBitMask | sam;
}
}
// Given two numbers in the sign-and-magnitude representation,
// returns the distance between them as an unsigned number.
static Bits DistanceBetweenSignAndMagnitudeNumbers(
const Bits& sam1,
const Bits& sam2) {
const Bits biased1 = SignAndMagnitudeToBiased(sam1);
const Bits biased2 = SignAndMagnitudeToBiased(sam2);
return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
}
// How many ULP's (Units in the Last Place) we want to tolerate when
// comparing two numbers. The larger the value, the more error we
// allow. A 0 value means that two numbers must be exactly the same
// to be considered equal.
//
// The maximum error of a single floating-point operation is 0.5
// units in the last place. On Intel CPU's, all floating-point
// calculations are done with 80-bit precision, while double has 64
// bits. Therefore, 4 should be enough for ordinary use.
//
// See the following article for more details on ULP:
// http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
static const size_t kMaxUlps = 4;
// Returns true if and only if this number is at most kMaxUlps ULP's away
// from rhs. In particular, this function:
//
// - returns false if either number is (or both are) NAN.
// - treats really large numbers as almost equal to infinity.
// - thinks +0.0 and -0.0 are 0 DLP's apart.
inline bool AlmostEquals(float lhs, float rhs) {
// The IEEE standard says that any comparison operation involving
// a NAN must return false.
if (std::isnan(lhs) || std::isnan(rhs))
return false;
Float l = {lhs};
Float r = {rhs};
return DistanceBetweenSignAndMagnitudeNumbers(l.bits_, r.bits_) <= kMaxUlps;
}


@@ -1,37 +0,0 @@
#include "test/cpp/tensorexpr/padded_buffer.h"
#include <c10/util/Logging.h>
#include <c10/util/irange.h>
#include <sstream>
namespace torch {
namespace jit {
namespace tensorexpr {
int PaddedBufferBase::Index(const std::vector<int>& indices) const {
TORCH_DCHECK_EQ(dims_.size(), indices.size());
int total_index = 0;
for (const auto i : c10::irange(dims_.size())) {
total_index += indices[i] * strides_[i];
}
return total_index;
}
PaddedBufferBase::PaddedBufferBase(
const std::vector<int>& dims,
// NOLINTNEXTLINE(modernize-pass-by-value)
const std::string& name)
: dims_(dims), name_(name), strides_(dims.size()) {
for (int i = (int)dims.size() - 1; i >= 0; --i) {
if (i == (int)dims.size() - 1) {
strides_[i] = 1;
} else {
strides_[i] = strides_[i + 1] * dims[i + 1];
}
}
total_size_ = strides_[0] * dims[0];
}
} // namespace tensorexpr
} // namespace jit
} // namespace torch


@@ -1,242 +0,0 @@
#pragma once
#include <string>
#include <vector>
#include <c10/util/irange.h>
#include "torch/csrc/jit/tensorexpr/eval.h"
namespace torch {
namespace jit {
namespace tensorexpr {
template <typename T>
struct DefaultPaddedValue;
template <>
struct DefaultPaddedValue<int> {
static const int kValue = static_cast<int>(0xDEADBEEF);
};
template <>
struct DefaultPaddedValue<int8_t> {
static const int8_t kValue = static_cast<int8_t>(0xBE);
};
template <>
struct DefaultPaddedValue<uint8_t> {
static const uint8_t kValue = static_cast<uint8_t>(0xBE);
};
template <>
struct DefaultPaddedValue<int16_t> {
static const int16_t kValue = static_cast<int16_t>(0xBEEF);
};
template <>
struct DefaultPaddedValue<int64_t> {
static const int64_t kValue = static_cast<int64_t>(0xDEADBEEF);
};
template <>
struct DefaultPaddedValue<float> {
static constexpr float kValue = 0.1357;
};
template <>
struct DefaultPaddedValue<at::Half> {
// at::Half ctor isn't constexpr, so just fill it with bits.
static constexpr uint16_t kValue = 1357;
};
template <>
struct DefaultPaddedValue<double> {
static constexpr double kValue = 0.1357;
};
// A concrete base to be used in PaddedBase.
class PaddedBufferBase {
public:
const std::string& name() const {
return name_;
}
int size() const {
return total_size_;
}
int raw_size() const {
return total_size_ + 2 * kPaddingSize;
}
virtual ~PaddedBufferBase() {}
protected:
explicit PaddedBufferBase(
const std::vector<int>& dims,
const std::string& name);
int Index(const std::vector<int>& indices) const;
std::vector<int> dims_;
std::string name_;
std::vector<int> strides_;
int total_size_; // total number of useful element, does not include the
// paddings
static constexpr int kPaddingSize = 64;
};
// A padded buffer with wartermarks for testing.
// The buffer carries padded watermarks on both sides to catch potential
// out-of-bounds writes. For read-only data that are not supposed to change, it
// can also make a backup and be compared later.
template <typename T>
class PaddedBuffer : public PaddedBufferBase {
public:
PaddedBuffer(int d0, const std::string& name = "")
: PaddedBuffer(std::vector<int>({d0}), name) {}
PaddedBuffer(int d0, int d1, const std::string& name = "")
: PaddedBuffer(std::vector<int>({d0, d1}), name) {}
PaddedBuffer(int d0, int d1, int d2, const std::string& name = "")
: PaddedBuffer(std::vector<int>({d0, d1, d2}), name) {}
PaddedBuffer(int d0, int d1, int d2, int d3, const std::string& name = "")
: PaddedBuffer(std::vector<int>({d0, d1, d2, d3}), name) {}
PaddedBuffer(const std::vector<int>& dims, const std::string& name = "")
: PaddedBufferBase(dims, name) {
data_.resize(total_size_ + 2 * kPaddingSize, kPaddingValue);
}
PaddedBuffer(const PaddedBuffer& other, const std::string& name)
: PaddedBuffer(other) {
this->name_ = name;
}
T* data() {
return data_.data() + kPaddingSize;
}
const T* data() const {
return const_cast<PaddedBuffer*>(this)->data();
}
T* raw_data() {
return data_.data();
}
const T* raw_data() const {
return const_cast<PaddedBuffer*>(this)->raw_data();
}
T& operator()(int i0) {
// There is a bit performance impact with forming a vector here. But this
// data structure is for testing only, and not performance critical.
return this->operator()(std::vector<int>({i0}));
}
const T& operator()(int i0) const {
return const_cast<PaddedBuffer*>(this)->operator()(i0);
}
T& operator()(int i0, int i1) {
return this->operator()(std::vector<int>({i0, i1}));
}
const T& operator()(int i0, int i1) const {
return const_cast<PaddedBuffer*>(this)->operator()(i0, i1);
}
T& operator()(int i0, int i1, int i2) {
return this->operator()(std::vector<int>({i0, i1, i2}));
}
const T& operator()(int i0, int i1, int i2) const {
return const_cast<PaddedBuffer*>(this)->operator()(i0, i1, i2);
}
T& operator()(int i0, int i1, int i2, int i3) {
return this->operator()(std::vector<int>({i0, i1, i2, i3}));
}
const T& operator()(int i0, int i1, int i2, int i3) const {
return const_cast<PaddedBuffer*>(this)->operator()(i0, i1, i2, i3);
}
T& operator()(const std::vector<int>& indices) {
return data_[kPaddingSize + Index(indices)];
}
const T& operator()(const std::vector<int>& indices) const {
return const_cast<PaddedBuffer*>(this)->operator()(indices);
}
template <typename U>
friend void ExpectAllNear(
const PaddedBuffer<U>& v1,
const PaddedBuffer<U>& v2,
float abs_error);
template <typename U>
friend void ExpectAllEqual(
const PaddedBuffer<U>& v1,
const PaddedBuffer<U>& v2);
void Backup() {
backup_data_ = data_;
}
// Verify the watermarks in the paddings are intact.
void ValidateWatermark() const {
for (const auto i : c10::irange(kPaddingSize)) {
ASSERT_EQ(data_[i], kPaddingValue);
ASSERT_EQ(data_[i + total_size_ + kPaddingSize], kPaddingValue);
}
}
void CheckBackup() const {
ValidateWatermark();
DCHECK(backup_data_.size() == data_.size())
<< "Please make sure you have call Backup() before calling CheckBackup()";
for (const auto i : c10::irange(total_size_)) {
ASSERT_EQ(data_[i + kPaddingSize], backup_data_[i + kPaddingSize]);
}
}
private:
std::vector<T> data_;
std::vector<T> backup_data_;
T kPaddingValue = DefaultPaddedValue<T>::kValue;
};
template <typename T>
inline CodeGen::CallArg::CallArg(const PaddedBuffer<T>& buffer)
: data_(const_cast<T*>(buffer.data())) {}
template <typename T>
std::string CompareErrorMsg(
const PaddedBuffer<T>& v1,
const PaddedBuffer<T>& v2,
int index) {
std::ostringstream oss;
oss << "index: " << index << ", v1: (" << v1.name() << ", " << v1(index)
<< ")"
<< ", v2: (" << v2.name() << ", " << v2(index) << ")";
return oss.str();
}
template <typename T>
void ExpectAllEqual(const PaddedBuffer<T>& f1, const PaddedBuffer<T>& f2) {
const std::vector<T>& v1 = f1.data_;
const std::vector<T>& v2 = f2.data_;
const int kPaddingSize = f1.kPaddingSize;
const int total_size = f1.total_size_;
ASSERT_EQ(v1.size(), v2.size());
f1.ValidateWatermark();
f2.ValidateWatermark();
for (const auto i : c10::irange(total_size)) {
ASSERT_EQ(v1[kPaddingSize + i], v2[kPaddingSize + i]);
}
}
template <typename T>
void ExpectAllNear(
const PaddedBuffer<T>& f1,
const PaddedBuffer<T>& f2,
float abs_error) {
const std::vector<T>& v1 = f1.data_;
const std::vector<T>& v2 = f2.data_;
const int kPaddingSize = f1.kPaddingSize;
const int total_size = f1.total_size_;
ASSERT_EQ(v1.size(), v2.size());
f1.ValidateWatermark();
f2.ValidateWatermark();
for (const auto i : c10::irange(total_size)) {
ASSERT_NEAR(v1[kPaddingSize + i], v2[kPaddingSize + i], abs_error);
}
}
} // namespace tensorexpr
} // namespace jit
} // namespace torch


@@ -1,96 +0,0 @@
#ifdef TORCH_ENABLE_LLVM
#include <gtest/gtest.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <cstring>
using namespace torch::indexing;
namespace te = torch::jit::tensorexpr;
static void vectorize(te::LoopNest* ln, te::Tensor target, int width) {
auto loops = ln->getLoopStmtsFor(target);
te::ForPtr inner, tail;
ln->splitWithTail(loops[0], width, &inner, &tail);
ASSERT_TRUE(te::LoopNest::vectorize(inner));
}
std::string diffs(const at::Tensor& a, const at::Tensor& b) {
auto diff = torch::abs(a.flatten() - b.flatten());
auto count_diffs = torch::sum(diff > 0.f);
auto greatest_diff_index = torch::argmax(diff);
std::stringstream ss;
ss << "Found " << count_diffs << " unequal element(s). "
<< "The greatest difference was " << diff.index({greatest_diff_index})
<< " at index " << greatest_diff_index;
return ss.str();
}
TEST(Approx, log_vml) {
te::VarHandle N("N", te::kInt);
te::BufHandle A("A", {N}, te::kFloat);
te::Tensor B = te::Compute(
"B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });
te::LoopNest ln({B});
ln.prepareForCodegen();
vectorize(&ln, B, 8);
te::StmtPtr s = ln.root_stmt();
s = te::IRSimplifier::simplify(s);
te::LLVMCodeGen cg(s, {A, B, N});
auto eps = std::numeric_limits<float>::epsilon();
auto test = [&](const at::Tensor& A_t) {
at::Tensor B_ref = at::log(A_t);
at::Tensor B_t = at::empty_like(A_t);
auto ap = A_t.data_ptr<float>();
auto bp = B_t.data_ptr<float>();
cg.call({ap, bp, A_t.numel()});
// Results should be bit-identical.
ASSERT_TRUE(torch::allclose(
B_t, B_ref, /*rtol=*/eps, /*atol=*/0.0f, /*equal_nan=*/true))
<< "Input[:8]\n"
<< A_t.index({Slice(0, 8)}) << "\n"
<< "Test[:8]\n"
<< B_t.index({Slice(0, 8)}) << "\n"
<< "Ref[:8]\n"
<< B_ref.index({Slice(0, 8)}) << diffs(B_t, B_ref);
};
// Generate every single-precision FP value in [1.0, 2.0).
at::Tensor A_t = torch::arange(1.0f, 2.0f, eps);
ASSERT_EQ(A_t.numel(), 1 << 23);
test(A_t);
test(A_t * 2.0f);
test(A_t * 0.5f);
test(A_t * 4.0f);
test(A_t * 0.25f);
test(A_t * powf(2.0f, 16));
test(A_t * powf(2.0f, -16));
test(A_t * powf(2.0f, 126));
test(A_t * powf(2.0f, -126));
test(torch::full({32}, INFINITY));
test(torch::full({32}, NAN));
auto min = std::numeric_limits<float>::min();
auto denorm_min = std::numeric_limits<float>::denorm_min();
// Denormals aren't bit precise, because sleef isn't bit-precise either.
A_t = torch::arange(0.0f, min, denorm_min);
ASSERT_EQ(A_t.numel(), 1 << 23);
auto B_ref = at::log(A_t);
auto B_t = at::empty_like(B_ref);
cg.call({A_t.data_ptr<float>(), B_t.data_ptr<float>(), A_t.numel()});
ASSERT_TRUE(torch::allclose(B_t, B_ref));
}
#endif // TORCH_ENABLE_LLVM

File diff suppressed because it is too large


@@ -1,89 +0,0 @@
#pragma once
#if defined(USE_GTEST)
#include <gtest/gtest.h>
#include <test/cpp/common/support.h>
#else
#include <cmath>
#include "c10/util/Exception.h"
#include "test/cpp/tensorexpr/gtest_assert_float_eq.h"
#define ASSERT_EQ(x, y, ...) TORCH_INTERNAL_ASSERT((x) == (y), __VA_ARGS__)
#define ASSERT_FLOAT_EQ(x, y, ...) \
TORCH_INTERNAL_ASSERT(AlmostEquals((x), (y)), __VA_ARGS__)
#define ASSERT_NE(x, y, ...) TORCH_INTERNAL_ASSERT((x) != (y), __VA_ARGS__)
#define ASSERT_GT(x, y, ...) TORCH_INTERNAL_ASSERT((x) > (y), __VA_ARGS__)
#define ASSERT_GE(x, y, ...) TORCH_INTERNAL_ASSERT((x) >= (y), __VA_ARGS__)
#define ASSERT_LT(x, y, ...) TORCH_INTERNAL_ASSERT((x) < (y), __VA_ARGS__)
#define ASSERT_LE(x, y, ...) TORCH_INTERNAL_ASSERT((x) <= (y), __VA_ARGS__)
#define ASSERT_NEAR(x, y, a, ...) \
TORCH_INTERNAL_ASSERT(std::fabs((x) - (y)) < (a), __VA_ARGS__)
#define ASSERT_TRUE TORCH_INTERNAL_ASSERT
#define ASSERT_FALSE(x) ASSERT_TRUE(!(x))
#define ASSERT_THROWS_WITH(statement, substring) \
try { \
(void)statement; \
ASSERT_TRUE(false); \
} catch (const std::exception& e) { \
ASSERT_NE(std::string(e.what()).find(substring), std::string::npos); \
}
#define ASSERT_ANY_THROW(statement) \
{ \
bool threw = false; \
try { \
(void)statement; \
} catch (const std::exception& e) { \
threw = true; \
} \
ASSERT_TRUE(threw); \
}
#endif // defined(USE_GTEST)
#include <string>
#include <vector>
namespace torch {
namespace jit {
namespace tensorexpr {
template <typename U, typename V>
void ExpectAllNear(
const std::vector<U>& v1,
const std::vector<U>& v2,
V threshold,
const std::string& name = "") {
ASSERT_EQ(v1.size(), v2.size());
for (size_t i = 0; i < v1.size(); i++) {
ASSERT_NEAR(v1[i], v2[i], threshold);
}
}
template <typename U, typename V>
void ExpectAllNear(
const std::vector<U>& vec,
const U& val,
V threshold,
const std::string& name = "") {
for (size_t i = 0; i < vec.size(); i++) {
ASSERT_NEAR(vec[i], val, threshold);
}
}
template <typename T>
static void assertAllEqual(const std::vector<T>& vec, const T& val) {
for (auto const& elt : vec) {
ASSERT_EQ(elt, val);
}
}
template <typename T>
static void assertAllEqual(const std::vector<T>& v1, const std::vector<T>& v2) {
ASSERT_EQ(v1.size(), v2.size());
for (size_t i = 0; i < v1.size(); ++i) {
ASSERT_EQ(v1[i], v2[i]);
}
}
} // namespace tensorexpr
} // namespace jit
} // namespace torch

File diff suppressed because it is too large


@@ -1,234 +0,0 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/operators/conv2d.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
namespace torch {
namespace jit {
namespace te = torch::jit::tensorexpr;
namespace F = torch::nn::functional;
#ifdef TORCH_ENABLE_LLVM
// Generate test data with few bits of precision, to minimize error
// accumulation from floating-point reordering.
static at::Tensor genTestData(c10::IntArrayRef args) {
return at::trunc(at::randn(args) * 256.0f) / 256.0f;
}
TEST(Conv, DepthwiseConv2D) {
constexpr int N = 1, C = 72, H = 56, W = 56;
constexpr int K = 72, R = 3, S = 3;
constexpr int kPad = 1, kStride = 2, kGroups = C;
constexpr int CperG = C / kGroups;
te::BufHandle input("input", {N, C, H, W}, te::kFloat);
te::BufHandle weight("weight", {K, CperG, R, S}, te::kFloat);
te::BufHandle bias("bias", {K}, te::kFloat);
te::Tensor output =
te::conv2d_depthwise(input, weight, bias, kStride, kPad, kGroups);
te::LoopNest loop({output});
loop.simplify();
loop.prepareForCodegen();
te::LLVMCodeGen cg(loop.root_stmt(), {input, weight, bias, output});
auto it = genTestData({N, C, H, W});
auto wt = genTestData({K, CperG, R, S});
auto bt = genTestData({K});
auto ref = at::conv2d(it, wt, bt, kStride, kPad, /*dilation=*/1, kGroups);
auto ot = at::zeros_like(ref);
cg.call(
{it.data_ptr<float>(),
wt.data_ptr<float>(),
bt.data_ptr<float>(),
ot.data_ptr<float>()});
ASSERT_TRUE(at::allclose(ref, ot));
}
TEST(Conv, DepthwiseConv2DNoBias) {
constexpr int N = 1, C = 72, H = 56, W = 56;
constexpr int K = 72, R = 3, S = 3;
constexpr int kPad = 1, kStride = 2, kGroups = C;
constexpr int CperG = C / kGroups;
te::BufHandle input("input", {N, C, H, W}, te::kFloat);
te::BufHandle weight("weight", {K, CperG, R, S}, te::kFloat);
te::Tensor output =
te::conv2d_depthwise(input, weight, kStride, kPad, kGroups);
te::LoopNest loop({output});
loop.simplify();
loop.prepareForCodegen();
te::LLVMCodeGen cg(loop.root_stmt(), {input, weight, output});
auto it = genTestData({N, C, H, W});
auto wt = genTestData({K, CperG, R, S});
auto ref =
at::conv2d(it, wt, at::Tensor(), kStride, kPad, /*dilation=*/1, kGroups);
auto ot = at::zeros_like(ref);
cg.call({it.data_ptr<float>(), wt.data_ptr<float>(), ot.data_ptr<float>()});
ASSERT_TRUE(at::allclose(ref, ot));
}
TEST(Conv, DepthwiseConv2DDynamicShapes) {
te::VarHandle N_var("N", te::kInt);
te::VarHandle C_var("C", te::kInt);
te::VarHandle H_var("H", te::kInt);
te::VarHandle W_var("W", te::kInt);
te::VarHandle K_var("K", te::kInt);
te::VarHandle CperG_var("CperG", te::kInt);
te::VarHandle R_var("R", te::kInt);
te::VarHandle S_var("S", te::kInt);
te::VarHandle kPad_var("kPad", te::kInt);
te::VarHandle kStride_var("kStride", te::kInt);
te::VarHandle kGroups_var("kGroups", te::kInt);
te::BufHandle input("input", {N_var, C_var, H_var, W_var}, te::kFloat);
te::BufHandle weight("weight", {K_var, CperG_var, R_var, S_var}, te::kFloat);
te::Tensor output = te::conv2d_depthwise(
input,
weight,
N_var,
C_var,
H_var,
W_var,
K_var,
CperG_var,
R_var,
S_var,
kStride_var,
kPad_var,
kGroups_var);
te::LoopNest loop({output});
loop.simplify();
loop.prepareForCodegen();
std::vector<te::CodeGen::BufferArg> buffer_args = {
input,
weight,
N_var,
C_var,
H_var,
W_var,
K_var,
CperG_var,
R_var,
S_var,
kPad_var,
kStride_var,
kGroups_var,
output};
te::LLVMCodeGen cg(loop.root_stmt(), buffer_args);
constexpr int N = 1, C = 72, H = 56, W = 56;
constexpr int K = 72, R = 3, S = 3;
constexpr int kPad = 1, kStride = 2, kGroups = C;
constexpr int CperG = C / kGroups;
auto it = genTestData({N, C, H, W});
auto wt = genTestData({K, CperG, R, S});
auto ref =
at::conv2d(it, wt, at::Tensor(), kStride, kPad, /*dilation=*/1, kGroups);
auto ot = at::zeros_like(ref);
std::vector<te::CodeGen::CallArg> call_args = {
it.data_ptr<float>(),
wt.data_ptr<float>(),
N,
C,
H,
W,
K,
CperG,
R,
S,
kPad,
kStride,
kGroups,
ot.data_ptr<float>()};
cg.call(call_args);
ASSERT_TRUE(at::allclose(ref, ot));
}
#endif
TEST(Conv, Conv2D) {
// Input dimensions.
constexpr int N = 1;
constexpr int C = 3;
constexpr int H = 11;
constexpr int W = 11;
// Filter dimensions.
constexpr int K = 8;
constexpr int R = 3;
constexpr int S = 3;
// Output dims.
constexpr int OH = H - R + 1;
constexpr int OW = W - S + 1;
// Compute reference result.
at::Tensor input = torch::randn({N, C, H, W});
at::Tensor filter = torch::randn({K, C, R, S});
at::Tensor ref = F::conv2d(input, filter);
// Double check the output size is as expected.
ASSERT_EQ(ref.size(0), N);
ASSERT_EQ(ref.size(1), K);
ASSERT_EQ(ref.size(2), OH);
ASSERT_EQ(ref.size(3), OW);
te::BufHandle inputB("input", {N, C, H, W}, te::kFloat);
te::BufHandle filterB("filter", {K, C, R, S}, te::kFloat);
te::Tensor conv = te::Reduce(
"conv",
{N, K, OH, OW},
te::Sum(),
// FIXME: We have to use a `std::vector` parameter here and then unpack
// it, because we don't have an overload allowing for an arbitrary number
// of ExprHandle/VarHandle parameters.
[&](const std::vector<te::VarHandle>& v) {
auto const& n = v[0];
auto const& k = v[1];
auto const& oh = v[2];
auto const& ow = v[3];
auto const& c = v[4];
auto const& r = v[5];
auto const& s = v[6];
// FIXME: We have to use `call` and construct a `std::vector` here
// because the `operator()` overload is only specialized for a small
// number of arguments.
return inputB.load(n, c, oh + r, ow + s) * filterB.load(k, c, r, s);
},
// FIXME: If you forget one of the reduction dims, you get a segfault.
// Could that be caught by a verifier?
{C, R, S});
// FIXME: It'd be nice to have a single header that pulls in things like
// LoopNest, IRSimplifier, etc.
te::LoopNest loop({conv});
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
at::Tensor result = at::empty_like(ref);
te::SimpleIREvaluator cg(s, {inputB, filterB, conv});
cg.call(
{input.data_ptr<float>(),
filter.data_ptr<float>(),
result.data_ptr<float>()});
ASSERT_TRUE(at::allclose(ref, result, 1e-3, 1e-3));
}
} // namespace jit
} // namespace torch


@@ -1,259 +0,0 @@
#include <gtest/gtest.h>
#include "test/cpp/tensorexpr/test_base.h"
#include <c10/util/irange.h>
#include <torch/csrc/jit/tensorexpr/cpp_codegen.h>
#include <torch/csrc/jit/tensorexpr/fwd_decls.h>
#include <torch/csrc/jit/tensorexpr/stmt.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
#define STR_CHECK(node, expected) \
std::stringstream ss; \
CppPrinter printer(&ss); \
printer.visit(node); \
ASSERT_EQ(ss.str(), expected)
#define FILE_CHECK(node, pattern) \
std::stringstream ss; \
CppPrinter printer(&ss); \
printer.visit(node); \
torch::jit::testing::FileCheck().run(pattern, ss.str())
TEST(CppPrinter, IntImm) {
auto i = alloc<IntImm>(10);
STR_CHECK(i, "10");
}
TEST(CppPrinter, FloatImm) {
auto f = alloc<FloatImm>(10);
STR_CHECK(f, "10.f");
}
TEST(CppPrinter, FloatImm1) {
auto f = alloc<FloatImm>(10);
STR_CHECK(f, "10.f");
}
TEST(CppPrinter, DoubleImm) {
auto d = alloc<DoubleImm>(10);
STR_CHECK(d, "10.0");
}
TEST(CppPrinter, DoubleImm1) {
auto d = alloc<DoubleImm>(10.1);
STR_CHECK(d, "10.1");
}
TEST(CppPrinter, HalfImm) {
auto h = alloc<HalfImm>(10);
STR_CHECK(h, "10");
}
TEST(CppPrinter, Add) {
auto add = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
STR_CHECK(add, "1 + 2");
}
TEST(CppPrinter, AddExpr1) {
auto add = alloc<Add>(
alloc<Add>(alloc<IntImm>(0), alloc<IntImm>(1)),
alloc<Sub>(alloc<IntImm>(2), alloc<IntImm>(3)));
STR_CHECK(add, "(0 + 1) + (2 - 3)");
}
TEST(CppPrinter, AddExpr2) {
auto add = alloc<Add>(
alloc<Mul>(alloc<IntImm>(0), alloc<IntImm>(1)),
alloc<Sub>(alloc<IntImm>(2), alloc<IntImm>(3)));
STR_CHECK(add, "0 * 1 + (2 - 3)");
}
TEST(CppPrinter, AddExpr3) {
auto add = alloc<Add>(
alloc<Add>(alloc<IntImm>(0), alloc<IntImm>(1)),
alloc<Div>(alloc<IntImm>(2), alloc<IntImm>(3)));
STR_CHECK(add, "(0 + 1) + 2 / 3");
}
TEST(CppPrinter, Mod) {
auto mod = alloc<Mod>(alloc<IntImm>(1), alloc<IntImm>(2));
STR_CHECK(mod, "1 % 2");
}
TEST(CppPrinter, ModFloat) {
auto mod = alloc<Mod>(alloc<FloatImm>(1), alloc<FloatImm>(2));
STR_CHECK(mod, "std::fmod(1.f, 2.f)");
}
TEST(CppPrinter, Max) {
auto max = alloc<Max>(alloc<IntImm>(1), alloc<IntImm>(2), false);
STR_CHECK(max, "std::max(1, 2)");
}
TEST(CppPrinter, MaxFloat) {
auto max = alloc<Max>(alloc<FloatImm>(1), alloc<FloatImm>(2), false);
STR_CHECK(max, "std::max(1.f, 2.f)");
}
TEST(CppPrinter, MaxHalf) {
auto max = alloc<Max>(alloc<HalfImm>(1), alloc<HalfImm>(2), false);
STR_CHECK(max, "(1 < 2) ? 2 : 1");
}
TEST(CppPrinter, And) {
auto v = alloc<And>(alloc<IntImm>(1), alloc<IntImm>(2));
STR_CHECK(v, "1 & 2");
}
TEST(CppPrinter, CompareSelect) {
auto cs = alloc<CompareSelect>(
alloc<IntImm>(1),
alloc<IntImm>(2),
alloc<FloatImm>(1),
alloc<FloatImm>(2),
CompareSelectOperation::kLE);
STR_CHECK(cs, "((1 <= 2) ? 1.f : 2.f)");
}
TEST(CppPrinter, IfThenElse) {
auto cond = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
auto true_value = alloc<Sub>(alloc<IntImm>(0), alloc<IntImm>(1));
auto false_value = alloc<Mul>(alloc<IntImm>(2), alloc<IntImm>(3));
auto v = alloc<IfThenElse>(cond, true_value, false_value);
STR_CHECK(v, "((1 + 2) ? 0 - 1 : 2 * 3)");
}
TEST(CppPrinter, AllocateFree) {
BufHandle buf("x", {2, 3}, kInt);
AllocatePtr alloc = Allocate::make(buf);
FreePtr free = Free::make(buf);
BlockPtr block = Block::make({alloc, free});
const std::string pattern = R"(
# CHECK: {
# CHECK: int* x = static_cast<int*>(malloc(24));
# CHECK: free(x);
# CHECK: }
)";
FILE_CHECK(block, pattern);
}
TEST(CppPrinter, LoadStore) {
BufHandle a("A", {2, 3}, kInt);
BufHandle b("B", {3, 4}, kInt);
auto store = b.store({2, 2}, a.load(1, 1));
STR_CHECK(
store, "B[(0 + 2 * (1 * 4)) + 2 * 1] = A[(0 + 1 * (1 * 3)) + 1 * 1];\n");
}
TEST(CppPrinter, Var) {
auto var = alloc<Var>("x", kInt);
STR_CHECK(var, "x");
}
TEST(CppPrinter, Cast) {
auto cast = alloc<Cast>(kFloat, alloc<IntImm>(1));
STR_CHECK(cast, "static_cast<float>(1)");
}
TEST(CppPrinter, BitCast) {
auto cast = alloc<BitCast>(kInt, alloc<FloatImm>(20));
STR_CHECK(cast, "std::bitcast<float, int>(20.f)");
}
TEST(CppPrinter, Let) {
auto var = alloc<Var>("x", kFloat);
auto val = alloc<FloatImm>(2);
auto let = alloc<Let>(var, val);
STR_CHECK(let, "float x = 2.f;\n");
}
TEST(CppPrinter, For) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
VarHandle i("i", kInt);
auto f = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i))));
const std::string pattern = R"(
# CHECK: for (int i = 0; i < 1024; i++) {
# CHECK: C[i] = (A[i]) + (B[i]);
# CHECK: }
)";
FILE_CHECK(f, pattern);
}
TEST(CppPrinter, Cond) {
BufHandle x("X", {1}, kInt);
auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
auto cond =
Cond::make(cmp, x.store({0}, x.load(0) + 1), x.store({0}, x.load(0) - 1));
const std::string pattern = R"(
# CHECK: if (((X[0] < 10) ? 1 : 0)) {
# CHECK: X[0] = (X[0]) + 1;
# CHECK: } else {
# CHECK: X[0] = (X[0]) - 1;
# CHECK: }
)";
FILE_CHECK(cond, pattern);
}
TEST(CppPrinter, Intrinsics) {
const std::unordered_set<IntrinsicsOp, std::hash<int>> unsupported_ops{
kRand, kSigmoid};
for (const auto i : c10::irange(static_cast<uint32_t>(kMaxIntrinsicsOp))) {
IntrinsicsOp op = static_cast<IntrinsicsOp>(i);
if (unsupported_ops.count(op)) {
continue;
}
if (Intrinsics::OpArgCount(op) == 1) {
auto v = alloc<Intrinsics>(op, alloc<FloatImm>(2.0f));
STR_CHECK(v, "std::" + v->func_name() + "(2.f)");
} else {
auto v =
alloc<Intrinsics>(op, alloc<FloatImm>(1.0f), alloc<FloatImm>(2.0f));
STR_CHECK(v, "std::" + v->func_name() + "(1.f, 2.f)");
}
}
}
TEST(CppPrinter, ExternalCall) {
std::vector<ExprPtr> dims{alloc<IntImm>(2), alloc<IntImm>(2)};
auto output = alloc<Buf>("out", dims, kFloat);
auto buf_arg1 = alloc<Buf>("a", dims, kFloat);
auto buf_arg2 = alloc<Buf>("b", dims, kFloat);
auto scalar_arg = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
std::vector<BufPtr> buf_args{buf_arg1, buf_arg2};
std::vector<ExprPtr> scalar_args{scalar_arg};
auto call =
alloc<ExternalCall>(output, "nnc_aten_matmul", buf_args, scalar_args);
const std::string pattern = R"(
# CHECK: {
# CHECK: void* buf_ptrs[]{out, a, b};
# CHECK: int64_t buf_ranks[]{2, 2, 2};
# CHECK: int64_t buf_dims[]{2, 2, 2, 2, 2, 2};
# CHECK: int8_t buf_dtypes[]{6, 6, 6};
# CHECK: int64_t extra_args[]{1 + 2};
# CHECK: nnc_aten_matmul(
# CHECK: 3,
# CHECK: buf_ptrs,
# CHECK: buf_ranks,
# CHECK: buf_dims,
# CHECK: buf_dtypes,
# CHECK: 1,
# CHECK: extra_args);
# CHECK: }
)";
FILE_CHECK(call, pattern);
}
} // namespace jit
} // namespace torch

File diff suppressed because it is too large


@@ -1,701 +0,0 @@
#include <gtest/gtest.h>
#include <ATen/code_template.h>
#include <c10/core/DeviceType.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/symbolic_shape_runtime_fusion.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <torch/torch.h>
#include <cmath>
#include <sstream>
#include <stdexcept>
#include <thread>
namespace torch {
namespace jit {
using namespace torch::indexing;
using namespace torch::jit::tensorexpr;
TEST(DynamicShapes, SimpleGraph) {
#ifdef TORCH_ENABLE_LLVM
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%x : Tensor,
%SS_2 : int,
%SS_3 : int):
%3 : Tensor = aten::tanh(%x)
%4 : Tensor = aten::erf(%3)
return (%4))IR";
torch::jit::parseIR(graph_string, graph.get());
auto x_inp = graph->inputs()[0];
auto x_type = TensorType::create(at::rand({10, 5}));
std::vector<ShapeSymbol> x_sym_dims(
{c10::ShapeSymbol::newSymbol(), c10::ShapeSymbol::newSymbol()});
auto x_sym_type = x_type->withSymbolicShapes(x_sym_dims);
graph->inputs().at(0)->setType(x_sym_type);
for (const auto n : graph->nodes()) {
n->output()->setType(x_sym_type);
}
// Graph with symbolic shapes:
//
// graph(%x : Float(SS(-2), SS(-3)),
// %SS_2 : int,
// %SS_3 : int):
// %3 : Float(SS(-2), SS(-3)) = aten::tanh(%x)
// %4 : Float(SS(-2), SS(-3)) = aten::erf(%3)
// return (%4)
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[x_inp] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
std::vector<int64_t> symbolic_shape_inputs = c10::fmap(
x_sym_dims,
[](const c10::ShapeSymbol& shapeSym) { return shapeSym.value(); });
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
// Run with the same static dims as the one we initialized the graph with.
{
auto a = at::rand({10, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::erf(at::tanh(a));
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a}));
stack.push_back(10);
stack.push_back(5);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
// Run with inputs having different dims.
{
auto a = at::rand({50, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::erf(at::tanh(a));
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a}));
stack.push_back(50);
stack.push_back(100);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
#endif
}
TEST(DynamicShapes, GraphWith2InputsSameDims) {
#ifdef TORCH_ENABLE_LLVM
// The two inputs in this graph must have the same dims.
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%x : Tensor,
%y : Tensor,
%SS_2 : int,
%SS_3 : int):
%3 : Tensor = aten::tanh(%x)
%4 : Tensor = aten::erf(%3)
%5 : Tensor = aten::mul(%4, %y)
return (%5))IR";
torch::jit::parseIR(graph_string, graph.get());
auto x_inp = graph->inputs()[0];
auto y_inp = graph->inputs()[1];
auto x_type = TensorType::create(at::rand({10, 5}));
std::vector<ShapeSymbol> x_sym_dims(
{c10::ShapeSymbol::newSymbol(), c10::ShapeSymbol::newSymbol()});
auto x_sym_type = x_type->withSymbolicShapes(x_sym_dims);
graph->inputs().at(0)->setType(x_sym_type);
graph->inputs().at(1)->setType(x_sym_type);
for (const auto n : graph->nodes()) {
n->output()->setType(x_sym_type);
}
// Graph with symbolic shapes:
//
// graph(%x : Float(SS(-4), SS(-5)),
// %y : Float(SS(-4), SS(-5)),
// %SS_2 : int,
// %SS_3 : int):
// %4 : Float(SS(-4), SS(-5)) = aten::tanh(%x)
// %5 : Float(SS(-4), SS(-5)) = aten::erf(%4)
// %6 : Float(SS(-4), SS(-5)) = aten::mul(%5, %y)
// return (%6)
std::vector<int64_t> symbolic_shape_inputs = c10::fmap(
x_sym_dims,
[](const c10::ShapeSymbol& shapeSym) { return shapeSym.value(); });
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[x_inp] = input_desc;
symbolic_strides[y_inp] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
// Run with the same static dims as the one we initialized the graph with.
{
auto a = at::rand({10, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({10, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::erf(at::tanh(a)), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(10);
stack.push_back(5);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
// Run with inputs having different dims.
{
auto a = at::rand({50, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({50, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::erf(at::tanh(a)), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(50);
stack.push_back(100);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
#endif
}
TEST(DynamicShapes, GraphWith2InputsAndBroadcast) {
#ifdef TORCH_ENABLE_LLVM
// The second input to the graph has a dim of size 1 which should be
// broadcasted in the at::mul op.
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%x : Float(10, 5, requires_grad=0, device=cpu),
%y : Float(1, 5, requires_grad=0, device=cpu),
%SS_2 : int,
%SS_3 : int):
%3 : Tensor = aten::tanh(%x)
%4 : Tensor = aten::erf(%3)
%5 : Tensor = aten::mul(%4, %y)
return (%5))IR";
torch::jit::parseIR(graph_string, graph.get());
auto x_inp = graph->inputs()[0];
auto y_inp = graph->inputs()[1];
auto x_type = TensorType::create(at::rand({10, 5}));
auto y_type = TensorType::create(at::rand({1, 5}));
auto x_dim0_sym = c10::ShapeSymbol::newSymbol();
auto x_dim1_sym = c10::ShapeSymbol::newSymbol();
auto x_sym_type = x_type->withSymbolicShapes(
std::vector<ShapeSymbol>({x_dim0_sym, x_dim1_sym}));
auto y_sym_type = y_type->withSymbolicShapes(std::vector<ShapeSymbol>(
{c10::ShapeSymbol::fromStaticSize(1), x_dim1_sym}));
graph->inputs().at(0)->setType(x_sym_type);
graph->inputs().at(1)->setType(y_sym_type);
for (const auto n : graph->nodes()) {
n->output()->setType(x_sym_type);
}
// Graph with symbolic shapes:
//
// graph(%x : Float(SS(-6), SS(-7)),
// %y : Float(1, SS(-7)),
// %SS_2 : int,
// %SS_3 : int):
// %4 : Float(SS(-6), SS(-7)) = aten::tanh(%x)
// %5 : Float(SS(-6), SS(-7)) = aten::erf(%4)
// %6 : Float(SS(-6), SS(-7)) = aten::mul(%5, %y)
// return (%6)
std::vector<int64_t> symbolic_shape_inputs(
{x_dim0_sym.value(), x_dim1_sym.value()});
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[x_inp] = input_desc;
symbolic_strides[y_inp] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
// Run with the same static dims as the one we initialized the graph with.
{
auto a = at::rand({10, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({1, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::erf(at::tanh(a)), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(10);
stack.push_back(5);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
// Run with inputs having different dims.
{
auto a = at::rand({50, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({1, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::erf(at::tanh(a)), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(50);
stack.push_back(100);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
#endif
}
TEST(DynamicShapes, GraphWithPartiallySymbolicOutput) {
#ifdef TORCH_ENABLE_LLVM
// The second input to the graph has a dim of size 1 which should be
// broadcasted in the at::mul op.
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%x : Float(1, 5, requires_grad=0, device=cpu),
%y : Float(1, 5, requires_grad=0, device=cpu),
%SS_2 : int):
%4 : Tensor = aten::tanh(%x)
%5 : Tensor = aten::mul(%4, %y)
return (%5))IR";
torch::jit::parseIR(graph_string, graph.get());
auto x_inp = graph->inputs()[0];
auto y_inp = graph->inputs()[1];
auto x_type = TensorType::create(at::rand({1, 5}));
auto x_dim1_sym = c10::ShapeSymbol::newSymbol();
auto x_sym_type = x_type->withSymbolicShapes(std::vector<ShapeSymbol>(
{c10::ShapeSymbol::fromStaticSize(1), x_dim1_sym}));
graph->inputs().at(0)->setType(x_sym_type);
graph->inputs().at(1)->setType(x_sym_type);
for (const auto n : graph->nodes()) {
n->output()->setType(x_sym_type);
}
// Graph with symbolic shapes:
//
// graph(%x : Float(1, SS(-2)),
// %y : Float(1, SS(-2)),
// %SS_2 : int):
// %3 : Float(1, SS(-2)) = aten::tanh(%x)
// %4 : Float(1, SS(-2)) = aten::mul(%3, %y)
// return (%4)
std::vector<int64_t> symbolic_shape_inputs({x_dim1_sym.value()});
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[x_inp] = input_desc;
symbolic_strides[y_inp] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
// Run with the same static dims as the one we initialized the graph with.
{
auto a = at::rand({1, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({1, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::tanh(a), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(5);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
// Run with inputs having different dims.
{
auto a = at::rand({1, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({1, 100}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::tanh(a), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.push_back(100);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
#endif
}
TEST(DynamicShapes, GraphWithSymbolicStrides) {
#ifdef TORCH_ENABLE_LLVM
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%0 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu),
%1 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu),
%SS_3 : int,
%SS_2 : int):
%15 : int = prim::Constant[value=1]()
%21 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::add(%0, %1, %15)
%22 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::mul(%21, %0)
return (%22))IR";
parseIR(graph_string, &*graph);
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::S_AS_ARG, torch::jit::StrideInput::S_ONE};
std::vector<torch::jit::StrideInput> output_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[graph->inputs().at(0)] = input_desc;
symbolic_strides[graph->inputs().at(1)] = input_desc;
symbolic_strides[graph->outputs().at(0)] = output_desc;
std::vector<int64_t> symbolic_shape_inputs = {-3, -2};
TensorExprKernel k(graph, {}, symbolic_shape_inputs, false, symbolic_strides);
{
auto x0 = at::rand({10, 32}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x1 = at::rand({10, 32}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::add(x0, x1, 1), x0);
std::vector<at::Tensor> inputs = {x0, x1};
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
stack.push_back(32);
stack.push_back(10);
k.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
{
auto x0 = at::rand({10, 32}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x1 = at::rand({10, 32}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto out =
at::rand({10, 32}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::add(x0, x1, 1), x0);
std::vector<at::Tensor> inputs = {out, x0, x1};
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
stack.push_back(32);
stack.push_back(10);
k.runWithAllocatedOutputs(stack);
ASSERT_TRUE(at::allclose(out, ref));
}
#endif
}
TEST(DynamicShapes, GraphWithCatAndBroadcast) {
#ifdef TORCH_ENABLE_LLVM
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%x : Float(10, 5, requires_grad=0, device=cpu),
%y : Float(4, 5, requires_grad=0, device=cpu),
%z : Float(1, 1, requires_grad=0, device=cpu),
%SS_2 : int,
%SS_3 : int,
%SS_4 : int,
%SS_5 : int):
%11 : int = prim::Constant[value=0]()
%3 : Tensor = aten::tanh(%x)
%out1 : Tensor = aten::erf(%3)
%out2 : Tensor = aten::relu(%y)
%10 : Tensor[] = prim::ListConstruct(%out1, %out2)
%25 : Tensor = aten::cat(%10, %11)
%28 : Tensor = aten::hardswish(%25)
%29 : Tensor = aten::mul(%28, %z)
return (%29))IR";
torch::jit::parseIR(graph_string, graph.get());
auto x_inp = graph->inputs()[0];
auto y_inp = graph->inputs()[1];
auto z_inp = graph->inputs()[2];
auto x_type = TensorType::create(at::rand({10, 5}));
auto y_type = TensorType::create(at::rand({4, 5}));
auto z_type = TensorType::create(at::rand({1, 1}));
auto x_dim0_sym = c10::ShapeSymbol::newSymbol();
auto x_dim1_sym = c10::ShapeSymbol::newSymbol();
auto x_sym_type = x_type->withSymbolicShapes(
std::vector<ShapeSymbol>({x_dim0_sym, x_dim1_sym}));
auto y_dim0_sym = c10::ShapeSymbol::newSymbol();
auto y_sym_type = y_type->withSymbolicShapes(
std::vector<ShapeSymbol>({y_dim0_sym, x_dim1_sym}));
graph->inputs().at(0)->setType(x_sym_type);
graph->inputs().at(1)->setType(y_sym_type);
auto cat_dim0_sym = c10::ShapeSymbol::newSymbol();
auto cat_out_type = x_type->withSymbolicShapes(
std::vector<ShapeSymbol>({cat_dim0_sym, x_dim1_sym}));
auto nodeIt = graph->nodes().begin();
++nodeIt;
nodeIt->output()->setType(x_sym_type); // aten::tanh
++nodeIt;
nodeIt->output()->setType(x_sym_type); // aten::erf
++nodeIt;
nodeIt->output()->setType(y_sym_type); // aten::relu
++nodeIt;
++nodeIt;
nodeIt->output()->setType(cat_out_type); // aten::cat
++nodeIt;
nodeIt->output()->setType(cat_out_type); // aten::hardswish
++nodeIt;
nodeIt->output()->setType(cat_out_type); // aten::mul
// Graph with symbolic shapes:
//
// graph(%x : Float(SS(-2), SS(-3)),
// %y : Float(SS(-4), SS(-3)),
// %z : Float(1, 1),
// %SS_2 : int,
// %SS_3 : int,
// %SS_4 : int,
// %SS_5 : int):
// %7 : int = prim::Constant[value=0]()
// %8 : Float(SS(-2), SS(-3)) = aten::tanh(%x)
// %9 : Float(SS(-2), SS(-3)) = aten::erf(%8)
// %10 : Float(SS(-4), SS(-3)) = aten::relu(%y)
// %11 : Tensor[] = prim::ListConstruct(%9, %10)
// %12 : Float(SS(-5), SS(-3)) = aten::cat(%11, %7)
// %13 : Float(SS(-5), SS(-3)) = aten::hardswish(%12)
// %14 : Float(SS(-5), SS(-3)) = aten::mul(%13, %z)
// return (%14)
std::vector<int64_t> symbolic_shape_inputs(
{x_dim0_sym.value(),
x_dim1_sym.value(),
y_dim0_sym.value(),
cat_dim0_sym.value()});
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[x_inp] = input_desc;
symbolic_strides[y_inp] = input_desc;
symbolic_strides[z_inp] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
auto a = at::rand({10, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({4, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto c = at::rand({1, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(
at::hardswish(at::cat({at::erf(at::tanh(a)), at::relu(b)}, 0)), c);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b, c}));
stack.push_back(10);
stack.push_back(5);
stack.push_back(4);
stack.push_back(14);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
#endif
}
TEST(DynamicShapes, GraphFromModel) {
#ifdef TORCH_ENABLE_LLVM
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%0 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu),
%1 : Float(SS(-2), SS(-4), requires_grad=0, device=cpu),
%2 : Float(SS(-2), SS(-5), requires_grad=0, device=cpu),
%input.4 : Long(SS(-2), SS(-6), requires_grad=0, device=cpu),
%4 : Float(SS(-7), requires_grad=0, device=cpu),
%5 : Float(SS(-7), requires_grad=0, device=cpu),
%SS_10 : int,
%SS_9 : int,
%SS_8 : int,
%SS_7 : int,
%SS_6 : int,
%SS_5 : int,
%SS_4 : int,
%SS_3 : int,
%SS_2 : int):
%15 : int = prim::Constant[value=1]()
%16 : bool = prim::Constant[value=0]()
%17 : int = prim::Constant[value=6]()
%18 : Float(SS(-2), SS(-6), strides=[139, 1], requires_grad=0, device=cpu) = aten::to(%input.4, %17, %16, %16)
%19 : Tensor[] = prim::ListConstruct(%0, %1, %18, %2)
%20 : Float(SS(-2), SS(-8), strides=[261, 1], requires_grad=0, device=cpu) = aten::cat(%19, %15)
%21 : Float(SS(-2), SS(-9), strides=[261, 1], requires_grad=0, device=cpu) = aten::add(%20, %5, %15)
%22 : Float(SS(-2), SS(-10), requires_grad=0, device=cpu) = aten::mul(%21, %4)
return (%22))IR";
parseIR(graph_string, graph.get());
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[graph->inputs().at(0)] = input_desc;
symbolic_strides[graph->inputs().at(1)] = input_desc;
symbolic_strides[graph->inputs().at(2)] = input_desc;
symbolic_strides[graph->inputs().at(3)] = input_desc;
symbolic_strides[graph->inputs().at(4)] = input_desc;
symbolic_strides[graph->inputs().at(5)] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
std::vector<int64_t> symbolic_shape_inputs = {
-10, -9, -8, -7, -6, -5, -4, -3, -2};
TensorExprKernel k(graph, {}, symbolic_shape_inputs, false, symbolic_strides);
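// Concrete values for the symbolic dims: i2 is the shared batch dim, i3/i4/i5
// are the widths of the three float inputs, i6 is the width of the long input,
// and i7-i10 are all 261 = 32 + 19 + 139 + 71, the width after aten::cat.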
int64_t i2 = 10;
int64_t i3 = 32;
int64_t i4 = 19;
int64_t i5 = 71;
int64_t i6 = 139;
int64_t i7 = 261;
int64_t i8 = 261;
int64_t i9 = 261;
int64_t i10 = 261;
auto x0 = at::rand({i2, i3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x1 = at::rand({i2, i4}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x2 = at::rand({i2, i5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x3 = at::ones({i2, i6}, at::TensorOptions(at::kCPU).dtype(at::kLong));
auto x4 = at::rand({i7}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto x5 = at::rand({i8}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto ref = at::mul(at::add(at::cat({x0, x1, x3, x2}, 1), x5), x4);
{
std::vector<at::Tensor> inputs = {x0, x1, x2, x3, x4, x5};
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
stack.emplace_back(i10);
stack.emplace_back(i9);
stack.emplace_back(i8);
stack.emplace_back(i7);
stack.emplace_back(i6);
stack.emplace_back(i5);
stack.emplace_back(i4);
stack.emplace_back(i3);
stack.emplace_back(i2);
k.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
}
{
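// With a preallocated output, the output tensor is passed at the front of the
// stack and runWithAllocatedOutputs writes the result into it in place.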
auto out =
at::rand({i2, i10}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
std::vector<at::Tensor> inputs = {out, x0, x1, x2, x3, x4, x5};
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
stack.emplace_back(i10);
stack.emplace_back(i9);
stack.emplace_back(i8);
stack.emplace_back(i7);
stack.emplace_back(i6);
stack.emplace_back(i5);
stack.emplace_back(i4);
stack.emplace_back(i3);
stack.emplace_back(i2);
k.runWithAllocatedOutputs(stack);
ASSERT_TRUE(at::allclose(out, ref));
}
#endif
}
TEST(DynamicShapes, MultiThreadedExecution) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_template = R"IR(
graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=${device}),
%y : Float(SS(-2), SS(-3), requires_grad=0, device=${device}),
%SS_2 : int,
%SS_3 : int):
%3 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::tanh(%x)
%4 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::erf(%3)
%5 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::mul(%4, %y)
return (%5))IR";
for (bool use_cuda : {false, true}) {
if (!torch::cuda::is_available() && use_cuda) {
continue;
}
auto device = use_cuda ? at::kCUDA : at::kCPU;
at::jit::TemplateEnv env;
env.s("device", use_cuda ? "cuda:0" : "cpu");
const auto graph_string = format(graph_template, env);
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, graph.get());
std::vector<int64_t> symbolic_shape_inputs = {-2, -3};
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[graph->inputs().at(0)] = input_desc;
symbolic_strides[graph->inputs().at(1)] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
TensorExprKernel kernel(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
auto run_kernel = [&](int dim1, int dim2) {
auto a =
at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat));
auto b =
at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat));
auto ref = at::mul(at::erf(at::tanh(a)), b);
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
stack.emplace_back(dim1);
stack.emplace_back(dim2);
kernel.run(stack);
auto o = stack[0].toTensor();
ASSERT_TRUE(at::allclose(o, ref));
};
// Run the kernel in parallel to ensure that the run() calls do not mutate any
// shared state in TensorExprKernel.
constexpr size_t kNumThreads = 4;
std::vector<std::thread> threads;
for (size_t id = 0; id < kNumThreads; ++id) {
threads.emplace_back(run_kernel, id + 5, id + 20);
}
for (auto& t : threads) {
t.join();
}
}
#endif
}
} // namespace jit
} // namespace torch

View File

@@ -1,836 +0,0 @@
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <c10/util/irange.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <test/cpp/tensorexpr/test_utils.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/ir_verifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <cmath>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
TEST(Expr, BasicValueTest) {
ExprHandle a = IntImm::make(2), b = IntImm::make(3);
ExprHandle c = Add::make(a, b);
SimpleIRExprEval eval(c);
ASSERT_EQ(eval.value<int>(), 5);
}
TEST(Expr, BasicValueTest02) {
ExprHandle a(2.0f);
ExprHandle b(3.0f);
ExprHandle c(4.0f);
ExprHandle d(5.0f);
ExprHandle f = (a + b) - (c + d);
SimpleIRExprEval eval(f);
ASSERT_EQ(eval.value<float>(), -4.0f);
}
TEST(Expr, IsChannelsLastContiguous) {
std::vector<VarHandle> vars = {
VarHandle("var1", kLong),
VarHandle("var2", kLong),
VarHandle("var3", kLong),
VarHandle("var4", kLong),
VarHandle("var5", kLong)};
// {
// key: ndims,
// value: [
// ...
// [dim_2, dim_1, ..., dim_n]
// ]
// }
using shapGenInfo = std::unordered_map<int, std::vector<std::vector<int>>>;
// {
// size: [ExprHandle_1, ExprHandle_2, ..., ExprHandle_n],
// strides: [
// ...
// [ExprHandle_x, ExprHandle_y, ..., ExprHandle_z]
// ]
// }
using shapeInfo =
std::pair<std::vector<ExprHandle>, std::vector<std::vector<ExprHandle>>>;
std::vector<int> dims = {3, 4, 5};
// One symbolic extent per dimension for each supported rank.
std::unordered_map<int, std::vector<ExprHandle>> dims_expr_vec_conf = {
{3, std::vector<ExprHandle>(vars.begin(), vars.begin() + 3)},
{4, std::vector<ExprHandle>(vars.begin(), vars.begin() + 4)},
{5, std::vector<ExprHandle>(vars.begin(), vars.begin() + 5)},
};
// Each stride order lists dimension indices from the smallest stride to the
// largest; channels-last puts the channel dimension (index 1) innermost.
shapGenInfo channels_last_cont_shape_conf = {
{3, {{1, 2, 0}}}, {4, {{1, 3, 2, 0}}}, {5, {{1, 4, 3, 2, 0}}}};
shapGenInfo channels_last_non_cont_shape_conf = {
{3, {{2, 1, 0}, {1, 0, 2}}},
{4, {{3, 1, 2, 0}, {1, 2, 3, 0}, {1, 0, 2, 3}}},
{5, {{4, 3, 2, 1, 0}, {1, 3, 2, 4, 0}, {1, 0, 2, 3, 4}}}};
// Row-major contiguity: the innermost (last) dimension has stride 1.
shapGenInfo cont_shape_conf = {
{3, {{2, 1, 0}}}, {4, {{3, 2, 1, 0}}}, {5, {{4, 3, 2, 1, 0}}}};
auto shape_gen_fn = [dims_expr_vec_conf](
int ndims, shapGenInfo shape_gen_info) -> shapeInfo {
auto dims_expr_vec = dims_expr_vec_conf.at(ndims);
// One stride vector per stride order in the config, each with ndims entries.
std::vector<std::vector<ExprHandle>> strides_expr_vec(
shape_gen_info.at(ndims).size());
for (size_t i = 0; i < strides_expr_vec.size(); i++) {
strides_expr_vec[i].resize(ndims);
}
auto stride_gen_fn = [](int indicator, ExprHandle a, ExprHandle b) {
if (indicator % 2 == 0) {
return a * b;
} else {
return b * a;
}
};
auto stride_order_vec = shape_gen_info.at(ndims);
for (size_t i = 0; i < strides_expr_vec.size(); i++) {
auto stride_order = stride_order_vec[i];
strides_expr_vec[i][stride_order[0]] = 1;
for (size_t j = 1; j < stride_order.size(); j++) {
auto cur_dim_idx = stride_order[j];
auto adjacent_dim_idx = stride_order[j - 1];
strides_expr_vec[i][cur_dim_idx] = stride_gen_fn(
i,
dims_expr_vec[adjacent_dim_idx],
strides_expr_vec[i][adjacent_dim_idx]);
}
}
return {dims_expr_vec, strides_expr_vec};
};
auto check_channels_last_fn = [](int ndims, BufHandle buf_handle) -> bool {
if (ndims == 3) {
return buf_handle.is_channels_last_1d_contiguous();
} else if (ndims == 4) {
return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast);
} else {
return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast3d);
}
};
// channels-last contiguous
for (size_t i = 0; i < dims.size(); i++) {
auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf);
for (size_t j = 0; j < shape_info.second.size(); j++) {
BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat);
ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), true);
}
}
// channels-last non-contiguous
for (size_t i = 0; i < dims.size(); i++) {
auto shape_info = shape_gen_fn(dims[i], channels_last_non_cont_shape_conf);
for (size_t j = 0; j < shape_info.second.size(); j++) {
BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat);
ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), false);
}
}
// contiguous
for (size_t i = 0; i < dims.size(); i++) {
auto shape_info = shape_gen_fn(dims[i], cont_shape_conf);
for (size_t j = 0; j < shape_info.second.size(); j++) {
BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat);
ASSERT_EQ(buf_handle.is_contiguous(), true);
}
}
// non-contiguous
for (size_t i = 0; i < dims.size(); i++) {
auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf);
for (size_t j = 0; j < shape_info.second.size(); j++) {
BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat);
ASSERT_EQ(buf_handle.is_contiguous(), false);
}
}
}
TEST(Expr, LetTest01) {
VarHandle x("x", kFloat);
ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle(3.f));
ASSERT_EQ(eval.value<float>(), 2 + (3 * 3 + 4));
}
TEST(Expr, LetTest02) {
VarHandle x("x", kFloat);
VarHandle y("y", kFloat);
ExprHandle body =
ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f) * y);
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle(3.f));
eval.bindVar(y, ExprHandle(6.f));
ASSERT_EQ(eval.value<float>(), 2 + (3 * 3 + 4 * 6));
}
TEST(Expr, LetStmtTest01) {
BufHandle a_buf("a", {1}, kFloat);
BufHandle b_buf("b", {1}, kFloat);
ExprHandle load_a = a_buf.load(0);
VarHandle var = VarHandle("v", kFloat);
StmtPtr let_store = Let::make(var, load_a);
StmtPtr store_b = b_buf.store({0}, var);
BlockPtr block = Block::make({let_store, store_b});
SimpleIREvaluator eval(block, {a_buf, b_buf});
PaddedBuffer<float> a_v(1);
PaddedBuffer<float> b_v(1);
PaddedBuffer<float> b_ref(1);
a_v(0) = 23;
b_ref(0) = a_v(0);
eval(a_v, b_v);
ExpectAllNear(b_v, b_ref, 1e-5);
}
TEST(Expr, IntTest) {
VarHandle x("x", kInt);
ExprHandle body = ExprHandle(2) + (x * ExprHandle(3) + ExprHandle(4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle(3));
ASSERT_EQ(eval.value<int>(), 2 + (3 * 3 + 4));
}
TEST(Expr, FloatTest) {
VarHandle x("x", kFloat);
ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle(3.f));
ASSERT_EQ(eval.value<float>(), 2 + (3 * 3 + 4));
}
TEST(Expr, ByteTest) {
VarHandle x("x", kByte);
ExprHandle body = ExprHandle((uint8_t)2) +
(x * ExprHandle((uint8_t)3) + ExprHandle((uint8_t)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((uint8_t)3));
ASSERT_EQ(eval.value<uint8_t>(), 2 + (3 * 3 + 4));
}
TEST(Expr, CharTest) {
VarHandle x("x", kChar);
ExprHandle body = ExprHandle((int8_t)2) +
(x * ExprHandle((int8_t)3) + ExprHandle((int8_t)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((int8_t)3));
ASSERT_EQ(eval.value<int8_t>(), 2 + (3 * 3 + 4));
}
TEST(Expr, ShortTest) {
VarHandle x("x", kShort);
ExprHandle body = ExprHandle((int16_t)2) +
(x * ExprHandle((int16_t)3) + ExprHandle((int16_t)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((int16_t)3));
ASSERT_EQ(eval.value<int16_t>(), 2 + (3 * 3 + 4));
}
TEST(Expr, LongTest) {
VarHandle x("x", kLong);
ExprHandle body = ExprHandle((int64_t)2) +
(x * ExprHandle((int64_t)3) + ExprHandle((int64_t)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((int64_t)3));
ASSERT_EQ(eval.value<int64_t>(), 2 + (3 * 3 + 4));
}
TEST(Expr, HalfTest) {
VarHandle x("x", kHalf);
ExprHandle body = ExprHandle((at::Half)2) +
(x * ExprHandle((at::Half)3) + ExprHandle((at::Half)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((at::Half)3));
ASSERT_EQ(eval.value<at::Half>(), 2 + (3 * 3 + 4));
}
TEST(Expr, DoubleTest) {
VarHandle x("x", kDouble);
ExprHandle body = ExprHandle((double)2) +
(x * ExprHandle((double)3) + ExprHandle((double)4));
SimpleIRExprEval eval(body);
eval.bindVar(x, ExprHandle((double)3));
ASSERT_EQ(eval.value<double>(), 2 + (3 * 3 + 4));
}
TEST(Expr, VectorAdd01) {
const int kVectorSize = 8;
const int kVectorCount = 128;
const int kTotalSize = kVectorSize * kVectorCount;
BufHandle a_buf("A", {kTotalSize}, kFloat);
BufHandle b_buf("B", {kTotalSize}, kFloat);
BufHandle c_buf("C", {kTotalSize}, kFloat);
/*
Build the following:
for (const auto index : c10::irange(kVectorCount)) {
store(c_buf, ramp(index * 8, 1, 8),
load(a_buf, ramp(index * 8, 1, 8)) +
load(b_buf, ramp(index * 8, 1, 8)))
}
*/
VarHandle index = VarHandle("index", kInt);
ExprHandle load_a =
a_buf.load({Ramp::make(index * kVectorSize, 1, kVectorSize)});
ExprHandle load_b =
b_buf.load({Ramp::make(index * kVectorSize, 1, kVectorSize)});
ExprHandle value = load_a + load_b;
StmtPtr store_c =
c_buf.store({Ramp::make(index * kVectorSize, 1, kVectorSize)}, value);
StmtPtr stmt = For::make(index, 0, kVectorCount, store_c);
ASSERT_EQ(load_a.dtype(), Dtype(kFloat, kVectorSize));
ASSERT_EQ(load_b.dtype(), Dtype(kFloat, kVectorSize));
ASSERT_EQ(value.dtype(), Dtype(kFloat, kVectorSize));
PaddedBuffer<float> a_v(kTotalSize);
PaddedBuffer<float> b_v(kTotalSize);
PaddedBuffer<float> c_v(kTotalSize);
PaddedBuffer<float> c_ref(kTotalSize);
for (const auto i : c10::irange(kTotalSize)) {
a_v(i) = i * i;
b_v(i) = i * i * 4;
c_ref(i) = a_v(i) + b_v(i);
}
SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf});
ir_eval(a_v, b_v, c_v);
ExpectAllNear(c_v, c_ref, 1e-5);
}
TEST(Expr, CompareSelectEQ) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int> a_buffer(N, 1);
std::vector<int> b_buffer(N, 1);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 0);
VarHandle i("i", kInt);
auto memcpy_expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kEQ)));
SimpleIREvaluator ir_eval(memcpy_expr, {a, b, c});
ir_eval(a_buffer, b_buffer, c_buffer);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
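// With no explicit return values, CompareSelect evaluates to 1 where the
// operands compare equal and 0 otherwise, so every element of C becomes 1.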
assertAllEqual(a_buffer, 1);
assertAllEqual(b_buffer, 1);
assertAllEqual(c_buffer, 1);
}
TEST(Expr, CompareSelectDtypes) {
// LHS and RHS expressions should have the same dtype, but this dtype could
// differ from the dtype of the return values (but dtypes of true and false
// return values should be the same).
// This test constructs a CompareSelect expression where the input dtype is
// different from the output dtype and verifies that it works correctly:
// result = ((int)lhs == (int)rhs) ? (float)retval1 : (float)retval2
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kFloat);
std::vector<int> a_buffer(N, 1);
std::vector<int> b_buffer(N, 1);
std::vector<float> c_buffer(N, 0.0f);
std::vector<float> c_ref(N, 3.14f);
VarHandle i("i", kInt);
// C[i] = (A[i] == B[i]) ? 3.14f : 2.78f
// A and B are int, C is float.
auto select_expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i),
b.load(i),
FloatImm::make(3.14f),
FloatImm::make(2.78f),
CompareSelectOperation::kEQ)));
SimpleIREvaluator ir_eval(select_expr, {a, b, c});
ir_eval(a_buffer, b_buffer, c_buffer);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 1);
assertAllEqual(b_buffer, 1);
ExpectAllNear(c_buffer, c_ref, 1e-7);
}
TEST(Expr, IntrinsicsDtypes) {
constexpr int N = 256;
BufHandle a("A", {N}, kDouble);
BufHandle b("B", {N}, kDouble);
std::vector<double> a_buffer(N, -10.0);
std::vector<double> b_buffer(N, 0.0);
std::vector<double> b_ref(N, 10.0);
VarHandle i("i", kInt);
auto abs_expr = For::make(i, 0, N, b.store({i}, tensorexpr::abs(a.load(i))));
SimpleIREvaluator ir_eval(abs_expr, {a, b});
ir_eval(a_buffer, b_buffer);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
assertAllEqual(a_buffer, -10.0);
ExpectAllNear(b_buffer, b_ref, 1e-7);
}
TEST(Expr, Substitute01) {
VarPtr x = alloc<Var>("x", kFloat);
VarPtr y = alloc<Var>("y", kFloat);
ExprPtr e =
alloc<Mul>(alloc<Sub>(x, alloc<FloatImm>(1.0f)), alloc<Add>(x, y));
VarPtr z = alloc<Var>("z", kFloat);
ExprPtr e2 = Substitute(e, {{x, alloc<Add>(z, alloc<FloatImm>(5.0f))}});
ExprPtr e2_ref = alloc<Mul>(
alloc<Sub>(alloc<Add>(z, alloc<FloatImm>(5.0f)), alloc<FloatImm>(1.0f)),
alloc<Add>(alloc<Add>(z, alloc<FloatImm>(5.0f)), y));
std::ostringstream oss;
oss << *e2;
std::string e2_str = oss.str();
oss.str("");
oss << *e2_ref;
std::string e2_ref_str = oss.str();
ASSERT_EQ(e2_str, e2_ref_str);
}
TEST(Expr, Math01) {
ExprHandle v = sin(ExprHandle(1.0f));
std::ostringstream oss;
oss << v;
ASSERT_EQ(oss.str(), "sin(1.f)");
SimpleIRExprEval eval(v);
float v_ref = std::sin(1.0f);
float res = eval.value<float>();
ASSERT_NEAR(res, v_ref, 1e-6);
}
TEST(Expr, UnaryMath01) {
struct TestConfig {
std::function<ExprHandle(const ExprHandle&)> func;
std::function<float(float)> ref_func;
};
std::vector<TestConfig> test_configs = {
{[](const ExprHandle& v) { return sin(v); },
[](float v) { return std::sin(v); }},
{[](const ExprHandle& v) { return cos(v); },
[](float v) { return std::cos(v); }},
{[](const ExprHandle& v) { return tan(v); },
[](float v) { return std::tan(v); }},
{[](const ExprHandle& v) { return asin(v); },
[](float v) { return std::asin(v); }},
{[](const ExprHandle& v) { return acos(v); },
[](float v) { return std::acos(v); }},
{[](const ExprHandle& v) { return atan(v); },
[](float v) { return std::atan(v); }},
{[](const ExprHandle& v) { return sinh(v); },
[](float v) { return std::sinh(v); }},
{[](const ExprHandle& v) { return cosh(v); },
[](float v) { return std::cosh(v); }},
{[](const ExprHandle& v) { return tanh(v); },
[](float v) { return std::tanh(v); }},
{[](const ExprHandle& v) { return exp(v); },
[](float v) { return std::exp(v); }},
{[](const ExprHandle& v) { return tensorexpr::abs(v); },
[](float v) { return std::fabs(v); }},
{[](const ExprHandle& v) { return log(v); },
[](float v) { return std::log(v); }},
{[](const ExprHandle& v) { return log2(v); },
[](float v) { return std::log2(v); }},
{[](const ExprHandle& v) { return log10(v); },
[](float v) { return std::log10(v); }},
{[](const ExprHandle& v) { return erf(v); },
[](float v) { return std::erf(v); }},
{[](const ExprHandle& v) { return sqrt(v); },
[](float v) { return std::sqrt(v); }},
{[](const ExprHandle& v) { return rsqrt(v); },
[](float v) { return 1.0f / std::sqrt(v); }},
{[](const ExprHandle& v) { return ceil(v); },
[](float v) { return std::ceil(v); }},
{[](const ExprHandle& v) { return floor(v); },
[](float v) { return std::floor(v); }},
{[](const ExprHandle& v) { return round(v); },
[](float v) { return std::round(v); }},
{[](const ExprHandle& v) { return trunc(v); },
[](float v) { return std::trunc(v); }},
};
for (const TestConfig& test_config : test_configs) {
const float input_v = 0.8765f;
ExprHandle v = test_config.func(ExprHandle(input_v));
float v_ref = test_config.ref_func(input_v);
SimpleIRExprEval eval(v);
ASSERT_NEAR(eval.value<float>(), v_ref, 1e-6);
}
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
for (float input_v : {std::nan("1"), 0., .5}) {
ExprHandle v = FloatImm::make(input_v);
SimpleIRExprEval eval(Intrinsics::make(kIsNan, v));
ASSERT_NEAR(eval.value<int>(), std::isnan(input_v), 0);
}
}
TEST(Expr, BinaryMath01) {
struct TestConfig {
std::function<ExprHandle(const ExprHandle&, const ExprHandle&)> func;
std::function<float(float, float)> ref_func;
};
std::vector<TestConfig> test_configs = {
{[](const ExprHandle& v1, const ExprHandle& v2) { return pow(v1, v2); },
[](float v1, float v2) { return std::pow(v1, v2); }},
{[](const ExprHandle& v1, const ExprHandle& v2) { return fmod(v1, v2); },
[](float v1, float v2) { return std::fmod(v1, v2); }},
};
for (const TestConfig& test_config : test_configs) {
const float v1 = 0.8765f;
float v2 = 1.2345f;
ExprHandle v_expr = test_config.func(ExprHandle(v1), ExprHandle(v2));
float v_ref = test_config.ref_func(v1, v2);
SimpleIRExprEval eval(v_expr);
ASSERT_NEAR(eval.value<float>(), v_ref, 1e-6);
}
}
TEST(Expr, LogicalOps01) {
ExprHandle a(23);
ExprHandle b(11);
ExprHandle c(0.72f);
ExprHandle d(0.69f);
ExprHandle f1 = (a > b) && (c > d);
ExprHandle f2 = (a > b) && (c < d);
ExprHandle f3 = (a < b) && (c > d);
ExprHandle f4 = (a < b) && (c < d);
ExprHandle f5 = (a < b) || (c > d);
ExprHandle f6 = (a < b) || (c < d);
ExprHandle f7 = (a > b) || (c < d);
ExprHandle f8 = (a > b) || (c > d);
SimpleIRExprEval eval1(f1);
SimpleIRExprEval eval2(f2);
SimpleIRExprEval eval3(f3);
SimpleIRExprEval eval4(f4);
SimpleIRExprEval eval5(f5);
SimpleIRExprEval eval6(f6);
SimpleIRExprEval eval7(f7);
SimpleIRExprEval eval8(f8);
ASSERT_EQ(eval1.value<int>(), 1);
ASSERT_EQ(eval2.value<int>(), 0);
ASSERT_EQ(eval3.value<int>(), 0);
ASSERT_EQ(eval4.value<int>(), 0);
ASSERT_EQ(eval5.value<int>(), 1);
ASSERT_EQ(eval6.value<int>(), 0);
ASSERT_EQ(eval7.value<int>(), 1);
ASSERT_EQ(eval8.value<int>(), 1);
}
TEST(Expr, LogicalOps02) {
ExprHandle a(23);
ExprHandle b(11);
ExprHandle c(0.72f);
ExprHandle d(0.72f);
ExprHandle f1 = (a > b) || (c > d);
ExprHandle f2 = (a > b) && (c <= d);
ExprHandle f3 = (a > b) && (c > d);
ExprHandle ff1 = f1 && f2;
ExprHandle ff2 = f2 || f3;
SimpleIRExprEval eval1(ff1);
SimpleIRExprEval eval2(ff2);
ASSERT_EQ(eval1.value<int>(), 1);
ASSERT_EQ(eval2.value<int>(), 1);
}
TEST(Expr, LogicalOps03) {
ExprHandle a(23);
ExprHandle b(11);
ExprHandle c(0.72f);
ExprHandle d(0.69f);
// Bool types
ExprHandle bool_f1 = (a > b) && BoolImm::make(true);
ExprHandle bool_f2 = (c <= d) || BoolImm::make(true);
// Int types
ExprHandle int_f1 = (a > b) && IntImm::make(1);
ExprHandle int_f2 = (c <= d) || IntImm::make(1);
// Short types
ExprHandle short_f1 = (a > b) && ShortImm::make(1);
ExprHandle short_f2 = (c <= d) || ShortImm::make(1);
// Long types
ExprHandle long_f1 = (a > b) && LongImm::make(1);
ExprHandle long_f2 = (c <= d) || LongImm::make(1);
// Char types
ExprHandle char_f1 = (a > b) && CharImm::make(1);
ExprHandle char_f2 = (c <= d) || CharImm::make(1);
// Byte types
ExprHandle byte_f1 = (a > b) && ByteImm::make(1);
ExprHandle byte_f2 = (c <= d) || ByteImm::make(1);
SimpleIRExprEval eval1(bool_f1);
SimpleIRExprEval eval2(bool_f2);
SimpleIRExprEval eval3(int_f1);
SimpleIRExprEval eval4(int_f2);
SimpleIRExprEval eval5(short_f1);
SimpleIRExprEval eval6(short_f2);
SimpleIRExprEval eval7(long_f1);
SimpleIRExprEval eval8(long_f2);
SimpleIRExprEval eval9(char_f1);
SimpleIRExprEval eval10(char_f2);
SimpleIRExprEval eval11(byte_f1);
SimpleIRExprEval eval12(byte_f2);
ASSERT_EQ(eval1.value<bool>(), true);
ASSERT_EQ(eval2.value<bool>(), true);
ASSERT_EQ(eval3.value<int>(), 1);
ASSERT_EQ(eval4.value<int>(), 1);
ASSERT_EQ(eval5.value<int16_t>(), 1);
ASSERT_EQ(eval6.value<int16_t>(), 1);
ASSERT_EQ(eval7.value<int64_t>(), 1);
ASSERT_EQ(eval8.value<int64_t>(), 1);
ASSERT_EQ(eval9.value<int8_t>(), 1);
ASSERT_EQ(eval10.value<int8_t>(), 1);
ASSERT_EQ(eval11.value<uint8_t>(), 1);
ASSERT_EQ(eval12.value<uint8_t>(), 1);
}
TEST(Expr, BitwiseOps) {
ExprHandle a(59);
ExprHandle b(11);
ExprHandle c(101);
ExprHandle d(2);
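// Step by step: 11 << 1 = 22; 59 ^ 22 = 45; 45 & 101 = 37; 37 >> 2 = 9; 9 | 2 = 11.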
ExprHandle f = (((a ^ (b << 1)) & c) >> 2) | d;
SimpleIRExprEval eval(f);
ASSERT_EQ(eval.value<int>(), 11);
}
TEST(Expr, DynamicShapeAdd) {
auto testWithSize = [](int32_t size) {
VarHandle n("n", kInt);
BufHandle a("a", {n}, kFloat);
BufHandle b("b", {n}, kFloat);
BufHandle c("c", {n}, kFloat);
VarHandle i("i", kInt);
StmtPtr s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i)));
std::vector<float> aData(size, 1.0f);
std::vector<float> bData(size, 2.0f);
std::vector<float> cData(size, 0.0f);
SimpleIREvaluator(s, {a, b, c, n})(aData, bData, cData, size);
ExpectAllNear(cData, std::vector<float>(size, 3.0f), 1e-7);
};
testWithSize(1);
testWithSize(16);
testWithSize(37);
}
TEST(Expr, OutOfBounds) {
ExprHandle N(10);
ExprHandle start(0);
ExprHandle stop(15);
VarHandle i("i", kInt);
BufHandle X("X", {N}, kInt);
auto body = Store::make(X, {i}, i);
auto stmt = For::make(i, start, stop, body);
PaddedBuffer<int> data(20);
EXPECT_ANY_THROW(SimpleIREvaluator(stmt, {X})(data));
}
TEST(Expr, OutOfBounds2d) {
std::vector<std::pair<int, int>> size_options = {{10, 15}, {15, 10}};
for (auto sizes : size_options) {
ExprHandle N(sizes.first);
ExprHandle M(sizes.second);
ExprHandle start(0);
ExprHandle stopInner(15);
ExprHandle stopOuter(15);
VarHandle i("i", kInt);
VarHandle j("j", kInt);
BufHandle X("X", {N, M}, kInt);
auto body = Store::make(X, {i, j}, i);
auto inner = For::make(j, start, stopInner, body);
auto stmt = For::make(i, start, stopOuter, inner);
PaddedBuffer<int> data(400);
EXPECT_ANY_THROW(SimpleIREvaluator(stmt, {X})(data));
}
}
TEST(Expr, OutOfBounds2dFlattenedIndex) {
ExprHandle buf_size(149);
ExprHandle start(0);
ExprHandle stopInner(15);
ExprHandle stopOuter(10);
VarHandle i("i", kInt);
VarHandle j("j", kInt);
BufHandle X("X", {buf_size}, kInt);
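// The flattened index reaches 9 * 15 + 14 = 149, one past the last valid
// element of the 149-element buffer, so evaluation is expected to throw.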
auto idx = Add::make(Mul::make(i, stopInner), j);
auto body = Store::make(X, {idx}, i);
auto inner = For::make(j, start, stopInner, body);
auto stmt = For::make(i, start, stopOuter, inner);
PaddedBuffer<int> data(400);
EXPECT_ANY_THROW(SimpleIREvaluator(stmt, {X})(data));
}
void testCond01() {
const int N = 16;
PaddedBuffer<float> a_v(N);
BufHandle a_buf("a", {N}, kFloat);
VarHandle index = VarHandle("index", kInt);
StmtPtr assign_x2 = a_buf.store({index}, cast<float>(index) * 2);
StmtPtr assign_x3 = a_buf.store({index}, cast<float>(index) * 3);
ExprHandle even_cond = CompareSelect::make(Mod::make(index, 2), 0, kEQ);
StmtPtr assign = Cond::make(even_cond, assign_x2, assign_x3);
StmtPtr for_stmt = For::make(index, 0, N, assign);
SimpleIREvaluator(for_stmt, {a_buf})(a_v);
PaddedBuffer<float> a_ref(N);
for (const auto i : c10::irange(N)) {
if (i % 2 == 0) {
a_ref(i) = i * 2;
} else {
a_ref(i) = i * 3;
}
}
ExpectAllNear(a_v, a_ref, 1e-5);
}
void testIfThenElse01() {
ExprHandle v = ifThenElse(ExprHandle(1), ExprHandle(1.0f), ExprHandle(2.0f));
std::ostringstream oss;
oss << v;
ASSERT_EQ(oss.str(), "IfThenElse(1, 1.f, 2.f)");
SimpleIRExprEval eval(v);
ASSERT_EQ(eval.value<float>(), 1.0f);
}
void testIfThenElse02() {
ExprHandle v = ifThenElse(ExprHandle(0), ExprHandle(1.0f), ExprHandle(2.0f));
std::ostringstream oss;
oss << v;
ASSERT_EQ(oss.str(), "IfThenElse(0, 1.f, 2.f)");
SimpleIRExprEval eval(v);
ASSERT_EQ(eval.value<float>(), 2.0f);
}
void testIfThenElse03() {
ExprHandle v =
ifThenElse(BoolImm::make(false), ExprHandle(1.0f), ExprHandle(2.0f));
std::ostringstream oss;
oss << v;
ASSERT_EQ(oss.str(), "IfThenElse(0, 1.f, 2.f)");
SimpleIRExprEval eval(v);
ASSERT_EQ(eval.value<float>(), 2.0f);
}
void testStmtClone() {
const int N = 16;
BufHandle a_buf("a", {N}, kInt);
VarHandle index = VarHandle("index", kInt);
StmtPtr body = a_buf.store({index}, 5);
StmtPtr loop = For::make(index, 0, N, body);
StmtPtr cloned_loop = Stmt::clone(loop);
std::vector<int> orig_loop_results(N);
std::vector<int> cloned_loop_results(N);
SimpleIREvaluator(loop, {a_buf})(orig_loop_results);
SimpleIREvaluator(cloned_loop, {a_buf})(cloned_loop_results);
assertAllEqual(orig_loop_results, 5);
assertAllEqual(cloned_loop_results, 5);
// Let's add another assign to the body in the cloned loop and verify that the
// original statement hasn't changed while the cloned one has.
StmtPtr body_addition = a_buf.store({index}, 33);
BlockPtr cloned_body = static_to<Block>(static_to<For>(cloned_loop)->body());
cloned_body->append_stmt(body_addition);
std::vector<int> orig_loop_results_after_mutation(N);
std::vector<int> cloned_loop_results_after_mutation(N);
SimpleIREvaluator(loop, {a_buf})(orig_loop_results_after_mutation);
SimpleIREvaluator(cloned_loop, {a_buf})(cloned_loop_results_after_mutation);
assertAllEqual(orig_loop_results_after_mutation, 5);
assertAllEqual(cloned_loop_results_after_mutation, 33);
}
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@@ -1,319 +0,0 @@
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/lower_tuples.h>
#include <torch/csrc/jit/tensorexpr/graph_opt.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <torch/torch.h>
#include <limits>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
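// Test fixture that enables the cat-wo-conditionals lowering for each test and
// restores the previous global setting afterwards.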
class GraphOpt : public ::testing::Test {
public:
void SetUp() override {
old_cat_wo_conditionals_ = getCatWoConditionals();
getCatWoConditionals() = true;
}
void TearDown() override {
getCatWoConditionals() = old_cat_wo_conditionals_;
}
private:
bool old_cat_wo_conditionals_;
};
TEST_F(GraphOpt, OptimizeCat) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::log(%cat)
return (%5))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// The `aten::log` op must be moved to the inputs of `aten::cat`.
testing::FileCheck()
.check("aten::log")
->check("aten::log")
->check("aten::log")
->check("aten::cat")
->check_not("aten::log")
->run(*kernel.graph());
auto x = at::rand({10}, at::kFloat);
auto y = at::rand({20}, at::kFloat);
auto z = at::rand({30}, at::kFloat);
auto ref = at::log(at::cat({x, y, z}, 0));
std::vector<at::Tensor> inputs = {x, y, z};
std::vector<IValue> stack = fmap<IValue>(inputs);
kernel.run(stack);
auto out = stack[0].toTensor();
ASSERT_EQ(out.sizes(), ref.sizes());
ASSERT_EQ(out.dtype(), ref.dtype());
ASSERT_TRUE(at::allclose(out, ref));
#endif
}
TEST_F(GraphOpt, OptimizeCat2) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::log(%cat)
%6 : Float(60, strides=[1], device=cpu) = aten::tanh(%5)
return (%6))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// The `aten::log` and `aten::tanh` ops must be moved to the inputs of
// `aten::cat`.
testing::FileCheck()
.check("aten::log")
->check("aten::log")
->check("aten::log")
->check("aten::tanh")
->check("aten::tanh")
->check("aten::tanh")
->check("aten::cat")
->check_not("aten::log")
->check_not("aten::tanh")
->run(*kernel.graph());
auto x = at::rand({10}, at::kFloat);
auto y = at::rand({20}, at::kFloat);
auto z = at::rand({30}, at::kFloat);
auto ref = at::tanh(at::log(at::cat({x, y, z}, 0)));
std::vector<at::Tensor> inputs = {x, y, z};
std::vector<IValue> stack = fmap<IValue>(inputs);
kernel.run(stack);
auto out = stack[0].toTensor();
ASSERT_EQ(out.sizes(), ref.sizes());
ASSERT_EQ(out.dtype(), ref.dtype());
ASSERT_TRUE(at::allclose(out, ref));
#endif
}
TEST_F(GraphOpt, OptimizeCat3) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%a : Float(60, strides=[1], device=cpu),
%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::tanh(%cat)
%6 : Float(60, strides=[1], device=cpu) = aten::mul(%a, %5)
return (%6))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// The `aten::tanh` op must be moved to the inputs of `aten::cat`.
// But the `aten::mul` op must not be moved since it is not a single-tensor
// op (it has 2 tensor inputs).
testing::FileCheck()
.check("aten::tanh")
->check("aten::tanh")
->check("aten::tanh")
->check("aten::cat")
->check("aten::mul")
->check_not("aten::tanh")
->run(*kernel.graph());
auto a = at::rand({60}, at::kFloat);
auto x = at::rand({10}, at::kFloat);
auto y = at::rand({20}, at::kFloat);
auto z = at::rand({30}, at::kFloat);
auto ref = at::tanh(at::cat({x, y, z}, 0)) * a;
std::vector<at::Tensor> inputs = {a, x, y, z};
std::vector<IValue> stack = fmap<IValue>(inputs);
kernel.run(stack);
auto out = stack[0].toTensor();
ASSERT_EQ(out.sizes(), ref.sizes());
ASSERT_EQ(out.dtype(), ref.dtype());
ASSERT_TRUE(at::allclose(out, ref));
#endif
}
TEST_F(GraphOpt, OptimizeCatWithTypePromotionInUser) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%x : Int(10, strides=[1], device=cpu),
%y : Int(20, strides=[1], device=cpu),
%z : Int(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Int(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::tanh(%cat)
return (%5))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// The `aten::tanh` op must be moved to the inputs of `aten::cat`.
// The scalar type of the inputs to `cat` should now be `Float` since they
// are the result of `tanh` which does the type promotion.
testing::FileCheck()
.check("aten::tanh")
->check("aten::tanh")
->check("aten::tanh")
->check("aten::cat")
->check_not("aten::tanh")
->run(*kernel.graph());
auto x = at::randint(std::numeric_limits<int>::max(), {10}, at::kInt);
auto y = at::randint(std::numeric_limits<int>::max(), {20}, at::kInt);
auto z = at::randint(std::numeric_limits<int>::max(), {30}, at::kInt);
auto ref = at::tanh(at::cat({x, y, z}, 0));
std::vector<at::Tensor> inputs = {x, y, z};
std::vector<IValue> stack = fmap<IValue>(inputs);
kernel.run(stack);
auto out = stack[0].toTensor();
ASSERT_EQ(out.sizes(), ref.sizes());
ASSERT_EQ(out.dtype(), ref.dtype());
ASSERT_TRUE(at::allclose(out, ref));
#endif
}
TEST_F(GraphOpt, OptimizeCatWithTypePromotionInCat) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Double(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Double(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Double(60, strides=[1], device=cpu) = aten::log(%cat)
return (%5))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// No transformation should have happened because the `aten::cat` op performs
// type promotion. This case is currently not handled.
testing::FileCheck()
.check("aten::cat")
->check("aten::log")
->check_not("aten::cat")
->check_not("aten::log")
->run(*kernel.graph());
#endif
}
TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%0 : Float(60, strides=[1], device=cpu),
%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::mul(%0, %cat)
return (%5))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// No transformation is expected since the consumers of cat are not
// single-tensor element-wise ops.
testing::FileCheck()
.check("aten::cat")
->check("aten::mul")
->check_not("aten::cat")
->check_not("aten::mul")
->run(*kernel.graph());
#endif
}
TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp2) {
#ifdef TORCH_ENABLE_LLVM
const auto graph_string = R"IR(
graph(%0 : Float(60, strides=[1], device=cpu),
%1 : Float(60, strides=[1], device=cpu),
%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%one : int = prim::Constant[value=1]()
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
%5 : Float(60, strides=[1], device=cpu) = aten::mul(%0, %cat)
%6 : Float(60, strides=[1], device=cpu) = aten::add(%5, %1, %one)
return (%6))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
TensorExprKernel kernel(g);
// No transformation is expected since the consumers of cat are not
// single-tensor element-wise ops.
testing::FileCheck()
.check("aten::cat")
->check("aten::mul")
->check("aten::add")
->check_not("aten::cat")
->check_not("aten::mul")
->check_not("aten::add")
->run(*kernel.graph());
#endif
}
TEST_F(GraphOpt, AOTGraphPrepPasses) {
const auto graph_string = R"IR(
graph(%x, %y, %z, %i : int):
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
return (%xyz_list, %i))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
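// Drop the %i output, replace the remaining list output with a tuple, and
// lower the tuple so the graph returns the individual tensors directly.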
removeGraphOutput(g, 1);
replaceListOutputWithTuple(g);
LowerAllTuples(g);
testing::FileCheck().check("return (%x, %y, %z)")->run(*g);
}
} // namespace jit
} // namespace torch

View File

@@ -1,98 +0,0 @@
#include <gtest/gtest.h>
#include <stdexcept>
#include "test/cpp/tensorexpr/test_base.h"
#include <torch/csrc/jit/tensorexpr/expr.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <sstream>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
TEST(IRPrinter, BasicValueTest) {
ExprHandle a = IntImm::make(2), b = IntImm::make(3);
ExprHandle c = Add::make(a, b);
std::stringstream ss;
ss << c;
ASSERT_EQ(ss.str(), "2 + 3");
}
TEST(IRPrinter, BasicValueTest02) {
ExprHandle a(2.0f);
ExprHandle b(3.0f);
ExprHandle c(4.0f);
ExprHandle d(5.0f);
ExprHandle f = (a + b) - (c + d);
std::stringstream ss;
ss << f;
ASSERT_EQ(ss.str(), "(2.f + 3.f) - (4.f + 5.f)");
}
TEST(IRPrinter, BasicValueTest03) {
ExprHandle a(3.402823466385289e+38f);
ExprHandle b(-3.402823466385289e+38f);
std::stringstream ss;
ss << a << ", " << b;
ASSERT_EQ(ss.str(), "3.402823466385289e+38f, -3.402823466385289e+38f");
}
TEST(IRPrinter, CastTest) {
VarHandle x("x", kHalf);
VarHandle y("y", kFloat);
ExprHandle body = ExprHandle(2.f) +
(Cast::make(kFloat, x) * ExprHandle(3.f) + ExprHandle(4.f) * y);
std::stringstream ss;
ss << body;
ASSERT_EQ(ss.str(), "2.f + (float(x) * 3.f + 4.f * y)");
}
TEST(IRPrinter, FunctionName) {
int M = 4;
int N = 20;
Tensor producer = Compute(
"producer", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return m * n;
});
Tensor chunk_0 = Compute(
"chunk_0", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) {
return producer.load(m, n);
});
Tensor chunk_1 = Compute(
"chunk_1", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) {
return producer.load(m, n + ExprHandle(N / 2));
});
Tensor consumer = Compute(
"consumer", {M, N / 2}, [&](const ExprHandle& i, const ExprHandle& j) {
return i * chunk_1.load(i, j);
});
LoopNest l({chunk_0, chunk_1, consumer});
auto body = LoopNest::sanitizeNames(l.root_stmt());
std::stringstream ss;
ss << *body;
const std::string& verification_pattern =
R"IR(
# CHECK: for (int i_2
# CHECK: for (int j_2
# CHECK: consumer[i_2, j_2] = i_2 * (chunk_1[i_2, j_2])IR";
torch::jit::testing::FileCheck().run(verification_pattern, ss.str());
}
} // namespace jit
} // namespace torch

View File

@@ -1,191 +0,0 @@
#include <gtest/gtest.h>
#include <stdexcept>
#include "test/cpp/tensorexpr/test_base.h"
#include <torch/csrc/jit/tensorexpr/expr.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_verifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <sstream>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
TEST(IRVerifier, BitwiseOps) {
VarPtr X = alloc<Var>("x", kInt);
VarPtr Y = alloc<Var>("y", kFloat);
{
auto a = alloc<And>(X, Y);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
auto a = alloc<Or>(X, Y);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
auto a = alloc<Xor>(X, Y);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
auto a = alloc<Lshift>(X, Y);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
auto a = alloc<Rshift>(X, Y);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, CompareSelect) {
ExprPtr X = alloc<IntImm>(1);
ExprPtr Y = alloc<FloatImm>(3.14f);
{
auto a = alloc<CompareSelect>(X, X, X, Y, kEQ);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
auto a = alloc<CompareSelect>(X, Y, X, X, kEQ);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, Ramp) {
VarPtr I = alloc<Var>("i", kInt);
VarPtr J = alloc<Var>("j", kFloat);
{
auto a = alloc<Ramp>(I, J, 4);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, Load) {
VarPtr I = alloc<Var>("i", kInt);
VarPtr J = alloc<Var>("j", kLong);
VarPtr K = alloc<Var>("k", kFloat);
BufPtr B = alloc<Buf>(
"b",
std::vector<ExprPtr>({alloc<IntImm>(10), alloc<IntImm>(20)}),
kFloat);
{
// Indices with different int dtypes (kInt, kLong) are ok
auto a = alloc<Load>(B, std::vector<ExprPtr>({I, J}));
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_NO_THROW(verify(a));
}
{
// Float index
auto a = alloc<Load>(B, std::vector<ExprPtr>({K, K}));
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
// Multilanes are only allowed in flattened indices
auto multilane_index = alloc<Ramp>(I, alloc<IntImm>(1), 4);
auto a = alloc<Load>(B, std::vector<ExprPtr>({I, multilane_index}));
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, IfThenElse) {
VarPtr I = alloc<Var>("i", kInt);
VarPtr J = alloc<Var>("j", kLong);
VarPtr K = alloc<Var>("k", kFloat);
{
// Condition must be integral
auto a = alloc<IfThenElse>(K, I, I);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
// Dtypes of true and false exprs must match
auto a = alloc<IfThenElse>(I, I, J);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
// Can't have multiple lanes in condition expr
auto a = alloc<IfThenElse>(alloc<Broadcast>(I, 4), I, I);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, For) {
VarPtr I = alloc<Var>("i", kInt);
VarPtr J = alloc<Var>("j", kInt);
StmtPtr body = alloc<Block>(std::vector<StmtPtr>({}));
{
// Can't have nullptr as a Var
auto a = alloc<For>(nullptr, I, J, body);
// NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto)
EXPECT_ANY_THROW(verify(a));
}
}
TEST(IRVerifier, Block) {
VarPtr I = alloc<Var>("i", kInt);
BufPtr B = alloc<Buf>("B", std::vector<ExprPtr>({alloc<IntImm>(10)}), kInt);
{
StmtPtr store = alloc<Store>(B, std::vector<ExprPtr>({I}), I);
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
StmtPtr block1 = alloc<Block>(std::vector<StmtPtr>({store}));
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
StmtPtr block2 = alloc<Block>(std::vector<StmtPtr>({store}));
// Stmt can't have multiple parents, thus inserting it into several blocks
// is illegal
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(block2));
}
}
TEST(IRVerifier, Store) {
VarPtr I = alloc<Var>("i", kInt);
VarPtr J = alloc<Var>("j", kLong);
VarPtr K = alloc<Var>("k", kFloat);
BufPtr B = alloc<Buf>(
"b",
std::vector<ExprPtr>({alloc<IntImm>(10), alloc<IntImm>(20)}),
kFloat);
{
// Indices with different int dtypes (kInt, kLong) are ok
auto a = alloc<Store>(B, std::vector<ExprPtr>({I, J}), K);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_NO_THROW(verify(a));
}
{
// Float index
auto a = alloc<Store>(B, std::vector<ExprPtr>({K, K}), K);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
// Multilanes are only allowed in flattened indices
auto multilane_index = alloc<Ramp>(I, alloc<IntImm>(1), 4);
auto a = alloc<Store>(B, std::vector<ExprPtr>({I, multilane_index}), K);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
{
// Value and buf dtypes mismatch
auto a = alloc<Store>(B, std::vector<ExprPtr>({I}), I);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto,clang-analyzer-cplusplus.NewDeleteLeaks)
EXPECT_ANY_THROW(verify(a));
}
}
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,708 +0,0 @@
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <c10/util/irange.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
extern void checkIR(StmtPtr s, const std::string& pattern);
TEST(BufLiveRange, SingleRangeLine) {
VarHandle i("i", kInt), j("j", kInt);
BufHandle a("a", {32}, kFloat);
BufHandle b("b", {32, 32}, kFloat);
// Construct Stmt:
// {
// for (int i = 0; i < 32; i++) {
// a[i] = 0;
// for (int j = 0; j < 32; j++) {
// a[i] = (a[i]) + (b[i, j]);
// }
// }
// }
StorePtr aInit = Store::make(a, {i}, 0);
ExprHandle reduce = a.load({i}) + b.load({i, j});
StorePtr aReduce = Store::make(a, {i}, reduce);
StmtPtr loop =
For::make(i, 0, 32, Block::make({aInit, For::make(j, 0, 32, aReduce)}));
StmtPtr stmt = Block::make({loop});
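// Both the initialization of 'a' and the reduction into it sit inside the one
// top-level statement of the block, so the expected live range is [0, 0].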
auto range = BufLiveRange::liveRange(stmt, a.node());
ASSERT_TRUE(std::get<0>(range) == 0);
ASSERT_TRUE(std::get<1>(range) == 0);
}
TEST(BufLiveRange, MulRangeLine) {
VarHandle i("i", kInt);
BufHandle a("a", {32}, kFloat);
BufHandle b("b", {32}, kFloat);
// Construct Stmt:
// {
// for (int i = 0; i < 32; i++) {
// if (i<10 ? 1 : 0) {
// a[i] = i + i;
// b[i] = i * i;
// }
// }
// for (int i = 0; i < 32; i++) {
// if (i>10 ? 1 : 0) {
// a[i] = i * i;
// b[i] = i + i;
// }
// }
// }
StorePtr aStore_1 = Store::make(a, {i}, i + i);
StorePtr bStore_1 = Store::make(b, {i}, i * i);
StmtPtr loop_1 = For::make(
i, 0, 32, Cond::make(i < 10, Block::make({aStore_1, bStore_1}), nullptr));
StorePtr aStore_2 = Store::make(a, {i}, i * i);
StorePtr bStore_2 = Store::make(b, {i}, i + i);
StmtPtr loop_2 = For::make(
i, 0, 32, Cond::make(i > 10, Block::make({aStore_2, bStore_2}), nullptr));
StmtPtr stmt = Block::make({loop_1, loop_2});
auto range_a = BufLiveRange::liveRange(stmt, a.node());
ASSERT_TRUE(std::get<0>(range_a) == 0);
ASSERT_TRUE(std::get<1>(range_a) == 1);
auto range_b = BufLiveRange::liveRange(stmt, b.node());
ASSERT_TRUE(std::get<0>(range_b) == 0);
ASSERT_TRUE(std::get<1>(range_b) == 1);
}
TEST(MemPlanning, MemReuseWithTypeCast) {
int M = 4;
int N = 4;
int K = 4;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return CompareSelect::make(
CT.load(m, n), 0.0f, 0.0f, CT.load(m, n), kLT);
});
Tensor ET =
Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return Cast::make(kQUInt8, DT.load(m, n) + DT.load(m, n));
});
Tensor FT =
Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return ET.load(m, n);
});
StmtPtr stmt =
tensorexpr::Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are
// different: 'E' type quint8 < 'gemm' type float. We'll reuse 'gemm' for 'E'
// with typecasting.
//{
// for (int i = 0; i < 4; i++) {
// for (int i_1 = 0; i_1 < 4; i_1++) {
// gemm[i, i_1] = float(0);
// for (int i_2 = 0; i_2 < 4; i_2++) {
// gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2,
// i_1]), reduce_args={i_2});
// }
// }
// }
// for (int i_3 = 0; i_3 < 4; i_3++) {
// for (int i_4 = 0; i_4 < 4; i_4++) {
// relu[i_3, i_4] = (gemm[i_3, i_4])<0.f ? 0.f : (gemm[i_3, i_4]);
// }
// }
// for (int i_5 = 0; i_5 < 4; i_5++) {
// for (int i_6 = 0; i_6 < 4; i_6++) {
// E[i_5, i_6] = quint8((relu[i_5, i_6]) + (relu[i_5, i_6]));
// }
// }
// for (int i_7 = 0; i_7 < 4; i_7++) {
// for (int i_8 = 0; i_8 < 4; i_8++) {
// F[i_7, i_8] = E[i_7, i_8];
// }
// }
//}
LoopNest l(stmt, {FT.buf()});
l.prepareForCodegen();
SimpleIREvaluator cg(Stmt::clone(l.root_stmt()), {AP, BP, FT});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4]
# CHECK: Allocate(relu); // dtype=float, dims=[4, 4]
# CHECK: Alias(E,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
PaddedBuffer<float> a_v(M, K, "a");
PaddedBuffer<float> b_v(K, N, "b");
PaddedBuffer<uint8_t> o1(M, N, "e_before");
PaddedBuffer<uint8_t> o2(M, N, "e_after");
for (const auto m : c10::irange(M)) {
for (const auto k : c10::irange(K)) {
a_v(m, k) = at::randn({1}).item().to<float>();
}
}
for (const auto k : c10::irange(K)) {
for (const auto n : c10::irange(N)) {
b_v(k, n) = at::randn({1}).item().to<float>();
}
}
cg.call({a_v, b_v, o1});
#ifdef TORCH_ENABLE_LLVM
LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4]
# CHECK: Allocate(relu); // dtype=float, dims=[4, 4]
# CHECK: Alias(E,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
cg_llvm.call({a_v, b_v, o2});
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
ExpectAllNear(o1, o2, 1e-5);
#endif
}
TEST(MemPlanning, NoMemReuseForLargerType) {
int M = 4;
int N = 4;
int K = 4;
BufHandle AP("A", {M, K}, kShort);
BufHandle BP("B", {K, N}, kShort);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
auto zero = Cast::make(CT.buf()->dtype(), 0);
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return CompareSelect::make(
CT.load(m, n), zero, zero, CT.load(m, n), kLT);
});
Tensor ET =
Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return Cast::make(kFloat, DT.load(m, n) + DT.load(m, n));
});
Tensor FT =
Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return ET.load(m, n);
});
StmtPtr stmt =
tensorexpr::Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are
// different: 'E' type float > 'gemm' type int16. We won't reuse 'gemm' for
// 'E'.
//{
// for (int i = 0; i < 4; i++) {
// for (int i_1 = 0; i_1 < 4; i_1++) {
// gemm[i, i_1] = int16_t(0);
// for (int i_2 = 0; i_2 < 4; i_2++) {
// gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2,
// i_1]), reduce_args={i_2});
// }
// }
// }
// for (int i_3 = 0; i_3 < 4; i_3++) {
// for (int i_4 = 0; i_4 < 4; i_4++) {
// relu[i_3, i_4] = (gemm[i_3, i_4])<int16_t(0) ? int16_t(0) : (gemm[i_3,
// i_4]);
// }
// }
// for (int i_5 = 0; i_5 < 4; i_5++) {
// for (int i_6 = 0; i_6 < 4; i_6++) {
// E[i_5, i_6] = float((relu[i_5, i_6]) + (relu[i_5, i_6]));
// }
// }
// for (int i_7 = 0; i_7 < 4; i_7++) {
// for (int i_8 = 0; i_8 < 4; i_8++) {
// F[i_7, i_8] = E[i_7, i_8];
// }
// }
//}
LoopNest l(stmt, {FT.buf()});
l.prepareForCodegen();
SimpleIREvaluator cg(Stmt::clone(l.root_stmt()), {AP, BP, FT.buf()});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=int16_t, dims=[4, 4]
# CHECK: Allocate(relu); // dtype=int16_t, dims=[4, 4]
# CHECK: Allocate(E); // dtype=float, dims=[4, 4]
# CHECK: Free(E);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
PaddedBuffer<short> a_v(M, K, "a");
PaddedBuffer<short> b_v(K, N, "b");
PaddedBuffer<float> o1(M, N, "e_before");
PaddedBuffer<float> o2(M, N, "e_after");
for (const auto m : c10::irange(M)) {
for (const auto k : c10::irange(K)) {
a_v(m, k) = at::randn({1}).item().to<float>();
}
}
for (const auto k : c10::irange(K)) {
for (const auto n : c10::irange(N)) {
b_v(k, n) = at::randn({1}).item().to<float>();
}
}
cg.call({a_v, b_v, o1});
#ifdef TORCH_ENABLE_LLVM
LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=int16_t, dims=[4, 4]
# CHECK: Allocate(relu); // dtype=int16_t, dims=[4, 4]
# CHECK: Allocate(E); // dtype=float, dims=[4, 4]
# CHECK: Free(E);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
cg_llvm.call({a_v, b_v, o2});
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
ExpectAllNear(o1, o2, 1e-5);
#endif
}
TEST(MemPlanning, SameBufSizeMemReuse) {
int M = 1024;
int N = 1024;
int K = 2048;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
auto zero = Cast::make(CT.buf()->dtype(), 0);
return CompareSelect::make(
CT.load(m, n), zero, zero, CT.load(m, n), kLT);
});
Tensor ET =
Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return DT.load(m, n) + DT.load(m, n);
});
Tensor FT =
Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return ET.load(m, n) * ET.load(m, n);
});
auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// add [2, 3]. Buffers 'gemm' and 'add' are the same size; we'll reuse 'gemm'
// for 'add'.
//{
// for (int M = 0; M < 1024; M++) {
// for (int N = 0; N < 1024; N++) {
// gemm[M, N] = float(0);
// for (int K = 0; K < 2048; K++) {
// gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]),
// reduce_args={K});
// }
// }
// }
// for (int M_1 = 0; M_1 < 1024; M_1++) {
// for (int N_1 = 0; N_1 < 1024; N_1++) {
// relu[M_1, N_1] = (gemm[M_1, N_1])<float(0) ? float(0) : (gemm[M_1,
// N_1]);
// }
// }
// for (int M_2 = 0; M_2 < 1024; M_2++) {
// for (int N_2 = 0; N_2 < 1024; N_2++) {
// add[M_2, N_2] = (relu[M_2, N_2]) + (relu[M_2, N_2]);
// }
// }
// for (int M_3 = 0; M_3 < 1024; M_3++) {
// for (int N_3 = 0; N_3 < 1024; N_3++) {
// mul[M_3, N_3] = (add[M_3, N_3]) * (add[M_3, N_3]);
// }
// }
//}
SimpleIREvaluator cg(stmt, {AP, BP, FT});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#ifdef TORCH_ENABLE_LLVM
LoopNest loop(Stmt::clone(stmt), {FT.buf()});
loop.prepareForCodegen();
LLVMCodeGen cg_llvm(loop.root_stmt(), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#endif
}
TEST(MemPlanning, SameBufSizeMultiMemReuses) {
int M = 1024;
int N = 1024;
int K = 2048;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
auto zero = Cast::make(CT.buf()->dtype(), 0);
return CompareSelect::make(
CT.load(m, n), zero, zero, CT.load(m, n), kLT);
});
Tensor ET =
Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return DT.load(m, n) + DT.load(m, n);
});
Tensor FT =
Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return ET.load(m, n) * ET.load(m, n);
});
Tensor GT =
Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return FT.load(m, n) - ET.load(m, n);
});
auto stmt =
Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// add [2, 3], mul [3, 4]. Buffers 'gemm', 'relu', 'add' and 'mul' are the same
// size; we'll reuse 'gemm' for 'add', and reuse 'relu' for 'mul'.
//{
// for (int M = 0; M < 1024; M++) {
// for (int N = 0; N < 1024; N++) {
// gemm[M, N] = float(0);
// for (int K = 0; K < 2048; K++) {
// gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]),
// reduce_args={K});
// }
// }
// }
// for (int M_1 = 0; M_1 < 1024; M_1++) {
// for (int N_1 = 0; N_1 < 1024; N_1++) {
// relu[M_1, N_1] = (gemm[M_1, N_1])<float(0) ? float(0) : (gemm[M_1,
// N_1]);
// }
// }
// for (int M_2 = 0; M_2 < 1024; M_2++) {
// for (int N_2 = 0; N_2 < 1024; N_2++) {
// add[M_2, N_2] = (relu[M_2, N_2]) + (relu[M_2, N_2]);
// }
// }
// for (int M_3 = 0; M_3 < 1024; M_3++) {
// for (int N_3 = 0; N_3 < 1024; N_3++) {
// mul[M_3, N_3] = (add[M_3, N_3]) * (add[M_3, N_3]);
// }
// }
// for (int M_4 = 0; M_4 < 1024; M_4++) {
// for (int N_4 = 0; N_4 < 1024; N_4++) {
// sub[M_4, N_4] = (mul[M_4, N_4]) - (add[M_4, N_4]);
// }
// }
//}
SimpleIREvaluator cg(stmt, {AP, BP, GT});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Alias(mul,relu);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#ifdef TORCH_ENABLE_LLVM
LoopNest loop(Stmt::clone(stmt), {FT.buf()});
loop.prepareForCodegen();
LLVMCodeGen cg_llvm(loop.root_stmt(), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Alias(mul,relu);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#endif
}
TEST(MemPlanning, SameBufSizeMultiMemReusesOfOneBuf) {
int M = 1024;
int N = 1024;
int K = 2048;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
auto zero = Cast::make(CT.buf()->dtype(), 0);
return CompareSelect::make(
CT.load(m, n), zero, zero, CT.load(m, n), kLT);
});
Tensor ET =
Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return DT.load(m, n) + DT.load(m, n);
});
Tensor FT =
Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return ET.load(m, n) * ET.load(m, n);
});
Tensor GT =
Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return FT.load(m, n) - 1;
});
Tensor HT =
Compute("div", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
return GT.load(m, n) / 2;
});
auto stmt = Block::make(
{CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt(), HT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// add [2, 3], mul [3, 4], sub [4, 5]. Buffers 'gemm', 'relu', 'add', 'mul' and
// 'sub' are the same size; we'll reuse 'gemm' for 'add', reuse 'relu' for
// 'mul', and reuse 'gemm' for 'sub'.
//{
// for (int M = 0; M < 1024; M++) {
// for (int N = 0; N < 1024; N++) {
// gemm[M, N] = float(0);
// for (int K = 0; K < 2048; K++) {
// gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]),
// reduce_args={K});
// }
// }
// }
// for (int M_1 = 0; M_1 < 1024; M_1++) {
// for (int N_1 = 0; N_1 < 1024; N_1++) {
// relu[M_1, N_1] = (gemm[M_1, N_1])<float(0) ? float(0) : (gemm[M_1,
// N_1]);
// }
// }
// for (int M_2 = 0; M_2 < 1024; M_2++) {
// for (int N_2 = 0; N_2 < 1024; N_2++) {
// add[M_2, N_2] = (relu[M_2, N_2]) + (relu[M_2, N_2]);
// }
// }
// for (int M_3 = 0; M_3 < 1024; M_3++) {
// for (int N_3 = 0; N_3 < 1024; N_3++) {
// mul[M_3, N_3] = (add[M_3, N_3]) * (add[M_3, N_3]);
// }
// }
// for (int M_4 = 0; M_4 < 1024; M_4++) {
// for (int N_4 = 0; N_4 < 1024; N_4++) {
// sub[M_4, N_4] = (mul[M_4, N_4]) - float(1);
// }
// }
// for (int M_5 = 0; M_5 < 1024; M_5++) {
// for (int N_5 = 0; N_5 < 1024; N_5++) {
// div[M_5, N_5] = (sub[M_5, N_5]) / float(2);
// }
// }
//}
SimpleIREvaluator cg(stmt, {AP, BP, HT});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Alias(mul,relu);
# CHECK: Alias(sub,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#ifdef TORCH_ENABLE_LLVM
LoopNest loop(Stmt::clone(stmt), {FT.buf()});
loop.prepareForCodegen();
LLVMCodeGen cg_llvm(loop.root_stmt(), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK: Alias(add,gemm);
# CHECK: Alias(mul,relu);
# CHECK: Alias(sub,gemm);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#endif
}
TEST(MemPlanning, SmallerBufSizeNonMemReuse) {
int M = 1024;
int N = 1024;
int K = 2048;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
Tensor DT =
Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) {
auto zero = Cast::make(CT.buf()->dtype(), 0);
return CompareSelect::make(
CT.load(m, n), zero, zero, CT.load(m, n), kLT);
});
Tensor ET = Compute(
"add", {M * 2, N * 2}, [&](const ExprHandle& em, const ExprHandle& en) {
return DT.load(em / 2, en / 2) + DT.load(em / 2, en / 2);
});
Tensor FT = Compute(
"mul", {M * 2, N * 2}, [&](const ExprHandle& fm, const ExprHandle& fn) {
return ET.load(fm, fn) * ET.load(fm, fn);
});
auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()});
// Constructed stmt:
// Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2],
// add [2, 3]. We do not reuse buffer 'gemm' for 'add' because buffer 'gemm'
// is smaller than 'add'.
//{
// for (int M = 0; M < 1024; M++) {
// for (int N = 0; N < 1024; N++) {
// gemm[M, N] = float(0);
// for (int K = 0; K < 2048; K++) {
// gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]),
// reduce_args={K});
// }
// }
// }
// for (int M_1 = 0; M_1 < 1024; M_1++) {
// for (int N_1 = 0; N_1 < 1024; N_1++) {
// relu[M_1, N_1] = (gemm[M_1, N_1])<float(0) ? float(0) : (gemm[M_1,
// N_1]);
// }
// }
// for (int EM = 0; EM < 2048; EM++) {
// for (int EN = 0; EN < 2048; EN++) {
// add[EM, EN] = (relu[EM / 2, EN / 2]) + (relu[EM / 2, EN / 2]);
// }
// }
// for (int FM = 0; FM < 2048; FM++) {
// for (int FN = 0; FN < 2048; FN++) {
// mul[FM, FN] = (add[FM, FN]) * (add[FM, FN]);
// }
// }
//}
//
SimpleIREvaluator cg(stmt, {AP, BP, FT});
checkIR(cg.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK-NOT: Alias(add,gemm);
# CHECK: Allocate(add); // dtype=float, dims=[2048, 2048]
# CHECK: Free(add);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#ifdef TORCH_ENABLE_LLVM
LoopNest loop(Stmt::clone(stmt), {FT.buf()});
loop.prepareForCodegen();
LLVMCodeGen cg_llvm(loop.root_stmt(), {AP, BP, FT});
checkIR(cg_llvm.stmt(), R"IR(
# CHECK: Allocate(gemm); // dtype=float, dims=[1024, 1024]
# CHECK: Allocate(relu); // dtype=float, dims=[1024, 1024]
# CHECK-NOT: Alias(add,gemm);
# CHECK: Allocate(add); // dtype=float, dims=[2048, 2048]
# CHECK: Free(add);
# CHECK: Free(relu);
# CHECK: Free(gemm))IR");
#endif
}
} // namespace jit
} // namespace torch

View File

@ -1,78 +0,0 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/expr.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/operators/operators.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
using Tensors = std::vector<Tensor>;
using Args = std::vector<CodeGen::BufferArg>;
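// Helper for these tests: builds a LoopNest over the given outputs, prepares it
// for codegen, and returns an evaluator whose call arguments are the inputs
// followed by the outputs.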
std::unique_ptr<SimpleIREvaluator> compile(
const Args& inputs,
const Tensors& outputs) {
LoopNest nest({outputs});
nest.prepareForCodegen();
nest.simplify();
auto join = inputs;
join.insert(join.end(), outputs.begin(), outputs.end());
return std::make_unique<SimpleIREvaluator>(nest.root_stmt(), join);
}
TEST(Ops, Sum) {
constexpr int M = 8;
constexpr int N = 16;
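// Each entry of testDims is a set of axes to reduce over; outputShapes[idx]
// is the expected shape of the corresponding result.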
std::vector<IntList> testDims = {{0}, {1}, {0, 1}};
std::vector<std::vector<ExprHandle>> outputShapes = {{N}, {M}, {}};
for (unsigned idx = 0; idx < testDims.size(); idx++) {
const auto& dims = testDims[idx];
const auto& outShape = outputShapes[idx];
BufHandle a("a", {M, N}, kFloat);
std::vector<ExprHandle> outStrides =
c10::fmap<ExprHandle>(make_contiguous_strides(outShape));
Tensor b = computeSum(
{a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU);
auto cg = compile({a}, {b});
auto at = at::arange(M * N, at::kFloat).view({M, N});
auto ref = at::sum(at, dims);
auto bt = at::empty_like(ref);
cg->call({at.data_ptr<float>(), bt.data_ptr<float>()});
ASSERT_TRUE(at::allclose(bt, ref));
}
}
TEST(Ops, ChannelsLastSum) {
constexpr int A = 2;
constexpr int B = 3;
constexpr int C = 4;
constexpr int D = 5;
constexpr int E = 6;
std::vector<IntList> testDims = {{0}, {1}, {0, 1}};
std::vector<std::vector<ExprHandle>> outputShapes = {
{B, C, D, E}, {A, C, D, E}, {C, D, E}};
for (unsigned idx = 0; idx < testDims.size(); idx++) {
const auto& dims = testDims[idx];
const auto& outShape = outputShapes[idx];
BufHandle a("a", {A, B, C, D, E}, kFloat);
std::vector<ExprHandle> outStrides =
c10::fmap<ExprHandle>(make_channels_last_strides(outShape));
Tensor b = computeSum(
{a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU);
auto cg = compile({a}, {b});
auto at = at::arange(A * B * C * D * E, at::kFloat).view({A, B, C, D, E});
auto ref = at::sum(at, dims);
auto bt = at::empty_like(ref);
cg->call({at.data_ptr<float>(), bt.data_ptr<float>()});
ASSERT_TRUE(at::allclose(bt, ref));
}
}

View File

@ -1,452 +0,0 @@
#include <gtest/gtest.h>
#include <ATen/native/quantized/PackedParams.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <torch/torch.h>
#include <cmath>
#include <sstream>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
using namespace torch::indexing;
using namespace torch::jit::tensorexpr;
class Quantization : public ::testing::Test {
public:
void SetUp() override {
getTEMustUseLLVMOnCPU() = false;
}
};
TEST_F(Quantization, QuantDequantInt8) {
const auto graph_string = R"IR(
graph(%x.1 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=12]()
%3 : int = prim::Constant[value=13]()
%4 : float = prim::Constant[value=0.1]()
%q.1 : QInt8(2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2)
%6 : Float(2, 2) = aten::dequantize(%q.1)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q = at::quantize_per_tensor(x, 0.1f, 13, at::kQInt8);
auto y_expected = at::dequantize(q);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, QuantDequantUInt8) {
const auto graph_string = R"IR(
graph(%x.1 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=13]()
%3 : int = prim::Constant[value=122]()
%4 : float = prim::Constant[value=0.1]()
%q.1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2)
%6 : Float(2, 2) = aten::dequantize(%q.1)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = 2 * at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8);
auto y_expected = at::dequantize(q);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, QuantDequantUInt8_NLC) {
const auto graph_string = R"IR(
graph(%x.1 : Float(1, 2, 2, strides=[4, 1, 2], device=cpu)):
%2 : int = prim::Constant[value=13]()
%3 : int = prim::Constant[value=122]()
%4 : float = prim::Constant[value=0.1]()
%q.1 : QUInt8(1, 2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2)
%6 : Float(1, 2, 2) = aten::dequantize(%q.1)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = 2 * at::rand({1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
x.unsafeGetTensorImpl()->set_sizes_and_strides(
std::initializer_list<int64_t>{1, 2, 2}, {4, 1, 2});
auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8);
auto y_expected = at::dequantize(q);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x:\n" << x << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
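// Reference helper: invokes the quantized::add op through the dispatcher.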
at::Tensor quantized_add(
at::Tensor x1,
at::Tensor x2,
double scale,
int64_t zero) {
const auto qadd_op =
c10::Dispatcher::singleton()
.findSchemaOrThrow("quantized::add", "")
.typed<at::Tensor(at::Tensor, at::Tensor, double, int64_t)>();
return qadd_op.call(x1, x2, scale, zero);
}
TEST_F(Quantization, QuantAddDequantInt8) {
const auto graph_string = R"IR(
graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu), %x2 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=12]()
%qz1 : int = prim::Constant[value=13]()
%qs1 : float = prim::Constant[value=0.1]()
%qz2 : int = prim::Constant[value=13]()
%qs2 : float = prim::Constant[value=0.1]()
%qza : int = prim::Constant[value=13]()
%qsa : float = prim::Constant[value=0.1]()
%q1 : QInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2)
%q2 : QInt8(2, 2) = aten::quantize_per_tensor(%x2, %qs2, %qz2, %2)
%qa : QInt8(2, 2) = quantized::add(%q1, %q2, %qsa, %qza)
%6 : Float(2, 2) = aten::dequantize(%qa)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto x2 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQInt8);
auto q2 = at::quantize_per_tensor(x2, 0.1f, 13, at::kQInt8);
auto qa = quantized_add(q1, q2, 0.1f, 13);
auto y_expected = at::dequantize(qa);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x1, x2};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x1:\n" << x1 << std::endl;
std::cout << "q1:\n" << q1 << std::endl;
std::cout << "x2:\n" << x2 << std::endl;
std::cout << "q2:\n" << q2 << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, QuantAddDequantUInt8) {
const auto graph_string = R"IR(
graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu), %x2 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=13]()
%qz1 : int = prim::Constant[value=13]()
%qs1 : float = prim::Constant[value=0.1]()
%qz2 : int = prim::Constant[value=13]()
%qs2 : float = prim::Constant[value=0.1]()
%qza : int = prim::Constant[value=13]()
%qsa : float = prim::Constant[value=0.1]()
%q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2)
%q2 : QUInt8(2, 2) = aten::quantize_per_tensor(%x2, %qs2, %qz2, %2)
%qa : QUInt8(2, 2) = quantized::add(%q1, %q2, %qsa, %qza)
%6 : Float(2, 2) = aten::dequantize(%qa)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto x2 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8);
auto q2 = at::quantize_per_tensor(x2, 0.1f, 13, at::kQUInt8);
auto qa = quantized_add(q1, q2, 0.1f, 13);
auto y_expected = at::dequantize(qa);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x1, x2};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x1:\n" << x1 << std::endl;
std::cout << "q1:\n" << q1 << std::endl;
std::cout << "x2:\n" << x2 << std::endl;
std::cout << "q2:\n" << q2 << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, QuantSigmoidDequantUInt8) {
const auto graph_string = R"IR(
graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=13]()
%qz1 : int = prim::Constant[value=13]()
%qs1 : float = prim::Constant[value=0.1]()
%q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2)
%qa : QUInt8(2, 2) = aten::sigmoid(%q1)
%6 : Float(2, 2) = aten::dequantize(%qa)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8);
auto qs = at::sigmoid(q1);
auto y_expected = at::dequantize(qs);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x1};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x1:\n" << x1 << std::endl;
std::cout << "q1:\n" << q1 << std::endl;
std::cout << "qs:\n" << qs << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
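// Reference helper: invokes the quantized::mul op through the dispatcher.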
at::Tensor quantized_mul(
at::Tensor x1,
at::Tensor x2,
double scale,
int64_t zero) {
const auto op =
c10::Dispatcher::singleton()
.findSchemaOrThrow("quantized::mul", "")
.typed<at::Tensor(at::Tensor, at::Tensor, double, int64_t)>();
return op.call(x1, x2, scale, zero);
}
TEST_F(Quantization, QuantMulDequantUInt8) {
const auto graph_string = R"IR(
graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu), %x2 : Float(2, 2, strides=[2, 1], device=cpu)):
%2 : int = prim::Constant[value=13]()
%qz1 : int = prim::Constant[value=13]()
%qs1 : float = prim::Constant[value=0.1]()
%qz2 : int = prim::Constant[value=13]()
%qs2 : float = prim::Constant[value=0.1]()
%qza : int = prim::Constant[value=13]()
%qsa : float = prim::Constant[value=0.1]()
%q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2)
%q2 : QUInt8(2, 2) = aten::quantize_per_tensor(%x2, %qs2, %qz2, %2)
%qa : QUInt8(2, 2) = quantized::mul(%q1, %q2, %qsa, %qza)
%6 : Float(2, 2) = aten::dequantize(%qa)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto x2 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8);
auto q2 = at::quantize_per_tensor(x2, 0.1f, 13, at::kQUInt8);
auto qa = quantized_mul(q1, q2, 0.1f, 13);
auto y_expected = at::dequantize(qa);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x1, x2};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x1:\n" << x1 << std::endl;
std::cout << "q1:\n" << q1 << std::endl;
std::cout << "x2:\n" << x2 << std::endl;
std::cout << "q2:\n" << q2 << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, QuantUpsampleNearst2dDequantUInt8) {
const auto graph_string = R"IR(
graph(%x : Float(1, 1, 4, 4, strides=[16, 16, 4, 1], device=cpu)):
%2 : int = prim::Constant[value=13]()
%4 : NoneType = prim::Constant()
%3 : int[] = prim::Constant[value=[6, 6]]()
%qz : int = prim::Constant[value=13]()
%qs : float = prim::Constant[value=0.1]()
%q : QUInt8(1, 1, 4, 4) = aten::quantize_per_tensor(%x, %qs, %qz, %2)
%qu : QUInt8(1, 1, 6, 6) = aten::upsample_nearest2d(%q, %3, %4)
%6 : Float(1, 1, 6, 6) = aten::dequantize(%qu)
return (%6))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = at::rand({1, 1, 4, 4}, TensorOptions(kCPU).dtype(at::kFloat));
auto q = at::quantize_per_tensor(x, 0.1f, 13, at::kQUInt8);
auto qu = at::upsample_nearest2d(q, {6, 6});
auto y_expected = at::dequantize(qu);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x:\n" << x << std::endl;
std::cout << "q:\n" << q << std::endl;
std::cout << "qu:\n" << qu << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
TEST_F(Quantization, UpsampleNearst2d) {
const auto graph_string = R"IR(
graph(%x : Float(1, 1, 2, 2, strides=[2, 2, 2, 1], device=cpu)):
%4 : NoneType = prim::Constant()
%3 : int[] = prim::Constant[value=[4, 4]]()
%u : Float(1, 1, 4, 4) = aten::upsample_nearest2d(%x, %3, %4)
return (%u))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = at::rand({1, 1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto y_expected = at::upsample_nearest2d(x, {4, 4});
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto y = stack[0].toTensor();
bool check = at::allclose(y_expected, y);
if (!check) {
std::cout << "x:\n" << x << std::endl;
std::cout << "y_expected:\n" << y_expected << std::endl;
std::cout << "y:\n" << y << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
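// Reference helper for quantized::cat. Unlike the add/mul helpers above, this
// one uses redispatch with an explicit QuantizedCPU key set (presumably to
// route the call directly to the quantized CPU kernel).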
at::Tensor quantized_cat(
c10::List<at::Tensor> const& xs,
int64_t dim,
double scale,
int64_t zero) {
const auto op = c10::Dispatcher::singleton()
.findSchemaOrThrow("quantized::cat", "")
.typed<at::Tensor(
c10::List<at::Tensor> const&,
int64_t,
std::optional<double>,
std::optional<int64_t>)>();
return op.redispatch(
DispatchKeySet({DispatchKey::QuantizedCPU}), xs, dim, scale, zero);
}
TEST_F(Quantization, QuantCatDequantUInt8) {
const auto graph_string = R"IR(
graph(%x : Float(1, 1, 2, 2, strides=[2, 2, 2, 1], device=cpu), %y : Float(1, 1, 2, 2, strides=[2, 2, 2, 1], device=cpu), %z : Float(1, 1, 2, 2, strides=[2, 2, 2, 1], device=cpu)):
%qdt : int = prim::Constant[value=13]()
%qxz : int = prim::Constant[value=13]()
%qxs : float = prim::Constant[value=0.1]()
%qyz : int = prim::Constant[value=16]()
%qys : float = prim::Constant[value=0.15]()
%qzz : int = prim::Constant[value=19]()
%qzs : float = prim::Constant[value=0.2]()
%qx : QUInt8(1, 1, 2, 2) = aten::quantize_per_tensor(%x, %qxs, %qxz, %qdt)
%qy : QUInt8(1, 1, 2, 2) = aten::quantize_per_tensor(%y, %qys, %qyz, %qdt)
%qz : QUInt8(1, 1, 2, 2) = aten::quantize_per_tensor(%z, %qzs, %qzz, %qdt)
%catx : Tensor[] = prim::ListConstruct(%qx, %qy, %qz)
%catd : int = prim::Constant[value=0]()
%qcat : QUInt8(3, 1, 2, 2) = quantized::cat(%catx, %catd, %qxs, %qxz)
%cat : Float(3, 1, 2, 2) = aten::dequantize(%qcat)
return (%cat))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
auto x = at::rand({1, 1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto y = at::rand({1, 1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto z = at::rand({1, 1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat));
auto qx = at::quantize_per_tensor(x, 0.1f, 13, at::kQUInt8);
auto qy = at::quantize_per_tensor(y, 0.15f, 16, at::kQUInt8);
auto qz = at::quantize_per_tensor(z, 0.2f, 19, at::kQUInt8);
auto qcat = quantized_cat({qx, qy, qz}, 0, 0.1f, 13);
auto expected = at::dequantize(qcat);
TensorExprKernel k(graph);
std::vector<at::Tensor> inputs = {x, y, z};
StmtPtr s = k.getCodeGenStmt();
std::vector<IValue> stack = fmap<IValue>(inputs);
k.run(stack);
auto result = stack[0].toTensor();
bool check = at::allclose(expected, result);
if (!check) {
std::cout << "x:\n" << x << std::endl;
std::cout << "y:\n" << y << std::endl;
std::cout << "z:\n" << z << std::endl;
std::cout << "qx:\n" << qx << std::endl;
std::cout << "qy:\n" << qy << std::endl;
std::cout << "qz:\n" << qz << std::endl;
std::cout << "qcat:\n" << qcat << std::endl;
std::cout << "expected:\n" << expected << std::endl;
std::cout << "result:\n" << result << std::endl;
}
TORCH_CHECK_EQ(check, 1);
}
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,402 +0,0 @@
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <torch/csrc/jit/codegen/fuser/interface.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <sstream>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
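// RAII guard: enables (or disables) the CPU fuser for the duration of a test
// and restores the previous setting on destruction.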
struct WithCPUFuser {
WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) {
overrideCanFuseOnCPU(val);
}
~WithCPUFuser() {
overrideCanFuseOnCPU(cpuFuserEnabled);
}
bool cpuFuserEnabled;
};
TEST(TEFuserPass, FuserPass_1) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%0 : Float(128, strides=[1], device=cpu),
%1 : Float(128, strides=[1], device=cpu)):
%12 : int = prim::Constant[value=1]()
%2.1 : Float(128, strides=[1], device=cpu) = aten::mul(%0, %1)
%2 : Float(128, strides=[1], device=cpu) = aten::mul(%2.1, %1)
%3 : Float(128, strides=[1], device=cpu) = aten::add_(%2, %1, %12)
%4 : Float(128, strides=[1], device=cpu) = aten::mul(%2, %1)
%5 : Float(128, strides=[1], device=cpu) = aten::add(%2, %4, %12)
return (%5))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g);
// We should not be able to fuse across the in-place operation here.
testing::FileCheck()
.check("prim::TensorExprGroup_")
->check("aten::add_")
->check("prim::TensorExprGroup_")
->run(*g);
}
TEST(TEFuserPass, FuserPass_2) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%0 : Float(128, strides=[1], device=cpu),
%1 : Float(128, strides=[1], device=cpu)):
%12 : int = prim::Constant[value=1]()
%a : Float(128, strides=[1], device=cpu) = aten::mul(%0, %1)
%b : Float(128, strides=[1], device=cpu) = aten::add(%0, %1, %12)
%c : Float(128, strides=[1], device=cpu) = aten::add_(%b, %1, %12)
%d : Float(128, strides=[1], device=cpu) = aten::mul(%c, %a)
return (%d))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g);
// We should not be able to fuse across the in-place operation here.
testing::FileCheck()
.check("aten::add_")
->check("prim::TensorExprGroup_0")
->run(*g);
}
TEST(TEFuserPass, FuserPass_3) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(128, strides=[1], device=cpu),
%y : Float(128, strides=[1], device=cpu)):
%r : Float(128, strides=[1], device=cpu) = aten::mul(%x, %y)
return (%r))IR";
{
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
// We should not create a fusion group since its size would be too small
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
{
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// We should create a fusion group since its size is above the threshold
testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
}
}
TEST(TEFuserPass, FuserPass_0DimInput) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(device=cpu),
%y : Float(device=cpu)):
%one : int = prim::Constant[value=1]()
%a : Float(device=cpu) = aten::mul(%x, %y)
%b : Float(device=cpu) = aten::add(%x, %a, %one)
return (%b))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g);
// We should fuse 0-dim tensors too
testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, FuserPass_UnfusibleDevice) {
WithCPUFuser cf(false);
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(10, strides=[1], device=cpu)):
%a : Float(10, strides=[1], device=cpu) = aten::mul(%x, %y)
return (%a))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// Test that we're not starting fusion groups from nodes with unfusible device
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, FuserPass_UnknownShapes) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Tensor,
%y : Tensor):
%a : Tensor = aten::mul(%x, %y)
%b : Tensor = aten::mul(%x, %a)
return (%b))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g);
// Test that we're not generating fusion groups when shapes are not known
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, FuserPass_Multidevice) {
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
return (%cat))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// We should be able to fuse this
testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
}
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cuda:0),
%z : Float(30, strides=[1], device=cpu)):
%dim : int = prim::Constant[value=0]()
%xyz_list : Tensor[] = prim::ListConstruct(%x, %y, %z)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xyz_list, %dim)
return (%cat))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// We should not fuse this aten::cat since its inputs are from different
// devices
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(10, strides=[1], device=cuda:0)):
%dim : int = prim::Constant[value=0]()
%xy_list : Tensor[] = prim::ListConstruct(%x, %y)
%xy_cat : Float(30, strides=[1], device=cpu) = aten::cat(%xy_list, %dim)
%r : Float(30, strides=[1], device=cpu) = aten::mul(%xy_cat, %z)
return (%r))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
// Test that we check device before merging one node (cat) into another
// (mul)
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cpu),
%z : Float(10, strides=[1], device=cuda:0)):
%z2 : Tensor = aten::mul(%z, %z)
%dim : int = prim::Constant[value=0]()
%xy_list : Tensor[] = prim::ListConstruct(%x, %y, %z2)
%cat : Float(60, strides=[1], device=cpu) = aten::cat(%xy_list, %dim)
return (%cat))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
// Test that we check device before merging one node (mul) into another
// (cat)
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cpu),
%y : Float(20, strides=[1], device=cuda:0)):
%r : Float(10, strides=[1], device=cpu) = aten::mul(%x, %y)
return (%r))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// We should not fuse this graph since its inputs are from different devices
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
{
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(10, strides=[1], device=cuda:0),
%y : Float(20, strides=[1], device=cuda:1),
%z : Float(20, strides=[1], device=cpu)):
%x2 : Float(10, strides=[1], device=cpu) = aten::mul(%x, %x)
%y2 : Float(10, strides=[1], device=cpu) = aten::mul(%y, %y)
%z2 : Float(10, strides=[1], device=cpu) = aten::mul(%z, %z)
return (%x2, %y2, %z2))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
// We should not fuse these two computations since they use different
// devices
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
}
TEST(TEFuserPass, FuserPass_MergeGroups) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%a : Float(128, strides=[1], device=cpu),
%b : Float(128, strides=[1], device=cpu)):
%x : Float(128, strides=[1], device=cpu) = aten::mul(%a, %a)
%y : Float(128, strides=[1], device=cpu) = aten::mul(%b, %b)
return (%x, %y))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 1);
// The %x and %y computations are completely independent and yet we should put
// them into a single fusion group rather than having two separate ones.
testing::FileCheck()
.check("= prim::TensorExprGroup_")
->check_not("= prim::TensorExprGroup_")
->run(*g);
}
TEST(TEFuserPass, FuserPass_IgnoreUnknownShapeAtStart) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Bool(8, strides=[1], device=cpu),
%y : Bool(8, strides=[1], device=cpu)):
%a : Bool(8, strides=[1], device=cpu) = aten::__and__(%x, %y)
%b : Tensor = aten::__or__(%a, %y)
return (%b)
)IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, FuserPass_Where) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(8, strides=[1], device=cpu),
%y : Float(8, strides=[1], device=cpu),
%z : Float(8, strides=[1], device=cpu)):
%cond : Bool(8, strides=[1], device=cpu) = aten::eq(%x, %y)
%b : Float(8, strides=[1], device=cpu) = aten::where(%cond, %y, %z)
return (%b)
)IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, FuserPass_WhereList) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%x : Float(8, strides=[1], device=cpu),
%y : Float(8, strides=[1], device=cpu),
%z : Float(8, strides=[1], device=cpu)):
%cond : Bool(8, strides=[1], device=cpu) = aten::eq(%x, %y)
%b : Tensor[] = aten::where(%cond)
return (%b)
)IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(g, /* min_group_size= */ 2);
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, DynamicShapeFusion) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%0 : Float(10, 5, strides=[5, 1], device=cpu),
%1 : Float(10, 5, strides=[5, 1], device=cpu)):
%2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1)
%3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1)
return (%3))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(
g,
/* min_group_size = */ 2,
/* add_composed_op = */ true,
/* fuse_to_dynamic_shapes = */ true);
Code code(g, "");
testing::FileCheck()
.check("prim::TensorExprDynamicGroup_")
->check("prim::TensorExprDynamicGuard")
->check("prim::TensorExprGroup_")
->run(*g);
auto run_and_compare = [&](const std::vector<at::Tensor>& inputs) {
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]);
InterpreterState interp(code);
Stack stack(inputs.begin(), inputs.end());
interp.run(stack);
at::Tensor out = pop(stack).toTensor();
ASSERT_TRUE(at::allclose(out, ref));
};
std::vector<at::Tensor> inputs = {at::rand({10, 5}), at::rand({10, 5})};
run_and_compare(inputs);
std::vector<at::Tensor> inputs2 = {at::rand({20, 5}), at::rand({20, 5})};
run_and_compare(inputs2);
std::vector<at::Tensor> inputs3 = {at::rand({25, 60}), at::rand({25, 60})};
run_and_compare(inputs3);
}
} // namespace jit
} // namespace torch

View File

@ -1,202 +0,0 @@
#include <gtest/gtest.h>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h"
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
TEST(Type, Test01) {
{
Dtype dt1 = kInt;
ASSERT_EQ(dt1, kInt);
}
{
Dtype dt2_a(kInt, 8);
Dtype dt2_b(kInt, 4);
Dtype dt2_c(ScalarType::Int, 8);
ASSERT_EQ(dt2_a, dt2_c);
ASSERT_NE(dt2_a, dt2_b);
}
{
ASSERT_EQ(kInt, ToDtype<int>());
ASSERT_EQ(kFloat, ToDtype<float>());
ASSERT_EQ(kByte, ToDtype<uint8_t>());
ASSERT_EQ(kChar, ToDtype<int8_t>());
ASSERT_EQ(kShort, ToDtype<int16_t>());
ASSERT_EQ(kLong, ToDtype<int64_t>());
ASSERT_EQ(kHalf, ToDtype<at::Half>());
ASSERT_EQ(kDouble, ToDtype<double>());
ASSERT_EQ(kBool, ToDtype<bool>());
}
{
Dtype int32x8(kInt, 8);
Dtype float32x8(kFloat, 8);
ASSERT_NE(int32x8, float32x8);
ASSERT_EQ(float32x8, BinaryOpDtype(int32x8, float32x8));
ASSERT_EQ(float32x8, BinaryOpDtype(float32x8, int32x8));
ASSERT_EQ(int32x8, BinaryOpDtype(int32x8, int32x8));
ASSERT_EQ(float32x8, BinaryOpDtype(float32x8, float32x8));
}
}
TEST(Type, BitCasting) {
{
VarHandle x("x", kFloat);
ExprHandle y = bitcast<int32_t>(x);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
ASSERT_EQ(y.dtype(), kInt);
}
{
VarHandle x("x", kInt);
ExprHandle y = bitcast<float>(x);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
ASSERT_EQ(y.dtype(), kFloat);
}
{
VarHandle x("x", kShort);
ExprHandle y = bitcast<at::Half>(x);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
ASSERT_EQ(y.dtype(), kHalf);
}
{
VarHandle x("x", kHalf);
ExprHandle y = bitcast<int16_t>(x);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
ASSERT_EQ(y.dtype(), kShort);
}
constexpr int32_t ref32 = 1337;
constexpr int64_t ref64 = 1337;
constexpr float reff32 = 1337.0f;
constexpr double reff64 = 1337.0;
using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
// this is broken
/*{
constexpr int16_t ref16 = 1337;
at::Half k_;
at::Half* k = &k_;
*reinterpret_cast<int16_t*>(k) = ref16;
auto a = HalfImm::make(*k);
auto b = BitCast::make(kShort, a);
SimpleIRExprEval cg(b);
ASSERT_EQ(cg.value<int16_t>(), ref16);
}*/
{
float k = raw_bitcast<float>(ref32);
auto a = FloatImm::make(k);
auto b = BitCast::make(kInt, a);
SimpleIRExprEval cg(b);
ASSERT_EQ(cg.value<int32_t>(), ref32);
}
{
double k = raw_bitcast<double>(ref64);
auto a = DoubleImm::make(k);
auto b = BitCast::make(kLong, a);
SimpleIRExprEval cg(b);
ASSERT_EQ(cg.value<int64_t>(), ref64);
}
{
int64_t k = raw_bitcast<int64_t>(reff64);
auto a = LongImm::make(k);
auto b = BitCast::make(kDouble, a);
SimpleIRExprEval cg(b);
ASSERT_EQ(cg.value<double>(), reff64);
}
{
int32_t k = raw_bitcast<int32_t>(reff32);
auto a = IntImm::make(k);
auto b = BitCast::make(kFloat, a);
SimpleIRExprEval cg(b);
ASSERT_EQ(cg.value<float>(), reff32);
}
// This segfaults :(
/*{
VarHandle x("x", kDouble);
ASSERT_ANY_THROW(ExprHandle y = bitcast<int32_t>(x));
}
{
VarHandle x("x", kFloat);
ASSERT_ANY_THROW(ExprHandle y = bitcast<int64_t>(x));
}
{
VarHandle x("x", kLong);
ASSERT_ANY_THROW(ExprHandle y = bitcast<float>(x));
}
{
VarHandle x("x", kShort);
ASSERT_ANY_THROW(ExprHandle y = bitcast<float>(x));
}
{
VarHandle x("x", kInt);
ASSERT_ANY_THROW(ExprHandle y = bitcast<at::Half>(x));
}*/
}
TEST(Type, Propagation) {
// Same types:
{
VarHandle x("x", kFloat);
VarHandle y("y", kFloat);
ExprHandle body = FloatImm::make(2.f) +
(x * FloatImm::make(3.f) + FloatImm::make(4.f) * y);
ASSERT_EQ(body.dtype(), kFloat);
}
// Int to bigger int:
{
VarHandle x("x", kShort);
VarHandle y("y", kLong);
ExprHandle body =
ShortImm::make(2.f) + (x * ShortImm::make(3) + ShortImm::make(4) * y);
ASSERT_EQ(body.dtype(), kLong);
}
// Float to bigger float:
{
VarHandle x("x", kHalf);
VarHandle y("y", kDouble);
ExprHandle body =
HalfImm::make(2.f) + (x * HalfImm::make(3) + HalfImm::make(4) * y);
ASSERT_EQ(body.dtype(), kDouble);
}
// Int to Float:
{
VarHandle x("x", kFloat);
VarHandle y("y", kInt);
ExprHandle body =
IntImm::make(2) + (x * IntImm::make(3) + IntImm::make(4) * y);
ASSERT_EQ(body.dtype(), kFloat);
}
// Smaller float, bigger Int:
{
VarHandle x("x", kHalf);
VarHandle y("y", kLong);
ExprHandle body =
HalfImm::make(2) + (x * HalfImm::make(3) + HalfImm::make(4) * y);
ASSERT_EQ(body.dtype(), kHalf);
}
// Bigger float, smaller Int:
{
VarHandle x("x", kChar);
VarHandle y("y", kDouble);
ExprHandle body =
CharImm::make(2) + (x * CharImm::make(3) + CharImm::make(4) * y);
ASSERT_EQ(body.dtype(), kDouble);
}
// Sign change char/byte upgrades to short:
{
VarHandle x("x", kChar);
VarHandle y("y", kByte);
ExprHandle body =
CharImm::make(2) + (x * CharImm::make(3) + CharImm::make(4) * y);
ASSERT_EQ(body.dtype(), kShort);
}
}
} // namespace jit
} // namespace torch

View File

@ -1,75 +0,0 @@
#include <gtest/gtest.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/frontend/ir_emitter.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/pass_manager.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
// Test that tensor type specializations are available in
// the custom passes
namespace torch {
namespace jit {
namespace {
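// Recursively checks whether any value in the block (inputs, node outputs, or
// values in nested blocks) carries a tensor type specialization.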
bool hasTensorTypeSpecializations(torch::jit::Block* block) {
for (Value* v : block->inputs()) {
if (hasTensorTypeSpecialization(v))
return true;
}
for (Node* n : block->nodes()) {
for (torch::jit::Block* b : n->blocks()) {
if (hasTensorTypeSpecializations(b))
return true;
}
for (Value* v : n->outputs()) {
if (hasTensorTypeSpecialization(v))
return true;
}
}
return false;
}
static bool hasSpecializations = false;
void detectTTSpecializationPass(std::shared_ptr<Graph>& graph) {
GRAPH_DUMP("In detectTTSpecialization Custom Post Pass: ", graph);
hasSpecializations = hasTensorTypeSpecializations(graph->block());
}
} // namespace
TEST(SpecializationsInCustomPasses, Basic) {
RegisterPass p(detectTTSpecializationPass);
hasSpecializations = false;
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
parseIR(
R"IR(
graph(%a.1 : Tensor,
%b.1 : Tensor):
%c.1 : Tensor = aten::mul(%a.1, %b.1) # misc/test_specializations.py:5:8
%d.1 : Tensor = aten::mul(%c.1, %b.1) # misc/test_specializations.py:6:8
return (%d.1)
)IR",
&*graph);
IValue ival = IValue(torch::randn({22}, at::kCPU));
std::vector<IValue> stack = {ival, ival};
auto run = [&](std::shared_ptr<Graph>& graph, std::vector<IValue> stack) {
GraphExecutor executor(graph, "");
executor.run(stack);
return stack;
};
run(graph, stack);
// Profiling mode will not be run with simple executor
if (!getExecutorMode()) {
EXPECT_TRUE(hasSpecializations);
}
}
} // namespace jit
} // namespace torch

View File

@ -1,78 +0,0 @@
#pragma once
#include <memory>
#include <vector>
#include <test/cpp/tensorexpr/test_base.h>
#include <torch/csrc/jit/tensorexpr/fwd_decls.h>
#include <torch/csrc/jit/testing/file_check.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
#define IS_NODE(T, node) \
{ \
auto node_ = to<T>(node); \
ASSERT_NE(nullptr, node_); \
}
#define IS_NODE_WITH_NAME(T, node, name) \
auto name = to<T>(node); \
ASSERT_NE(nullptr, name);
#define IS_NODE_WITH_NAME_AND_CAST(T, node, name, Type) \
NodePtr<T> name = nullptr; \
{ \
auto node_ = to<Cast>(node); \
ASSERT_NE(nullptr, node_); \
ASSERT_EQ(node_->dtype().scalar_type(), ScalarType::Type); \
name = to<T>(node_->src_value()); \
} \
ASSERT_NE(nullptr, name);
#define IS_IMM_WITH_VAL(T, node, val) \
{ \
auto node_ = to<T##Imm>(node); \
ASSERT_NE(nullptr, node_); \
ASSERT_EQ(node_->value(), val); \
}
#define IS_VAR_WITH_NAME(node, name) \
{ \
auto node_ = to<Var>(node); \
ASSERT_NE(nullptr, node_); \
ASSERT_EQ(node_->name_hint(), name); \
}
#define IS_BINOP_W_VARS(T, node, name, v1, v2) \
NodePtr<T> name = nullptr; \
{ \
name = to<T>(node); \
ASSERT_NE(nullptr, name); \
IS_VAR_WITH_NAME(name->lhs(), v1); \
IS_VAR_WITH_NAME(name->rhs(), v2); \
}
#define IS_BINOP_W_CONST(T, node, name, v, c) \
NodePtr<T> name = nullptr; \
{ \
name = to<T>(node); \
ASSERT_NE(nullptr, name); \
IS_VAR_WITH_NAME(name->lhs(), v); \
IS_IMM_WITH_VAL(Int, name->rhs(), c); \
}
#define IS_RAND(node) \
{ \
auto node_ = to<Intrinsics>(node); \
ASSERT_NE(nullptr, node_); \
ASSERT_EQ(node_->op_type(), kRand); \
}
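// FileCheck-style helpers: verify that the printed IR of a statement or
// expression matches the given pattern.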
void checkIR(StmtPtr s, const std::string& pattern);
void checkExprIR(ExprPtr e, const std::string& pattern);
void checkExprIR(const ExprHandle& e, const std::string& pattern);
} // namespace jit
} // namespace torch

View File

@ -1,542 +0,0 @@
// *** Tensor Expressions ***
//
// This tutorial covers basics of NNC's tensor expressions, shows basic APIs to
// work with them, and outlines how they are used in the overall TorchScript
// compilation pipeline. This doc is permanently a "work in progress" since NNC
// is under active development and things change fast.
//
// This Tutorial's code is compiled in the standard pytorch build, and the
// executable can be found in `build/bin/tutorial_tensorexpr`.
//
// *** What is NNC ***
//
// NNC stands for Neural Net Compiler. It is a component of TorchScript JIT
// and it performs on-the-fly code generation for kernels, which are often a
// combination of multiple aten (torch) operators.
//
// When the JIT interpreter executes a torchscript model, it automatically
// extracts subgraphs from the torchscript IR graph for which specialized code
// can be JIT generated. This usually improves performance as the 'combined'
// kernel created from the subgraph can avoid memory traffic that is
// unavoidable when the subgraph is interpreted as-is, operator by operator.
// This optimization is often referred to as 'fusion'. Relatedly, the process of
// finding and extracting subgraphs suitable for NNC code generation is done by
// a JIT pass called 'fuser'.
//
// *** What is TE ***
//
// TE stands for Tensor Expressions. TE is a commonly used approach for
// compiling kernels performing tensor (~matrix) computation. The idea behind it
// is that operators are represented as a mathematical formula describing what
// computation they do (as TEs) and then the TE engine can perform mathematical
// simplification and other optimizations using those formulas and eventually
// generate executable code that would produce the same results as the original
// sequence of operators, but more efficiently.
//
// NNC's design and implementation of TE was heavily inspired by Halide and TVM
// projects.
#include <iostream>
#include <string>
#include <c10/util/irange.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/expr.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/stmt.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
#ifdef TORCH_ENABLE_LLVM
// Helper function to print a snippet from a big multi-line string
static void printLinesToFrom(const std::string& input_str, int from, int to);
#endif
int main(int argc, char* argv[]) {
std::cout << "*** Structure of tensor expressions and statements ***"
<< std::endl;
{
// A tensor expression is a tree of expressions. Each expression has a type,
// and that type defines what sub-expressions the current expression has.
// For instance, an expression of type 'Mul' would have a type 'kMul' and
// two subexpressions: LHS and RHS. Each of these two sub-expressions could
// also be a 'Mul' or some other expression.
//
// Let's construct a simple TE:
ExprPtr lhs = alloc<IntImm>(5);
ExprPtr rhs = alloc<Var>("x", kInt);
ExprPtr mul = alloc<Mul>(lhs, rhs);
std::cout << "Tensor expression: " << *mul << std::endl;
// Prints: Tensor expression: 5 * x
// Here we created an expression representing a 5*x computation, where x is
// an int variable.
// Another, probably a more convenient, way to construct tensor expressions
// is to use so-called expression handles (as opposed to raw expressions
// like we did in the previous example). Expression handles overload common
// operations and allow us to express the same semantics in a more natural
// way:
ExprHandle l = 5;
ExprHandle r = Var::make("x", kInt);
ExprHandle m = l * r;
std::cout << "Tensor expression: " << *m.node() << std::endl;
// Prints: Tensor expression: 5 * x
// Converting from handles to raw expressions and back is easy:
ExprHandle handle = Var::make("x", kInt);
ExprPtr raw_expr_from_handle = handle.node();
ExprPtr raw_expr = alloc<Var>("x", kInt);
ExprHandle handle_from_raw_expr = ExprHandle(raw_expr);
// We could construct arbitrarily complex expressions using mathematical
// and logical operations, casts between various data types, and a bunch of
// intrinsics.
ExprHandle a = Var::make("a", kInt);
ExprHandle b = Var::make("b", kFloat);
ExprHandle c = Var::make("c", kFloat);
ExprHandle x = ExprHandle(5) * a + b / (sigmoid(c) - 3.0f);
std::cout << "Tensor expression: " << *x.node() << std::endl;
// Prints: Tensor expression: float(5 * a) + b / ((sigmoid(c)) - 3.f)
// An ultimate purpose of tensor expressions is to optimize tensor
// computations, and in order to represent accesses to tensor data, there
// is a special kind of expression - a load.
// To construct a load we need two pieces: the base and the indices. The
// base of a load is a Buf expression, which could be thought of as a
// placeholder similar to Var, but with dimensions info.
//
// Let's construct a simple load:
BufHandle A("A", {64, 32}, kInt);
VarPtr i_var = alloc<Var>("i", kInt), j_var = alloc<Var>("j", kInt);
ExprHandle i(i_var), j(j_var);
ExprHandle load = Load::make(A.dtype(), A, {i, j});
std::cout << "Tensor expression: " << *load.node() << std::endl;
// Prints: Tensor expression: A[i, j]
// Tensor Expressions constitute Tensor Statements, which are used to
// represent computation of a given operator or a group of operators from a
// fusion group.
//
// There are three main kinds of tensor statements:
// - block
// - store
// - loop
//
// A Store represents a store to a single element of a tensor (or to a
// group of elements if it's a vectorized store). Store statements,
// similarly to Load expressions, have a base and indices, but on top of
// that they also include a value - an expression representing what needs
// to be stored at the given memory location. Let's create a Store stmt:
StmtPtr store_a = Store::make(A, {i, j}, i + j);
std::cout << "Store statement: " << *store_a << std::endl;
// Prints: Store statement: A[i, j] = i + j;
// An operator fills the entire tensor, not just a single element, and to
// represent this we need to use a For stmt: let's wrap our store stmt with
// two nested loops to represent that variables i and j need to iterate
// over some ranges.
ForPtr loop_j_a = For::make(VarHandle(j_var), 0, 32, store_a);
ForPtr loop_i_a = For::make(VarHandle(i_var), 0, 64, loop_j_a);
std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl;
// Prints:
// Nested for loops:
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// A[i, j] = i + j;
// }
// }
// A Block statement is used when we need a sequence of other statements.
// E.g. if a fusion group contains several operators, we initially define
// a separate loopnest for each of them and put them all into a common block:
BufHandle B("B", {64, 32}, kInt);
StmtPtr store_b = Store::make(B, {i, j}, A.load(i, j));
ForPtr loop_j_b = For::make(VarHandle(j_var), 0, 32, store_b);
ForPtr loop_i_b = For::make(VarHandle(i_var), 0, 64, loop_j_b);
BlockPtr block = Block::make({loop_i_a, loop_i_b});
std::cout << "Compound Block statement: " << std::endl
<< *block << std::endl;
// Prints:
// Compound Block statement:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// A[i, j] = i + j;
// }
// }
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// B[i, j] = A[i, j];
// }
// }
// }
// Manually constructing nested loops and blocks to represent a computation
// can be laborious; instead, we can use the 'Compute' API. This API
// requires us to specify dimensions and a lambda computing a single
// element of the resulting tensor, and it returns a `Tensor` structure.
// This structure is simply a pair of a buffer created to represent the
// result of the computation (BufPtr) and a statement representing the
// computation itself (StmtPtr).
Tensor C =
Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
return i * j;
});
std::cout << "Stmt produced by 'Compute' API: " << std::endl
<< *C.stmt() << std::endl;
// Prints:
// Stmt produced by 'Compute' API:
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// C[i, j] = i * j;
// }
// }
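// For illustration, both pieces of the Tensor structure can be accessed
// directly: we already used C.stmt() above, and C.buf() appears later in
// this tutorial when we need the buffer holding the result.
BufPtr c_result_buf = C.buf();
StmtPtr c_result_stmt = C.stmt();
// (The casts to void merely silence unused-variable warnings.)
(void)c_result_buf;
(void)c_result_stmt;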
// To construct statements to represent computations with reductions, we
// can use a 'Reduce' API - it is similar to 'Compute' but takes a couple
// of extra arguments defining how to perform the reduction. Let's define a
// simple 2D sum of C using that:
Tensor D = Reduce(
"D",
{},
Sum(),
[&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); },
{64, 32});
std::cout << "Stmt produced by 'Reduce' API: " << std::endl
<< *D.stmt() << std::endl;
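// Reductions do not have to collapse every dimension. As a sketch (using
// the same 'Reduce' overload and assuming the usual convention that the
// leading lambda arguments index the output and the remaining ones index
// the reduced axes), a per-row sum of C would look like:
Tensor E = Reduce(
"E",
{64},
Sum(),
[&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); },
{32});
std::cout << "Stmt produced by 'Reduce' API (per-row sum): " << std::endl
<< *E.stmt() << std::endl;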
}
std::cout << "*** Loopnests transformations ***" << std::endl;
{
// When a statement for the computation is generated, we might want to
// apply some optimizations to it. These transformations allow us to end up
// with a statement producing the same results, but more efficiently.
//
// Let's look at a couple of transformations that are used in NNC. We will
// begin with constructing a Block statement like we did before.
Tensor C =
Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
return i * (j + 1);
});
BufHandle c_buf(C.buf());
Tensor D =
Compute("D", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
return c_buf.load(i, j) - i;
});
StmtPtr block = Block::make({C.stmt(), D.stmt()});
std::cout << "Stmt produced by 'Compute' API: " << std::endl
<< *block << std::endl;
// Prints:
// Stmt produced by 'Compute' API:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// C[i, j] = i * (j + 1);
// }
// }
// for (const auto i_1 : c10::irange(64)) {
// for (const auto j_1 : c10::irange(32)) {
// D[i_1, j_1] = (C[i_1, j_1]) - i_1;
// }
// }
// }
// One transformation we can apply to this computation is inlining: i.e.
// taking the expression that defines the values of C and substituting it
// for the loads from C.
// To do that, we first need to create a special object called LoopNest -
// all transformations are methods of this class. To create a loopnest we
// need to provide a list of output buffers and the root statement:
LoopNest nest(block, {D.buf()});
// We can always retrieve the Stmt back from LoopNest:
std::cout << "LoopNest root stmt: " << std::endl
<< *nest.root_stmt() << std::endl;
// Prints:
// LoopNest root stmt:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// C[i, j] = i * (j + 1);
// }
// }
// for (const auto i_1 : c10::irange(64)) {
// for (const auto j_1 : c10::irange(32)) {
// D[i_1, j_1] = (C[i_1, j_1]) - i_1;
// }
// }
// }
// Now we can apply the inlining transformation:
nest.computeInline(C.buf());
std::cout << "Stmt after inlining:" << std::endl
<< *nest.root_stmt() << std::endl;
// Prints:
// Stmt after inlining:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// D[i, j] = i * (j + 1) - i;
// }
// }
// }
// We can also apply algebraic simplification to a statement:
StmtPtr simplified = IRSimplifier::simplify(nest.root_stmt());
std::cout << "Stmt after simplification:" << std::endl
<< *simplified << std::endl;
// Prints:
// Stmt after simplification:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// D[i, j] = i * j;
// }
// }
// }
// Many loopnest transformations are stateless and can be applied without
// creating a LoopNest object. In fact, we plan to make all transformations
// stateless.
// splitWithTail is one such transformation: it splits the iteration space
// of a given loop by a given factor into an outer loop, an inner loop, and
// a tail loop for the remainder.
ForPtr outer_loop = to<For>(to<Block>(simplified)->stmts().front());
LoopNest::splitWithTail(outer_loop, 13);
// Call simplifier once more to fold some arithmetic.
simplified = IRSimplifier::simplify(simplified);
std::cout << "Stmt after splitWithTail:" << std::endl
<< *simplified << std::endl;
// Prints:
// Stmt after splitWithTail:
// {
// for (const auto i_outer : c10::irange(4)) {
// for (const auto i_inner : c10::irange(13)) {
// for (const auto j : c10::irange(32)) {
// D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j);
// }
// }
// }
// for (const auto i_tail : c10::irange(12)) {
// for (const auto j : c10::irange(32)) {
// D[i_tail + 52, j] = i_tail * j + 52 * j;
// }
// }
// }
// NNC supports a wide range of loop nest transformations, which we are not
// listing here. Please refer to documentation in
// https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/loopnest.h
// for more details.
}
std::cout << "*** Codegen ***" << std::endl;
{
// The ultimate goal of tensor expressions is to provide a mechanism to
// execute a given computation in the fastest possible way. So far we've
// looked at how to describe the computation we're interested in, but we
// haven't looked at how to actually execute it.
//
// Everything we've dealt with so far has been purely symbolic, with no
// actual data attached; in this section we look at how to bridge that gap.
// Let's start by constructing a simple computation for us to work with:
BufHandle A("A", {64, 32}, kInt);
BufHandle B("B", {64, 32}, kInt);
Tensor X =
Compute("X", {64, 32}, [&](const VarHandle& i, const VarHandle& j) {
return A.load(i, j) + B.load(i, j);
});
// Now let's lower it to a loop nest, as we did in the previous section. We
// can pass the Tensor object directly:
LoopNest loopnest({X});
std::cout << *loopnest.root_stmt() << std::endl;
// Prints:
// {
// for (const auto i : c10::irange(64)) {
// for (const auto j : c10::irange(32)) {
// X[i, j] = (A[i, j]) + (B[i, j]);
// }
// }
// Now imagine that we have two actual 64x32 tensors that we want to sum
// together: how do we pass those tensors to the computation, and how do we
// carry it out?
//
// A Codegen object provides exactly that functionality. Codegen is an
// abstract class, and concrete codegens are derived from it.
// Currently, we have three codegens:
// 1) Simple Evaluator,
// 2) LLVM Codegen for CPU,
// 3) CUDA Codegen.
// In this example we will be using Simple Evaluator, since it's available
// everywhere.
// To create a codegen, we need to provide the statement - it specifies the
// computation we want to perform - and a list of placeholders and tensors
// used in the computation. The latter part is crucial, since it is the only
// way for the codegen to correlate symbols in the statement with the actual
// data arrays that we will pass in when we run the computation.
//
// Let's create a Simple IR Evaluator codegen for our computation:
SimpleIREvaluator ir_eval(loopnest.root_stmt(), {A, B, X});
// We are using the simplest codegen, which does almost no work at
// construction time. Real codegens such as CUDA and LLVM perform
// compilation during that stage, so that everything is ready by the time we
// run the computation.
// Let's now create some inputs and run our computation with them:
std::vector<int> data_A(64 * 32, 3); // This will be the input A
std::vector<int> data_B(64 * 32, 5); // This will be the input B
std::vector<int> data_X(64 * 32, 0); // This will be used for the result
// Now let's invoke our codegen to perform the computation on our data. We
// need to provide as many arguments as there were placeholders and tensors
// at codegen construction time. The position in each list defines how the
// real data arrays passed to the call (referred to as 'CallArg's in our
// codebase) correspond to the symbols - placeholders and tensors - used in
// the tensor expressions we constructed (referred to as 'BufferArg's).
// Thus, we will provide three arguments: data_A, data_B, and data_X. data_A
// holds the data for placeholder A, data_B the data for placeholder B, and
// data_X will receive the contents of tensor X.
ir_eval(data_A, data_B, data_X);
// Let's print one of the elements from each array to verify that the
// computation did happen:
std::cout << "A[10] = " << data_A[10] << std::endl
<< "B[10] = " << data_B[10] << std::endl
<< "X[10] = A[10] + B[10] = " << data_X[10] << std::endl;
// Prints:
// A[10] = 3
// B[10] = 5
// X[10] = A[10] + B[10] = 8
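// As a plain C++ sanity check (not part of the NNC API), we can also
// verify that every element was computed, not just the one we printed:
int mismatches = 0;
for (size_t idx = 0; idx < data_X.size(); idx++) {
  if (data_X[idx] != data_A[idx] + data_B[idx]) {
    mismatches++;
  }
}
std::cout << "Mismatching elements: " << mismatches << std::endl;
// Expected to print: Mismatching elements: 0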
}
std::cout << "*** Lowering TorchScript IR to TensorExpr IR ***" << std::endl;
{
// This section requires an LLVM-enabled PyTorch build, so we have to use a
// guard:
#ifdef TORCH_ENABLE_LLVM
// Often we would like to convert a TorchScript IR to TE rather than
// construct TE IR from scratch. NNC provides an API to perform such
// lowering: it takes a TorchScript graph and returns an object that can be
// used to invoke the generated kernel.
// This API is currently used by the TorchScript JIT fuser and can also be
// used ahead of time to pre-compile parts of a model.
//
// To get familiar with this API, let's start by defining a simple
// TorchScript graph:
const auto graph_string = R"IR(
graph(%A : Float(5, 3, strides=[3, 1], device=cpu),
%B : Float(5, 3, strides=[3, 1], device=cpu)):
%AB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %B)
%one : int = prim::Constant[value=1]()
%AAB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %AB)
%AAB_plus_B: Float(5, 3, strides=[3, 1]) = aten::add(%AAB, %B, %one)
return (%AAB_plus_B))IR";
auto graph = std::make_shared<torch::jit::Graph>();
parseIR(graph_string, &*graph);
// This graph defines a simple computation of A*A*B + B where A and B are
// input 5x3 tensors.
// To lower this TorchScript graph to TE, we just need to create a
// TensorExprKernel object. Its constructor builds the corresponding TE IR
// and compiles it for the given backend (in this example, for CPU using the
// LLVM compiler).
TensorExprKernel kernel(graph);
// We can retrieve the generated TE stmt from the kernel object:
StmtPtr kernel_stmt = kernel.getCodeGenStmt();
std::cout << "TE Stmt constructed from TorchScript: " << std::endl
<< *kernel_stmt << std::endl;
// Prints:
// TE Stmt constructed from TorchScript:
// {
// for (const auto v : c10::irange(5)) {
// for (const auto _tail_tail : c10::irange(3)) {
// aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) *
// ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) +
// (tB[_tail_tail + 3 * v]);
// }
// }
// }
// We can also examine generated LLVM IR and assembly code:
std::cout << "Generated LLVM IR: " << std::endl;
auto ir_str = kernel.getCodeText("ir");
printLinesToFrom(ir_str, 15, 20);
// Prints:
// Generated LLVM IR:
// %9 = bitcast float* %2 to <8 x float>*
// %10 = load <8 x float>, <8 x float>* %9 ...
// %11 = bitcast float* %5 to <8 x float>*
// %12 = load <8 x float>, <8 x float>* %11 ...
// %13 = fmul <8 x float> %10, %12
// %14 = fmul <8 x float> %10, %13
std::cout << "Generated assembly: " << std::endl;
auto asm_str = kernel.getCodeText("asm");
printLinesToFrom(asm_str, 10, 15);
// Prints:
// Generated assembly:
// vmulps %ymm1, %ymm0, %ymm2
// vfmadd213ps %ymm1, %ymm0, %ymm2
// vmovups %ymm2, (%rax)
// vmovss 32(%rcx), %xmm0
// vmovss 32(%rdx), %xmm1
// vmulss %xmm1, %xmm0, %xmm2
// We can also execute the generated kernel:
auto A =
at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
2.0;
auto B =
at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
3.0;
std::vector<at::Tensor> inputs = {A, B};
std::vector<torch::IValue> stack = torch::fmap<torch::IValue>(inputs);
kernel.run(stack);
auto R = stack[0].toTensor();
// Let's print one of the elements from the result tensor to verify that the
// computation did happen and was correct:
std::cout << "R[2][2] = " << R[2][2] << std::endl;
// Prints:
// R[2][2] = 15
// [ CPUFloatType{} ]
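// As an extra check (a sketch using standard ATen tensor ops, not part of
// the NNC API), the result should match the same computation done eagerly:
std::cout << "Matches eager A*A*B + B: " << R.allclose(A * A * B + B)
<< std::endl;
// Expected to print: Matches eager A*A*B + B: 1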
#endif
}
return 0;
}
void printLinesToFrom(const std::string& input_str, int from, int to) {
std::istringstream f(input_str);
std::string s;
int idx = 0;
while (getline(f, s)) {
if (idx > from) {
std::cout << s << "\n";
}
if (idx++ > to) {
break;
}
}
}

View File

@@ -2939,7 +2939,10 @@ def f({", ".join(param_names)}):
     @slowTest
     @onlyCPU
-    @ops(op_db, dtypes=OpDTypes.supported)
+    @ops(
+        [op for op in op_db if get_name(op) not in known_failures],
+        dtypes=OpDTypes.supported,
+    )
     def test_nnc_correctness(self, device, dtype, op):
         if not op.supports_tracing:
             self.skipTest("Requires tracing support")

View File

@@ -1910,7 +1910,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::div, aten_div, [](Node* n) -> SROperator {
       }
       auto& out_t = p_node->Output(0).toTensor();
-      if (in0_t.sizes() == in1_t.sizes() &&
+      if (te && te->checkInput<float>(in0_t) && in0_t.sizes() == in1_t.sizes() &&
          in0_t.scalar_type() == in1_t.scalar_type() &&
          in0_t.strides() == in1_t.strides() && in0_t.is_contiguous() &&
          in0_t.scalar_type() == at::kFloat) {