Nvfuser code base nuke (#111447)

Removing the nvfuser code base.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111447
Approved by: https://github.com/albanD
Author: jjsjann123
Date: 2023-11-01 20:53:14 +00:00
Committed by: PyTorch MergeBot
Parent: 5a6f8014c4
Commit: 9d23440c81
313 changed files with 1 addition and 170447 deletions

View File

@@ -8,7 +8,7 @@ load("@pytorch//tools/rules:cu.bzl", "cu_library")
load("@pytorch//tools/config:defs.bzl", "if_cuda")
load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops")
load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")

View File

@@ -208,9 +208,6 @@ cmake_dependent_option(
cmake_dependent_option(
USE_CUSPARSELT "Use cuSPARSELt" ON
"USE_CUDA" OFF)
cmake_dependent_option(
BUILD_NVFUSER_BENCHMARK "Build C++ binaries for nvfuser benchmarks" OFF
"USE_CUDA" OFF)
cmake_dependent_option(
USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON
"USE_CUDNN" OFF)

View File

@@ -16,42 +16,6 @@ GENERATED_LAZY_TS_CPP = [
"lazy/generated/RegisterLazy.cpp",
]
# NVFuser runtime library
libtorch_nvfuser_runtime_sources = [
"third_party/nvfuser/runtime/array.cu",
"third_party/nvfuser/runtime/array_rocm.cu",
"third_party/nvfuser/runtime/bf16_support.cu",
"third_party/nvfuser/runtime/bf16_support_rocm.cu",
"third_party/nvfuser/runtime/block_reduction.cu",
"third_party/nvfuser/runtime/block_sync_atomic.cu",
"third_party/nvfuser/runtime/block_sync_default.cu",
"third_party/nvfuser/runtime/block_sync_default_rocm.cu",
"third_party/nvfuser/runtime/broadcast.cu",
"third_party/nvfuser/runtime/fp16_support.cu",
"third_party/nvfuser/runtime/fused_reduction.cu",
"third_party/nvfuser/runtime/fused_welford_helper.cu",
"third_party/nvfuser/runtime/fused_welford_impl.cu",
"third_party/nvfuser/runtime/grid_broadcast.cu",
"third_party/nvfuser/runtime/grid_reduction.cu",
"third_party/nvfuser/runtime/grid_sync.cu",
"third_party/nvfuser/runtime/helpers.cu",
"third_party/nvfuser/runtime/index_utils.cu",
"third_party/nvfuser/runtime/memory.cu",
"third_party/nvfuser/runtime/random_numbers.cu",
"third_party/nvfuser/runtime/swizzle.cu",
"third_party/nvfuser/runtime/tensor.cu",
"third_party/nvfuser/runtime/tensorcore.cu",
"third_party/nvfuser/runtime/tuple.cu",
"third_party/nvfuser/runtime/type_traits.cu",
"third_party/nvfuser/runtime/warp.cu",
"third_party/nvfuser/runtime/warp_rocm.cu",
"third_party/nvfuser/runtime/welford.cu",
"aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh",
"aten/src/ATen/cuda/detail/UnpackRaw.cuh",
]
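# Generated header names are derived from the runtime source basenames,
# e.g. "third_party/nvfuser/runtime/array.cu" -> "array.h".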
libtorch_nvfuser_generated_headers = ["{}.h".format(name.split("/")[-1].split(".")[0]) for name in libtorch_nvfuser_runtime_sources]
def libtorch_generated_sources(gencode_pattern):
return [gencode_pattern.format(name) for name in [
"torch/csrc/autograd/generated/Functions.cpp",

View File

@@ -27,7 +27,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}")
message(STATUS " BUILD_NVFUSER_BENCHMARK: ${BUILD_NVFUSER_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
if(${CAFFE2_LINK_LOCAL_PROTOBUF})

View File

@@ -1,371 +0,0 @@
if(NOT BUILD_NVFUSER)
return()
endif()
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(nvfuser)
if(NOT USE_ROCM)
set(TORCHLIB_FLAVOR torch_cuda)
else()
set(TORCHLIB_FLAVOR torch_hip)
endif()
# --- project
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/nvfuser")
set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})
set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(TORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
set(TORCH_INSTALL_LIB_DIR ${TORCH_ROOT}/torch/lib)
# --- build nvfuser_codegen library
set(NVFUSER_SRCS)
set(NVFUSER_CODEGEN ${PROJECT_NAME}_codegen)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/arith.cpp
${NVFUSER_SRCS_DIR}/compute_at.cpp
${NVFUSER_SRCS_DIR}/inlining.cpp
${NVFUSER_SRCS_DIR}/compute_at_map.cpp
${NVFUSER_SRCS_DIR}/codegen.cpp
${NVFUSER_SRCS_DIR}/contiguity.cpp
${NVFUSER_SRCS_DIR}/dispatch.cpp
${NVFUSER_SRCS_DIR}/expr_evaluator.cpp
${NVFUSER_SRCS_DIR}/kernel_expr_evaluator.cpp
${NVFUSER_SRCS_DIR}/executor.cpp
${NVFUSER_SRCS_DIR}/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/executor_launch_params.cpp
${NVFUSER_SRCS_DIR}/evaluator_common.cpp
${NVFUSER_SRCS_DIR}/executor_utils.cpp
${NVFUSER_SRCS_DIR}/fusion.cpp
${NVFUSER_SRCS_DIR}/graph_fuser.cpp
${NVFUSER_SRCS_DIR}/grouped_reduction.cpp
${NVFUSER_SRCS_DIR}/index_compute.cpp
${NVFUSER_SRCS_DIR}/lower_index_compute.cpp
${NVFUSER_SRCS_DIR}/instrumentation.cpp
${NVFUSER_SRCS_DIR}/ir_base_nodes.cpp
${NVFUSER_SRCS_DIR}/ir_builder.cpp
${NVFUSER_SRCS_DIR}/ir_cloner.cpp
${NVFUSER_SRCS_DIR}/ir_container.cpp
${NVFUSER_SRCS_DIR}/ir_graphviz.cpp
${NVFUSER_SRCS_DIR}/ir_nodes.cpp
${NVFUSER_SRCS_DIR}/ir_iostream.cpp
${NVFUSER_SRCS_DIR}/ir_utils.cpp
${NVFUSER_SRCS_DIR}/iter_visitor.cpp
${NVFUSER_SRCS_DIR}/kernel.cpp
${NVFUSER_SRCS_DIR}/kernel_cache.cpp
${NVFUSER_SRCS_DIR}/kernel_ir.cpp
${NVFUSER_SRCS_DIR}/kernel_ir_dispatch.cpp
${NVFUSER_SRCS_DIR}/lower_alias_memory.cpp
${NVFUSER_SRCS_DIR}/lower_allocation.cpp
${NVFUSER_SRCS_DIR}/lower_double_buffer.cpp
${NVFUSER_SRCS_DIR}/lower_divisible_split.cpp
${NVFUSER_SRCS_DIR}/lower_expr_sort.cpp
${NVFUSER_SRCS_DIR}/lower_fused_reduction.cpp
${NVFUSER_SRCS_DIR}/lower_fusion_simplifier.cpp
${NVFUSER_SRCS_DIR}/lower_index.cpp
${NVFUSER_SRCS_DIR}/lower_index_hoist.cpp
${NVFUSER_SRCS_DIR}/lower_insert_syncs.cpp
${NVFUSER_SRCS_DIR}/lower_instrument.cpp
${NVFUSER_SRCS_DIR}/lower_loops.cpp
${NVFUSER_SRCS_DIR}/lower_magic_zero.cpp
${NVFUSER_SRCS_DIR}/lower_misaligned_vectorization.cpp
${NVFUSER_SRCS_DIR}/lower_predicate.cpp
${NVFUSER_SRCS_DIR}/lower_predicate_elimination.cpp
${NVFUSER_SRCS_DIR}/lower_replace_size.cpp
${NVFUSER_SRCS_DIR}/lower_shift.cpp
${NVFUSER_SRCS_DIR}/lower_sync_information.cpp
${NVFUSER_SRCS_DIR}/lower_thread_predicate.cpp
${NVFUSER_SRCS_DIR}/lower_trivial_broadcast.cpp
${NVFUSER_SRCS_DIR}/lower_trivial_reductions.cpp
${NVFUSER_SRCS_DIR}/lower_unroll.cpp
${NVFUSER_SRCS_DIR}/lower_utils.cpp
${NVFUSER_SRCS_DIR}/lower_validation.cpp
${NVFUSER_SRCS_DIR}/lower_warp_reduce.cpp
${NVFUSER_SRCS_DIR}/lower2device.cpp
${NVFUSER_SRCS_DIR}/lower_bank_conflict.cpp
${NVFUSER_SRCS_DIR}/manager.cpp
${NVFUSER_SRCS_DIR}/maxinfo_propagator.cpp
${NVFUSER_SRCS_DIR}/mutator.cpp
${NVFUSER_SRCS_DIR}/non_divisible_split.cpp
${NVFUSER_SRCS_DIR}/ops/alias.cpp
${NVFUSER_SRCS_DIR}/ops/composite.cpp
${NVFUSER_SRCS_DIR}/ops/normalization.cpp
${NVFUSER_SRCS_DIR}/parallel_dimension_map.cpp
${NVFUSER_SRCS_DIR}/parallel_type_bitmap.cpp
${NVFUSER_SRCS_DIR}/parser.cpp
${NVFUSER_SRCS_DIR}/partial_split_map.cpp
${NVFUSER_SRCS_DIR}/partition.cpp
${NVFUSER_SRCS_DIR}/predicate_compute.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_interface.cpp
${NVFUSER_SRCS_DIR}/register_interface.cpp
${NVFUSER_SRCS_DIR}/root_domain_map.cpp
${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp
${NVFUSER_SRCS_DIR}/scheduler/pointwise_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization.cpp
${NVFUSER_SRCS_DIR}/scheduler/reduction.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/reduction_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/registry.cpp
${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
${NVFUSER_SRCS_DIR}/type_inference.cpp
${NVFUSER_SRCS_DIR}/type_promotion.cpp
${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp
${NVFUSER_SRCS_DIR}/tensor_view.cpp
${NVFUSER_SRCS_DIR}/transform_iter.cpp
${NVFUSER_SRCS_DIR}/transform_replay.cpp
${NVFUSER_SRCS_DIR}/transform_rfactor.cpp
${NVFUSER_SRCS_DIR}/transform_view.cpp
${NVFUSER_SRCS_DIR}/type.cpp
${NVFUSER_SRCS_DIR}/utils.cpp
${NVFUSER_SRCS_DIR}/mma_type.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
)
add_library(${NVFUSER_CODEGEN} SHARED ${NVFUSER_SRCS})
if(NOT USE_ROCM)
target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
else()
target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE
USE_ROCM
__HIP_PLATFORM_HCC__
)
endif()
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE torch ${TORCHLIB_FLAVOR})
if(NOT USE_ROCM)
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${CUDA_NVRTC_LIB} torch::nvtoolsext)
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${CUDA_INCLUDE_DIRS})
else()
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${ROCM_HIPRTC_LIB})
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
endif()
if(NOT MSVC)
target_compile_options(${NVFUSER_CODEGEN} PRIVATE -Wno-unused-variable)
endif()
target_include_directories(${NVFUSER_CODEGEN}
PUBLIC $<BUILD_INTERFACE:${NVFUSER_SRCS_DIR}>)
set_property(TARGET ${NVFUSER_CODEGEN} PROPERTY CXX_STANDARD 17)
install(TARGETS ${NVFUSER_CODEGEN} EXPORT NvfuserTargets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
file(WRITE "${TORCH_ROOT}/test/_nvfuser/.gitignore" "*")
# --- build nvfuser_python library
if(BUILD_PYTHON)
set(NVFUSER "${PROJECT_NAME}")
#find_package(pybind11 REQUIRED)
set(NVFUSER_PYTHON_SRCS)
list(APPEND NVFUSER_PYTHON_SRCS
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp
)
add_library(${NVFUSER} MODULE ${NVFUSER_PYTHON_SRCS})
if(NOT USE_ROCM)
target_compile_options(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
target_link_libraries(${NVFUSER} PRIVATE torch::nvtoolsext)
else()
target_compile_options(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER} PRIVATE
USE_ROCM
__HIP_PLATFORM_HCC__
)
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
endif()
target_link_libraries(${NVFUSER} PRIVATE ${NVFUSER_CODEGEN})
target_link_libraries(${NVFUSER} PRIVATE torch torch_python ${TORCHLIB_FLAVOR})
target_link_libraries(${NVFUSER} PRIVATE pybind::pybind11)
target_include_directories(${NVFUSER} PRIVATE ${TORCH_ROOT})
target_compile_definitions(${NVFUSER} PRIVATE EXTENSION_NAME=_C)
target_compile_options(${NVFUSER} PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS})
# avoid using Python3_add_library, copied from functorch
set_target_properties(${NVFUSER} PROPERTIES PREFIX "" DEBUG_POSTFIX "")
if(NOT MSVC)
target_compile_options(${NVFUSER} PRIVATE -Wno-unused-variable)
set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".so")
else()
set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".pyd")
endif()
set_target_properties(${NVFUSER} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/nvfuser)
set_target_properties(${NVFUSER} PROPERTIES INSTALL_RPATH "${_rpath_portable_origin}/../torch/lib")
if(TORCH_PYTHON_LINK_FLAGS AND NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "")
message(STATUS "somehow this is happening")
set_target_properties(${NVFUSER} PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS})
endif()
install(TARGETS ${NVFUSER} EXPORT NvfuserTargets DESTINATION ${TORCH_ROOT}/nvfuser/)
# install nvfuser python files
install(DIRECTORY "${NVFUSER_ROOT}/python/"
DESTINATION "${TORCH_ROOT}/nvfuser"
FILES_MATCHING PATTERN "*.py" )
file(WRITE "${TORCH_ROOT}/nvfuser/.gitignore" "*")
endif()
# --- generate runtime files
# The list of NVFUSER runtime files
list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/array.cu
${NVFUSER_ROOT}/runtime/block_reduction.cu
${NVFUSER_ROOT}/runtime/block_sync_atomic.cu
${NVFUSER_ROOT}/runtime/block_sync_default.cu
${NVFUSER_ROOT}/runtime/broadcast.cu
${NVFUSER_ROOT}/runtime/fp16_support.cu
${NVFUSER_ROOT}/runtime/fused_reduction.cu
${NVFUSER_ROOT}/runtime/fused_welford_helper.cu
${NVFUSER_ROOT}/runtime/fused_welford_impl.cu
${NVFUSER_ROOT}/runtime/bf16_support.cu
${NVFUSER_ROOT}/runtime/grid_broadcast.cu
${NVFUSER_ROOT}/runtime/grid_reduction.cu
${NVFUSER_ROOT}/runtime/grid_sync.cu
${NVFUSER_ROOT}/runtime/helpers.cu
${NVFUSER_ROOT}/runtime/index_utils.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/swizzle.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
${NVFUSER_ROOT}/runtime/welford.cu
${NVFUSER_ROOT}/runtime/warp.cu
${NVFUSER_ROOT}/runtime/tensorcore.cu
${NVFUSER_ROOT}/runtime/memory.cu
${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh
)
if(USE_ROCM)
list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/array_rocm.cu
${NVFUSER_ROOT}/runtime/bf16_support_rocm.cu
${NVFUSER_ROOT}/runtime/block_sync_default_rocm.cu
${NVFUSER_ROOT}/runtime/warp_rocm.cu
)
endif()
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources")
# "stringify" NVFUSER runtime sources
# (generate C++ header files embedding the original input as a string literal)
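# (Each generated header presumably exposes the .cu text as a C++ string
# constant so the runtime sources can be embedded in the codegen library and
# handed to the runtime compiler together with generated kernels; the exact
# layout is whatever tools/stringify_file.py emits.)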
set(NVFUSER_STRINGIFY_TOOL "${NVFUSER_ROOT}/tools/stringify_file.py")
foreach(src ${NVFUSER_RUNTIME_FILES})
get_filename_component(filename ${src} NAME_WE)
set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h")
add_custom_command(
COMMENT "Stringify NVFUSER runtime source file"
OUTPUT ${dst}
DEPENDS ${src} "${NVFUSER_STRINGIFY_TOOL}"
COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}
)
add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst})
add_dependencies(${NVFUSER_CODEGEN} nvfuser_rt_${filename})
# also generate the resource headers during the configuration step
# (so tools like clang-tidy can run w/o requiring a real build)
execute_process(COMMAND
${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
endforeach()
target_include_directories(${NVFUSER_CODEGEN} PRIVATE "${CMAKE_BINARY_DIR}/include")
# -- build tests
# note: ideally we wouldn't need USE_CUDA here, but our C++ tests are not ROCm-compatible.
if(BUILD_TEST AND USE_CUDA)
set(NVFUSER_TESTS "${PROJECT_NAME}_tests")
set(JIT_TEST_SRCS)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_definition.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_cache.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_record.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu1.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu2.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu3.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensor_factories.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_shift.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensorcore.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_view.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_transpose.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_utils.cpp)
add_executable(${NVFUSER_TESTS}
${TORCH_ROOT}/test/cpp/common/main.cpp
${TORCH_ROOT}/test/cpp/jit/test_utils.cpp
${JIT_TEST_SRCS})
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_GTEST)
if(NOT USE_ROCM)
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_CUDA)
else()
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_ROCM)
endif()
target_include_directories(${NVFUSER_TESTS} PRIVATE "${NVFUSER_ROOT}" "${TORCH_ROOT}/torch/csrc/api/include/")
target_link_libraries(${NVFUSER_TESTS} PRIVATE ${NVFUSER_CODEGEN} torch ${TORCHLIB_FLAVOR} gtest_main gmock_main)
if(NOT MSVC)
target_compile_options(${NVFUSER_TESTS} PRIVATE -Wno-unused-variable)
endif()
install(TARGETS ${NVFUSER_TESTS} DESTINATION bin)
endif()
if(BUILD_NVFUSER_BENCHMARK)
set(NVFUSER_BENCHMARK "${PROJECT_NAME}_bench")
set(BENCHMARK_SRCS)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_first.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_first_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_last.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_last_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/bert.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/broadcast.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/gelu_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/heuristic_lookup.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/shape_inference.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/instance_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/layer_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/layer_norm_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/rms_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/rms_norm_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/lstm_cell.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/reduction.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/softmax.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/softmax_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/scale_bias_relu.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/transpose.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/matmul.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/timm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/utils.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/main.cpp)
add_executable(${NVFUSER_BENCHMARK} ${BENCHMARK_SRCS})
target_link_libraries(${NVFUSER_BENCHMARK} PRIVATE torch_library benchmark ${NVFUSER_CODEGEN})
target_include_directories(${NVFUSER_BENCHMARK} PRIVATE ${NVFUSER_ROOT})
if(NOT MSVC)
target_compile_options_if_supported(nvfuser_bench -Werror)
target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
endif()
endif()

View File

@@ -1,35 +0,0 @@
if(USE_CUDA)
add_executable(nvfuser_bench
batch_norm_channels_first.cpp
batch_norm_channels_first_backward.cpp
batch_norm_channels_last.cpp
batch_norm_channels_last_backward.cpp
bert.cpp
broadcast.cpp
gelu_backward.cpp
heuristic_lookup.cpp
shape_inference.cpp
instance_norm.cpp
layer_norm.cpp
layer_norm_backward.cpp
rms_norm.cpp
rms_norm_backward.cpp
lstm_cell.cpp
reduction.cpp
softmax.cpp
softmax_backward.cpp
scale_bias_relu.cpp
transpose.cpp
matmul.cpp
timm.cpp
utils.cpp
main.cpp)
target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark)
if(NOT MSVC)
target_compile_options_if_supported(nvfuser_bench -Werror)
target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
endif()
endif()

View File

@@ -1,335 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
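// Builds the forward batch-norm fusion: contiguous input/weight/bias tensors
// plus fp32 running stats; half-precision inputs are up-cast to float for the
// computation and the output is cast back to half.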
static void setupBatchNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr);
auto output = result.output;
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
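// Benchmark arguments are {batch size N, channels C, spatial extent H};
// the input tensor is N x C x H x H.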
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
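// Approximate bytes read/written per iteration (activations in the benchmark
// dtype, running stats in fp32), so Google Benchmark reports an effective
// bandwidth alongside the timings.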
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_run_mean = c10::optional<at::Tensor>(at_run_mean);
auto ato_run_var = c10::optional<at::Tensor>(at_run_var);
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
clearL2Cache();
cudaDeviceSynchronize();
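// Manual timing loop: each iteration times a single at::batch_norm call with
// CudaKernelTimer (from the shared benchmark utils; elapsed() appears to be
// in milliseconds, hence the /1000.0 to report seconds) and clears the L2
// cache so every run starts cold.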
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_fp32,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,358 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/Operators.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
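// Builds the backward batch-norm fusion: forward activations, incoming
// gradient, and fp32 weight/running/saved statistics in; grad_input,
// grad_weight and grad_bias out (all three enabled via the output mask).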
static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto grad_output = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, DataType::Float);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
auto save_mean = makeContigTensor(1, DataType::Float);
auto save_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(grad_output);
fusion->addInput(weight);
fusion->addInput(running_mean);
fusion->addInput(running_var);
fusion->addInput(save_mean);
fusion->addInput(save_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,
grad_output,
weight,
running_mean,
running_var,
save_mean,
save_var,
kTraining,
eps_ptr,
std::vector<bool>(3, true));
auto grad_input = result.grad_input;
auto grad_weight = result.grad_weight;
auto grad_bias = result.grad_bias;
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
grad_weight = castOp(DataType::Half, grad_weight);
grad_bias = castOp(DataType::Half, grad_bias);
}
fusion->addOutput(grad_input);
fusion->addOutput(grad_weight);
fusion->addOutput(grad_bias);
}
static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor bias = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(weight);
auto ato_bias = c10::optional<at::Tensor>(bias);
auto ato_run_mean = c10::optional<at::Tensor>(run_mean);
auto ato_run_var = c10::optional<at::Tensor>(run_var);
auto ato_save_mean = c10::optional<at::Tensor>(save_mean);
auto ato_save_var = c10::optional<at::Tensor>(save_var);
auto fwd_result = at::_ops::_batch_norm_impl_index::call(
input,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
cudaDeviceSynchronize();
// Sync everything up before we start
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
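// Time only the cuDNN backward call; std::get<3>(fwd_result) is the
// reserve-space tensor returned by the _batch_norm_impl_index forward pass.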
at::_ops::cudnn_batch_norm_backward::call(
input,
grad_out,
weight,
ato_run_mean,
ato_run_var,
save_mean,
save_var,
kEps,
std::get<3>(fwd_result));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_BWD_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_BWD_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_BWD_fp32,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_BWD_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_BWD_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,363 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr,
true);
auto output = result.output;
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
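// Benchmark arguments are {batch size N, channels C, spatial extent H};
// here the input tensor is allocated directly as N x H x H x C
// (channels-last).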
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[3]}, options);
at::Tensor at_bias = at::zeros({input_shape[3]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
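// The baseline keeps the logical NCHW shape but materializes the data in
// channels-last memory format, so the cuDNN call sees NHWC-laid-out input.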
at::Tensor at_x = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_run_mean = c10::optional<at::Tensor>(at_run_mean);
auto ato_run_var = c10::optional<at::Tensor>(at_run_var);
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
at::_ops::_batch_norm_impl_index::call(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_nhwc_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_fp32,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Permutation of TIMM sizes
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16)
->ArgsProduct(
{{8, 16, 32, 64, 128, 256},
{24, 40, 48, 56, 72, 152, 184, 200, 368},
{7, 14, 28, 56, 112}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16)
->ArgsProduct(
{{128, 256, 512, 1024, 2048},
{24, 40, 48, 56, 72, 152},
{7, 14, 28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,383 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/Operators.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto grad_output = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, DataType::Float);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
auto save_mean = makeContigTensor(1, DataType::Float);
auto save_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(grad_output);
fusion->addInput(weight);
fusion->addInput(running_mean);
fusion->addInput(running_var);
fusion->addInput(save_mean);
fusion->addInput(save_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,
grad_output,
weight,
running_mean,
running_var,
save_mean,
save_var,
kTraining,
eps_ptr,
std::vector<bool>(3, true),
true);
auto grad_input = result.grad_input;
auto grad_weight = result.grad_weight;
auto grad_bias = result.grad_bias;
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
grad_weight = castOp(DataType::Half, grad_weight);
grad_bias = castOp(DataType::Half, grad_bias);
}
fusion->addOutput(grad_input);
fusion->addOutput(grad_weight);
fusion->addOutput(grad_bias);
}
static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[3]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[3]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[3]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor grad_out = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor bias = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(weight);
auto ato_bias = c10::optional<at::Tensor>(bias);
auto ato_run_mean = c10::optional<at::Tensor>(run_mean);
auto ato_run_var = c10::optional<at::Tensor>(run_var);
auto ato_save_mean = c10::optional<at::Tensor>(save_mean);
auto ato_save_var = c10::optional<at::Tensor>(save_var);
auto fwd_result = at::_ops::_batch_norm_impl_index::call(
input,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
cudaDeviceSynchronize();
// Sync everything up before we start
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
at::_ops::cudnn_batch_norm_backward::call(
input,
grad_out,
weight,
ato_run_mean,
ato_run_var,
save_mean,
save_var,
kEps,
std::get<3>(fwd_result));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_BWD_fp32,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Permutation of TIMM sizes
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16)
->ArgsProduct(
{{8, 16, 32, 64, 128, 256},
{24, 40, 48, 56, 72, 152, 184, 200, 368},
{7, 14, 28, 56, 112}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16)
->ArgsProduct(
{{128, 256, 512, 1024, 2048},
{24, 40, 48, 56, 72, 152},
{7, 14, 28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,766 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build the forward fusion: divide by a (placeholder) scale, add the broadcast
// bias/mask, softmax over the last dimension, then dropout.
static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = TensorViewBuilder()
.ndims(4)
.dtype(dtype)
.contiguity({true, false, false, true})
.shape({-1, 1, 1, -1})
.build();
TensorView* tv1 = makeContigTensor(4, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
// TODO: should be input
auto d16 = IrBuilder::create<Double>(1.0);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
}
auto tv2 = div(tv1, d16);
auto tv3 = add(tv2, tv0);
auto tv10 = softmax(tv3, 3);
auto dropout_tvs = dropout(tv10, IrBuilder::create<Double>(0.9));
auto tv12 = dropout_tvs.mask;
auto tv14 = dropout_tvs.output;
if (is_fp16) {
tv14 = castOp(DataType::Half, tv14);
tv10 = castOp(DataType::Half, tv10);
tv3 = castOp(DataType::Half, tv3);
}
fusion->addOutput(tv14);
fusion->addOutput(tv12);
fusion->addOutput(tv10);
fusion->addOutput(tv3);
}
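// Backward counterpart of the fusion above: roughly dropout grad (mask times
// scale), softmax grad, then the division by the scale factor.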
static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {
TensorView* tv0 = makeContigTensor(4, dtype);
// Strangely tv1 isn't used anywhere, need to come back to that...
TensorView* tv1 = makeContigTensor(4, dtype);
TensorView* tv2 = makeContigTensor(4, dtype);
TensorView* tv3 = makeContigTensor(4, DataType::Bool);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
bool is_fp16 = dtype == DataType::Half;
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
}
// TODO: should be inputs
auto d32 = IrBuilder::create<Double>(1.0);
// fusion->addInput(d32);
auto d33 = IrBuilder::create<Double>(2.0);
// fusion->addInput(d33);
auto tv4 = mul(tv2, tv3);
auto tv5 = mul(tv4, d33);
auto tv6 = mul(tv5, tv0);
auto tv7 = sum(tv6, {-1});
auto tv8 = broadcast(tv7, {false, false, false, true});
auto tv9 = mul(tv0, tv8);
auto tv10 = sub(tv6, tv9);
auto tv11 = div(tv10, d32);
if (is_fp16) {
tv10 = castOp(DataType::Half, tv10);
tv11 = castOp(DataType::Half, tv11);
}
fusion->addOutput(tv11);
fusion->addOutput(tv10);
}
static void MagicScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
auto y = benchmark_state.range(2);
auto z = benchmark_state.range(3);
setupDivMaxSoftmaxDropoutForward(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({w, 1, 1, z}, options);
at::Tensor t1 = at::randn({w, x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t0, t1};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion({t0, t1}, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
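// Effective bandwidth: count every input and every fusion output once per iteration.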
int64_t bytes = 0;
for (auto tensor : std::vector<at::Tensor>({t0, t1})) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
static void MagicScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
auto y = benchmark_state.range(2);
auto z = benchmark_state.range(3);
setupDivMaxSoftmaxDropoutBackward(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({w, x, y, z}, options);
at::Tensor t1 = at::randn({w, x, y, z}, options);
at::Tensor t2 = at::randn({w, x, y, z}, options);
at::Tensor t3 = at::randn({w, x, y, z}, options).round().to(at::kBool);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
// Some reason t1 isn't used, ignore it.
for (auto tensor : std::vector<at::Tensor>({t0, t2, t3})) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
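// Forward fusion: broadcast bias add, dropout, an elementwise add, then
// layer_norm; the dropout output, the tensor fed to layer_norm, the LN output,
// and the LN mean/invstd are all fusion outputs.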
static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(1, dtype);
TensorView* tv1 = makeContigTensor(1, dtype);
TensorView* tv2 = makeContigTensor(3, dtype);
TensorView* tv3 = makeContigTensor(3, dtype);
TensorView* tv4 = makeContigTensor(1, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
fusion->addInput(tv4);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
tv3 = castOp(DataType::Float, tv3);
tv4 = castOp(DataType::Float, tv4);
}
auto tv5 = broadcast(tv4, {true, true, false});
auto tv6 = add(tv3, tv5);
auto dropout_outs = dropout(tv6, IrBuilder::create<Double>(0.9));
auto tv8 = dropout_outs.output;
auto tv10 = dropout_outs.mask;
auto tv11 = add(tv10, tv2);
auto layer_norm_outs =
layer_norm(tv11, 1, tv0, tv1, IrBuilder::create<Double>(1e-5));
auto tv14 = layer_norm_outs.output;
auto tv21 = layer_norm_outs.mean;
auto tv26 = layer_norm_outs.invstd;
if (is_fp16) {
tv11 = castOp(DataType::Half, tv11);
tv14 = castOp(DataType::Half, tv14);
tv21 = castOp(DataType::Half, tv21);
tv26 = castOp(DataType::Half, tv26);
}
fusion->addOutput(tv8);
fusion->addOutput(tv11);
fusion->addOutput(tv14);
fusion->addOutput(tv21);
fusion->addOutput(tv26);
}
static void MagicScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormFwd(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({z}, options);
at::Tensor t1 = at::randn({z}, options);
at::Tensor t2 = at::randn({x, y, z}, options);
at::Tensor t3 = at::randn({x, y, z}, options);
at::Tensor t4 = at::randn({z}, options);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
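// LayerNorm backward, part 1: recompute x_hat = (x - mean) * invstd and reduce
// over the outer dimensions for the grad_bias and grad_weight terms.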
static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv1 = makeContigTensor(3, dtype);
TensorView* tv2 = makeContigTensor(3, dtype);
TensorView* tv3 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
fusion->addInput(tv4);
if (is_fp16) {
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
tv3 = castOp(DataType::Float, tv3);
tv4 = castOp(DataType::Float, tv4);
}
auto tv7 = sub(tv2, tv3);
auto tv8 = mul(tv7, tv4);
auto tv24 = sum(tv1, {0, 1});
auto tv22 = mul(tv1, tv8);
auto tv23 = sum(tv22, {0, 1});
if (is_fp16) {
tv24 = castOp(DataType::Half, tv24);
tv23 = castOp(DataType::Half, tv23);
tv8 = castOp(DataType::Half, tv8);
}
fusion->addOutput(tv24);
fusion->addOutput(tv23);
fusion->addOutput(tv8);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd1(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({x, y, z}, options);
at::Tensor t1 = at::randn({x, y, z}, options);
at::Tensor t2 = at::randn({x, y, 1}, options);
at::Tensor t3 = at::randn({x, y, 1}, options);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getReductionHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
scheduleReduction(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
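// LayerNorm backward, part 2: the grad_input term, assembled from
// grad_out * weight, its per-row sums, the normalized input, and invstd / N.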
static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
TensorView* tv5 = makeContigTensor(1, dtype);
TensorView* tv1 = makeContigTensor(3, dtype);
TensorView* tv8 = makeContigTensor(3, dtype);
fusion->addInput(tv4);
fusion->addInput(tv5);
fusion->addInput(tv1);
fusion->addInput(tv8);
if (is_fp16) {
tv4 = castOp(DataType::Float, tv4);
tv5 = castOp(DataType::Float, tv5);
tv1 = castOp(DataType::Float, tv1);
tv8 = castOp(DataType::Float, tv8);
}
auto d36 = mul(IrBuilder::create<Double>(1.0), tv1->axis(2)->extent());
auto d47 = unaryOp(UnaryOpType::Reciprocal, d36);
auto tv9 = broadcast(tv5, {true, true, false});
auto tv10 = mul(tv1, tv9);
auto tv14 = mul(tv10, tv8);
auto tv15 = sum(tv14, {2});
auto tv16 = broadcast(tv15, {false, false, true});
auto tv17 = mul(tv8, tv16);
auto tv12 = sum(tv10, {2});
auto tv13 = broadcast(tv12, {false, false, true});
auto tv11 = mul(d36, tv10);
auto tv18 = sub(tv11, tv13);
auto tv20 = mul(d47, tv4);
auto tv19 = sub(tv18, tv17);
auto tv21 = mul(tv20, tv19);
if (is_fp16) {
tv21 = castOp(DataType::Half, tv21);
}
fusion->addOutput(tv21);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd2(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t4 = at::randn({x, y, 1}, options);
at::Tensor t5 = at::randn({z}, options);
at::Tensor t1 = at::randn({x, y, z}, options);
at::Tensor t8 = at::randn({x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(3, dtype);
TensorView* tv21 = makeContigTensor(3, dtype);
fusion->addInput(tv0);
fusion->addInput(tv21);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv21 = castOp(DataType::Float, tv21);
}
// Uncertain this is the right value, but going with it anyway
auto d34 = div(IrBuilder::create<Double>(1.0), tv0->axis(2)->extent());
auto tv25 = mul(tv21, tv0);
auto tv26 = mul(tv25, d34);
auto tv27 = sum(tv26, {0, 1});
if (is_fp16) {
tv26 = castOp(DataType::Half, tv26);
tv27 = castOp(DataType::Half, tv27);
}
fusion->addOutput(tv26);
fusion->addOutput(tv27);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd3(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({x, y, z}, options);
at::Tensor t21 = at::randn({x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t0, t21};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getReductionHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
scheduleReduction(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
//------------------------------------------------------------------------------
static void DivMaxSoftDropFwd_fp32(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropFwd(benchmark_state, DataType::Float);
}
static void DivMaxSoftDropBwd_fp32(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropBwd(benchmark_state, DataType::Float);
}
static void DivMaxSoftDropFwd_fp16(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropFwd(benchmark_state, DataType::Half);
}
static void DivMaxSoftDropBwd_fp16(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropBwd(benchmark_state, DataType::Half);
}
static void BiasDropoutAddLayernormFwd_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormFwd_tf32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd1_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd1(benchmark_state, DataType::Float);
}
// Use full ampere wave here
static void BiasDropoutAddLayernormBwd1_tf32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd1(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd2_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd2(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd3_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd3(benchmark_state, DataType::Float);
}
//------------------------------------------------------------------------------
BENCHMARK(DivMaxSoftDropFwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropBwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropFwd_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropBwd_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormFwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Use full ampere wave here
BENCHMARK(BiasDropoutAddLayernormFwd_tf32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {864, 864}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd1_fp32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Use full ampere wave here
BENCHMARK(BiasDropoutAddLayernormBwd1_tf32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {864, 864}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd2_fp32)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd3_fp32)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,367 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build a fusion that broadcasts a 1D tensor along bcast_axis and adds it to a
// 2D tensor.
static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(2, dtype);
TensorView* tv1 = makeContigTensor(1, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
std::vector<bool> bcast_pattern(2, false);
bcast_pattern[bcast_axis] = true;
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
}
TensorView* tv2 = broadcast(tv1, bcast_pattern);
TensorView* tv3 = add(tv0, tv2);
if (is_fp16) {
tv3 = castOp(DataType::Half, tv3);
}
fusion->addOutput(tv3);
}
static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 =
(bcast_dim ? at::randn({iter_size, bcast_size}, options)
: at::randn({bcast_size, iter_size}, options));
at::Tensor t1 = at::randn({iter_size}, options);
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs({t0, t1});
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs({t0, t1});
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
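// Bytes: t0 is read and the output written (iter_size * bcast_size elements
// each), plus the 1D operand t1 is read.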
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * bcast_size * 2 + iter_size) * int64_t(dataTypeSize(dtype)));
}
static void Baseline_Broadcast(
benchmark::State& benchmark_state,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 =
(bcast_dim ? at::randn({iter_size, bcast_size}, options)
: at::randn({bcast_size, iter_size}, options));
at::Tensor t1 = at::randn({iter_size}, options);
// Sync everything up before we start
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = t0.add(t1.unsqueeze(bcast_dim));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * bcast_size * 2 + iter_size) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_Broadcast_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Float, 0);
}
static void Baseline_Broadcast_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Half, 0);
}
static void Baseline_Broadcast_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Float, 1);
}
static void Baseline_Broadcast_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Outer_fp32,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Outer_fp16,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Inner_fp32,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Inner_fp16,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,242 +0,0 @@
// Based on NVFuserTest.FusionBiasGeluBwd_CUDA
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setupFusion(Fusion* fusion) {
FusionGuard fg(fusion);
const float k_079 = 0.79788456;
const float k_004 = 0.044715;
const float k_010 = 0.1070322243;
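// k_079 ~= sqrt(2/pi), k_004 is the tanh-approximation cubic coefficient, and
// k_010 == 3 * k_004 * k_079. The fusion evaluates d/dx of the tanh-approximate
// GELU at x = input + bias and scales it by the incoming gradient.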
// gradient tensor
auto t0 = makeContigTensor(3, DataType::Half);
fusion->addInput(t0);
auto t1 = castOp(DataType::Float, t0);
// bias tensor
auto t2 = makeContigTensor(1, DataType::Half);
fusion->addInput(t2);
auto t3 = castOp(DataType::Float, t2);
// input tensor
auto t4 = makeContigTensor(3, DataType::Half);
fusion->addInput(t4);
auto t5 = castOp(DataType::Float, t4);
auto t6 = broadcast(t3, {true, true, false});
auto t7 = add(t6, t5);
auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
auto t10 = mul(t9, t7);
auto t11 = add(t10, IrBuilder::create<Int>(1));
auto t12 = mul(t8, t11);
auto t13 = unaryOp(UnaryOpType::Tanh, t12);
auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
auto t15 = mul(t13, t13);
auto t16 = unaryOp(UnaryOpType::Neg, t15);
auto t17 = add(t16, IrBuilder::create<Int>(1));
auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
auto t19 = mul(t18, t7);
auto t20 = add(t19, IrBuilder::create<Double>(k_079));
auto t21 = mul(t17, t20);
auto t22 = mul(t14, t21);
auto t23 = add(t13, IrBuilder::create<Int>(1));
auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
auto t25 = add(t22, t24);
auto t26 = mul(t25, t1);
// Save float output for validation
fusion->addOutput(t26);
auto t27 = castOp(DataType::Half, t26);
fusion->addOutput(t27);
}
static std::vector<c10::IValue> setupInputs() {
at::manual_seed(0);
auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
std::vector<int64_t> input_shape{6, 512, 4096};
std::vector<int64_t> bias_shape{4096};
auto at_input = at::randn(input_shape, options);
auto at_bias = at::randn(bias_shape, options);
auto at_grad = at::randn(input_shape, options);
return {at_grad, at_bias, at_input};
}
//------------------------------------------------------------------------------
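// The benchmarks below time each stage of the pipeline in isolation: fusion
// construction, pointwise auto-scheduling, lowering, compilation, and kernel
// execution (including GPU-only and host-side-only variants).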
static void GeluBackward_SetupFusion(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
Fusion fusion;
setupFusion(&fusion);
}
}
BENCHMARK(GeluBackward_SetupFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Fusion fusion;
setupFusion(&fusion);
std::vector<c10::IValue> inputs = setupInputs();
benchmark_state.ResumeTiming();
// Auto-schedule
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
}
}
BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_Lower(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
GpuLower gpu_lower(&fusion);
}
}
BENCHMARK(GeluBackward_Lower)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void GeluBackward_Compile(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion);
}
}
BENCHMARK(GeluBackward_Compile)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
}
}
BENCHMARK(GeluBackward_RunFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setMeasureKernelTimeFlag(true);
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
benchmark_state.SetIterationTime(executor.kernelTimeMs() / 1000.0);
clearL2Cache();
}
}
BENCHMARK(GeluBackward_RunFusion_GpuOnly)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
}
}
BENCHMARK(GeluBackward_RunFusion_CpuOnly)->Unit(benchmark::kMicrosecond);

View File

@ -1,165 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
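// Measures only the heuristics lookup on an already-compiled kernel runtime;
// no kernels are launched inside the timed loop.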
static void LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(aten_inputs).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(aten_inputs);
}
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
static void LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(aten_inputs).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(aten_inputs);
}
}
BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);

View File

@ -1,171 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
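// Same lookup benchmark, except the inputs are first wrapped in a
// KernelArgumentHolder before querying the heuristics.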
static void LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(args);
}
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
static void LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(args);
}
}
BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);

View File

@ -1,316 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
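// Instance norm followed by ReLU; running stats are kept in fp32 even when the
// compute dtype is half.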
static void setupInstanceNorm(
Fusion* fusion,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
auto input = makeContigTensor(4, dtype);
if (channels_last_3d) {
input = makeContigTensor(5, dtype);
}
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto norm = instance_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr,
channels_last_3d);
auto output = unaryOp(UnaryOpType::Relu, norm.output);
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_InstanceNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1)};
std::vector<int64_t> input_shape_3d{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x =
at::randn(channels_last_3d ? input_shape_3d : input_shape, options);
at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options);
at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options);
at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options);
at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options);
std::vector<c10::IValue> aten_inputs = {
at_x, at_weight, at_bias, at_mean, at_var};
std::vector<at::Tensor> outputs;
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
const size_t kChannels = benchmark_state.range(2);
// Read: x, weight, bias, running_mean, running_var
// Write: y, running_mean, running_var
benchmark_state.SetBytesProcessed(
benchmark_state.iterations() *
((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) +
(kChannels * 2 * 2) * dataTypeSize(DataType::Float)));
}
static void Baseline_InstanceNorm(
benchmark::State& benchmark_state,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1)};
std::vector<int64_t> input_shape_3d{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(1),
};
const float kMomentum = 0.1;
const float kEps = 1e-5;
const auto aten_dtype = data_type_to_aten(dtype);
at::manual_seed(0);
auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
if (channels_last_3d) {
at_x = at::randn(
input_shape_3d,
options.memory_format(c10::MemoryFormat::ChannelsLast3d));
}
at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options);
at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options);
at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options);
at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_running_mean = c10::optional<at::Tensor>(at_mean);
auto ato_running_var = c10::optional<at::Tensor>(at_var);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto norm = at::instance_norm(
at_x,
ato_weight,
ato_bias,
ato_running_mean,
ato_running_var,
true,
kMomentum,
kEps,
false);
auto output = at::relu(norm);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t kChannels = benchmark_state.range(2);
// Read: x, weight, bias, running_mean, running_var
// Write: y, running_mean, running_var
benchmark_state.SetBytesProcessed(
benchmark_state.iterations() *
((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) +
(kChannels * 2 * 2) * dataTypeSize(DataType::Float)));
}
//------------------------------------------------------------------------------
static void Baseline_InstanceNorm_fp32(benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Float);
}
static void Baseline_InstanceNorm_fp16(benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Half);
}
static void Baseline_InstanceNorm_fp32_channels_last_3d(
benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Float, true);
}
//------------------------------------------------------------------------------
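// Each NVFUSER_BENCHMARK_DEFINE pairs a fusion setup function with a run
// function; the RUN blocks below sweep {batch, spatial, channels} ranges.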
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm_fp32,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm_fp16,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm3d_channels_last_fp32,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Float,
true);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {128, 128}, {32, 32}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {64, 64}, {64, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {32, 32}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {16, 16}, {256, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {4, 8}, {320, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_InstanceNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {128, 128}, {32, 32}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {64, 64}, {64, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {16, 16}, {256, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {4, 8}, {320, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------

View File

@ -1,240 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
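// Builds a layer_norm fusion over the innermost dimension of a 2D input;
// fp16 inputs are upcast to float for the normalization and cast back on
// output.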
static void setupLayerNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(2, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto layer_norm_results = layer_norm(input, 1, weight, bias, eps_ptr);
auto output = layer_norm_results.output;
if (dtype != DataType::Float) {
output = castOp(dtype, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_LayerNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[1]}, options);
at::Tensor bias = at::randn({input_shape[1]}, options);
std::vector<c10::IValue> aten_inputs({input, weight, bias});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel() + bias.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_LayerNorm(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
const size_t kReductionAxis = 1;
std::vector<int64_t> norm_shape;
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
norm_shape.push_back(input_shape[idx]);
}
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[1]}, options);
at::Tensor bias = at::randn({input_shape[1]}, options);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = at::layer_norm(input, norm_shape, weight, bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel() + bias.numel()) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_LayerNorm_fp32(benchmark::State& benchmark_state) {
Baseline_LayerNorm(benchmark_state, DataType::Float);
}
static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) {
Baseline_LayerNorm(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_fp32,
setupLayerNorm,
NvFuserScheduler_LayerNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_fp16,
setupLayerNorm,
NvFuserScheduler_LayerNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,274 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
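// Builds the layer_norm backward fusion. mean and rstd are per-row fp32
// statistics with shape {N, 1} so they broadcast over the normalized
// dimension.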
static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
// setup fusion
auto grad_out = makeContigTensor(2, dtype);
auto input = makeContigTensor(2, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto mean = TensorViewBuilder()
.contiguity({false, false})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
auto rstd = TensorViewBuilder()
.contiguity({false, false})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
fusion->addInput(grad_out);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(mean);
fusion->addInput(rstd);
if (dtype == DataType::Half) {
grad_out = castOp(DataType::Float, grad_out);
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto layer_norm_results = layer_norm_backward(
grad_out, input, {1}, mean, rstd, weight, bias, {true, true, true});
if (dtype != DataType::Float) {
layer_norm_results.grad_input =
castOp(dtype, layer_norm_results.grad_input);
layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias);
layer_norm_results.grad_weight =
castOp(dtype, layer_norm_results.grad_weight);
}
fusion->addOutput(layer_norm_results.grad_input);
fusion->addOutput(layer_norm_results.grad_bias);
fusion->addOutput(layer_norm_results.grad_weight);
}
static void NvFuserScheduler_LayerNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto maybe_fp16_options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options);
at::Tensor input = at::randn(input_shape, maybe_fp16_options);
at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options);
at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{grad_out, input, weight, bias, mean, rstd});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_LayerNorm_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
const size_t kReductionAxis = 1;
std::vector<int64_t> norm_shape;
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
norm_shape.push_back(input_shape[idx]);
}
// inputs
at::manual_seed(0);
auto maybe_fp16_options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options);
at::Tensor input = at::randn(input_shape, maybe_fp16_options);
at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options);
at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options);
std::array<bool, 3> output_mask = {true, true, true};
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
// Time only the backward kernel; the forward layer_norm is excluded from
// this baseline.
at::native_layer_norm_backward(
grad_out, input, norm_shape, mean, rstd, weight, bias, output_mask);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_LayerNorm_BWD_fp32(benchmark::State& benchmark_state) {
Baseline_LayerNorm_BWD(benchmark_state, DataType::Float);
}
static void Baseline_LayerNorm_BWD_fp16(benchmark::State& benchmark_state) {
Baseline_LayerNorm_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_BWD_fp32,
setupLayerNorm_BWD,
NvFuserScheduler_LayerNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_BWD_fp16,
setupLayerNorm_BWD,
NvFuserScheduler_LayerNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,257 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// TODO: add LSTM function to composite operations
// Function Signature: cy, hy = lstm(x, cx)
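// The 16 inputs are the four pre-chunked addends of each of the four LSTM
// gates (input, forget, cell, output); cx is the previous cell state.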
static void setupFusion(Fusion* fusion) {
FusionGuard fg(fusion);
TensorView* tvs[16];
for (size_t i = 0; i < 16; i++) {
tvs[i] = makeContigTensor(2, DataType::Float);
fusion->addInput(tvs[i]);
}
const auto cx = makeContigTensor(2, DataType::Float);
fusion->addInput(cx);
const auto in_x = add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]);
const auto forget_x = add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]);
const auto cell_x = add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]);
const auto out_x = add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]);
auto lstm_result = lstm(cx, in_x, forget_x, cell_x, out_x);
fusion->addOutput(lstm_result.cell);
fusion->addOutput(lstm_result.hidden);
}
static std::vector<c10::IValue> setupInputs(
int hidden_features,
int batch_size) {
at::manual_seed(0);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
const at::Tensor large_tensor0 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor1 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor2 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor3 =
at::randn({batch_size, hidden_features * 4}, options);
const auto chunked0 = large_tensor0.chunk(4, 1);
const auto chunked1 = large_tensor1.chunk(4, 1);
const auto chunked2 = large_tensor2.chunk(4, 1);
const auto chunked3 = large_tensor3.chunk(4, 1);
std::vector<c10::IValue> inputs;
inputs.insert(inputs.end(), chunked0.begin(), chunked0.end());
inputs.insert(inputs.end(), chunked1.begin(), chunked1.end());
inputs.insert(inputs.end(), chunked2.begin(), chunked2.end());
inputs.insert(inputs.end(), chunked3.begin(), chunked3.end());
const auto at_cx = at::randn({batch_size, hidden_features}, options);
inputs.push_back(at_cx);
return inputs;
}
//------------------------------------------------------------------------------
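// The benchmarks below time individual stages of the pipeline: fusion IR
// construction, pointwise auto-scheduling, lowering, compilation, and
// end-to-end execution (with GPU-only and CPU-only variants).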
static void LstmCell_SetupFusion(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
Fusion fusion;
setupFusion(&fusion);
}
}
BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Fusion fusion;
setupFusion(&fusion);
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
benchmark_state.ResumeTiming();
// Auto-schedule
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
}
}
BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_Lower(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
GpuLower gpu_lower(&fusion);
}
}
BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_Compile(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion);
}
}
BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_GpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setMeasureKernelTimeFlag(true);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
clearL2Cache();
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
benchmark_state.SetIterationTime(executor.kernelTimeMs() / 1000.0);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_CpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);

View File

@ -1,3 +0,0 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@ -1,357 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/matmul.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
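// Guards to skip benchmarks on devices that lack the required compute
// capability or opt-in shared memory size.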
bool cudaArchGuardShouldSkip(int required_major, int required_minor) {
int capability_major = at::cuda::getCurrentDeviceProperties()->major;
int capability_minor = at::cuda::getCurrentDeviceProperties()->minor;
if (capability_major < required_major ||
(capability_major == required_major &&
capability_minor < required_minor)) {
return true;
}
return false;
}
bool hasRequiredSmemSize(size_t required_size) {
// Only checking device 0
return at::cuda::getDeviceProperties(0)->sharedMemPerBlockOptin >=
required_size;
}
#define NVFUSER_BENCHMARK_ARCH_SMEM_GUARD( \
REQUIRED_MAJOR, REQUIRED_MINOR, SMEM_SIZE, STATE) \
if (cudaArchGuardShouldSkip(REQUIRED_MAJOR, REQUIRED_MINOR) || \
!hasRequiredSmemSize(SMEM_SIZE)) { \
STATE.SkipWithError("Unsupported arch or not enough smem!"); \
return; \
}
// Utility to track supported matmul operand layouts.
using MatmulLayout = MmaOptions::MmaInputLayout;
static constexpr std::array<MatmulLayout, 3> kAllSupportedLayout = {
MatmulLayout::TT,
MatmulLayout::NT,
MatmulLayout::TN};
// Generic interface to get matmul op with the given layout.
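// Layouts describe the operand shapes: TT takes A[M,K] and B[K,N],
// TN takes A[M,K] and B[N,K], NT takes A[K,M] and B[K,N]
// (see atMatmul and fp16MatmulAtInput below).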
TensorView* matmul(TensorView* a, TensorView* b, MatmulLayout layout) {
TORCH_CHECK(
a->nDims() == 2 && b->nDims() == 2, "only pure matmuls for these tests");
TensorView *tv2 = nullptr, *tv0b = nullptr, *tv1b = nullptr;
switch (layout) {
case MatmulLayout::TT:
tv0b = broadcast(a, {false, false, true});
tv1b = broadcast(b, {true, false, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {1});
break;
case MatmulLayout::TN:
tv0b = broadcast(a, {false, true, false});
tv1b = broadcast(b, {true, false, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {2});
break;
case MatmulLayout::NT:
tv0b = broadcast(a, {false, false, true});
tv1b = broadcast(b, {false, true, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {0});
break;
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return tv2;
}
// Utility to compute the eager-mode (ATen) reference matmul for the given layout
at::Tensor atMatmul(at::Tensor a, at::Tensor b, MatmulLayout layout) {
switch (layout) {
case MatmulLayout::TT:
return a.matmul(b);
case MatmulLayout::TN:
return a.matmul(b.t());
case MatmulLayout::NT:
return a.t().matmul(b);
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return at::Tensor();
}
// Utility to generate fp16 matmul input tensors for the given layout
std::pair<at::Tensor, at::Tensor> fp16MatmulAtInput(
int M,
int N,
int K,
MatmulLayout layout) {
auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
switch (layout) {
case MatmulLayout::TT:
return std::make_pair(
at::randn({M, K}, options), at::randn({K, N}, options));
case MatmulLayout::TN:
return std::make_pair(
at::randn({M, K}, options), at::randn({N, K}, options));
case MatmulLayout::NT:
return std::make_pair(
at::randn({K, M}, options), at::randn({K, N}, options));
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return std::make_pair(at::Tensor(), at::Tensor());
}
// TODO: separate compute and schedule definition once the can-schedule
// logic and pattern matching are ready.
void setupMatmul(Fusion* fusion, MatmulLayout layout, MatmulParam params) {
// Only hgemm on the initial setup
auto a = makeContigTensor(2, DataType::Half);
auto b = makeContigTensor(2, DataType::Half);
auto c = matmul(a, b, layout);
fusion->addInput(a);
fusion->addInput(b);
fusion->addOutput(c);
scheduleMatmul(c, a, b, params);
}
static void SingleMatmulBase(
benchmark::State& benchmark_state,
MatmulLayout layout,
MatmulParam params) {
std::vector<int64_t> input_mnk{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
// Define fusion graph
setupMatmul(fusion, layout, params);
// inputs
at::manual_seed(0);
// Tensor inputs
auto inputs = fp16MatmulAtInput(
input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
{inputs.first, inputs.second});
// Always use 32b indexing mode for now.
TORCH_INTERNAL_ASSERT(args.getIndexMode() == KernelIndexMode::INT32);
// Compile kernel
FusionExecutor fe;
fe.compileFusion(fusion, args, LaunchParams());
// Warm up run
auto outputs = fe.runFusion({inputs.first, inputs.second});
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion({inputs.first, inputs.second});
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
cudaDeviceSynchronize();
// TODO: FLOPS calculation
}
static void EagerModeMatmul(
benchmark::State& benchmark_state,
MatmulLayout layout) {
std::vector<int64_t> input_mnk{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto inputs = fp16MatmulAtInput(
input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
// warm up run
auto outputs = atMatmul(inputs.first, inputs.second, layout);
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
outputs = atMatmul(inputs.first, inputs.second, layout);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
cudaDeviceSynchronize();
}
// Actual benchmarking
// -----------------------------------------------------------------
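// Shared memory estimate: one CTA tile of A plus one of B in half precision,
// multiplied by the number of pipeline stages (double-buffer depth).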
size_t getSmemSize(GemmTile cta_tile, int stage_number) {
return ((cta_tile.m * cta_tile.k) + (cta_tile.n * cta_tile.k)) *
dataTypeSize(DataType::Half) * stage_number;
}
// TODO: this part eventually will be automated by heuristics
MatmulParam getMatmulParams(
GemmTile cta_tile,
int stage_number,
MatmulLayout layout) {
MatMulTileOptions gemm_tile;
gemm_tile.cta_tile = cta_tile;
// TODO: pipe through split K
gemm_tile.warp_tile = GemmTile(64, 64, cta_tile.k);
gemm_tile.instruction_tile = GemmTile(16, 16, 16);
// Collect mma swizzle info
auto mma_builder =
MmaBuilder(MmaOptions::MacroType::Ampere_16_16_16, gemm_tile)
.layout(layout);
MatmulParam params(mma_builder);
params.tile_sizes = gemm_tile;
params.async_gmem_load_operands = true;
params.double_buffer_options.double_buffer_smem_write = true;
params.double_buffer_options.double_buffer_smem_read = true;
params.double_buffer_options.smem_double_buffer_stage = stage_number;
return params;
}
static void Nvfuser_Matmul_4warp3stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(128, 128, 32);
int number_of_stage = 3;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_8warp3stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(256, 128, 32);
int number_of_stage = 3;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_4warp4stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(128, 128, 32);
int number_of_stage = 4;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_8warp4stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(256, 128, 32);
int number_of_stage = 4;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
// ----------------------------- Benchmark Instantiation-------
// Common utils:
#define NO_TILE_QUANTIZATION_ARGS \
ArgsProduct( \
{{2048}, {3456}, benchmark::CreateDenseRange(512, 4096, /*step=*/512)}) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime();
#define ForAllLayouts(run) \
run(TT, MatmulLayout::TT); \
run(TN, MatmulLayout::TN); \
run(NT, MatmulLayout::NT)
// Instantiations:
#define Nvfuser_4warp3stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_4warp3stage, \
no_quant_nvfuser_4warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_8warp3stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_8warp3stage, \
no_quant_nvfuser_8warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_4warp4stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_4warp4stage, \
no_quant_nvfuser_4warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_8warp4stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_8warp4stage, \
no_quant_nvfuser_8warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Eagermode_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \
->NO_TILE_QUANTIZATION_ARGS
ForAllLayouts(Nvfuser_4warp3stage_test);
ForAllLayouts(Nvfuser_4warp4stage_test);
ForAllLayouts(Nvfuser_8warp3stage_test);
ForAllLayouts(Nvfuser_8warp4stage_test);
ForAllLayouts(Eagermode_test);

View File

@ -1,384 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build a sum reduction over one axis of a 2D tensor; fp16 inputs are cast
// to float for the reduction and the result is cast back to fp16.
static void setupReduction(Fusion* fusion, DataType dtype, int red_axis) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(2, dtype);
fusion->addInput(tv0);
TensorView* tv0_cast = tv0;
if (is_fp16) {
tv0_cast = castOp(DataType::Float, tv0);
}
TensorView* tv1 = sum(tv0_cast, {red_axis});
TensorView* tv1_cast = tv1;
if (is_fp16) {
tv1_cast = castOp(DataType::Half, tv1);
}
fusion->addOutput(tv1_cast);
}
static void NvFuserScheduler_Reduction(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input =
(reduction_dim ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs({aten_input});
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto rparams = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(rparams + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs({aten_input});
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * reduction_size + iter_size) * int64_t(dataTypeSize(dtype)));
}
static void Baseline_Reduction(
benchmark::State& benchmark_state,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input =
(reduction_dim ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
// Sync everything up before we start
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = aten_input.sum({reduction_dim});
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * reduction_size + iter_size) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_Reduction_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Float, 0);
}
static void Baseline_Reduction_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Half, 0);
}
static void Baseline_Reduction_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Float, 1);
}
static void Baseline_Reduction_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Outer_fp32,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Outer_fp16,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Inner_fp32,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Inner_fp16,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1024, 1024 * 512}, {2, 4 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 4 * 1024}, {1024, 1024 * 512}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1024, 1024 * 1024}, {2, 4 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 4 * 1024}, {1024, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,170 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
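// Builds an rms_norm fusion over the hidden (innermost) dimension;
// fp16 inputs are upcast to float for the normalization.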
static void setupRMSNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
FusionGuard fg(fusion);
const float kEps = 1e-6;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(3, dtype);
auto weight = makeContigTensor(1, dtype);
fusion->addInput(input);
fusion->addInput(weight);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
}
auto rms_norm_results = rms_norm(input, 1, weight, eps_ptr);
auto output = rms_norm_results.output;
if (dtype != DataType::Float) {
output = castOp(dtype, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_RMSNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[2]}, options);
std::vector<c10::IValue> aten_inputs({input, weight});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel()) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_fp32,
setupRMSNorm,
NvFuserScheduler_RMSNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{18, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{22, 44}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_fp16,
setupRMSNorm,
NvFuserScheduler_RMSNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{18, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{22, 44}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// TODO: Automatically disable/enable if bf16 is supported
// NVFUSER_BENCHMARK_DEFINE(
// NvFuserScheduler_RMSNorm_bf16,
// setupRMSNorm,
// NvFuserScheduler_RMSNorm,
// DataType::BFloat16);
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{16, 64}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{18, 56}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{22, 44}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{24, 48}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();

View File

@ -1,163 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
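// Builds the rms_norm backward fusion. rstd holds the saved per-(batch,
// sequence) normalization statistic with a trailing broadcast dimension of 1.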
static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
// setup fusion
auto grad_out = makeContigTensor(3, dtype);
auto input = makeContigTensor(3, dtype);
auto weight = makeContigTensor(1, dtype);
auto rstd = TensorViewBuilder()
.contiguity({false, false, false})
.shape({-1, -1, 1})
.dtype(dtype)
.build();
fusion->addInput(grad_out);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(rstd);
if (dtype == DataType::Half) {
grad_out = castOp(DataType::Float, grad_out);
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
rstd = castOp(DataType::Float, rstd);
}
auto rms_norm_results =
rms_norm_backward(grad_out, input, {1}, rstd, weight, {true, true, true});
if (dtype != DataType::Float) {
rms_norm_results.grad_input = castOp(dtype, rms_norm_results.grad_input);
rms_norm_results.grad_weight = castOp(dtype, rms_norm_results.grad_weight);
}
fusion->addOutput(rms_norm_results.grad_input);
fusion->addOutput(rms_norm_results.grad_weight);
}
static void NvFuserScheduler_RMSNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[2]}, options);
at::Tensor rstd = at::randn({input_shape[0], input_shape[1], 1}, options);
std::vector<c10::IValue> aten_inputs({grad_out, input, weight, rstd});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_BWD_fp32,
setupRMSNorm_BWD,
NvFuserScheduler_RMSNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_BWD_fp16,
setupRMSNorm_BWD,
NvFuserScheduler_RMSNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// TODO: Automatically disable/enable if bf16 is supported
// NVFUSER_BENCHMARK_DEFINE(
// NvFuserScheduler_RMSNorm_BWD_bf16,
// setupRMSNorm_BWD,
// NvFuserScheduler_RMSNorm_BWD,
// DataType::BFloat16);
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{16, 64}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{28, 56}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{24, 48}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();


@@ -1,406 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setupSBR(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const size_t kNumberOfDims = 4;
std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
bcast_shape[bcast_shape.size() - 1] = -1;
std::vector<bool> bcast_contig(kNumberOfDims, false);
bcast_contig[bcast_contig.size() - 1] = true;
auto x = makeContigTensor(kNumberOfDims, dtype);
auto scale = TensorViewBuilder()
.contiguity(bcast_contig)
.shape(bcast_shape)
.dtype(dtype)
.build();
auto bias = TensorViewBuilder()
.contiguity(bcast_contig)
.shape(bcast_shape)
.dtype(dtype)
.build();
fusion->addInput(x);
fusion->addInput(scale);
fusion->addInput(bias);
if (dtype == DataType::Half) {
x = castOp(DataType::Float, x);
scale = castOp(DataType::Float, scale);
bias = castOp(DataType::Float, bias);
}
auto scale_bias = add(mul(x, scale), bias);
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
if (dtype == DataType::Half) {
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
}
fusion->addOutput(scale_bias_relu);
}
static void setupSBRNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const size_t kNumberOfDims = 4;
auto x = makeContigTensor(kNumberOfDims, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto mean = makeContigTensor(1, dtype);
auto var = makeContigTensor(1, dtype);
fusion->addInput(x);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(mean);
fusion->addInput(var);
std::vector<bool> broadcast_mask(kNumberOfDims, true);
broadcast_mask[broadcast_mask.size() - 1] = false;
if (dtype == DataType::Half) {
x = castOp(DataType::Float, x);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
mean = castOp(DataType::Float, mean);
var = castOp(DataType::Float, var);
}
auto rsqrt = unaryOp(UnaryOpType::Rsqrt, var);
auto this_scale = mul(weight, rsqrt);
auto this_bias = mul(sub(bias, mean), this_scale);
auto bcast_scale = broadcast(this_scale, broadcast_mask);
auto bcast_bias = broadcast(this_bias, broadcast_mask);
auto scale_bias = add(mul(x, bcast_scale), bcast_bias);
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
if (dtype == DataType::Half) {
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
}
fusion->addOutput(scale_bias_relu);
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_SBR(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{1, 1, 1, -1};
// inputs
at::manual_seed(0);
std::vector<int64_t> static_bcast_shape{1, 1, 1, benchmark_state.range(2)};
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_scale = at::ones(static_bcast_shape, options);
at::Tensor at_bias = at::zeros(static_bcast_shape, options);
// inputs
std::vector<c10::IValue> aten_inputs = {at_x, at_scale, at_bias};
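// One profiled warm-up run captures the scheduler and launch parameters, which
// are attached to the benchmark label below.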
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_y = at::randn(input_shape, options);
at::Tensor at_scale = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto scale = at::mul(at_x, at_scale);
auto bias = at::add(scale, at_bias);
auto output = at::relu(bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_SBR_Norm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
at::Tensor at_mean = at::zeros(bcast_shape, options);
at::Tensor at_var = at::ones(bcast_shape, options);
// inputs
std::vector<c10::IValue> aten_inputs = {
at_x, at_weight, at_bias, at_mean, at_var};
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_SBR_Norm(
benchmark::State& benchmark_state,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{1, 1, 1, benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
at::Tensor at_mean = at::zeros(bcast_shape, options);
at::Tensor at_var = at::ones(bcast_shape, options);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
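// Eager-mode reference: rebuild scale = weight * rsqrt(var) and
// shift = (bias - mean) * scale on every iteration, then apply them and ReLU.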
auto this_scale = at::mul(at_weight, at::rsqrt(at_var));
auto this_bias = at::mul(at::sub(at_bias, at_mean), this_scale);
auto scale = at::mul(at_x, this_scale);
auto bias = at::add(scale, this_bias);
auto output = at::relu(bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_fp32,
setupSBR,
NvFuserScheduler_SBR,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_fp16,
setupSBR,
NvFuserScheduler_SBR,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_Norm_fp32,
setupSBRNorm,
NvFuserScheduler_SBR_Norm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_Norm_fp16,
setupSBRNorm,
NvFuserScheduler_SBR_Norm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_SBR_fp32(benchmark::State& benchmark_state) {
Baseline_SBR(benchmark_state, DataType::Float);
}
BENCHMARK(Baseline_SBR_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void Baseline_SBR_fp16(benchmark::State& benchmark_state) {
Baseline_SBR(benchmark_state, DataType::Half);
}
BENCHMARK(Baseline_SBR_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_SBR_Norm_fp32(benchmark::State& benchmark_state) {
Baseline_SBR_Norm(benchmark_state, DataType::Float);
}
BENCHMARK(Baseline_SBR_Norm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void Baseline_SBR_Norm_fp16(benchmark::State& benchmark_state) {
Baseline_SBR_Norm(benchmark_state, DataType::Half);
}
BENCHMARK(Baseline_SBR_Norm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,211 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
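// mean/rstd keep the outer dimensions and use size-1 trailing dims so they
// broadcast over the normalized axes.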
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
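// Host-side benchmark: kernel launches are disabled below, so the timed loop
// measures shape inference and heuristics lookup (plus launch-parameter
// recomputation when that cache is disabled) rather than GPU execution.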
void LayerNormBackward_ShapeInference_Base(
benchmark::State& benchmark_state,
bool disable_launch_parameter_cache) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
fec->profile(true);
fec->disableKernelLaunch();
fec->runFusionWithInputs(aten_inputs);
if (disable_launch_parameter_cache) {
fec->disableLaunchParamCache();
}
for (auto _ : benchmark_state) {
// Only host-side work runs here; kernel launch was disabled above.
fec->runFusionWithInputs(aten_inputs);
}
}
static void LayerNormBackward_ShapeInference(
benchmark::State& benchmark_state) {
LayerNormBackward_ShapeInference_Base(benchmark_state, true);
}
static void LayerNormBackward_NoShapeInferenceCachedBaseline(
benchmark::State& benchmark_state) {
LayerNormBackward_ShapeInference_Base(benchmark_state, false);
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
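// Forward-pass counterpart of the shape-inference benchmark above; kernel
// launches are again disabled so only host-side work is timed.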
void LayerNormForward_ShapeInferenceBase(
benchmark::State& benchmark_state,
bool disable_launch_param_cache) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
fec->profile(true);
fec->disableKernelLaunch();
fec->runFusionWithInputs(aten_inputs);
if (disable_launch_param_cache) {
fec->disableLaunchParamCache();
}
for (auto _ : benchmark_state) {
// Only host-side work runs here; kernel launch was disabled above.
fec->runFusionWithInputs(aten_inputs);
}
}
static void LayerNormForward_NoShapeInferenceCachedBaseline(
benchmark::State& benchmark_state) {
LayerNormForward_ShapeInferenceBase(benchmark_state, false);
}
static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
LayerNormForward_ShapeInferenceBase(benchmark_state, true);
}
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);


@@ -1,454 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupSoftmax(
Fusion* fusion,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
// setup fusion
auto input = makeContigTensor(2, dtype);
fusion->addInput(input);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
}
auto output = softmax(input, reduction_axis);
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_Softmax(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor aten_input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
std::vector<c10::IValue> aten_inputs({aten_input});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
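// The 2x factor below accounts for one read of the input and one write of the
// same-sized output per element.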
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
// Warp softmax comparison
static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) {
auto dtype = DataType::Float;
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
setupSoftmax(fusion, dtype, 1);
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs({aten_input});
// Schedule through magic scheduler:
SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true);
TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule(
ScheduleHeuristic::Persistent, fusion, runtime_info));
auto scheduler = SchedulerEntry::makeEntry(
ScheduleHeuristic::Persistent, fusion, runtime_info);
scheduler->schedule(fusion);
FusionExecutor fe;
fe.compileFusion(fusion);
auto outputs = fe.runFusion(aten_inputs);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion(aten_inputs);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Softmax_WarpReduce(benchmark::State& benchmark_state) {
auto dtype = DataType::Float;
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
setupSoftmax(fusion, dtype, 1);
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs({aten_input});
// Schedule through magic scheduler:
SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true);
TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule(
ScheduleHeuristic::Persistent, fusion, runtime_info));
auto scheduler = SchedulerEntry::makeEntry(
ScheduleHeuristic::Persistent, fusion, runtime_info);
scheduler->schedule(fusion);
// Modify the schedule to use warp reduction
auto used_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
for (IterDomain* id : tv->domain()->domain()) {
if (id->getParallelType() == ParallelType::TIDx) {
id->padToMultipleOfWarp();
}
}
}
FusionExecutor fe;
fe.compileFusion(fusion);
auto outputs = fe.runFusion(aten_inputs);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion(aten_inputs);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
BENCHMARK(Softmax_WarpReduce)
->RangeMultiplier(2)
->Ranges({{8, 8}, {16 * 197, 16 * 197}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Softmax_WarpReduceReference)
->RangeMultiplier(2)
->Ranges({{8, 8}, {16 * 197, 16 * 197}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_Softmax(
benchmark::State& benchmark_state,
DataType dtype,
const int reduction_axis) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor aten_input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto output = at::_softmax(aten_input, reduction_axis, false);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Baseline_Softmax_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Float, 0);
}
static void Baseline_Softmax_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Float, 1);
}
static void Baseline_Softmax_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Half, 0);
}
static void Baseline_Softmax_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Outer_fp32,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Inner_fp32,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Outer_fp16,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Inner_fp16,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,364 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupSoftmaxBWD(
Fusion* fusion,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
// setup fusion
auto grad_output = makeContigTensor(2, dtype);
auto output = makeContigTensor(2, dtype);
auto input = makeContigTensor(2, dtype);
fusion->addInput(grad_output);
fusion->addInput(output);
fusion->addInput(input);
if (dtype == DataType::Half) {
grad_output = castOp(DataType::Float, grad_output);
output = castOp(DataType::Float, output);
input = castOp(DataType::Float, input);
}
auto grad_input = softmax_backward(grad_output, output, reduction_axis);
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
}
fusion->addOutput(grad_input);
}
static void NvFuserScheduler_Softmax_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor grad_output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
std::vector<c10::IValue> aten_inputs({grad_output, output, input});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_BWD(
benchmark::State& benchmark_state,
DataType dtype,
const int reduction_axis) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor grad_output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto grad_input = at::_softmax_backward_data(
grad_output, output, reduction_axis, data_type_to_aten(dtype));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Baseline_Softmax_BWD_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Float, 0);
}
static void Baseline_Softmax_BWD_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Float, 1);
}
static void Baseline_Softmax_BWD_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Half, 0);
}
static void Baseline_Softmax_BWD_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Outer_fp32,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Inner_fp32,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Outer_fp16,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Inner_fp16,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,377 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
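// Scaled, masked softmax followed by dropout, matching the attention pattern
// scores / sqrt(head_size) + mask -> softmax -> dropout; the 768/12 constants
// correspond to a BERT-base-sized attention block.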
static void setupSoftmaxDropout(
Fusion* fusion,
DataType dtype,
const int kReductionAxis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
constexpr float kDropoutProbability = 0.9;
constexpr float kScale = 1.0f / kDropoutProbability;
// setup fusion
auto attention_scores = makeContigTensor(4, dtype);
auto attention_mask = makeContigTensor(4, dtype);
Double* divisor = IrBuilder::create<Double>();
fusion->addInput(attention_scores);
fusion->addInput(attention_mask);
fusion->addInput(divisor);
if (dtype == DataType::Half) {
attention_scores = castOp(DataType::Float, attention_scores);
attention_mask = castOp(DataType::Float, attention_mask);
}
attention_scores = div(attention_scores, divisor);
attention_scores = add(attention_scores, attention_mask);
auto attention_probs = softmax(attention_scores, kReductionAxis);
auto prob = IrBuilder::create<Double>(kDropoutProbability);
auto scale = IrBuilder::create<Double>(kScale);
auto dropout_results = dropout(attention_probs, prob, scale);
auto output = dropout_results.output;
if (dtype == DataType::Half) {
attention_scores = castOp(DataType::Half, attention_scores);
attention_probs = castOp(DataType::Half, attention_probs);
output = castOp(DataType::Half, output);
}
fusion->addOutput(attention_scores);
fusion->addOutput(attention_probs);
fusion->addOutput(output);
fusion->addOutput(dropout_results.mask);
}
static void NvFuserScheduler_SoftmaxDropout(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int kReductionAxis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
// reduce across 1, [256, 12, 100, 8]
std::vector<int64_t> input_shape{256, 12, 100, benchmark_state.range(0)};
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
constexpr float kDropoutProbability = 0.9;
constexpr float kScale = 1.0f / kDropoutProbability;
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_scores = at::randn(input_shape, options);
at::Tensor at_mask = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs(
{at_scores, at_mask, sqrt(kAttentionHeadSize)});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// 5 dtype: attention_scores + attention_mask + attention_scores_out +
// attention_probs_out + output
// 1 bool: dropout_results.mask
// All the same size
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * 5 * at_scores.numel() *
int64_t(dataTypeSize(dtype)) +
// bool mask
int64_t(benchmark_state.iterations()) * at_scores.numel() *
int64_t(dataTypeSize(DataType::Bool)));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_Dropout(
benchmark::State& benchmark_state,
const int kReductionAxis,
DataType dtype) {
std::vector<int64_t> input_shape{256, 12, 100, benchmark_state.range(0)};
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr float kDropoutProbability = 0.1;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor attention_scores = at::randn(input_shape, options);
at::Tensor at_y = at::randn(input_shape, options);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
attention_scores = attention_scores / sqrt(kAttentionHeadSize);
attention_scores = attention_scores + at_y;
auto attention_probs =
at::_softmax(attention_scores, kReductionAxis, false);
attention_probs = at::dropout(attention_probs, kDropoutProbability, true);
// Record
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
// 5 dtype: attention_scores + attention_mask + attention_scores_out +
// attention_probs_out + output
// 1 bool: dropout_results.mask
// All the same size
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * 5 * attention_scores.numel() *
int64_t(dataTypeSize(dtype)) +
// bool mask
int64_t(benchmark_state.iterations()) * attention_scores.numel() *
int64_t(dataTypeSize(DataType::Bool)));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_Dropout_Inner_fp32(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 3, DataType::Float);
}
static void Baseline_Softmax_Dropout_Outer_fp32(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 1, DataType::Float);
}
static void Baseline_Softmax_Dropout_Inner_fp16(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 3, DataType::Half);
}
static void Baseline_Softmax_Dropout_Outer_fp16(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 1, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Inner_fp32,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Float,
3);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Inner_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Outer_fp32,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Float,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Outer_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Inner_fp16,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Half,
3);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Inner_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Outer_fp16,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Outer_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Dropout_Inner_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Dropout_Outer_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Dropout_Inner_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Dropout_Outer_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,738 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(3, DataType::Float);
auto t3 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
auto t4 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
auto t7 = makeContigTensor(3, DataType::Half);
fusion->addInput(t2);
fusion->addInput(t3);
fusion->addInput(t4);
fusion->addInput(t7);
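// t3/t4 are presumably the saved mean and rstd: t10/t11 recompute the
// normalized input, and the reductions below produce layer-norm-backward-style
// grad_weight (sum(dy * x_hat)) and grad_bias (sum(dy)) over the outer dims.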
auto t8 = castOp(DataType::Float, t7);
auto t9 = set(t8);
auto t10 = sub(t2, t3);
auto t11 = mul(t10, t4);
auto t25 = mul(t9, t11);
auto t26 = sum(t25, {0, 1});
auto t36 = set(t26);
auto t27 = sum(t9, {0, 1});
auto t37 = set(t27);
auto t39 = castOp(DataType::Half, t11);
fusion->addOutput(t36);
fusion->addOutput(t37);
fusion->addOutput(t39);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp32_options);
auto t3 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t4 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t7 = at::randn(input_shape, fp16_options);
std::vector<c10::IValue> aten_inputs({t2, t3, t4, t7});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensor - float + halfx2 - t2, t7, t39
// Inner most dimension only - floatx2 - t36, t37
// Outer two dimensions only - floatx2 - t3, t4
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
// t2 + t7 t3 + t4 t36 + t37
t2.numel() * (4 + 2) + t3.numel() * 4 * 2 + input_shape[2] * (4 * 2) +
// T39
t2.numel() * 2);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7,
setup_vit_base_patch16_224_bcast7,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7,
nullptr);
// pwise case, broadcasting both sides
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7)
->Args({64, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
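// The next fusion reproduces a bias-add + dropout + residual-add + LayerNorm
// segment, apparently traced from TIMM's vit_base_patch16_224; tensor numbering
// follows the original trace.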
static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(3, DataType::Float);
auto t5 = makeContigTensor(1, DataType::Float);
auto t3 = makeContigTensor(3, DataType::Half);
auto t0 = makeContigTensor(1, DataType::Float);
auto t1 = makeContigTensor(1, DataType::Float);
fusion->addInput(t2);
fusion->addInput(t5);
fusion->addInput(t3);
fusion->addInput(t0);
fusion->addInput(t1);
std::vector<bool> bcast_pattern0({true, true, false});
std::vector<bool> bcast_pattern1({false, false, true});
auto t4 = castOp(DataType::Float, t3);
auto t6 = set(t5);
auto t7 = broadcast(t6, bcast_pattern0);
auto t8 = add(t4, t7);
auto t9 = rand_like(t8);
auto d34 =
sub(IrBuilder::create<Double>(1.0), IrBuilder::create<Double>(0.0));
auto t10 = lt(t9, d34);
auto t11 = castOp(DataType::Float, t10);
auto t12 = mul(t8, t11);
auto b36 = eq(d34, IrBuilder::create<Double>(0.0));
auto d37 = castOp(DataType::Double, b36);
auto d38 = add(d37, d34);
auto d40 = div(IrBuilder::create<Double>(1.0), d38);
auto t13 = mul(t12, d40);
auto t14 = set(t13);
auto t15 = add(t2, t14);
auto t16 = set(t15);
auto t36 = sum(t16, {2});
auto d151 = castOp(DataType::Double, t2->axis(2)->extent());
auto d152 = mul(IrBuilder::create<Double>(1.0), d151);
auto t19 = div(t36, d152);
auto t22 = broadcast(t19, bcast_pattern1);
auto t23 = sub(t16, t22);
auto t37 = mul(t23, t23);
auto t20 = sum(t37, {2});
auto t24 = broadcast(t20, bcast_pattern1);
auto d95 = castOp(DataType::Double, t2->axis(2)->extent());
auto d105 = reciprocal(d95);
auto t25 = mul(t24, d105);
auto t26 = add(t25, IrBuilder::create<Double>(1e-6));
auto t27 = rsqrt(t26);
auto t28 = mul(t23, t27);
auto t17 = set(t1);
auto t29 = broadcast(t17, bcast_pattern0);
auto t30 = mul(t28, t29);
auto t18 = set(t0);
auto t31 = broadcast(t18, bcast_pattern0);
auto t32 = add(t30, t31);
auto t33 = set(t32);
auto t34 = castOp(DataType::Half, t33);
fusion->addOutput(t16); // full 3d float
fusion->addOutput(t10); // full 3d bool
fusion->addOutput(t22); // bcast last dim float
fusion->addOutput(t27); // bcast last dim float
fusion->addOutput(t18); // passthrough t0 float
fusion->addOutput(t17); // passthrough t1 float
fusion->addOutput(t34); // full 3d half
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp32_options);
auto t5 = at::randn({input_shape[2]}, fp32_options);
auto t3 = at::randn(input_shape, fp16_options);
auto t0 = at::randn({input_shape[2]}, fp32_options);
auto t1 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t2, t5, t3, t0, t1});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
  // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t10
// Inner most dim only - floatx5 - t5, t0, t1, t7, t17
// Outer two dims only - floatx2 - t22, t27
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t2.numel() * (2 * 4 + 2 * 2 + 1) + t5.numel() * 5 * 4 +
       input_shape[0] * input_shape[1] * 2 * 4));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW,
setup_vit_base_patch16_224_bcast5,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5,
nullptr);
// Broadcast on both sides
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW)
->Args({64, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_bcast_outer2(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(1, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {true, true, false});
auto t5 = add(t1, t4);
auto t6 = castOp(DataType::Half, t5);
auto t7 = castOp(DataType::Half, t3);
fusion->addOutput(t6);
fusion->addOutput(t7);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensor - halfx2 - t0, t6
  // inner dimension only - float + half - t2, t7
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2) + input_shape[2] * (2 + 4)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2,
setup_vit_base_patch16_224_bcast_outer2,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2)
->Args({64, 197, 2304})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(4, DataType::Half);
fusion->addInput(t0);
auto d13 = IrBuilder::create<Double>();
fusion->addInput(d13);
auto t1 = castOp(DataType::Float, t0);
auto t2 = set(t1);
auto t3 = mul(t2, d13);
auto t4 = set(t3);
auto t5 = max(t4, {3});
auto t6 = broadcast(t5, {false, false, false, true});
auto t7 = sub(t4, t6);
auto t8 = exp(t7);
auto t9 = sum(t8, {3});
auto t10 = broadcast(t9, {false, false, false, true});
auto t11 = reciprocal(t10);
auto t12 = mul(t8, t11);
auto t13 = rand_like(t12);
auto d79 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t14 = lt(t13, d79);
auto t15 = castOp(DataType::Float, t14);
auto b81 = eq(d79, IrBuilder::create<Double>(0));
auto d82 = castOp(DataType::Double, b81);
auto d83 = add(d82, d79);
auto d85 = div(IrBuilder::create<Double>(1), d83);
auto t16 = mul(t12, t15);
auto t17 = mul(t16, d85);
auto t18 = set(t17);
auto t19 = castOp(DataType::Half, t18);
fusion->addOutput(t19);
fusion->addOutput(t14);
fusion->addOutput(t12);
fusion->addOutput(t4);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
std::vector<c10::IValue> aten_inputs({t0, 0.125});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * t0.numel() * 13);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3,
setup_vit_base_patch16_224_norm_inner3,
NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3,
nullptr);
// Norm inner dim
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3)
->Args({64, 12, 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_bcast_outer6(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(1, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {true, true, false});
auto t5 = add(t1, t4);
auto t6 = set(t5);
auto t7 = mul(t6, IrBuilder::create<Double>(0.707106));
auto t8 = erf(t7);
auto t9 = add(IrBuilder::create<Double>(1), t8);
auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
auto t11 = mul(t6, t10);
auto t12 = rand_like(t11);
auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t13 = lt(t12, d66);
auto t14 = castOp(DataType::Float, t13);
auto t15 = mul(t11, t14);
auto b68 = eq(d66, IrBuilder::create<Double>(0));
auto d69 = castOp(DataType::Double, b68);
auto d70 = add(d69, d66);
auto d72 = div(IrBuilder::create<Double>(1), d70);
auto t16 = mul(t15, d72);
auto t17 = set(t16);
auto t18 = castOp(DataType::Half, t17);
auto t19 = castOp(DataType::Half, t3);
fusion->addOutput(t18);
fusion->addOutput(t13);
fusion->addOutput(t6);
fusion->addOutput(t19);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensors - float, halfx2, bool - t6, t0, t18, t13
// inner dimension only - float, half - t2, t19
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2 + 1 + 4) + input_shape[2] * (4 + 2)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6,
setup_vit_base_patch16_224_bcast_outer6,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6)
// First size is original, the rest are variations to check perf
// reliability.
->Args({64, 197, 3 * 1024})
->Args({64, 197, 2 * 1024})
->Args({64, 197, 1024})
->Args({64, 197, 512})
->Args({3, 1024, 64 * 197})
->Args({2, 1024, 64 * 197})
->Args({1, 1024, 64 * 197})
->Args({2, 256, 64 * 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Reverse the broadcast dimensions to check for consistency in scheduling.
static void setup_vit_base_patch16_224_bcast_inner6(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(2, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {false, false, true});
auto t5 = add(t1, t4);
auto t6 = set(t5);
auto t7 = mul(t6, IrBuilder::create<Double>(0.707106));
auto t8 = erf(t7);
auto t9 = add(IrBuilder::create<Double>(1), t8);
auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
auto t11 = mul(t6, t10);
auto t12 = rand_like(t11);
auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t13 = lt(t12, d66);
auto t14 = castOp(DataType::Float, t13);
auto t15 = mul(t11, t14);
auto b68 = eq(d66, IrBuilder::create<Double>(0));
auto d69 = castOp(DataType::Double, b68);
auto d70 = add(d69, d66);
auto d72 = div(IrBuilder::create<Double>(1), d70);
auto t16 = mul(t15, d72);
auto t17 = set(t16);
auto t18 = castOp(DataType::Half, t17);
auto t19 = castOp(DataType::Half, t3);
fusion->addOutput(t18);
fusion->addOutput(t13);
fusion->addOutput(t6);
fusion->addOutput(t19);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensors - float, halfx2, bool - t6, t0, t18, t13
// outer two dimensions only - float, half - t2, t19
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2 + 1 + 4) +
       input_shape[0] * input_shape[1] * (4 + 2)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6,
setup_vit_base_patch16_224_bcast_inner6,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6)
->Args({64, 197, 3 * 1024})
->Args({64, 197, 2 * 1024})
->Args({64, 197, 1024})
->Args({64, 197, 512})
->Args({3, 1024, 64 * 197})
->Args({2, 1024, 64 * 197})
->Args({1, 1024, 64 * 197})
->Args({2, 256, 64 * 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Bool);
fusion->addInput(t0);
auto t1 = makeContigTensor(3, DataType::Half);
fusion->addInput(t1);
auto t2 = castOp(DataType::Float, t1);
auto t3 = makeContigTensor(3, DataType::Half);
fusion->addInput(t3);
auto t4 = castOp(DataType::Float, t3);
auto d35 = t3->axis(2)->extent();
auto t5 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
fusion->addInput(t5);
auto t6 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
fusion->addInput(t6);
auto t7 = makeContigTensor(1, DataType::Half);
fusion->addInput(t7);
auto t8 = castOp(DataType::Float, t7);
auto t9 = makeContigTensor(1, DataType::Half);
fusion->addInput(t9);
auto t11 = sub(t4, t5);
auto t12 = mul(t11, t6);
auto t13 = broadcast(t8, {true, true, false});
auto t14 = mul(t2, t13);
auto t15 = mul(d35, t14);
auto t16 = sum(t14, {2});
auto t17 = broadcast(t16, {false, false, true});
auto t18 = mul(t14, t12);
auto t19 = sum(t18, {2});
auto t20 = broadcast(t19, {false, false, true});
auto t40 = castOp(DataType::Half, t12);
auto t41 = castOp(DataType::Float, t40);
auto t42 = castOp(DataType::Half, t20);
auto t43 = castOp(DataType::Float, t42);
auto t21 = mul(t42, t43);
auto t38 = castOp(DataType::Half, t15);
auto t39 = castOp(DataType::Float, t38);
auto t44 = castOp(DataType::Half, t17);
auto t45 = castOp(DataType::Float, t44);
auto t22 = sub(t39, t45);
auto t23 = sub(t22, t21);
auto d87 = reciprocal(d35);
auto t24 = mul(d87, t6);
auto t25 = mul(t24, t23);
auto t26 = mul(t2, t41);
auto t27 = sum(t26, {0, 1});
auto t28 = sum(t2, {0, 1});
auto t29 = castOp(DataType::Float, t0);
auto t30 = mul(t25, t29);
auto d33 = IrBuilder::create<Double>();
fusion->addInput(d33);
auto t31 = mul(t30, d33);
auto t32 = sum(t31, {0, 1});
auto t33 = castOp(DataType::Half, t32);
auto t34 = castOp(DataType::Half, t31);
auto t35 = castOp(DataType::Half, t25);
auto t36 = castOp(DataType::Half, t27);
auto t37 = castOp(DataType::Half, t28);
fusion->addOutput(t33);
fusion->addOutput(t34);
fusion->addOutput(t35);
fusion->addOutput(t36);
fusion->addOutput(t37);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
// auto bool_options = at::TensorOptions().dtype(at::kBool).device(at::kCUDA,
// 0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options).to(at::kBool);
auto t1 = at::randn(input_shape, fp16_options);
auto t3 = at::randn(input_shape, fp16_options);
auto t5 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t6 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t7 = at::randn({input_shape[2]}, fp16_options);
auto t9 = at::randn({input_shape[2]}, fp16_options);
std::vector<c10::IValue> aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - bool, halfx4 - t0, t1, t3, t34, t35
// Outer two dimensions - floatx2 - t5, t6
// Inner dimension - halfx5 - t7, t9, t33, t36, t37
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (4 * 2 + 1) + t5.numel() * 4 * 2 + t7.numel() * 5 * 2));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD,
setup_vit_base_patch16_224_LN_BWD,
NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD,
nullptr);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD)
->Args({128, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(4, DataType::Half);
auto t5 = makeContigTensor(4, DataType::Half);
auto t7 = makeContigTensor(4, DataType::Half);
auto t9 = makeContigTensor(4, DataType::Half);
auto t4 = makeConcreteTensor({}, DataType::Half);
fusion->addInput(t2);
fusion->addInput(t5);
fusion->addInput(t7);
fusion->addInput(t9);
fusion->addInput(t4);
auto d86 = IrBuilder::create<Double>(0);
auto t3 = castOp(DataType::Float, t2);
auto t6 = castOp(DataType::Float, t5);
auto t8 = castOp(DataType::Float, t7);
auto t10 = castOp(DataType::Float, t9);
auto t11 = add(t8, t10);
auto t12 = set(t11);
auto t13 = set(t6);
auto t14 = lt(t13, d86);
auto t15 = broadcast(t4, {true, true, true, true});
auto t16 = where(t14, t15, t12);
auto t17 = set(t16);
auto t29 = castOp(DataType::Half, t17);
auto t18 = mul(t17, t3);
auto t19 = permute(t18, {0, 2, 3, 1});
auto t30 = castOp(DataType::Half, t19);
fusion->addOutput(t29);
fusion->addOutput(t30);
}
static void NvFuserScheduler_nhwc_seresnet152d_transpose65(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp16_options);
auto t5 = at::randn(input_shape, fp16_options);
auto t7 = at::randn(input_shape, fp16_options);
auto t9 = at::randn(input_shape, fp16_options);
  // Need a zero-dim tensor; not sure how to create one directly, so just
  // reduce a 1D tensor instead.
auto t4 = at::randn({2}, fp16_options).sum();
std::vector<c10::IValue> aten_inputs({t2, t5, t7, t9, t4});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - halfx6 - t2, t5, t7, t9, t29, t30
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * t2.numel() * 6 * 2);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65,
nhwc_seresnet152d_transpose65,
NvFuserScheduler_nhwc_seresnet152d_transpose65,
nullptr);
// Norm inner dim Half version of vit_base_patch16_224_norm_inner3
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65)
->Args({128, 12, 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,476 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
#define TRANSPOSE_CONFIG {true, false, false, false}
using namespace torch::jit::fuser::cuda;
struct TransposeConfig {
bool input1_transpose_axes = false;
bool input2_transpose_axes = false;
bool intermediate_transpose_axes = false;
bool output_transpose_axes = false;
};
std::vector<at::Tensor> generateInputs(
DataType dtype,
int num_dims,
std::pair<int, int> axes,
int perm_size,
int innerdim_size,
bool input1_transpose_axes,
bool input2_transpose_axes,
bool non_vectorize_offset = false,
int iter_size = 32) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
std::vector<int64_t> transpose_shape(num_dims, iter_size);
transpose_shape[axes.second] = innerdim_size;
transpose_shape[axes.first] = perm_size;
std::vector<int64_t> non_transpose_shape(num_dims, iter_size);
non_transpose_shape[axes.first] = innerdim_size;
non_transpose_shape[axes.second] = perm_size;
// TensorType: Concrete, Contig, Symbolic
// Vectorization | Unroll - Add 1 to sizes
// Shift axis by 1 to disable vectorize loads
if (non_vectorize_offset) {
for (auto idx : c10::irange(transpose_shape.size())) {
transpose_shape[idx] += 1;
}
for (auto idx : c10::irange(non_transpose_shape.size())) {
non_transpose_shape[idx] += 1;
}
}
auto optionalTransposeSize =
[&transpose_shape, &non_transpose_shape](bool transpose_tensor) {
return (transpose_tensor) ? transpose_shape : non_transpose_shape;
};
at::Tensor aten_input1 =
at::randn(optionalTransposeSize(input1_transpose_axes), options);
at::Tensor aten_input2 =
at::randn(optionalTransposeSize(input2_transpose_axes), options);
return {aten_input1, aten_input2};
}
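// Worked example (hypothetical sizes): with num_dims = 3, axes = {0, 2},
// perm_size = P, innerdim_size = I, and the default iter_size = 32, the
// transposed input has shape {P, 32, I} and the non-transposed input has shape
// {I, 32, P}, so the two line up once axes 0 and 2 are swapped.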
//------------------------------------------------------------------------------
static void setupTranspose(
Fusion* fusion,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
FusionGuard fg(fusion);
auto optionalTranspose = [axes](TensorView* tv, bool is_transpose) {
return (is_transpose) ? transpose(tv, axes.first, axes.second) : tv;
};
auto input1 = makeContigTensor(num_dims, dtype);
auto input2 = makeContigTensor(num_dims, dtype);
fusion->addInput(input1);
fusion->addInput(input2);
auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes);
auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes);
auto intermediate = add(ot_input1, ot_input2);
auto ot_intermediate =
optionalTranspose(intermediate, tc.intermediate_transpose_axes);
auto output = relu(ot_intermediate);
auto ot_output = optionalTranspose(output, tc.output_transpose_axes);
fusion->addOutput(ot_output);
}
static void NvFuserScheduler_Transpose(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
std::vector<c10::IValue> fuser_inputs = {at_input1, at_input2};
runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_SQUARE_RUN( \
TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Args({9, 2408}) \
->Args({16, 512}) \
->Args({18, 96}) \
->Args({24, 96}) \
->Args({24, 256}) \
->Args({24, 512}) \
->Args({32, 27}) \
->Args({32, 96}) \
->Args({32, 288}) \
->Args({32, 864}) \
->Args({40, 120}) \
->Args({48, 128}) \
->Args({48, 256}) \
->Args({49, 512}) \
->Args({49, 1024}) \
->Args({49, 2048}) \
->Args({49, 4608}) \
->Args({64, 64}) \
->Args({64, 96}) \
->Args({64, 128}) \
->Args({64, 147}) \
->Args({64, 192}) \
->Args({64, 256}) \
->Args({64, 288}) \
->Args({64, 512}) \
->Args({80, 64}) \
->Args({81, 1728}) \
->Args({83, 1728}) \
->Args({96, 864}) \
->Args({100, 1280}) \
->Args({100, 4032}) \
->Args({120, 40}) \
->Args({128, 128}) \
->Args({128, 512}) \
->Args({128, 1152}) \
->Args({192, 128}) \
->Args({192, 256}) \
->Args({192, 720}) \
->Args({192, 768}) \
->Args({192, 1120}) \
->Args({192, 1728}) \
->Args({196, 256}) \
->Args({196, 512}) \
->Args({196, 1024}) \
->Args({196, 2304}) \
->Args({256, 256}) \
->Args({256, 1024}) \
->Args({256, 2304}) \
->Args({284, 512}) \
->Args({320, 1280}) \
->Args({320, 1728}) \
->Args({324, 2592}) \
->Args({361, 768}) \
->Args({361, 1120}) \
->Args({384, 2}) \
->Args({384, 32}) \
->Args({384, 128}) \
->Args({384, 256}) \
->Args({384, 512}) \
->Args({384, 1280}) \
->Args({384, 2592}) \
->Args({384, 4032}) \
->Args({448, 1280}) \
->Args({480, 16}) \
->Args({480, 256}) \
->Args({512, 2}) \
->Args({512, 16}) \
->Args({512, 128}) \
->Args({512, 256}) \
->Args({512, 1024}) \
->Args({512, 2048}) \
->Args({512, 3072}) \
->Args({512, 4608}) \
->Args({784, 40}) \
->Args({784, 120}) \
->Args({784, 128}) \
->Args({784, 1152}) \
->Args({1001, 2408}) \
->Args({1024, 16}) \
->Args({1024, 256}) \
->Args({1024, 512}) \
->Args({1024, 1024}) \
->Args({1024, 3072}) \
->Args({1369, 192}) \
->Args({1369, 256}) \
->Args({1369, 288}) \
->Args({2048, 512}) \
->Args({2048, 1024}) \
->Args({2250, 27}) \
->Args({3072, 512}) \
->Args({3072, 1024}) \
->Args({3136, 64}) \
->Args({5329, 720}) \
->Args({5625, 64}) \
->Args({12544, 147}) \
->Args({22201, 288}) \
->Unit(benchmark::kMicrosecond)
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Ranges({{2, 256 * 256}, {160, 320}}) \
      ->Unit(benchmark::kMicrosecond)
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
static void Baseline_Transpose(
benchmark::State& benchmark_state,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
auto optionalTransposeAten = [&axes](at::Tensor x, bool is_transpose) {
return (is_transpose) ? at::transpose(x, axes.first, axes.second) : x;
};
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto at_ot_input1 =
optionalTransposeAten(at_input1, tc.input1_transpose_axes);
auto at_ot_input2 =
optionalTransposeAten(at_input2, tc.input2_transpose_axes);
auto at_intermediate = add(at_ot_input1, at_ot_input2);
auto at_ot_intermediate =
optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes);
auto at_output = relu(at_ot_intermediate);
auto at_ot_output =
optionalTransposeAten(at_output, tc.output_transpose_axes);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
  // Sync everything up before we're finished; we don't want to run ahead on
  // the CPU while benchmarking.
cudaDeviceSynchronize();
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(at_input1.numel() * 3 * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
static void Baseline_Transpose_fp32_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Float,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
static void Baseline_Transpose_fp16_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Half,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------

View File

@ -1,228 +0,0 @@
#include <benchmarks/cpp/nvfuser/utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <sstream>
using namespace torch::jit::fuser::cuda;
std::string toString(const ReductionParams& rparams) {
std::stringstream ss;
ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
<< (rparams.persistent_kernel ? "Persistent Kernel // " : "")
<< (rparams.project_persistent_buffers ? "Project Persistent Buffers // "
: "");
if (rparams.schedule_3D) {
ss << "3D Schedule // "
<< "Outer Reduction: "
<< (rparams.cross_block_outer_reduction ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduction ? "cross grid / " : "")
<< (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : "");
if (rparams.batches_per_block_outer_reduction > 1 ||
rparams.persistent_kernel) {
ss << "persistent batch - " << rparams.batches_per_block_outer_reduction
<< " / ";
}
}
ss << " // Iteration Domain: "
<< (rparams.multiple_reds_per_blk ? "multiple reductions per block / "
: "")
<< (rparams.split_grid_dim_iter_dom ? "split grid dimension / " : "")
<< (rparams.vectorize_iter_dom ? "vectorize / " : "")
<< (rparams.unroll_factor_iter_dom > 1 && !rparams.vectorize_iter_dom
? "unroll / "
: "");
if (rparams.unroll_factor_iter_dom > 1 || rparams.vectorize_iter_dom) {
ss << "factor " << rparams.unroll_factor_iter_dom;
}
ss << " // Inner Reduction Domain: "
<< (rparams.cross_block_inner_reduction ? "cross block reduction / " : "")
<< (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "")
<< (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : "");
if (rparams.batches_per_block_inner_reduction > 1 ||
rparams.persistent_kernel) {
ss << "persistent batch - " << rparams.batches_per_block_inner_reduction
<< " / ";
}
ss << (rparams.cross_grid_inner_reduction &&
rparams.split_grid_dim_inner_reduction
? "split grid dimension / "
: "")
<< (rparams.vectorize_inner_reduction ? "vectorize / " : "")
<< (rparams.unroll_factor_inner_reduction > 1 &&
!rparams.vectorize_inner_reduction
? "unroll / "
: "");
if (rparams.unroll_factor_inner_reduction > 1 ||
rparams.vectorize_inner_reduction) {
ss << "factor " << rparams.unroll_factor_inner_reduction;
}
return ss.str();
}
std::string toString(const PointwiseParams& params) {
std::stringstream ss;
if (params.break_point) {
ss << "2D Schedule at " << params.break_point << "/";
if (params.split_block) {
ss << " Split block into y-dim/";
}
if (params.split_grid_y_dim) {
ss << " Split y grid dim/";
}
} else {
ss << "1D"
<< "/";
}
if (params.unroll_factor > 1) {
if (params.vectorize) {
ss << "Vectorize, Factor: " << params.unroll_factor;
} else {
ss << "Unroll, Factor: " << params.unroll_factor;
}
}
return ss.str();
}
std::string toString(const TransposeParams& params) {
std::stringstream ss;
ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
<< ")/";
ss << "Vectorize size: (" << params.vectorize_factor1 << ","
<< params.vectorize_factor2 << ")";
return ss.str();
}
std::string toString(const std::shared_ptr<HeuristicParams>& params) {
auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
if (rparams) {
return toString(*rparams);
}
auto pparams = std::dynamic_pointer_cast<PointwiseParams>(params);
if (pparams) {
return toString(*pparams);
}
auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
if (tparams) {
return toString(*tparams);
}
TORCH_INTERNAL_ASSERT(
false,
"Unknown heuristic parameter type. Did you just added a new heuristic parameter type but forget to update here?");
}
std::string toString(LaunchParams lparams) {
std::stringstream ss;
ss << "/Launch_Parameters["
<< "block(" << lparams.bdimz() << "/" << lparams.bdimy() << "/"
<< lparams.bdimx() << ")/grid(" << lparams.gdimz() << "/"
<< lparams.gdimy() << "/" << lparams.gdimx() << ")/" << lparams.smem()
<< "]";
return ss.str();
}
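// Rough L2 flush (a sketch of the intent, not a guarantee): allocating a float
// tensor the size of the device's L2 and cloning it forces the cache contents
// to be replaced, so each timed iteration starts cold.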
void clearL2Cache() {
torch::NoGradGuard no_grad;
auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize;
auto options =
torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0);
auto l2_elems = l2_cache_size / 4;
torch::Tensor t0 = torch::empty(l2_elems, options);
torch::Tensor t1 = torch::clone(t0);
};
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype) {
return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
}
TensorView* makeContigTensor(size_t ndims, DataType dtype) {
return TensorViewBuilder()
.ndims(ndims)
.dtype(dtype)
.contiguity(std::vector<bool>(ndims, true))
.build();
}
TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
return TensorViewBuilder().shape(shape).dtype(dtype).build();
}
TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype) {
return TensorViewBuilder()
.shape(shape)
.dtype(dtype)
.contiguity(std::vector<bool>(shape.size(), true))
.build();
}
void runBenchmarkIterations(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
std::vector<c10::IValue>& aten_inputs) {
fusion_executor_cache->runFusionWithInputs(aten_inputs);
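  // The first run above compiles and schedules the fusion, so we can now check
  // whether the scheduler split it into more than one kernel.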
bool segmented =
fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() &&
fusion_executor_cache->getMostRecentKernelRuntime()
->fusionSegments()
->groups()
.size() > 1;
if (!segmented) {
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
    // Sync everything up before we're finished; we don't want to run ahead on
    // the CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
} else {
// Segmented
// Sync everything up before we start
{
// Compile/warmup
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
}
C10_CUDA_CHECK(cudaDeviceSynchronize());
CudaKernelTimer timer;
for (auto _ : benchmark_state) {
clearL2Cache();
timer.restart();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
    // Sync everything up before we're finished; we don't want to run ahead on
    // the CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
}
namespace executorCache {
thread_local ExecutorMap executor_map_;
ExecutorMap& getGlobalMap() {
return executor_map_;
}
} // namespace executorCache

View File

@ -1,204 +0,0 @@
#pragma once
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
using namespace torch::jit::fuser::cuda;
// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float);
// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
// but unknown sizes. Taken from test_gpu.cpp
TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float);
// Make a non-contiguous tensor of compile-time known sizes
TensorView* makeConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);
// Make a contiguous tensor of compile-time known sizes
TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);
std::string toString(const ReductionParams& rparams);
std::string toString(const PointwiseParams& params);
std::string toString(const TransposeParams& params);
std::string toString(const std::shared_ptr<HeuristicParams>& params);
std::string toString(LaunchParams lparams);
// Run benchmark iterations with provided inputs. If not segmented, report
// kernel time from the runtime, as well as heuristic parameters. If segmented,
// use timers. Make sure to clear the L2 cache between iterations.
void runBenchmarkIterations(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
std::vector<c10::IValue>& aten_inputs);
void clearL2Cache();
class CudaKernelTimer {
public:
CudaKernelTimer() {
// Setup
C10_CUDA_CHECK(cudaEventCreate(&start_event));
C10_CUDA_CHECK(cudaEventCreate(&finish_event));
C10_CUDA_CHECK(cudaEventRecord(start_event));
}
~CudaKernelTimer() {
C10_CUDA_IGNORE_ERROR(cudaEventDestroy(start_event));
C10_CUDA_IGNORE_ERROR(cudaEventDestroy(finish_event));
}
void restart() {
C10_CUDA_CHECK(cudaEventRecord(start_event));
}
float elapsed() {
// Record
C10_CUDA_CHECK(cudaEventRecord(finish_event));
C10_CUDA_CHECK(cudaEventSynchronize(start_event));
C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
C10_CUDA_CHECK(
cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
return kernel_time_ms_;
}
private:
// Create
float kernel_time_ms_ = 0;
cudaEvent_t start_event = {};
cudaEvent_t finish_event = {};
};
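// Illustrative sketch (hypothetical helper, not part of the original API):
// timing an arbitrary callable per iteration with the event-based timer above,
// mirroring the segmented path in runBenchmarkIterations.
template <typename Fn>
void timeIterationsExample(benchmark::State& benchmark_state, Fn&& fn) {
  CudaKernelTimer timer;
  for (auto _ : benchmark_state) {
    clearL2Cache(); // start each iteration with a cold L2
    timer.restart();
    fn();
    // elapsed() returns milliseconds; google benchmark expects seconds.
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
  }
  C10_CUDA_CHECK(cudaDeviceSynchronize());
}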
namespace executorCache {
using ExecutorPtr = std::unique_ptr<FusionExecutorCache>;
using ExecutorMap = std::unordered_map<std::string, ExecutorPtr>;
ExecutorMap& getGlobalMap();
} // namespace executorCache
//! Utility to manage FusionExecutorCache instances for
//! all defined benchmarks
class BenchmarkGraph : public benchmark::Fixture {
public:
using SetupFusionFunction = std::function<void(Fusion*)>;
using SetupFusionMap = std::unordered_map<std::string, SetupFusionFunction>;
virtual std::string graphName() = 0;
virtual SetupFusionFunction setupFusion() = 0;
FusionExecutorCache* getExecutorCache() {
auto& executor_ = getExecutorCacheMap()[graphName()];
TORCH_INTERNAL_ASSERT(executor_);
return executor_.get();
}
void SetUp(const ::benchmark::State& state) {
auto& executor_ = getExecutorCacheMap()[graphName()];
// Makes sure same graph hasn't been compiled before
if (!executor_) {
auto fusion_ptr = std::make_unique<Fusion>();
      FusionGuard fg(fusion_ptr.get());
setupFusion()(fusion_ptr.get());
getExecutorCacheMap()[graphName()] =
std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
}
}
void TearDown(const ::benchmark::State& state) {}
protected:
static executorCache::ExecutorMap& getExecutorCacheMap() {
return executorCache::getGlobalMap();
}
};
#define NVFUSER_TO_STRING_HELPER(n) std::string(#n)
#define NVFUSER_TO_STRING(n) NVFUSER_TO_STRING_HELPER(n)
//! NVFUSER_BENCHMARK_RUN utility usage:
//! This utility helps create and manage FusionExecutorCaches and tries to use
//! the caching
//! mechanism in NVFuser to avoid re-compilation.
//!
//! There are two macros in this utility: NVFUSER_BENCHMARK_DEFINE, and
//! NVFUSER_BENCHMARK_RUN,
//! and the user needs to supply two functions, SETUP_FUSION and RUN_FUSION, with
//! following signatures:
//!
//! SETUP_FUSION(Fusion* , args...);
//! RUN_FUSION(benchmark::State&, FusionExecutorCache* , args...);
//!
//! where args... are additional arguments, and they need to be the same for
//! SETUP_FUSION and RUN_FUSION.
//!
//! SETUP_FUSION is called once in each benchmark definition to build the
//! fusion IR graph.
//!
//! RUN_FUSION is just like the normal benchmark instance, except that a
//! FusionExecutorCache
//! will be provided for scheduling, running and timing the fusion runs. It is
//! called once in each benchmark instance. For example:
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->RangeMultiplier(2)
//! ->Ranges({{1, 4}})
//! Calls RUN_FUSION 3 times.
//!
//! To register a benchmark, the API is:
//!
//! NVFUSER_BENCHMARK_DEFINE(my_benchmark,SETUP_FUSION,RUN_FUSION,args...);
//!
//! where my_benchmark is any unique name given for this benchmark,
//! SETUP_FUSION and RUN_FUSION are as described above, and
//! args... is the arg list supplied to both SETUP_FUSION and RUN_FUSION
//!
//! each NVFUSER_BENCHMARK_DEFINE registers a benchmark with a single
//! FusionExecutorCache, i.e. a single fusion graph, and multiple benchmark
//! data points can be registered like:
//!
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->Ranges({{1,2}});
//!
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->Ranges({{3,4}});
//!
//! All datapoints will use the same FusionExecutorCache so recompilation is
//! avoided as much as possible.
#define NVFUSER_BENCHMARK_DEFINE( \
BENCHMARK_NAME, SETUP_FUSION, RUN_FUSION, ...) \
class BENCHMARK_NAME##___GRAPH : public BenchmarkGraph { \
public: \
std::string graphName() { \
return NVFUSER_TO_STRING(BENCHMARK_NAME##___GRAPH); \
} \
SetupFusionFunction setupFusion() { \
return [](Fusion* fusion) { SETUP_FUSION(fusion, __VA_ARGS__); }; \
} \
}; \
BENCHMARK_DEFINE_F(BENCHMARK_NAME##___GRAPH, BENCHMARK_NAME) \
(benchmark::State & benchmark_state) { \
RUN_FUSION( \
benchmark_state, \
BENCHMARK_NAME##___GRAPH::getExecutorCache(), \
__VA_ARGS__); \
}
#define NVFUSER_BENCHMARK_RUN(BENCHMARK_NAME) \
BENCHMARK_REGISTER_F(BENCHMARK_NAME##___GRAPH, BENCHMARK_NAME)
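// Illustrative sketch (hypothetical names): a minimal pointwise benchmark
// wired through the two macros above, following the same pattern as the TIMM
// benchmarks earlier in this diff. It is the kind of code that would live in
// one of the benchmark .cpp files rather than in this header. The setup
// function builds the fusion IR once; the run function feeds ATen inputs
// through the shared FusionExecutorCache.
static void setup_example_add(Fusion* fusion, void* null) {
  FusionGuard fg(fusion);
  auto tv0 = makeContigTensor(2, DataType::Float);
  auto tv1 = makeContigTensor(2, DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addOutput(add(tv0, tv1));
}
static void NvFuserScheduler_example_add(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    void* null) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn(
      {benchmark_state.range(0), benchmark_state.range(1)}, options);
  auto t1 = at::randn(
      {benchmark_state.range(0), benchmark_state.range(1)}, options);
  std::vector<c10::IValue> aten_inputs({t0, t1});
  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
}
NVFUSER_BENCHMARK_DEFINE(
    Example_add, setup_example_add, NvFuserScheduler_example_add, nullptr);
NVFUSER_BENCHMARK_RUN(Example_add)
    ->Args({1024, 1024})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();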

File diff suppressed because it is too large

View File

@ -1,676 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_interface_nodes.h>
#include <type.h>
#include <type_promotion.h>
class Val;
/*
 * The operations defined in this header are intended as user-facing functions.
 * Generally, users should not directly instantiate temporary TensorViews; they
 * should instead use the functions below, which will automatically create IR
 * nodes and return a resulting TensorView with correctly tracked shapes.
*/
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Insertion of casting op to dtype, returns new resulting val
TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1);
TORCH_CUDA_CU_API Val* bitCastOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* bitCastOp(DataType dtype, TensorView* v1);
// Perform unary op type and return the output
TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryIsOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryIsOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryOp(
UnaryOpType type,
Val* v1,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* unaryOp(
UnaryOpType type,
TensorView* v1,
const TypePromotionConfig& config);
// Perform binary op type on v1 and v2 and return a type promoted output.
// Mod, CeilDiv, and LT are considered Int only output operations for now.
TORCH_CUDA_CU_API Val* binaryOp(
BinaryOpType type,
Val* v1,
Val* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API Val* binaryOp(
BinaryOpType type,
Val* v1,
Val* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2,
const TypePromotionConfig& config);
// Perform a reduction operation on v1, initial value for reduction is init,
// reduces across axes, and reduction operation defined by BinaryOp.
TORCH_CUDA_CU_API TensorView* reductionOp(
BinaryOpType reduction_op_type,
const std::vector<int>& axes,
Val* init,
TensorView* v1,
bool keep_dim = false,
DataType dtype = DataType::Null);
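// Illustrative sketch (assumes IrBuilder from ir_builder.h is available in the
// including file): a sum over the last axis of a 3D TensorView expressed via
// the generic reductionOp above, with 0.0 as the reduction identity. It is the
// long-hand form of what the benchmarks in this diff write as sum(tv, {2}).
//
//   TensorView* out = reductionOp(
//       BinaryOpType::Add, {2}, IrBuilder::create<Double>(0.0), tv /*3D*/);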
//! Auxiliary struct holding the result of
//! a single Welford op on a TensorView
class TORCH_CUDA_CU_API WelfordResult {
public:
TensorView* avg;
TensorView* var_sum;
TensorView* n;
explicit WelfordResult(
TensorView* in_avg,
TensorView* in_var_sum,
TensorView* in_n);
};
//! Welford operator on specified axes. This is currently the only scan op with
//! multiple outputs that is supported. May consider generalization if more scan
//! ops are added.
TORCH_CUDA_CU_API WelfordResult Welford(
TensorView* tv,
const std::vector<int>& axes,
TensorView* init_avg = nullptr,
TensorView* init_var = nullptr,
    // Initialized to 0 in the function definition; done this way so we don't
    // have to import IrBuilder just for this one interface.
Int* init_N = nullptr);
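// Illustrative sketch (hypothetical helper): per-channel statistics of a
// [N, C] TensorView via the Welford op above.
inline WelfordResult channelStatsExample(TensorView* tv2d /* [N, C] */) {
  // Reduce over axis 0: avg and var_sum are per-channel; var_sum is the sum of
  // squared deviations and must still be divided by n for the variance.
  return Welford(tv2d, {0});
}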
// RNG OPERATIONS
TORCH_CUDA_CU_API TensorView* rand(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API Val* rand_like(Val*);
TORCH_CUDA_CU_API TensorView* rand_like(TensorView*);
TORCH_CUDA_CU_API TensorView* uniform(
const std::vector<Val*>& shape,
Val* low,
Val* high,
DataType dtype);
// TENSOR FACTORIES
TORCH_CUDA_CU_API TensorView* full(
const std::vector<Val*>& shape,
Val* fill_value,
DataType dtype);
TORCH_CUDA_CU_API TensorView* full_like(TensorView* tv, Val* fill_value);
TORCH_CUDA_CU_API Val* full_like(Val* tv, Val* fill_value);
TORCH_CUDA_CU_API TensorView* zeros(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API TensorView* zeros_like(TensorView*);
TORCH_CUDA_CU_API Val* zeros_like(Val*);
TORCH_CUDA_CU_API TensorView* ones(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API TensorView* ones_like(TensorView*);
TORCH_CUDA_CU_API Val* ones_like(Val*);
//! WARNING: giving invalid combinations of the start, end and step
//! arguments can result in undefined behavior. Specifically, the
//! signs of `end - start` and step must be the same.
TORCH_CUDA_CU_API TensorView* arange(Val* end, DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* arange(
Val* start,
Val* end,
DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* arange(
Val* start,
Val* end,
Val* step,
DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* eye(Val* size, DataType dtype);
TORCH_CUDA_CU_API TensorView* eye(Val* rows, Val* cols, DataType dtype);
// UNARY OPERATIONS
// abs
TORCH_CUDA_CU_API Val* abs(Val*);
TORCH_CUDA_CU_API TensorView* abs(TensorView*);
// acos
TORCH_CUDA_CU_API Val* acos(Val*);
TORCH_CUDA_CU_API TensorView* acos(TensorView*);
// asin
TORCH_CUDA_CU_API Val* asin(Val*);
TORCH_CUDA_CU_API TensorView* asin(TensorView*);
// atan
TORCH_CUDA_CU_API Val* atan(Val*);
TORCH_CUDA_CU_API TensorView* atan(TensorView*);
// atanh
TORCH_CUDA_CU_API Val* atanh(Val*);
TORCH_CUDA_CU_API TensorView* atanh(TensorView*);
// ceil
TORCH_CUDA_CU_API Val* ceil(Val*);
TORCH_CUDA_CU_API TensorView* ceil(TensorView*);
// cos
TORCH_CUDA_CU_API Val* cos(Val*);
TORCH_CUDA_CU_API TensorView* cos(TensorView*);
// cosh
TORCH_CUDA_CU_API Val* cosh(Val*);
TORCH_CUDA_CU_API TensorView* cosh(TensorView*);
// exp
TORCH_CUDA_CU_API Val* exp(Val*);
TORCH_CUDA_CU_API TensorView* exp(TensorView*);
// expm1
TORCH_CUDA_CU_API Val* expm1(Val*);
TORCH_CUDA_CU_API TensorView* expm1(TensorView*);
// erf
TORCH_CUDA_CU_API Val* erf(Val*);
TORCH_CUDA_CU_API TensorView* erf(TensorView*);
// erfc
TORCH_CUDA_CU_API Val* erfc(Val*);
TORCH_CUDA_CU_API TensorView* erfc(TensorView*);
// floor
TORCH_CUDA_CU_API Val* floor(Val*);
TORCH_CUDA_CU_API TensorView* floor(TensorView*);
// frac
TORCH_CUDA_CU_API Val* frac(Val*);
TORCH_CUDA_CU_API TensorView* frac(TensorView*);
// silu
TORCH_CUDA_CU_API Val* silu(Val*);
TORCH_CUDA_CU_API TensorView* silu(TensorView*);
// lgamma
TORCH_CUDA_CU_API Val* lgamma(Val*);
TORCH_CUDA_CU_API TensorView* lgamma(TensorView*);
// log
TORCH_CUDA_CU_API Val* log(Val*);
TORCH_CUDA_CU_API TensorView* log(TensorView*);
// log10
TORCH_CUDA_CU_API Val* log10(Val*);
TORCH_CUDA_CU_API TensorView* log10(TensorView*);
// log1p
TORCH_CUDA_CU_API Val* log1p(Val*);
TORCH_CUDA_CU_API TensorView* log1p(TensorView*);
// log2
TORCH_CUDA_CU_API Val* log2(Val*);
TORCH_CUDA_CU_API TensorView* log2(TensorView*);
// neg
TORCH_CUDA_CU_API Val* neg(Val*);
TORCH_CUDA_CU_API TensorView* neg(TensorView*);
// real
TORCH_CUDA_CU_API Val* real(Val*);
TORCH_CUDA_CU_API TensorView* real(TensorView*);
// reciprocal
TORCH_CUDA_CU_API Val* reciprocal(Val*);
TORCH_CUDA_CU_API TensorView* reciprocal(TensorView*);
// relu
TORCH_CUDA_CU_API Val* relu(Val*);
TORCH_CUDA_CU_API TensorView* relu(TensorView*);
// rsqrt
TORCH_CUDA_CU_API Val* rsqrt(Val*);
TORCH_CUDA_CU_API TensorView* rsqrt(TensorView*);
// round
TORCH_CUDA_CU_API Val* round(Val*);
TORCH_CUDA_CU_API TensorView* round(TensorView*);
// set
TORCH_CUDA_CU_API Val* set(Val*);
TORCH_CUDA_CU_API TensorView* set(TensorView*);
// sigmoid
TORCH_CUDA_CU_API Val* sigmoid(Val*);
TORCH_CUDA_CU_API TensorView* sigmoid(TensorView*);
// sin
TORCH_CUDA_CU_API Val* sin(Val*);
TORCH_CUDA_CU_API TensorView* sin(TensorView*);
// sinh
TORCH_CUDA_CU_API Val* sinh(Val*);
TORCH_CUDA_CU_API TensorView* sinh(TensorView*);
// sqrt
TORCH_CUDA_CU_API Val* sqrt(Val*);
TORCH_CUDA_CU_API TensorView* sqrt(TensorView*);
// tan
TORCH_CUDA_CU_API Val* tan(Val*);
TORCH_CUDA_CU_API TensorView* tan(TensorView*);
// tanh
TORCH_CUDA_CU_API Val* tanh(Val*);
TORCH_CUDA_CU_API TensorView* tanh(TensorView*);
// trunc
TORCH_CUDA_CU_API Val* trunc(Val*);
TORCH_CUDA_CU_API TensorView* trunc(TensorView*);
// bitwise_not
TORCH_CUDA_CU_API Val* bitwise_not(Val*);
TORCH_CUDA_CU_API TensorView* bitwise_not(TensorView*);
// imag
TORCH_CUDA_CU_API Val* imag(Val*);
TORCH_CUDA_CU_API TensorView* imag(TensorView*);
// isfinite
TORCH_CUDA_CU_API Val* isfinite(Val*);
TORCH_CUDA_CU_API TensorView* isfinite(TensorView*);
// isinf
TORCH_CUDA_CU_API Val* isinf(Val*);
TORCH_CUDA_CU_API TensorView* isinf(TensorView*);
// isnan
TORCH_CUDA_CU_API Val* isnan(Val*);
TORCH_CUDA_CU_API TensorView* isnan(TensorView*);
// isneginf
TORCH_CUDA_CU_API Val* isneginf(Val*);
TORCH_CUDA_CU_API TensorView* isneginf(TensorView*);
// isposinf
TORCH_CUDA_CU_API Val* isposinf(Val*);
TORCH_CUDA_CU_API TensorView* isposinf(TensorView*);
// isreal
TORCH_CUDA_CU_API Val* isreal(Val*);
TORCH_CUDA_CU_API TensorView* isreal(TensorView*);
// print
TORCH_CUDA_CU_API Val* print(Val*);
TORCH_CUDA_CU_API TensorView* print(TensorView*);
// Broadcasts inp based on a bool vector. The size of the broadcast bool vector
// should be the number of dims desired in the broadcasted tensor. An entry
// should be true if the corresponding output dim is a broadcasted dim, and
// false if it is not. The number of false entries must match the number of
// input dims.
TORCH_CUDA_CU_API TensorView* broadcast(
TensorView* inp,
const std::vector<bool>& is_broadcast_dim);
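// Illustrative sketch (hypothetical helper): broadcasting a 1D bias so it can
// combine with a 3D [N, S, C] activation, as the TIMM benchmarks in this diff
// do with {true, true, false}. One bool per output dim; the single false marks
// the dim that maps to the input's only dim.
inline TensorView* broadcastBiasExample(TensorView* bias1d /* [C] */) {
  return broadcast(bias1d, {true, true, false}); // [1, 1, C]
}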
// Expands input based on provided sizes. expand_sizes should be larger than
// the input's root domain (really rfactor) and will broadcast on inner
// dimensions. expand_sizes should be -1 for any dimension that should remain a
// symbolic size. Dimensions that should remain broadcast after the expand
// should be set to 1. Any dimension being expanded must be marked as a
// broadcast in the input and will be expanded to the provided constant size.
// Any dimension
// that's symbolic in the input but specified as a non -1 value will be set to
// that constant value.
TORCH_CUDA_CU_API TensorView* expand(
TensorView* inp,
const std::vector<Val*>& expanded_sizes);
// Expands input based on other. For dimensions in inp that are broadcast with a
// matching entry in other that's either a broadcast with expanded extent or a
// non broadcasted iter domain, inp will be expanded to other's size.
TORCH_CUDA_CU_API TensorView* expand_as(TensorView* inp, TensorView* other);
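// Illustrative sketch (hypothetical helper): growing a [1, 1, C] broadcast
// tensor to the full shape of a [N, S, C] reference with expand_as above.
// expand with explicit sizes behaves analogously but takes Val* extents,
// with -1 keeping a dimension symbolic, per the comment above.
inline TensorView* expandToReferenceExample(TensorView* bcast, TensorView* ref) {
  return expand_as(bcast, ref);
}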
// BINARY OPERATIONS
// add
TORCH_CUDA_CU_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, TensorView* v2);
// atan2
TORCH_CUDA_CU_API Val* atan2(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, TensorView* v2);
// div
TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
// fmod
TORCH_CUDA_CU_API Val* fmod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* fmod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* fmod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* fmod(TensorView* v1, TensorView* v2);
// mul
TORCH_CUDA_CU_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, TensorView* v2);
// pow
TORCH_CUDA_CU_API Val* pow(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* pow(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* pow(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* pow(TensorView* v1, TensorView* v2);
// remainder
TORCH_CUDA_CU_API Val* remainder(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* remainder(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* remainder(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* remainder(TensorView* v1, TensorView* v2);
// sub
TORCH_CUDA_CU_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, TensorView* v2);
// Integer binary ops
// mod
TORCH_CUDA_CU_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, TensorView* v2);
// ceilDiv
TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
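// For example, ceilDiv(10, 4) evaluates to 3: the quotient is rounded up
// rather than truncated.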
// Bitwise binary ops
// bitwise_and
TORCH_CUDA_CU_API Val* bitwise_and(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, TensorView* v2);
// bitwise_left_shift
TORCH_CUDA_CU_API Val* bitwise_left_shift(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(
TensorView* v1,
TensorView* v2);
// bitwise_right_shift
TORCH_CUDA_CU_API Val* bitwise_right_shift(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(
TensorView* v1,
TensorView* v2);
// bitwise_or
TORCH_CUDA_CU_API Val* bitwise_or(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, TensorView* v2);
// bitwise_xor
TORCH_CUDA_CU_API Val* bitwise_xor(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, TensorView* v2);
// Logical binary ops
// eq
TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, TensorView* v2);
// ge
TORCH_CUDA_CU_API Val* ge(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ge(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ge(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ge(TensorView* v1, TensorView* v2);
// gt
TORCH_CUDA_CU_API Val* gt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* gt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* gt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* gt(TensorView* v1, TensorView* v2);
// le
TORCH_CUDA_CU_API Val* le(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* le(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* le(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* le(TensorView* v1, TensorView* v2);
// lt
TORCH_CUDA_CU_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, TensorView* v2);
// ne
TORCH_CUDA_CU_API Val* ne(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ne(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, TensorView* v2);
// REDUCTION OPERATIONS
TORCH_CUDA_CU_API TensorView* sum(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* max(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* min(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
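// Usage sketch (hedged): tv0 is assumed to be an existing 2-D input in the
// active fusion.
//   // Row-wise sum, keeping the reduced axis as a size-1 broadcast:
//   // [I0, I1] -> [I0, B(1)]
//   TensorView* tv1 = sum(tv0, {1}, /*keep_dim=*/true);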
// COMPOUND OPERATIONS
// add_alpha
TORCH_CUDA_CU_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
// sub_alpha
TORCH_CUDA_CU_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
// lerp
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
Val* end,
TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
Val* start,
TensorView* end,
TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
TensorView* weight);
// addcmul
TORCH_CUDA_CU_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
Val* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
Val* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
Val* v1,
TensorView* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
TensorView* v3,
Val* s);
// TERNARY OPERATIONS
// where
TORCH_CUDA_CU_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(
TensorView* c,
TensorView* v1,
TensorView* v2);
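// Usage sketch (hedged): an elementwise max built from the comparison and
// selection ops declared above; tv0 and tv1 are assumed inputs of equal shape.
//   TensorView* mask = gt(tv0, tv1);
//   TensorView* tv_max = where(mask, tv0, tv1);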
// threshold
TORCH_CUDA_CU_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API TensorView* threshold(
TensorView* in,
Val* thresh,
Val* value);
// clamp
TORCH_CUDA_CU_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);
//! Internal operator for supporting backward graphs
//!
//! example:
//! v1 = T1 [I0(10),I1(20),I2(30),I3(40)]
//! v2 = sum_to(v1,{30,1}) ------> v2 = T2[I2,R3 (keep_dim)]
//!
//! This operator will return v1* directly if the sizes of the v1 root domain
//! are already the same as shape.
//!
//! The name of sum_to is different from NV fuser naming;
//! this is to align with the operator name of at::sum_to.
TORCH_CUDA_CU_API TensorView* sum_to(
TensorView* v1,
const std::vector<Int*>& sum_to_size);
TORCH_CUDA_CU_API TensorView* sum_to(
TensorView* v1,
const std::vector<int64_t>& sum_to_size);
//! Shift a tensor to a direction specified by offsets.
//!
//! Example:
//! t0: 2D tensor of size N by M
//! t1 = shift(t0, {1, -1});
//!
//! then:
//! t1[i, j] = t0[i-1, j+1] for 1 <= i < N and 0 <= j < M-1.
//! t1[i, j] = 0, otherwise
//!
//! The pad option controls how out-of-boundary accesses are
//! handled. It specifies how many zeros are logically padded. If no
//! pad option is given, it automatically pads the input tensor so
//! that the output tensor has the same extent for each axis.
//!
//! When a padding value is smaller than the absolute value of a shift
//! offset, the output axis still has the same extent but its start or
//! stop offset is moved inward to signify those outside of the offset
//! are invalid.
//!
//! It is not allowed to use padding values that are larger than shift
//! offsets, which would mean output extents would be larger than
//! input extents.
TORCH_CUDA_CU_API TensorView* shift(
TensorView* inp,
const std::vector<int>& offsets,
const std::vector<int>& pad_width = {});
TORCH_CUDA_CU_API TensorView* shift(
TensorView* inp,
const std::vector<int>& offsets,
bool pad);
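// Usage sketch (hedged), following the example above: shift down by one row
// and left by one column without padding, so the out-of-bounds region is
// marked invalid (start/stop offsets moved inward) rather than zero-filled.
//   TensorView* t1 = shift(t0, {1, -1}, /*pad=*/false);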
//! Gather a window of nearby elements for each element.
//!
//! Each window of size window_shape is stored as an additional
//! innermost domain, meaning that the number of dimensions of the
//! output tensor doubles. The pad_width parameter specifies the
//! padding width of each side of each axis. The strides parameter
//! specifies striding of the operation. Non-unit striding is
//! implemented with strided split, whose outer output domain becomes
//! the root domain for subsequent consumers. The inner output domain
//! becomes a Stride domain, which is ignored by subsequent consumers.
//! Only valid input ranges are fed into strided splits.
//!
//! When trim_out_of_bounds is true, the values at the first and last
//! ends that are outside of the start and stop offsets are
//! effectively trimmed by partial split by 1.
//!
//! Example 1:
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {1, 3}, {{0, 0}, {1, 1}});
//!
//! then:
//! t1: [N, M, 1, 3]
//! t1[i, j, k, l] = The value at the window position of [k, l]
//! for t0[i, j]
//!
//! Example 2.1 (without trimming):
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}});
//!
//! then:
//! t1: [N (stop offset: 1), M (stop offset: 1), 2, 2]
//!
//! Example 2.2 (with trimming):
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}, true);
//!
//! then:
//! t1: [ceilDiv(N - 1, 1), ceilDiv(M - 1, 1), 2, 2]
//!
//! Example 3:
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {3, 3}, {{0, 0}, {0, 0}}, {3, 3});
//!
//! then:
//! t1: [ceilDiv(N - 2, 3), ceilDiv(M - 2, 3), 3, 3]
//!
TORCH_CUDA_CU_API TensorView* gather(
TensorView* inp,
const std::vector<int>& window_shape,
const std::vector<std::vector<int>>& pad_width,
const std::vector<int>& strides = {},
bool trim_out_of_bounds = false);
// Append a new IterDomain to the end of a TensorView to allow
// iterating on a vector type. The input tensor must have
// vector dtype.
TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp);
//! A fused pointwise multiply and sum
//! operator that instantiates the following
//! fused pattern:
//! c = mul(tv_a, tv_b);
//! return sum(c, axes)
//!
//! \param tv_a first multiply operand
//! \param tv_b second multiply operand
//! \param axes axes to sum over
//! \param init sum initial value
//!
//! Note & TODO:
//! currently this interface only supports lowering to an mma op
//! and only supports fp16 inputs. Converting back to multiply
//! and reduce will be supported in a follow-up.
TORCH_CUDA_CU_API TensorView* fusedMultiplySum(
TensorView* tv_a,
TensorView* tv_b,
const std::vector<int>& axes,
Val* init = nullptr);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,23 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <kernel.h>
#include <string>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace codegen {
//! Generates a CUDA kernel definition for the given kernel
TORCH_CUDA_CU_API std::string generateCudaKernel(
const kir::Kernel* kernel,
const std::string& kernel_name = "CUDAGeneratedKernel");
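// Usage sketch (hedged): kernel is assumed to be a lowered kir::Kernel*
// obtained from the lowering pipeline, which is outside this header.
//   std::string cuda_src =
//       codegen::generateCudaKernel(kernel, "my_generated_kernel");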
} // namespace codegen
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,277 +0,0 @@
#include <compute_at.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <lower_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <c10/util/irange.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Simple selector that only propagates across tensor views in the provided
// unordered_set. Will also propagate to all consumers of those tensors, and the
// siblings of those tensors.
class ComputeAtSelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> selected_;
public:
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return selected_.count(to) > 0;
}
virtual bool allowP2C(TensorView* from, TensorView* to) override {
// If the producer is in the selected set, then the consumer must also be
// replayed to obtain a compatible loop structure so that this producer
// can be consumed in this loop.
return selected_.count(from) > 0 || selected_.count(to) > 0;
}
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
ComputeAtSelector(std::unordered_set<TensorView*> selected)
: selected_(std::move(selected)) {}
const std::unordered_set<TensorView*>& selected() const {
return selected_;
}
};
namespace {
// Wrapper around set_intersection
template <typename T>
std::set<T> set_intersection(const std::set<T>& set1, const std::set<T>& set2) {
std::set<T> intersection;
std::set_intersection(
set1.begin(),
set1.end(),
set2.begin(),
set2.end(),
std::inserter(intersection, intersection.begin()));
return intersection;
}
std::deque<std::deque<TensorView*>> tvChains(
std::deque<std::deque<Val*>> val_chains) {
std::deque<std::deque<TensorView*>> tv_chains(val_chains.size());
for (const auto i : c10::irange(val_chains.size())) {
auto tv_iterable = ir_utils::filterByType<TensorView>(val_chains[i]);
tv_chains[i] =
std::deque<TensorView*>(tv_iterable.begin(), tv_iterable.end());
}
return tv_chains;
}
std::unordered_set<TensorView*> getAllTVsBetween(
TensorView* producer,
TensorView* consumer) {
TORCH_CHECK(
DependencyCheck::isDependencyOf(producer, consumer),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
auto between_vals =
DependencyCheck::getAllValsBetween({producer}, {consumer});
auto between_tvs = ir_utils::filterByType<TensorView>(between_vals);
std::unordered_set<TensorView*> result(
between_tvs.begin(), between_tvs.end());
result.erase(consumer);
return result;
}
TensorView* getCommonConsumer(TensorView* producer, TensorView* consumer) {
FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer");
auto producer_use_chains_ =
tvChains(DependencyCheck::getAllUseChains(producer));
// Convert the first chain to a set.
std::set<TensorView*> common_consumers(
producer_use_chains_.front().begin(), producer_use_chains_.front().end());
// Run through all use chains of producer, and intersect them to find common
// TVs
for (auto tv_chain : producer_use_chains_) {
common_consumers = set_intersection(
common_consumers,
std::set<TensorView*>(tv_chain.begin(), tv_chain.end()));
}
auto all_chains =
tvChains(DependencyCheck::getAllDependencyChains(producer, consumer));
// Right now we only support compute at if at some point in the graph consumer
// is dependent on producer.
TORCH_CHECK(
!all_chains.empty(),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
// Remove all TVs from producer to consumer as common consumer must be at or
// after consumer
for (const auto& tv_chain : all_chains) {
for (auto tv : tv_chain) {
if (tv != consumer)
common_consumers.erase(tv);
}
}
// If there is a common consumer, grab the first one at or after consumer
TensorView* common_consumer = nullptr;
if (!common_consumers.empty()) {
for (auto tv : producer_use_chains_.front()) {
if (common_consumers.find(tv) != common_consumers.end()) {
common_consumer = tv;
break;
}
}
TORCH_INTERNAL_ASSERT(
common_consumer != nullptr,
"Hit a logical inconsistency in the computeAt pass.");
}
return common_consumer;
}
void pullInSiblings(std::unordered_set<TensorView*>& s) {
for (auto tv : s) {
for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
if (sibling_tv == tv) {
continue;
}
s.emplace(sibling_tv);
}
}
}
// I am just trying to get the same set of tensors being transformed matching
// the previous behavior of ComputeAt. The algorithm to compute this set is
// horrible, but I don't care because I will eventually completely remove
// ComputeAt, and this algorithm is not worse than the previous ComputeAt. :)
std::unordered_set<TensorView*> getPropagationSubgraph(
TensorView* producer,
TensorView* consumer) {
TORCH_CHECK(
DependencyCheck::isDependencyOf(producer, consumer),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
TensorView* common_consumer = getCommonConsumer(producer, consumer);
if (common_consumer != nullptr) {
auto result = getAllTVsBetween(producer, common_consumer);
pullInSiblings(result);
return result;
}
auto result_vals = DependencyCheck::getAllDependentVals({producer});
result_vals.emplace(producer);
auto result_tvs = ir_utils::filterByType<TensorView>(result_vals);
std::unordered_set<TensorView*> result;
std::copy_if(
result_tvs.begin(),
result_tvs.end(),
std::inserter(result, result.begin()),
[](TensorView* tv) { return !tv->uses().empty(); });
pullInSiblings(result);
return result;
}
} // namespace
void ComputeAt::runAt(
TensorView* producer,
TensorView* consumer,
int64_t consumer_position,
ComputeAtMode mode) {
FUSER_PERF_SCOPE("ComputeAt::runAt");
// Make sure the correct fusion is set up between this and consumer.
TORCH_CHECK(
producer->fusion() == consumer->fusion(),
producer,
" and ",
consumer,
" are not in the same fusion.");
if (mode == ComputeAtMode::MostInlined) {
consumer_position = -1;
}
FusionGuard fg(producer->fusion());
auto selected = getPropagationSubgraph(producer, consumer);
ComputeAtSelector selector(selected);
MaxRootDomainInfoSpanningTree path(consumer, consumer_position, &selector);
if (mode == ComputeAtMode::MostInlined) {
MostInlinedTransformPropagator propagator;
path.traverse(&propagator);
inlineMost(selected);
} else {
TransformPropagator propagator(consumer, consumer_position);
path.traverse(&propagator);
inlineSelectedAt(
selected,
consumer,
consumer_position,
mode == ComputeAtMode::BestEffort);
}
}
void ComputeAt::runWith(
TensorView* producer,
TensorView* consumer,
int64_t producer_position,
ComputeAtMode mode) {
FUSER_PERF_SCOPE("ComputeAt::runWith");
// Make sure the correct fusion is set up between this and consumer.
TORCH_CHECK(
producer->fusion() == consumer->fusion(),
producer,
" and ",
consumer,
" are not in the same fusion.");
if (mode == ComputeAtMode::MostInlined) {
producer_position = -1;
}
FusionGuard fg(producer->fusion());
auto selected = getPropagationSubgraph(producer, consumer);
ComputeAtSelector selector(selected);
MaxRootDomainInfoSpanningTree path(producer, producer_position, &selector);
if (mode == ComputeAtMode::MostInlined) {
MostInlinedTransformPropagator propagator;
path.traverse(&propagator);
inlineMost(selected);
} else {
TransformPropagator propagator(producer, producer_position);
path.traverse(&propagator);
inlineSelectedAt(
selected,
producer,
producer_position,
mode == ComputeAtMode::BestEffort);
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,45 +0,0 @@
#pragma once
#include <inlining.h>
#include <root_domain_map.h>
#include <transform_replay.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TensorDomain;
class TensorView;
struct ComputeAt {
public:
// Runs the compute at pass making producer look like consumer, computing
// producer relative to consumer
static void runAt(
TensorView* producer,
TensorView* consumer,
int64_t consumer_position,
ComputeAtMode mode = ComputeAtMode::Standard);
// Runs the compute with pass making consumer look like producer, computing
// producer relative to consumer
static void runWith(
TensorView* producer,
TensorView* consumer,
int64_t producer_position,
ComputeAtMode mode = ComputeAtMode::Standard);
};
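// Usage sketch (hedged): tv_producer and tv_consumer are assumed to be
// TensorViews in the same fusion, with the producer feeding the consumer.
//   // Inline the producer into the consumer's loop nest at position 1,
//   // using the default Standard mode.
//   ComputeAt::runAt(tv_producer, tv_consumer, 1);
//   // Or inline as far as possible:
//   ComputeAt::runAt(tv_producer, tv_consumer, -1, ComputeAtMode::MostInlined);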
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,264 +0,0 @@
#pragma once
#include <disjoint_set.h>
#include <ir_all_nodes.h>
#include <kernel_ir.h>
#include <lower_trivial_reductions.h>
#include <deque>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// There are three modes of these iter domain mappings, all uniquely important in
// the lowering process.
//
// For EXACT/PERMISSIVE mode consider:
//
// consumer[i0, b1] = producer[i0]
// consumer->merge(0) (consumer will now be [i0 * b1])
// When producer is replayed as consumer (the direction we use for mapping)
// with BestEffortReplay forward_bcast_mismatch = True the producer to
// consumer map will have both a mapping of consumer(i0) to producer(i0) as
// well as consumer(i0*b1) to producer(i0). This latter mapping is important
// for loop nest mappings as the consumer will generate a loop based on i0*b1
// and the producer may be computeAt inside this loop nest. However, for
// indexing we do not want these two maps as producer may be indexed as i0*i1
// depending on the loop nest structure and how it was built. Therefore we
// really need to carry (at least) two sets of maps around for lowering.
//
// LOOP mode is important if we have something like:
// consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt
// = 1) which can easily happen when using shared memory. We want to make sure
// that the iteration domain used for loop construction (concreteId) has the
// proper parallelization strategy. In parallel mode we do typical iteration
// domain mapping, however we remove from it any iteration domains outside the
// computeAt of producer when mapping. This guarantees we won't map
// IterDomains that could have different parallelization strategies. We also
// propagate the parallel strategy in parallel mode so all mapped IDs that
// must have the same parallel type, do.
//
// IdMappingMode::LOOP
// Only maps leaf axes to left of compute at
// Forward broadcast axes in replay
// IdMappingMode::PERMISSIVE
// Forward broadcast axes in replay
// Map all iteration domains
// Always contain root mappings (otherwise they could have been forwarded in
// broadcast)
// IdMappingMode::EXACT
// Don't map any broadcast axes to non-broadcast axes
// Do not forward through any broadcast IDs
class TORCH_CUDA_CU_API IterDomainGraph {
public:
IterDomainGraph(Fusion* fusion, bool allow_self_mapping = false);
const DisjointSets<IterDomain*>& permissiveNodes() const {
return permissive_nodes_;
}
const DisjointSets<IterDomain*>& exactNodes() const {
return exact_nodes_;
}
const DisjointSets<IterDomain*>& loopNodes() const {
return loop_nodes_;
}
// Consumers and producers are not symmetric like the other sets
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
consumers() const {
return consumers_;
}
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
producers() const {
return producers_;
}
const DisjointSets<IterDomain*>& siblings() const {
return sibling_sets_;
}
const VectorOfUniqueEntries<IterDomain*>& allIds() const {
return all_ids_;
}
const std::unordered_set<IterDomain*>& viewRfactorIds() const {
return view_rfactor_ids_;
}
// Returns if first and second are expressions through which the provided
// id_map have matching inputs (if forward), or outputs (if not forward).
// Returning true means the expressions are "the same", in that they modify
// matching original extents by the same amount.
static bool exprsMap(
Expr* first,
Expr* second,
bool forward,
const DisjointSets<IterDomain*>& id_map);
bool hasSelfMapping() const {
return self_mapping_info_.has_value();
}
private:
void build(Fusion* fusion);
void initializeId(IterDomain* id, bool is_view_rfactor_id, bool is_leaf_id);
// Checks exprsMap; if the expressions match, maps the outputs (if forward) or
// the inputs (if not) in the exact and permissive maps.
void mapThroughExpr(Expr* first, Expr* second, bool forward);
DisjointSets<IterDomain*> permissive_nodes_;
DisjointSets<IterDomain*> exact_nodes_;
DisjointSets<IterDomain*> loop_nodes_;
// Consumers and producers are not symmetric like the other sets
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
consumers_;
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
producers_;
DisjointSets<IterDomain*> sibling_sets_;
VectorOfUniqueEntries<IterDomain*> all_ids_;
std::unordered_set<IterDomain*> view_rfactor_ids_;
c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
self_mapping_info_ = c10::nullopt;
};
class TrivialReductionInfo;
using DoubleBufferIndices = std::unordered_map<DoubleBufferLoopStage, Int*>;
class TORCH_CUDA_CU_API ComputeAtMap {
public:
ComputeAtMap() = delete;
ComputeAtMap(const ComputeAtMap&) = delete;
ComputeAtMap& operator=(const ComputeAtMap&) = delete;
ComputeAtMap(ComputeAtMap&&) = default;
ComputeAtMap& operator=(ComputeAtMap&&) = default;
ComputeAtMap(Fusion* fusion);
//! Run through disjoint sets in the LOOP map, make sure there's only one
//! non-serial parallel type in each disjoint set, set the parallel type of
//! all IterDomains in the disjoint set to that PType.
void validateAndPropagatePType();
//! Run through disjoint sets in the LOOP map and allocate the index
//! variable for the associated for loop that will be generated
//! for each disjoint set in the loop map. This pre-allocation makes
//! 2 key assumptions about the computeAt map that would very likely be
//! long-term invariants:
//! 1. All kir::ForLoop created in the lowering pass should belong
//! to one of the disjoint sets in the loop map.
//! 2. The lowering pass will *never* create a loop nest with 2
//! different nesting levels mapped together, i.e. the case below
//! never occurs:
//! for i in IterDomain1
//! for j in IterDomain2
//! ...
//! With loop_map.areMapped(IterDomain1, IterDomain2) == true.
//! Under this condition, we can pre-allocate all required index
//! variable integers before creating any kir::ForLoop, and this
//! would help optimizing the generated integer math for indexing.
void allocateIndexVariables();
//! Returns if id0 and id1 are mapped to each other with the provided IdMappingMode
bool areMapped(IterDomain* id0, IterDomain* id1, IdMappingMode mode) const;
//! Returns an iter domain that is the maximum expanded size of all iter
//! domains the one provided maps to. Useful for opening loops to the correct
//! iteration size. Not guaranteed to return the same ID every call, but is
//! guaranteed to return iter domains in the same disjoint set.
IterDomain* getConcreteMappedID(IterDomain* id, IdMappingMode mode) const;
// Prints mapping information, forwards to an internal IterDomainGraph
std::string toString() const;
// Returns if the provided ID is a view like rfactor id
bool isViewRfactor(IterDomain* ref_id) const;
// Returns all rfactor domains in rfactor_concrete_count_reset_domains_ that
// are in the disjoint set of the provided IterDomain. This will be every view
// like rfactor ID the provided ID "depends" on in the map.
std::vector<IterDomain*> getViewRfactorDomainsOfIdGroup(
IterDomain* ref_id,
IdMappingMode mode) const;
const IterDomainGraph& idGraph() const {
return id_graph_;
}
//! Get the ID sets for a provided IdMappingMode
const DisjointSets<IterDomain*>& getIdSets(IdMappingMode mode) const;
// Returns if the ID actually has a disjoint set meaning it has been processed
// in the creation of the compute at map.
bool idExistsInMap(IterDomain* id) const;
//! Returns the pre-allocated index variable integer used in
//! the kir::ForLoop corresponding to the given IterDomain.
//! This interface is only valid if the ID has a loop mapping;
//! ca_map will throw an exception if the given IterDomain doesn't
//! have a loop map entry.
Val* getIndexVariable(
IterDomain* id,
DoubleBufferLoopStage double_buffer_loop_stage =
DoubleBufferLoopStage::NotApplicable) const;
private:
// Build id_graph_
void build(Fusion* fusion);
// Build concrete_id_cache_
// Build a single entry in concrete_id_cache_
IterDomain* computeConcreteId(IterDomain* id, IdMappingMode mode);
void buildConcreteIds();
// Produce the disjoint set containing provided id with mapping mode.
const std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>& disjointSetOf(
IterDomain* id,
IdMappingMode mode) const;
// Should be built once and never modified again.
IterDomainGraph id_graph_;
TrivialReductionInfo trivial_reduction_info_;
// Prevent needing to recompute concrete_id's in compute at map.
// VectorOfUniqueEntries is unique across mapping modes, so don't need to use
// mapping mode directly in this cache. const
// VectorOfUniqueEntries<IterDomain*>& is what's returned by
// ComputeAtMap::disjointSetOf which can be used directly.
std::unordered_map<
std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>,
IterDomain*>
concrete_id_cache_;
//! Allocated Loop index variable through the CA map.
//! only valid for disjoint sets on the loop ca map.
std::unordered_map<const VectorOfUniqueEntries<IterDomain*>*, Val*>
loop_index_variable_map_;
//! Allocated loop indices for double buffer loop.
//! only valid for disjoint sets on the loop ca map
//! that have double-buffered IterDomains.
using DoubleBufferIndicesPtr = std::unique_ptr<DoubleBufferIndices>;
std::unordered_map<
const VectorOfUniqueEntries<IterDomain*>*,
DoubleBufferIndicesPtr>
double_buffered_loop_index_variable_map_;
// Shortcut to access the fusion this computeAt map was
// built from.
Fusion* fusion_;
};
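// Usage sketch (hedged): fusion (a Fusion*), id0 and id1 are assumed to
// already exist; typical lowering-side queries against the map look like:
//   ComputeAtMap ca_map(fusion);
//   bool mapped = ca_map.areMapped(id0, id1, IdMappingMode::LOOP);
//   IterDomain* concrete =
//       ca_map.getConcreteMappedID(id0, IdMappingMode::EXACT);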
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,619 +0,0 @@
#include <ir_utils.h>
#include <iter_visitor.h>
#include <lower2device.h>
#include <contiguity.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
OrderedIdInformation::OrderedIdInformation(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info)
: active_ids_(root_domain), concrete_info_(concrete_info) {
if (ids.empty() || root_domain.empty()) {
return;
}
// Grab root ids and initialize them.
for (const auto root_i : c10::irange(root_domain.size())) {
auto root_id = root_domain[root_i]->as<IterDomain>();
// Initialize id_to_root_ids to map roots to themselves
id_to_root_ids_[root_id] = {root_id};
// Initialize roots as being made up of correctly ordered transforms.
consistently_ordered_ids_.emplace(root_id);
exclusively_consumes_roots_.emplace(root_id);
}
// Iterate from the root domain to the provided ids and fill
// consistently_ordered_ids_, id_to_root_ids_, and exclusively_consumes_roots_
// for all the IDs
auto exprs = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain.begin(), root_domain.end()},
{ids.begin(), ids.end()});
for (auto expr : exprs) {
OptInDispatch::handle(expr);
}
}
bool OrderedIdInformation::checkExclusivelyConsumesRoots(IterDomain* id) {
TORCH_INTERNAL_ASSERT(
std::find(active_ids_.begin(), active_ids_.end(), id) !=
active_ids_.end(),
"Error replaying transforms in contiguous ID checker, expected ",
id->toString(),
" to be in the active ID set.");
auto root_id_it = id_to_root_ids_.find(id);
TORCH_INTERNAL_ASSERT(
root_id_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
id->toString());
const auto& root_ids = root_id_it->second;
// Check all the roots of all other ids, to see if any root_ids in id are also
// in them.
for (auto other_active_id : active_ids_) {
if (other_active_id == id || other_active_id == nullptr) {
continue;
}
auto root_id_it = id_to_root_ids_.find(other_active_id);
TORCH_INTERNAL_ASSERT(
root_id_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
other_active_id->toString());
const auto& other_root_ids = root_id_it->second;
for (auto other_root_id : other_root_ids) {
if (root_ids.has(other_root_id)) {
return false;
}
}
}
return true;
}
void OrderedIdInformation::handle(Merge* merge) {
// Find inputs in the active_ids_ vector
const auto inner_it =
std::find(active_ids_.begin(), active_ids_.end(), merge->inner());
const auto outer_it =
std::find(active_ids_.begin(), active_ids_.end(), merge->outer());
// If either isn't in active_ids_ it means the inputs were detected to not be
// ordered correctly before hitting this expression.
if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) {
return;
}
auto inner_pos = std::distance(active_ids_.begin(), inner_it);
auto outer_pos = std::distance(active_ids_.begin(), outer_it);
// Find inputs in the ordered transforms map
const auto inner_ordered_it = consistently_ordered_ids_.find(merge->inner());
const auto outer_ordered_it = consistently_ordered_ids_.find(merge->outer());
bool inner_ordered = inner_ordered_it != consistently_ordered_ids_.end();
bool outer_ordered = outer_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the two inputs
const auto inner_root_ids_it = id_to_root_ids_.find(merge->inner());
const auto outer_root_ids_it = id_to_root_ids_.find(merge->outer());
TORCH_INTERNAL_ASSERT(
inner_root_ids_it != id_to_root_ids_.end() &&
outer_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
const auto& inner_root_ids = inner_root_ids_it->second;
const auto& outer_root_ids = outer_root_ids_it->second;
// TODO: Concretization may prevent contiguous indexing or vectorization.
// It prevents contiguous indexing if the concretization is within the IDs
// that are used for indexing.
// For vectorization it just means we need to make sure the extents of the
// axes to the right of the broadcast root domain in the contiguous merge are
// bigger than the vectorization dimension, and that the tensor buffer
// supports the vector word size (always done).
bool outer_is_concretized_bcast = merge->outer()->isBroadcast() &&
concrete_info_->isConcretized(merge->outer());
bool inner_is_concretized_bcast = merge->inner()->isBroadcast() &&
concrete_info_->isConcretized(merge->inner());
// Update maps
// Find the position inner would have to have to be considered ordered
auto pos_after_outer = outer_pos + 1;
for (; pos_after_outer < int64_t(active_ids_.size()); pos_after_outer++) {
if (active_ids_[pos_after_outer] == nullptr) {
// Can't be considered ordered after a nullptr
break;
}
if (active_ids_[pos_after_outer]->isReduction() ||
((active_ids_[pos_after_outer]->isBroadcast() &&
!concrete_info_->isConcretized(active_ids_[pos_after_outer])))) {
// Skip reduction or broadcast axes that aren't concretized in the fusion
continue;
}
break;
}
// The output is ordered as long as the inputs were ordered and outer position
// is directly left of the inner position.
bool out_ordered = inner_ordered && outer_ordered;
out_ordered = out_ordered &&
// If inner_pos is before outer_pos it's not ordered correctly. If for
// some reason it's the same, that would be an error.
inner_pos > outer_pos &&
// Inner could be a broadcast, so doesn't have to be right on
// pos_after_outer as that ID (if it exists) should not be a broadcast.
// However, merging over a broadcast should be fine.
inner_pos <= pos_after_outer && !inner_is_concretized_bcast &&
!outer_is_concretized_bcast;
if (out_ordered) {
consistently_ordered_ids_.emplace(merge->out());
}
// Don't just remove active_ids_, as if we have something like:
// [i0, i1, i2, i3]
// ->merge(0, 2)
// ->merge(1)
// The latter merge looks like it's ordered correctly, if we update the active
// map as:
// [i0, i1, i2, i3] -> [i0*i2, i1, i3]
// However if we instead mark it as:
// [i0, i1, i2, i3] -> [i0*i2, i1, nullptr, i3]
// Or:
// [i0, i1, i2, i3] -> [nullptr, i1, i0*i2, i3]
// It's clear the second merge is not ordered correctly. Doesn't matter which
// direction we put the iter domain in, prefer putting it in outer as we often
// are looking for inner dimensions that are contiguous. We don't want to
// always do this, as it could make ordered merges look non-ordered.
// For example: [i0, i1, i2, i3]
// ->merge(0)
// ->merge(1)
// ->merge(0)
// If it's updated as:
// [i0, i1, i2, i3]
// -> [i0*i1, nullptr, i2, i3]
// -> [i0*i1, nullptr, i2*i3, nullptr]
// Now the final merge looks non-ordered but it is. So only insert a nullptr
// entry if the out is not ordered.
active_ids_[outer_pos] = merge->out();
if (!out_ordered) {
active_ids_[inner_pos] = nullptr;
} else {
active_ids_.erase(active_ids_.begin() + inner_pos);
for (auto i = outer_pos + 1; i < inner_pos; i++) {
// If there are broadcast axes between outer and inner and the merge was
// contiguous, there may be broadcasts between outer and inner that cannot
// be ordered merged anywhere else so remove them.
active_ids_.erase(active_ids_.begin() + outer_pos + 1);
}
}
// Update the root_id entry for the output.
VectorOfUniqueEntries<IterDomain*> root_ids = inner_root_ids;
root_ids.pushBack(outer_root_ids);
id_to_root_ids_[merge->out()] = root_ids;
// Need to check this after updating active_ids_ and id_to_root_ids_
if (checkExclusivelyConsumesRoots(merge->out())) {
exclusively_consumes_roots_.emplace(merge->out());
}
}
void OrderedIdInformation::handle(Split* split) {
// Find the input in the active_ids_ vector
const auto in_it =
std::find(active_ids_.begin(), active_ids_.end(), split->in());
if (in_it == active_ids_.end()) {
return;
}
auto in_pos = std::distance(active_ids_.begin(), in_it);
// Find the input in the ordered transforms map
const auto in_ordered_it = consistently_ordered_ids_.find(split->in());
bool in_ordered = in_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the input
const auto in_root_ids_it = id_to_root_ids_.find(split->in());
TORCH_INTERNAL_ASSERT(
in_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
VectorOfUniqueEntries<IterDomain*> in_root_ids = in_root_ids_it->second;
// Update map for outputs
// Remove inputs from the active_ids_ and insert the output ID
active_ids_[in_pos] = split->outer();
active_ids_.insert(active_ids_.begin() + in_pos + 1, split->inner());
// The outputs are ordered as long as the input is ordered.
if (in_ordered) {
consistently_ordered_ids_.emplace(split->outer());
consistently_ordered_ids_.emplace(split->inner());
}
// Update the root_id entry for the outputs.
id_to_root_ids_[split->outer()] = in_root_ids;
id_to_root_ids_[split->inner()] = in_root_ids;
}
// Swizzle generally can't be contiguous because of the non-affine nature of it,
// but we can still analyze the operation in the same way as merge/split.
void OrderedIdInformation::handle(Swizzle2D* swizzle) {
// Find inputs in the active_ids_ vector
const auto in_x_it =
std::find(active_ids_.begin(), active_ids_.end(), swizzle->inX());
const auto in_y_it =
std::find(active_ids_.begin(), active_ids_.end(), swizzle->inY());
if (in_x_it == active_ids_.end() || in_y_it == active_ids_.end()) {
return;
}
auto in_x_pos = std::distance(active_ids_.begin(), in_x_it);
auto in_y_pos = std::distance(active_ids_.begin(), in_y_it);
// Find inputs in the ordered transforms map
const auto in_x_ordered_it = consistently_ordered_ids_.find(swizzle->inX());
const auto in_y_ordered_it = consistently_ordered_ids_.find(swizzle->inY());
bool in_x_ordered = in_x_ordered_it != consistently_ordered_ids_.end();
bool in_y_ordered = in_y_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the two inputs
const auto in_x_root_ids_it = id_to_root_ids_.find(swizzle->inX());
const auto in_y_root_ids_it = id_to_root_ids_.find(swizzle->inY());
TORCH_INTERNAL_ASSERT(
in_x_root_ids_it != id_to_root_ids_.end() &&
in_y_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
const auto& in_x_root_ids = in_x_root_ids_it->second;
const auto& in_y_root_ids = in_y_root_ids_it->second;
// Update map for outputs
// Remove inputs from the active_ids_ and insert the output ID
active_ids_[in_x_pos] = swizzle->outX();
active_ids_[in_y_pos] = swizzle->outY();
// In the case of no real swizzle we can forward properties on each domain
// independently.
if (swizzle->swizzleType() == Swizzle2DType::NoSwizzle) {
if (in_x_ordered) {
consistently_ordered_ids_.emplace(swizzle->outX());
}
if (exclusivelyConsumesRoots(swizzle->inX())) {
exclusively_consumes_roots_.emplace(swizzle->outX());
}
if (in_y_ordered) {
consistently_ordered_ids_.emplace(swizzle->outY());
}
if (exclusivelyConsumesRoots(swizzle->inY())) {
exclusively_consumes_roots_.emplace(swizzle->outY());
}
id_to_root_ids_[swizzle->outX()] = in_x_root_ids;
id_to_root_ids_[swizzle->outY()] = in_y_root_ids;
} else {
VectorOfUniqueEntries<IterDomain*> root_ids = in_x_root_ids;
root_ids.pushBack(in_y_root_ids);
id_to_root_ids_[swizzle->outX()] = root_ids;
id_to_root_ids_[swizzle->outY()] = root_ids;
}
}
NonDivisibleSplitDependencies::NonDivisibleSplitDependencies(
// TODO: Revisit reduction rfactor axes and propagation. Should probably use
// ca_map to propogate non divisibility dependencies across exact map. Still
// need to think through divisible split and non divisible dependencies to
// see if there's conflicts where a split might look non divisible but
// actually is divisible and one's overruling the other.
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::unordered_set<Split*>& divisible_splits) {
if (ids.empty() || root_domain.empty()) {
return;
}
auto transforms = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain.begin(), root_domain.end()},
{ids.begin(), ids.end()});
for (auto transform : transforms) {
auto inp_ids = ir_utils::filterByType<IterDomain>(transform->inputs());
for (auto inp_id : inp_ids) {
if (std::find(root_domain.begin(), root_domain.end(), inp_id) !=
root_domain.end()) {
// This generally shouldn't happen as there shouldn't be
// transformations before the root ids, but in case we eventually do
// have cases like that, we should reset the root_ids if they've been
// placed in the non divisible split set.
depends_on_non_divisible_split.erase(inp_id);
}
}
bool inputs_non_divisible =
std::any_of(inp_ids.begin(), inp_ids.end(), [this](IterDomain* inp_id) {
return depends_on_non_divisible_split.find(inp_id) !=
depends_on_non_divisible_split.end();
});
auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
if (inputs_non_divisible) {
// If any inputs are known to be dependent on a non-divisible split
// Mark outputs as dependent on a non_divisible split
depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
continue;
}
if (!transform->isA<Split>()) {
continue;
}
auto split = transform->as<Split>();
// If this transform is a non-divisible split
if (divisible_splits.find(split) == divisible_splits.end()) {
// Mark outputs as dependent on a non_divisible split
auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
}
}
}
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
bool ignore_indexability,
bool ignore_consistent_ordering)
: root_domain_(root_domain),
root_contiguity_(root_contiguity),
final_ids_(final_ids),
index_map_(index_map),
divisible_splits_(divisible_splits),
p2c_id_map_(std::move(p2c_id_map)),
ignore_indexability_(ignore_indexability),
ignore_consistent_ordering_(ignore_consistent_ordering),
non_divisible_id_info_(ids, root_domain_, divisible_splits_) {
if (ids.size() > 0) {
// This constructor doesn't provide the following information so it needs to
// be built.
ca_map_ = std::make_shared<ComputeAtMap>(ids[0]->fusion());
halo_info_ = std::make_shared<HaloInfo>(ids[0]->fusion(), ca_map_);
concrete_info_ =
std::make_shared<ConcretizedBroadcastDomains>(ids[0]->fusion());
consistent_transform_info_ = std::make_unique<const OrderedIdInformation>(
ids, root_domain, concrete_info_);
}
build(ids);
}
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::shared_ptr<const ComputeAtMap> ca_map,
std::shared_ptr<const HaloInfo> halo_info,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
bool ignore_indexability,
bool ignore_consistent_ordering)
: root_domain_(root_domain),
root_contiguity_(root_contiguity),
final_ids_(final_ids),
index_map_(index_map),
divisible_splits_(divisible_splits),
ca_map_(ca_map),
halo_info_(halo_info),
concrete_info_(concrete_info),
p2c_id_map_(std::move(p2c_id_map)),
ignore_indexability_(ignore_indexability),
ignore_consistent_ordering_(ignore_consistent_ordering),
consistent_transform_info_(std::make_unique<const OrderedIdInformation>(
ids,
root_domain,
concrete_info_)),
non_divisible_id_info_(ids, root_domain, divisible_splits_) {
build(ids);
}
ContigIDs ContigIDs::getNonContigIDs() {
return ContigIDs({}, {}, {}, {}, {}, {});
}
void ContigIDs::build(const std::vector<IterDomain*>& ids) {
if (ids.empty() || root_domain_.empty()) {
return;
}
TORCH_INTERNAL_ASSERT(
root_domain_.size() == root_contiguity_.size(),
"Arguments don't match ",
root_domain_.size(),
" != ",
root_contiguity_.size());
for (const auto root_domain_i : c10::irange(root_domain_.size())) {
auto root_domain_id = root_domain_[root_domain_i]->as<IterDomain>();
root_to_indexed_id_[root_domain_id] = root_domain_id;
// Initialize to false
is_contig_root_[root_domain_id] = false;
// If a root domain has halo, can't use merged domain even if
// both inputs are contiguous. HaloInfo is also initialized for
// rfactor root domains, which should just return "zero"
// RootAxisInfo. This should be safe as no rfactor tensor should
// need halo.
if (root_contiguity_[root_domain_i] &&
!halo_info_->getRootAxisInfo(root_domain_id).hasHalo()) {
contig_ids_.emplace(root_domain_id);
is_contig_root_[root_domain_id] = true;
within_contig_ids_[root_domain_id] = std::unordered_set<IterDomain*>();
}
}
if (!contig_ids_.empty()) {
auto exprs = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain_.begin(), root_domain_.end()},
{ids.begin(), ids.end()});
for (auto expr : exprs) {
handle(expr);
}
}
}
void ContigIDs::handle(Merge* merge) {
// If output is not consistently ordered or doesn't solely consume all root
// domains in its dependencies, then it can't be a contiguously indexable
// iterdomain.
if (!(ignore_consistent_ordering_ ||
consistent_transform_info_->isConsistentlyOrdered(merge->out()))) {
return;
}
if (!consistent_transform_info_->exclusivelyConsumesRoots(merge->out())) {
return;
}
// If output is not "directly indexable" then it's definitely not contiguously
// indexable.
if (!ignore_indexability_ && !isIndexable(merge->out())) {
return;
}
// If inputs are marked as final, stop
if (final_ids_.count(merge->inner()) || final_ids_.count(merge->outer())) {
return;
}
// Check root domains for contiguity
auto root_ids_it =
consistent_transform_info_->idToRootIds().find(merge->out());
TORCH_INTERNAL_ASSERT(
root_ids_it != consistent_transform_info_->idToRootIds().end(),
"\nError in contiguous analysis, merge info doesn't exist for:\n",
merge->toString(),
"\nId: ",
merge->out()->toString());
VectorOfUniqueEntries<IterDomain*> root_ids = root_ids_it->second;
bool is_indexing_pass = !ignore_consistent_ordering_;
IterDomain* last_root = nullptr;
for (auto root_id_i : c10::irange(root_domain_.size())) {
auto root_id = root_domain_[root_id_i];
if (root_ids.has(root_id)) {
// ID found, remove it
root_ids.erase(root_id);
// If we're indexing:
// we could still potentially consider this ID linearly indexable, as we
// could multiply the index by the last root's stride.
//
// If we're computing predicates (ignore_consistent_ordering_==true),
// then we don't have this same constraint, we can just ignore
// contiguity of the roots all together.
if (!root_contiguity_[root_id_i] && is_indexing_pass) {
if (!root_ids.empty()) {
return;
}
}
last_root = root_id;
}
}
// If there's a non_divisible split in the history of merge->out then it can't
// be contiguously indexable.
if (non_divisible_id_info_.dependsOnNonDivisibleSplit(merge->out())) {
return;
}
// Now we know merge->out is a contiguously indexable ID
TORCH_INTERNAL_ASSERT(
last_root != nullptr,
"Issue processing root ids for ",
merge->out()->toString());
// Reset root_ids
root_ids = root_ids_it->second;
for (auto root_id : root_ids) {
root_to_indexed_id_[root_id] = merge->out();
}
auto all_within_vals = DependencyCheck::getAllValsBetween(
{root_domain_.begin(), root_domain_.end()}, {merge->out()});
auto all_within_ids = ir_utils::filterByType<IterDomain>(all_within_vals);
std::unordered_set<IterDomain*> within_id_set(
all_within_ids.begin(), all_within_ids.end());
within_id_set.erase(merge->out());
within_contig_ids_[merge->out()] = within_id_set;
for (auto id : all_within_ids) {
contig_ids_.erase(id);
}
contig_ids_.emplace(merge->out());
}
IterDomain* ContigIDs::getMappedId(IterDomain* id) const {
auto it = p2c_id_map_.find(id);
if (it != p2c_id_map_.end()) {
return it->second;
} else {
return id;
}
}
bool ContigIDs::isIndexable(IterDomain* id) const {
// If the ID is mapped to the consumer through the permissive map but not the
// exact map, it will not be mapped through to the exact map via the p2c map.
// Therefore reject it, because it involves broadcast resolution.
if (!ca_map_->idExistsInMap(getMappedId(id))) {
return false;
}
auto c_id =
ca_map_->getConcreteMappedID(getMappedId(id), IdMappingMode::EXACT);
return index_map_.find(c_id) != index_map_.end();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,311 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <compute_at_map.h>
#include <disjoint_set.h>
#include <ir_all_nodes.h>
#include <lower_shift.h>
#include <lower_trivial_broadcast.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Goes through the transformations associated with a series of ids and root
// ids. Checks the ordering of the iteration domains through these operations to
// pick out which operations are consistently ordered. For example:
// [i0, i1, i2]
// ->split(0, 4)->merge(1)->merge(1)->merge(0)
// are consistently ordered from largest to smallest extents, but
// ->split(0, 4)->merge(1)->merge(0, 2)->merge(0) is not consistently ordered
// with the roots.
//
// This property is important to understand the contiguity of dimensions through
// complex transformations.
class OrderedIdInformation : public OptInDispatch {
public:
OrderedIdInformation() = delete;
OrderedIdInformation(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info);
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
idToRootIds() const {
return id_to_root_ids_;
}
bool isConsistentlyOrdered(IterDomain* id) const {
return consistently_ordered_ids_.find(id) !=
consistently_ordered_ids_.end();
}
bool exclusivelyConsumesRoots(IterDomain* id) const {
return exclusively_consumes_roots_.find(id) !=
exclusively_consumes_roots_.end();
}
private:
// Returns if the id in active_ids should be in exclusively_consumes_roots_
bool checkExclusivelyConsumesRoots(IterDomain* id);
void handle(Split*) override;
void handle(Merge* merge) override;
void handle(Swizzle2D* swizzle) override;
// Track which root ids were used to generate each iter domain
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
id_to_root_ids_;
// Track all IterDomains that have correct ordered transforms for contiguity.
// i.e. if we have:
//
// root = [i0, i1, i2]
// i3 = merge(i0, i2)
// would not be consistently ordered transformed
//
// root = [i0, i1, i2]
// i4, i5 = split(merge(merge(i0, i1), i2), 4)
// would be consistently ordered transforms
//
// root = [i0, i1, i2, i3]
// i4 = merge(i1, i2) would also be consistently ordered transformed
std::unordered_set<IterDomain*> consistently_ordered_ids_;
// Active series of IterDomains that are updated while we're processing the
// domain. Helps us identify which ids are consistently_ordered_ids_. Used
// for intermediate storage, not to return.
std::vector<IterDomain*> active_ids_;
// IterDomains in this set exclusively consume all the uses of their roots.
// For example:
// [i0, i1] split(0, f)->merge(1)
// [ceilDiv(i0, f), f*i1]
// neither iter domain exclusively consumes the roots. With another:
// merge(0) -> [ceilDiv(i0, f)*f*i1]
// The resulting iter domain does exclusively consume the roots.
//
// Also:
// [i0, i1, i2, i3] merge(1)->merge(1)
// ->[i0, i1*i2*i3]
// both resulting iter domains do exclusively consume their roots
std::unordered_set<IterDomain*> exclusively_consumes_roots_;
// Broadcast domains that are concretized cannot be considered contiguously
// indexable.
// TODO: This constraint is more conservative than necessary as it's only if
// the domain is concretized within the local indexing, not in the entire
// fusion.
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
};
// Based on provided divisible split set, goes through expressions and marks all
// IterDomains that are dependent on a non-divisible split.
class NonDivisibleSplitDependencies : public OptInDispatch {
public:
NonDivisibleSplitDependencies() = delete;
NonDivisibleSplitDependencies(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::unordered_set<Split*>& divisible_splits);
bool dependsOnNonDivisibleSplit(IterDomain* id) const {
return depends_on_non_divisible_split.find(id) !=
depends_on_non_divisible_split.end();
}
private:
std::unordered_set<IterDomain*> depends_on_non_divisible_split;
};
// A merge is contiguous if:
// Inputs of outer are to the left in the root domain of the inputs of RHS.
// All inputs are contiguous in the root domain:
// - All marked as contiguous
// - Only gaps between inputs are broadcast or reduction dims
// There are no split transformations performed on outer or inner
// All transformations on outer or inner are contiguous merges
// If this criteria holds, then we can index the input root domains of this
// merge with the indexing provided to the output of the merge in the backward
// index pass
class ContigIDs : public OptInDispatch {
public:
//! Check through the history of ids whose inputs map to root_domain with
//! contiguity root_contiguity. Return unordered_set of all merges that are
//! contiguous. Ignore root order is primarily used for predicate generation.
//! In this case we can linearize indexing of any ID that only consists of
//! merge operations.
//!
//! Mapping information from CA Index concrete to reference domains
//! is used to find if merged output domains can be indexed. If there's
//! no mapping to a reference domain, there's no corresponding
//! index, so it isn't marked as a contiguous merge.
//!
//! p2c_id_map can be used when replayed producer domains are
//! analyzed, in which case producer-to-consumer maps should be
//! passed.
//!
//! If ignore_indexability and ignore_halo_constraint are true,
//! ignore the constraint on indexing and halo, respectively. It is
//! the caller that is responsible for its correctness.
//! Not really sure why but clang-tidy only complains about
//! std::unordered_map if passed as a const reference.
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
bool ignore_indexability = false,
bool ignore_consistent_ordering = false);
//! \param ids IterDomains on the leaves of the domain we're looking for
//! contiguous indexing into.
//! \param root_domain the root domain of the domain we're looking for
//! contiguous indexing into.
//! \param root_contiguity the contiguity of the root_domain.
//! \param concrete_to_ref concrete ids of the exact map that the reference
//! index is using for indexing.
//! \param divisible_splits a set of all splits in the fusion that are
//! divisible.
//! \param ca_map compute at map of the fusion.
//! \param halo_info halo information of the fusion.
//! \param concrete_info concretized broadcast information of the fusion.
//! \param p2c_id_map map from producer to consumer ids used for indexing
//! producer tensors.
//! \param ignore_consistent_ordering true for actual indexing into tensors
//! but false for predicate analysis. Ordering of merges doesn't matter for
//! predicate generation as they don't map to a physical address.
//! \param ignore_indexability can only be true if a real concrete_to_ref
//! map is provided, as what it checks is whether the index is actually
//! indexable based on the reference.
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::shared_ptr<const ComputeAtMap> ca_map,
std::shared_ptr<const HaloInfo> halo_info,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
bool ignore_indexability = false,
bool ignore_consistent_ordering = false);
//! Return an empty ContigIDs with no contiguous ID
static ContigIDs getNonContigIDs();
const std::unordered_set<IterDomain*>& contigIDs() const {
return contig_ids_;
}
const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
withinContigIDs() const {
return within_contig_ids_;
}
const std::unordered_map<IterDomain*, IterDomain*>& rootToIndexedID() const {
return root_to_indexed_id_;
}
VectorOfUniqueEntries<IterDomain*> indexedRootIDs(IterDomain* id) const {
auto root_ids_it = consistent_transform_info_->idToRootIds().find(id);
if (root_ids_it == consistent_transform_info_->idToRootIds().end()) {
return {};
}
return root_ids_it->second;
}
private:
using OptInDispatch::handle;
bool inRoot(const std::vector<IterDomain*>& ids) {
return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) {
return is_contig_root_.find(id) != is_contig_root_.end();
});
}
bool isContig(IterDomain* id) {
return contig_ids_.find(id) != contig_ids_.end();
}
// Split outputs are not contiguous, don't need to do anything.
void handle(Split*) override {}
void handle(Merge* merge) override;
// TODO:
// Currently not propagating any contiguity information
// as contiguity is generally not preserved after swizzles.
// But in follow ups we could gradually add back a few special
// cases, depending on specific swizzle type and axes.
void handle(Swizzle2D* swizzle) override {}
IterDomain* getCAIndexConcreteId(IterDomain* id) const;
//! True if an ID is indexable.
//! E.g., a merged domain with broadcast may not be indexable when
//! its corresponding reference tensor has non-broadcast domains.
bool isIndexable(IterDomain* id) const;
//! Return an ID mapped with id_map_ or itself
IterDomain* getMappedId(IterDomain* id) const;
private:
void build(const std::vector<IterDomain*>& ids);
//! Root domains to analyze contiguity
const std::vector<IterDomain*>& root_domain_;
//! Contiguity of root_domain_
const std::vector<bool>& root_contiguity_;
//! Domains where indexing/predicates cannot be done with their
//! consumers' domains
const std::unordered_set<IterDomain*>& final_ids_;
//! Mapping of concrete domains to indices. Just used to check if
//! there's an index for an IterDomain.
const std::unordered_map<IterDomain*, Val*> index_map_;
// Divisible split information as we can still consider iter domains
// contiguous through divisible splits.
const std::unordered_set<Split*>& divisible_splits_;
std::shared_ptr<const ComputeAtMap> ca_map_;
std::shared_ptr<const HaloInfo> halo_info_;
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
//! Producer-to-consumer index map in the case of analyzing replayed
//! producer tensors
const std::unordered_map<IterDomain*, IterDomain*> p2c_id_map_;
const bool ignore_indexability_ = false;
const bool ignore_consistent_ordering_ = false;
//! Mapping of root domain to bool indicating contiguity
std::unordered_map<IterDomain*, bool> is_contig_root_;
// Mark if ids are the result of contiguous merges
std::unordered_set<IterDomain*> contig_ids_;
// Given contiguous domain, return all iter domains within its history.
std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>
within_contig_ids_;
//! Mapping of root domain to the actual indexed domain, which can
//! be itself or a contig merged domain if found.
std::unordered_map<IterDomain*, IterDomain*> root_to_indexed_id_;
std::unique_ptr<const OrderedIdInformation> consistent_transform_info_;
NonDivisibleSplitDependencies non_divisible_id_info_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,330 +0,0 @@
#pragma once
#include <c10/util/Exception.h>
#include <algorithm>
#include <initializer_list>
#include <unordered_map>
#include <unordered_set>
#include <vector>
// For printing of the set when using a Statement as the type for the set
#include <ir_base_nodes.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename T>
std::string abstractToString(T* ptr) {
return ptr->toString();
}
template <typename T>
std::string abstractToString(T ref) {
return ref.toString();
}
} // namespace
// Vector-like class that prevents adding duplicate entries by also
// maintaining a set
template <typename T, typename Hash = std::hash<T>>
class VectorOfUniqueEntries {
public:
VectorOfUniqueEntries() = default;
VectorOfUniqueEntries(const std::initializer_list<T>& x)
: vector_(x), set_(x) {}
// Returns if a node was actually added
bool pushBack(T entry) {
if (set_.emplace(entry).second) {
vector_.push_back(entry);
return true;
}
return false;
}
// Returns if any node was added
bool pushBack(const VectorOfUniqueEntries<T, Hash>& other) {
bool any_added = false;
for (auto entry : other) {
any_added = any_added | pushBack(entry);
}
return any_added;
}
// Returns a const vector useful for iterating on
const std::vector<T>& vector() const {
return vector_;
}
// Returns first element in vector
T front() const {
return vector_.front();
}
// Returns last element in vector
T back() const {
return vector_.back();
}
// Removes and returns the last element in the vector
T popBack() {
T v = vector_.back();
set_.erase(v);
vector_.pop_back();
return v;
}
// Returns if this container is empty
bool empty() const {
return vector_.empty();
}
// Returns the number of elements in this container
size_t size() const {
return vector_.size();
}
// Returns if entry is in this vector
bool has(T entry) const {
return set_.find(entry) != set_.end();
}
// Erase given entry from the containers if
// there is a match.
void erase(T entry) {
vector_.erase(
std::remove_if(
vector_.begin(),
vector_.end(),
[entry](T val) { return val == entry; }),
vector_.end());
set_.erase(entry);
}
// Insert elements at the end of the container.
template <typename InputIt>
void insert(InputIt begin, InputIt end) {
for (auto it = begin; it != end; it++) {
pushBack(*it);
}
}
// Returns iterator pointing to the beginning of vector container
auto begin() const {
return vector().begin();
}
// Returns iterator pointing to the end of vector container
auto end() const {
return vector().end();
}
// Returns iterator pointing to the beginning of vector container
auto begin() {
return vector().begin();
}
// Returns iterator pointing to the end of vector container
auto end() {
return vector().end();
}
std::string toString() {
std::stringstream ss;
ss << "{ ";
for (auto entry : vector()) {
ss << abstractToString(entry);
if (entry != vector().back()) {
ss << "; ";
}
}
ss << " }";
return ss.str();
}
private:
std::vector<T> vector_;
std::unordered_set<T, Hash> set_;
};
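// Usage sketch (illustrative only, not part of the original header); shows
// the deduplicating push/erase behavior with plain ints:
//
//   VectorOfUniqueEntries<int> entries;
//   entries.pushBack(1); // returns true, 1 is added
//   entries.pushBack(2); // returns true, 2 is added
//   entries.pushBack(1); // returns false, duplicate is ignored
//   // Iteration follows insertion order: visits 1, then 2
//   for (auto entry : entries) {
//     process(entry); // `process` is a hypothetical consumer
//   }
//   entries.erase(1); // removed from both the vector and the set
//   TORCH_INTERNAL_ASSERT(!entries.has(1) && entries.size() == 1);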
//! Container class DisjointSets models equivalence relationships
//!
//! Each instance of this class keeps equivalence sets
//! DisjointSets::mapEntries(a,b) makes the full set of a and b equivalent
//! DisjointSets::*AreMapped(a,b) checks if a and b belong to the same disjoint
//! set
template <typename T, typename Hash = std::hash<T>>
class DisjointSets {
public:
DisjointSets() = default;
// Warning: returned values should never be modified. This accessor isn't
// strictly safe as VectorOfUniqueEntries is not returned as a const.
const std::
unordered_map<T, std::shared_ptr<VectorOfUniqueEntries<T, Hash>>, Hash>&
disjointSetMap() const {
return disjoint_set_maps_;
}
// Warning: returned values should never be modified. This accessor isn't
// strictly safe as VectorOfUniqueEntries is not returned as a const.
const std::vector<std::shared_ptr<VectorOfUniqueEntries<T, Hash>>>&
disjointSets() const {
return disjoint_sets_;
}
// Return the entire disjoint set of provided entry
const VectorOfUniqueEntries<T, Hash>& getDisjointSetOf(T entry) const {
auto set_it = disjoint_set_maps_.find(entry);
TORCH_INTERNAL_ASSERT(
set_it != disjoint_set_maps_.end(),
"Could not find entry for ",
entry->toString());
return *(set_it->second);
}
// Initializes a new set for provided entry
//
// TODO: Return iterator
void initializeSet(T entry) {
if (disjoint_set_maps_.find(entry) != disjoint_set_maps_.end()) {
return;
}
disjoint_sets_.push_back(
std::make_shared<VectorOfUniqueEntries<T, Hash>>());
disjoint_sets_.back()->pushBack(entry);
disjoint_set_maps_.emplace(std::make_pair(entry, disjoint_sets_.back()));
}
// Adds all of the disjoint set belonging to entry1 to the disjoint set
// belonging to entry0, maps all entries of disjoint set belonging to entry1
// to entry0, removes original disjoint set belonging to entry1.
void mapEntries(T entry0, T entry1) {
auto set_it_0 = disjoint_set_maps_.find(entry0);
auto set_it_1 = disjoint_set_maps_.find(entry1);
// Track if we need to reset iterators, optimize for case where both entries
// exist
bool invalid_iterators = false;
if (set_it_0 == disjoint_set_maps_.end()) {
initializeSet(entry0);
invalid_iterators = true;
}
if (set_it_1 == disjoint_set_maps_.end()) {
initializeSet(entry1);
invalid_iterators = true;
}
// TODO: We can avoid refinding one iterator if initialize set returns an
// iterator, though if we insert entry1 we'd have to refind entry0 as it
// could invalidate all iterators
if (invalid_iterators) {
set_it_0 = disjoint_set_maps_.find(entry0);
set_it_1 = disjoint_set_maps_.find(entry1);
}
auto set0_shared_ptr = set_it_0->second;
auto set1_shared_ptr = set_it_1->second;
// If the sets are already the same, do nothing
if (set0_shared_ptr == set1_shared_ptr) {
return;
}
// Place everything in set1 into set0 and remap all entries in set1 to set0
for (auto entry : set1_shared_ptr->vector()) {
set0_shared_ptr->pushBack(entry);
disjoint_set_maps_[entry] = set0_shared_ptr;
}
// set1 no longer needed as its entries are copied into set0
disjoint_sets_.erase(std::find(
disjoint_sets_.begin(), disjoint_sets_.end(), set1_shared_ptr));
}
// Will assert if provided entry0 is not in any disjoint set, otherwise
// returns if entry0 and entry1 are in the same disjoint set.
bool strictAreMapped(T entry0, T entry1) const {
auto entry_it = disjointSetMap().find(entry0);
TORCH_INTERNAL_ASSERT(
entry_it != disjointSetMap().end(),
"Strict mapping failed on element: ",
abstractToString(entry0),
" either an error occurred, or non strict mapping should have been used.");
return entry_it->second->has(entry1);
}
// If entry0 doesn't have a disjoint set returns false, otherwise returns if
// entry0 and entry1 are in the same disjoint set.
bool permissiveAreMapped(T entry0, T entry1) const {
auto entry_it = disjointSetMap().find(entry0);
if (entry_it == disjointSetMap().end()) {
return false;
}
return entry_it->second->has(entry1);
}
// Returns if a set exists with provided entry
bool mappingExists(T entry) const {
return disjoint_set_maps_.find(entry) != disjoint_set_maps_.end();
}
// Returns a deterministic list of all entries that have been added to any
// disjoint set.
//
// Warning: constructed on every call, consider caching result.
VectorOfUniqueEntries<T, Hash> getAllElements() const {
VectorOfUniqueEntries<T, Hash> all_elements;
for (auto set : disjoint_sets_) {
for (auto entry : set->vector()) {
all_elements.pushBack(entry);
}
}
return all_elements;
}
// Completely clears all disjoint sets
void clear() {
disjoint_set_maps_.clear();
disjoint_sets_.clear();
}
std::string toString() const {
std::stringstream ss;
ss << "disjoint sets{\n";
const std::string sep(" ");
for (auto s_ptr : disjoint_sets_) {
auto& set = *s_ptr;
ss << sep << "{\n";
for (auto entry : set.vector()) {
ss << sep << sep << abstractToString(entry) << "\n";
}
ss << sep << "}\n";
}
ss << "}";
return ss.str();
}
private:
// Disjoint sets
std::unordered_map<T, std::shared_ptr<VectorOfUniqueEntries<T, Hash>>, Hash>
disjoint_set_maps_;
// Keep a list of disjoint_sets that's deterministic to iterate over
std::vector<std::shared_ptr<VectorOfUniqueEntries<T, Hash>>> disjoint_sets_;
};
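// Usage sketch (illustrative only, not part of the original header); id0..id4
// stand for hypothetical IterDomain pointers, as used elsewhere in the fuser:
//
//   DisjointSets<IterDomain*> sets;
//   sets.mapEntries(id0, id1); // sets: {id0, id1}
//   sets.mapEntries(id2, id3); // sets: {id0, id1} {id2, id3}
//   sets.mapEntries(id1, id2); // sets: {id0, id1, id2, id3}
//   TORCH_INTERNAL_ASSERT(sets.strictAreMapped(id0, id3));
//   TORCH_INTERNAL_ASSERT(!sets.permissiveAreMapped(id0, id4)); // id4 unseen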
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,378 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <utils.h>
#include <unordered_map>
// dispatch.h prevents the need from adding manual dispatch in every class that
// wants to define how to process a series of nodes. dispatch.h provides 4
// classes that can be inherited providing a means to override functions on a
// per-node basis. There are currently 4 provided dispatch mechanisms:
//
// OptOutDispatch:
//
// provides the functions:
// virtual void handle(ValType* irnode){}
//
// This provides a mechanism to override this handle for particular node
// types. For example if we only wanted to actually run a function on
// BinaryOps, we could inherit OptOutDispatch and simply override: void
// handle(BinaryOp*) { doSomething; } Then we could run through all our
// Statement* and call OptOutDispatch::handle(statement). When a BinaryOp is
// encountered our override function will be called. For every other node,
// nothing will be done.
//
// OptInDispatch:
//
// This class is similar to OptOutDispatch, however if we encounter a node
// that we haven't specified an override for in the derived class, an error
// will be thrown. This is useful if we create a class that is expected to
// handle any type of node it encounters.
//
// OptOutMutator:
//
// This class is similar to OptOutDispatch except the functions provided are of
// type: virtual Statement* mutate(Statement*) this is useful for when we want
// to have an IR node result from our overloaded functions.
//
// OptInMutator:
//
// This class is similar to OptInDispatch except the functions provided are of
// type: virtual Statement* mutate(Statement*) this is useful for when we want
// to have an IR node result from our overloaded functions.
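//
// Usage sketch (illustrative only, not part of the original header); a
// hypothetical pass that counts BinaryOps in a list of exprs:
//
//   class BinaryOpCounter : public OptOutDispatch {
//    public:
//     using OptOutDispatch::handle;
//     void handle(BinaryOp*) override {
//       count++;
//     }
//     int count = 0;
//   };
//
//   BinaryOpCounter counter;
//   for (auto expr : exprs) {
//     counter.handle(expr);
//   }
//   // counter.count now holds the number of BinaryOps encountered.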
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrContainer;
class Fusion;
// Hierarchical dispatch functions for handle
class Statement;
class Expr;
class Val;
// Vals
class IterDomain;
class TensorDomain;
class TensorView;
class Bool;
class Double;
class Int;
class ComplexDouble;
class NamedScalar;
// Exprs
class FullOp;
class ARangeOp;
class EyeOp;
class UnaryOp;
class BinaryOp;
class TernaryOp;
class RNGOp;
class ReductionOp;
class GroupedReductionOp;
class WelfordOp;
class GroupedWelfordOp;
class LoadStoreOp;
class MmaOp;
class BroadcastOp;
class TransposeOp;
class ExpandOp;
class ShiftOp;
class GatherOp;
class ViewAsScalar;
class ViewOp;
// Exprs
class Split;
class Merge;
class Swizzle2D;
namespace kir {
class Predicate;
class TensorIndex;
class IntPair;
class Allocate;
class BlockSync;
class GridSync;
class CpAsyncWait;
class CpAsyncCommit;
class ForLoop;
class IfThenElse;
class GridReduction;
class GroupedGridReduction;
class GridBroadcast;
class GridWelford;
class GroupedGridWelford;
class AllocateFusedReduction;
class InitMagicZero;
class UpdateMagicZero;
class Swizzle2DInt;
class PairSelect;
} // namespace kir
// By default, all IR nodes are handled in this dispatch, and will call an empty
// function on all nodes.
class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
protected:
virtual void unhandled(const Statement*) {}
public:
// Hierarchical dispatch functions for handle
virtual void handle(const Statement*);
virtual void handle(const Expr*);
virtual void handle(const Val*);
// Vals
virtual void handle(const IterDomain* stmt);
virtual void handle(const TensorDomain* stmt);
virtual void handle(const TensorView* stmt);
virtual void handle(const Bool* stmt);
virtual void handle(const Double* stmt);
virtual void handle(const Int* stmt);
virtual void handle(const ComplexDouble* stmt);
virtual void handle(const NamedScalar* stmt);
virtual void handle(const kir::Predicate*);
virtual void handle(const kir::TensorIndex*);
virtual void handle(const kir::IntPair*);
// Exprs
virtual void handle(const FullOp* stmt);
virtual void handle(const ARangeOp* stmt);
virtual void handle(const EyeOp* stmt);
virtual void handle(const UnaryOp* stmt);
virtual void handle(const BinaryOp* stmt);
virtual void handle(const TernaryOp* stmt);
virtual void handle(const RNGOp* stmt);
virtual void handle(const ReductionOp* stmt);
virtual void handle(const GroupedReductionOp* stmt);
virtual void handle(const WelfordOp* stmt);
virtual void handle(const GroupedWelfordOp* stmt);
virtual void handle(const LoadStoreOp* stmt);
virtual void handle(const MmaOp* stmt);
virtual void handle(const BroadcastOp* stmt);
virtual void handle(const Split* stmt);
virtual void handle(const Merge* stmt);
virtual void handle(const Swizzle2D* stmt);
virtual void handle(const TransposeOp* stmt);
virtual void handle(const ExpandOp* stmt);
virtual void handle(const ShiftOp* stmt);
virtual void handle(const GatherOp* stmt);
virtual void handle(const ViewAsScalar* stmt);
virtual void handle(const ViewOp* stmt);
virtual void handle(const kir::Allocate*);
virtual void handle(const kir::BlockSync*);
virtual void handle(const kir::GridSync*);
virtual void handle(const kir::CpAsyncWait*);
virtual void handle(const kir::CpAsyncCommit*);
virtual void handle(const kir::InitMagicZero*);
virtual void handle(const kir::UpdateMagicZero*);
virtual void handle(const kir::ForLoop*);
virtual void handle(const kir::IfThenElse*);
virtual void handle(const kir::GridReduction*);
virtual void handle(const kir::GroupedGridReduction*);
virtual void handle(const kir::GridBroadcast*);
virtual void handle(const kir::GridWelford*);
virtual void handle(const kir::GroupedGridWelford*);
virtual void handle(const kir::AllocateFusedReduction*);
virtual void handle(const kir::Swizzle2DInt*);
virtual void handle(const kir::PairSelect*);
};
class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
protected:
virtual void unhandled(Statement*);
public:
// Hierarchical dispatch functions for handle
virtual void handle(Statement*);
virtual void handle(Expr*);
virtual void handle(Val*);
// Vals
virtual void handle(Bool* stmt);
virtual void handle(Double* stmt);
virtual void handle(Int* stmt);
virtual void handle(ComplexDouble* stmt);
virtual void handle(NamedScalar* stmt);
virtual void handle(IterDomain* stmt);
virtual void handle(TensorDomain* stmt);
virtual void handle(TensorView* stmt);
virtual void handle(kir::Predicate*);
virtual void handle(kir::TensorIndex*);
virtual void handle(kir::IntPair*);
// Exprs
virtual void handle(FullOp* stmt);
virtual void handle(ARangeOp* stmt);
virtual void handle(EyeOp* stmt);
virtual void handle(UnaryOp* stmt);
virtual void handle(BinaryOp* stmt);
virtual void handle(TernaryOp* stmt);
virtual void handle(RNGOp* stmt);
virtual void handle(ReductionOp* stmt);
virtual void handle(GroupedReductionOp* stmt);
virtual void handle(WelfordOp* stmt);
virtual void handle(GroupedWelfordOp* stmt);
virtual void handle(LoadStoreOp* stmt);
virtual void handle(MmaOp* stmt);
virtual void handle(BroadcastOp* stmt);
virtual void handle(Split* stmt);
virtual void handle(Merge* stmt);
virtual void handle(Swizzle2D* stmt);
virtual void handle(TransposeOp* stmt);
virtual void handle(ExpandOp* stmt);
virtual void handle(ShiftOp* stmt);
virtual void handle(GatherOp* stmt);
virtual void handle(ViewAsScalar* stmt);
virtual void handle(ViewOp* stmt);
virtual void handle(kir::Allocate* stmt);
virtual void handle(kir::BlockSync* stmt);
virtual void handle(kir::GridSync* stmt);
virtual void handle(kir::CpAsyncWait* stmt);
virtual void handle(kir::CpAsyncCommit* stmt);
virtual void handle(kir::InitMagicZero* stmt);
virtual void handle(kir::UpdateMagicZero* stmt);
virtual void handle(kir::ForLoop* stmt);
virtual void handle(kir::IfThenElse* stmt);
virtual void handle(kir::GridReduction* stmt);
virtual void handle(kir::GroupedGridReduction* stmt);
virtual void handle(kir::GridBroadcast* stmt);
virtual void handle(kir::GridWelford* stmt);
virtual void handle(kir::GroupedGridWelford* stmt);
virtual void handle(kir::AllocateFusedReduction* stmt);
virtual void handle(kir::Swizzle2DInt* stmt);
virtual void handle(kir::PairSelect* stmt);
};
class TORCH_CUDA_CU_API OptInConstDispatch : public OptOutConstDispatch {
public:
using OptOutConstDispatch::handle;
protected:
virtual void unhandled(const Statement* stmt) final;
};
class TORCH_CUDA_CU_API OptInDispatch : public OptOutDispatch {
public:
using OptOutDispatch::handle;
protected:
virtual void unhandled(Statement* stmt) final;
};
// Class to perform mutations on Fusion IR. Exprs can simply be redefined, but
// when mutating values they have to be registered through registerMutation so
// that exprs can detect there's been a mutation and know to modify all
// instances of that Val. This means each Val should be mutated "consistently".
// Otherwise behavior may be difficult to understand as it depends on the order
// in which mutate is called. This class expects the user to call the
// statements of interest topologically, so inputs are visited and mutated
// before the exprs that depend on them.
//
// Warning: TensorViews need to be treated carefully, as we don't generally
// register their mutation when only their tensor domains change. If a TV needs
// to be swapped out, it needs to be registered as a "proper" mutation like
// other vals, on top of the TensorDomain being updated in the mutated TensorView.
//
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
public:
// Hierarchical dispatch functions for handle
virtual void mutate(Statement* s);
virtual void mutate(Expr* e);
virtual void mutate(Val* v);
void registerMutation(Val* val, Val* mutation);
Val* maybeMutated(Val* val) {
if (mutations.find(val) == mutations.end()) {
return val;
}
return mutations.at(val);
}
std::unordered_map<Val*, Val*> mutations;
//****Functions below defined in mutator.cpp*****
// Vals
virtual void mutate(Bool*);
virtual void mutate(Double*);
virtual void mutate(Int*);
virtual void mutate(ComplexDouble*);
virtual void mutate(NamedScalar*);
virtual void mutate(IterDomain*);
virtual void mutate(TensorDomain*);
virtual void mutate(TensorView*);
virtual void mutate(kir::Predicate*);
virtual void mutate(kir::TensorIndex*);
virtual void mutate(kir::IntPair*);
// Exprs
virtual void mutate(FullOp*);
virtual void mutate(ARangeOp*);
virtual void mutate(EyeOp*);
virtual void mutate(UnaryOp*);
virtual void mutate(BinaryOp*);
virtual void mutate(TernaryOp*);
virtual void mutate(RNGOp*);
virtual void mutate(ReductionOp*);
virtual void mutate(GroupedReductionOp*);
virtual void mutate(WelfordOp*);
virtual void mutate(GroupedWelfordOp*);
virtual void mutate(LoadStoreOp*);
virtual void mutate(MmaOp*);
virtual void mutate(BroadcastOp*);
virtual void mutate(Split*);
virtual void mutate(Merge*);
virtual void mutate(Swizzle2D*);
virtual void mutate(TransposeOp*);
virtual void mutate(ExpandOp*);
virtual void mutate(ShiftOp*);
virtual void mutate(GatherOp*);
virtual void mutate(ViewAsScalar*);
virtual void mutate(ViewOp*);
virtual void mutate(kir::Allocate*);
virtual void mutate(kir::BlockSync*);
virtual void mutate(kir::GridSync*);
virtual void mutate(kir::CpAsyncWait*);
virtual void mutate(kir::CpAsyncCommit*);
virtual void mutate(kir::InitMagicZero*);
virtual void mutate(kir::UpdateMagicZero*);
virtual void mutate(kir::ForLoop*);
virtual void mutate(kir::IfThenElse*);
virtual void mutate(kir::GridReduction*);
virtual void mutate(kir::GroupedGridReduction*);
virtual void mutate(kir::GridBroadcast*);
virtual void mutate(kir::GridWelford*);
virtual void mutate(kir::GroupedGridWelford*);
virtual void mutate(kir::AllocateFusedReduction*);
virtual void mutate(kir::Swizzle2DInt*);
virtual void mutate(kir::PairSelect*);
protected:
void removeExpr(IrContainer*, Expr*);
};
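// Usage sketch (illustrative only, not part of the original header); a
// minimal pass that swaps a known scalar `old_extent` for `new_extent` across
// topologically ordered statements `stmts`, assuming the default mutate
// overloads rebuild exprs with registered replacements:
//
//   OptOutMutator mutator;
//   mutator.registerMutation(old_extent, new_extent);
//   for (auto stmt : stmts) {
//     mutator.mutate(stmt); // inputs must be mutated before dependent exprs
//   }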
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1 +0,0 @@
html

View File

@ -1,23 +0,0 @@
#error This is used exclusively for generating the documentation (not a real header)
//! \namespace torch::jit::fuser
//! \brief Main PyTorch JIT Fuser namespace
//! \namespace torch::jit::fuser::cuda
//! \brief CUDA specific components
//! \namespace torch::jit::fuser::cuda::executor_utils
//! \brief Fuser executor related utilities
//! \namespace torch::jit::fuser::kir
//! \brief Kernel IR
//! \namespace torch::jit::fuser::ir_utils
//! \brief IR manipulation utilities
//! \namespace torch::jit::fuser::loop_utils
//! \brief Loop utilities
//! \namespace torch::jit::fuser::scope_utils
//! \brief Scope utilities

File diff suppressed because it is too large

Binary file not shown.

View File

@ -1,8 +0,0 @@
This is the implementation reference for the CUDA PyTorch JIT Fuser
- [PyTorch GitHub Page](https://github.com/pytorch/pytorch)
- [Fuser Source Tree](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/cuda)
- Main documentation indexes: [Namespaces](namespaces.html) and [Classes](annotated.html)
![Fuser Architecture Overview](images/ir_architecture.png)

View File

@ -1,304 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <cmath>
#include <iostream>
#include <variant>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_CU_API IntOrDouble {
std::variant<double, int64_t> value_;
public:
IntOrDouble(int64_t i) : value_(i) {}
IntOrDouble(double d) : value_(d) {}
IntOrDouble(int i) : value_((int64_t)i) {}
IntOrDouble(size_t i) : value_((int64_t)i) {}
IntOrDouble() : IntOrDouble(0) {}
IntOrDouble(const IntOrDouble& other) = default;
IntOrDouble& operator=(const IntOrDouble& other) = default;
IntOrDouble(IntOrDouble&& other) noexcept = default;
IntOrDouble& operator=(IntOrDouble&& other) noexcept = default;
bool is_int() const {
return std::holds_alternative<int64_t>(value_);
}
template <typename T>
T as() const {
TORCH_CHECK(
std::holds_alternative<T>(value_),
"The expected dtype and the actual dtype does not match in IntOrDouble");
return std::get<T>(value_);
}
template <typename T>
T cast() const;
#define DEFINE_ARITHMETIC_OP(op) \
IntOrDouble operator op(const IntOrDouble& other) const { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return IntOrDouble(as<double>() op other.as<double>()); \
case 0b01: \
return IntOrDouble(as<double>() op other.as<int64_t>()); \
case 0b10: \
return IntOrDouble(as<int64_t>() op other.as<double>()); \
case 0b11: \
return IntOrDouble(as<int64_t>() op other.as<int64_t>()); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
template <typename T> \
IntOrDouble operator op(T other) const { \
if (is_int()) { \
return IntOrDouble(as<int64_t>() op other); \
} \
return IntOrDouble(as<double>() op other); \
}
DEFINE_ARITHMETIC_OP(+)
DEFINE_ARITHMETIC_OP(-)
DEFINE_ARITHMETIC_OP(*)
DEFINE_ARITHMETIC_OP(/)
DEFINE_ARITHMETIC_OP(&&)
#undef DEFINE_ARITHMETIC_OP
#define DEFINE_ASSIGN_OP(assign, op) \
IntOrDouble& operator assign(const IntOrDouble& other) { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return *this = IntOrDouble(as<double>() op other.as<double>()); \
case 0b01: \
return *this = IntOrDouble(as<double>() op other.as<int64_t>()); \
case 0b10: \
return *this = IntOrDouble(as<int64_t>() op other.as<double>()); \
case 0b11: \
return *this = IntOrDouble(as<int64_t>() op other.as<int64_t>()); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
template <typename T> \
IntOrDouble& operator assign(T other) { \
if (is_int()) { \
return *this = IntOrDouble(as<int64_t>() op other); \
} \
return *this = IntOrDouble(as<double>() op other); \
}
DEFINE_ASSIGN_OP(+=, +)
DEFINE_ASSIGN_OP(-=, -)
DEFINE_ASSIGN_OP(*=, *)
DEFINE_ASSIGN_OP(/=, /)
#undef DEFINE_ASSIGN_OP
IntOrDouble operator%(const IntOrDouble& other) const {
if (is_int() && other.is_int()) {
return IntOrDouble(as<int64_t>() % other.as<int64_t>());
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble operator%(int64_t other) const {
if (is_int()) {
return IntOrDouble(as<int64_t>() % other);
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble& operator%=(const IntOrDouble& other) {
if (is_int() && other.is_int()) {
return *this = IntOrDouble(as<int64_t>() % other.as<int64_t>());
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble& operator%=(int64_t other) {
if (is_int()) {
return *this = IntOrDouble(as<int64_t>() % other);
}
TORCH_INTERNAL_ASSERT(false);
}
#define DEFINE_COMPARE_OP(op) \
bool operator op(const IntOrDouble& other) const { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return as<double>() op other.as<double>(); \
case 0b01: \
return as<double>() op other.as<int64_t>(); \
case 0b10: \
return as<int64_t>() op other.as<double>(); \
case 0b11: \
return as<int64_t>() op other.as<int64_t>(); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
bool operator op(double other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
} \
bool operator op(int64_t other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
} \
bool operator op(int other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
}
DEFINE_COMPARE_OP(>)
DEFINE_COMPARE_OP(>=)
DEFINE_COMPARE_OP(<)
DEFINE_COMPARE_OP(<=)
DEFINE_COMPARE_OP(==)
DEFINE_COMPARE_OP(!=)
#undef DEFINE_COMPARE_OP
IntOrDouble operator-() const {
if (is_int()) {
return IntOrDouble(-as<int64_t>());
}
return IntOrDouble(-as<double>());
}
explicit operator double() const;
explicit operator int64_t() const;
explicit operator size_t() const;
explicit operator int() const;
};
#define DEFINE_ARITHMETIC_OP(op) \
template <typename T> \
inline IntOrDouble operator op(T lhs, IntOrDouble rhs) { \
if (rhs.is_int()) { \
return IntOrDouble(lhs op rhs.as<int64_t>()); \
} \
return IntOrDouble(lhs op rhs.as<double>()); \
}
DEFINE_ARITHMETIC_OP(+)
DEFINE_ARITHMETIC_OP(-)
DEFINE_ARITHMETIC_OP(*)
DEFINE_ARITHMETIC_OP(/)
#undef DEFINE_ARITHMETIC_OP
template <>
inline double IntOrDouble::cast<double>() const {
if (is_int()) {
return (double)as<int64_t>();
}
return as<double>();
}
template <>
inline int64_t IntOrDouble::cast<int64_t>() const {
if (!is_int()) {
return (int64_t)as<double>();
}
return as<int64_t>();
}
inline IntOrDouble::operator double() const {
return as<double>();
}
inline IntOrDouble::operator int64_t() const {
return as<int64_t>();
}
inline IntOrDouble::operator size_t() const {
return as<int64_t>();
}
inline IntOrDouble::operator int() const {
return as<int64_t>();
}
#define DEFINE_EQ_OP(op) \
inline bool operator op(double lhs, const IntOrDouble& rhs) { \
if (rhs.is_int()) { \
return false; \
} \
return lhs op rhs.as<double>(); \
} \
\
inline bool operator op(int64_t lhs, const IntOrDouble& rhs) { \
if (rhs.is_int()) { \
return lhs op rhs.as<int64_t>(); \
} \
return false; \
} \
\
inline bool operator op(int lhs, const IntOrDouble& rhs) { \
return operator op((int64_t)lhs, rhs); \
}
DEFINE_EQ_OP(==)
DEFINE_EQ_OP(!=)
#undef DEFINE_EQ_OP
inline std::ostream& operator<<(std::ostream& os, const IntOrDouble& val) {
if (val.is_int()) {
return os << val.as<int64_t>();
}
return os << val.as<double>();
}
namespace IntOrDouble_functions {
inline IntOrDouble ceildiv(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
auto aa = a.as<int64_t>();
auto bb = b.as<int64_t>();
if (bb > 0) {
return (aa + bb - 1) / bb;
} else {
return (aa + bb + 1) / bb;
}
}
return std::ceil((a / b).as<double>());
}
inline IntOrDouble max(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
return std::max(a.as<int64_t>(), b.as<int64_t>());
}
return (a > b ? a : b).cast<double>();
}
inline IntOrDouble min(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
return std::min(a.as<int64_t>(), b.as<int64_t>());
}
return (a < b ? a : b).cast<double>();
}
inline IntOrDouble abs(const IntOrDouble& a) {
if (a.is_int()) {
return IntOrDouble(std::abs(a.as<int64_t>()));
} else {
return IntOrDouble(std::abs(a.as<double>()));
}
}
} // namespace IntOrDouble_functions
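// Usage sketch (illustrative only, not part of the original header):
//
//   IntOrDouble a(7);   // holds int64_t
//   IntOrDouble b(2.0); // holds double
//   auto c = a + b;     // mixed arithmetic promotes to double: 9.0
//   TORCH_INTERNAL_ASSERT(!c.is_int() && c.as<double>() == 9.0);
//   auto d = IntOrDouble_functions::ceildiv(IntOrDouble(7), IntOrDouble(2));
//   TORCH_INTERNAL_ASSERT(d.is_int() && d.as<int64_t>() == 4);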
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,602 +0,0 @@
#include <expr_evaluator.h>
#include <instrumentation.h>
#include <ir_utils.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <evaluator_common.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename VALTYPE>
std::vector<VALTYPE*> getImmediateProducers(VALTYPE* val) {
if (val->definition()) {
auto expr = val->definition();
return expr->inputs();
} else {
return {};
}
}
//! IR-Generic utility, collects all the producers required for the
//! given list of IR values and returns them along with the original
//! list in topological order.
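//! For example (illustrative), if d = a + b and e = d * c, then calling this
//! with {e} returns a list in which a and b appear before d, and d and c
//! appear before e, so the values can be evaluated front to back.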
template <typename VALTYPE>
std::vector<VALTYPE*> makeSortedEvaluationList(std::vector<VALTYPE*> input) {
// Deduplicate
std::vector<VALTYPE*> to_sort;
std::unordered_set<VALTYPE*> visited;
for (auto val : input) {
if (!visited.count(val)) {
to_sort.push_back(val);
visited.insert(val);
}
}
std::vector<VALTYPE*> sorted;
visited.clear();
// Topological Sort
// Note: didn't explicitly exclude producers that are not in the original
// list. This should be acceptable for the intended use.
while (!to_sort.empty()) {
auto top_val = to_sort.back();
if (visited.count(top_val)) {
to_sort.pop_back();
} else {
bool ready_to_pop = true;
for (auto producer : getImmediateProducers(top_val)) {
if (!visited.count(producer)) {
ready_to_pop = false;
to_sort.push_back(producer);
}
}
if (ready_to_pop) {
visited.insert(top_val);
sorted.push_back(top_val);
to_sort.pop_back();
}
}
}
return sorted;
}
//! Kernel IR utility, collects all the symbolic values
//! used in allocation nodes.
void collectBufferSizes(
std::vector<Val*>& into,
const std::vector<Expr*>& exprs) {
for (auto expr : exprs) {
if (auto allocate = dynamic_cast<kir::Allocate*>(expr)) {
into.push_back(allocate->size());
} else if (auto for_loop = dynamic_cast<kir::ForLoop*>(expr)) {
collectBufferSizes(into, for_loop->body().exprs());
} else if (auto ite = dynamic_cast<kir::IfThenElse*>(expr)) {
collectBufferSizes(into, ite->thenBody().exprs());
collectBufferSizes(into, ite->elseBody().exprs());
}
}
}
//! Kernel IR utility, collects all the kernel symbolic
//! values we will need at runtime, i.e. after the
//! generated cuda kernel has already been compiled.
//! The values are to be used for runtime logic, like
//! `computeLaunchparams`.
std::vector<Val*> collectRuntimeUsedValues(kir::Kernel* kernel) {
std::vector<Val*> ret;
auto all_tvs = ir_utils::allTvs(kernel);
// Collect extent and inputs
for (auto tv : all_tvs) {
for (auto id : tv->domain()->domain()) {
ret.push_back(id->extent());
}
}
for (auto inp : kernel->inputs()) {
if (inp->isA<Int>() || inp->isA<Double>()) {
ret.push_back(inp);
}
}
// Collect allocation sizes:
collectBufferSizes(ret, kernel->topLevelExprs());
return makeSortedEvaluationList(ret);
}
std::vector<Val*> collectRuntimeUsedValues(Fusion* fusion) {
std::vector<Val*> ret;
auto all_tvs = ir_utils::allTvs(fusion);
// Collect extent and inputs
for (auto tv : all_tvs) {
for (auto id : tv->domain()->domain()) {
ret.push_back(id->extent());
}
}
for (auto inp : fusion->inputs()) {
if (inp->isA<Int>() || inp->isA<Double>()) {
ret.push_back(inp);
}
}
return makeSortedEvaluationList(ret);
}
} // namespace
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::initializeValueList(
typename IRContext::EVALUATOR_TYPE& const_evaluator,
const std::vector<Val*>& sorted_value_list) {
// Initialize workspace
num_of_values_ = sorted_value_list.size();
defined_ = std::vector<bool>(num_of_values_, false);
is_constant_ = std::vector<bool>(num_of_values_, false);
values_ = std::vector<IntOrDouble>(num_of_values_, -1);
// Fill in constants and assign evaluator indices
for (const auto i : c10::irange(num_of_values_)) {
// Use an expression evaluator to test if value is const
auto const_val = const_evaluator.evaluate(sorted_value_list[i]);
if (const_val.has_value()) {
is_constant_[i] = true;
values_[i] = const_val.value();
}
sorted_value_list[i]->setEvaluatorIndex(i);
}
}
template <typename IRContext>
c10::optional<IntOrDouble> PrecomputedValuesBase<IRContext>::getMaybeValueFor(
const Val* val) {
auto index = val->evaluatorIndex();
if (index < 0) {
return c10::nullopt;
}
if (!defined_[index] && !is_constant_[index]) {
return c10::nullopt;
}
return values_[index];
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::print() const {
std::cout << "Precomputed Values:\n";
for (auto i : c10::irange(symbols_.size())) {
if (defined_[i]) {
std::cout << symbols_[i]->toInlineString() << " = " << values_[i]
<< std::endl;
}
}
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::evaluate() {
FUSER_PERF_SCOPE("PrecomputedValues::Evaluate");
value_machine_->run();
validate();
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::invalidate() {
// clear binding values
binding_log_.clear();
// invalidate value entries
std::fill(defined_.begin(), defined_.end(), false);
// invalidate flag
has_valid_values_ = false;
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::validate() {
FUSER_PERF_SCOPE("PrecomputedValuess::Validate");
for (auto it : binding_log_) {
TORCH_INTERNAL_ASSERT(
values_[it.first] == it.second,
"Precomputed values failed to validate.",
"\nSomething unexpected changed between the compilation and execution.\n",
values_[it.first],
" != ",
it.second);
}
has_valid_values_ = true;
}
template <typename IRContext>
NaiveValueMachine<IRContext>::NaiveValueMachine(
PrecomputedValuesBase<IRContext>& precomputed_values)
: precomputed_values_(precomputed_values) {
num_of_instructions_ = 0;
for (auto val : precomputed_values_.symbols_) {
auto def = val->definition();
if (def) {
if (auto uop = dynamic_cast<UnaryOp*>(def)) {
makeUnaryOp(uop);
} else if (auto bop = dynamic_cast<BinaryOp*>(def)) {
makeBinaryOp(bop);
} else {
TORCH_INTERNAL_ASSERT(false, "Unsupported expr");
}
}
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::run() {
for (const auto i : c10::irange(num_of_instructions_)) {
// Skip this instruction if the dest location
// has already been computed or is constant.
if (precomputed_values_.defined_[dest_[i]] ||
precomputed_values_.is_constant_[dest_[i]]) {
continue;
}
runInstruction(i);
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::makeUnaryOp(UnaryOp* uop) {
int in = uop->inputs()[0]->evaluatorIndex();
int out = uop->outputs()[0]->evaluatorIndex();
TORCH_INTERNAL_ASSERT(in >= 0, "Integer Machine: unknown input: ", uop);
TORCH_INTERNAL_ASSERT(out >= 0, "Integer Machine: unknown out: ", uop);
int index = makeInstructionEntry();
inst_type_[index] = InstructionType::UNARY_OP;
uop_type_[index] = IRContext::getOpType(uop);
if (uop_type_[index] == UnaryOpType::Cast) {
data_type_[index] = uop->out()->getDataType().value();
}
src0_[index] = in;
dest_[index] = out;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::makeBinaryOp(BinaryOp* bop) {
int in0 = bop->inputs()[0]->evaluatorIndex();
int in1 = bop->inputs()[1]->evaluatorIndex();
int out = bop->outputs()[0]->evaluatorIndex();
TORCH_INTERNAL_ASSERT(in0 >= 0, "Integer Machine: unknown lhs: ", bop);
TORCH_INTERNAL_ASSERT(in1 >= 0, "Integer Machine: unknown rhs: ", bop);
TORCH_INTERNAL_ASSERT(out >= 0, "Integer Machine: unknown out: ", bop);
int index = makeInstructionEntry();
inst_type_[index] = InstructionType::BINARY_OP;
bop_type_[index] = IRContext::getOpType(bop);
src0_[index] = in0;
src1_[index] = in1;
dest_[index] = out;
}
template <typename IRContext>
int NaiveValueMachine<IRContext>::makeInstructionEntry() {
int index = num_of_instructions_++;
inst_type_.push_back(InstructionType::UNARY_OP);
uop_type_.push_back(UnaryOpType::Abs);
bop_type_.push_back(BinaryOpType::Add);
data_type_.push_back(DataType::Null);
src0_.push_back(-1);
src1_.push_back(-1);
dest_.push_back(-1);
return index;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runInstruction(int index) {
switch (inst_type_[index]) {
case InstructionType::UNARY_OP:
runUnaryOp(index);
break;
case InstructionType::BINARY_OP:
runBinaryOp(index);
break;
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runUnaryOp(int index) {
using namespace IntOrDouble_functions;
int src_index = src0_[index];
bool src_defined = precomputed_values_.defined_[src_index];
bool src_is_const = precomputed_values_.is_constant_[src_index];
if (!src_defined && !src_is_const) {
return;
}
int dest_index = dest_[index];
auto& src = precomputed_values_.values_[src_index];
auto& dest = precomputed_values_.values_[dest_index];
switch (uop_type_[index]) {
case UnaryOpType::Neg:
dest = -src;
break;
case UnaryOpType::Set:
dest = src;
break;
case UnaryOpType::Cast:
if (data_type_[index] == DataType::Double) {
dest = src.template cast<double>();
} else if (data_type_[index] == DataType::Int) {
dest = src.template cast<int64_t>();
} else {
TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
}
break;
case UnaryOpType::Abs:
dest = abs(src);
break;
default:
TORCH_CHECK(!"Unexpected operator type ", uop_type_[index]);
}
precomputed_values_.defined_[dest_index] = true;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runBinaryOp(int index) {
using namespace IntOrDouble_functions;
int src0_index = src0_[index];
int src1_index = src1_[index];
bool src0_is_const = precomputed_values_.is_constant_[src0_index];
bool src1_is_const = precomputed_values_.is_constant_[src1_index];
bool src_defined =
(precomputed_values_.defined_[src0_index] || src0_is_const) &&
(precomputed_values_.defined_[src1_index] || src1_is_const);
if (!src_defined) {
return;
}
int dest_index = dest_[index];
auto& lhs = precomputed_values_.values_[src0_index];
auto& rhs = precomputed_values_.values_[src1_index];
auto& dest = precomputed_values_.values_[dest_index];
switch (bop_type_[index]) {
case BinaryOpType::Add:
dest = lhs + rhs;
break;
case BinaryOpType::Sub:
dest = lhs - rhs;
break;
case BinaryOpType::Mul:
dest = lhs * rhs;
break;
case BinaryOpType::Div:
TORCH_CHECK(rhs != 0);
dest = lhs / rhs;
break;
case BinaryOpType::Mod:
TORCH_CHECK(rhs != 0);
dest = lhs % rhs;
break;
case BinaryOpType::CeilDiv:
TORCH_CHECK(rhs != 0);
dest = ceildiv(lhs, rhs);
break;
case BinaryOpType::And:
dest = Int::ScalarType(lhs && rhs);
break;
case BinaryOpType::Max:
dest = lhs > rhs ? lhs : rhs;
break;
case BinaryOpType::Min:
dest = lhs < rhs ? lhs : rhs;
break;
default:
TORCH_CHECK(!"Unexpected operator type");
}
precomputed_values_.defined_[dest_index] = true;
}
KernelPrecomputedValues::KernelPrecomputedValues(kir::Kernel* kernel) {
loadSymbols(collectRuntimeUsedValues(kernel));
kir::ExpressionEvaluator evaluator;
initializeValueList(evaluator, symbols());
initializeNamedScalars();
initializeIntegerMachine();
}
// TODO: put this to base class
void KernelPrecomputedValues::bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract) {
const auto root_domain =
TensorDomain::noReductions(tv->domain()->getMaybeRFactorDomain());
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract->getRank() == static_cast<int>(root_domain.size()),
"Something went wrong configuring launch. Inputs do not match.");
for (const auto dim : c10::irange(root_domain.size())) {
auto extent = root_domain[dim]->extent();
auto value = tensor_arg_abstract->getSize(dim);
bindValue(extent->evaluatorIndex(), value);
}
}
namespace {
//! Compares the name of the given scalar with thread size strings
//! and returns the corresponding parallel type if a match
//! is found.
c10::optional<ParallelType> getMaybeThreadSizeParallelType(
NamedScalar* named_scalar) {
auto& var_name = named_scalar->name();
for (auto ptype : kParallelTypeThreads) {
if (var_name == stringifyThreadSize(ptype)) {
return ptype;
}
}
return c10::nullopt;
}
} // namespace
void KernelPrecomputedValues::initializeNamedScalars() {
for (auto val : symbols()) {
if (auto named_scalar = dynamic_cast<NamedScalar*>(val)) {
auto maybe_parallel_type = getMaybeThreadSizeParallelType(named_scalar);
if (maybe_parallel_type.has_value()) {
auto& index_list =
thread_dim_value_indices_[maybe_parallel_type.value()];
if (!index_list) {
index_list = std::make_unique<std::vector<int>>();
}
index_list->push_back(val->evaluatorIndex());
}
}
}
}
// TODO: merge this one with above.
void KernelPrecomputedValues::bindKernelInputs(
kir::Kernel* kernel,
const KernelArgumentHolder& args) {
if (hasValidValues()) {
invalidate();
}
const auto& inputs = kernel->inputs();
TORCH_INTERNAL_ASSERT(
args.size() == inputs.size(), "kernel inputs size does not match args");
for (const auto i : c10::irange(inputs.size())) {
auto arg = args[i];
const auto input = inputs[i];
if (auto tensor_input = dynamic_cast<TensorView*>(input)) {
if (const auto& tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(arg)) {
bindTensorMetaData(tensor_input, tensor_arg_abstract);
} else {
// TODO: cpu scalar of int type should be bound as scalar int as well
TORCH_CHECK(
arg->isType(ArgType::CpuScalarTensor),
"binding input to TensorView expects input arg to be of tensor type");
}
} else if (input->isScalar()) {
if (input->dtype() == DataType::Int) {
TORCH_CHECK(
arg->isType(ArgType::Long),
"binding input to integer type expects input arg to be a scalar of Long type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const int64_t*>(arg->arg()));
} else if (input->dtype() == DataType::Double) {
TORCH_CHECK(
arg->isType(ArgType::Double),
"binding input to double type expects input arg to be a scalar of Double type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const double*>(arg->arg()));
}
}
}
}
void KernelPrecomputedValues::bindParallelExtents(
const ParallelExtentMap& parallel_extents,
const LaunchParams& launch_constraint) {
// Bind values of extents of parallelized
// iterdomains from launch_constraint when applicable.
// Consistency will be checked at validate().
for (const auto& it : parallel_extents) {
auto raw_val = launch_constraint.getRawVal(it.first);
if (raw_val > 0) {
for (auto extent : it.second) {
bindValue(extent->evaluatorIndex(), raw_val);
}
}
}
}
void KernelPrecomputedValues::bindConcreteParallelTypeValue(
ParallelType pt,
int64_t value) {
auto index_list_it = thread_dim_value_indices_.find(pt);
if (index_list_it != thread_dim_value_indices_.end()) {
for (auto index : *(index_list_it->second)) {
bindValue(index, value);
}
}
}
FusionPrecomputedValues::FusionPrecomputedValues(Fusion* fusion)
: fusion_(fusion) {
loadSymbols(collectRuntimeUsedValues(fusion));
ExpressionEvaluator evaluator(fusion);
initializeValueList(evaluator, symbols());
initializeIntegerMachine();
}
// TODO: put this to base class
void FusionPrecomputedValues::bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract) {
const auto root_domain =
TensorDomain::noReductions(tv->getMaybeRFactorDomain());
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract->getRank() == static_cast<int>(root_domain.size()),
"Something went wrong configuring launch. Inputs do not match.");
for (const auto dim : c10::irange(root_domain.size())) {
auto extent = root_domain[dim]->extent();
auto value = tensor_arg_abstract->getSize(dim);
precomputedValuesBaseType::bindValue(extent->evaluatorIndex(), value);
}
}
void FusionPrecomputedValues::bindFusionInputs(
const KernelArgumentHolder& args) {
if (hasValidValues()) {
precomputedValuesBaseType::invalidate();
}
const auto& inputs = fusion_->inputs();
TORCH_INTERNAL_ASSERT(
args.size() == inputs.size(), "kernel inputs size does not match args");
for (const auto i : c10::irange(inputs.size())) {
const auto input = inputs[i];
const ArgAbstract* arg = args[i];
if (auto tensor_input = dynamic_cast<TensorView*>(input)) {
if (const auto& tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(arg)) {
bindTensorMetaData(tensor_input, tensor_arg_abstract);
} else {
TORCH_CHECK(
arg->isType(ArgType::CpuScalarTensor),
"binding input to TensorView expects input arg to be of tensor type");
}
} else if (input->isScalar()) {
if (input->getDataType() == DataType::Int) {
TORCH_CHECK(
arg->isType(ArgType::Long),
"binding input to integer type expects input arg to be a scalar of Long type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const int64_t*>(arg->arg()));
} else if (input->getDataType() == DataType::Double) {
TORCH_CHECK(
arg->isType(ArgType::Double),
"binding input to double type expects input arg to be a scalar of Double type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const double*>(arg->arg()));
}
}
}
}
template class PrecomputedValuesBase<FusionIRContext>;
template class PrecomputedValuesBase<KernelIRContext>;
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,343 +0,0 @@
#pragma once
#include <dynamic_type.h>
#include <executor_kernel_arg.h>
#include <executor_launch_params.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <lower2device.h>
#include <c10/core/DeviceType.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! This is the common space for expression evaluators in the
//! fusion IR and kernel IR contexts. Many of the evaluator
//! optimizations and runtimes can share the same code
//! path, and they are collected here.
class ExpressionEvaluator;
namespace kir {
class ExpressionEvaluator;
} // namespace kir
//! IR Contexts to be passed to generic evaluator optimizations
//! and runtimes. Defines the essential interface for the
//! generic logic to get necessary type and function info
//! from the IR nodes. Generic optimizations will assume
//! the same list of static definitions is provided
//! in each of the contexts, currently just FusionIR
//! and KernelIR.
//! Context for using generic logic on FusionIR
class FusionIRContext {
public:
using TV_TYPE = TensorView;
using EVALUATOR_TYPE = ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
//! Context for using generic logic on KernelIR
class KernelIRContext {
public:
using EVALUATOR_TYPE = kir::ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
template <typename IRContext>
class PrecomputedValuesBase;
//! NaiveValueMachine:
//! This is an un-optimized runtime for evaluating a
//! set of values in one run. The runtime contains
//! a vector of instructions inferred from IR at compile-time
//! and it currently must be associated with an instance of
//! PrecomputedValuesBase that will provide the workspace
//! containing the concrete values for the symbols being evaluated.
template <typename IRContext>
class NaiveValueMachine {
//! The generic types of instructions supported for this
//! machine, currently only binary and unary.
enum class InstructionType { UNARY_OP, BINARY_OP };
public:
//! Constructor lowers all the expr IR nodes stored in precomputed_values
//! and stores them in the private state.
NaiveValueMachine(PrecomputedValuesBase<IRContext>& precomputed_values);
//! Runs all the instructions and write results to the associated
//! precomputed_values.
void run();
private:
//! Convert a unary IR expr to an instruction
void makeUnaryOp(UnaryOp* uop);
//! Convert a binary IR expr to an instruction
void makeBinaryOp(BinaryOp* bop);
//! Create an empty instruction with all default values
//! and place it at the end of the instruction buffer.
int makeInstructionEntry();
//! Run a single instruction at the given index of
//! the instruction buffer. Decodes and dispatches
//! to the corresponding instruction handle functions.
void runInstruction(int index);
//! Runs a unary operation at given index of instruction buffer
void runUnaryOp(int index);
//! Runs a binary operation at given index of instruction buffer
void runBinaryOp(int index);
private:
friend PrecomputedValuesBase<IRContext>;
//! Reference to the PrecomputedValues workspace associated with
//! this runtime. All the instructions will read and write the
//! values in this workspace.
PrecomputedValuesBase<IRContext>& precomputed_values_;
//! Instruction buffer. All states are in separate vectors and
//! the entry of each vector at the same index corresponds to
//! the same instruction.
//! Total number of instructions
int num_of_instructions_ = 0;
//! Machine instruction type for each instruction i.e.
//! unary or binary
std::vector<InstructionType> inst_type_;
//! Unary operator type if applicable, contains a default
//! value at each index corresponding to a binary op.
std::vector<UnaryOpType> uop_type_;
//! Data type for unary op of type UnaryOpType::Cast, contains a default
//! value at each index corresponding other ops.
std::vector<DataType> data_type_;
//! Binary operator type if applicable, contains a default
//! value at each index corresponding to a unary op.
std::vector<BinaryOpType> bop_type_;
//! Indexes of operands and destination of each instruction.
//! The indexes correspond to positions in the workspace
//! where concrete values are hosted.
//! Operand 0 of each instruction.
std::vector<int> src0_;
//! Operand 1 of each instruction, a default value at
//! each index corresponding to a unary op.
std::vector<int> src1_;
//! Destination of each instruction.
std::vector<int> dest_;
};
//! PrecomputedValuesBase:
//! A class to support optimized evaluation of values
//! at runtime.
//! At compile time all necessary values are collected
//! from the given IR nodes, and a runtime and a workspace containing
//! the concrete values are created and pre-allocated.
//! At runtime the value machine is used to evaluate all the
//! values and store them in the workspace ahead of time.
template <typename IRContext>
class PrecomputedValuesBase {
using VALUE_MACHINE = NaiveValueMachine<IRContext>;
public:
explicit PrecomputedValuesBase() = default;
//! Returns if the workspace contains evaluated results.
bool ready() {
return has_valid_values_;
}
//! Runs the internal value machine that will compute
//! the values allocated in the workspace.
void evaluate();
//! Returns value for the given IR node if it's stored
//! in the workspace and has been evaluated.
c10::optional<IntOrDouble> getMaybeValueFor(const Val* val);
//! Debugging helper, prints all the currently known values
void print() const;
protected:
//! Initialize the workspace before first use.
//! Assume the given value list IR nodes have
//! been topologically sorted.
void initializeValueList(
typename IRContext::EVALUATOR_TYPE& evaluator,
const std::vector<Val*>& sorted_value_list);
//! Bind concrete value to the given index
//! if the index is valid.
void bindValue(int index, IntOrDouble value) {
if (index < 0 || is_constant_[index]) {
return;
}
defined_[index] = true;
values_[index] = value;
binding_log_.emplace_back(index, value);
}
//! Invalidate all computed values in the workspace.
void invalidate();
//! Interface for subclasses to access symbols_
void loadSymbols(std::vector<Val*> symbols) {
symbols_ = std::move(symbols);
}
//! Interface for subclasses to access symbols_
std::vector<Val*>& symbols() {
return symbols_;
}
//! Initialize the value runtime that will
//! infer instructions from the workspace.
void initializeIntegerMachine() {
value_machine_ = std::make_unique<VALUE_MACHINE>(*this);
}
bool hasValidValues() {
return has_valid_values_;
}
private:
//! Post evaluation check, throws if any computed value
//! is inconsistent with its bound value
void validate();
//! Returns true if workspace has a computed or constant
//! value for given index.
bool hasValue(int index) {
TORCH_INTERNAL_ASSERT(index > 0);
return defined_[index] || is_constant_[index];
}
private:
friend VALUE_MACHINE;
//! Marks if an evaluation has finished
bool has_valid_values_ = false;
//! The size of workspace
int num_of_values_ = -1;
//! Marks if a value has been bound or
//! computed at each index.
std::vector<bool> defined_;
//! Marks if a value is compile-time constant
//! at each index.
std::vector<bool> is_constant_;
//! Stores the concrete values at each index.
std::vector<IntOrDouble> values_;
//! Stores the IR nodes corresponding to each index.
std::vector<Val*> symbols_;
//! An internal log to keep track of all the bindings
//! used in each evaluation cycle. To be used for
//! consistency check.
std::vector<std::pair<int, IntOrDouble>> binding_log_;
//! Integer runtime for realizing the values computations.
std::unique_ptr<VALUE_MACHINE> value_machine_;
};
//! PrecomputedValues workspace in Fusion IR context,
//! defines the set of values to be collected in each
//! fusion graph and the input value binding given each
//! fusion runtime input.
class FusionPrecomputedValues : public PrecomputedValuesBase<FusionIRContext> {
using precomputedValuesBaseType = PrecomputedValuesBase<FusionIRContext>;
public:
FusionPrecomputedValues(Fusion* fusion);
//! Bind concrete values from fusion runtime inputs
void bindFusionInputs(const KernelArgumentHolder& args);
private:
void bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract);
private:
Fusion* fusion_ = nullptr;
};
//! PrecomputedValues workspace in Fusion IR context,
//! defines the set of values to be collected in each
//! kernel IR sequence and the input value binding given each
//! fusion runtime input and launch constraints.
class KernelPrecomputedValues : public PrecomputedValuesBase<KernelIRContext> {
using precomputedValuesBaseType = PrecomputedValuesBase<KernelIRContext>;
public:
using ParallelExtentMap =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
KernelPrecomputedValues(kir::Kernel* kernel);
//! Bind concrete values from fusion runtime inputs
void bindKernelInputs(kir::Kernel* kernel, const KernelArgumentHolder& args);
//! Bind concrete values from launch constraints
void bindParallelExtents(
const ParallelExtentMap& parallel_extents,
const LaunchParams& launch_constraint);
//! Bind the NamedScalars corresponding to the
//! concrete parallel dimension sizes after the
//! actual value has been resolved.
void bindConcreteParallelTypeValue(ParallelType pt, int64_t value);
private:
void bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract);
//! Iterate through all the named scalars corresponding
//! to thread sizes and pre-group them by their parallel
//! types.
void initializeNamedScalars();
private:
//! Contains all the named scalars correspond
//! to thread size of each parallel type.
std::unordered_map<ParallelType, std::unique_ptr<std::vector<int>>, TypeHash>
thread_dim_value_indices_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
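// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming only the declarations above, the evaluate-ahead-of-time flow for a
// fusion would have looked roughly like this; `fusion`, `args` and `extent`
// are hypothetical caller-provided objects.
namespace {
void precomputedValuesExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    const torch::jit::fuser::cuda::KernelArgumentHolder& args,
    const torch::jit::fuser::cuda::Val* extent) {
  using namespace torch::jit::fuser::cuda;
  FusionPrecomputedValues precomputed(fusion);
  precomputed.bindFusionInputs(args); // bind concrete runtime input values
  precomputed.evaluate();             // run the internal value machine once
  if (precomputed.ready()) {
    // Every collected value can now be read back without re-walking the IR.
    auto maybe_value = precomputed.getMaybeValueFor(extent);
    (void)maybe_value;
  }
}
} // namespace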

File diff suppressed because it is too large

View File

@ -1,330 +0,0 @@
#pragma once
#include <executor_launch_params.h>
#include <executor_utils.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <utils.h>
#include <c10/core/DeviceType.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
TORCH_CUDA_CU_API bool shouldFillAllocationWithNan();
TORCH_CUDA_CU_API void setFillAllocationWithNan(bool value);
// TODO: Should this actually be in launch params?
struct TORCH_CUDA_CU_API CompileOptions {
c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
KernelIndexMode index_mode = KernelIndexMode::INT64;
};
class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
public:
// Unsafe compilation that's useful for debugging kernels, iterating over
// slight modifications of a generated kernel
void debugCompileFusionFromStr(
Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options = CompileOptions());
//! Infers output sizes by returning a non-allocated KernelArgumentHolder.
//! This function is useful for async compilation of segmented fusions.
KernelArgumentHolder inferOutputSizes(
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints);
void compileFusion(
Fusion* fusion,
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints = LaunchParams());
// TODO: merge it with the overload above.
//! This API is merely here so we don't have to go back and update all cpp
//! tests.
void compileFusion(
Fusion* fusion,
const at::ArrayRef<IValue>& inputs = {},
const LaunchParams& launch_constraints = LaunchParams()) {
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
compileFusion(fusion, args, launch_constraints);
}
std::vector<at::Tensor> runFusion(
KernelArgumentHolder& args,
const LaunchParams& launch_constraints = LaunchParams(),
const std::vector<at::Tensor>& outputs = {});
std::vector<at::Tensor> runFusion(
const at::ArrayRef<IValue>& inputs,
const std::vector<at::Tensor>& outputs,
const LaunchParams& launch_constraints = LaunchParams(),
const c10::optional<size_t>& opt_code = c10::nullopt) {
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
if (opt_code.has_value()) {
args.setCacheId(*opt_code);
}
return runFusion(args, launch_constraints, outputs);
}
std::vector<at::Tensor> runFusion(
const at::ArrayRef<IValue>& inputs,
const LaunchParams& launch_constraints = LaunchParams(),
const c10::optional<size_t>& opt_code = c10::nullopt) {
return runFusion(inputs, {}, launch_constraints, opt_code);
}
// function to query whether a `FusionExecutor` has a compiled kernel to
// execute
bool compiled() const {
return fusion_id_ != -1 && lowered_;
};
void evictCache(size_t cache_id) {
executor_entry_lookup_.erase(cache_id);
}
// struct used to hold necessary information to launch compiled kernel on a
// given input set.
//
// TODO: strides would also be important when we handle permutations in
// codegen.
//
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct ExecutorEntry {
bool init = false;
LaunchParams launch_params;
std::vector<std::pair<int, int>> io_alias_indices;
std::vector<std::vector<int64_t>> output_sizes;
std::vector<std::vector<int64_t>> output_strides;
std::vector<at::ScalarType> output_types;
std::vector<std::vector<int64_t>> buffer_sizes;
std::vector<at::ScalarType> buffer_types;
std::vector<bool> buffer_zero_init;
uint64_t rand_offset;
};
using ExecutorCompileTimeInfoCache =
executor_utils::caching::ExecutorCompileTimeInfoCache;
kir::Kernel* kernel() const {
TORCH_INTERNAL_ASSERT(lowered_);
return lowered_->kernel();
}
//! Internal knob used for debugging/profiling only
void setExecuteKernelFlag(bool execute_kernel) {
execute_kernel_ = execute_kernel;
}
//! Internal knob used for debugging/profiling only
void setMeasureKernelTimeFlag(bool measure_kernel_time) {
measure_kernel_time_ = measure_kernel_time;
}
//! Returns the last kernel execution time, in milliseconds
//!
//! \note The kernel time is only tracked if enabled by calling
//! setMeasureKernelTimeFlag(true)
//!
float kernelTimeMs() const {
return measure_kernel_time_ ? kernel_time_ms_ : 0;
}
//! Returns the number of bytes processed in the last kernel execution
int64_t bytesProcessed() const {
return bytes_processed_;
}
//! Returns the launch parameters from the last kernel execution
LaunchParams lastLaunchParams() const {
return launch_params_;
}
//! Returns the string of the compiled kernel
std::string kernelString() const {
return kernel_code_;
}
//! Returns the latest compile log
std::string compilerLog() const {
return last_compiler_log_;
}
std::string kernelName() const {
std::stringstream ss;
ss << "kernel" << fusion_id_;
return ss.str();
}
//! Internal tests only. Compiles CUDA code with NVRTC directly from
//! string. This util provides a path to test runtime code, i.e. the resource
//! strings.
void compileRtc(
const std::string& code,
const std::string& name,
bool structured = false,
CompileOptions options = CompileOptions());
//! Internal tests only. Runs the compiled CUDA kernel from compileRtc.
void runRtc(
const LaunchParams& launch_params,
const std::vector<at::Tensor>& args);
//! Internal knob used for debugging/profiling only
void disableLaunchParamCache() {
disable_parameter_cache_ = true;
}
private:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct GlobalBuffers {
std::vector<at::Tensor> buffers;
std::vector<bool> zero_init;
at::Tensor profile_buffer;
};
static std::string kernelNamespace() {
return "CudaCodeGen";
}
// Add preamble and wrap in namespace
std::string getStructuredCode(const std::string& kernel);
LaunchParams computeLaunchParams(
const LaunchParams& launch_constraints,
kir::ExpressionEvaluator& expr_eval,
const int warp_size);
uint64_t computeSharedMemory(
kir::ExpressionEvaluator& expr_eval,
const std::vector<const kir::Allocate*>& buffers,
bool align_padding = false,
uint64_t total = 0);
// Returns a pair of vectors of tensors, where tensors in the first vector are
// not initialized, while the second vector contains zero-initialized tensors
GlobalBuffers allocGlobalVals(kir::ExpressionEvaluator& expr_eval);
// alias_indices: indices of outputs that are aliases of inputs; we should
// skip allocating real storage for those, but still keep their spots to
// preserve the indexing from output aliases to inputs
std::vector<at::Tensor> allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});
void setUsedTVs();
const std::vector<TensorView*>& getUsedTVs() const {
return used_tvs_;
};
ExecutorCompileTimeInfoCache* compileTimeDataCache() {
return &compile_time_info_cache_;
}
//! Returns a KernelArgumentHolder representing the output sizes from kernel
//! execution. Note: 1. this API ignores aliased outputs and instead
//! pushes scalar int 0 as a placeholder; 2. this API doesn't actually
//! allocate outputs in memory, but rather is used just to infer output sizes.
KernelArgumentHolder evaluateOutputSizes(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});
private:
CompileOptions options_;
//! Current configured total shared mem size from cudaDeviceProp
size_t configured_device_smem_ = std::numeric_limits<size_t>().max();
//! Available shared memory space for dynamic allocation for the current
//! compiled kernel at the current shared memory/L1 configuration
c10::optional<size_t> maybe_available_dynamic_smem_ = c10::nullopt;
//! Absolute limit of all available shared mem space from cudaDeviceProp
size_t device_smem_limit_ = std::numeric_limits<size_t>().max();
// Assuming sm70 or above:
// limit of statically allocated smem is 48 KB:
// See:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
const uint64_t max_static_smem_ = 48 << 10;
int warp_size_ = 0;
executor_utils::NvrtcFunction compiled_kernel_;
// TensorViews actually used in the kernel.
std::vector<TensorView*> used_tvs_;
// Counter to be used for kernel name.
int fusion_id_ = -1;
static int fusion_id_counter_;
std::unique_ptr<GpuLower> lowered_;
// Copy of lowered_->kernel()
Fusion* fusion_ = nullptr;
// Track the block size this kernel was compiled with. If the block size
// increases, recompile to adjust the max register count.
int64_t block_size_high_water_mark = 1;
// Lookup table used as a shortcut to retrieve recorded information in order to
// launch kernels without re-inferring parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
// Compile time information caching. This is used for shape inference
// support. The cache stores graph information that is available
// without shape information so that each shape inference call will
// not need to re-compute it.
ExecutorCompileTimeInfoCache compile_time_info_cache_;
// Cached expr eval
std::unique_ptr<KernelPrecomputedValues> evaluator_precomputed_values_ =
nullptr;
// Profiling support: knob to control whether we actually execute the
// kernel on the GPU or not
bool execute_kernel_ = true;
// Profiling support: knob to enable measuring kernel execution time
bool measure_kernel_time_ = false;
// Profiling support: the last kernel execution time, if measure_kernel_time_
// is true
float kernel_time_ms_ = 0;
// Profiling support: the last kernel Bytes processed
int64_t bytes_processed_ = 0;
// Profiling support: the last launch param used
LaunchParams launch_params_;
// Profiling support: disable caching of launch params and output allocation.
// Output allocation is also disabled when output sizes are dependent on
// runtime scalar inputs, such as in the case of tensor factories. See
// https://github.com/csarofeen/pytorch/issues/2002
bool disable_parameter_cache_ = false;
// Profiling support: kept copy of the cuda kernel
std::string kernel_code_;
// Profiling support: nvrtc log for debugging
std::string last_compiler_log_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
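// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, compiling and running a fusion through the
// executor would have looked roughly like this; `fusion` and `inputs` are
// hypothetical caller-provided objects.
namespace {
std::vector<at::Tensor> compileAndRunExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    const at::ArrayRef<c10::IValue>& inputs) {
  using namespace torch::jit::fuser::cuda;
  FusionExecutor fe;
  // Lowers the fusion, generates CUDA code and compiles it with NVRTC.
  fe.compileFusion(fusion, inputs);
  TORCH_INTERNAL_ASSERT(fe.compiled());
  // Launch parameters are inferred from the inputs unless constrained here.
  return fe.runFusion(inputs);
}
} // namespace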

View File

@ -1,320 +0,0 @@
#include <c10/util/irange.h>
// Extract size and strides
#include <kernel_cache.h>
#include <executor_kernel_arg.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename T, typename nvfuser_index_t>
std::unique_ptr<TensorArgAbstract> getTensorArg(int nDims) {
switch (nDims) {
case (0):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 0, nvfuser_index_t>,
nvfuser_index_t>>();
case (1):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 1, nvfuser_index_t>,
nvfuser_index_t>>();
case (2):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 2, nvfuser_index_t>,
nvfuser_index_t>>();
case (3):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 3, nvfuser_index_t>,
nvfuser_index_t>>();
case (4):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 4, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (5):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 5, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (6):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 6, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (7):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 7, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (8):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 8, nvfuser_index_t>,
nvfuser_index_t>>();
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to generate a tensor to run a generated kernel with ",
nDims,
" dimensions, however only 0 to 8 dimensional tensor are supported.");
}
return nullptr;
}
template <typename INDEX_MODE>
std::unique_ptr<TensorArgAbstract> getTensorArg(
c10::ScalarType dtype,
int nDims) {
switch (dtype) {
case c10::ScalarType::Double:
return getTensorArg<double, INDEX_MODE>(nDims);
case c10::ScalarType::Float:
return getTensorArg<float, INDEX_MODE>(nDims);
case c10::ScalarType::Half:
return getTensorArg<at::Half, INDEX_MODE>(nDims);
case c10::ScalarType::BFloat16:
return getTensorArg<at::BFloat16, INDEX_MODE>(nDims);
case c10::ScalarType::Bool:
return getTensorArg<bool, INDEX_MODE>(nDims);
case c10::ScalarType::Long:
return getTensorArg<int64_t, INDEX_MODE>(nDims);
case c10::ScalarType::Int:
return getTensorArg<int32_t, INDEX_MODE>(nDims);
case c10::ScalarType::ComplexFloat:
return getTensorArg<c10::complex<float>, INDEX_MODE>(nDims);
case c10::ScalarType::ComplexDouble:
return getTensorArg<c10::complex<double>, INDEX_MODE>(nDims);
default:
TORCH_CHECK(
false,
"Dtype: ",
dtype,
" not currently supported in code generated kernels.");
}
}
std::unique_ptr<TensorArgAbstract> getTensorArg(
c10::ScalarType dtype,
int nDims,
KernelIndexMode index_mode) {
switch (index_mode) {
case KernelIndexMode::INT32:
return getTensorArg<int>(dtype, nDims);
case KernelIndexMode::INT64:
return getTensorArg<int64_t>(dtype, nDims);
default:
break;
}
TORCH_INTERNAL_ASSERT(false, "unknown index mode");
return nullptr;
}
} // namespace
KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(
const c10::ArrayRef<c10::IValue>& inputs) {
if (inputs.empty()) {
// default to int32 on device 0
KernelArgumentHolder args(KernelIndexMode::INT32);
args.setDeviceIndex(0);
return args;
}
auto device_index = getCommonDeviceCUDA(inputs);
auto index_mode = collectIndexMode(inputs);
KernelArgumentHolder args(index_mode);
args.setDeviceIndex(device_index);
args.push(inputs);
return args;
}
// Push a tensor to the arguments
void KernelArgumentHolder::push(const at::Tensor& tensor) {
changed_ = true;
if (is_cpu_scalar(tensor)) {
switch (tensor.scalar_type()) {
case c10::ScalarType::ComplexDouble:
arguments_.push_back(std::make_unique<CpuScalarTensorArg<
CpuScalarTensorCodegen<c10::complex<double>>>>(
tensor.data_ptr<c10::complex<double>>()[0]));
break;
case c10::ScalarType::ComplexFloat:
arguments_.push_back(std::make_unique<CpuScalarTensorArg<
CpuScalarTensorCodegen<c10::complex<float>>>>(
tensor.data_ptr<c10::complex<float>>()[0]));
break;
case c10::ScalarType::Double:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<double>>>(
tensor.data_ptr<double>()[0]));
break;
case c10::ScalarType::Float:
arguments_.push_back(
std::make_unique<CpuScalarTensorArg<CpuScalarTensorCodegen<float>>>(
tensor.data_ptr<float>()[0]));
break;
case c10::ScalarType::Half:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<at::Half>>>(
tensor.data_ptr<at::Half>()[0]));
break;
case c10::ScalarType::BFloat16:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<at::BFloat16>>>(
tensor.data_ptr<at::BFloat16>()[0]));
break;
case c10::ScalarType::Bool:
arguments_.push_back(
std::make_unique<CpuScalarTensorArg<CpuScalarTensorCodegen<bool>>>(
tensor.data_ptr<bool>()[0]));
break;
case c10::ScalarType::Long:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<int64_t>>>(
tensor.data_ptr<int64_t>()[0]));
break;
case c10::ScalarType::Int:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<int32_t>>>(
tensor.data_ptr<int32_t>()[0]));
break;
default:
TORCH_CHECK(
false,
"Dtype: ",
tensor.scalar_type(),
" not currently supported in code generated kernels.");
}
} else {
int nDims = tensor.ndimension();
c10::ScalarType dtype = tensor.scalar_type();
std::unique_ptr<TensorArgAbstract> tensor_arg =
getTensorArg(dtype, nDims, index_mode_);
tensor_arg->setTensor(tensor);
tensor_arg->setPointer(tensor.data_ptr());
tensor_arg->setDataType(aten_to_data_type(dtype));
for (const auto i : c10::irange(nDims)) {
tensor_arg->setSize(i, tensor.sizes()[i]);
tensor_arg->setStride(i, tensor.strides()[i]);
}
arguments_.push_back(std::move(tensor_arg));
}
}
// Push a scalar or integer to the arguments
void KernelArgumentHolder::push(const IValue& val) {
changed_ = true;
TORCH_INTERNAL_ASSERT(
val.isScalar(),
"Tried to push an arg to run in a fused kernel, expected a scalar but got, ",
val);
auto scalar_val = val.toScalar();
switch (scalar_val.type()) {
// NOLINTNEXTLINE(bugprone-branch-clone)
case c10::ScalarType::ComplexDouble:
arguments_.push_back(
std::make_unique<ComplexDoubleArg>(scalar_val.toComplexDouble()));
return;
case c10::ScalarType::Double:
arguments_.push_back(std::make_unique<DoubleArg>(scalar_val.toDouble()));
return;
case c10::ScalarType::Long:
arguments_.push_back(std::make_unique<LongArg>(scalar_val.toLong()));
return;
case c10::ScalarType::Bool:
arguments_.push_back(std::make_unique<BoolArg>(scalar_val.toBool()));
return;
default:
TORCH_INTERNAL_ASSERT(
false,
" Tried to create argument to send to a fused kernel, but got an unexpected type.");
}
TORCH_INTERNAL_ASSERT(
false,
" Tried to create argument to send to a fused kernel, but got a non-scalar type.");
}
void KernelArgumentHolder::push(int64_t val) {
arguments_.push_back(std::make_unique<LongArg>(val));
}
void KernelArgumentHolder::push(const at::PhiloxCudaState& val) {
arguments_.push_back(std::make_unique<PhiloxCudaStateArg>(val));
}
// Create buffer, flatten arguments into it, align by 8 Bytes, return pointers
// in the buffer
void** KernelArgumentHolder::getBuffer() {
if (changed_) {
void_ptrs_ = std::vector<void*>(arguments_.size(), nullptr);
for (const auto i : c10::irange(arguments_.size())) {
void_ptrs_[i] = static_cast<void*>(arguments_[i]->arg());
}
changed_ = false;
}
return void_ptrs_.data();
}
void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
// Naive I/O setup; I'm ignoring all the potential transformations (i.e. the I/O
// allocated here from the subgraph could be, and very likely is, different
// from the I/O expected by the generated CUDA kernel).
for (const auto& arg : args) {
if (arg.isTensor()) {
push(arg.toTensor());
} else {
push(arg);
}
}
}
void KernelArgumentHolder::push(const std::vector<at::Tensor>& tensors) {
for (const auto& tensor : tensors) {
push(tensor);
}
}
void KernelArgumentHolder::push(const ArgAbstract* arg) {
changed_ = true;
arguments_.emplace_back(arg->copy_unique_ptr());
}
void KernelArgumentHolder::swap(int i, const ArgAbstract* arg) {
changed_ = true;
auto holder = arg->copy_unique_ptr();
arguments_[i].swap(holder);
}
void KernelArgumentHolder::appendPhiloxRNGSeed(uint64_t rand_offset) {
at::PhiloxCudaState philox_engine_inputs;
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
philox_engine_inputs =
at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_cuda_state(
rand_offset);
}
push(philox_engine_inputs);
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
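// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations in <executor_kernel_arg.h>, packing launch
// arguments would have looked roughly like this; the tensors and the scalar
// value are hypothetical.
namespace {
void packArgumentsExample(const at::Tensor& in, const at::Tensor& out) {
  using namespace torch::jit::fuser::cuda;
  KernelArgumentHolder args(KernelIndexMode::INT64);
  args.push(in);                       // records sizes/strides/dtype/pointer only
  args.push(out);
  args.push(static_cast<int64_t>(42)); // scalar arguments are copied by value
  // getBuffer() flattens all arguments into an array of void* suitable for a
  // cuLaunchKernel-style call.
  void** kernel_args = args.getBuffer();
  (void)kernel_args;
}
} // namespace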

View File

@ -1,397 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <c10/util/Exception.h>
#include <type.h>
#include <torch/csrc/jit/ir/ir.h>
#include <array>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// This should match the tensor used in the code generation (almost exactly)
template <typename T, int N, typename nvfuser_index_t>
struct TensorArgCodegen {
T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
std::array<nvfuser_index_t, N> size;
std::array<nvfuser_index_t, N> stride;
constexpr int nDims() const {
return N;
}
void setSize(int i, nvfuser_index_t s) {
size[i] = s;
}
void setStride(int i, nvfuser_index_t s) {
stride[i] = s;
}
nvfuser_index_t getSize(int i) const {
return size[i];
}
nvfuser_index_t getStride(int i) const {
return stride[i];
}
};
// 0-Dim GPU based tensor
template <typename T, typename nvfuser_index_t>
struct TensorArgCodegen<T, 0, nvfuser_index_t> {
T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
constexpr int nDims() const {
return 0;
}
void setSize(int, nvfuser_index_t) {
TORCH_INTERNAL_ASSERT(false, "Tried to set size of a 0-dim tensor");
}
void setStride(int, nvfuser_index_t) {
TORCH_INTERNAL_ASSERT(false, "Tried to set stride of a 0-dim tensor");
}
nvfuser_index_t getSize(int i) const {
TORCH_INTERNAL_ASSERT(false, "Tried to get size of a 0-dim tensor");
}
nvfuser_index_t getStride(int i) const {
TORCH_INTERNAL_ASSERT(false, "Tried to get stride of a 0-dim tensor");
}
};
// Specialization for the 0-dim case that makes it easy to pass in a CPU-based
// tensor without memcpy
template <typename T>
struct CpuScalarTensorCodegen {
T& operator[](int) {
return data;
};
T data;
};
// TODO: macro this and the printer below
enum class ArgType {
PhiloxCudaState,
Long,
Double,
ComplexDouble,
Bool,
Tensor,
CpuScalarTensor
};
inline std::string argTypeToString(ArgType type) {
std::string ret;
switch (type) {
case ArgType::PhiloxCudaState:
ret = "PhiloxCudaState";
break;
case ArgType::Long:
ret = "Long";
break;
case ArgType::Double:
ret = "Double";
break;
case ArgType::ComplexDouble:
ret = "ComplexDouble";
break;
case ArgType::Bool:
ret = "Bool";
break;
case ArgType::Tensor:
ret = "Tensor";
break;
case ArgType::CpuScalarTensor:
ret = "CpuScalarTensor";
break;
}
return ret;
}
struct ArgAbstract {
virtual ~ArgAbstract() = default;
virtual const void* arg() const = 0;
virtual void* arg() = 0;
virtual bool isType(ArgType type) const = 0;
virtual ArgType type() const = 0;
virtual std::unique_ptr<ArgAbstract> copy_unique_ptr() const = 0;
virtual void print() const {
printf("input type: %s\n", argTypeToString(type()).c_str());
};
};
#define DEF_HELPEE_FUNC(TARGET_TYPE, ARG_NAME) \
bool isType(ArgType type) const override { \
return ArgType::TARGET_TYPE == type; \
} \
ArgType type() const override { \
return ArgType::TARGET_TYPE; \
} \
const void* arg() const override { \
return &ARG_NAME; \
} \
void* arg() override { \
return &ARG_NAME; \
} \
std::unique_ptr<ArgAbstract> copy_unique_ptr() const override { \
return std::make_unique<TARGET_TYPE##Arg>(*this); \
}
#define DEF_PRINT_FUNC \
void print() const override { \
std::cout << val_ << std::endl; \
}
struct PhiloxCudaStateArg : public ArgAbstract {
at::PhiloxCudaState val_;
PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){};
DEF_HELPEE_FUNC(PhiloxCudaState, val_)
};
struct LongArg : public ArgAbstract {
int64_t val_;
explicit LongArg(int64_t _val) : val_(_val) {}
DEF_HELPEE_FUNC(Long, val_)
DEF_PRINT_FUNC
};
struct DoubleArg : public ArgAbstract {
double val_;
explicit DoubleArg(double _val) : val_(_val) {}
DEF_HELPEE_FUNC(Double, val_)
DEF_PRINT_FUNC
};
struct ComplexDoubleArg : public ArgAbstract {
c10::complex<double> val_;
explicit ComplexDoubleArg(c10::complex<double> _val) : val_(_val) {}
DEF_HELPEE_FUNC(ComplexDouble, val_)
DEF_PRINT_FUNC
};
struct BoolArg : public ArgAbstract {
bool val_;
explicit BoolArg(bool _val) : val_(_val) {}
DEF_HELPEE_FUNC(Bool, val_)
DEF_PRINT_FUNC
};
struct TensorArgAbstract : ArgAbstract {
virtual void setSize(int i, int64_t size) = 0;
virtual void setStride(int i, int64_t stride) = 0;
virtual void setPointer(void* ptr) = 0;
virtual void setDataType(DataType data_type) = 0;
virtual void setTensor(at::Tensor tensor) = 0;
virtual int64_t getRank() const = 0;
virtual int64_t getSize(int i) const = 0;
virtual int64_t getStride(int i) const = 0;
virtual void* getPointer() const = 0;
virtual DataType getDataType() const = 0;
virtual int64_t numel() const = 0;
virtual at::Tensor getTensor() const = 0;
// TODO: clean it up and also print out dtype
void print() const override {
auto rank = getRank();
std::cout << "tensor dtype: " << getDataType() << " sizes: (";
for (auto i = 0; i < rank; i++) {
std::cout << getSize(i) << ", ";
}
std::cout << ") stride: (";
for (auto i = 0; i < rank; i++) {
std::cout << getStride(i) << ", ";
}
std::cout << ") pointer: " << getPointer() << std::endl;
}
};
template <typename TENSOR_TYPE, typename nvfuser_index_t>
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct TensorArg : public TensorArgAbstract {
TENSOR_TYPE instance_;
// TODO: this is ugly, we should be extracting data type from `instance_`
// instead
DataType data_type_ = DataType::Null;
at::Tensor tensor_;
void setSize(int i, int64_t size) override {
instance_.setSize(i, (nvfuser_index_t)size);
}
void setStride(int i, int64_t stride) override {
instance_.setStride(i, (nvfuser_index_t)stride);
}
void setPointer(void* ptr) override {
instance_.data = static_cast<decltype(TENSOR_TYPE::data)>(ptr);
}
void setDataType(DataType data_type) override {
data_type_ = data_type;
}
void setTensor(at::Tensor tensor) override {
tensor_ = tensor;
}
int64_t getSize(int i) const override {
return instance_.getSize(i);
}
int64_t getStride(int i) const override {
return instance_.getStride(i);
}
int64_t getRank() const override {
return instance_.nDims();
}
void* getPointer() const override {
return instance_.data;
}
DataType getDataType() const override {
return data_type_;
}
at::Tensor getTensor() const override {
return tensor_;
}
int64_t numel() const override {
int64_t ret = 1;
for (auto i : c10::irange(instance_.nDims())) {
ret *= instance_.getSize(i);
}
return ret;
}
DEF_HELPEE_FUNC(Tensor, instance_)
};
template <typename CPU_TENSOR_TYPE>
struct CpuScalarTensorArg : public ArgAbstract {
CPU_TENSOR_TYPE instance_;
CpuScalarTensorArg() = delete;
explicit CpuScalarTensorArg(decltype(CPU_TENSOR_TYPE::data) _data) {
instance_.data = _data;
}
DEF_HELPEE_FUNC(CpuScalarTensor, instance_)
};
// TODO: This class needs some further clean up and refactor
//! KernelArgumentHolder copies meta information from kernel inputs, including
//! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used
//! for both compilation as well as kernel execution. The important thing is to
//! strip ownership of tensor from KernelArgumentHolder, so that during async
//! compilation, we are not unnecessarily holding memory that is not needed.
class TORCH_CUDA_CU_API KernelArgumentHolder {
public:
//! Create a KernelArgumentHolder from c10 inputs. Note that we are not taking
//! ownership of the memory from the original inputs, but just recording
//! their meta data for kernel execution/compilation.
static KernelArgumentHolder createKernelArgumentHolder(
const c10::ArrayRef<c10::IValue>& inputs);
KernelIndexMode getIndexMode() const {
return index_mode_;
}
explicit KernelArgumentHolder(KernelIndexMode index_mode)
: index_mode_(index_mode) {}
KernelArgumentHolder(const KernelArgumentHolder& self)
: device_index_(self.getDeviceIndex()),
cache_id_(self.getCacheId()),
index_mode_(self.getIndexMode()) {
for (const auto& arg : self.arguments_) {
push(arg.get());
}
}
KernelArgumentHolder& operator=(const KernelArgumentHolder& self) {
device_index_ = self.getDeviceIndex();
index_mode_ = self.getIndexMode();
for (const auto& arg : self.arguments_) {
push(arg.get());
}
return *this;
}
// Push a tensor to the arguments
void push(const at::Tensor& tensor);
// Push a scalar or integer to the arguments
void push(const IValue& val);
void push(const at::PhiloxCudaState& val);
// Create buffer, flatten arguments into it, align by 8 Bytes, return pointers
// in the buffer
void** getBuffer();
void push(const c10::ArrayRef<c10::IValue>& args);
void push(const std::vector<at::Tensor>& tensors);
void push(const ArgAbstract* arg);
void swap(int i, const ArgAbstract* arg);
// push int64
void push(int64_t val);
const ArgAbstract* back() const {
return arguments_.back().get();
}
void appendPhiloxRNGSeed(uint64_t rand_offset);
const ArgAbstract* operator[](int ind) const {
return arguments_.at(ind).get();
};
size_t size() const {
return arguments_.size();
}
bool empty() const {
return arguments_.empty();
}
void setDeviceIndex(int index) {
device_index_ = index;
}
int getDeviceIndex() const {
return device_index_;
}
void setCacheId(size_t id) {
cache_id_ = id;
}
c10::optional<size_t> getCacheId() const {
return cache_id_;
}
void print() const {
for (const auto& arg : arguments_) {
arg->print();
}
}
private:
std::vector<std::unique_ptr<ArgAbstract>> arguments_;
std::vector<void*> void_ptrs_;
bool changed_ = true;
int device_index_ = 0;
c10::optional<size_t> cache_id_ = c10::nullopt;
KernelIndexMode index_mode_ = KernelIndexMode::INT64;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
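// Illustrative sketch, not part of the original nvfuser sources:
// TensorArgCodegen mirrors the Tensor struct emitted into the generated CUDA
// kernel, so the host-side argument bytes line up with the device-side
// parameter. A 2-D float tensor with 64-bit indexing, for example, would be
// described as follows (the pointer, sizes and strides are hypothetical):
namespace {
void describeTensorExample(float* device_ptr) {
  using Float2D = torch::jit::fuser::cuda::TensorArgCodegen<float, 2, int64_t>;
  Float2D t;
  t.data = device_ptr; // raw device pointer
  t.setSize(0, 128);
  t.setSize(1, 256);
  t.setStride(0, 256);
  t.setStride(1, 1);
  // sizeof(Float2D) is exactly what the generated kernel expects for this
  // parameter: one pointer plus two arrays of two nvfuser_index_t each.
  (void)t;
}
} // namespace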

View File

@ -1,134 +0,0 @@
#include <executor_launch_params.h>
#include <ATen/cuda/CUDAContext.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
void LaunchParams::assertValid() {
TORCH_INTERNAL_ASSERT(
bdimx() * bdimy() * bdimz() > 0 &&
bdimx() * bdimy() * bdimz() <=
(int64_t)at::cuda::getCurrentDeviceProperties()
->maxThreadsPerMultiProcessor,
"Selected invalid number of threads for cuda: ",
bdimx() * bdimy() * bdimz());
TORCH_INTERNAL_ASSERT(
gdimx() > 0 && gdimx() < (std::int64_t(1) << 32) - 1,
"Invalid number of blocks in x direction: ",
gdimx());
TORCH_INTERNAL_ASSERT(
gdimy() > 0 && gdimy() <= 65535,
"Invalid number of blocks in y direction: ",
gdimy());
TORCH_INTERNAL_ASSERT(
gdimz() > 0 && gdimz() <= 65535,
"Invalid number of blocks in z direction: ",
gdimz());
}
void LaunchParams::bind(int64_t val, ParallelType p_type) {
switch (p_type) {
case ParallelType::TIDx:
checkAndSet(val, bdimx_, "blockDim.x");
break;
case ParallelType::BIDx:
checkAndSet(val, gdimx_, "gridDim.x");
break;
case ParallelType::TIDy:
checkAndSet(val, bdimy_, "blockDim.y");
break;
case ParallelType::BIDy:
checkAndSet(val, gdimy_, "gridDim.y");
break;
case ParallelType::TIDz:
checkAndSet(val, bdimz_, "blockdim.z");
break;
case ParallelType::BIDz:
checkAndSet(val, gdimz_, "gridDim.z");
break;
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to bind invalid parallel type in launch config: ",
p_type);
}
assertValid();
}
int64_t LaunchParams::getDim(ParallelType p_type) const {
switch (p_type) {
case ParallelType::TIDx:
return bdimx();
case ParallelType::BIDx:
return gdimx();
case ParallelType::TIDy:
return bdimy();
case ParallelType::BIDy:
return gdimy();
case ParallelType::TIDz:
return bdimz();
case ParallelType::BIDz:
return gdimz();
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to get with invalid parallel type in launch config: ",
p_type);
}
}
bool LaunchParams::hasDim(ParallelType p_type) const {
return getRawVal(p_type) != UNINITIALIZED_VAL;
}
const int64_t& LaunchParams::getRawVal(ParallelType p_type) const {
switch (p_type) {
case ParallelType::TIDx:
return bdimx_;
case ParallelType::BIDx:
return gdimx_;
case ParallelType::TIDy:
return bdimy_;
case ParallelType::BIDy:
return gdimy_;
case ParallelType::TIDz:
return bdimz_;
case ParallelType::BIDz:
return gdimz_;
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to get with invalid parallel type in launch config: ",
p_type);
}
}
bool LaunchParams::operator==(const LaunchParams& other) const {
return gdimx_ == other.gdimx_ && gdimy_ == other.gdimy_ &&
bdimx_ == other.bdimx_ && bdimy_ == other.bdimy_ && smem_ == other.smem_;
}
void LaunchParams::print() const {
std::cout << toString();
}
std::string LaunchParams::toString() const {
std::stringstream ss;
ss << "Launch Parameters: "
<< "BlockDim.x = " << (bdimx_ == UNINITIALIZED_VAL ? -1 : bdimx_) << ", "
<< "BlockDim.y = " << (bdimy_ == UNINITIALIZED_VAL ? -1 : bdimy_) << ", "
<< "BlockDim.z = " << (bdimz_ == UNINITIALIZED_VAL ? -1 : bdimz_) << ", "
<< "GridDim.x = " << (gdimx_ == UNINITIALIZED_VAL ? -1 : gdimx_) << ", "
<< "GridDim.y = " << (gdimy_ == UNINITIALIZED_VAL ? -1 : gdimy_) << ", "
<< "GridDim.z = " << (gdimz_ == UNINITIALIZED_VAL ? -1 : gdimz_) << ", "
<< "Smem Size = " << smem() << "\n";
return ss.str();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,136 +0,0 @@
#pragma once
#include <type.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_CU_API LaunchParams {
public:
static constexpr int64_t UNINITIALIZED_VAL = -1;
LaunchParams(
int64_t gdimx = UNINITIALIZED_VAL,
int64_t gdimy = UNINITIALIZED_VAL,
int64_t gdimz = UNINITIALIZED_VAL,
int64_t bdimx = UNINITIALIZED_VAL,
int64_t bdimy = UNINITIALIZED_VAL,
int64_t bdimz = UNINITIALIZED_VAL)
: gdimx_(gdimx),
gdimy_(gdimy),
gdimz_(gdimz),
bdimx_(bdimx),
bdimy_(bdimy),
bdimz_(bdimz) {
assertValid();
}
void assertValid();
void setSmem(int64_t smem) {
smem_ = smem;
}
int64_t smem() const {
return smem_;
}
int64_t nBlocks() const {
return std::abs(gdimx_ * gdimy_ * gdimz_);
}
int64_t nThreads() const {
return std::abs(bdimx_ * bdimy_ * bdimz_);
}
int64_t bdimx() const {
return static_cast<int64_t>(bdimx_ == UNINITIALIZED_VAL ? 1 : bdimx_);
}
int64_t gdimx() const {
return static_cast<int64_t>(gdimx_ == UNINITIALIZED_VAL ? 1 : gdimx_);
}
int64_t bdimy() const {
return static_cast<int64_t>(bdimy_ == UNINITIALIZED_VAL ? 1 : bdimy_);
}
int64_t gdimy() const {
return static_cast<int64_t>(gdimy_ == UNINITIALIZED_VAL ? 1 : gdimy_);
}
int64_t bdimz() const {
return static_cast<int64_t>(bdimz_ == UNINITIALIZED_VAL ? 1 : bdimz_);
}
int64_t gdimz() const {
return static_cast<int64_t>(gdimz_ == UNINITIALIZED_VAL ? 1 : gdimz_);
}
void checkAndSet(
const int64_t incoming_val,
int64_t& class_val,
std::string val) {
TORCH_INTERNAL_ASSERT(
class_val == UNINITIALIZED_VAL || incoming_val == class_val,
"Tried to set ",
val,
" from ",
class_val,
" to ",
incoming_val,
", but it was already set and new value does not match.",
" Thread dims all have to be bound to the same value.");
TORCH_CHECK(
incoming_val > 0,
"Received a thread binding on ",
val,
" that is ",
incoming_val,
". Cannot create negative threads.");
if (class_val == UNINITIALIZED_VAL) {
class_val = incoming_val;
}
assertValid();
}
// Binds the dim associated with p_type to val
void bind(int64_t val, ParallelType p_type);
// Adjusted value based on get functions above for each value
int64_t getDim(ParallelType p_type) const;
// Returns raw value which may be UNINITIALIZED_VAL
const int64_t& getRawVal(ParallelType p_type) const;
// Returns false if value associated with p_type == UNINITIALIZED_VAL
bool hasDim(ParallelType p_type) const;
bool operator==(const LaunchParams& other) const;
void print() const;
std::string toString() const;
private:
// Spell them out because I want signed ints to know if they were initialized
// or not.
// TODO: convert to c10::optional
int64_t gdimx_ = UNINITIALIZED_VAL;
int64_t gdimy_ = UNINITIALIZED_VAL;
int64_t gdimz_ = UNINITIALIZED_VAL;
int64_t bdimx_ = UNINITIALIZED_VAL;
int64_t bdimy_ = UNINITIALIZED_VAL;
int64_t bdimz_ = UNINITIALIZED_VAL;
int64_t smem_ = 0;
// TODO: Fill in output sizes
std::vector<std::vector<int64_t>> output_sizes;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
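// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, binding launch dimensions would have looked
// roughly like this; the concrete sizes are hypothetical.
namespace {
void launchParamsExample() {
  using namespace torch::jit::fuser::cuda;
  LaunchParams lp;                   // every dimension starts UNINITIALIZED_VAL
  lp.bind(128, ParallelType::TIDx);  // blockDim.x = 128
  lp.bind(1024, ParallelType::BIDx); // gridDim.x = 1024
  lp.bind(128, ParallelType::TIDx);  // re-binding the same value is accepted
  // lp.bind(256, ParallelType::TIDx);  // would assert: conflicting binding
  int64_t bdimx = lp.getDim(ParallelType::TIDx); // 128
  int64_t bdimy = lp.getDim(ParallelType::TIDy); // unbound dims read back as 1
  (void)bdimx;
  (void)bdimy;
}
} // namespace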

File diff suppressed because it is too large

View File

@ -1,314 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <cuda.h>
#include <torch/csrc/jit/ir/ir.h>
#include <executor_kernel_arg.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <kernel.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <string>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace executor_utils {
// Include all the functions we might need in generated code
std::string kernelPreamble();
void validateKernelInputs(
Fusion* fusion,
const KernelArgumentHolder& args,
const c10::Device& device);
void validateKernelOutputs(
Fusion* fusion,
const std::vector<at::Tensor>& outputs,
const c10::Device& device);
//! Bind kernel input values to runtime values
kir::ExpressionEvaluator bindKernelInputs(
const KernelArgumentHolder& args,
kir::Kernel* kernel,
bool check_consistency = true);
//! Bind fusion input values to runtime values
TORCH_CUDA_CU_API ExpressionEvaluator
bindFusionInputs(const KernelArgumentHolder& args, Fusion* fusion);
struct NvrtcFunction {
CUmodule module = CUmodule();
CUfunction function = CUfunction();
};
// Returns executable function and the ptxas log from compilation
std::pair<NvrtcFunction, std::string> nvrtcCompile(
const std::string& code,
const std::string& func_name,
int id,
c10::optional<int> opt_block_size = c10::nullopt);
namespace caching {
// TODO: Could consider putting some of
// the logic in the common space and re-use
//! List of all the possible entry types in
//! `FusionExecutor` compile-time data cache.
enum class CompileTimeEntryType {
PARALLEL_BINDING_ITERDOMAINS,
PARALLEL_ITER_EXTENT_MAP,
SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP,
WARP_PADDED_PARALLEL_EXTENTS,
VECTORIZED_TENSOR_VALIDATION,
INPUT_ALIAS_INDICES,
OUTPUT_ALIAS_INDICES
};
//! Entry class definitions for each entry type:
//! each class defines the data type for each entry type
//! Compile-time info to be cached in each FusionExecutor:
//! ParallelBindingIterDomains:
//! Stores all the iterdomains that are parallelized
//! on the scheduled Fusion graph. They will be used
//! in launch param iteration and their extents may
//! come from launch constraints.
class ParallelBindingIterDomains {
public:
using DataType = std::vector<IterDomain*>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::PARALLEL_BINDING_ITERDOMAINS;
};
//! Compile-time info to be cached in each FusionExecutor:
//! ParallelIterExtentMap
//! Stores the symbolic extents of all the parallelized
//! iterdomains corresponding to each used parallel type.
class ParallelIterExtentMap {
public:
using DataType =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP;
};
//! Compile-time info to be cached in each FusionExecutor:
//! SimplifiedParallelIterExtentMap
//! This entry type is a simplified version of ParallelIterExtentMap.
//!
//! For launch parameter binding we only need the most concrete iterdomain
//! in each disjoint set stored in CaParallelMap. This entry stores the
//! remaining list of extents for binding after this simplification.
//!
//! We still need ParallelIterExtentMap since we want to bind the concrete
//! values to the extents of all parallelized iterdomains. We would be
//! able to save these bindings if the integer machine has a notion of
//! equality and could be configured at compile time. But that'd be a longer
//! term target.
class SimplifiedParallelIterExtentMap {
public:
using DataType =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP;
};
//! WarpPaddedExtentsInfo:
//! Auxiliary data type for entry class WarpPaddedParallelExtents
struct WarpPaddedExtentsInfo {
std::unordered_set<const Val*> warp_padded_extent_set;
std::unordered_map<const Val*, int64_t> warp_padded_constant;
};
//! Compile-time info to be cached in each FusionExecutor:
//! WarpPaddedParallelExtents
//! Stores the symbolic and constant extents of warp
//! padded parallel iterdomains.
class WarpPaddedParallelExtents {
public:
using DataType = WarpPaddedExtentsInfo;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::WARP_PADDED_PARALLEL_EXTENTS;
};
//! VectorizedTensorInfo:
//! Auxiliary data type for entry class VectorizedTensorValidation
struct VectorizedTensorInfo {
//! Aligned vectorized fusion inputs
std::vector<int> aligned_vectorized_inp_tensor_pos;
//! Aligned vectorized fusion outputs
std::vector<int> aligned_vectorized_out_tensor_pos;
//! Misaligned vectorized input tensors
std::unordered_set<TensorView*> global_inp_misaligned_tv;
//! Misaligned vectorized output tensors
std::unordered_set<TensorView*> global_out_misaligned_tv;
//! Positions of misaligned input tensors
std::vector<int> inp_misaligned_tensors_pos;
//! Positions of misaligned output tensors
std::vector<int> out_misaligned_tensors_pos;
};
//! Compile-time info to be cached in each FusionExecutor:
//! VectorizedTensorValidation
//! Stores position info and vector word sizes of
//! vectorized input/output tensors, to be used
//! in misaligned vectorization validation.
class VectorizedTensorValidation {
public:
using DataType = VectorizedTensorInfo;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::VECTORIZED_TENSOR_VALIDATION;
};
//! Compile-time info to be cached in each FusionExecutor:
//! InputAliasIndices
//! Stores position info of aliased input tensors
class InputAliasIndices {
public:
using DataType = std::vector<std::pair<int, int>>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::INPUT_ALIAS_INDICES;
};
//! Compile-time info to be cached in each FusionExecutor:
//! OutputAliasIndices
//! Stores position info of aliased output tensors
class OutputAliasIndices {
public:
using DataType = std::unordered_set<int>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::OUTPUT_ALIAS_INDICES;
};
//! Base abstract class for unified storage in `ExecutorCompileTimeInfoCache`,
//! each entry in `ExecutorCompileTimeInfoCache` will be a subclass.
class CompileTimeInfoBase : public PolymorphicBase {
public:
CompileTimeInfoBase(CompileTimeEntryType entry_type)
: entry_type_(entry_type) {}
CompileTimeEntryType type() {
return entry_type_;
}
private:
CompileTimeEntryType entry_type_;
};
// Note: Do NOT export this class. MSVC issue with exported class that contains
// std::vector<unique_ptr<xxx>>: https://godbolt.org/z/3E4e8T1P1
//! Compile-time information cache
class ExecutorCompileTimeInfoCache {
using Entry = CompileTimeInfoBase;
using EntryOwningPtr = std::unique_ptr<Entry>;
using EntryPtr = Entry*;
using EntryType = CompileTimeEntryType;
public:
void insert(EntryOwningPtr new_entry);
EntryPtr at(EntryType entry_type) {
return entry_type_map_.at(entry_type);
}
bool has(EntryType entry_type) {
return entry_type_map_.count(entry_type);
}
private:
std::vector<EntryOwningPtr> entries_;
std::unordered_map<EntryType, EntryPtr> entry_type_map_;
};
//! A utility class to facilitate accessing ExecutorCompileTimeInfoCache.
template <typename EntryClass>
class ExecutorCompileTimeEntry {
using EntryDataType = typename EntryClass::DataType;
using EntryDataTypeOwnPtr = std::unique_ptr<EntryDataType>;
using MakerFnType = std::function<EntryDataTypeOwnPtr()>;
public:
//! Creates a data entry with type defined in EntryClass,
//! eg. EntryClass = VectorizableInputsAndOutputs;
//!
//! @param data_cache, a pointer to an instantiated compile-time
//! info cache. The info data will be
//! 1. read from data cache if data cache has the corresponding entry.
//! 2. written into data cache if data cache doesn't have the entry.
//! 3. managed by owned_data_ if data cache is nullptr
//! @param fn:
//! The factory function that needs to return an owning pointer
//! i.e. std::unique_ptr<EntryClass::DataType>. It will only
//! be called either when data cache is missing an entry or when no data
//! cache is given.
ExecutorCompileTimeEntry(
ExecutorCompileTimeInfoCache* data_cache,
MakerFnType fn);
//! Unified interface to get actual data, either from cache
//! or from factory function.
EntryDataType& get() {
return *data_ptr_;
}
private:
//! Internal data owning pointer that will manage the computed
//! data when there is no data cache.
EntryDataTypeOwnPtr owned_data_ = nullptr;
//! Pointer to the valid data entry that could be accessed.
EntryDataType* data_ptr_ = nullptr;
};
} // namespace caching
//! Returns the vector of iterdomains, derived from the used tensorviews, that
//! will be used to bind parallel dimensions.
std::vector<IterDomain*> getParallelBindingsIterDomains(
GpuLower* lower,
const std::vector<TensorView*>& used_tvs);
using ParallelExtentMap =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
//! Returns the extents of all parallel binding iterdomains corresponding
//! to each parallel type.
std::unique_ptr<ParallelExtentMap> getParallelIterExtents(
std::vector<IterDomain*>& parallel_binding_ids);
//! Returns the simplified set of extents necessary for launch parameter
//! binding.
std::unique_ptr<ParallelExtentMap> getSimplifiedParallelIterExtents(
GpuLower* lower,
std::vector<IterDomain*>& parallel_binding_ids);
//! Returns the symbolic or constant extents of warp padded parallel
//! iterdomains in the given vector.
std::unique_ptr<caching::WarpPaddedExtentsInfo> getWarpPaddedExtentsInfo(
kir::Kernel* lower,
std::vector<IterDomain*>& parallel_binding_ids);
void validateVectorizedTensors(
kir::Kernel* kernel,
const KernelArgumentHolder& args,
const std::vector<at::Tensor>& outputs,
caching::ExecutorCompileTimeInfoCache* data_cache,
kir::ExpressionEvaluator& expr_eval);
} // namespace executor_utils
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
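// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, a read-through lookup of the parallel
// iterdomain extents via the compile-time cache would have looked roughly like
// this; `data_cache` and `parallel_binding_ids` are hypothetical
// caller-provided objects (data_cache may be nullptr).
namespace {
void compileTimeCacheExample(
    torch::jit::fuser::cuda::executor_utils::caching::
        ExecutorCompileTimeInfoCache* data_cache,
    std::vector<torch::jit::fuser::cuda::IterDomain*>& parallel_binding_ids) {
  using namespace torch::jit::fuser::cuda::executor_utils;
  // Reads the entry from the cache if present, computes and inserts it via the
  // factory lambda otherwise, and owns the data locally if data_cache is null.
  caching::ExecutorCompileTimeEntry<caching::ParallelIterExtentMap> extents(
      data_cache, [&parallel_binding_ids]() {
        return getParallelIterExtents(parallel_binding_ids);
      });
  auto& extent_map = extents.get();
  (void)extent_map;
}
} // namespace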

View File

@ -1,202 +0,0 @@
#include <evaluator_common.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
bool equals(Val* value, const IntOrDouble& concrete_value) {
switch (value->getDataType().value()) {
case DataType::Int: {
if (!concrete_value.is_int()) {
return false;
}
auto val = value->getInt();
return val.has_value() && val.value() == concrete_value.as<int64_t>();
}
case DataType::Double: {
if (concrete_value.is_int()) {
return false;
}
auto val = value->getDouble();
return val.has_value() && val.value() == concrete_value.as<double>();
}
default:
TORCH_INTERNAL_ASSERT(false);
}
}
template <typename T>
c10::optional<IntOrDouble> toOptionalIntOrDouble(c10::optional<T> i) {
if (!i) {
return c10::nullopt;
}
return IntOrDouble(i.value());
}
} // namespace
void ExpressionEvaluator::bind(Val* value, const IntOrDouble& concrete_value) {
if (equals(value, concrete_value)) {
return;
}
TORCH_CHECK(!value->isConstScalar(), "Tried to bind to a constant value");
TORCH_CHECK(
value->definition() == nullptr,
"Tried to bind to a value that is computed in the fusion IR");
if (value->isA<NamedScalar>()) {
known_named_scalars_[value->as<NamedScalar>()->name()] = concrete_value;
} else {
known_values_[value] = concrete_value;
}
}
void ExpressionEvaluator::bind(
const std::string& name,
const IntOrDouble& concrete_value) {
known_named_scalars_[name] = concrete_value;
}
c10::optional<IntOrDouble> ExpressionEvaluator::evaluate(Val* value) {
if (evaluator_precomputed_values_ != nullptr) {
return toOptionalIntOrDouble(
evaluator_precomputed_values_->getMaybeValueFor(value));
} else {
auto maybe_concrete_value = getValue(value);
if (!maybe_concrete_value.has_value()) {
if (value->definition() != nullptr) {
OptOutDispatch::handle(value->definition());
maybe_concrete_value = getValue(value);
}
}
return maybe_concrete_value;
}
return c10::nullopt;
}
void ExpressionEvaluator::print() const {
std::cout << "\nEvaluation context\n";
std::cout << "--------------------\n";
for (const auto& kv : known_values_) {
TORCH_INTERNAL_ASSERT(!kv.first->isConstScalar());
std::cout << kv.first << " = " << kv.second << " ; "
<< *kv.first->getValType() << "\n";
}
std::cout << "--------------------\n\n";
}
c10::optional<IntOrDouble> ExpressionEvaluator::getValue(Val* value) {
TORCH_INTERNAL_ASSERT(
value->isAnInt() || value->isADouble(),
"Expression Evaluation does not support values other than integers/doubles at this time.");
if (value->getValType().value() == ValType::Scalar) {
if (value->isAnInt() && value->as<Int>()->value().has_value()) {
return toOptionalIntOrDouble(value->as<Int>()->value());
}
if (value->isADouble() && value->as<Double>()->value().has_value()) {
return toOptionalIntOrDouble(value->as<Double>()->value());
}
}
if (value->isA<NamedScalar>()) {
const auto it = known_named_scalars_.find(value->as<NamedScalar>()->name());
return it != known_named_scalars_.end()
? c10::optional<IntOrDouble>(it->second)
: c10::nullopt;
} else {
const auto it = known_values_.find(value);
return it != known_values_.end() ? c10::optional<IntOrDouble>(it->second)
: c10::nullopt;
}
}
void ExpressionEvaluator::handle(UnaryOp* uop) {
using namespace IntOrDouble_functions;
const auto in = evaluate(uop->in());
if (in.has_value()) {
switch (uop->getUnaryOpType()) {
case UnaryOpType::Neg:
known_values_[uop->out()] = -*in;
break;
case UnaryOpType::Set:
known_values_[uop->out()] = *in;
break;
case UnaryOpType::Cast:
if (uop->out()->getDataType() == DataType::Int) {
known_values_[uop->out()] = in->cast<int64_t>();
} else if (uop->out()->getDataType() == DataType::Double) {
known_values_[uop->out()] = in->cast<double>();
} else {
TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
}
break;
case UnaryOpType::Abs:
known_values_[uop->out()] = abs(*in);
break;
default:
TORCH_CHECK(
!"Unexpected operator type ",
uop->getUnaryOpType(),
" in ",
uop->toString());
}
}
}
void ExpressionEvaluator::handle(BinaryOp* bop) {
using namespace IntOrDouble_functions;
const auto lhs = evaluate(bop->lhs());
const auto rhs = evaluate(bop->rhs());
if (lhs.has_value() && rhs.has_value()) {
switch (bop->getBinaryOpType()) {
case BinaryOpType::Add:
known_values_[bop->out()] = *lhs + *rhs;
break;
case BinaryOpType::Sub:
known_values_[bop->out()] = *lhs - *rhs;
break;
case BinaryOpType::Mul:
known_values_[bop->out()] = *lhs * *rhs;
break;
case BinaryOpType::Div:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = *lhs / *rhs;
break;
case BinaryOpType::Mod:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = *lhs % *rhs;
break;
case BinaryOpType::CeilDiv:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = ceildiv(*lhs, *rhs);
break;
case BinaryOpType::And:
known_values_[bop->out()] = *lhs && *rhs;
break;
case BinaryOpType::Max:
known_values_[bop->out()] = max(*lhs, *rhs);
break;
case BinaryOpType::Min:
known_values_[bop->out()] = min(*lhs, *rhs);
break;
default:
TORCH_CHECK(!"Unexpected operator type");
}
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,68 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dynamic_type.h>
#include <ir_interface_nodes.h>
#include <iter_visitor.h>
#include <c10/util/Optional.h>
#include <string>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class FusionPrecomputedValues;
//! Calculate Fusion IR expressions
class TORCH_CUDA_CU_API ExpressionEvaluator : private OptOutDispatch {
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit ExpressionEvaluator(Fusion* fusion) : fusion_(fusion) {}
//! Returns the associated fusion object
Fusion* fusion() const {
return fusion_;
}
//! Bind a concrete value to an IR variable
void bind(Val* value, const IntOrDouble& concrete_value);
//! Bind a concrete value to a named scalar
void bind(const std::string& name, const IntOrDouble& concrete_value);
//! Try to evaluate a Fusion IR value
c10::optional<IntOrDouble> evaluate(Val* value);
//! Debugging helper, prints all the currently known values
void print() const;
void bindPrecomputedValues(FusionPrecomputedValues* precomputed_values) {
evaluator_precomputed_values_ = precomputed_values;
}
auto precomputedValues() {
return evaluator_precomputed_values_;
}
private:
c10::optional<IntOrDouble> getValue(Val* value);
void handle(UnaryOp*) final;
void handle(BinaryOp*) final;
// TODO: handle swizzle
private:
std::unordered_map<const Val*, IntOrDouble> known_values_;
std::unordered_map<std::string, IntOrDouble> known_named_scalars_;
Fusion* fusion_ = nullptr;
FusionPrecomputedValues* evaluator_precomputed_values_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
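// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, binding and evaluating Fusion IR values
// would have looked roughly like this; `fusion`, `symbolic_extent` and
// `derived_value` are hypothetical caller-provided IR nodes.
namespace {
void expressionEvaluatorExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    torch::jit::fuser::cuda::Val* symbolic_extent,
    torch::jit::fuser::cuda::Val* derived_value) {
  using namespace torch::jit::fuser::cuda;
  ExpressionEvaluator evaluator(fusion);
  // Bind a concrete runtime value to a symbolic input (e.g. a tensor extent).
  evaluator.bind(symbolic_extent, IntOrDouble(static_cast<int64_t>(128)));
  // Values defined in terms of bound inputs are evaluated by dispatching over
  // their defining expressions (UnaryOp/BinaryOp).
  c10::optional<IntOrDouble> result = evaluator.evaluate(derived_value);
  if (result.has_value()) {
    // use *result ...
  }
}
} // namespace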

View File

@ -1,723 +0,0 @@
#include <arith.h>
#include <codegen.h>
#include <disjoint_set.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <ir_utils.h>
#include <iter_visitor.h>
#include <kernel.h>
#include <lower2device.h>
#include <lower_bank_conflict.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
static thread_local Fusion* ACTIVE_FUSION = nullptr; // NOLINT
FusionGuard::FusionGuard(Fusion* fusion) {
prev_fusion = ACTIVE_FUSION;
ACTIVE_FUSION = fusion;
}
FusionGuard::~FusionGuard() {
ACTIVE_FUSION = prev_fusion;
}
Fusion* FusionGuard::getCurFusion() {
return ACTIVE_FUSION;
}
void FusionGuard::setCurFusion(Fusion* fusion) {
ACTIVE_FUSION = fusion;
}
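namespace {
// Illustrative usage sketch, not part of the original nvfuser sources:
// FusionGuard is an RAII guard over the thread-local ACTIVE_FUSION pointer
// above. IR nodes created inside the guarded scope are registered with the
// guarded fusion, and the previously active fusion is restored on scope exit.
void fusionGuardExample(Fusion* fusion) {
  FusionGuard fg(fusion);
  TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() == fusion);
  // ... build IR for `fusion` here ...
  // The previously active fusion (possibly nullptr) becomes active again when
  // fg goes out of scope.
}
} // namespace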
void swap(Fusion& a, Fusion& b) noexcept {
FUSER_PERF_SCOPE("Fusion swap");
using std::swap;
swap(static_cast<IrContainer&>(a), static_cast<IrContainer&>(b));
swap(a.inputs_, b.inputs_);
swap(a.outputs_, b.outputs_);
swap(a.io_alias_, b.io_alias_);
swap(a.permuted_input_map_, b.permuted_input_map_);
swap(a.permuted_output_map_, b.permuted_output_map_);
}
std::unique_ptr<SegmentedFusion> Fusion::segment(
const KernelArgumentHolder& args) {
FUSER_PERF_SCOPE("Segment Fusion");
return SegmentCandidateFinder::segment(this, args);
}
IrCloner Fusion::copy(const Fusion* from, Fusion* to) {
to->clear();
auto ir_cloner = IrContainer::copy(from, to);
for (auto val : from->vals_) {
ir_cloner.clone(val)->setDefinition(ir_cloner.clone(val->definition_));
ir_cloner.clone(val)->setUses(ir_cloner.clone(val->uses_));
}
to->inputs_ = ir_cloner.clone(from->inputs_);
to->outputs_ = ir_cloner.clone(from->outputs_);
for (auto inp : to->inputs_) {
inp->setIsFusionInput(true);
}
for (auto out : to->outputs_) {
out->setIsFusionOutput(true);
}
// TODO: put this into ir_cloner instead
for (const auto& entry : from->io_alias_) {
Val* copied_output = ir_cloner.clone(entry.first);
Val* copied_input = ir_cloner.clone(entry.second);
to->io_alias_[copied_output] = copied_input;
}
to->permuted_input_map_ = from->permuted_input_map_;
to->permuted_output_map_ = from->permuted_output_map_;
to->all_tv_uses_valid_ = from->all_tv_uses_valid_;
// This should never be true on copy, but copying for completeness.
to->is_during_update_uses_ = from->is_during_update_uses_;
return ir_cloner;
}
// Clang tidy complains when using default constructor for IrContainer instead
// of copy constructor. Fusion::copy has a call to IrContainer::copy, so it's
// redundant to use the IrContainer copy constructor, but it is harmless since
// Fusion::copy starts by calling clear().
Fusion::Fusion(const Fusion& other) : IrContainer(other) {
FUSER_PERF_SCOPE("Fusion copy");
Fusion::copy(&other, this);
}
Fusion::Fusion(Fusion&& other) noexcept {
FUSER_PERF_SCOPE("Fusion move");
swap(*this, other);
}
Fusion& Fusion::operator=(const Fusion& other) {
FUSER_PERF_SCOPE("Fusion copy assign");
Fusion copy(other);
clear();
swap(*this, copy);
return *this;
}
Fusion& Fusion::operator=(Fusion&& other) noexcept {
FUSER_PERF_SCOPE("Fusion move assign");
clear();
swap(*this, other);
return *this;
}
Fusion::~Fusion() {
clear();
}
void Fusion::clear() noexcept {
FUSER_PERF_SCOPE("Fusion clear");
IrContainer::clear();
inputs_.clear();
outputs_.clear();
io_alias_.clear();
permuted_input_map_.clear();
permuted_output_map_.clear();
all_tv_uses_valid_ = false;
is_during_update_uses_ = false;
}
void Fusion::removeExpr(Expr* expr) {
assertInContainer(expr, "Cannot remove expr ");
// If we hit this error too frequently, we could lighten the restrictions so
// that removing something that doesn't exist simply does nothing. For now,
// we're going with the strictest model which errors.
for (auto out : expr->outputs()) {
out->setDefinition(nullptr);
}
for (auto inp : expr->inputs()) {
auto uses_copy = inp->uses();
auto it = std::find(uses_copy.begin(), uses_copy.end(), expr);
if (it != uses_copy.end()) {
uses_copy.erase(it);
inp->setUses(uses_copy);
}
}
IrContainer::removeExpr(expr);
}
void Fusion::removeVal(Val* val) {
assertInContainer(val, "Cannot remove val ");
TORCH_CHECK(
!val->isFusionInput(),
"Cannot remove val as it is an input of the fusion.");
TORCH_CHECK(
!val->isFusionOutput(),
"Cannot remove val as it is an output of the fusion.");
Expr* orig = val->definition();
if (orig != nullptr)
removeExpr(val->definition());
for (Expr* use : unordered_uses(val)) {
removeExpr(use);
}
IrContainer::removeVal(val);
}
void Fusion::addInput(Val* input) {
assertInContainer(input, "Cannot register input ");
TORCH_INTERNAL_ASSERT(
input->getDataType() != DataType::Index,
"Data type Index is a local compile time data type only, it cannot be used as an input in case it was generated from another kernel.");
if (input->getValType().value() == ValType::TensorView) {
auto tv = input->as<TensorView>();
tv->setMemoryType(MemoryType::Global);
} else if (input->getValType().value() == ValType::Scalar) {
TORCH_CHECK(
!input->isConst(),
"Immediate scalar value cannot be added as an input. It is not necessary to pass it as an input.");
}
inputs_.push_back(input);
input->setIsFusionInput(true);
all_tv_uses_valid_ = false;
}
void Fusion::addOutput(Val* output) {
// We currently don't support explicitly outputting aliased inputs. This is
// because they are already marked as outputs for in-place update. It's tricky
// to allow marking them explicitly as real outputs, since that requires us to
// register/identify outputs not only by `Val*` pointer, but also by indices;
// it also requires us to magically arrange `outputs_` entries in the proper
// order, which isn't intuitive for `outputs_` in the fusion.
// I think we can solve this by marking addOutput on io_alias_ keys after the
// fusion is fully defined. Tracking this in #1488
// Apparently we can't do this either at this time; segmentation
// unfortunately would call addOutput after we marked the io_alias_ map.
// TORCH_CHECK(io_alias_.count(output) == 0,
// "can't register aliased output as real output");
assertInContainer(output, "Cannot register output ");
if (output->getValType().value() == ValType::TensorView) {
auto tv = output->as<TensorView>();
tv->setMemoryType(MemoryType::Global);
}
outputs_.push_back(output);
output->setIsFusionOutput(true);
all_tv_uses_valid_ = false;
}
void Fusion::removeInput(Val* input) {
auto find_input = std::find(inputs_.begin(), inputs_.end(), input);
if (find_input != inputs_.end()) {
inputs_.erase(find_input);
}
input->setIsFusionInput(false);
all_tv_uses_valid_ = false;
}
void Fusion::removeOutput(Val* output) {
auto find_output = std::find(outputs_.begin(), outputs_.end(), output);
if (find_output != outputs_.end()) {
outputs_.erase(find_output);
}
output->setIsFusionOutput(false);
all_tv_uses_valid_ = false;
}
void Fusion::replaceOutput(Val* output, Val* replacement) {
auto find_output = std::find(outputs_.begin(), outputs_.end(), output);
TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion");
if (find_output != outputs_.end()) {
std::replace_if(
outputs_.begin(),
outputs_.end(),
[&output](Val* v) { return v == output; },
replacement);
if (replacement->getValType().value() == ValType::TensorView) {
replacement->setIsFusionOutput(true);
replacement->as<TensorView>()->setMemoryType(MemoryType::Global);
}
if (output->getValType().value() == ValType::TensorView) {
output->setIsFusionOutput(false);
output->as<TensorView>()->setMemoryType(MemoryType::Local);
}
resetTvUses();
}
// Temporary WAR for issue #1112
// (https://github.com/csarofeen/pytorch/issues/1112)
if (io_alias_.count(output) != 0) {
auto input = io_alias_[output];
io_alias_.erase(output);
io_alias_[replacement] = input;
}
}
std::vector<Expr*> Fusion::exprs() {
return StmtSort::getExprs(this);
}
std::vector<Val*> Fusion::inputsOf(Val* val) {
return InputsOf::output(this, val);
}
void Fusion::validateInputs() {
std::unordered_set<Val*> all_inputs;
for (Val* out : outputs()) {
for (Val* input : inputsOf(out)) {
all_inputs.insert(input);
}
}
std::unordered_set<Val*> input_dims;
auto inp_tvs = ir_utils::filterByType<TensorView>(inputs());
for (auto tv : inp_tvs) {
for (auto id : tv->getMaybeRFactorDomain()) {
input_dims.emplace(id->extent());
}
}
for (Val* input : all_inputs) {
if (!input->isConstScalar()) {
TORCH_CHECK(
input->isFusionInput() ||
// TODO: Switch:
inContainer(input),
// to: input_dims.find(input) != input_dims.end(),
// https://github.com/csarofeen/pytorch/issues/1365
"Could not figure out how ",
input->toString(),
" is generated, however it was not specified as an input.");
}
}
}
void Fusion::print() {
FUSER_PERF_SCOPE("Fusion::print");
FusionGuard fg(this);
std::cout << "\n%kernel {\n";
IrMathPrinter op_exprs(std::cout);
op_exprs.handle(this);
std::cout << "\nTransformPrinter : \n";
IrTransformPrinter t_exprs(std::cout);
t_exprs.handle(this);
std::cout << "}\n\n";
}
void Fusion::printKernel(DataType index_type) {
FUSER_PERF_SCOPE("Fusion::printKernel");
TORCH_INTERNAL_ASSERT(
!this->isA<kir::Kernel>(),
"Cannot \"print kernel\" of a kernel container. ",
"This would require lowering during lowering.");
std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
}
std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
DataType index_type) {
GpuLower lower(this, index_type);
auto kernel = lower.kernel();
auto info = getBankConflictInfo(kernel);
// The container of exprs goes out of scope, so we return a map keyed by strings here
std::unordered_map<std::string, std::pair<int, int>> result;
result.reserve(info.size());
for (auto i : info) {
result[i.first->toString()] = i.second;
}
return result;
}
void Fusion::printMath(bool from_outputs_only) {
FUSER_PERF_SCOPE("Fusion::printMath");
FusionGuard fg(this);
auto exprs_for_print = exprs();
std::cout << "Inputs:" << std::endl;
for (auto inp : inputs()) {
std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl;
}
std::cout << "Outputs:" << std::endl;
for (auto out : outputs()) {
std::cout << " " << out << ", " << out->getDataType().value() << std::endl;
}
// If we want everything in the fusion, grab all values without uses to
// traverse from.
if (!from_outputs_only) {
std::vector<Val*> leaf_vals;
for (auto val : deterministic_vals()) {
if (val->uses().empty()) {
leaf_vals.push_back(val);
}
}
exprs_for_print = StmtSort::getExprs(this, leaf_vals);
}
std::cout << "\n%kernel_math {\n";
for (auto expr : exprs_for_print) {
std::cout << expr;
}
std::cout << "}\n\n";
}
std::vector<Val*> Fusion::inputsAndCreated() {
auto result = inputs_;
for (auto expr : exprs()) {
auto tv_inputs = ir_utils::filterByType<TensorView>(expr->inputs());
if (tv_inputs.empty()) {
for (auto v : expr->outputs()) {
result.emplace_back(v);
}
}
}
return result;
}
void Fusion::printTransforms() {
FUSER_PERF_SCOPE("Fusion::printTransforms");
FusionGuard fg(this);
IrTransformPrinter t_exprs(std::cout);
t_exprs.handle(this);
}
void Fusion::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
if (val->fusion()) {
TORCH_CHECK(
val->fusion() == this, val, " was not found in the active fusion.");
}
IrContainer::registerVal(val);
}
void Fusion::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
if (expr->fusion()) {
TORCH_CHECK(
expr->fusion() == this, expr, " was not found in the active fusion.");
}
IrContainer::registerExpr(expr);
bool has_tv = false;
for (Val* input : expr->inputs()) {
has_tv = has_tv || input->isA<TensorView>();
assertInContainer(input, "Input to expr is invalid, ");
auto uses_copy = input->uses();
if (std::find(uses_copy.begin(), uses_copy.end(), expr) ==
uses_copy.end()) {
uses_copy.push_back(expr);
input->setUses(uses_copy);
}
}
// Kernel is the only container type that is non-ssa. This is mainly (maybe
// only) because of initialization expressions which would overwrite tensor
// view definitions.
bool is_ssa = !this->isA<kir::Kernel>();
for (Val* output : expr->outputs()) {
has_tv = has_tv || output->isA<TensorView>();
assertInContainer(output, "Output to expr is invalid, ");
if (output->definition() != nullptr && is_ssa) {
removeExpr(output->definition());
}
if (is_ssa || (!is_ssa && output->definition() == nullptr)) {
output->setDefinition(expr);
}
}
if (has_tv) {
resetTvUses();
}
}
void Fusion::resetTvUses() {
FUSER_PERF_SCOPE("Fusion::resetTvUses");
is_during_update_uses_ = true;
// getExprs only uses definition, so even if we've modified uses already to
// remove dead exprs, this could reinsert them. getExprs is also bounded by
// inputs as registered inputs will return nullptr as their definition.
const auto all_tvs = ir_utils::filterByType<TensorView>(vals_);
const auto used_exprs = StmtSort::getExprs(this);
for (auto tv : all_tvs) {
tv->setUses({});
}
// Same as in register expr
for (auto expr : used_exprs) {
for (Val* input : expr->inputs()) {
auto uses_copy = input->uses();
if (std::find(uses_copy.begin(), uses_copy.end(), expr) ==
uses_copy.end()) {
uses_copy.push_back(expr);
input->setUses(uses_copy);
}
}
}
all_tv_uses_valid_ = true;
is_during_update_uses_ = false;
}
std::vector<Val*> Fusion::usedMathVals() {
// Note that using fusion->inputs() as the argument for the first
// parameter of getAllValsBetween does not grab all used vals as
// there can be vals that are created inside a fusion without using
// anything from inputs. See, for example, tv0 in the
// FusionOuterSplit test.
const auto inputs = InputsOf::outputs(this, outputs());
auto used_math_vals = DependencyCheck::getAllValsBetween(
{inputs.begin(), inputs.end()}, outputs());
// When an expr has multiple outputs and only some of them are
// used, the rest aren't included in used_math_vals as they are not
// used. However, we want them to be included as they must show up
// in the fusion.
std::vector<Val*> vals_to_add;
std::unordered_set<Val*> added_vals;
for (auto val : used_math_vals) {
auto def = val->definition();
if (def == nullptr || def->outputs().size() < 2) {
continue;
}
for (auto out : def->outputs()) {
if (std::find(used_math_vals.begin(), used_math_vals.end(), out) ==
used_math_vals.end()) {
if (!added_vals.count(out)) {
vals_to_add.push_back(out);
added_vals.insert(out);
}
}
}
}
used_math_vals.insert(
used_math_vals.end(), vals_to_add.begin(), vals_to_add.end());
return used_math_vals;
}
std::vector<Val*> Fusion::terminatingMathVals() {
VectorOfUniqueEntries<Val*> result;
auto used_vals = usedMathVals();
for (auto v : used_vals) {
// Locate the vals that are not expr outputs but have valid definitions.
if (unordered_uses(v).empty() && v->definition() != nullptr) {
result.pushBack(v);
}
}
return result.vector();
}
std::unordered_set<Expr*> Fusion::unordered_uses(const Val* val) const {
return std::unordered_set<Expr*>(val->uses().begin(), val->uses().end());
}
Expr* Fusion::definition(const Val* val) const {
assertInContainer(val, "Cannot detect the definition of val, ");
return val->definition();
}
// Indicate to kernel to set itself up to generate random numbers
bool Fusion::isStochastic() {
for (auto expr : exprs()) {
if (expr->getExprType() == ExprType::RNGOp) {
return true;
}
}
return false;
}
std::vector<Val*> Fusion::getTerminatingOutputs() const {
FUSER_PERF_SCOPE("getTerminatingOutputs");
auto is_reachable_to_output = [](Val* val) {
// traverse to consumers of val and see if there is an output
std::deque<Val*> consumers;
for (auto use : val->uses()) {
for (auto consumer : use->outputs()) {
consumers.push_back(consumer);
}
}
while (!consumers.empty()) {
auto consumer = consumers.back();
consumers.pop_back();
if (consumer->isFusionOutput()) {
return true;
}
// consumer is not an output; proceed to its consumers
for (auto use : consumer->uses()) {
for (auto consumer_of_consumer : use->outputs()) {
consumers.push_back(consumer_of_consumer);
}
}
}
return false;
};
std::vector<Val*> terminating_outputs;
for (auto out : outputs()) {
// If there is another output reachable from this output, it's not
// terminating.
if (is_reachable_to_output(out)) {
continue;
}
terminating_outputs.push_back(out);
}
return terminating_outputs;
}
bool Fusion::isAliasCompatible(Val* left, Val* right) {
// Nullptr check
if (left == nullptr || right == nullptr) {
return false;
}
// DataType check
if (!left->getDataType().has_value() || !right->getDataType().has_value() ||
left->getDataType().value() != right->getDataType().value()) {
return false;
}
// ValType check
if (!left->getValType().has_value() || !right->getValType().has_value() ||
left->getValType().value() != right->getValType().value()) {
return false;
}
// Check same number of dimensions if both values are TensorViews
if (ir_utils::isTV(left) && ir_utils::isTV(right)) {
return left->as<TensorView>()->nDims() == right->as<TensorView>()->nDims();
}
return false;
}
void Fusion::aliasOutputToInput(Val* output, Val* input) {
// Because we may replace `output` with a cast below when `input` is itself a cast.
TORCH_INTERNAL_ASSERT(
!output->isFusionOutput(),
"Do NOT add aliased output to fusion output outside of `aliasOutputToInput");
if (!input->isFusionInput()) {
auto input_expr = input->definition();
// TORCH_INTERNAL_ASSERT(input_def.etype() == ExprType::UnaryOp, "expected
// unary op for aliased input");
TORCH_INTERNAL_ASSERT(
input_expr->isA<UnaryOp>(), "expected unary op for aliased input");
auto input_uop = input_expr->as<UnaryOp>();
TORCH_INTERNAL_ASSERT(
input_uop->getUnaryOpType() == UnaryOpType::Cast,
"expected aliased input to be output of cast op");
input = input_uop->in();
}
TORCH_INTERNAL_ASSERT(
input->getDataType().has_value() && output->getDataType().has_value(),
"requires DataType to be available for aliased output to input");
if (input->getDataType().value() != output->getDataType().value()) {
output = castOp(input->getDataType().value(), output);
}
// TODO: output should be marked at the end of fusion definition #1488
addOutput(output);
TORCH_INTERNAL_ASSERT(
isAliasCompatible(input, output),
"The input and output values are not alias-compatible.");
io_alias_[output] = input;
}
Val* Fusion::getOutputAlias(Val* output) {
auto search = io_alias_.find(output);
if (search != io_alias_.end()) {
return search->second;
}
return nullptr;
}
std::unordered_set<int> Fusion::getOutputAliasIndices() const {
if (io_alias_.empty()) {
return {};
}
std::unordered_set<int> alias_indices;
for (const auto i : c10::irange(outputs_.size())) {
if (io_alias_.count(outputs_[i]) != 0) {
alias_indices.insert(i);
}
}
return alias_indices;
}
std::vector<std::pair<int, int>> Fusion::getInputAliasIndices() const {
if (io_alias_.empty()) {
return {};
}
std::vector<std::pair<int, int>> alias_indices;
for (const auto i : c10::irange(outputs_.size())) {
if (io_alias_.count(outputs_[i]) != 0) {
bool found = false;
for (const auto j : c10::irange(inputs_.size())) {
if (io_alias_.at(outputs_[i]) == inputs_[j]) {
alias_indices.emplace_back(i, j);
found = true;
break;
}
}
TORCH_INTERNAL_ASSERT(
found,
"io_alias_ mapping failure, alias output is not present in inputs");
}
}
// can't assert here, we could have segmented fusion where not all alias
// outputs are present
return alias_indices;
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
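
The copy and move operations implemented above can be exercised directly. A minimal sketch (the IR built while the guard is active is omitted):

Fusion original;
{
  FusionGuard fg(&original);
  // ... define tensors and expressions while `original` is the active fusion (omitted) ...
}
Fusion deep_copy(original);          // copy constructor -> Fusion::copy clones all Vals/Exprs
Fusion taken(std::move(deep_copy));  // move constructor swaps contents; deep_copy is left empty
original.printMath();                // print the arith exprs reachable from the outputs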

View File

@ -1,288 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <ir_base_nodes.h>
#include <ir_container.h>
#include <iter_visitor.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Usage: FusionGuard and Fusion are required user interfaces for any operation
//! underlying the code generator. In order to create values, expressions, and
//! generate code a Fusion instance must be active. It is the responsibility of
//! the user to create a Fusion instance and register it with the fusion guard.
//! The simplest example of this is:
//!
//! Fusion fusion;
//! FusionGuard fg(&fusion);
//!
//! Once a fusion is active all values and operations will be registered with
//! it.
//!
//! FusionGuard and Fusion are critical to the lifetime model of the IR system.
//! FusionGuard is a convenient way to set what base container instance holds
//! the defined IR. Statements that are defined are registered through the
//! FusionGuard with a particular Fusion. FusionGuard provides convenient
//! methods to access the active fusion so it doesn't need to be passed around
//! constantly. Any IR node derived classes from Statement must register with
//! Fusion to avoid memory leaks.
//!
//! Fusion is generally thought of as a translated fusion group from the JIT. It
//! is likely a single kernel, although, we don't have to stick to this in the
//! future and could in theory generate multiple kernels with an executor to run
//! them.
//!
//! Fusion also allows users to set input/output values that will allow us to
//! figure out how to hook up runtime data to and from the JIT as well as
//! provide us mechanisms for dependency analysis and DCE including safety
//! checks.
class Fusion;
class TensorView;
class WelfordResult;
class SegmentCandidateFinder;
class SegmentedFusion;
class KernelArgumentHolder;
//! FusionGuard is our "context manager". It holds the active fusion and
//! allows it to be accessed anywhere through FusionGuard::getCurFusion()
class TORCH_CUDA_CU_API FusionGuard {
public:
Fusion* prev_fusion;
//! Set the active fusion so it can be manipulated.
explicit FusionGuard(Fusion* fusion);
~FusionGuard();
static Fusion* getCurFusion();
static void setCurFusion(Fusion* fusion);
};
//! Fusion is mutable but unique. Nodes cannot be copied in any way from one
//! Fusion to another. If anything like that is desired, it would require
//! duplicating all associated values and exprs. Fusion is considered to be SSA,
//! though this could also change in the future if there is a good reason to do
//! so.
//!
//! The Fusion owns the whole IR graph (Vals and Exprs)
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Fusion : public IrContainer {
typedef std::unordered_map<int, std::vector<int64_t>> PermutationMap;
public:
Fusion() = default;
Fusion(const Fusion& other);
Fusion(Fusion&& other) noexcept;
Fusion& operator=(const Fusion& other);
Fusion& operator=(Fusion&& other) noexcept;
~Fusion() override;
friend void swap(Fusion& a, Fusion& b) noexcept;
void clear() noexcept;
//! Break dependency chains associated with Expr, remove references to expr,
//! and delete expr
void removeExpr(Expr* expr) override;
//! Completely remove val from the fusion, break all dependencies associated
//! with it
void removeVal(Val* val) override;
//! Register input as an input of the fusion
void addInput(Val* input);
//! Register output as an output of the fusion
void addOutput(Val* output);
//! Deregister input as an input of the fusion
void removeInput(Val* input);
//! Deregister output as an output of the fusion
void removeOutput(Val* output);
//! Replace output with another value
void replaceOutput(Val* output, Val* replacement);
//! Assert that all leaves found from outputs are registered as an input
void validateInputs();
//! Print this fusion to the console
void print();
//! Print Arith exprs
//! \param from_outputs_only Only print exprs reachable from outputs
void printMath(bool from_outputs_only = true);
//! Print transformations used in fusion (can be very verbose)
void printTransforms();
//! Lower the fusion and print a kernel
void printKernel(DataType index_type = DataType::Int);
//! Lower the fusion and evaluate bank conflict info
std::unordered_map<std::string, std::pair<int, int>> bankConflictInfo(
DataType index_type = DataType::Int);
//! Return a list of topologically sorted expressions. This only includes
//! exprs required to generate registered outputs.
std::vector<Expr*> exprs();
//! Return a vector of fusion inputs that feed this Val
std::vector<Val*> inputsOf(Val* val);
//! Return all Vals in math expressions that cannot be eliminated.
//!
//! It is generally equivalent to vals that are used to generate
//! outputs, however, when a multi-output expression exists, and only
//! some of the outputs are used, the remaining unused outputs are
//! also included as they must show up in the final code.
std::vector<Val*> usedMathVals();
//! Returns all vals that are produced by used math expressions and
//! also do not have further consumers.
//!
//! In the case of an active multi-output expression, the returned vector
//! will include the expression outputs that did not lead to a fusion
//! output.
std::vector<Val*> terminatingMathVals();
//! Return all Exprs that use val
std::unordered_set<Expr*> unordered_uses(const Val* val) const;
//! Return the Expr that produces val
Expr* definition(const Val* val) const;
//! Indicate to kernel to set itself up to generate random numbers
bool isStochastic();
//! Run fusion segmentation algorithm to create a segmented fusion
std::unique_ptr<SegmentedFusion> segment(const KernelArgumentHolder& args);
const auto& inputs() const {
return inputs_;
}
std::vector<Val*> inputsAndCreated();
const auto& outputs() const {
return outputs_;
}
std::vector<Val*> getTerminatingOutputs() const;
// Aliases an output to an input value; this is a WAR to allow in-place updates on
// an input tensor.
// Note: this is not always safe and should be used with extra caution.
// Currently the only place it's used is in the running-stats update for batch
// normalization.
// TODO: segmentation should be made aware of aliases, so we always include
// the input tensor in the segment where the output is produced.
void aliasOutputToInput(Val* output, Val* input);
Val* getOutputAlias(Val* output);
std::unordered_set<int> getOutputAliasIndices() const;
std::vector<std::pair<int, int>> getInputAliasIndices() const;
// mark input at index to be permuted by permutation
void setPermutationOnInput(int index, std::vector<int64_t> permutation) {
permuted_input_map_.insert({index, permutation});
}
// mark output at index to be restored by permutation
void setPermutationOnOutput(int index, std::vector<int64_t> permutation) {
permuted_output_map_.insert({index, permutation});
}
// return a map of indices to permutation, which indicates all input tensors
// that need to be permuted
const PermutationMap& getPermutationInputMap() const {
return permuted_input_map_;
}
// return a map of indices to permutation, which indicates all output tensors
// that need to be permuted
const PermutationMap& getPermutationOutputMap() const {
return permuted_output_map_;
}
bool isTVUseInfoValid() {
return all_tv_uses_valid_;
}
bool isUpdatingTVUseInfo() {
return is_during_update_uses_;
}
const auto& ioAlias() const {
return io_alias_;
}
protected:
friend SegmentCandidateFinder;
friend SegmentedFusion;
friend class TranslateApplicableWelford;
friend Val;
static IrCloner copy(const Fusion* from, Fusion* to);
//! Register the Val with this fusion
virtual void registerVal(Val* val) override;
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. If this container is not a Kernel, it will remove previous
//! definitions of outputs and register this Expr as the definition. Otherwise
//! it will update the definition if not previously set, but will not remove old
//! definitions.
virtual void registerExpr(Expr* expr) override;
//! Clear Expr's from TV uses that are not required to produce outputs from
//! inputs. The only other place this is used (other than Fusion) is in
//! Val::uses()
void resetTvUses();
private:
// Determine if the two values are compatible for aliasing
// Same DataType, ValType, and number of dimensions
bool isAliasCompatible(Val* left, Val* right);
private:
// Fusion inputs and outputs
std::vector<Val*> inputs_;
std::vector<Val*> outputs_;
// io alias pointing from output to input
std::unordered_map<Val*, Val*> io_alias_;
// See Note [ Permutation support in nvfuser ]
// map from indices of input tensor to permutation
PermutationMap permuted_input_map_;
// map from indices of output tensor to permutation
PermutationMap permuted_output_map_;
// Records whether the current use data in the IR nodes is valid;
// the states are either all valid or all invalid
bool all_tv_uses_valid_ = false;
bool is_during_update_uses_ = false;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
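
A hedged sketch of the aliasing API declared above. `fusion`, `tv_in`, and `tv_out` are assumed to have been built elsewhere (tensor creation helpers are not part of this header) with matching data type and rank:

fusion.addInput(tv_in);
// Records the alias and also registers tv_out as an output (see aliasOutputToInput above);
// tv_out will update tv_in in place at runtime.
fusion.aliasOutputToInput(tv_out, tv_in);

std::unordered_set<int> aliased_outputs = fusion.getOutputAliasIndices();
for (const auto& [out_idx, in_idx] : fusion.getInputAliasIndices()) {
  // the out_idx-th output writes in place into the in_idx-th input
}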

File diff suppressed because it is too large

View File

@ -1,628 +0,0 @@
#pragma once
#include <fusion.h>
#include <ir_base_nodes.h>
#include <kernel_cache.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/registry.h>
#include <utils.h>
#include <deque>
#include <list>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class SegmentedGroup;
class SegmentCandidateFinder;
// A directed edge on the DAG.
// A wrapper for a value connecting two segmented groups, which are made up
// of Exprs. Multiple edges can exist between the same pair of segmented groups.
struct SegmentedEdge {
SegmentedEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val)
: from(from), to(to), val(val) {}
SegmentedGroup* from;
SegmentedGroup* to;
Val* val;
void print() const;
};
std::ostream& operator<<(std::ostream& os, const SegmentedEdge* edge);
//! Groups together expressions which create a segmented group
//! Can be used to produce fusions
class TORCH_CUDA_CU_API SegmentedGroup {
public:
SegmentedGroup(SegmentedFusion* segmented_fusion)
: segmented_fusion_(segmented_fusion) {}
SegmentedGroup(Expr* expr, SegmentedFusion* segmented_fusion)
: segmented_fusion_(segmented_fusion) {
exprs_.push_back(expr);
}
//! Checks if this group takes original fusion's input
bool isInputGroup() {
return !input_vals.empty();
};
//! Checks if this group is used anywhere in the segmented fusion
bool isConnected() const {
return !producer_edges.empty() || !consumer_edges.empty() ||
!output_vals.empty();
}
//! returns the id assigned by segment pass
int groupId() const {
return group_id_;
}
//! Returns inputs that this group shares with the original fusion
const auto& inputs() const {
return input_vals;
}
//! Returns outputs that this group shares with the original fusion
const auto& outputs() const {
return output_vals;
}
//! Returns the schedule heuristic associated with this group
ScheduleHeuristic heuristic() const {
return heuristic_;
}
//! Returns the exprs that make up this group
const auto& exprs() const {
return exprs_;
}
//! Debug print function
void print() const;
//! Returns the segmented fusion that this group is in
SegmentedFusion* segmentedFusion() const {
return segmented_fusion_;
}
//! Utility to re-collect the operators included in this
//! segmented group after updating the group boundary.
void resetExprList();
//! Try to get a scheduler entry for this group with
//! the given runtime info.
//! Returns a new scheduler with the same heuristics
//! for this group if possible.
//! Note that the schedule params can be different.
//! Returns a nullopt if this group cannot be scheduled
//! with the same heuristics.
c10::optional<std::unique_ptr<SchedulerEntry>> getMaybeSchedulerEntry(
SchedulerRuntimeInfo& runtime_info);
public:
//! "Ancestor nodes", towards inputs of segmentedDAG
std::vector<SegmentedEdge*> producer_edges;
//! "Descendent nodes", towards outputs of segmentedDAG
std::vector<SegmentedEdge*> consumer_edges;
//! Composite Fusion inputs in this group
std::vector<Val*> input_vals;
//! Composite Fusion outputs in this group
std::vector<Val*> output_vals;
private:
friend class SegmentCandidateFinder;
friend class SegmentedFusion;
friend class FusionKernelRuntime;
friend class TranslateApplicableWelford;
//! unique identifier of group in the segmented fusion
int group_id_ = -1;
//! The scheduler to use for compiling this group
ScheduleHeuristic heuristic_ = ScheduleHeuristic::None;
//! Exprs that make up the group
std::vector<Expr*> exprs_;
//! Maximum path distance from an input segmented group required for
//! Theorem 4.2
int level_ = -1;
//! traversal marker, has this node already been processed
bool visited_ = false;
//! Did we select another group to merge with
SegmentedGroup* merge_with_ = nullptr;
//! if we selected another group to merge, which edge is to be contracted
SegmentedEdge* merge_through_ = nullptr;
//! Has this node been merged?
bool merged_ = false;
private:
//! Utility to convert edge vector to value vector
std::vector<Val*> edgesToVals(const std::vector<SegmentedEdge*>& se_v);
//! Reset method to call at the beginning of each
//! merge node iteration
void clearTraversalInfo();
//! To be called at the very end of fusion segmentation;
//! no more segment merging should be done beyond this point
void finalize();
//! Return all segmented groups connected with *this
std::vector<SegmentedGroup*> getNeighbors();
//! Utility struct to represent a group connection
//! both the group to connect with and the edge
//! to connect through
struct NeighborGroup {
NeighborGroup(SegmentedGroup* g, SegmentedEdge* e) : group(g), edge(e) {}
SegmentedGroup* group;
SegmentedEdge* edge;
};
//! TODO: May want to sort this based on size of connections between this and
//! neighbors as well as if the connection is an output of the fusion (has to
//! be saved to gmem anyways)
std::vector<NeighborGroup> getNeighborGroups();
//! Look at all neighbors of this and return who this could merge with based
//! on level values of this, neighbors, and merged neighbors of neighbors
std::vector<NeighborGroup> getMergeCandidates();
//! Assign schedule heuristic to this group
void setHeuristic(ScheduleHeuristic sh) {
heuristic_ = sh;
}
//! Assign Id for this group
void setID(int id) {
TORCH_INTERNAL_ASSERT(group_id_ == -1);
group_id_ = id;
}
//! SegmentedFusion this group belongs to
SegmentedFusion* segmented_fusion_;
};
std::ostream& operator<<(std::ostream& os, const SegmentedGroup* group);
//! Auxiliary class for storing heuristics. The managed data is either
//! a single scheduler entry for complete fusion,
//! or a vector of schedulers, one for each segment, for segmented fusion.
class TORCH_CUDA_CU_API FusionHeuristics {
using SchedulerEntryOwningPtr = std::unique_ptr<SchedulerEntry>;
public:
//! Constructor for segmented fusion case. Created with empty list and
//! uses emplaceBack for inserting heuristics in order
explicit FusionHeuristics() = default;
//! Constructor for complete fusion case, generates the scheduler entry
//! for the fusion owning the given expression
explicit FusionHeuristics(
ScheduleHeuristic schedule_heuristic,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr) {
heuristics_.emplace_back(SchedulerEntry::makeEntry(
schedule_heuristic, runtime_info.fusion(), runtime_info, data_cache));
is_segmented_ = false;
}
FusionHeuristics(const FusionHeuristics&) = delete;
FusionHeuristics& operator=(const FusionHeuristics&) = delete;
//! Place a scheduler entry on the list. Applies to segmented fusion only.
void emplaceBack(SchedulerEntryOwningPtr&& pt) {
TORCH_INTERNAL_ASSERT(is_segmented_);
heuristics_.emplace_back(std::move(pt));
}
//! Returns the list of schedulers for a segmented fusion.
const std::vector<SchedulerEntryOwningPtr>& heuristicsList() const {
return heuristics_;
}
//! Returns the single scheduler for a complete fusion.
SchedulerEntry* singleKernelHeuristics() {
TORCH_INTERNAL_ASSERT(!is_segmented_);
return heuristics_.begin()->get();
}
private:
std::vector<SchedulerEntryOwningPtr> heuristics_;
bool is_segmented_ = true;
};
//! Exported Interface for representing segmented fusion graph
//! this class owns the segmented groups
class TORCH_CUDA_CU_API SegmentedFusion {
public:
explicit SegmentedFusion(std::unique_ptr<Fusion> fusion);
//! Factory function for the un-segmented case, directly
//! constructs a "SegmentedFusion", with the given Fusion
//! as the only group.
static std::unique_ptr<SegmentedFusion> fromCompleteFusion(
std::unique_ptr<Fusion> fusion,
ScheduleHeuristic heuristic);
//! Is the fusion segmented?
bool isSegmented() const {
return !groups_.empty();
}
std::vector<SegmentedGroup*>& groups() {
return groups_;
}
std::vector<SegmentedEdge*>& edges() {
return edges_;
}
const std::vector<SegmentedGroup*>& cgroups() const {
return groups_;
}
const std::vector<SegmentedEdge*>& cedges() const {
return edges_;
}
//! Returns the original un-segmented fusion
Fusion* completeFusion() const {
return complete_fusion_.get();
}
const auto& inputs() const {
return complete_fusion_->inputs();
}
const auto& outputs() const {
return complete_fusion_->outputs();
}
Val* findAlias(Val* val) const {
auto alias_it = complete_fusion_->ioAlias().find(val);
if (alias_it != complete_fusion_->ioAlias().end()) {
return alias_it->second;
}
return nullptr;
}
//! Make a clone of the group and convert to fusion
std::unique_ptr<Fusion> makeFusion(SegmentedGroup* sg);
//! Make heuristics for all groups in this segmented fusion
std::unique_ptr<FusionHeuristics> makeInitialHeuristics(
const KernelArgumentHolder& inputs);
//! Inline Debug print for segmented fusion
std::string toString(int verbosity) const;
//! Debug drawing for graphviz
void draw();
//! Debug print for segmented fusions
void print() const;
//! API for adding groups
SegmentedGroup* newGroup();
//! API shortcut for adding a singleton group
SegmentedGroup* newGroup(Expr* expr);
//! API for adding edges
SegmentedEdge* newEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
HeuristicSummary* getCachedHeuristicDataFor(SegmentedGroup* group);
private:
//! Unique name for segmented fusion
int segmented_fusion_name_;
//! States representing segmentation
std::vector<SegmentedEdge*> edges_;
std::vector<SegmentedGroup*> groups_;
//! Owning object to explicitly manage groups and edges
class Impl {
public:
explicit Impl(SegmentedFusion* sf) : owning_fusion_(sf) {}
SegmentedGroup* makeGroup();
SegmentedGroup* makeGroup(Expr*);
SegmentedEdge* makeEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
void cleanUnused();
private:
using GroupPtr = std::unique_ptr<SegmentedGroup>;
using EdgePtr = std::unique_ptr<SegmentedEdge>;
std::vector<GroupPtr> groups_;
std::vector<EdgePtr> edges_;
SegmentedFusion* owning_fusion_;
};
Impl impl_;
//! A Copy of original full fusion
std::unique_ptr<Fusion> complete_fusion_;
//! A set of intermediate tensors that need to be cast to fp16
std::unordered_set<TensorView*> force_fp16_tv_set_;
DataType force_half_precision_type_;
//! Static traversal information to be used for fast heuristics lookup
std::unordered_map<SegmentedGroup*, std::unique_ptr<HeuristicSummary>>
heuristic_summary_cache_;
// TODO: this class needs cleanup
protected:
friend class SegmentCandidateFinder;
//! Make a heuristics entry for a group and parameters
std::unique_ptr<SchedulerEntry> makeInitialSchedulerEntry(
SegmentedGroup* sg,
SchedulerRuntimeInfo& runtime_info);
//! Cleanup function to be called at the end of the fusion
//! segment pass
void finalize();
//! Collect all the intermediate tensors between segmented
//! groups that will be cast to fp16
void annotateFP16IntermediateTensors();
//! Keep heuristic checking intermediate data
void setCachedHeuristicDataFor(
SegmentedGroup* group,
std::unique_ptr<HeuristicSummary> data);
//! Utility to give unique name for each segmented fusion
static size_t segmentedFusionName() {
static size_t counter = 0;
return counter++;
}
};
//! This is a base class for segmenter analysis.
//! It provides the minimal implementation in the header so that
//! a unique_ptr can use this base class;
//! actual implementations of analyses are in the .cpp files.
//! TODO: In the next refactor PR, we should put the segment candidate
//! finder in the .cpp file completely since the API doesn't require these
//! details
class SegmenterAnalysis : public PolymorphicBase {};
class GroupDependencyAnalysis;
// Manual node merging passes
class CombineReductions;
//! Options to configure/debug candidate finder
struct TORCH_CUDA_CU_API SegmentCandidateFinderOptions {
bool run_translate_welford = true;
bool run_combine_reductions = true;
bool run_herrmann_merge = true;
bool run_final_merge = true;
};
//! SegmentCandidateFinder
//! Responsible for going through DAG and proposing things we could try to
//! fuse together, calls "canGenerateCode" on these proposed segments to see
//! if they are valid and we can generate code for them.
//! FusionSegment
//! A group of exprs that are segmented together
//! FusionSegmentConnections
//! Holds vals and what they connect. In other words it's a val that is an
//! output of a FusionSegment "from" and an input of FusionSegment "to".
//! There's nothing preventing from a val being between segments twice.
//! TODO: make sure there's nothing wrong with segmentation on nodes that
//! have the same value input twice. i.e. (B = A*A)
//! Selecting segments to propose is based on Theorem 4.2 in the paper, which
//! ensures that after segmentation the segmented graph will be a DAG (assuming the
//! Fusion is already a DAG). The segmentation code relies on assumptions of DAG-ness
//! during segmentation, meaning proposed merging of groups must maintain the
//! DAG property of the graph.
//!
//! Julien Herrmann, Yusuf Özkaya, Bora Uçar, Kamer Kaya, Umit Catalyurek.
//! Multilevel Algorithms for Acyclic Partitioning of Directed Acyclic Graphs.
//! SIAM Journal on Scientific Computing, Society for Industrial and Applied
//! Mathematics, 2019, 41 (4), pp. A2117-A2145. doi:10.1137/18M1176865.
//! hal-02306566
class TORCH_CUDA_CU_API SegmentCandidateFinder {
public:
// Perform segmentation on a copy of the given fusion
static std::unique_ptr<SegmentedFusion> segment(
const Fusion* fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) {
auto fusion_copy = std::make_unique<Fusion>(*fusion);
if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) {
std::cout << "Segment the fusion (Original Fusion Un-modified): "
<< std::endl;
fusion_copy->printMath();
}
SegmentCandidateFinder scf(std::move(fusion_copy), inputs, options);
return std::move(scf.segmented_fusion_);
}
// Perform segmentation on and take ownership of the given fusion
static std::unique_ptr<SegmentedFusion> segment(
std::unique_ptr<Fusion> fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) {
SegmentCandidateFinder scf(std::move(fusion), inputs, options);
if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) {
std::cout << "Segment the fusion (Original Fusion Un-modified): "
<< std::endl;
scf.completeFusion()->printMath();
}
return std::move(scf.segmented_fusion_);
}
static bool TranslateWelfordInFusion(
Fusion* fusion,
const KernelArgumentHolder& runtime_inputs);
private:
// Perform segmentation on and take ownership of the given fusion
SegmentCandidateFinder(
std::unique_ptr<Fusion> fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options);
void resetTraversal();
void resetLevels();
SegmentedGroup* mergeNodes();
bool codeGenSupportedMerge(SegmentedGroup* group1, SegmentedGroup* group2);
void findSegments();
std::unordered_set<SegmentedEdge*> disconnectGroup(SegmentedGroup* group);
std::vector<SegmentedGroup*>& groups() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->groups();
}
std::vector<SegmentedEdge*>& edges() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->edges();
}
Fusion* completeFusion() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->completeFusion();
}
SchedulerRuntimeInfo& runtimeInfo() {
return runtime_info_;
}
ExpressionEvaluator& expressionEvaluator() {
return runtime_info_.expressionEvaluator();
}
//! Additional merging iteration that cleans up the rest of
//! the merging opportunities.
//! The algorithm of Herrmann et al. is fast and safe for finding merge candidates,
//! but it can become too conservative in our use cases because we place
//! additional qualifiers on valid merges beyond having to generate DAGs,
//! i.e. canSchedule. So we need a brute-force final merging iteration as a
//! clean-up pass. The cost isn't expected to be high since the graph at this
//! stage is already quite merged. Example cf. test_gpu.cpp:
//! FusionDAGMerging_CUDA
//!
//! This merging algorithm is based on Theorem 4.1 of Herrmann et al.,
//! to check if a producer-consumer pair can be merged into one group,
//! it's enough to check if any other consumer of the producer also
//! produces the consumer.
void finalMerge();
//! Duplicate and add all exprs producing the used
//! scalar values in group
void resolveScalarsInGroup(SegmentedGroup* group);
//! Duplicate and add all exprs from "inputs" in the group, to complete
//! inputs. These expressions are simply unary ops of inputs that we want to
//! recompute for each segment, instead of computing and producing a segmented
//! val. For example if we have:
//! tv1 = tv0 * 2;
//! tv3 = tv1 + tv2;
//! tv4 = tv1 + tv4
//! If we segmented on tv1, we would be producing an output for tv1 for the 2
//! groups that have tv3 or tv4; instead, we could easily recompute tv1 from
//! tv0.
void resolveInputsInGroup(SegmentedGroup* group);
//! Remove all scalar edges in group
//! (TODO: need to structure this better so we don't have to do this)
void removeScalarEdges();
//! Utility function to merge a vector of groups in one step,
//! need to check for DAG condition before using this method
SegmentedGroup* mergeAllGivenGroups(
const std::vector<SegmentedGroup*>& groups);
//! Utility to remove a group and corresponding edges
//! TODO: remove inline versions of this as much as possible
void eraseGroups(std::unordered_set<SegmentedGroup*>& groups_to_erase);
void finalize();
//! Return the resulting heuristic corresponding to the merged
//! group built by merging the two groups connected by edge
ScheduleHeuristic deriveHeuristic(SegmentedGroup* edge);
GroupDependencyAnalysis* getGroupDependency();
protected:
//! These are the merge node heuristic passes; they should
//! eventually have a dedicated interface
//! instead of us continually adding friends
friend class CombineReductions;
//! options to configure and debug the segment process
SegmentCandidateFinderOptions options_;
std::deque<SegmentedGroup*> to_visit_;
std::vector<SegmentedGroup*> next_to_visit_;
std::unordered_set<SegmentedGroup*> clean_up_groups_;
std::unordered_set<SegmentedEdge*> clean_up_edges_;
std::vector<SegmentedGroup*> to_merge_;
std::unique_ptr<SegmentedFusion> segmented_fusion_;
std::unique_ptr<SegmenterAnalysis> group_dependency_;
SchedulerRuntimeInfo runtime_info_;
//! Note:
//! Segmenter should eventually rely only on runtime_info_ for
//! safe caching. runtime_inputs_ is only used in translateWelford
//! to initialize expression evaluators on copies of the original
//! fusion, which doesn't use any un-cached info and is safe.
//!
//! Directly using runtime_inputs_ in other cases is in general
//! risky.
//!
//! To get rid of runtime_inputs_ we need mechanisms
//! to copy expression evaluator values from fusion
//! to a copy, or even better to a copy of a
//! sub-graph of original fusion.
//! TODO:
//! implement the expression evaluator transfer and
//! remove runtime_inputs_ in a follow up.
const KernelArgumentHolder& runtime_inputs_;
};
// TODO: Make as member functions on classes instead of global scope
TORCH_CUDA_CU_API std::string toString(const SegmentedGroup* group);
TORCH_CUDA_CU_API std::string toString(const SegmentedEdge* edge);
TORCH_CUDA_CU_API std::string toString(const SegmentedFusion* segmented_fusion);
TORCH_CUDA_CU_API std::string toString(
const SegmentCandidateFinderOptions& segment_options);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
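
A hedged usage sketch of the segmentation entry point declared above. `fusion` is assumed to be a std::unique_ptr<Fusion> and `args` a KernelArgumentHolder built elsewhere:

SegmentCandidateFinderOptions options;  // all four passes enabled by default (see struct above)
std::unique_ptr<SegmentedFusion> segmented =
    SegmentCandidateFinder::segment(std::move(fusion), args, options);
if (segmented->isSegmented()) {
  segmented->print();                   // debug print of groups and edges
  for (SegmentedGroup* group : segmented->groups()) {
    group->print();                     // per-group exprs, edges, and heuristic
  }
}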

File diff suppressed because it is too large

View File

@ -1,204 +0,0 @@
#include <ir_builder.h>
#include <ir_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <grouped_reduction.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
// Return if ref and other are transformed in the same way.
bool hasMatchingTransformations(TensorView* ref, TensorView* other) {
std::unordered_map<IterDomain*, IterDomain*> ref_2_other;
for (const auto i : c10::irange(ref->getRootDomain().size())) {
ref_2_other.emplace(
ref->getRootDomain().at(i), other->getRootDomain().at(i));
}
auto replay =
BestEffortReplay(
other->domain()->domain(), ref->domain()->domain(), ref_2_other)
.getReplay();
for (const auto i : c10::irange(ref->nDims())) {
auto ref_id = ref->axis(i);
auto other_id = other->axis(i);
auto it = replay.find(ref_id);
if (it == replay.end() || it->second != other_id) {
return false;
}
}
return true;
}
// Validate grouping of reductions and return a new max producer position
void validateReductionGrouping(
const std::vector<Val*>& inputs,
const std::vector<Val*>& outputs) {
TORCH_INTERNAL_ASSERT(inputs.size() == outputs.size());
TORCH_INTERNAL_ASSERT(!inputs.empty());
auto fusion = dynamic_cast<Fusion*>(outputs[0]->container());
TORCH_INTERNAL_ASSERT(
fusion != nullptr, "Grouping of reductions must be done within a Fusion");
ExactRootDomainMap exact_map(fusion);
// Pick the first output TV as a reference and compare it with the
// rest. Do not allow grouping if any mismatch is detected.
auto ref_tv = outputs[0]->as<TensorView>();
const auto ref_domain = ref_tv->getRootDomain();
const auto num_root_dims = ref_domain.size();
const auto num_dims = ref_tv->nDims();
const auto ref_ca_pos = ref_tv->getComputeAtPosition();
for (const auto i : c10::irange(inputs.size())) {
auto output_tv = outputs.at(i)->as<TensorView>();
const auto& output_domain = output_tv->getRootDomain();
if (ref_tv == output_tv) {
continue;
}
TORCH_INTERNAL_ASSERT(
output_domain.size() == num_root_dims,
"Invalid grouped reduction due to mismatched number of root dimensions. "
"Expected: ",
num_root_dims,
". Detected: ",
output_domain.size(),
". Invalid output tensor: ",
output_tv->toString());
TORCH_INTERNAL_ASSERT(
output_tv->nDims() == num_dims,
"Invalid grouped reduction due to mismatched number of dimensions. "
"Expected: ",
num_dims,
". Detected: ",
output_tv->nDims(),
". Invalid output tensor: ",
output_tv->toString());
for (const auto i : c10::irange(num_root_dims)) {
auto ref_id = ref_domain.at(i);
auto output_id = output_domain.at(i);
// If an IterDomain is broadcast, require that the corresponding
// IterDomains in the other outputs are also broadcast. This may not be
// necessary, but we are not completely certain.
TORCH_INTERNAL_ASSERT(
ref_id->isBroadcast() == output_id->isBroadcast(),
"Invalid grouped reduction due to mismatched broadcast root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
if (ref_id->isBroadcast()) {
continue;
}
TORCH_INTERNAL_ASSERT(
ref_id->isReduction() == output_id->isReduction(),
"Invalid grouped reduction due to mismatched reduction root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
TORCH_INTERNAL_ASSERT(
exact_map.areMapped(ref_id, output_id) || ref_id->sameAs(output_id),
"Invalid grouped reduction due to mismatched root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
}
TORCH_INTERNAL_ASSERT(
hasMatchingTransformations(ref_tv, output_tv),
"Invalid grouped reduction due to mismatched transformations. ",
"Reference tensor: ",
ref_tv->toString(),
". Mismatched tensor: ",
output_tv->toString());
// Must have the same computeAt position
TORCH_INTERNAL_ASSERT(
output_tv->getComputeAtPosition() == ref_ca_pos,
"Invalid grouped reduction due to mismatched computeAt position. ",
"Reference tensor: ",
ref_tv->toString(),
". Mismatched tensor: ",
output_tv->toString());
}
// Must not have any data dependency from outputs to inputs
const auto all_dep_vals = DependencyCheck::getAllValsBetween(
{outputs.begin(), outputs.end()}, inputs);
if (!all_dep_vals.empty()) {
std::stringstream ss;
ss << "Invalid dependency:";
for (auto val : all_dep_vals) {
ss << " " << val->toString();
}
TORCH_INTERNAL_ASSERT(all_dep_vals.empty(), ss.str());
}
}
} // namespace
void groupReductions(const std::vector<TensorView*>& reduction_outputs) {
TORCH_CHECK(!reduction_outputs.empty(), "No tensor is given");
auto container = reduction_outputs[0]->container();
const auto num_reductions = reduction_outputs.size();
std::vector<BinaryOpType> op_types(num_reductions);
std::vector<Val*> init_vals(num_reductions);
std::vector<Val*> outputs(num_reductions);
std::vector<Val*> inputs(num_reductions);
for (const auto i : c10::irange(num_reductions)) {
auto reduction_out = reduction_outputs.at(i);
TORCH_CHECK(
reduction_out->definition() != nullptr,
"Invalid tensor to group: ",
reduction_out->toString(),
". Definition not found");
auto rop = dynamic_cast<ReductionOp*>(reduction_out->definition());
TORCH_CHECK(
rop != nullptr,
"Invalid tensor to group: ",
reduction_out->toString(),
". Not an output of a ReductionOp: ",
reduction_out->definition()->toString());
// Fused reduction is only enabled during the lowering, so at this
// point it should be false.
TORCH_INTERNAL_ASSERT(
!rop->isAllreduce(), "Invalid ReductionOp: ", rop->toString());
op_types.at(i) = rop->getReductionOpType();
init_vals.at(i) = rop->init();
outputs.at(i) = rop->out();
inputs.at(i) = rop->in();
}
validateReductionGrouping(inputs, outputs);
IrBuilder::create<GroupedReductionOp>(
container, op_types, init_vals, outputs, inputs);
for (auto output : ir_utils::filterByType<TensorView>(outputs)) {
output->updateMaxProducerPosition();
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,41 +0,0 @@
#pragma once
#include <ir_all_nodes.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Horizontally fuse multiple reductions.
//!
//! Given a list of tensors produced by ReductionOp, create a new
//! GroupedReductionOp expression that takes the input tensors of the
//! original reductions and produces the given tensors, replacing
//! their defining expressions.
//!
//! GroupedReductionOp works just like ReductionOp with a potential
//! benefit of aggregating synchronizations across individual
//! reductions. See the reduction::gridReduce2 runtime function for a
//! two-input version of grid reduction.
//!
//! The grouped reductions must follow several constraints, which
//! include:
//! - There must not exist any data dependency between individual
//! reductions.
//! - All reduction output tensors must have the same number of
//! dimensions, the same transformations and the same axes to
//! reduce.
//!
//! Note that Welford is not allowed yet, though it should be
//! technically straightforward to support horizontal fusions of
//! welford ops. Unclear how common it would be in practice, though.
//!
//! \param reduction_outputs Tensors produced by ReductionOp
TORCH_CUDA_CU_API void groupReductions(
const std::vector<TensorView*>& reduction_outputs);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
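
A hedged sketch of the horizontal fusion described above. `tv_r0` and `tv_r1` are assumed to be TensorViews produced by two independent ReductionOps over matching domains (their construction is outside this header):

fusion.addOutput(tv_r0);
fusion.addOutput(tv_r1);
// Replaces the two ReductionOps with a single GroupedReductionOp defining both outputs.
groupReductions({tv_r0, tv_r1});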

File diff suppressed because it is too large

View File

@ -1,447 +0,0 @@
#pragma once
#include <iter_visitor.h>
#include <root_domain_map.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
/*
* Index compute takes in a list of indices typically generated from the
* surrounding for-loop nest. The number of indices is intended to match the
* number of dimensions of the incoming TensorView, which may have fewer or more
* dimensions than its root due to split/merge operations.
* Split/merge operations are then replayed backwards to produce resulting
* indices (based on input indices) that match the root dimensions.
*
* For example with GLOBAL tensor:
* TV[I, K]
* TV[Io, Ii{4}, K] = TV.split(I, factor=4)
* ALLOC: NONE
* INDEX: indexCompute {i, j, k} -> {i * 4 + j, k}
* FLATTENED_INDEX: {i * 4 + j, k} -> {(i * 4 + j) * K + k}
* PREDICATE: {i * 4 + j, k} -> i * 4 + j < I
*
*
* For example with SHARED tensor:
*
* global_TV[I, K]
* global_TV[Io, Ii{4}, K] = global_TV.split(I, factor=4)
* smem_TV.compute_at(global_TV, 1)
* global_TV.parallelize(1, threadIDx.x)
*
* ALLOC: alloc(smem_TV, 4 x K)
* INDEX: indexCompute(smem_TV, {threadIdx.x, k}) -> {threadIdx.x, k}
* FLATTENED_INDEX: {threadIdx.x * 4 + j, k} -> {(threadIdx.x * 4 + j) * K + k}
* PREDICATE: {threadIdx.x * 4 + j, k} -> threadIdx.x * 4 + j < I // Same as if
* global
*
*
* For example with LOCAL tensor:
* global_TV[I, K, L]
* global_TV[Io, Ii{4}, K, L] = global_TV.split(I, factor=4)
* reg_TV.compute_at(global_TV, 2)
* global_TV.parallelize(1, threadIDx.x)
* global_TV{i, j, k, l} -> { i * 4 + j, k, l }
* global_TV{ i * 4 + j, k, l } -> { (i * 4 + j) * K * L + k * L + l}
*
* ALLOC: alloc(reg_TV, K x L)
* INDEX: {k, l} -> {k, l}
* FLATTENED_INDEX: {k, l} -> {k * L + l}
* PREDICATE: i * 4 + j < I && k < K && l < L -> // Same as if global
*
* These indices can then be flattened later based on strides.
*/
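// Illustration only (not part of the original header): the GLOBAL-tensor example
// above written out as plain index arithmetic, with K the inner extent and 4 the
// split factor. The corresponding predicate is (i * 4 + j) < I.
inline long long example_flattened_index(long long i, long long j, long long k, long long K) {
  const long long root_i = i * 4 + j;  // replay the Split backwards: {i, j} -> {i * 4 + j}
  return root_i * K + k;               // flatten {root_i, k} with stride K
}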
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class ContigIDs;
class LoopIndexing;
struct IndexFromIdGraph;
class IndexCompute : public BackwardVisitor {
protected:
using BackwardVisitor::handle;
void handle(Split*) override;
void handle(Merge*) override;
void handle(Expr*) override;
void handle(Swizzle2D*) override;
// return extent_map_[id] if exists, else return id->extent()
Val* getExtent(IterDomain* id) const;
//! True if a domain is not used to index
bool isZero(IterDomain* id) const;
//! True if any dependent of a domain is not used to index
bool hasZeroMerged(IterDomain* id) const;
//! Returns the concrete ID from the compute at EXACT mode map if
//! concrete_id_pass == true, otherwise returns id passed in.
//! Helps unify the expr handling logic in reference domain and concrete id
//! based traversal.
IterDomain* maybeGetExactMapConcreteID(IterDomain* id);
//! (Concrete indexing pass only)
//! Collect permissive index binding from the given expression.
//! See also permissive_map_ and LoopIndexing::getBackwardOutOfLineExprList.
void collectIndexIntoPermissiveMap(const LoopIndexing& loop_indexing);
//! (Concrete indexing pass only)
//! Iterate through id_expr's input and pull index vals from permissive
//! map, when both of the following are true:
//! 1. the output id is missing in index_map_.
//! 2. the output id is found in permissive map.
void updateIndexMapFromPermissiveMap(const Expr* id_expr);
// Tensor domain we're mapping back to root
const TensorDomain* td_; // NOLINT
// Map we update as we propagate backward, containing all IDs in the
// propagation. Initial indices are mapped with this map at tv->domain()
// and are back propagated to tv->getRootDomain(). This index_map_ keeps the
// indices at intermediate IterDomain's in that back propagation.
std::unordered_map<IterDomain*, Val*> index_map_; // NOLINT
// Map from IterDomain to their broadcasted extent. If a TV has I0*I1 but its
// producer has B0*I1 this map will contain a mapping from the ID{B0*I1} to
// the extent I0*I1. Also contains updated extents if we merge in a 0 index.
// See zero_merged_in_.
std::unordered_map<IterDomain*, Val*> extent_map_; // NOLINT
// Keeps track of domains that do not contribute to indexing
std::unordered_set<IterDomain*> zero_domains_; // NOLINT
// This set keeps track of IterDomain's that have had a zero index merged into
// them. This happens if we do something like tv->axis(0)->split(4) and then
// tv->computeAt(1, ...). If this tensor is in smem or lmem, the backward
// indexing would be (0, i); when we do the backward computation, that zero
// and i would attempt to be merged together. We handle indices like these
// specially.
std::unordered_set<IterDomain*> zero_merged_in_;
// IDs that are a result of contiguous merges
std::unordered_set<IterDomain*> contig_ids_;
// Map from root to indexed domains
std::unordered_map<IterDomain*, IterDomain*> root_to_indexed_id_;
// Indicates whether we should propagate an index down a particular IterDomain
// path when there is a choice
std::unordered_set<IterDomain*> preferred_paths_;
// Map from IterDomains to halo-extended extents
std::unordered_map<IterDomain*, Val*> halo_extent_map_;
// Temporary flag which tells IndexCompute to use concrete id's from the exact
// map rather than the actual IDs used in the ID expressions.
bool concrete_id_pass_ = false;
// Mode of swizzle that are activated in this index compute
// instance. Will treat swizzles of different mode as no-op.
// Currently data mode swizzles are handled the same as before in the IndexSwizzle
// pass, while loop mode swizzles are handled early on in concrete indexing
// pass. See also [Note on swizzle mode]
SwizzleMode swizzle_mode_ = SwizzleMode::NoSwizzle;
// (Concrete id pass only)
// Contains the indexing math that could be resolved with only the
// iterdomains on the right of the consumer_tv's ca axis, i.e. the
// ones that correspond to the loops that consumer_tv would not
// share with any of its consumers.
// These indexing vals should be kept separate from index_map_ and
// should only be used when the indexing traversal follows the
// order defined in LoopIndexingAnalysis::traverseFromDomainVals.
std::unordered_map<IterDomain*, Val*> permissive_index_map_;
public:
const std::unordered_map<IterDomain*, Val*>& indexMap() const {
return index_map_;
}
const std::unordered_map<IterDomain*, Val*>& extentMap() const {
return extent_map_;
}
const std::unordered_set<IterDomain*>& zeroDomains() const {
return zero_domains_;
}
const std::unordered_set<IterDomain*>& zeroMergedIn() const {
return zero_merged_in_;
}
const std::unordered_map<IterDomain*, IterDomain*>& rootToContigID() const {
return root_to_indexed_id_;
}
// Propagate back from _td using initial_index_map
IndexCompute(
const TensorDomain* _td,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> _extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> _zero_merged_in,
std::unordered_set<IterDomain*> preferred_paths = {},
std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
IndexCompute(
const TensorDomain* _td,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> _extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> _zero_merged_in,
const ContigIDs& contig_finder,
std::unordered_set<IterDomain*> preferred_paths = {},
std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
// Entry point used for using concrete id based traversal. This traversal is
// assumed to start at leaf IDs provided by initial_index_map.
IndexCompute(
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> preferred_paths,
std::unordered_map<IterDomain*, Val*> concrete_halo_extent_map);
// Updates index_map, extent_map, and zero_merged_in based on id_map and
// returns a new IndexCompute ready to be used.
IndexCompute updateIndexCompute(
const TensorDomain* new_td,
const std::unordered_map<IterDomain*, IterDomain*>& id_map,
const ContigIDs& contig_finder) const;
// Interface to run index traversal through loop indexing analysis result to
// be used with the entry point for concrete id based traversal.
void run(const LoopIndexing& loop_indexing);
virtual void run();
};
//! Apply swizzle and update root indices accordingly
class IndexSwizzle : public IndexCompute {
public:
IndexSwizzle(
const TensorView* tv,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> zero_merged_in);
IndexSwizzle(
const TensorView* tv,
const TensorDomain* domain,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> zero_merged_in);
void run() override;
protected:
using IndexCompute::handle;
void handle(Expr* e) override;
void handle(Swizzle2D* swizzle_2d) override;
private:
const TensorView* tv_ = nullptr;
SwizzleType swizzle_type_ = SwizzleType::NoSwizzle;
std::vector<IterDomain*> ids_to_swizzle_;
std::unordered_set<IterDomain*> swizzled_ids_;
};
//! Predicate information of a root or contiguous merged domain
class RootPredicateInfo {
friend class Index;
public:
const auto& startPredicate() const {
return start_predicate_;
}
auto& startPredicate() {
return start_predicate_;
}
const auto& startOffset() const {
return start_offset_;
}
const auto& stopPredicate() const {
return stop_predicate_;
}
const auto& stopOffset() const {
return stop_offset_;
}
const auto& rootIds() const {
return root_ids_;
}
//! Return a false RootPredicateInfo, i.e., both start and stop
//! predicates are false.
static RootPredicateInfo getFalseInfo();
private:
// predicate for lower end
Bool* start_predicate_ = nullptr;
// predicate for upper end
Bool* stop_predicate_ = nullptr;
// Offset of the start predicate
Val* start_offset_ = nullptr;
// Offset of the stop predicate
Val* stop_offset_ = nullptr;
// Track which roots have been handled by the generated predicates
std::unordered_set<IterDomain*> root_ids_;
};
// Simple interface for IndexCompute
// If getComputeAtAxis and more generally TensorView const model is fixed, we
// can make the below tensorviews const.
class Index {
private:
// Producer indexing if it's in shared or local memory
static std::vector<Val*> getNonGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer indexing if it's in shared or local memory
static std::vector<Val*> getNonGlobalConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Producer if it's in global memory
static std::vector<Val*> getGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer indexing if it's in global memory
static std::vector<Val*> getGlobalConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// get the strides of a tensor used for the index lowering
static std::vector<Val*> getStrides(const TensorView* tv);
// get the root indices of a tensor used for the index lowering
static std::vector<Val*> getRootIndices(
const TensorView* tv,
const std::vector<kir::ForLoop*>& loops,
const IndexFromIdGraph& index_from_id_graph);
public:
// Indexing functions
// Consumer = Producer
// i.e. T0 = T1... -> T0 is the consumer, T1 is the producer
// Producer indexing dispatch
static kir::TensorIndex* getProducerIndex(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer index dispatch
static kir::TensorIndex* getConsumerIndex(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of strided indices mapped onto the (rfactor)
//! root domain of a producer tensor. The size of the returned
//! vector is guaranteed to be equal to the number of axes of the
//! indexing root domain.
static std::vector<Val*> getProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of strided indices mapped onto the (rfactor)
//! root domain of a consumer tensor. The size of the returned
//! vector is guaranteed to be equal to the number of axes of the
//! indexing root domain.
static std::vector<Val*> getConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns the logical index linearized from a multi-dimension address into a
//! linear memory address of a consumer tensor. The returned index is intended to
//! be used for the computation of some tensor factories, such as: arange and
//! rand (for Philox pseudo random sequences)
static std::vector<Val*> getLinearLogicalIndex(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of logical indices mapped onto the (rfactor)
//! root domain of a consumer tensor. The returned index is intended
//! to be used for the computation of some tensor factories, such as:
//! eye
static std::vector<Val*> getPerDimLogicalIndex(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops);
//! Takes a consumer tensorview and loop nest and generates predicates
//! associated with the concrete roots of the loop nest. Returns a list of
//! predicates, and a list of concrete roots they're associated with. It
//! is assumed that no predicate is required if index[i] is an index
//! directly from a for loop. This will not catch all cases if we actually
//! have static size information for example:
//!
//! TV[I].split(4)
//! would produce the code:
//! for(i : I/4)
//! for(j : 4)
//! if( i * 4 + j < TV.size(0))
//! TV[i * 4 + j]...
//!
//! However if we had TV.size[0] = 16 at "compile time" then we wouldn't
//! need the predicate. This will be caught by canOmitPredicate in the
//! predicate lowering
//!
//! unswitch_or_vec_loop is the for loop from which to start the unswitch-like
//! predicate. This is not a bool value because, if we have an unswitch loop
//! with a vectorized loop inside, we only want to base the "unswitch"-like
//! predicate on the vectorized loop.
static std::vector<RootPredicateInfo> getReferenceRootPredicates(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops,
kir::ForLoop* unswitch_or_vec_loop,
bool padding_predicate);
};
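// Illustrative lowering-side sketch (hypothetical helper, not part of the
// original header): for an expression T0 = T1 ..., T0 is the consumer and T1
// the producer, so indexing would be dispatched roughly as follows.
inline void indexConsumerAndProducerExample(
    TensorView* t1_producer,
    const TensorView* t0_consumer,
    const std::vector<kir::ForLoop*>& loops) {
  kir::TensorIndex* consumer_index = Index::getConsumerIndex(t0_consumer, loops);
  kir::TensorIndex* producer_index =
      Index::getProducerIndex(t1_producer, t0_consumer, loops);
  (void)consumer_index;
  (void)producer_index;
}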
// Used for local and shared index mapping. Returns a map from loops
// to loop indices as well as a set of loops that do not contribute to
// indexing.
// TODO: could be cleaned up further.
std::pair<
std::unordered_map<kir::ForLoop*, Val*>,
std::unordered_set<kir::ForLoop*>>
indexMapFromTV(
const TensorView* tv,
const std::vector<kir::ForLoop*>& loops,
kir::ForLoop* alloc_loop,
bool as_consumer,
kir::ForLoop* double_buffer_loop = nullptr);
//! Set "pragma unroll" required for loops that indexing of Local
//! tensors depends on.
//!
//! \param tv Indexed tensor
//! \param alloc_loop Allocation loop of tv
//! \param loops The current loop structure
//! \param id_map Producer-to-consumer map in case of indexing as producer
void ensureStaticIndexing(
const TensorView* tv,
kir::ForLoop* alloc_loop,
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, IterDomain*>& id_map = {});
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,306 +0,0 @@
#include <inlining.h>
#include <ir_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <utility>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
MaxPosCalculator::MaxPosCalculator(
const std::unordered_set<IterDomain*>& uninlinable_ids)
: uninlinable_ids_(uninlinable_ids) {
buildUnmappableDims();
}
void MaxPosCalculator::buildUnmappableDims() {
ComputeAtRootDomainMap root_map;
root_map.build();
auto all_tvs = ir_utils::allTvs(FusionGuard::getCurFusion());
for (auto tv : all_tvs) {
auto consumers = ir_utils::consumerTvsOf(tv);
for (auto consumer : consumers) {
// Grab dimensions in producer and consumer that are mappable to each other
// based on the computeAtRootDomainMap. This will tell us which dimensions
// can be inlined based on avoiding trying to inline non-trivial
// reduction structures.
auto mappable_roots =
root_map.getMappableDims(tv->domain(), consumer->domain());
for (auto tv_root_id : tv->getMaybeRFactorDomain()) {
if (mappable_roots.find(tv_root_id) == mappable_roots.end() &&
!tv_root_id->isTrivialReduction()) {
unmappable_dims_.emplace(tv_root_id);
}
}
}
}
}
bool MaxPosCalculator::isAllowedID(
IterDomain* id,
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const {
bool allowed = true;
if (!allow_reduction) {
allowed = allowed && !id->isReduction();
}
if (uninlinable_ids_.count(id)) {
return false;
}
if (!allow_vectorize) {
// Avoid inlining if marked as Vectorize or Group. In the case of
// BestEffort and MostInlined modes, avoid Unroll as well.
bool is_vectorize = isParallelTypeVectorize(id->getParallelType()) ||
id->getParallelType() == ParallelType::Group ||
(best_effort && id->getParallelType() == ParallelType::Unroll);
allowed = allowed && !is_vectorize;
}
if (!allow_unmappable) {
auto root_dom = tv->getMaybeRFactorDomain();
std::unordered_set<Val*> root_dom_set(root_dom.begin(), root_dom.end());
auto all_vals = DependencyCheck::getAllValsBetween(root_dom_set, {id});
bool is_unmappable = false;
for (auto val : all_vals) {
auto id = val->as<IterDomain>();
if (root_dom_set.count(val) > 0 && unmappable_dims_.count(id) > 0) {
is_unmappable = true;
break;
}
}
allowed = allowed && !is_unmappable;
}
return allowed;
}
size_t MaxPosCalculator::getMaxPosSelf(
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const {
auto dom = tv->domain()->domain();
auto iter = std::find_if(dom.begin(), dom.end(), [=](IterDomain* id) {
return !isAllowedID(
id,
tv,
best_effort,
allow_reduction,
allow_vectorize,
allow_unmappable);
});
return std::distance(dom.begin(), iter);
}
// Return the max position in producer that can be inlined to consumer
// Cannot inline:
// Vectorized dimensions in consumer
// Unrolled dimensions in consumer
size_t MaxPosCalculator::getMaxProducerPosFromConsumer(
TensorView* producer,
TensorView* consumer,
bool best_effort) const {
auto pairwise_root_map = PairwiseRootDomainMap(producer, consumer);
auto replay_CasP =
BestEffortReplay::replayCasP(consumer, producer, -1, pairwise_root_map);
auto p2c_replay_map = replay_CasP.getReplay();
for (size_t producer_pos = 0; producer_pos < producer->nDims();
producer_pos++) {
// If the producer position does not match the consumer, then we cannot
// inline into this position; otherwise the max producer position of
// the consumer will become invalid and expression sort will fail.
if (TransformReplay::getMatchedLeafPosWithoutReplayCasP(
consumer, producer, producer_pos + 1) < 0) {
return producer_pos;
}
auto map_it = p2c_replay_map.find(producer->axis(producer_pos));
if (map_it != p2c_replay_map.end()) {
auto c_id = map_it->second;
if (!isAllowedID(c_id, consumer, best_effort, true, false, true)) {
return producer_pos;
}
}
}
return producer->nDims();
}
size_t MaxPosCalculator::getMaxPosAll(
TensorView* tv,
bool best_effort,
bool check_siblings) {
auto max_pos = getMaxPosSelf(tv, best_effort, false, false, false);
for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
max_pos = std::min<size_t>(
max_pos, getMaxProducerPosFromConsumer(tv, consumer_tv, best_effort));
}
if (check_siblings) {
for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
max_pos = std::min<size_t>(
max_pos, getMaxPosAll(sibling_tv, best_effort, false));
}
}
return max_pos;
}
void inlineMost(const std::unordered_set<IterDomain*>& uninlinable_ids) {
inlineMost(ir_utils::allTvs(FusionGuard::getCurFusion()), uninlinable_ids);
}
void inlineMost(
const std::vector<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
if (tvs.empty()) {
return;
}
MaxPosCalculator calc(uninlinable_ids);
for (auto tv : tvs) {
tv->inlineAt(-1, true, &calc);
}
}
void inlineMost(
const std::unordered_set<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
if (tvs.empty()) {
return;
}
MaxPosCalculator calc(uninlinable_ids);
for (auto tv : tvs) {
tv->inlineAt(-1, true, &calc);
}
}
namespace {
// Find the positions in the selected tensors that are mapped to the given
// position in the reference tensor.
class FindMappedPositions : public MaxInfoSpanningTree::Propagator {
std::unordered_map<TensorView*, size_t>& output_;
public:
FindMappedPositions(
std::unordered_map<TensorView*, size_t>& output,
TensorView* reference,
int64_t reference_pos);
~FindMappedPositions() override = default;
virtual void propagateC2P(TensorView* from, TensorView* to) override;
virtual void propagateP2C(TensorView* from, TensorView* to) override;
virtual void propagateSibling(TensorView* from, TensorView* to) override;
};
FindMappedPositions::FindMappedPositions(
std::unordered_map<TensorView*, size_t>& output,
TensorView* reference,
int64_t reference_pos)
: output_(output) {
if (reference_pos < 0) {
reference_pos += int64_t(reference->nDims()) + 1;
}
TORCH_CHECK(
reference_pos >= 0 && reference_pos <= int64_t(reference->nDims()),
"Invalid axis received ",
reference_pos,
" but should be > -",
reference->nDims(),
" and <= ",
reference->nDims(),
".");
output_[reference] = reference_pos;
}
void FindMappedPositions::propagateC2P(TensorView* from, TensorView* to) {
int from_pos = output_.at(from);
auto to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
// If there is no matching position found, we compute the highest matched
// position as the closest approximation
while (to_pos < 0) {
from_pos--;
to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
}
output_[to] = to_pos;
}
void FindMappedPositions::propagateP2C(TensorView* from, TensorView* to) {
int from_pos = output_.at(from);
auto to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
// If there is no matching position found, we compute the highest matched
// position as the closest approximation
while (to_pos < 0) {
from_pos--;
to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
}
output_[to] = to_pos;
}
void FindMappedPositions::propagateSibling(TensorView* from, TensorView* to) {
auto from_pos = output_.at(from);
TORCH_CHECK(
TransformReplay::fullSelfMatching(to, from),
"Transformations in siblings ",
from,
" and ",
to,
" does not match with each other.");
output_[to] = from_pos;
}
std::unordered_map<TensorView*, size_t> getPositionsMappedTo(
TensorView* reference_tv,
int64_t reference_pos) {
std::unordered_map<TensorView*, size_t> mapped_positions;
MaxRootDomainInfoSpanningTree tree(reference_tv, reference_pos);
FindMappedPositions propagator(mapped_positions, reference_tv, reference_pos);
tree.traverse(&propagator);
return mapped_positions;
}
} // namespace
void inlineAllAt(
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
MaxPosCalculator calc(uninlinable_ids);
for (auto pair : mapped_positions) {
pair.first->inlineAt(pair.second, best_effort, &calc);
}
}
void inlineSelectedAt(
const std::unordered_set<TensorView*>& selected,
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
MaxPosCalculator calc(uninlinable_ids);
for (auto pair : mapped_positions) {
if (selected.count(pair.first) > 0) {
pair.first->inlineAt(pair.second, best_effort, &calc);
}
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,100 +0,0 @@
#pragma once
#include <ir_interface_nodes.h>
#include <maxinfo_propagator.h>
#include <transform_replay.h>
#include <memory>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class MaxPosCalculator {
// Root domains in a producer that are unmappable to any of its consumers
std::unordered_set<IterDomain*> unmappable_dims_;
// User set IterDomains to not inline, used in schedulers to avoid inlining
// trivial reductions
std::unordered_set<IterDomain*> uninlinable_ids_;
// Iterate through all TVs and collect the dimensions of each TV that don't
// map to all its consumer TVs.
void buildUnmappableDims();
// Utility function that returns whether an id of tv is a valid iter domain to inline
// within. This is used in getMaxPos{PasC,CasP}. Different variations of the
// bool values are used if checking max position of PasC, CasP, or checking
// for a max "self" position.
bool isAllowedID(
IterDomain* id,
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const;
public:
// Returns the maximum position up to which tv can be inlined.
size_t getMaxPosSelf(
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const;
// Returns the maximum position producer can be inlined based on consumer
// given the set ComputeAtMode
size_t getMaxProducerPosFromConsumer(
TensorView* producer,
TensorView* consumer,
bool best_effort) const;
// Checks producers, consumers, and siblings to see what the maximum position
// in tv is that can be shared across both directions.
size_t getMaxPosAll(
TensorView* tv,
bool best_effort = false,
bool check_siblings = true);
MaxPosCalculator(const std::unordered_set<IterDomain*>& uninlinable_ids = {});
};
// Inline to the rightmost allowed position for all tensors in the current
// fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the rightmost allowed position for the selected tensors in the
// current fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::vector<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the rightmost allowed position for the selected tensors in the
// current fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::unordered_set<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the position corresponding to the reference position in the
// reference tensor for all tensors in the current fusion.
TORCH_CUDA_CU_API void inlineAllAt(
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort = false,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the position corresponding to the reference position in the
// reference tensor for selected tensors in the current fusion.
TORCH_CUDA_CU_API void inlineSelectedAt(
const std::unordered_set<TensorView*>& selected,
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort = false,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
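// Illustrative scheduler-side sketch (hypothetical, not part of the original
// header): inline the whole fusion as deeply as allowed while keeping the
// reference tensor's innermost IterDomain uninlined, then inline everything
// mapped to position 2 of the reference on a best-effort basis. axis(-1) is
// assumed here to return the innermost IterDomain of the tensor.
inline void inlineFusionExample(TensorView* reference_tv) {
  std::unordered_set<IterDomain*> uninlinable = {reference_tv->axis(-1)};
  inlineMost(uninlinable);
  inlineAllAt(reference_tv, /*reference_pos=*/2, /*best_effort=*/true);
}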
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,76 +0,0 @@
#include <instrumentation.h>
#include <c10/macros/Export.h>
#ifdef _WIN32
#include <c10/util/win32-headers.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace inst {
Trace::Trace() {
const char* trace_filename = getenv("PYTORCH_NVFUSER_TRACE");
if (trace_filename != nullptr) {
log_file_ = fopen(trace_filename, "w");
TORCH_CHECK(log_file_ != nullptr, "Can't open trace file");
// Disable the file stream buffering, since it may result
// in torn writes in multi-threaded tracing
setbuf(log_file_, nullptr);
// Print the trace prologue
// (including a dummy TRACE_START event)
fprintf(log_file_, "{\n\"traceEvents\": [\n");
start_timestamp_ = Clock::now();
logEvent('I', "TRACE_START");
}
if (isOptionDisabled(DisableOption::Nvtx)) {
record_nvtx_range_ = false;
}
}
Trace::~Trace() {
if (log_file_ != nullptr) {
// Print trace epilogue
logEvent('I', "TRACE_END", ' ');
fprintf(log_file_, "],\n\"displayTimeUnit\": \"ms\"\n}\n");
fclose(log_file_);
}
}
void Trace::logEvent(char ph, const char* name, char sep) {
const std::chrono::duration<double> d = Clock::now() - start_timestamp_;
const double elapsed = d.count() * 1e6;
#ifdef _WIN32
const unsigned int pid = GetCurrentProcessId();
const unsigned int tid = GetCurrentThreadId();
#else
const unsigned int pid = getpid();
const unsigned int tid = std::hash<pthread_t>{}(pthread_self());
#endif // _WIN32
fprintf(
log_file_,
"{ \"name\": \"%s\", \"ph\": \"%c\", \"pid\": %u, \"tid\": %u, \"ts\": %.0f }%c\n",
name,
ph,
pid,
tid,
elapsed,
sep);
}
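// Example of a single record emitted by logEvent (all values illustrative):
//   { "name": "runFusion", "ph": "B", "pid": 1234, "tid": 5678, "ts": 1042 },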
} // namespace inst
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,105 +0,0 @@
#pragma once
#include <utils.h>
#include <nvToolsExt.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdio.h>
#include <chrono>
#include <cstdio>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace inst {
//! An optional record of selected timestamped operations, events and counters
//!
//! This class is not intended to be used directly. Instead, the operations
//! to be traced are marked (for example using the FUSER_PERF_SCOPE macro)
//!
//! In order to enable tracing, the `PYTORCH_NVFUSER_TRACE` environment
//! variable is set to point to a trace file (ex `test.trace`). The file name
//! may be a relative or an absolute path.
//!
//! The trace uses the Chrome Tracing (Catapult) format, which is a well
//! documented JSON based format supported by multiple tools:
//! https://chromium.googlesource.com/catapult/+/HEAD/tracing/README.md
//!
//! An easy way to view traces is to type `about://tracing` in Chrome or
//! Chromium.
//!
class TORCH_CUDA_CU_API Trace : public NonCopyable {
public:
using Clock = std::chrono::steady_clock;
public:
static Trace* instance() {
static Trace trace;
return &trace;
}
void beginEvent(const char* name) {
if (log_file_ != nullptr) {
logEvent('B', name);
}
if (record_nvtx_range_) {
nvtxRangePushA(name);
}
}
void endEvent(const char* name) {
if (record_nvtx_range_) {
nvtxRangePop();
}
if (log_file_ != nullptr) {
logEvent('E', name);
}
}
private:
Trace();
~Trace();
void logEvent(char ph, const char* name, char sep = ',');
private:
FILE* log_file_ = nullptr;
Clock::time_point start_timestamp_;
bool record_nvtx_range_ = true;
};
//! \internal Automatic scope for a perf marker
//! (normally used through the FUSER_PERF_SCOPE macro)
class TORCH_CUDA_CU_API TraceScope : public NonCopyable {
public:
explicit TraceScope(const char* event_name) : event_name_(event_name) {
Trace::instance()->beginEvent(event_name_);
}
~TraceScope() {
Trace::instance()->endEvent(event_name_);
}
private:
const char* event_name_ = nullptr;
};
#define FUSER_MACRO_CONCAT2(a, b) a##b
#define FUSER_MACRO_CONCAT(a, b) FUSER_MACRO_CONCAT2(a, b)
#define FUSER_ANONYMOUS(prefix) FUSER_MACRO_CONCAT(prefix, __COUNTER__)
//! Defines a scope we want to measure and record in a perf trace
//!
//! \param name The name of the scope, normally a simple string literal
//!
#define FUSER_PERF_SCOPE(name) \
torch::jit::fuser::cuda::inst::TraceScope FUSER_ANONYMOUS(_perf_scope_)(name)
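// Usage sketch (hypothetical function, not part of the original header): the
// scope is traced from construction to destruction of the TraceScope object,
// so the whole body below shows up as one event in the Chrome trace and as
// one NVTX range.
inline void tracedWorkExample() {
  FUSER_PERF_SCOPE("tracedWorkExample");
  // ... work to be measured goes here ...
}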
} // namespace inst
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,8 +0,0 @@
#pragma once
#include <ir_base_nodes.h>
#include <ir_interface_nodes.h>
#include <ir_internal_nodes.h>
// TODO: remove this once the Kernel IR split is complete
#include <kernel_ir.h>

View File

@ -1,378 +0,0 @@
#include <dispatch.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <kernel.h>
#include <kernel_ir.h>
#include <kernel_ir_dispatch.h>
#include <mutator.h>
#include <torch/csrc/jit/ir/ir.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
Statement::Statement(IrBuilderPasskey passkey) {
ir_container_ = passkey.ir_container_;
}
Statement::Statement(const Statement* src, IrCloner* ir_cloner) {
ir_container_ = ir_cloner->container();
}
void Statement::setName(IrContainerPasskey, StmtNameType name) {
name_ = name;
}
void Statement::setName(IrBuilderPasskey, StmtNameType name) {
name_ = name;
}
Val* Statement::asVal() {
TORCH_INTERNAL_ASSERT(isVal(), "Cannot cast to Val as this is not a Val.");
return this->as<Val>();
}
Expr* Statement::asExpr() {
TORCH_INTERNAL_ASSERT(isExpr(), "Cannot cast to Expr as this is not a Expr.");
return this->as<Expr>();
}
std::string Statement::toString() const {
std::stringstream ss;
IrPrinter ir_printer(ss);
ir_printer.handle(this);
return ss.str();
}
std::string Statement::toInlineString() const {
std::stringstream ss;
IrPrinter ir_printer(ss);
ir_printer.print_inline(this);
return ss.str();
}
Fusion* Statement::fusion() const {
TORCH_INTERNAL_ASSERT(
ir_container_->isA<Fusion>(), "Statement does not belong to a fusion.");
return ir_container_->as<Fusion>();
}
kir::Kernel* Statement::kernel() const {
TORCH_INTERNAL_ASSERT(
ir_container_->isA<kir::Kernel>(),
"Statement does not belong to a kernel.");
return ir_container_->as<kir::Kernel>();
}
// When we create a Val we immediately register it with the active fusion.
Val::Val(IrBuilderPasskey passkey, ValType _vtype, DataType _dtype)
: Statement(passkey), vtype_(_vtype), dtype_(_dtype) {}
// NOTE: we don't clone the definition_ and uses_ here
// since they may introduce cloning cycles. Instead, we copy
// the original pointers and we'll fix them up later as part of the
// Fusion copy. Neither definition_ nor uses_ is copied through
// this constructor, leaving them to be resolved by later stages
//
Val::Val(const Val* src, IrCloner* ir_cloner)
: Statement(src, ir_cloner), vtype_(src->vtype_), dtype_(src->dtype_) {}
const std::vector<Expr*>& Val::uses() const {
if (vtype_ == ValType::TensorView) {
if (!fusion()->isTVUseInfoValid() && !fusion()->isUpdatingTVUseInfo()) {
fusion()->resetTvUses();
}
}
return uses_;
}
// Converts the data type of TensorView or Scalar representing index
// values. The data type of the original input should be
// DataType::Index, but DataType::Int is also allowed as it is used
// for index expressions.
void Val::resolveIndexDtype() {
TORCH_INTERNAL_ASSERT(
vtype_ == ValType::TensorView || vtype_ == ValType::Scalar,
"Resolving index type is currently only supported on tensor view or scalar values. "
"Value type: ",
vtype_);
TORCH_INTERNAL_ASSERT(
dtype_ == DataType::Index || dtype_ == DataType::Int,
"Can only resolve index type if a Val has an Index or Int DataType. ",
"Data type: ",
dtype_);
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(),
"Index type can only be resolved at compile time.");
dtype_ = container()->as<kir::Kernel>()->indexType();
}
namespace {
// Traverse definition of all values involved in constructing the provided val.
// Check if all values involved are constant values, meaning the provided
// val is also a constant value.
class ConstCheck : private OptOutConstDispatch {
private:
bool is_const_ = true;
// Returns true if all Vals in the history of the provided Val are Ints. Since
// our expression evaluator doesn't support any type besides int, it's
// important to check that it is one.
bool is_int_ = true;
void handle(const Bool* b) final {
is_const_ = is_const_ && b->isConst();
}
void handle(const Double* d) final {
is_const_ = is_const_ && d->isConst();
}
void handle(const Int* i) final {
is_const_ = is_const_ && i->isConst();
}
void handle(const NamedScalar* ns) final {
is_const_ = is_const_ && false;
}
void handle(const Expr* expr) final {
for (auto inp : expr->inputs()) {
handle(inp);
}
}
void handle(const Val* val) final {
if (!val->isAnInt()) {
is_int_ = false;
}
if (val->definition() != nullptr) {
handle(val->definition());
} else {
OptOutConstDispatch::handle(val);
}
}
public:
static bool isConst(const Val* val) {
ConstCheck cc;
cc.handle(val);
return cc.is_const_;
}
static bool isConstInt(const Val* val) {
ConstCheck cc;
cc.handle(val);
return cc.is_const_ && cc.is_int_;
}
};
} // namespace
bool Val::isConstScalar() const {
if (!isScalar()) {
return false;
}
return ConstCheck::isConst(this);
}
bool Val::isConstInt() const {
return ConstCheck::isConst(this) && isAnInt();
}
int64_t Val::evaluateInt() {
TORCH_INTERNAL_ASSERT(
ConstCheck::isConst(this),
"Cannot get Int of not const values through IR nodes, must use runtime ExpressionEvaluator.");
if (this->as<Int>()->value().has_value()) {
return this->as<Int>()->value().value();
}
ExpressionEvaluator ee(fusion());
auto evaluated_val = ee.evaluate(this);
TORCH_INTERNAL_ASSERT(
evaluated_val.has_value(),
"Detected a const integer but failed to infer its value.");
return evaluated_val->as<int64_t>();
}
double Val::evaluateDouble() {
TORCH_INTERNAL_ASSERT(
ConstCheck::isConst(this),
"Cannot get Double of not const doubles through IR nodes, must use runtime ExpressionEvaluator.");
if (this->as<Double>()->value().has_value()) {
return this->as<Double>()->value().value();
}
ExpressionEvaluator ee(fusion());
auto evaluated_val = ee.evaluate(this);
TORCH_INTERNAL_ASSERT(
evaluated_val.has_value(),
"Detected a const integer but failed to infer its value.");
return evaluated_val->as<double>();
}
c10::optional<int64_t> Val::getInt() const {
if (isConstScalar() && isAnInt()) {
if (this->getValType() == ValType::Scalar) {
if (this->isA<Int>()) {
return this->as<Int>()->value();
}
}
}
return c10::nullopt;
}
c10::optional<double> Val::getDouble() const {
if (isConstScalar() && isADouble()) {
if (this->getValType() == ValType::Scalar) {
if (this->isA<Double>()) {
return this->as<Double>()->value();
}
}
}
return c10::nullopt;
}
bool Val::isZeroInt() const {
auto int_val = getInt();
return int_val.has_value() && int_val.value() == 0;
}
bool Val::isOneInt() const {
auto int_val = getInt();
return int_val.has_value() && int_val.value() == 1;
}
bool Val::isDefinitionType(ExprType expression_type) const {
if (definition() != nullptr) {
auto def_expr_type = definition()->getExprType();
if (def_expr_type.has_value() && def_expr_type.value() == expression_type) {
return true;
}
}
return false;
}
c10::optional<DataType> Val::getDataType() const {
TORCH_INTERNAL_ASSERT(
dtype_ != DataType::Null, "Value does not have a data type.");
return dtype_;
}
bool Val::isProducerOf(const Val* other) const {
TORCH_INTERNAL_ASSERT(other != nullptr);
TORCH_INTERNAL_ASSERT(container() == other->container());
if (definition() == nullptr) {
return false;
}
return std::any_of(
definition()->inputs().begin(),
definition()->inputs().end(),
[other](const Val* input) { return input == other; });
}
bool Val::isConsumerOf(const Val* other) const {
return other->isProducerOf(this);
}
// We don't register with the active fusion in Expr as this needs to be done
// after inputs and outputs are registered with the Expr
Expr::Expr(IrBuilderPasskey passkey, ExprType etype)
: Statement(passkey), etype_{etype} {}
Expr::Expr(const Expr* src, IrCloner* ir_cloner)
: Statement(src, ir_cloner),
etype_(src->etype_),
inputs_(ir_cloner->clone(src->inputs_)),
outputs_(ir_cloner->clone(src->outputs_)) {}
bool Expr::sameAs(const Statement* other) const {
if (this == other) {
return true;
}
if (!other->isA<Expr>()) {
return false;
}
const Expr* other_expr = other->as<Expr>();
if (getExprType() != other_expr->getExprType()) {
return false;
}
if (inputs().size() != other_expr->inputs().size() ||
outputs().size() != other_expr->outputs().size()) {
return false;
}
for (const auto i : c10::irange(inputs().size())) {
if (!input(i)->sameAs(other_expr->input(i))) {
return false;
}
}
return true;
}
kir::Predicate* Expr::predicate() const {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
return predicate_;
}
void Expr::setPredicate(kir::Predicate* predicate) {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
predicate_ = predicate;
}
Expr* Expr::withPredicate(kir::Predicate* predicate) {
auto result = shallowCopy();
result->setPredicate(predicate);
return result;
}
kir::Predicate* Expr::writePredicate() const {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
return write_predicate_;
}
void Expr::setWritePredicate(kir::Predicate* write_predicate) {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
write_predicate_ = write_predicate;
}
Expr* Expr::withWritePredicate(kir::Predicate* predicate) {
auto result = shallowCopy();
result->setWritePredicate(predicate);
return result;
}
void Expr::copyPredicatesFrom(const Expr* expr) {
if (container()->isA<kir::Kernel>()) {
predicate_ = expr->predicate_;
write_predicate_ = expr->write_predicate_;
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,524 +0,0 @@
#pragma once
#include <c10/core/ScalarType.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <type.h>
#include <utils.h>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <vector>
// TODO: Add more types (int32, int64)
// TODO: sameAs should have better logic to check against any type and return
// gracefully
/*
* This file defines the base IR structure. Any IR node in this system will
* inherit from one of the following classes: Statement, Expr, Val,
 * IrInputOutput. IR is any information that the code generation stack may need
 * for analysis. By analysis we're referring to anything done in response to a
 * user-facing call of this stack. This could be careful tracking of user calls,
 * and any transformation, including optimizing transformations, user-declared
 * transformations, and lowering the IR.
*/
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
using ValueId = int32_t;
using StmtNameType = unsigned int;
constexpr StmtNameType kInvalidStmName =
std::numeric_limits<unsigned int>::max();
class Fusion;
class FusionGuard;
class Expr;
class Val;
class UnaryOp;
class BinaryOp;
class RNGOp;
class IterDomain;
class IrCloner;
class IrContainer;
class IrBuilderPasskey;
class IrContainerPasskey;
namespace kir {
class Kernel;
class Predicate;
} // namespace kir
// Passkey for container to register names with statements
class ExprPasskey {
friend class Expr;
private:
explicit ExprPasskey() {}
};
TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept;
//! Statement is the highest level node representation. Everything that is
//! considered "IR" will be derived from this class at some point. Both Values
//! and Expr's are a Statement. If there will ever be any more fundamental
//! types, they will also derive from Statement.
//!
//! We use Statements to pass around nodes of unknown compile type. Therefore it
//! is also important for the design to have a dispatch system for a Statement:
//! basically, being able to succinctly traverse down the inheritance stack of
//! a Statement at runtime. This is currently implemented in dispatch.h
class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase {
friend void swap(Fusion&, Fusion&) noexcept;
friend void swap(IrContainer& a, IrContainer& b) noexcept;
public:
Statement() = delete;
// Cloning constructor
Statement(const Statement* src, IrCloner* ir_cloner);
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Statement*);
template <typename T>
static void constDispatch(T handler, const Statement* const);
template <typename T>
static void mutatorDispatch(T mutator, Statement*);
// Accessor functions to types. Vals always have a DataType, Exprs never do
virtual c10::optional<ValType> getValType() const {
return c10::nullopt;
}
virtual c10::optional<DataType> getDataType() const {
return c10::nullopt;
}
virtual c10::optional<ExprType> getExprType() const {
return c10::nullopt;
}
// Shortcut to figure out if it is a value/expression
bool isVal() const {
return getValType() != c10::nullopt;
}
bool isExpr() const {
return getExprType() != c10::nullopt;
}
// Make sure this is a Val and return it as a Val*
Val* asVal();
// Make sure this is an Expr and return it as an Expr*
Expr* asExpr();
// Return the fusion this statement belongs to
Fusion* fusion() const;
// Return the kernel this statement belongs to
kir::Kernel* kernel() const;
// Return the container this statement belongs to
IrContainer* container() const {
return ir_container_;
}
// Return the int that represents its name
StmtNameType name() const {
return name_;
}
// Set the statement's name. Typically the container will set the name;
// however, if we're dealing with cloning, IrBuilder will set the name. This
// maybe should come from IrCloner, but I didn't want to add another
// passkey.
void setName(IrContainerPasskey, StmtNameType name);
void setName(IrBuilderPasskey, StmtNameType name);
virtual bool sameType(const Statement* const other) {
if (isVal() && other->isVal())
return getValType().value() == other->getValType().value();
if (isExpr() && other->isExpr())
return getExprType().value() == other->getExprType().value();
return false;
}
// Return if this statement is the same as another statement
// TODO: should this run through dispatch on this and other?
virtual bool sameAs(const Statement* other) const {
return this == other;
}
std::string toString() const;
std::string toInlineString() const;
protected:
Statement(IrBuilderPasskey);
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
StmtNameType name_ = kInvalidStmName;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
IrContainer* ir_container_ = nullptr;
};
//! A Val represents a "value." These are objects, like tensors, scalars, and
//! memory locations, that are inputs and outputs of computations (represented
//! by Exprs, below)
//!
//! Vals are constant and unique and should always be passed
//! around as a pointer. Val can generally be thought of as representing any
//! type of data. Some examples: a constant size like a convolution filter width,
//! a runtime constant like batch normalization's momentum, a "symbolic" tensor
//! like one passed down from the JIT, or a memory buffer used in device code.
//!
//! Adding a Val:
//! Right now adding a Val is quite involved. Val's can be defined in ir.h or in
//! their own header file. The following is what is currently needed to add a
//! new Val:
//!
//! 1) Definition inheriting from Val
//! - Members must be private or protected
//! - Accessor functions for members
//! - Must call Val constructor, Val constructor registers with fusion
//! - Implementation of bool sameAs(...)
//! - Must implement a "cloning" constructor, ex.
//! Int::Int(const Int* src, IrCloner* ir_cloner)
//! 2) dispatch.h/.cpp must be updated to include dispatch of the new Val
//! 3) Default mutator function should be added to mutator.cpp
//! 4a) Printing functions should be added to ir_iostream.h/.cpp
//! 4b) Graphviz generation must be added to ir_graphviz.h/.cpp
//! 5) An enum value must be added to ValType in type.h
//! 6) A string entry must be added in val_type_string_map
//!
class TORCH_CUDA_CU_API Val : public Statement {
public:
explicit Val(
IrBuilderPasskey,
ValType _vtype,
DataType _dtype = DataType::Null);
Val(const Val* src, IrCloner* ir_cloner);
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Val*);
template <typename T>
static void constDispatch(T handler, const Val* const);
template <typename T>
static void mutatorDispatch(T mutator, Val*);
c10::optional<ValType> getValType() const override {
return vtype_;
}
ValType vtype() const {
return vtype_;
}
DataType dtype() const {
return dtype_;
}
// Throws if no DataType is found. Vals must have a DataType
c10::optional<DataType> getDataType() const override;
bool isScalar() const {
return vtype_ == ValType::Scalar || vtype_ == ValType::NamedScalar;
}
// Returns if all dependencies are constant scalars
bool isConstScalar() const;
// Returns if all dependencies are constant integers
bool isConstInt() const;
bool isAnInt() const {
return isScalar() && dtype_ == DataType::Int;
}
bool isADouble() const {
return isScalar() && dtype_ == DataType::Double;
}
// If this Val is an integer with a direct constant value associated with it,
// will return the value of that constant integer. If this integer has
// defining expressions it will return c10::nullopt. Those values should be
// inferred using evaluateInt.
c10::optional<int64_t> getInt() const;
// If this Val is a double with a direct constant value associated with it,
// will return the value of that constant double. If this double has
// defining expressions it will return c10::nullopt. Those values should be
// inferred using evaluateDouble.
c10::optional<double> getDouble() const;
// If this Val is a constant integer, and its history is comprised only of
// constant values, will return the value of that constant integer. Cannot
// make constant as expression evaluator takes non-constant Vals.
int64_t evaluateInt();
// If this Val is a constant double, and its history is comprised only of
// constant values, will return the value of that constant double. Cannot
// make constant as expression evaluator takes non-constant Vals.
double evaluateDouble();
// Returns if no dependencies and is a constant scalar.
virtual bool isConst() const {
return false;
}
bool isZeroInt() const;
bool isOneInt() const;
// Returns the Expr that this value is an output of, returns nullptr if none
// was found
Expr* definition() const {
if (is_fusion_input_) {
return nullptr;
}
return definition_;
}
// Determine if value definition matches given expression type
bool isDefinitionType(ExprType expression_type) const;
const std::vector<Expr*>& uses() const;
bool isFusionInput() const {
return is_fusion_input_;
}
bool isFusionOutput() const {
return is_fusion_output_;
}
//! Returns true when other is a producer of this
bool isProducerOf(const Val* other) const;
//! Returns true when other is a consumer of this
bool isConsumerOf(const Val* other) const;
bool sameType(const Statement* other) override {
return Statement::sameType(other) &&
getDataType() == other->as<Val>()->getDataType();
}
// TODO: Make this more sophisticated. A value being the same as another value
// should be evaluated based on the DAG that created it, and that DAG's leaf
// nodes
bool sameAs(const Statement* other) const override {
return this == other;
}
void setEvaluatorIndex(int to) {
TORCH_INTERNAL_ASSERT(evaluator_index_ == -1);
evaluator_index_ = to;
}
int evaluatorIndex() const {
return evaluator_index_;
}
// Following is managed by Fusion (or kirIrBuilder) and can change.
// TODO: Protect with a passkey.
void setDefinition(Expr* expr) {
definition_ = expr;
}
void resolveIndexDtype();
protected:
friend Fusion;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
const ValType vtype_;
// TODO: Add fusion passkey for this
void setIsFusionInput(bool is_fusion_input) {
is_fusion_input_ = is_fusion_input;
}
// TODO: Add fusion passkey for this
void setIsFusionOutput(bool is_fusion_output) {
is_fusion_output_ = is_fusion_output;
}
// TODO: Add fusion or container passkey for this
void setUses(const std::vector<Expr*>& uses) {
uses_ = uses;
}
private:
// There's only one instance where dtype can change, and that's through
// resolving the index data type from nvfuser to either Int or Int32 for
// welford operations.
DataType dtype_;
// Following is managed by Fusion and can change.
bool is_fusion_input_ = false;
bool is_fusion_output_ = false;
Expr* definition_ = nullptr;
std::vector<Expr*> uses_;
// Expr evaluator idx;
int evaluator_index_ = -1;
};
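// A minimal, hypothetical sketch of step 1 of the "Adding a Val" notes above
// (the class name and semantics are invented; the dispatch, mutator, printing,
// graphviz, and type.h updates from the remaining steps are omitted).
// Constructor bodies would live in a .cpp file, as for the other Val
// subclasses.
class HypotheticalFlag : public Val {
 public:
  // Must call the Val constructor, which registers the node with the fusion.
  explicit HypotheticalFlag(IrBuilderPasskey passkey);
  // Cloning constructor, needed so IrCloner can copy the node.
  HypotheticalFlag(const HypotheticalFlag* src, IrCloner* ir_cloner);
  // Two flags are only considered the same if they are the same node.
  bool sameAs(const Statement* other) const override {
    return this == other;
  }
};
// Instances would then be created through the builder, e.g.
//   auto* flag = IrBuilder::create<HypotheticalFlag>();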
//! A Expr represents a "computation." These are functions that takes inputs
//! and produce outputs, inputs and outputs all being Vals. There are
//! specializations of BinaryOp which takes 2 inputs and produces 1 output, and
//! UnaryOp which takes 1 input and produces 1 output. Exprs are unique and
//! immutable. Conceptually, Exprs could always be manipulated using unique
//! pointers, and we could add this later. However, for now Exprs can be
//! replaced in a fusion, but they cannot be modified in place.
//!
//! The IR is static single assignment (SSA). Values can only be defined as an
//! output of an Expr once. If they are re-defined the original definition is
//! deleted from the program, as opposed to an ordered redefinition of the
//! value in the program.
//!
//! Note: Registering an Expr with a Fusion is actually 2 parts, one part is
//! done in the Expr constructor, so that should be called on anything that
//! inherits Expr. The issue with having registration in Expr's constructor is
//! that the constructor of an Expr will set outputs and inputs. This
//! information is important for registration with Fuser, so it can track the
//! dependency chain.
//!
//! Adding an Expr:
//! Right now adding an Expr is quite involved. Expr's can be defined in ir.h
//! or in their own header file. The following is what is currently needed for
//! Expr definitions:
//!
//! 1) Definition inheriting from Expr.
//! - Members must be private or protected
//! - Accessor functions for members
//! - Constructors need to register with the Fusion after inputs/outputs
//! are defined
//! - Implementation of bool sameAs(...)
//! 2) dispatch.h/.cpp must be updated to include dispatch of the new Val
//! 3) Default mutator function should be added to mutator.h/.cpp
//! 4) Printing functions should be added to ir_iostream.h/.cpp
//! 5) Lower case convenience functions should be added to arith.h/.cpp (If
//! user facing)
//! 6) An enum value must be added to ExprType in type.h
//! 7) A string entry must be added in expr_type_string_map
//! 8) Entry added to ir_graphviz .cpp/.h
//!
class TORCH_CUDA_CU_API Expr : public Statement {
public:
explicit Expr(IrBuilderPasskey, ExprType type);
Expr(const Expr* src, IrCloner* ir_cloner);
// Creates a new instance of the expression with all its field copied.
// Note that unlike IrCloner, this function only does a shallow copy
virtual Expr* shallowCopy() const = 0;
c10::optional<ExprType> getExprType() const override {
return etype_;
}
ExprType etype() const {
return etype_;
}
bool sameAs(const Statement* other) const override;
// Input/output accessors
const auto& inputs() const {
return inputs_;
}
const auto& outputs() const {
return outputs_;
}
auto input(size_t index) const {
return inputs_[index];
}
auto output(size_t index) const {
return outputs_[index];
}
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Expr*);
template <typename T>
static void constDispatch(T handler, const Expr* const);
template <typename T>
static void mutatorDispatch(T mutator, Expr*);
// TODO: Protect based on being in kernel container
kir::Predicate* predicate() const;
// Creates a shallow copy of the expression with the given predicate attached.
// TODO: Protect based on being in kernel container
Expr* withPredicate(kir::Predicate* predicate);
// TODO: Protect based on being in kernel container
kir::Predicate* writePredicate() const;
// Creates a shallow copy of the expression with the given write-predicate
// attached.
// TODO: Protect based on being in kernel container
Expr* withWritePredicate(kir::Predicate* write_predicate);
protected:
// TODO: Protect based on being in kernel container
void setPredicate(kir::Predicate* predicate);
// TODO: Protect based on being in kernel container
void setWritePredicate(kir::Predicate* write_predicate);
void copyPredicatesFrom(const Expr* expr);
// TODO: Add Fusion passkey
void addInput(Val* input) {
TORCH_INTERNAL_ASSERT(input != nullptr);
inputs_.push_back(input);
}
// TODO: Add Fusion passkey
void addOutput(Val* output) {
TORCH_INTERNAL_ASSERT(output != nullptr);
outputs_.push_back(output);
}
ExprPasskey exprPasskey() {
return ExprPasskey();
}
private:
ExprType etype_ = ExprType::Invalid;
std::vector<Val*> inputs_;
std::vector<Val*> outputs_;
kir::Predicate* predicate_ = nullptr;
// Only used for reduction-related expressions
kir::Predicate* write_predicate_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,471 +0,0 @@
#include <fusion.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <kernel.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Clone an IR node, forwarding the arguments to the IrCloner constructor.
template <class T>
T* IrBuilder::clone(const T* src, IrCloner* ir_cloner) {
TORCH_INTERNAL_ASSERT(
ir_cloner != nullptr,
"Cannot use create when a cloner object is set. Use clone.");
TORCH_INTERNAL_ASSERT(
ir_cloner->container() != nullptr,
"Cloner doesn't have a valid container to store cloned object.");
T* dest = new T(src, ir_cloner);
const Statement* src_stmt = dynamic_cast<const Statement*>(src);
Statement* dest_stmt = dynamic_cast<Statement*>(dest);
auto dest_container = ir_cloner->container();
auto src_container = src_stmt->container();
dest_container->registerStmt(IrBuilderPasskey(dest_container), dest_stmt);
if (src_container != dest_container) {
dest_stmt->setName(IrBuilderPasskey(dest_container), src_stmt->name());
}
ir_cloner->registerClone(src_stmt, dest_stmt);
return dest;
}
#define IR_BUILDER_INSTANTIATE(T) \
template T* IrBuilder::clone(const T* src, IrCloner* ir_cloner);
// Vals
IR_BUILDER_INSTANTIATE(IterDomain)
IR_BUILDER_INSTANTIATE(TensorDomain)
IR_BUILDER_INSTANTIATE(TensorView)
IR_BUILDER_INSTANTIATE(Bool)
IR_BUILDER_INSTANTIATE(Double)
IR_BUILDER_INSTANTIATE(Int)
IR_BUILDER_INSTANTIATE(ComplexDouble)
IR_BUILDER_INSTANTIATE(NamedScalar)
// Exprs
IR_BUILDER_INSTANTIATE(Split)
IR_BUILDER_INSTANTIATE(Merge)
IR_BUILDER_INSTANTIATE(Swizzle2D)
IR_BUILDER_INSTANTIATE(TransposeOp)
IR_BUILDER_INSTANTIATE(ExpandOp)
IR_BUILDER_INSTANTIATE(ShiftOp)
IR_BUILDER_INSTANTIATE(GatherOp)
IR_BUILDER_INSTANTIATE(ViewAsScalar)
IR_BUILDER_INSTANTIATE(ViewOp)
IR_BUILDER_INSTANTIATE(FullOp)
IR_BUILDER_INSTANTIATE(ARangeOp)
IR_BUILDER_INSTANTIATE(EyeOp)
IR_BUILDER_INSTANTIATE(UnaryOp)
IR_BUILDER_INSTANTIATE(BinaryOp)
IR_BUILDER_INSTANTIATE(TernaryOp)
IR_BUILDER_INSTANTIATE(RNGOp)
IR_BUILDER_INSTANTIATE(ReductionOp)
IR_BUILDER_INSTANTIATE(GroupedReductionOp)
IR_BUILDER_INSTANTIATE(WelfordOp)
IR_BUILDER_INSTANTIATE(LoadStoreOp)
IR_BUILDER_INSTANTIATE(MmaOp)
IR_BUILDER_INSTANTIATE(BroadcastOp)
Val* IrBuilder::newResult(DataType dtype) {
switch (dtype) {
case DataType::Bool:
return IrBuilder::create<Bool>(c10::nullopt);
case DataType::Double:
return IrBuilder::create<Double>(c10::nullopt);
case DataType::Int:
return IrBuilder::create<Int>(c10::nullopt);
default:
TORCH_CHECK(false, "Unexpected data type");
}
}
Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) {
TORCH_CHECK(
lhs != nullptr && rhs != nullptr,
"Either lhs or rhs is a nullptr in newArithmeticExpr.");
TORCH_CHECK(
lhs->dtype() == rhs->dtype(),
"Incompatible operand types: ",
lhs->dtype(),
" and ",
rhs->dtype());
auto result = newResult(lhs->dtype());
IrBuilder::create<BinaryOp>(op_type, result, lhs, rhs);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
return result;
}
Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) {
TORCH_CHECK(
lhs != nullptr && rhs != nullptr,
"Either lhs or rhs is a nullptr in newLogicExpr.");
auto result = IrBuilder::create<Bool>(c10::nullopt);
IrBuilder::create<BinaryOp>(op_type, result, lhs, rhs);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
return result;
}
Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) {
TORCH_CHECK(
pred != nullptr && lhs != nullptr && rhs != nullptr,
"Either pred, lhs, or rhs is a nullptr in whereExpr.");
TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types");
auto result = newResult(lhs->dtype());
IrBuilder::create<TernaryOp>(TernaryOpType::Where, result, pred, lhs, rhs);
return result;
}
Val* IrBuilder::negExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in negExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Neg, result, val);
return result;
}
Val* IrBuilder::notExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in notExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Not, result, val);
return result;
}
Val* IrBuilder::setExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in setExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Set, result, val);
return result;
}
Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in setExprNamedScalar.");
auto result = IrBuilder::create<NamedScalar>(name, val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Set, result, val);
return result;
}
Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in addressExprNamedScalar.");
auto result = IrBuilder::create<NamedScalar>(name, DataType::Int);
IrBuilder::create<UnaryOp>(UnaryOpType::Address, result, val);
return result;
}
Val* IrBuilder::andExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::And, lhs, rhs);
}
Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::Eq, lhs, rhs);
}
Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::GT, lhs, rhs);
}
Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::LT, lhs, rhs);
}
Val* IrBuilder::leExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::LE, lhs, rhs);
}
Val* IrBuilder::geExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::GE, lhs, rhs);
}
Val* IrBuilder::addExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Add, lhs, rhs);
}
Val* IrBuilder::subExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs);
}
Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs);
}
Val* IrBuilder::divExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Div, lhs, rhs);
}
Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, rhs);
}
Val* IrBuilder::modExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs);
}
Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Max, lhs, rhs);
}
Val* IrBuilder::minExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Min, lhs, rhs);
}
Val* IrBuilder::swizzle2DIntExpr(
Val* in_x,
Val* in_y,
Val* extent_x,
Val* extent_y,
Swizzle2DType swizzle_type) {
auto result = create<kir::IntPair>();
create<kir::Swizzle2DInt>(
result, in_x, in_y, extent_x, extent_y, swizzle_type);
return result;
}
Val* IrBuilder::pairSelectExpr(Val* in, kir::PairSelect::Selection sel) {
auto int_pair = dynamic_cast<kir::IntPair*>(in);
TORCH_INTERNAL_ASSERT(int_pair != nullptr);
auto result = create<Int>();
create<kir::PairSelect>(result, int_pair, sel);
return result;
}
Val* SimplifyingIrBuilder::negExpr(Val* val) {
if (auto int_val = dynamic_cast<Int*>(val)) {
if (int_val->isConst()) {
return IrBuilder::create<Int>(-int_val->value().value());
}
}
return IrBuilder::negExpr(val);
}
Val* SimplifyingIrBuilder::notExpr(Val* val) {
if (auto bool_val = dynamic_cast<Bool*>(val)) {
if (bool_val->isConst()) {
if (bool_val->value().value()) {
return FusionGuard::getCurFusion()->falseVal();
} else {
return FusionGuard::getCurFusion()->trueVal();
}
}
}
return IrBuilder::notExpr(val);
}
Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) {
if (rhs == 0) {
return lhs;
} else if (lhs == nullptr) {
return IrBuilder::create<Int>(rhs);
} else if (lhs->isConst()) {
return IrBuilder::create<Int>(lhs->value().value() + rhs);
} else if (rhs > 0) {
return IrBuilder::addExpr(lhs, IrBuilder::create<Int>(rhs));
} else {
return IrBuilder::subExpr(lhs, IrBuilder::create<Int>(-rhs));
}
}
Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst()) {
return addExpr(rhs, lhs->value().value());
} else if (rhs->isConst()) {
return addExpr(lhs, rhs->value().value());
} else {
return IrBuilder::addExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr || lhs->isZeroInt()) {
return rhs;
} else if (rhs == nullptr || rhs->isZeroInt()) {
return lhs;
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return addExpr(lhs_int, rhs_int);
} else {
return IrBuilder::addExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::addExpr(Val* lhs, Int::ScalarType rhs) {
auto lhs_int = dynamic_cast<Int*>(lhs);
if (lhs_int != nullptr) {
return addExpr(lhs_int, rhs);
} else {
return addExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) {
return addExpr(lhs, negExpr(rhs));
}
Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int::ScalarType rhs) {
if (rhs == 0) {
return lhs->container()->zeroVal();
} else if (rhs == 1) {
return lhs;
} else if (lhs == nullptr) {
return IrBuilder::create<Int>(rhs);
} else if (lhs->isConst()) {
return IrBuilder::create<Int>(lhs->value().value() * rhs);
} else {
return IrBuilder::mulExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Int::ScalarType rhs) {
auto lhs_int = dynamic_cast<Int*>(lhs);
if (lhs_int != nullptr) {
return mulExpr(lhs_int, rhs);
} else {
return IrBuilder::mulExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int* rhs) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst()) {
return mulExpr(rhs, lhs->value().value());
} else if (rhs->isConst()) {
return mulExpr(lhs, rhs->value().value());
} else {
return IrBuilder::mulExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr || lhs->isOneInt()) {
return rhs;
} else if (rhs == nullptr || rhs->isOneInt()) {
return lhs;
} else if (lhs->isZeroInt() || rhs->isZeroInt()) {
return lhs->container()->zeroVal();
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return mulExpr(lhs_int, rhs_int);
} else {
return IrBuilder::mulExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr));
if (lhs == nullptr) {
return rhs;
} else if (rhs == nullptr) {
return lhs;
}
bool lhs_definitely_true = false;
bool lhs_definitely_false = false;
auto lhs_bool = dynamic_cast<Bool*>(lhs);
if (lhs_bool && lhs_bool->isConst()) {
lhs_definitely_true = lhs_bool->value().value();
lhs_definitely_false = !lhs_bool->value().value();
}
auto rhs_bool = dynamic_cast<Bool*>(rhs);
bool rhs_definitely_true = false;
bool rhs_definitely_false = false;
if (rhs_bool && rhs_bool->isConst()) {
rhs_definitely_true = rhs_bool->value().value();
rhs_definitely_false = !rhs_bool->value().value();
}
if (lhs_definitely_true && rhs_definitely_true) {
return FusionGuard::getCurFusion()->trueVal();
} else if (lhs_definitely_false || rhs_definitely_false) {
return FusionGuard::getCurFusion()->falseVal();
} else if (lhs_definitely_true) {
return rhs;
} else if (rhs_definitely_true) {
return lhs;
}
return IrBuilder::andExpr(lhs, rhs);
}
namespace {
template <typename IrBuilderFunc, typename IntFunc>
Val* minOrMaxExpr(
Int* lhs,
Int* rhs,
IrBuilderFunc ir_builder_func,
IntFunc int_func) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst() && rhs->isConst()) {
return IrBuilder::create<Int>(
int_func(lhs->value().value(), rhs->value().value()));
} else {
return ir_builder_func(lhs, rhs);
}
}
template <typename IrBuilderFunc, typename IntFunc>
Val* minOrMaxExpr(
Val* lhs,
Val* rhs,
IrBuilderFunc ir_builder_func,
IntFunc int_func) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr) {
return rhs;
} else if (rhs == nullptr || lhs == rhs) {
return lhs;
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return minOrMaxExpr(lhs_int, rhs_int, ir_builder_func, int_func);
} else {
return ir_builder_func(lhs, rhs);
}
}
} // namespace
Val* SimplifyingIrBuilder::maxExpr(Val* lhs, Val* rhs) {
return minOrMaxExpr(
lhs,
rhs,
[](Val* lhs, Val* rhs) { return IrBuilder::maxExpr(lhs, rhs); },
[](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); });
}
Val* SimplifyingIrBuilder::minExpr(Val* lhs, Val* rhs) {
return minOrMaxExpr(
lhs,
rhs,
[](Val* lhs, Val* rhs) { return IrBuilder::minExpr(lhs, rhs); },
[](int64_t lhs, int64_t rhs) { return std::min(lhs, rhs); });
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
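
As a concrete illustration of the simplification above, the following sketch shows the two main behaviors of SimplifyingIrBuilder::addExpr: constant operands are folded eagerly, and additive identities are dropped. This is not part of the deleted file; it assumes the usual Fusion/FusionGuard setup used elsewhere in this codebase.

// Illustrative sketch only (assumes Fusion/FusionGuard as used in this codebase).
void simplifyingBuilderExample() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1 + 2 is folded directly into a constant Int(3); no BinaryOp is created.
  Val* three = SimplifyingIrBuilder::addExpr(
      IrBuilder::create<Int>(1), IrBuilder::create<Int>(2));

  // i + 0 returns the symbolic operand unchanged.
  Int* i = IrBuilder::create<Int>();
  Val* same = SimplifyingIrBuilder::addExpr(i, IrBuilder::create<Int>(0));

  TORCH_INTERNAL_ASSERT(same == i);
  (void)three;
}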

View File

@ -1,140 +0,0 @@
#pragma once
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_container.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
class Kernel;
}
class IrCloner;
// Passkey for builder to register properties with statements, and to call
// functions in IrContainer
class TORCH_CUDA_CU_API IrBuilderPasskey {
friend class IrBuilder;
public:
// TODO: Collapse ir_container and Kernel once Kernel inherits from
// IrContainer
IrContainer* const ir_container_ = nullptr;
private:
explicit IrBuilderPasskey(IrContainer* ir_container);
};
//! IR builder interface
class TORCH_CUDA_CU_API IrBuilder {
public:
//! Allocate a new IR node, forwarding the arguments to the appropriate
//! constructor and registering with the container
template <class T, class... Args>
static T* create(Args&&... args) {
auto container = FusionGuard::getCurFusion();
// return create<T>(container, std::forward<Args>(args)...);
TORCH_INTERNAL_ASSERT(
container != nullptr, "Need an active container to build IR.");
T* node = new T(IrBuilderPasskey(container), std::forward<Args>(args)...);
container->registerStmt(IrBuilderPasskey(container), node);
return node;
}
//! Allocate a new IR node, forwarding the arguments to the appropriate
//! constructor and registering with the container
template <class T, class... Args>
static T* create(IrContainer* container, Args&&... args) {
TORCH_INTERNAL_ASSERT(
container != nullptr, "Need an active container to build IR.");
T* node = new T(IrBuilderPasskey(container), std::forward<Args>(args)...);
container->registerStmt(IrBuilderPasskey(container), node);
return node;
}
//! Clone an IR node, forwarding the arguments to the IrCloner constructor.
//! Register clones with IrCloner's target container.
template <class T>
static T* clone(const T* src, IrCloner* ir_cloner);
// Unary operations
static Val* negExpr(Val* val);
static Val* notExpr(Val* val);
static Val* setExpr(Val* val);
static Val* setExprNamedScalar(const std::string& name, Val* val);
static Val* addressExprNamedScalar(const std::string& name, Val* val);
// Binary operations
static Val* andExpr(Val* lhs, Val* rhs);
static Val* eqExpr(Val* lhs, Val* rhs);
static Val* gtExpr(Val* lhs, Val* rhs);
static Val* ltExpr(Val* lhs, Val* rhs);
static Val* leExpr(Val* lhs, Val* rhs);
static Val* geExpr(Val* lhs, Val* rhs);
static Val* addExpr(Val* lhs, Val* rhs);
static Val* subExpr(Val* lhs, Val* rhs);
static Val* mulExpr(Val* lhs, Val* rhs);
static Val* divExpr(Val* lhs, Val* rhs);
static Val* ceilDivExpr(Val* lhs, Val* rhs);
static Val* modExpr(Val* lhs, Val* rhs);
static Val* maxExpr(Val* lhs, Val* rhs);
static Val* minExpr(Val* lhs, Val* rhs);
// Ternary operations
static Val* whereExpr(Val* pred, Val* lhs, Val* rhs);
// Swizzle operations
static Val* swizzle2DIntExpr(
Val* x,
Val* y,
Val* extent_x,
Val* extent_y,
Swizzle2DType swizzle_type);
static Val* pairSelectExpr(Val* in, kir::PairSelect::Selection sel);
private:
static Val* newResult(DataType dtype);
static Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs);
static Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs);
};
//! A wrapper builder with static expression simplification
//!
//! Example:
//! - addExpr(new Int(1), new Int(2)) -> Int(3)
//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo")
//!
//! Designed to be used to simplify predicate and index expressions in
//! generated code. Also, the shift validation may fail without
//! this simplification.
class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder {
public:
static Val* negExpr(Val* val);
static Val* notExpr(Val* val);
static Val* addExpr(Int* lhs, Int::ScalarType rhs);
static Val* addExpr(Val* lhs, Int::ScalarType rhs);
static Val* addExpr(Int* lhs, Int* rhs);
static Val* addExpr(Val* lhs, Val* rhs);
static Val* subExpr(Val* lhs, Val* rhs);
static Val* mulExpr(Int* lhs, Int::ScalarType rhs);
static Val* mulExpr(Val* lhs, Int::ScalarType rhs);
static Val* mulExpr(Int* lhs, Int* rhs);
static Val* mulExpr(Val* lhs, Val* rhs);
static Val* andExpr(Val* lhs, Val* rhs);
static Val* maxExpr(Val* lhs, Val* rhs);
static Val* minExpr(Val* lhs, Val* rhs);
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
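
A minimal sketch of the two create() overloads declared above: the first relies on the container installed by an active FusionGuard, the second registers into an explicitly provided container. Illustrative only; the function name and parameters are not part of the original header.

// Illustrative sketch of the two registration paths.
void irBuilderCreateExample(Fusion* fusion, IrContainer* container) {
  // Path 1: FusionGuard supplies the active container.
  FusionGuard fg(fusion);
  Int* a = IrBuilder::create<Int>(3);

  // Path 2: register into an explicit container, independent of FusionGuard
  // (this is how IrContainer builds its cached zero/one/true/false values).
  Int* b = IrBuilder::create<Int>(container, 5);

  (void)a;
  (void)b;
}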

View File

@ -1,242 +0,0 @@
#include <ir_cloner.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
IrCloner::IrCloner(IrContainer* container) : ir_container_(container) {}
Statement* IrCloner::clone(const Statement* statement) {
if (statement == nullptr) {
return nullptr;
}
// Have we already cloned this node?
const auto it = clones_map_.find(statement);
if (it != clones_map_.end()) {
return it->second;
} else {
// Clone the new node, saving/restoring this->clone_
// since the cloning can be reentrant
auto saved_clone = clone_;
handle(statement);
auto new_node = clone_;
clone_ = saved_clone;
// The base cloning constructor (Statement) should have
// registered the new node. Failure to do so indicates
// that something went horribly wrong.
TORCH_INTERNAL_ASSERT(new_node != nullptr);
TORCH_INTERNAL_ASSERT(clones_map_[statement] == new_node);
return new_node;
}
}
void IrCloner::registerClone(const Statement* src, Statement* clone) {
TORCH_CHECK(src != nullptr);
TORCH_CHECK(clone != nullptr);
TORCH_CHECK(clones_map_.insert({src, clone}).second);
}
void IrCloner::handle(const Statement* s) {
OptInConstDispatch::handle(s);
}
void IrCloner::handle(const Val* v) {
OptInConstDispatch::handle(v);
}
void IrCloner::handle(const Expr* e) {
OptInConstDispatch::handle(e);
}
void IrCloner::handle(const TensorDomain* td) {
clone_ = IrBuilder::clone(td, this);
}
void IrCloner::handle(const IterDomain* id) {
clone_ = IrBuilder::clone(id, this);
}
void IrCloner::handle(const Bool* b) {
clone_ = IrBuilder::clone(b, this);
}
void IrCloner::handle(const Double* d) {
clone_ = IrBuilder::clone(d, this);
}
void IrCloner::handle(const Int* i) {
clone_ = IrBuilder::clone(i, this);
}
void IrCloner::handle(const ComplexDouble* c) {
clone_ = IrBuilder::clone(c, this);
}
void IrCloner::handle(const NamedScalar* named_scalar) {
clone_ = IrBuilder::clone(named_scalar, this);
}
void IrCloner::handle(const TensorView* tv) {
clone_ = IrBuilder::clone(tv, this);
}
void IrCloner::handle(const FullOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ARangeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const EyeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const UnaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const BinaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const TernaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const RNGOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const BroadcastOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ReductionOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const GroupedReductionOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const WelfordOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const LoadStoreOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const MmaOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const TransposeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ExpandOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ShiftOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const GatherOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ViewAsScalar* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ViewOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const Split* split) {
clone_ = IrBuilder::clone(split, this);
}
void IrCloner::handle(const Merge* merge) {
clone_ = IrBuilder::clone(merge, this);
}
void IrCloner::handle(const Swizzle2D* swizzle) {
clone_ = IrBuilder::clone(swizzle, this);
}
TensorView* RecomputeTv::recompute(TensorView* tv) {
FusionGuard fg(tv->fusion());
// Disallow recomputation of inputs or outputs. The user would have to be aware
// of these changes and be informed that they happened somehow.
TORCH_INTERNAL_ASSERT(
!tv->isFusionInput(),
"Cannot recompute buffers that are inputs of the fusion.");
// Grab all the expressions used to generate the TensorView
auto exprs = StmtSort::getExprs(tv->fusion(), {tv}, false);
// Run the replicator
RecomputeTv replicator(tv->fusion(), exprs);
// Make const version of pointer for lookup
const auto const_tv = tv;
// Find the recomputed tensor from the cloner
auto clone_it = replicator.clones_map_.find(const_tv);
TORCH_INTERNAL_ASSERT(clone_it != replicator.clones_map_.end());
auto cloned_val = clone_it->second;
TORCH_INTERNAL_ASSERT(
cloned_val->isA<TensorView>(),
"Cloned value is somehow not a tensor view.");
// Return the cloned value
return cloned_val->as<TensorView>();
}
RecomputeTv::RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs)
: IrCloner(fusion), fusion_(fusion) {
// Add inputs to the clones map to prevent cloning them.
for (const auto inp : fusion->inputs()) {
clones_map_[inp] = inp;
}
// Adds all scalar values to clones map to prevent cloning them
for (const auto val : fusion->vals()) {
if (val->getValType().value() == ValType::Scalar ||
val->getValType().value() == ValType::NamedScalar) {
clones_map_[val] = val;
}
}
// Clone the expressions
for (auto expr : exprs) {
IrCloner::handle(expr);
}
}
void RecomputeTv::handle(const TensorDomain* td) {
// Make sure to recompute the history of the iteration domains, explicitly go
// through the expressions and send them to IrCloner.
auto exprs =
StmtSort::getExprs(fusion_, {td->domain().begin(), td->domain().end()});
for (auto expr : exprs) {
IrCloner::handle(expr);
}
IrCloner::handle(td);
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,132 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <ir_builder.h>
#include <unordered_map>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrContainer;
//! Clones nodes from an existing Fusion
//!
//! \warning IrCloner machinery is a specialized helper for implementing
//! Fusion copy operations and the limited scope of RecomputeTv below.
//! It is not intended for any other uses.
//!
class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
friend class Statement;
friend class IrBuilder;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit IrCloner(IrContainer* container);
Statement* clone(const Statement* statement);
template <class T>
T* clone(const T* node) {
return node ? clone(node->template as<Statement>())->template as<T>()
: nullptr;
}
template <class T>
std::vector<T*> clone(const std::vector<T*>& container) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<T*> copy;
copy.reserve(container.size());
for (auto p : container) {
copy.push_back(clone(p));
}
return copy;
}
IrContainer* container() const {
return ir_container_;
}
protected:
void registerClone(const Statement* src, Statement* clone);
void handle(const Statement*) override;
void handle(const Val*) override;
void handle(const Expr*) override;
void handle(const TensorDomain*) override;
void handle(const TensorView*) override;
void handle(const IterDomain*) override;
void handle(const Bool*) override;
void handle(const Double*) override;
void handle(const Int*) override;
void handle(const ComplexDouble*) override;
void handle(const NamedScalar*) override;
void handle(const FullOp*) override;
void handle(const ARangeOp*) override;
void handle(const EyeOp*) override;
void handle(const UnaryOp*) override;
void handle(const BinaryOp*) override;
void handle(const TernaryOp*) override;
void handle(const RNGOp*) override;
void handle(const BroadcastOp*) override;
void handle(const ReductionOp*) override;
void handle(const GroupedReductionOp*) override;
void handle(const WelfordOp*) override;
void handle(const LoadStoreOp*) override;
void handle(const MmaOp*) override;
void handle(const TransposeOp*) override;
void handle(const ExpandOp*) override;
void handle(const ShiftOp*) override;
void handle(const GatherOp*) override;
void handle(const ViewAsScalar*) override;
void handle(const ViewOp*) override;
void handle(const Split*) override;
void handle(const Merge*) override;
void handle(const Swizzle2D*) override;
protected:
// We keep track of the original -> clone map so we don't
// duplicate clones of the same object if referenced multiple times
std::unordered_map<const Statement*, Statement*> clones_map_;
private:
// The destination Fusion container
IrContainer* ir_container_ = nullptr;
// The dispatch interface doesn't allow returning values from
// individual `handle()` methods, so each handler stores its
// result here instead
Statement* clone_ = nullptr;
// Builder to make all the new nodes
IrBuilder builder_;
};
// Replicates all expressions used to generate the provided TensorView. Does not
// replicate inputs. Does not replicate scalar values. In other words the value
// provided will be recomputed from the inputs of the fusion.
class RecomputeTv : private IrCloner {
public:
// Replicates expressions and values in provided expressions.
static TensorView* recompute(TensorView* tv);
private:
RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs);
void handle(const TensorDomain*) final;
Fusion* fusion_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
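
For context, a short sketch of how the RecomputeTv helper above is typically used: it replays the defining expressions of a non-input tensor so that a consumer gets a private copy of that computation. Illustrative only; the surrounding function is not part of the original header.

// Illustrative sketch only.
TensorView* recomputeForConsumer(TensorView* tv_intermediate) {
  // tv_intermediate must not be a fusion input (asserted inside recompute()).
  TensorView* replayed = RecomputeTv::recompute(tv_intermediate);
  // "replayed" is a fresh TensorView whose defining expressions were cloned;
  // fusion inputs and scalar values are shared rather than cloned.
  return replayed;
}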

View File

@ -1,279 +0,0 @@
#include <instrumentation.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <ir_container.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
void swap(IrContainer& a, IrContainer& b) noexcept {
FUSER_PERF_SCOPE("Fusion swap");
using std::swap;
// Swap the content
swap(a.vals_up_, b.vals_up_);
swap(a.vals_, b.vals_);
swap(a.exprs_up_, b.exprs_up_);
swap(a.exprs_, b.exprs_);
swap(a.raw_ptrs_, b.raw_ptrs_);
swap(a.val_type_name_map_, b.val_type_name_map_);
swap(a.expr_name_counter_, b.expr_name_counter_);
// Fixup the Statement::fusion_ links for a
for (auto val : a.vals_) {
val->ir_container_ = &a;
}
for (auto expr : a.exprs_) {
expr->ir_container_ = &a;
}
// Fixup the Statement::fusion_ links for b
for (auto val : b.vals_) {
val->ir_container_ = &b;
}
for (auto expr : b.exprs_) {
expr->ir_container_ = &b;
}
}
IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) {
to->clear();
IrCloner ir_cloner(to);
for (auto val : from->vals_) {
to->vals_.insert(ir_cloner.clone(val));
}
for (auto expr : from->exprs_) {
to->exprs_.insert(ir_cloner.clone(expr));
}
to->val_type_name_map_ = from->val_type_name_map_;
to->expr_name_counter_ = from->expr_name_counter_;
return ir_cloner;
}
IrContainer::IrContainer() = default;
IrContainer::IrContainer(const IrContainer& other) {
FUSER_PERF_SCOPE("IrContainer copy");
IrContainer::copy(&other, this);
}
IrContainer::IrContainer(IrContainer&& other) noexcept {
FUSER_PERF_SCOPE("IrContainer move");
swap(*this, other);
}
IrContainer& IrContainer::operator=(const IrContainer& other) {
FUSER_PERF_SCOPE("IrContainer copy assign");
IrContainer copy(other);
clear();
swap(*this, copy);
return *this;
}
IrContainer& IrContainer::operator=(IrContainer&& other) noexcept {
FUSER_PERF_SCOPE("IrContainer move assign");
clear();
swap(*this, other);
return *this;
}
IrContainer::~IrContainer() {
clear();
}
//! Register the Statement with this container
void IrContainer::registerStmt(IrBuilderPasskey, Statement* stmt) {
if (stmt->isVal()) {
registerVal(stmt->asVal());
} else {
registerExpr(stmt->asExpr());
}
}
//! Register the Val with this container
void IrContainer::registerVal(IrBuilderPasskey, Val* val) {
registerVal(val);
}
//! Register expr with this container.
void IrContainer::registerExpr(IrBuilderPasskey, Expr* expr) {
registerExpr(expr);
}
void IrContainer::registerExpr(ExprPasskey, Expr* expr) {
registerExpr(expr);
}
void IrContainer::removeExpr(Expr* expr) {
TORCH_INTERNAL_ASSERT(
exprs_.find(expr) != exprs_.end(),
"Wanted to remove an expression but it doesn't exist in this container.");
auto expr_in_deque = std::find_if(
exprs_up_.begin(),
exprs_up_.end(),
[expr](std::unique_ptr<Expr>& expr_up) { return expr_up.get() == expr; });
TORCH_INTERNAL_ASSERT(
expr_in_deque != exprs_up_.end(),
"Wanted to remove an expression but its unique ptr is missing.");
exprs_.erase(expr);
exprs_up_.erase(expr_in_deque);
raw_ptrs_.erase((void*)expr);
}
//! Completely remove val from the fusion, break all dependencies associated
//! with it
void IrContainer::removeVal(Val* val) {
// Don't remove shortcuts
if (val == true_val_.get() || val == false_val_.get() ||
val == one_val_.get() || val == zero_val_.get() ||
val == magic_zero_val_.get()) {
return;
}
TORCH_INTERNAL_ASSERT(
vals_.find(val) != vals_.end(),
"Wanted to remove a value but it doesn't exist in this container.");
auto val_in_deque = std::find_if(
vals_up_.begin(), vals_up_.end(), [val](std::unique_ptr<Val>& val_up) {
return val_up.get() == val;
});
TORCH_INTERNAL_ASSERT(
val_in_deque != vals_up_.end(),
"Wanted to remove a value but its unique ptr is missing.");
vals_.erase(val);
vals_up_.erase(val_in_deque);
raw_ptrs_.erase((void*)val);
}
//! Register the Val with this container
void IrContainer::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
vals_up_.emplace_back(std::unique_ptr<Val>(val));
vals_.emplace(vals_up_.back().get());
val->setName(IrContainerPasskey(), getValName(vals_up_.back()->vtype()));
raw_ptrs_.emplace((void*)vals_up_.back().get());
}
//! Register expr with this container.
void IrContainer::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
exprs_up_.emplace_back(std::unique_ptr<Expr>(expr));
exprs_.emplace(exprs_up_.back().get());
expr->setName(IrContainerPasskey(), getExprName());
raw_ptrs_.emplace((void*)exprs_up_.back().get());
}
void IrContainer::clear() noexcept {
FUSER_PERF_SCOPE("IrContainer clear");
vals_.clear();
vals_up_.clear();
exprs_.clear();
exprs_up_.clear();
raw_ptrs_.clear();
val_type_name_map_.clear();
expr_name_counter_ = 0;
}
bool IrContainer::inContainer(const Statement* stmt) const {
const void* const_void = (const void*)(stmt);
void* nonconst_void = const_cast<void*>(const_void); // NOLINT
if (raw_ptrs_.find(nonconst_void) == raw_ptrs_.end()) {
return false;
}
TORCH_INTERNAL_ASSERT(
stmt->container() == this,
"Container claims to own stmt, but stmt disagrees.");
Statement* nonconst_stmt = const_cast<Statement*>(stmt); // NOLINT
if (stmt->isExpr()) {
TORCH_INTERNAL_ASSERT(
exprs_.find(nonconst_stmt->as<Expr>()) != exprs_.end(),
"Somehow container claims to and not to own an Expr.");
}
if (stmt->isVal()) {
TORCH_INTERNAL_ASSERT(
vals_.find(nonconst_stmt->as<Val>()) != vals_.end(),
"Somehow container claims to and not to own an Val.");
}
return true;
}
// Shortcuts for frequently used vals
Int* IrContainer::zeroVal() {
if (!zero_val_) {
auto zero_val = IrBuilder::create<Int>(this, 0);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == zero_val);
zero_val_ = std::unique_ptr<Int>(vals_up_.back().release()->as<Int>());
vals_up_.pop_back();
}
return zero_val_.get();
}
Int* IrContainer::oneVal() {
if (!one_val_) {
auto one_val = IrBuilder::create<Int>(this, 1);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == one_val);
one_val_ = std::unique_ptr<Int>(vals_up_.back().release()->as<Int>());
vals_up_.pop_back();
}
return one_val_.get();
}
Bool* IrContainer::falseVal() {
if (!false_val_) {
auto false_val = IrBuilder::create<Bool>(this, false);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == false_val);
false_val_ = std::unique_ptr<Bool>(vals_up_.back().release()->as<Bool>());
vals_up_.pop_back();
}
return false_val_.get();
}
Bool* IrContainer::trueVal() {
if (!true_val_) {
auto true_val = IrBuilder::create<Bool>(this, true);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == true_val);
true_val_ = std::unique_ptr<Bool>(vals_up_.back().release()->as<Bool>());
vals_up_.pop_back();
}
return true_val_.get();
}
NamedScalar* IrContainer::magicZeroVal() {
if (!magic_zero_val_) {
auto magic_zero =
IrBuilder::create<NamedScalar>(kMagicZeroName, DataType::Int);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == magic_zero);
magic_zero_val_ = std::unique_ptr<NamedScalar>(
vals_up_.back().release()->as<NamedScalar>());
vals_up_.pop_back();
}
return magic_zero_val_.get();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
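
A small usage sketch of the ownership queries and cached shortcut values implemented above (illustrative only; the function is not part of the original file):

// Illustrative sketch only.
void containerOwnershipExample(IrContainer* container) {
  // Values created through the explicit-container overload are registered
  // with, and owned by, that container.
  Int* v = IrBuilder::create<Int>(container, 7);
  TORCH_INTERNAL_ASSERT(container->inContainer(v));
  container->assertInContainer(v, "Freshly created value missing:");

  // The shortcut values are created lazily, cached, and never removed by
  // removeVal(); repeated calls return the same node.
  TORCH_INTERNAL_ASSERT(container->zeroVal() == container->zeroVal());
}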

View File

@ -1,174 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_base_nodes.h>
#include <utils.h>
#include <deque>
#include <unordered_map>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrBuilderPasskey;
class ExprPasskey;
class OptOutMutator;
class Int;
class Bool;
class NamedScalar;
// Passkey for container to register names with statements
class IrContainerPasskey {
friend class IrContainer;
private:
explicit IrContainerPasskey() {}
};
class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase {
public:
IrContainer();
IrContainer(const IrContainer& other);
IrContainer(IrContainer&& other) noexcept;
IrContainer& operator=(const IrContainer& other);
IrContainer& operator=(IrContainer&& other) noexcept;
virtual ~IrContainer();
bool inContainer(const Statement* stmt) const;
void assertInContainer(const Statement* stmt, const std::string& msg) const {
TORCH_CHECK(
inContainer(stmt), msg, " it was not found in the active container.");
}
//! Return in insertion order
const std::deque<Val*> deterministic_vals() const noexcept {
std::deque<Val*> vals_deque;
std::transform(
vals_up_.begin(),
vals_up_.end(),
std::back_inserter(vals_deque),
[](const std::unique_ptr<Val>& val_up) { return val_up.get(); });
return vals_deque;
}
//! Register the Statement with this container
virtual void registerStmt(IrBuilderPasskey, Statement* stmt);
//! Register the Val with this container
virtual void registerVal(IrBuilderPasskey, Val* val);
//! Register expr with this container.
virtual void registerExpr(IrBuilderPasskey, Expr* expr);
//! Allow expr's to register themselves with a container, this is only used
//! for broadcastOp so it can register itself in its constructor so root maps
//! can be built.
virtual void registerExpr(ExprPasskey, Expr* expr);
//! Return the set of Exprs registered with this fusion. Warning: This will
//! return exprs outside inputs/outputs, so can be unsafe for use with
//! segmented fusions.
const std::unordered_set<Expr*>& unordered_exprs() const noexcept {
return exprs_;
}
//! Return the set of Vals registered with this fusion
const std::unordered_set<Val*>& vals() const noexcept {
return vals_;
}
// Shortcuts for frequently used vals
Int* zeroVal();
Int* oneVal();
Bool* falseVal();
Bool* trueVal();
NamedScalar* magicZeroVal();
protected:
static IrCloner copy(const IrContainer* from, IrContainer* to);
friend void swap(IrContainer& a, IrContainer& b) noexcept;
// Let mutator remove Exprs.
friend OptOutMutator;
virtual void removeExpr(Expr* expr);
//! Completely remove val from the fusion, break all dependencies associated
//! with it
virtual void removeVal(Val* val);
//! Register the Val with this container
virtual void registerVal(Val* val);
//! Register expr with this container.
virtual void registerExpr(Expr* expr);
StmtNameType getValName(ValType vtype) {
if (val_type_name_map_.find(vtype) == val_type_name_map_.end()) {
val_type_name_map_[vtype] = 0;
}
return val_type_name_map_[vtype]++;
}
StmtNameType getExprName() {
return expr_name_counter_++;
}
void clear() noexcept;
// Deque of unique pointer is the memory owning data structure
std::deque<std::unique_ptr<Val>> vals_up_;
// A convenient set to return when we just need an unordered set to do
// something like check if a Val is in this container
std::unordered_set<Val*> vals_;
// Deque of unique pointer is the memory owning data structure
std::deque<std::unique_ptr<Expr>> exprs_up_;
// A convenient set to return when we just need an unordered set to do
// something like check if an Expr is in this container
std::unordered_set<Expr*> exprs_;
// Used to implement a generic "inContainer" that can be passed an invalid
// pointer. Specifically a pointer to a Statement owned by another container
// that has been freed. We can't check normally with the unordered_sets we
// already have because it would require a const_cast from a constant
// expr/val, or a dynamic cast from a Statement.
std::unordered_set<void*> raw_ptrs_;
// Values names counters
std::unordered_map<ValType, StmtNameType, TypeHash> val_type_name_map_;
// Expression names counter
StmtNameType expr_name_counter_ = 0;
// Manually store some persistent, frequently used nodes. It's very
// challenging to do this in any way other than manually, as detecting whether
// a container has one of these vals is tricky. Specifically, if the container
// doesn't own the node, it's hard to tell from the outside whether it may have
// been removed and then re-registered. It can also be tricky to know when
// we're using a different container: as FusionCopy_test demonstrates, deleting
// and then creating containers can result in the same pointer for the
// container.
std::unique_ptr<Bool> true_val_;
std::unique_ptr<Bool> false_val_;
std::unique_ptr<Int> one_val_;
std::unique_ptr<Int> zero_val_;
std::unique_ptr<NamedScalar> magic_zero_val_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
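
A brief sketch of the two value views exposed above: the unordered set for membership checks, and the insertion-ordered deque for reproducible traversal (illustrative only):

// Illustrative sketch only.
void valueViewsExample(const IrContainer* container) {
  // Unordered view: cheap membership checks, unspecified iteration order.
  const auto& all_vals = container->vals();

  // Deterministic view: values in registration order, useful for stable
  // printing and debugging. Every entry is also present in the set above.
  for (Val* v : container->deterministic_vals()) {
    TORCH_INTERNAL_ASSERT(all_vals.count(v) != 0);
  }
}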

View File

@ -1,519 +0,0 @@
#include <ir_graphviz.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <type.h>
#include <fstream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
// Private helper, generating node labels for IrGraphGenerator
class IrNodeLabel : private OptInConstDispatch {
using DetailLevel = IrGraphGenerator::DetailLevel;
public:
static std::string gen(
const Statement* node,
DetailLevel detail_level = DetailLevel::Basic) {
IrNodeLabel generator(detail_level);
generator.OptInConstDispatch::handle(node);
return generator.label_.str();
}
private:
explicit IrNodeLabel(DetailLevel detail_level)
: detail_level_(detail_level) {}
~IrNodeLabel() override = default;
void handle(const Bool* b) override {
if (b->isSymbolic()) {
label_ << "b" << b->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "b" << b->name() << "=";
}
label_ << *b->value();
}
}
void handle(const Double* d) override {
if (d->isSymbolic()) {
label_ << "d" << d->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "d" << d->name() << "=";
}
label_ << *d->value();
}
}
void handle(const Int* i) override {
if (i->isSymbolic()) {
label_ << "i" << i->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "i" << i->name() << "=";
}
label_ << *i->value();
}
}
void handle(const NamedScalar* ns) override {
label_ << ns->name();
}
void handle(const IterDomain* id) override {
label_ << id->getIterType();
label_ << id->getParallelType();
label_ << "(";
if (!id->start()->isZeroInt()) {
label_ << IrNodeLabel::gen(id->start()) << " : ";
}
label_ << IrNodeLabel::gen(id->extent());
label_ << ")";
}
void handle(const Split* split) override {
label_ << "Split(inner=" << (split->innerSplit() ? "true" : "false")
<< ", factor=" << IrNodeLabel::gen(split->factor()) << ")";
}
void handle(const Merge* merge) override {
label_ << "Merge";
}
private:
std::stringstream label_;
const DetailLevel detail_level_;
};
// Small color palette from the X11 theme
static const char* getColorFromIndex(size_t index) {
const size_t number_of_colors = 10;
index = index % number_of_colors;
switch (index) {
case 0: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "azure";
case 1: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "pink";
case 2: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "green";
case 3: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "grey";
case 4: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "yellow";
case 5: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "lavender";
case 6: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "cyan";
case 7: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "white";
case 8: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "magenta";
case 9: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "red";
default:
break;
}
return "";
}
} // anonymous namespace
void IrGraphGenerator::print(
const Fusion* fusion,
const char* filename,
DetailLevel detail_level,
ExprColorMap* expr_color_map) {
std::ofstream dot_file(filename);
TORCH_CHECK(dot_file.good(), "Failed to open the IR graph file");
dot_file << toGraphviz(fusion, detail_level, expr_color_map);
}
std::string IrGraphGenerator::toGraphviz(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map) {
IrGraphGenerator ir_graph(fusion, detail_level, expr_color_map);
return ir_graph.generate();
}
IrGraphGenerator::IrGraphGenerator(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map)
: detail_level_(detail_level),
fusion_(fusion),
expr_color_map_(expr_color_map) {
// setup inputs & outputs
// (indexes used to quickly check if a value is fusion input or output)
for (const auto* input : fusion->inputs()) {
TORCH_CHECK(inputs_.count(input) == 0);
inputs_.insert(input);
}
for (const auto* output : fusion->outputs()) {
TORCH_CHECK(outputs_.count(output) == 0);
outputs_.insert(output);
}
}
std::string IrGraphGenerator::getid(const Statement* stm) {
const auto it = id_map_.find(stm);
if (it == id_map_.end()) {
// First reference, generate a new id
std::stringstream new_id;
new_id << "stm_" << next_id_++;
id_map_.insert({stm, new_id.str()});
return new_id.str();
} else {
return it->second;
}
}
void IrGraphGenerator::addArc(
const Statement* src,
const Statement* dst,
const std::string& style) {
// We automatically visit (handle) the arc's source and destination
handle(src);
handle(dst);
// generate and queue the arc definition
std::stringstream arc_def;
arc_def << getid(src) << " -> " << getid(dst) << " " << style;
arcs_.push_back(arc_def.str());
}
void IrGraphGenerator::printExpr(const Expr* expr, const std::string& label) {
graph_def_ << " " << getid(expr) << " "
<< "[label=\"" << label << "\", shape=oval, color=blue, "
<< "style=filled, fillcolor=";
if (expr_color_map_ != nullptr && expr_color_map_->count(expr)) {
graph_def_ << getColorFromIndex(expr_color_map_->at(expr));
} else {
graph_def_ << "azure";
}
graph_def_ << "];\n";
}
void IrGraphGenerator::printValue(const Val* val, const std::string& label) {
graph_def_ << " " << getid(val) << " [label=\"" << label
<< "\", shape=rect, color=green, fontsize=10];\n";
}
std::string IrGraphGenerator::generate() {
// IrGraphGenerator instances are not reusable
TORCH_CHECK(graph_def_.str().empty());
TORCH_CHECK(visited_.empty());
// record detail level
graph_def_ << "// detail level: ";
switch (detail_level_) {
case DetailLevel::ComputeOnly:
graph_def_ << "compute only\n";
break;
case DetailLevel::Basic:
graph_def_ << "minimal\n";
break;
case DetailLevel::Explicit:
graph_def_ << "explicit\n";
break;
case DetailLevel::Verbose:
graph_def_ << "verbose\n";
break;
default:
TORCH_CHECK(!"Unexpected detail level");
}
graph_def_ << "digraph fusion_ir {\n"
<< " node [shape=circle, color=gray];\n"
<< " edge [color=black];\n";
// Compute graph
generateComputeGraph();
// Schedule graph
if (detail_level_ > DetailLevel::ComputeOnly) {
generateScheduleGraph();
}
// All expressions & values
// (These are otherwise unreachable (dead) nodes)
if (detail_level_ >= DetailLevel::Verbose) {
for (const auto* expr : fusion_->unordered_exprs()) {
handle(expr);
}
for (const auto* val : fusion_->vals()) {
handle(val);
}
}
// Finally, print all arc definitions
for (const auto& arc : arcs_) {
graph_def_ << " " << arc << ";\n";
}
graph_def_ << "}\n";
// Make sure that all referenced nodes have been visited
for (const auto& kv : id_map_) {
TORCH_CHECK(visited(kv.first));
}
return graph_def_.str();
}
void IrGraphGenerator::generateComputeGraph() {
graph_def_ << " subgraph cluster_compute {\n"
<< " label=\"compute\";\n"
<< " style=dashed;\n";
// Inputs
for (const auto* input : fusion_->inputs()) {
handle(input);
}
// Outputs
for (const auto* output : fusion_->outputs()) {
handle(output);
}
graph_def_ << " }\n";
}
void IrGraphGenerator::generateScheduleGraph() {
graph_def_ << " subgraph cluster_schedule {\n"
<< " label=\"schedule\";\n"
<< " style=dashed;\n";
// Connect TensorView with their TensorDomain
// (this will trigger the traversal of the schedule graph)
for (auto tv : tensor_views_) {
addArc(tv->domain(), tv, "[style=dashed, arrowhead=none]");
if (detail_level_ >= DetailLevel::Explicit) {
// Maybe not the best way to handle the root domain, but should be okay
addArc(
tv,
IrBuilder::create<TensorDomain>(tv->getRootDomain()),
"[style=dashed, color=green, arrowhead=none]");
if (tv->domain()->hasRFactor())
addArc(
tv,
IrBuilder::create<TensorDomain>(tv->domain()->getRFactorDomain()),
"[style=dashed, color=green, arrowhead=none]");
}
}
graph_def_ << " }\n";
}
void IrGraphGenerator::handle(const Statement* s) {
OptInConstDispatch::handle(s);
}
void IrGraphGenerator::handle(const Val* v) {
if (!visited(v)) {
visited_.insert(v);
if (const auto* def = v->definition()) {
handle(def);
}
OptInConstDispatch::handle(v);
}
}
void IrGraphGenerator::handle(const Expr* e) {
if (!visited(e)) {
visited_.insert(e);
OptInConstDispatch::handle(e);
}
}
void IrGraphGenerator::handle(const TensorDomain* td) {
graph_def_ << " " << getid(td) << " [label=\"TensorDomain\", "
<< "shape=note, color=gray, "
<< "style=filled, fillcolor=gray90, fontsize=10];\n";
for (auto iter_domain : td->domain()) {
addArc(iter_domain, td, "[color=gray]");
}
}
void IrGraphGenerator::handle(const IterDomain* id) {
graph_def_ << " " << getid(id) << " [label=\"" << IrNodeLabel::gen(id)
<< "\", shape=cds, color=gray, fontsize=10];\n";
if (!id->start()->isZeroInt()) {
addArc(id->start(), id, "[color=gray]");
}
addArc(id->extent(), id, "[color=gray]");
}
void IrGraphGenerator::handle(const Bool* b) {
printValue(b, IrNodeLabel::gen(b, detail_level_));
}
void IrGraphGenerator::handle(const Double* d) {
printValue(d, IrNodeLabel::gen(d, detail_level_));
}
void IrGraphGenerator::handle(const Int* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const ComplexDouble* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const NamedScalar* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const TensorView* tv) {
std::stringstream label;
label << "{T" << tv->name() << "|";
label << "{";
bool first_axis = true;
for (auto iter_domain : tv->domain()->domain()) {
if (first_axis) {
first_axis = false;
} else {
label << "|";
}
label << IrNodeLabel::gen(iter_domain);
}
label << "}}";
const bool is_input = inputs_.find(tv) != inputs_.end();
const bool is_output = outputs_.find(tv) != outputs_.end();
const char* style = is_input ? "style=filled, fillcolor=palegreen"
: is_output ? "style=filled, fillcolor=lightblue"
: "style=filled, fillcolor=beige";
graph_def_ << " " << getid(tv) << " [label=\"" << label.str()
<< "\", shape=Mrecord, color=brown, " << style << "];\n";
tensor_views_.push_back(tv);
}
void IrGraphGenerator::handle(const FullOp* fop) {
// node
printExpr(fop, "full");
// inputs & outputs
addArc(fop->getFillValue(), fop);
addArc(fop, fop->output(0));
}
void IrGraphGenerator::handle(const ARangeOp* aop) {
// node
printExpr(aop, "arange");
// inputs & outputs
addArc(aop->start(), aop);
addArc(aop->end(), aop);
addArc(aop->step(), aop);
addArc(aop, aop->output(0));
}
void IrGraphGenerator::handle(const EyeOp* eop) {
// node
printExpr(eop, "eye");
// inputs & outputs
addArc(eop, eop->output(0));
}
void IrGraphGenerator::handle(const UnaryOp* uop) {
// node
std::stringstream label;
label << uop->getUnaryOpType();
printExpr(uop, label.str());
// inputs & outputs
addArc(uop->in(), uop);
addArc(uop, uop->out());
}
void IrGraphGenerator::handle(const BinaryOp* bop) {
// node
std::stringstream label;
label << bop->getBinaryOpType();
printExpr(bop, label.str());
// inputs & outputs
addArc(bop->lhs(), bop);
addArc(bop->rhs(), bop, "[color=blue]");
addArc(bop, bop->out());
}
void IrGraphGenerator::handle(const TernaryOp* op) {
// node
std::stringstream label;
label << op->getTernaryOpType();
printExpr(op, label.str());
// inputs & outputs
addArc(op->in1(), op);
addArc(op->in2(), op, "[color=blue]");
addArc(op->in3(), op, "[color=brown]");
addArc(op, op->out());
}
void IrGraphGenerator::handle(const RNGOp* op) {
// node
std::stringstream label;
label << op->getRNGOpType();
printExpr(op, label.str());
// inputs & outputs
addArc(op, op->output(0));
}
void IrGraphGenerator::handle(const BroadcastOp* op) {
printExpr(op, "Broadcast");
addArc(op->in(), op);
addArc(op, op->out());
}
void IrGraphGenerator::handle(const ReductionOp* op) {
// node
std::stringstream label;
label << "Reduction(" << op->getReductionOpType() << ")";
printExpr(op, label.str());
// inputs & outputs
addArc(op->in(), op);
addArc(op->init(), op, "[color=blue]");
addArc(op, op->out());
}
void IrGraphGenerator::handle(const Split* split) {
printExpr(split, IrNodeLabel::gen(split));
addArc(split->in(), split);
addArc(split, split->outer());
addArc(split, split->inner());
}
void IrGraphGenerator::handle(const Merge* merge) {
printExpr(merge, IrNodeLabel::gen(merge));
addArc(merge->outer(), merge);
addArc(merge->inner(), merge);
addArc(merge, merge->out());
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
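
A small sketch of driving the generator above with an ExprColorMap, which assigns each expression one of the ten palette colors defined in getColorFromIndex (illustrative only; the round-robin grouping is made up):

// Illustrative sketch only.
void dumpColoredGraph(const Fusion* fusion) {
  IrGraphGenerator::ExprColorMap color_map;
  size_t group = 0;
  for (const Expr* e : fusion->unordered_exprs()) {
    color_map[e] = group++ % 10; // cycle through the 10-color palette above
  }
  IrGraphGenerator::print(
      fusion, "ir_colored.dot", IrGraphGenerator::DetailLevel::Basic, &color_map);
}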

View File

@ -1,130 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Generates a DOT (https://www.graphviz.org) graph
// representation of a fuser IR
//
// Usage:
// 1) Add calls to IrGraphGenerator::print(), for example:
// `IrGraphGenerator::print(&fusion, "ir.dot")`
//
// 2) Call IrGraphGenerator::print() from a debugger. Using gdb for example:
// `call IrGraphGenerator::print(&fusion, "ir.dot",
// IrGraphGenerator::DetailLevel::Explicit)`
//
// Notes:
// - When called from the debugger, the detail_level must be
// explicitly passed in (most debuggers don't support default arguments)
//
// - The output dot file path can't include shell specific notations,
// for example you can't use "~/temp/ir.dot" ("/home/user/temp/ir.dot"
// must be used instead)
//
class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
public:
enum class DetailLevel {
ComputeOnly, // Only dataflow (compute) nodes
Basic, // Compute + schedule, with minimal details (default)
Explicit, // Additional details (ex. symbolic names for scalar constants)
Verbose, // Includes all values and dead definitions
};
using ExprColorMap = std::unordered_map<const Expr*, size_t>;
public:
static void print(
const Fusion* fusion,
const char* filename,
DetailLevel detail_level = DetailLevel::Basic,
ExprColorMap* expr_color_map = nullptr);
static std::string toGraphviz(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map = nullptr);
private:
IrGraphGenerator(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map = nullptr);
~IrGraphGenerator() override = default;
std::string generate();
void generateComputeGraph();
void generateScheduleGraph();
void handle(const Statement*) override;
void handle(const Val*) override;
void handle(const Expr*) override;
void handle(const TensorDomain*) override;
void handle(const TensorView*) override;
void handle(const IterDomain*) override;
void handle(const Bool*) override;
void handle(const Double*) override;
void handle(const Int*) override;
void handle(const ComplexDouble*) override;
void handle(const NamedScalar*) override;
void handle(const FullOp*) override;
void handle(const ARangeOp*) override;
void handle(const EyeOp*) override;
void handle(const UnaryOp*) override;
void handle(const BinaryOp*) override;
void handle(const TernaryOp*) override;
void handle(const RNGOp*) override;
void handle(const BroadcastOp*) override;
void handle(const ReductionOp*) override;
void handle(const Split*) override;
void handle(const Merge*) override;
// lookup the graph id, creating one if not found
std::string getid(const Statement* stm);
bool visited(const Statement* s) const {
return visited_.find(s) != visited_.end();
}
void addArc(
const Statement* src,
const Statement* dst,
const std::string& style = "");
void printExpr(const Expr* expr, const std::string& label);
void printValue(const Val* val, const std::string& label);
private:
const DetailLevel detail_level_;
const Fusion* const fusion_;
std::stringstream graph_def_;
std::unordered_map<const Statement*, std::string> id_map_;
std::unordered_set<const Statement*> visited_;
std::unordered_set<const Val*> inputs_;
std::unordered_set<const Val*> outputs_;
std::vector<const TensorView*> tensor_views_;
std::vector<std::string> arcs_;
int next_id_ = 1;
ExprColorMap* expr_color_map_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,600 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <fusion.h>
#include <ir_base_nodes.h>
#include <ir_internal_nodes.h>
#include <mma_type.h>
#include <torch/csrc/jit/ir/ir.h>
//! Nodes in here are intended to be "user facing", users in this sense being
//! those who want to be able to generate CUDA code.
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class WelfordResult;
class ViewTransform;
class IrCloner;
class IrBuilderPasskey;
//! A Bool value
//!
//! This value can be a symbolic value (defined after the kernel
//! is compiled) or a constant value (inlined into the kernel definition).
//!
class TORCH_CUDA_CU_API Bool : public Val {
public:
Bool(IrBuilderPasskey passkey);
explicit Bool(IrBuilderPasskey passkey, bool value);
explicit Bool(IrBuilderPasskey passkey, c10::optional<bool> value);
Bool(const Bool* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<bool> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<bool> maybe_value_;
};
//! A Float64 value. This value can be a symbolic value (defined after the
//! kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API Double : public Val {
public:
using ScalarType = double;
Double(IrBuilderPasskey passkey);
explicit Double(IrBuilderPasskey passkey, ScalarType value);
explicit Double(IrBuilderPasskey passkey, c10::optional<ScalarType> value);
Double(const Double* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
//! An Int64 value. If used for indexing it's set as size_t. Otherwise it's an
//! inlined literal in the kernel.
class TORCH_CUDA_CU_API Int : public Val {
public:
using ScalarType = int64_t;
Int(IrBuilderPasskey passkey);
explicit Int(IrBuilderPasskey passkey, ScalarType value);
explicit Int(IrBuilderPasskey passkey, c10::optional<ScalarType> value);
Int(const Int* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
//! A c10::complex<double> value. This value can be a symbolic value (defined
//! after the kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API ComplexDouble : public Val {
public:
using ScalarType = c10::complex<double>;
ComplexDouble(IrBuilderPasskey passkey);
explicit ComplexDouble(IrBuilderPasskey passkey, ScalarType value);
explicit ComplexDouble(
IrBuilderPasskey passkey,
c10::optional<ScalarType> value);
ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
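//! A brief sketch of the symbolic-versus-constant distinction these scalar
//! classes describe (illustrative only; assumes an active FusionGuard as
//! elsewhere in this codebase):
//!
//!   void scalarKindsExample() {
//!     // Constant: the value is known at definition time and can be inlined
//!     // into the generated kernel.
//!     Double* half = IrBuilder::create<Double>(0.5);
//!     TORCH_INTERNAL_ASSERT(half->isConst() && !half->isSymbolic());
//!
//!     // Symbolic: the value is bound later (e.g. at kernel launch).
//!     Double* scale = IrBuilder::create<Double>();
//!     TORCH_INTERNAL_ASSERT(scale->isSymbolic() && !scale->value().has_value());
//!   }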
//! Mode during propagation of computeAt. Standard will throw an error if the
//! computeAt position provided can't be satisfied; best effort will lower the
//! computeAt position as needed during traversal; most inlined will increase
//! the computeAt position to the maximum possible through traversal.
enum class ComputeAtMode { Standard, BestEffort, MostInlined };
class TransformPropagator;
struct MostInlinedTransformPropagator;
class TransformIter;
class TransformReplay;
class OptOutMutator;
class TensorDomain;
class MaxPosCalculator;
namespace ir_utils {
class TVDomainGuard;
}
//! TensorView is our primitive Tensor Type used in code generation. It can be
//! thought of as representing physical memory; however, its dimensionality is
//! modified as split/merge/computeAt functions are called. The history of
//! these transformations is kept and used for generating actual code
//! referencing physical memory. Generally, when users are thinking of code
//! generation in reference to a Tensor, this is the class they should be
//! interacting with.
//!
//! The reason we need both TensorView and TensorDomain is that we need to have
//! a record of both what is being computed and how it is being computed. For
//! example we may have the operation:
//!
//! TV3[I, J, K] = TV2[I, J, K] + TV1[I, J, K]
//!
//! The mathematical operations here are on the tensor views TV1, TV2, and
//! TV3. This operation is a pointwise operation. To compute this pointwise
//! operation we iterate over the 3D TensorDomain [I, J, K], where K is the
//! fastest changing dimension.
//!
//! \todo Need to work on the const model for TensorView, making all functions
//! that should be const, const. Gave this a try but it expanded really quickly.
//! getComputeAtAxis not being const, because it can return a TV that some
//! expect to be non-const, is the biggest headache.
//!
class TORCH_CUDA_CU_API TensorView : public Val {
public:
TensorView(
IrBuilderPasskey passkey,
TensorDomain* domain,
DataType dtype,
MemoryType mtype = MemoryType::Local);
explicit TensorView(
IrBuilderPasskey passkey,
const std::shared_ptr<c10::TensorType>& tensor_type);
explicit TensorView(
IrBuilderPasskey passkey,
const std::shared_ptr<Value>& jit_value);
TensorView(const TensorView* src, IrCloner* ir_cloner);
TensorDomain* domain() const {
return domain_;
}
//! This is for a TensorView with an rFactor domain that is an input to a
//! fusion segment. We convert the rfactor domain into a new root domain.
//! Any dynamic-sized rfactor iterDomains are given a new symbolic extent.
//! Concrete integer extents are kept. Output TensorViews of any subsequent
//! expressions that use this TensorView are also updated.
void convertRfactorToRootDomain();
void setContiguity(const std::vector<bool>& contig) {
domain()->setContiguity(contig);
}
void setContiguity(bool contig) {
setContiguity(std::vector<bool>(domain()->contiguity().size(), contig));
}
bool hasReduction() const;
bool hasBlockReduction() const;
bool hasGridReduction() const;
bool hasBroadcast() const;
bool hasRFactor() const;
//! This is the previous hasReduction logic,
//! kept here exclusively for the lower loop pass;
//! will be deprecated when the Fusion IR pass can convert
//! trivial reductions
bool hasAnyReduction() const;
//! Returns true if this tensor is zero dimensional,
//! i.e. a wrapped scalar or an empty placeholder.
bool isZeroDim() const {
return nDims() == 0;
}
//! Returns true if this tensor does not contain
//! any value.
bool isEmptyTensor() const;
c10::optional<unsigned int> getReductionAxis() const;
const std::vector<IterDomain*>& getRootDomain() const;
const std::vector<IterDomain*>& getRFactorDomain() const;
// If rfactor domain exists in domain() return it, otherwise return root
// domain.
const std::vector<IterDomain*>& getMaybeRFactorDomain() const;
IterDomain* axis(int pos) const;
// Does it share outer axes with other tensors?
bool hasComputeAt() const {
return compute_at_pos_ > 0;
}
bool hasMaxProducerPosition() const {
return max_producer_pos_ > 0;
}
size_t nDims() const;
// sets cpu_scalar_ value, which is special handling for CPU based zero-dim
// tensors (i.e. CPU Tensors that only have one value). This is only used on an
// input value, otherwise it is ignored. This special handling is important
// because these "scalars" should be type promoted as a tensor, but we want to
// avoid explicit copying of the data, so we want to pass the data value as a
// standard kernel argument value.
void setCpuScalar(bool is_cpu_scalar);
// returns cpu_scalar_ value, which is special handling for CPU based zero-dim
// tensors (i.e. CPU Tensors that only have one value). This is only used on an
// input value, otherwise it is ignored. This special handling is important
// because these "scalars" should be type promoted as a tensor, but we want to
// avoid explicit copying of the data, so we want to pass the data value as a
// standard kernel argument value.
bool isCpuScalar() const {
return cpu_scalar_;
}
// Returns the position that this tensor is produced at relative to its axes.
unsigned int getComputeAtPosition() const {
return compute_at_pos_;
}
// Returns the maximum position at which producers are computed relative to
// this tensor. This position dictates the clear expectations of producers.
unsigned int getMaxProducerPosition() const {
return max_producer_pos_;
}
//! This is used when we disconnect a tensorview from a reduction
//! operation and connect it to a non-reduction operator. We need
//! to remove the reduction ids on the tv in this case.
//! Currently only used in translate welford, and this function may
//! be refactored or extended if any more use cases appear.
void clearReductionIterDomains();
//! Compute this TensorView relative to a consumer position, -1 will
//! compute tensors inline with each other, 0 doesn't share
//! any loop nests between the tensors. It's an error when the given
//! position is not legally viable. Alternatively, when the mode
//! parameter is ComputeAtMode::BestEffort, the position is lowered
//! one by one until a valid position is found. When
//! ComputeAtMode::MostInlined is given, the position parameter is
//! ignored, and the deepest possible position is searched.
TensorView* computeAt(
TensorView* consumer,
int position,
ComputeAtMode mode = ComputeAtMode::Standard);
//! Compute this tensor to consumer, at local position, -1 will compute
//! tensors inline with each other, 0 doesn't share any loop nests between the
//! tensors. The mode parameter can be used in the same manner as computeAt.
TensorView* computeWith(
TensorView* consumer,
int position,
ComputeAtMode mode = ComputeAtMode::Standard);
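//! A hypothetical usage sketch (illustrative names, not part of the original
//! header): assuming tv0 is a producer of tv1 in an active Fusion,
//!
//!   tv0->computeAt(tv1, -1);  // fully inline tv0 into tv1
//!   tv0->computeAt(tv1, 1, ComputeAtMode::BestEffort);  // share only the
//!                                                       // outermost loop,
//!                                                       // lowering if needed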
// Split "axis" into 2 axes
//! inner_split dictates if the factor section of the split should be inside
//! the remainder or outside.
//! e.g. split(0, 4, inner_split = true) will result in:
//! tv[id{extent}] -> tv[id{ceilDiv(extent, factor)}, id{factor}]
//! e.g. split(0, 4, inner_split = false) will result in:
//! tv[id{extent}] -> tv[id{factor}, id{ceilDiv(extent, factor)}]
//!
//! When trim_out_of_bounds is true, only the inner domain defined by the
//! start and stop positions is split.
TensorView* split(
int axis,
unsigned int factor,
bool inner_split = true,
bool trim_out_of_bounds = false);
// Split "axis" into 2 axes where the inner axes is size of "factor"
// and outer axis is size axis.size() / factor. Factor can be a symbolic
// value instead of constant. This requires setting the symbolic value as an
// input, or using a parallel dim from NamedScalar::getParallelDim
TensorView* split(
int axis,
Val* factor,
bool inner_split = true,
bool trim_out_of_bounds = false);
// Merge axis_o and axis_i into 1 IterDomain
TensorView* merge(int axis_o, int axis_i);
// Merge axis and axis+1 into 1 IterDomain
TensorView* merge(int axis) {
return merge(axis, axis + 1);
}
// Reorder axes according to old2new[old_pos] = new_pos
TensorView* reorder(const std::unordered_map<int, int>& old2new);
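//! A hypothetical scheduling sketch for the calls above (illustrative only);
//! each call is shown independently on a fresh 2-D tv[I0, I1]:
//!
//!   tv->split(1, 4);                // tv[I0, ceilDiv(I1, 4), 4]
//!   tv->split(1, 4, false);         // outer split: tv[I0, 4, ceilDiv(I1, 4)]
//!   tv->merge(0);                   // merge axes 0 and 1 into one IterDomain
//!   tv->reorder({{0, 1}, {1, 0}});  // old2new map: swap the first two axes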
//! Swizzle indices to improve memory access efficiency.
//!
//! Swizzle::Transpose is a pattern commonly used to avoid bank
//! conflicts in shared memory. It takes two axes and shifts the
//! second axis by the first axis as ((axis1 + axis2) % extent). The
//! memory type must be Shared.
//!
//! \input type Swizzle pattern such as transpose.
//! \input axes Axes to swizzle
TensorView* swizzle(SwizzleType type, const std::vector<int>& axes);
//! Swizzle the rectangular tile defined by the iterdomains corresponding
//! to the 2 given indices.
TensorView* swizzle(
Swizzle2DType swizzle_type,
int x,
int y,
SwizzleMode swizzle_mode = SwizzleMode::Data);
// WARNING: rFactor does not return this TensorView, it returns a new
// TensorView consumed by this!
//
// Take reduction axes out of this domain, and create a new
// domain. New domain will be used to create this domain.
//
// For example:
// TV1[I0, R1, R2, I3] = TV0[I0, I1, I2, I3]
//
// After:
// TV1->rfactor({1}), TV1 is transformed to -> TV1[I0, R2, I3]
//
// The TensorView returned is: TV2[I0, R1, I2, I3]
//
// The reduction will now be set as:
// TV2[I0, R1, I2, I3] = TV0[I0, I1, I2, I3]
// TV1[I0, R2, I3] = TV2[I0, R1, I2, I3]
//
TensorView* rFactor(const std::vector<int>& axes);
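// A hypothetical sketch of the example above (illustrative names only):
//
//   // TV1[I0, R1, R2, I3] = TV0[I0, I1, I2, I3]
//   TensorView* tv2 = tv1->rFactor({1});
//   // tv2 is the returned TV2[I0, R1, I2, I3]; tv1 becomes TV1[I0, R2, I3]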
//! Multi-output version of rFactor, semantically similar with
//! the reduction version except that the rfactor is done
//! for all outputs in a consistent way
std::vector<TensorView*> rFactor(
const std::vector<int>& axes,
const std::vector<TensorView*>& tvs);
//! Create a TensorView before the original tensor. A common use case is to
//! write results into shared memory or registers before moving to global
//! memory. Analogous to TVM Cache_Write
//!
//! @param cache_op: memory operator to use for the inserted op between
//! the data tensor and the cache tensor
TensorView* cacheBefore(
c10::optional<LoadStoreOpType> cache_op = c10::nullopt);
//! Create a TensorView after the original tensor. A common use case is to
//! read tensor into shared memory or registers. Analogous to TVM Cache_Read
//!
//! @param cache_op: memory operator to use for the inserted op between
//! the data tensor and the cache tensor
TensorView* cacheAfter(
c10::optional<LoadStoreOpType> cache_op = c10::nullopt);
// For a fusion output with other uses, we want to avoid writing to global
// memory and then reading the output again. We write to global memory
// separately after an operation. We replace this fusion output with the
// direct write TensorView.
TensorView* cacheFork();
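// A hypothetical caching sketch (illustrative names only): stage a fusion
// input in shared memory and write a fusion output through a cache tensor.
//
//   TensorView* in_cache = input_tv->cacheAfter();    // read input into cache
//   in_cache->setMemoryType(MemoryType::Shared);
//   TensorView* out_cache = output_tv->cacheBefore(); // write via cache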
MemoryType getMemoryType() const {
return memory_type_;
}
void setMemoryType(MemoryType mt);
SwizzleType swizzleType() const {
return swizzle_type_;
}
const std::vector<IterDomain*>& axesToSwizzle() const {
return axes_to_swizzle_;
}
// Apply double buffering transformation
void doubleBuffer();
// Apply circular buffering transformation
void circularBuffer(unsigned int number_of_stage);
// Returns true if this tensor is double buffered.
bool isDoubleBuffered() const {
return is_double_buffered_;
}
// Returns true if this tensor is circular buffered.
bool isCircularBuffered() const {
return is_circular_buffered_;
}
// Returns the depth of circular buffering if applicable.
unsigned int circularBufferDepth() const {
TORCH_INTERNAL_ASSERT(
is_circular_buffered_, toString(), "not circular buffered");
return circular_buffer_stage_;
}
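// A hypothetical sketch (illustrative only): pipeline a cache tensor with
// either double buffering or circular buffering (shown as alternatives).
//
//   smem_tv->doubleBuffer();     // isDoubleBuffered() becomes true
//   smem_tv->circularBuffer(4);  // or: circular buffering with 4 stages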
//! Transforms the innermost iterdomains according to the given mma swizzle;
//! this should be used on the tvs that are either inputs/outputs of an
//! MmaOp, or any tvs that are involved in prolog/epilog fusions and need to
//! have a matching thread swizzle with the mma operand/result.
//! For more detail on usage, see [WarpMmaSwizzler] in scheduler/mma_utils.h.
void applyMmaSwizzle(MmaOptions options);
//! Returns if this tensor view has swizzle operator on its tensor domain.
//! This is the temporary flag for indicating that the new swizzle
//! implementation is used and will be removed in follow ups.
bool hasSwizzleOp() const {
return has_swizzle_op_;
}
friend TORCH_CUDA_CU_API TransformPropagator;
friend TORCH_CUDA_CU_API MostInlinedTransformPropagator;
friend TORCH_CUDA_CU_API TransformReplay;
friend TORCH_CUDA_CU_API OptOutMutator;
friend class InlineBatchingGuard;
friend class ir_utils::TVDomainGuard;
// Inline the computation of this tensor into its consumer at the given
// position. If this tensor is already inlined at a higher position, then this
// call is a no-op. If the rightmost dimensions before `pos` are
// broadcasts, then we will not inline into these broadcast dimensions. If
// best_effort, then we will inline into the highest allowed position that is
// <= `pos`.
void inlineAt(
int64_t pos,
bool best_effort = false,
MaxPosCalculator* calc = nullptr);
// Update the max producer position of the current tensor. This is required
// when we modify producer-consumer relationship of a scheduled tensor, for
// example, grouping multiple reductions.
void updateMaxProducerPosition();
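// A hypothetical sketch (illustrative only) of inlineAt, which sets the
// computeAt position directly on this tensor:
//
//   tv->inlineAt(2);                        // inline into its consumer at position 2
//   tv->inlineAt(2, /*best_effort=*/true);  // or the highest legal position <= 2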
protected:
void setDomain(TensorDomain* td) {
domain_ = td;
}
private:
int normalizeAxisPos(int pos) const {
if (pos < 0) {
pos += nDims();
}
return pos;
}
//! A helper function to maintain the consistency of schedules of
//! multiple outputs when doing rfactor on multi-output reduction ops.
TensorView* multiOutputRfactorHelper(
TensorView* tv,
const std::vector<int>& axes);
private:
TensorDomain* domain_ = nullptr;
unsigned int compute_at_pos_ = 0;
unsigned int max_producer_pos_ = 0;
MemoryType memory_type_ = MemoryType::Local;
SwizzleType swizzle_type_ = SwizzleType::NoSwizzle;
std::vector<IterDomain*> axes_to_swizzle_;
bool is_double_buffered_ = false;
//! Indicates if the tensor is circular buffered.
bool is_circular_buffered_ = false;
//! Indicates the circular buffering stage depth if applicable.
unsigned int circular_buffer_stage_ = 0;
// special handling for CPU based zero-dim tensors (i.e. CPU Tensors that only
// have one value). This is only used on an input value, otherwise it is
// ignored. This special handling is important because these "scalars" should
// be type promoted as a tensor, but we want to avoid explicit copying of the
// data, so we want to pass the data value as a standard kernel argument
// value.
bool cpu_scalar_ = false;
//! Indicates if this tensor view has swizzle operator on its tensor domain.
//! This is the temporary flag for indicating that the new swizzle
//! implementation is used and will be removed in follow ups.
bool has_swizzle_op_ = false;
};
//! A simple TensorView builder
//!
//! Example usage:
//!
//! auto tv = TensorViewBuilder()
//! .ndims(ndims)
//! .dtype(dtype)
//! .contiguity(contiguity)
//! .build();
//!
class TORCH_CUDA_CU_API TensorViewBuilder {
public:
//! Set the number of dimensions of the tensor (default 0, meaning scalar)
TensorViewBuilder& ndims(size_t ndims);
//! Set the data type of the tensor (default DataType::Float)
TensorViewBuilder& dtype(DataType dtype);
//! Set the contiguity information (default non-contiguous)
TensorViewBuilder& contiguity(std::vector<bool> contiguity);
//! Set the shape (default 0 dimensional, i.e. scalar)
TensorViewBuilder& shape(std::vector<Val*> shape);
TensorViewBuilder& shape(const std::vector<int64_t>& shape);
//! Creates a new TensorView with the specified options
TensorView* build() const;
private:
size_t ndims_ = 0;
DataType dtype_ = DataType::Float;
std::vector<bool> contiguity_;
std::vector<Val*> shape_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,162 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <c10/util/irange.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class Fusion;
namespace kir {
class Kernel;
class Scope;
} // namespace kir
//! Define pretty printing functions for IR nodes
//!
//! This class is intended for debug printing, so it attempts
//! to handle invalid states as well.
//!
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch {
static constexpr char const* kTab = " ";
public:
explicit IrPrinter(std::ostream& os) : os_(os) {}
// Indent the generated code
std::ostream& indent() {
for (const auto i : c10::irange(indent_size_)) {
(void)i; // Suppress unused variable warning
os_ << " ";
}
return os_;
}
void resetIndent() {
indent_size_ = 0;
}
bool printInline() const {
return print_inline_;
}
using OptInConstDispatch::handle;
virtual void handle(Fusion* f);
// handle calls some non const fusion ops,
// even though fusion should remain unchanged.
// Need to look into this.
virtual void handle(const Fusion* f) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
handle(const_cast<Fusion*>(f));
}
virtual void handle(Fusion& f) {
handle(&f);
}
virtual void handle(const kir::Kernel* kernel);
virtual void handle(kir::Kernel& kernel);
void handleScope(const kir::Scope& scope);
void handle(const Statement* s) final;
void handle(const Val* v) final;
void handle(const Expr* e) final;
void handle(const IterDomain*) final;
void handle(const TensorDomain*) final;
void handle(const TensorView*) final;
void handle(const Bool*) final;
void handle(const Double*) final;
void handle(const Int*) final;
void handle(const ComplexDouble*) final;
void handle(const NamedScalar*) final;
void handle(const FullOp*) final;
void handle(const ARangeOp*) final;
void handle(const EyeOp*) final;
void handle(const UnaryOp*) final;
void handle(const BinaryOp*) final;
void handle(const TernaryOp*) final;
void handle(const RNGOp*) final;
void handle(const ReductionOp*) final;
void handle(const GroupedReductionOp*) final;
void handle(const WelfordOp*) final;
void handle(const GroupedWelfordOp*) final;
void handle(const LoadStoreOp*) final;
void handle(const MmaOp*) final;
void handle(const BroadcastOp*) final;
void handle(const TransposeOp*) final;
void handle(const ExpandOp*) final;
void handle(const ShiftOp*) final;
void handle(const GatherOp*) final;
void handle(const ViewAsScalar*) final;
void handle(const ViewOp*) final;
void handle(const kir::Predicate*) final;
void handle(const kir::TensorIndex*) final;
void handle(const kir::IntPair*) final;
void handle(const kir::GridBroadcast*) final;
void handle(const kir::GridReduction*) final;
void handle(const kir::GroupedGridReduction*) final;
void handle(const kir::GridWelford*) final;
void handle(const kir::GroupedGridWelford*) final;
void handle(const kir::ForLoop*) final;
void handle(const kir::IfThenElse*) final;
void handle(const kir::Allocate*) final;
void handle(const kir::BlockSync*) final;
void handle(const kir::GridSync*) final;
void handle(const kir::CpAsyncWait*) final;
void handle(const kir::CpAsyncCommit*) final;
void handle(const kir::InitMagicZero*) final;
void handle(const kir::UpdateMagicZero*) final;
void handle(const kir::AllocateFusedReduction*) final;
void handle(const kir::Swizzle2DInt*) final;
void handle(const kir::PairSelect*) final;
// IR math printer overrides these to prevent them from printing, keep
// override
void handle(const Split*) override;
void handle(const Merge*) override;
void handle(const Swizzle2D*) override;
void print_inline(const Statement* stmt) {
bool prev = print_inline_;
print_inline_ = true;
handle(stmt);
print_inline_ = prev;
}
protected:
std::ostream& os() {
return os_;
}
private:
std::ostream& os_;
bool print_inline_ = false;
int indent_size_ = 0;
};
TORCH_CUDA_CU_API std::ostream& operator<<(
std::ostream& os,
const Statement* stmt);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion* f);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion& f);
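// A hypothetical usage sketch (illustrative only): print IR for debugging,
// either through the printer directly or via the operator<< overloads above.
//
//   IrPrinter printer(std::cout);
//   printer.handle(fusion);        // dump the whole fusion
//   std::cout << some_statement;   // print a single Statement*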
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,59 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_iostream.h>
#include <iter_visitor.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Prints computation Fusion IR nodes
//!
//! IrMathPrinter and IrTransformPrinter allow the splitting up of fusion print
//! functions. IrMathPrinter, as its name implies, focuses solely on what tensor
//! computations are taking place. Resulting TensorView math will reflect the
//! series of split/merge/computeAts that have taken place; however, these
//! nodes will not be displayed in what is printed. IrTransformPrinter does not
//! print any mathematical functions and only lists the series of
//! split/merge calls that were made. Both of these printing methods are
//! quite verbose on purpose, so as to show accurately what is represented in
//! the IR of a fusion.
//
//! \sa IrTransformPrinter
//!
class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
public:
IrMathPrinter(std::ostream& os) : IrPrinter(os) {}
void handle(const Split* const) override {}
void handle(const Merge* const) override {}
void handle(const Swizzle2D* const) override {}
void handle(Fusion* f) override {
IrPrinter::handle(f);
}
};
//! Prints transformation (schedule) Fusion IR nodes
//!
//! \sa IrMathPrinter
//!
class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
public:
IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}
void handle(Fusion* f) override;
private:
void printTransforms(TensorView* tv);
};
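// A hypothetical usage sketch (illustrative only), assuming an active
// Fusion* fusion:
//
//   IrMathPrinter math_printer(std::cout);
//   math_printer.handle(fusion);        // tensor math only, no split/merge nodes
//
//   IrTransformPrinter transform_printer(std::cout);
//   transform_printer.handle(fusion);   // only the split/merge transforms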
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,341 +0,0 @@
#pragma once
#include <ir_all_nodes.h>
#include <type.h>
#include <iterator>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace ir_utils {
// Replace values in fusion using ValReplacementMutator
void replaceValue(
Fusion*,
const std::unordered_map<Val*, Val*>& replacement_map);
template <typename FilterType, typename Iterator>
class FilterIterator {
public:
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = FilterType*;
using pointer = value_type*;
using reference = value_type&;
FilterIterator(Iterator begin, Iterator end) : current_(begin), end_(end) {
advance();
}
FilterType* operator*() const {
return (*current_)->template as<FilterType>();
}
FilterType* operator->() const {
return (*this);
}
FilterIterator& operator++() {
++current_;
advance();
return *this;
}
FilterIterator operator++(int) {
const auto before_increment = *this;
++current_;
advance();
return before_increment;
}
bool operator==(const FilterIterator& other) const {
TORCH_INTERNAL_ASSERT(
end_ == other.end_,
"Comparing two FilteredViews that originate from different containers");
return current_ == other.current_;
}
bool operator!=(const FilterIterator& other) const {
return !(*this == other);
}
private:
void advance() {
current_ = std::find_if(current_, end_, [](const auto& val) {
return dynamic_cast<const FilterType*>(val) != nullptr;
});
}
private:
Iterator current_;
Iterator end_;
};
// An iterable view to a given container of Val pointers. Only returns
// Vals of a given Val type.
// NOTE: Add a non-const iterator if needed.
template <typename FilterType, typename InputIt>
class FilteredView {
public:
using value_type = FilterType*;
using const_iterator = FilterIterator<FilterType, InputIt>;
FilteredView(InputIt first, InputIt last) : input_it_(first), last_(last) {}
const_iterator cbegin() const {
return const_iterator(input_it_, last_);
}
const_iterator begin() const {
return cbegin();
}
const_iterator cend() const {
return const_iterator(last_, last_);
}
const_iterator end() const {
return cend();
}
bool empty() const {
return begin() == end();
}
std::vector<value_type> vector() const {
return std::vector<value_type>(begin(), end());
}
private:
const InputIt input_it_;
const InputIt last_;
};
template <typename FilterType, typename InputIt>
auto filterByType(InputIt first, InputIt last) {
return FilteredView<FilterType, InputIt>(first, last);
}
template <typename FilterType, typename ContainerType>
auto filterByType(const ContainerType&& inputs) = delete;
template <typename FilterType, typename ContainerType>
auto filterByType(const ContainerType& inputs) {
return filterByType<FilterType>(inputs.cbegin(), inputs.cend());
}
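// A hypothetical usage sketch (illustrative only): iterate only the
// TensorView entries of a container of Val*; any container exposing
// cbegin()/cend() works.
//
//   std::vector<Val*> vals = /* ... */;
//   for (TensorView* tv : filterByType<TensorView>(vals)) {
//     // only Vals that dynamic_cast to TensorView* are visited
//   }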
//! Returns a list of new-to-old mappings.
//!
//! This function canonicalizes the dimensions and validates that no two old
//! dimensions are mapped to the same new dimension.
std::vector<int64_t> normalizeNew2Old(
const std::vector<int64_t>& new2old_in,
size_t ndims);
//! Returns a list of new-to-old mappings.
//!
//! The input map does not need to be complete. Missing axes are
//! assumed not to be affected.
//!
//! This is used to preprocess broadcast and transpose arguments.
//!
//! Example: (N := ndims)
//! {{0, 1}} -> [1, 0, ...., N-1]
//! Transposes the first two axes with no other change.
//!
//! {{0, -1}} -> [N-1, ...., 0]
//! Swaps the first and last axes.
std::vector<int> normalizeOld2New(
const std::unordered_map<int, int>& old2new_in,
size_t ndims);
// Replace all uses of reference with substitute in expr. Return the Expr.
// Warning: Invalidates provided Expr.
// Warning: Removes connection of reference through provided Expr.
// Warning: Creates new Expr connecting substitute.
// Reference is found through direct pointer comparison.
Expr* replaceValInExpr(Expr* expr, Val* reference, Val* substitute);
//! Replace Vals in an index Val as specified by replacement_map while
//! cloning the given index Val. The index val is assumed to represent
//! a tensor index consisting of Ints and arithmetic expressions.
//!
//! This is similar to replaceValInExpr but is different as Vals are
//! cloned such that other exprs using the same leaf Vals are not
//! modified. TODO: Consider cleaning up the multiple replacement
//! routines.
Val* replaceValInIndexVal(
Val* index,
const std::unordered_map<Val*, Val*>& replacement_map);
// Makes rfactor generic with reduction ops and Welford
TORCH_CUDA_CU_API TensorView* rfactorHelper(
TensorView* red_tv,
const std::vector<int>& axes);
// Return immediate producers of val, this function can be used on any Val and
// will return producers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(Val* val);
// Return immediate consumers of val, this function can be used on any Val and
// will return consumers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(Val* val);
// Return immediate siblings of val, this function can be used on any Val and
// will return siblings through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> siblingValsOf(Val* val);
// Return immediate producers of vals, this function can be used on any vals and
// will return producers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(
const std::vector<Val*>& vals);
// Return immediate consumers of vals, this function can be used on any vals and
// will return consumers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(
const std::vector<Val*>& vals);
// Return immediate producers of tv, this function will return all immediate
// producers of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> producerTvsOf(TensorView* tv);
// Return immediate consumers of tv, this function will return all immediate
// consumers of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> consumerTvsOf(TensorView* tv);
// Return immediate siblings of tv, this function will return all immediate
// siblings of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> siblingTvsOf(TensorView* tv);
// Return immediate producers of tvs, this function will return all immediate
// producers of tvs through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> producerTvsOf(
const std::vector<TensorView*>& tvs);
// Return immediate consumers of tvs, this function will return all immediate
// consumers of tvs through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> consumerTvsOf(
const std::vector<TensorView*>& tvs);
// Returns producers of tv that are inputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> inputTvsOf(TensorView* tv);
// Returns consumers of tv that are outputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> outputTvsOf(TensorView* tv);
// Returns producers of tvs that are inputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> inputTvsOf(
std::vector<TensorView*> tvs);
// Returns consumers of tvs that are outputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> outputTvsOf(
std::vector<TensorView*> tvs);
// returns all tensor views in fusion that are used between outputs and inputs.
TORCH_CUDA_CU_API std::vector<TensorView*> allTvs(Fusion* fusion);
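// A hypothetical usage sketch (illustrative only) of the traversal helpers
// above, assuming tv is a TensorView in an active Fusion* fusion:
//
//   for (TensorView* producer : producerTvsOf(tv)) { /* ... */ }
//   for (TensorView* consumer : consumerTvsOf(tv)) { /* ... */ }
//   std::vector<TensorView*> every_tv = allTvs(fusion);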
// returns all tensor views in fusion that are used between outputs and inputs
// except the specified set.
TORCH_CUDA_CU_API std::vector<TensorView*> allTvsExcept(
Fusion* fusion,
const std::unordered_set<TensorView*>& except);
TORCH_CUDA_CU_API std::vector<Expr*> getReductionOps(
Fusion* fusion,
bool ignore_trivial = true);
// Returns the initialization value of tv or nullptr if not initialized.
TORCH_CUDA_CU_API Val* getReductionInitValOf(TensorView* tv);
// Returns if Expr is a reduction op
TORCH_CUDA_CU_API bool isReductionOp(const Expr*);
// Returns if Expr is a reduction op with TensorView or TensorIndex
TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*);
// Returns all non-trivial view operations. We shouldn't have trivial view
// operations, but this function simply makes sure that if we ever do, we
// don't pull them in.
TORCH_CUDA_CU_API std::vector<ViewOp*> getViewOps(Fusion*);
template <typename T>
std::string toString(const T& nodes) {
std::stringstream ss;
for (const Statement* stmt : nodes) {
if (ss.tellp() != 0) {
ss << ", ";
}
ss << stmt->toString();
}
return ss.str();
}
} // namespace ir_utils
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,869 +0,0 @@
#include <iter_visitor.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <type.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
/* ITER VISITOR */
namespace {
// Remove any stmt in stmts that is in visited
void remove_visited(
std::vector<Statement*>& stmts,
const std::unordered_set<Statement*>& visited) {
std::deque<std::vector<Statement*>::iterator> to_erase;
for (auto it = stmts.begin(); it != stmts.end(); it++) {
if (visited.find(*it) != visited.end()) {
to_erase.push_back(it);
}
}
while (!to_erase.empty()) {
stmts.erase(to_erase.back());
to_erase.pop_back();
}
}
class MemberStatements : public OptOutDispatch {
public:
// Return all members of the stmt if it's a Val. For expressions it returns
// nothing.
static std::vector<Statement*> next(Statement* stmt) {
MemberStatements find_next(stmt);
return find_next.next_stmts_;
}
private:
MemberStatements() = default;
MemberStatements(Statement* stmt) {
handle(stmt);
}
using OptOutDispatch::handle;
void handle(Val* val) final {
FusionGuard::getCurFusion()->assertInContainer(
val,
"IterVisitor.cpp::MemberStatements::handle(Val*) Cannot traverse val, ");
OptOutDispatch::handle(val);
}
void handle(IterDomain* stmt) final {
next_stmts_.push_back(stmt->start());
next_stmts_.push_back(stmt->extent());
next_stmts_.push_back(stmt->stopOffset());
}
void handle(TensorDomain* stmt) final {
next_stmts_.insert(
next_stmts_.end(), stmt->domain().begin(), stmt->domain().end());
}
void handle(TensorView* tv) final {
next_stmts_.push_back(tv->domain());
}
std::vector<Statement*> next_stmts_;
};
} // namespace
std::vector<Statement*> IterVisitor::next(Statement* stmt) {
if (stmt->isVal()) {
return next(stmt->as<Val>());
} else {
return next(stmt->as<Expr>());
}
}
std::vector<Statement*> IterVisitor::next(Val* v) {
FusionGuard::getCurFusion()->assertInContainer(v, "Cannot traverse val, ");
if (v->definition() != nullptr) {
return {v->definition()};
}
return {};
}
std::vector<Statement*> IterVisitor::next(Expr* expr) {
FusionGuard::getCurFusion()->assertInContainer(
expr, "Cannot traverse expr, ");
std::vector<Statement*> next_stmts{
expr->inputs().begin(), expr->inputs().end()};
return next_stmts;
}
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Statement* s) {
OptOutDispatch::handle(s);
}
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Expr* e) {
OptOutDispatch::handle(e);
}
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Val* v) {
OptOutDispatch::handle(v);
}
// Implementation details:
// We start with an entry in stmt_stack that is the outputs we want to
// process. We cannot process these outputs until all Stmts in their history
// have been processed, as those Stmts contain all dependencies to produce
// these values. What we will do is traverse towards inputs until we hit a
// leaf node. Once we hit a leaf node that node will be visited, then we will
// take it off the stack. Once a stack entry is empty, we know everything
// needed to visit stmt_stack.back().back() has been visited. We then visit
// that node, mark it as visited and remove it from the stack.
//
// To prevent traversing all paths through a DAG (unless we want to) we have a
// function to remove visited nodes from being re-added to the stack
// (remove_visited).
void IterVisitor::traverseBetween(
Fusion* fusion,
const std::unordered_set<Val*>& from,
const std::vector<Val*>& to,
bool traverse_all_paths,
bool traverse_into_members) {
FusionGuard fg(fusion);
std::unordered_set<Statement*> visited;
stmt_stack.clear();
stmt_stack.emplace_back(to.rbegin(), to.rend());
bool all_inputs_visited = false;
while (!stmt_stack.empty()) {
auto& current_inputs = stmt_stack.back();
// If current_inputs is empty, pop a level of the stmt_stack and mark the
// level we pop to as having all inputs processed; the layer we just processed
// contained all the inputs required for that Stmt.
if (current_inputs.empty()) {
stmt_stack.pop_back();
all_inputs_visited = true;
continue;
}
// Get the very last entry in the stack to process
const auto& stmt = current_inputs.back();
// If we just popped a stmt_stack level, we can finally visit it!
if (all_inputs_visited) {
// stmt may have already been visited.
if (traverse_all_paths || visited.find(stmt) == visited.end()) {
// Mark visited
visited.insert(stmt);
// Actually visit stmt
handle(stmt);
}
// Remove last value just visited
current_inputs.pop_back();
// Mark that we need to visit a new Stmt's.
all_inputs_visited = false;
} else {
// We're not ready to process this node, so add all its inputs to be
// checked. Visit input nodes.
std::vector<Statement*> next_stmts;
if ((stmt->isVal() && from.find(stmt->asVal()) == from.end()) ||
stmt->isExpr()) {
next_stmts = next(stmt);
}
if (traverse_into_members) {
auto members = MemberStatements::next(stmt);
next_stmts.insert(next_stmts.end(), members.begin(), members.end());
}
// We may want to retraverse nodes, in that case revisit everything!
if (!traverse_all_paths) {
// If we don't want to retraverse, remove nodes we already visited.
remove_visited(next_stmts, visited);
}
if (next_stmts.empty()) {
// If there's nothing to visit because it was all already visited, mark
// to process
all_inputs_visited = true;
} else {
// Add all these new stmts to visit to the stack.
stmt_stack.emplace_back(next_stmts.rbegin(), next_stmts.rend());
// We have new things to visit,
all_inputs_visited = false;
}
}
}
}
void IterVisitor::traverseTo(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_all_paths,
bool traverse_into_members) {
traverseBetween(fusion, {}, to, traverse_all_paths, traverse_into_members);
}
void IterVisitor::traverseHelper(Fusion* fusion, bool traverse_all_paths) {
FusionGuard fg(fusion);
auto term_val_outs = fusion->getTerminatingOutputs();
if (!term_val_outs.empty()) {
traverseTo(fusion, term_val_outs, traverse_all_paths);
}
}
void IterVisitor::traverse(Fusion* fusion) {
traverseHelper(fusion, false);
}
void IterVisitor::traverseAllPaths(Fusion* fusion) {
traverseHelper(fusion, true);
}
namespace {
// TODO: We also have InputsOf; we should pick one and remove the other.
class Inputs : public IterVisitor {
private:
//! Optional list of input vals. While traversing to inputs if a value in the
//! all_inputs list is found, that value will be added to the inputs_ and
//! traversal will not go into its definition. Otherwise traversal follows
//! definition paths until hitting a definition that is a nullptr (i.e. a
//! terminating input).
const std::vector<Val*>& all_inputs_;
std::vector<Val*> inputs_;
Inputs(const std::vector<Val*>& all_inputs) : all_inputs_(all_inputs) {}
std::vector<Statement*> next(Val* v) override {
if (std::find(inputs_.begin(), inputs_.end(), v) != inputs_.end()) {
return {};
}
return IterVisitor::next(v);
}
void handle(Val* val) override {
// If there's no definition to val, or val is created inside the fusion, or
// val is within the provided inputs
if (val->definition() == nullptr || val->definition()->inputs().empty() ||
std::find(all_inputs_.begin(), all_inputs_.end(), val) !=
all_inputs_.end()) {
// if not already placed in the inputs
if (std::find(inputs_.begin(), inputs_.end(), val) == inputs_.end()) {
inputs_.push_back(val);
}
}
}
public:
static std::vector<Val*> getInputs(
const std::vector<Val*>& of,
const std::vector<Val*>& all_inputs) {
if (of.empty()) {
return {};
}
Inputs inps(all_inputs);
inps.traverseTo(of[0]->fusion(), of);
return inps.inputs_;
}
};
} // namespace
std::vector<Val*> IterVisitor::getInputsTo(
const std::vector<Val*>& vals,
const std::vector<Val*>& inputs) {
return Inputs::getInputs(vals, inputs);
}
namespace {
class AllVals : public IterVisitor {
private:
std::unordered_set<Val*> vals;
void handle(Val* val) final {
vals.emplace(val);
}
public:
// Return all values in history of all values in from
static std::unordered_set<Val*> get(
Fusion* fusion,
const std::vector<Val*>& from) {
AllVals av;
av.traverseTo(fusion, from, false);
return av.vals;
}
};
} // namespace
/* BACKWARDS VISITOR */
std::vector<Statement*> BackwardVisitor::next(Statement* stmt) {
if (stmt->isVal()) {
return next(stmt->as<Val>());
} else if (stmt->isExpr()) {
return next(stmt->as<Expr>());
} else {
TORCH_INTERNAL_ASSERT(
false, "BackwardVisitor could not detect type in next_dispatch.");
}
}
std::vector<Statement*> BackwardVisitor::next(Expr* expr) {
return std::vector<Statement*>(
expr->outputs().begin(), expr->outputs().end());
}
std::vector<Statement*> BackwardVisitor::next(Val* val) {
// Going to sort based on relative topological position
std::map<size_t, Statement*> exprs;
for (auto expr : FusionGuard::getCurFusion()->unordered_uses(val)) {
// Make sure it's an expr we can traverse
if (traversal_exprs_.find(expr) != traversal_exprs_.end()) {
exprs[traversal_exprs_[expr]] = expr;
}
}
std::vector<Statement*> next_stmts(exprs.size());
std::transform(
exprs.begin(),
exprs.end(),
next_stmts.begin(),
[](std::pair<size_t, Statement*> pair) { return pair.second; });
return next_stmts;
}
void BackwardVisitor::handle(Statement* stmt) {
OptOutDispatch::handle(stmt);
}
void BackwardVisitor::handle(Expr* expr) {
OptOutDispatch::handle(expr);
}
void BackwardVisitor::handle(Val* val) {
OptOutDispatch::handle(val);
}
void BackwardVisitor::traverseTo(
Fusion* fusion,
const std::vector<Val*>& from,
bool traverseAllPaths) {
FusionGuard fg(fusion);
// Reset members
stmt_stack_.clear();
traversal_exprs_.clear();
if (from.empty()) {
return;
}
auto vals = AllVals::get(fusion, from);
auto exprs = StmtSort::getExprs(fusion, from);
{
size_t pos = 0;
for (auto expr : exprs)
traversal_exprs_[expr] = pos++;
}
// All stmts we've called handle on
std::unordered_set<Statement*> visited_stmts_;
if (must_cover_all_expr_outputs_) {
for (auto traversal_pair : traversal_exprs_) {
for (auto out : traversal_pair.first->outputs()) {
TORCH_INTERNAL_ASSERT(
vals.find(out) != vals.end(),
"Invalid backward traversal found. Some output paths were not provided:",
out);
}
}
}
auto inputs = InputsOf::getInputsTo(from);
stmt_stack_.emplace_back(inputs.begin(), inputs.end());
// The rest is basically copy-pasted from IterVisitor:
while (!stmt_stack_.empty()) {
auto next_stmts = next(stmt_stack_.back().back());
// Remove statements we already visited if we're not traversing all paths
if (!traverseAllPaths) {
remove_visited(next_stmts, visited_stmts_);
}
// Traverse down until we get to a leaf
while (!next_stmts.empty()) {
stmt_stack_.emplace_back(next_stmts.rbegin(), next_stmts.rend());
next_stmts = next(stmt_stack_.back().back());
// Remove statements we already visited if we're not traversing all paths
if (!traverseAllPaths) {
remove_visited(next_stmts, visited_stmts_);
}
}
// Traverse back up
// Mark visited
visited_stmts_.emplace(stmt_stack_.back().back());
// Handle
handle(stmt_stack_.back().back());
// Remove
stmt_stack_.back().pop_back();
while (!stmt_stack_.empty() && stmt_stack_.back().empty()) {
stmt_stack_.pop_back();
if (!stmt_stack_.empty()) {
// Mark visited
visited_stmts_.emplace(stmt_stack_.back().back());
// Handle
handle(stmt_stack_.back().back());
// Remove
stmt_stack_.back().pop_back();
}
}
}
}
/* DEPENDENCY CHECKING */
namespace {
// Looks for and returns all values in between dependencies and vals, including
// them.
struct Dependencies : public IterVisitor {
private:
//! A given set of dependency Vals
const std::unordered_set<Val*> dependencies_;
//! Vals that are found between dependencies_ and of. Topologically
//! ordered.
std::vector<Val*> vals_;
//! Exprs that are found between dependencies_ and of. Topologically
//! ordered.
std::vector<Expr*> exprs_;
//! A set version of vals_
std::unordered_set<Val*> dependent_vals_;
//! A set version of exprs_
std::unordered_set<Expr*> dependent_exprs_;
private:
std::vector<Statement*> next(Val* v) override {
if (dependencies_.find(v) != dependencies_.end()) {
return std::vector<Statement*>();
}
return IterVisitor::next(v);
}
void handle(Val* val) override {
// val is included if:
// 1. it is one of the dependencies, or
// 2. its defining expression is included in the dependent expr set
if (dependencies_.find(val) != dependencies_.end()) {
TORCH_INTERNAL_ASSERT(
dependent_vals_.find(val) == dependent_vals_.end(),
"Trying to add already added val: ",
val);
vals_.push_back(val);
dependent_vals_.insert(val);
} else {
auto def = val->definition();
if (def != nullptr &&
dependent_exprs_.find(def) != dependent_exprs_.end()) {
TORCH_INTERNAL_ASSERT(
dependent_vals_.find(val) == dependent_vals_.end(),
"Trying to add already added val: ",
val);
vals_.push_back(val);
dependent_vals_.insert(val);
}
}
}
void handle(Expr* expr) override {
// Track which exprs are dependent on the dependencies_ vals.
if (std::any_of(
expr->inputs().begin(), expr->inputs().end(), [&](Val* input_val) {
return dependent_vals_.find(input_val) != dependent_vals_.end();
})) {
if (!dependent_exprs_.count(expr)) {
exprs_.push_back(expr);
dependent_exprs_.insert(expr);
}
}
}
Dependencies(
std::unordered_set<Val*> _dependencies,
const std::vector<Val*>& of)
: dependencies_(std::move(_dependencies)) {
traverseTo(of[0]->fusion(), of, false);
};
public:
static std::vector<Val*> getAllVals(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
if (of.empty()) {
return {};
}
Dependencies deps(dependencies, of);
return deps.vals_;
}
static std::vector<Expr*> getAllExprs(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
if (of.empty()) {
return {};
}
Dependencies deps(dependencies, of);
return deps.exprs_;
}
};
// Looks for and returns all output values with dependencies on `of`.
struct FindOutputs : public IterVisitor {
const std::unordered_set<Val*>& of_;
std::unordered_set<Val*> outs_;
void handle(Val* val) override {
if (of_.find(val) != of_.end()) {
Statement* out_stmt = stmt_stack.front().back();
TORCH_INTERNAL_ASSERT(out_stmt->isVal());
auto out_val = out_stmt->as<Val>();
if (of_.find(out_val) == of_.end()) {
outs_.emplace(out_val);
}
}
}
// TODO: Simply traverse through uses from of. Would be a lot faster than
// tracing all paths like this.
FindOutputs(const std::unordered_set<Val*>& _of) : of_(_of) {
auto fusion = (*of_.begin())->fusion();
traverseTo(fusion, fusion->outputs(), true);
};
static std::unordered_set<Val*> getAllOutputsOf(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FindOutputs finder(of);
return finder.outs_;
}
};
// Looks for and returns all values that depend on `of`.
class DependentVals : public IterVisitor {
private:
// Which nodes to find dependencies of
const std::unordered_set<Val*>& of_;
// Dependencies we have so far
std::unordered_set<Val*> outs_;
// Boundary where we want to stop searching beyond
// TODO: Based on the todo below, shouldn't we stop just at the definition of?
// If we really wanted to make this traverse left, wouldn't we first check
// which outputs are outputs dependent on of?
std::unordered_set<Val*> boundary_;
std::vector<Statement*> next(Val* v) override {
if (boundary_.find(v) != boundary_.end())
return std::vector<Statement*>();
return IterVisitor::next(v);
}
void handle(Val* val) override {
if (val->isFusionInput() || val->definition() == nullptr ||
of_.count(val) || outs_.count(val)) {
return;
}
for (auto v : val->definition()->inputs()) {
if (of_.count(v) || outs_.count(v)) {
outs_.emplace(val);
return;
}
}
}
// optimization to limit search path
// TODO: Is this valid? Couldn't something like:
// out0 = of + val0
// out1 = out0 + val1
// out2 = TernaryOp(out1, val0, of)
// Hide the dep of out1 on of?
void createBoundary() {
for (auto v_of : of_) {
for (auto v_expr : v_of->uses()) {
for (auto v_in : v_expr->inputs()) {
boundary_.emplace(v_in);
}
}
}
}
DependentVals(const std::unordered_set<Val*>& _of) : of_(_of) {
createBoundary();
auto fusion = (*of_.begin())->fusion();
traverseTo(fusion, fusion->outputs(), false);
};
public:
static std::unordered_set<Val*> getAllDependentVals(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
DependentVals dependencies(of);
return dependencies.outs_;
}
};
class DependencyChains : public IterVisitor {
public:
std::deque<std::deque<Val*>> dep_chains;
bool is_dependency = false;
std::unordered_set<Val*> dependencies_;
void handle(Val* val) override {
if (dependencies_.find(val) != dependencies_.end()) {
is_dependency = true;
std::deque<Val*> deps;
for (auto stack : stmt_stack) {
if (stack.back()->isVal()) {
deps.push_back(stack.back()->as<Val>());
}
}
// Order as dependency -> of
dep_chains.emplace_back(deps.rbegin(), deps.rend());
}
}
DependencyChains(Val* _dependency, Val* _of, bool all_chains_ = false)
: dependencies_({_dependency}) {
traverseTo(_of->fusion(), {_of}, all_chains_);
}
DependencyChains(Val* _dependency, bool all_chains_ = false)
: dependencies_({_dependency}) {
if (all_chains_) {
traverseAllPaths(_dependency->fusion());
} else {
traverse(_dependency->fusion());
}
}
DependencyChains(
std::unordered_set<Val*> _dependencies,
bool all_chains_ = false)
: dependencies_(std::move(_dependencies)) {
if (dependencies_.empty()) {
return;
}
if (all_chains_) {
traverseAllPaths((*dependencies_.begin())->fusion());
} else {
traverse((*dependencies_.begin())->fusion());
}
}
static std::deque<Val*> getDependencyChain(Val* dependency, Val* of) {
DependencyChains dp(dependency, of, false);
if (dp.dep_chains.empty()) {
return std::deque<Val*>();
}
return dp.dep_chains[0];
}
// I don't think this is actually hooked up, but leaving for now.
static std::deque<std::deque<Val*>> getDependencyChains(
Val* dependency,
Val* of) {
DependencyChains dp(dependency, of, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
static std::deque<std::deque<Val*>> getAllUseChains(Val* dependency) {
DependencyChains dp(dependency, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
static std::deque<std::deque<Val*>> getAllUseChains(
const std::unordered_set<Val*>& dependencies) {
DependencyChains dp(dependencies, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
};
} // namespace
bool DependencyCheck::isDependencyOf(Val* dependency, Val* of) {
return !DependencyChains::getDependencyChain(dependency, of).empty();
}
std::deque<Val*> DependencyCheck::getSingleDependencyChain(
Val* dependency,
Val* of) {
return DependencyChains::getDependencyChain(dependency, of);
}
std::deque<std::deque<Val*>> DependencyCheck::getAllDependencyChains(
Val* dependency,
Val* of) {
return DependencyChains::getDependencyChains(dependency, of);
}
std::deque<std::deque<Val*>> DependencyCheck::getAllUseChains(Val* producer) {
return DependencyChains::getAllUseChains(producer);
}
std::vector<Val*> DependencyCheck::getAllValsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
return Dependencies::getAllVals(dependencies, of);
}
std::vector<Expr*> DependencyCheck::getAllExprsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
return Dependencies::getAllExprs(dependencies, of);
}
std::unordered_set<Val*> DependencyCheck::getAllOutputsOf(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FusionGuard fg((*of.begin())->fusion());
return FindOutputs::getAllOutputsOf(of);
}
std::unordered_set<Val*> DependencyCheck::getAllDependentVals(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FusionGuard fg((*of.begin())->fusion());
return DependentVals::getAllDependentVals(of);
}
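// A hypothetical usage sketch (illustrative only) of the DependencyCheck API
// defined above, assuming val_a and val_b belong to an active Fusion:
//
//   if (DependencyCheck::isDependencyOf(val_a, val_b)) {
//     // ordered from val_a (the dependency) towards val_b
//     auto chain = DependencyCheck::getSingleDependencyChain(val_a, val_b);
//   }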
void StmtSort::handle(Statement* stmt) {
stmts.push_back(stmt);
}
std::vector<Expr*> StmtSort::getExprs(Fusion* fusion, bool traverse_members) {
auto terminating_outputs = fusion->getTerminatingOutputs();
return StmtSort::getExprs(fusion, terminating_outputs, traverse_members);
}
std::vector<Expr*> StmtSort::getExprs(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members) {
auto stmts = StmtSort::getStmts(fusion, to, traverse_members);
auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
std::vector<Expr*> exprs(filter.begin(), filter.end());
return exprs;
}
std::vector<Expr*> StmtSort::getExprsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members) {
auto stmts = StmtSort::getStmtsBetween(fusion, from, to, traverse_members);
auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
std::vector<Expr*> exprs(filter.begin(), filter.end());
return exprs;
}
std::vector<Statement*> StmtSort::getStmts(
Fusion* fusion,
bool traverse_members) {
auto terminating_outputs = fusion->getTerminatingOutputs();
return StmtSort::getStmts(fusion, terminating_outputs, traverse_members);
}
std::vector<Statement*> StmtSort::getStmts(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members) {
StmtSort es;
es.traverseTo(fusion, to, false, traverse_members);
return es.stmts;
}
std::vector<Statement*> StmtSort::getStmtsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members) {
StmtSort es;
es.traverseBetween(
fusion, {from.begin(), from.end()}, to, false, traverse_members);
return es.stmts;
}
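// A hypothetical usage sketch (illustrative only) of StmtSort, assuming an
// active Fusion* fusion:
//
//   auto exprs = StmtSort::getExprs(fusion);  // topologically ordered Exprs
//   auto stmts = StmtSort::getStmts(fusion, /*traverse_members=*/true);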
void InputsOf::handle(Val* v) {
if (v->definition() == nullptr || v->definition()->inputs().empty()) {
if (grabbed_inputs.emplace(v).second) {
ordered_inputs.push_back(v);
}
}
}
std::vector<Val*> InputsOf::output(Fusion* fusion, Val* output_) {
return outputs(fusion, {output_});
}
std::vector<Val*> InputsOf::outputs(
Fusion* fusion,
const std::vector<Val*>& outputs_) {
InputsOf io;
io.traverseTo(fusion, outputs_, false);
return io.ordered_inputs;
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,349 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <type.h>
#include <deque>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class Fusion;
class Statement;
class Expr;
class Val;
/*
* IterVisitor starts from leaf nodes, fusion outputs, or the provided values.
* It walks the DAG backwards from the starting nodes to roots. Each node in
* the DAG will be called with handle(Statement*) in topological order from
* inputs of the fusion to outputs of the fusion.
*
* TODO: We may want a BFS version of this code to extract ILP, not implemented
* yet.
*
* TODO: We may want to have ordering of outputs to inputs. I'm not sure why we
* would want this, but seems like it would be a reasonable request.
*/
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
public:
~IterVisitor() override = default;
IterVisitor() = default;
IterVisitor(const IterVisitor& other) = default;
IterVisitor& operator=(const IterVisitor& other) = default;
IterVisitor(IterVisitor&& other) = default;
IterVisitor& operator=(IterVisitor&& other) = default;
protected:
// Functions return nodes in reverse order to be added to the to_visit queue
// These functions will start at outputs and propagate up through the DAG
// to inputs based on depth first traversal. Next could be called on a node
// multiple times.
virtual std::vector<Statement*> next(Statement* stmt);
virtual std::vector<Statement*> next(Val* v);
virtual std::vector<Statement*> next(Expr* expr);
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
void handle(Statement* s) override;
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
void handle(Expr* e) override;
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
void handle(Val* v) override;
// The entire stack during traversal. stmt_stack.back().back() is the node
// that is being called in handle(). stmt_stack.back() contains siblings (not
// guaranteed to be all siblings throughout traversal). stmt_stack.front()
// contains the outputs we started with (not guaranteed to be all outputs
// throughout traversal).
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
std::vector<std::vector<Statement*>> stmt_stack;
void traverseHelper(Fusion* fusion, bool traverse_all_paths = false);
public:
//! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
//! from inputs towards outputs.
//! \param traverseAllPaths = false only call handle on each Statement* once
//! traverseAllPaths = true traverses all paths between expressions/values.
//! Calls handle on a Statement* for every path from inputs to "to".
//! \param traverseIntoMembers = When hitting nodes like TensorView,
//! TensorDomain, or IterDomain where there are members of the nodes that are
//! Val's a value of "true" will also traverse into those member Val's, a
//! value of "false" will not traverse into the members.
void traverseTo(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_all_paths = false,
bool traverse_into_members = false);
//! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
//! from inputs towards outputs.
//! \param traverseAllPaths = false only call handle on each Statement* once
//! traverseAllPaths = true traverses all paths between expressions/values.
//! Calls handle on a Statement* for every path from inputs to "to".
//! \param traverseIntoMembers = When hitting nodes like TensorView,
//! TensorDomain, or IterDomain where there are members of the nodes that are
//! Val's a value of "true" will also traverse into those member Val's, a
//! value of "false" will not traverse into the members.
//! \param from: Specified values to start traversing. If a "from" Val is not
//! on a path from inputs to a "to" node, it will not be visited. If there's a
//! path from inputs to "to" that doesn't go through "from", that input and the
//! path from it will also be traversed.
void traverseBetween(
Fusion* fusion,
const std::unordered_set<Val*>& from,
const std::vector<Val*>& to,
bool traverse_all_paths = false,
bool traverse_into_members = false);
// Iterates from terminating outputs registered with the fusion. Terminating
// means the value is not used to generate any other value used in producing
// registered outputs.
void traverse(Fusion* fusion);
// Same as traverse but it traverses every edge, meaning it will traverse
// values more than once.
void traverseAllPaths(Fusion* fusion);
//! Get inputs to vals. Possible input vals can be optionally
//! given. If not, vals with no producers are returned.
//
// TODO: This doesn't seem to fit with IterVisitor. Should probably be moved
// out of the class.
static std::vector<Val*> getInputsTo(
const std::vector<Val*>& vals,
const std::vector<Val*>& inputs = {});
};
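// Usage sketch (illustrative only; assumes a populated Fusion named `fusion`
// is in scope): a subclass overrides the protected handle(Val*) hook and then
// calls the public traverse(), which fires the handle functions in topological
// order from inputs towards the fusion's terminating outputs.
//
//   class CollectVals : public IterVisitor {
//    public:
//     std::vector<Val*> vals;
//    protected:
//     using IterVisitor::handle;
//     void handle(Val* v) override {
//       vals.push_back(v); // called once per Val on the default single-path walk
//     }
//   };
//
//   CollectVals collector;
//   collector.traverse(&fusion);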
/*
* Backward visitor: calls handle in reverse order, from outputs to inputs.
* It would be really nice to unify this with IterVisitor; however,
* the challenge there is that we specify traversal from outputs towards inputs
* because it implicitly provides DCE. However, if users are not careful, they
* could miss necessary outputs to do a backward traversal.
*
* BackwardVisitor checks that all outputs of an Expr are visited before visiting
* the Expr. If we don't provide nodes to start from on all backward paths of
* those outputs, we will never visit the Expr.
*
* The first step of BackwardVisitor is to make sure we've specified enough
* outputs to guarantee that we will traverse all outputs of all exprs during
* the backward traversal. In cases where we don't require visiting all
* outputs of some exprs (an example being the `N` output of welford ops),
* `must_cover_all_expr_outputs` is added to disable the check, and in
* this case the visitor pass needs to be aware that:
* 1. Exprs with any output that has a use chain that ends with a final
* consumer in the `from` list `will be` visited.
* 2. Vals that don't have a use chain that ends with a final
* consumer in the `from` list `will not be` visited, even though their
* definition expr might be visited. An example: if the `N` output
* of a welford op is unused but other outputs are, the welford op
* will be visited but the `N` output will not.
*
*/
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API BackwardVisitor : public OptOutDispatch {
protected:
virtual ~BackwardVisitor() override = default;
BackwardVisitor(bool must_cover_all_expr_outputs = true)
: must_cover_all_expr_outputs_(must_cover_all_expr_outputs) {}
BackwardVisitor(const BackwardVisitor& other) = default;
BackwardVisitor& operator=(const BackwardVisitor& other) = default;
BackwardVisitor(BackwardVisitor&& other) = default;
BackwardVisitor& operator=(BackwardVisitor&& other) = default;
// Functions return nodes in reverse order to be added to the to_visit queue.
// These functions will start at outputs and propagate up through the DAG
// to inputs based on depth first traversal. Next could be called on a node
// multiple times.
virtual std::vector<Statement*> next(Statement* stmt);
virtual std::vector<Statement*> next(Expr* expr);
virtual std::vector<Statement*> next(Val* val);
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Statement* stmt) override;
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Expr* expr) override;
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Val* val) override;
// All exprs that need to be visited in this traversal. Labeled in topological
// order (size_t).
std::unordered_map<Expr*, size_t> traversal_exprs_;
// The entire stack during traversal. stmt_stack.back().back() is the node
// that is being called in handle(). stmt_stack.back() contains siblings (not
// guaranteed to be all siblings throughout traversal). stmt_stack.front()
// contains the inputs we started with (not guaranteed to be all outputs
// throughout traversal).
std::deque<std::deque<Statement*>> stmt_stack_;
// Starts at nodes provided in from, traverses from these nodes to inputs.
// Calls handle on all Statement*s in topologically sorted order.
// traverseAllPaths = false: only call handle on each Statement* once.
// traverseAllPaths = true: traverses all paths from nodes in from to inputs;
// calls handle on a Statement* for every path from "from" nodes to inputs.
void traverseTo(
Fusion* fusion,
const std::vector<Val*>& from,
bool traverseAllPaths = false);
bool must_cover_all_expr_outputs_ = true;
};
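// Usage sketch (illustrative only; assumes a Fusion named `fusion` whose
// registered outputs cover all backward paths, e.g. passing Fusion::outputs()
// as the starting points):
//
//   class CountBackwardExprs : public BackwardVisitor {
//    public:
//     size_t visited_exprs = 0;
//     void run(Fusion* fusion) {
//       traverseTo(fusion, fusion->outputs());
//     }
//    protected:
//     using BackwardVisitor::handle;
//     void handle(Expr* /*expr*/) override {
//       ++visited_exprs;
//     }
//   };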
class TORCH_CUDA_CU_API DependencyCheck {
public:
// Returns if "dependency" is a dependency of "of".
static bool isDependencyOf(Val* dependency, Val* of);
// Finds a Val* path from "of" to "dependency". Returns that path.
// deque.back() is "of", deque[0] is dependency if a chain exists.
static std::deque<Val*> getSingleDependencyChain(Val* dependency, Val* of);
// Finds all Val* paths from "of" to "dependency". Returns those paths.
// deque[i].back() is "of", and deque[i][0] is "dependency". Returns an
// empty deque if no dependency found.
static std::deque<std::deque<Val*>> getAllDependencyChains(
Val* dependency,
Val* of);
// Finds all Val* paths from all leaf nodes to "dependency". Returns those
// paths. deque[i].back() are leaf nodes, and deque[i][0] is "dependency".
// Returns an empty deque if there are no uses of dependency found.
static std::deque<std::deque<Val*>> getAllUseChains(Val* dependency);
// Grab all values that exist between and including provided
// vals. Returned values are topologically ordered and unique.
static std::vector<Val*> getAllValsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of);
// Returns all dependent exprs that exist between
// the provided vals
static std::vector<Expr*> getAllExprsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of);
// Return registered outputs of the fusion that are a dependency of any val of
static std::unordered_set<Val*> getAllOutputsOf(
const std::unordered_set<Val*>& of);
// Return all Vals that depend on the given Vals
static std::unordered_set<Val*> getAllDependentVals(
const std::unordered_set<Val*>& of);
};
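// Usage sketch (illustrative only; `producer` and `consumer` are assumed to be
// Val*s registered with the same Fusion):
//
//   if (DependencyCheck::isDependencyOf(producer, consumer)) {
//     // chain.front() is `producer`, chain.back() is `consumer`
//     std::deque<Val*> chain =
//         DependencyCheck::getSingleDependencyChain(producer, consumer);
//   }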
// Expr sort will take a fusion and return a topologically sorted list of
// expressions.
class StmtSort : public IterVisitor {
protected:
StmtSort() = default;
std::vector<Statement*> stmts;
void handle(Statement* stmt) override;
public:
// If traverse_members is true, it will also extract all member nodes in the
// sorted statement list in the fusion, i.e. all IterDomains, extents, and
// their associated expressions.
static std::vector<Statement*> getStmts(
Fusion* fusion,
bool traverse_members = false);
// Returns ordered Statements required to produce "to", including "to".
static std::vector<Statement*> getStmts(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members = false);
// Returns ordered Statements required to produce from, including from.
// Stops traversal once hitting any Statements in to. Includes Statements in
// to.
//
// Warning: this doesn't necessarily prevent statements before `to` from being
// returned. e.g.
// i1 = i0
// i2 = i1
// i3 = i2
// i4 = i3 + i1
// getExprs(fusion, {i4}, {i3})
// will return the definition and values {i0, i1, i4}
// i3 is dependent on i1, but since i4 is as well, the traversal will go down
// the i4->i1->i0 path, even though the i4->i3-//>i2->i1 path is blocked.
//
// If traverse_members is true, it will also extract all member nodes in the
// sorted expr list in the fusion, i.e. all expressions on IterDomains,
// extents, etc.
static std::vector<Statement*> getStmtsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprs(
Fusion* fusion,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprs(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members = false);
};
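// Usage sketch (illustrative only; assumes a populated Fusion `fusion` and a
// Val* `out` registered with it):
//
//   // Every Expr in the fusion, topologically sorted.
//   std::vector<Expr*> all_exprs = StmtSort::getExprs(&fusion);
//   // Only the Exprs needed to produce `out`.
//   std::vector<Expr*> out_exprs = StmtSort::getExprs(&fusion, {out});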
class InputsOf : public IterVisitor {
private:
std::unordered_set<Val*> grabbed_inputs;
std::vector<Val*> ordered_inputs;
void handle(Val* v) final;
public:
static std::vector<Val*> output(Fusion* fusion, Val* output_);
static std::vector<Val*> outputs(
Fusion* fusion,
const std::vector<Val*>& outputs_);
};
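// Usage sketch (illustrative only; `out` is assumed to be an output Val* of
// `fusion`): collects the input Vals that `out` transitively depends on.
//
//   std::vector<Val*> input_vals = InputsOf::output(&fusion, out);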
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,428 +0,0 @@
#include <instrumentation.h>
#include <ir_iostream.h>
#include <kernel.h>
#include <kernel_expr_evaluator.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>
#include <ATen/cuda/CUDAContext.h>
#include <iostream>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container)
: ir_container_(ir_container) {}
namespace kir {
namespace {
//! Scan all primary expressions in the Kernel IR and build
//! lists of specialized nodes and other interesting information
class KernelIrScanner : private IrVisitor {
public:
explicit KernelIrScanner(const Kernel* kernel) {
IrVisitor::handle(kernel->topLevelExprs());
const auto gpu_lower = GpuLower::current();
for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) {
auto extent = split->in()->extent();
auto factor = split->factor();
summary_.splits_to_validate.emplace_back(extent, factor);
}
}
const auto& summary() const {
return summary_;
}
private:
using IrVisitor::handle;
void handle(Expr* expr) final {
IrVisitor::handle(expr);
for (auto inp : expr->inputs()) {
handle(inp);
}
for (auto out : expr->outputs()) {
handle(out);
}
}
void handle(BlockSync* sync) final {
// TODO: Move to a dedicated validation pass
// which is not on the common execution/compilation path
if (sync->isWarHazardSync()) {
++summary_.war_hazard_syncs_count;
}
}
void handle(GridSync* sync) final {
summary_.has_cooperative_grid_reduction = true;
}
void handle(Allocate* allocate) final {
switch (allocate->memoryType()) {
case MemoryType::Global:
summary_.global_allocations.push_back(allocate);
break;
case MemoryType::Shared:
summary_.dynamic_smem_allocations.push_back(allocate);
break;
case MemoryType::Local:
if (!ExpressionEvaluator::isConst(allocate->size())) {
summary_.has_dynamic_local_memory_allocations = true;
summary_.dynamic_lmem_allocations.emplace_back(allocate);
}
break;
}
}
void handle(RNGOp* rng_op) final {
summary_.max_rng_offsets =
std::max<int>(summary_.max_rng_offsets, rng_op->getRNGOffset());
}
void handle(TensorIndex* tensor_index) final {
const auto tv = tensor_index->view();
const auto domain = tv->domain();
// Do we have any reductions?
summary_.has_block_reductions =
summary_.has_block_reductions || domain->hasBlockReduction();
// Update the largest smem data type
if (domain->hasBlockReduction() || domain->hasGridReduction() ||
tv->getMemoryType() == MemoryType::Shared) {
const auto data_type = tv->dtype();
const size_t type_size = dataTypeSize(data_type);
if (type_size > max_smem_type_size_) {
max_smem_type_size_ = type_size;
summary_.largest_smem_data_type = data_type;
}
}
}
void handle(WelfordOp* welford_op) final {
summary_.has_welford = true;
TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA<TensorIndex>());
auto out_dom = welford_op->outAvg()->as<TensorIndex>()->view()->domain();
summary_.has_block_welford =
summary_.has_block_welford || out_dom->hasBlockReduction();
}
void handle(GridWelford* grid_welford) final {
summary_.has_welford = true;
summary_.has_grid_welford = true;
summary_.has_grid_reductions = true;
if (grid_welford->welford_op()->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GridReduction* grid_reduction) final {
summary_.has_grid_reductions = true;
if (grid_reduction->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GroupedGridReduction* grid_reduction) final {
summary_.has_grid_reductions = true;
if (grid_reduction->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GroupedGridWelford* grid_welford) final {
summary_.has_welford = true;
summary_.has_grid_welford = true;
summary_.has_grid_reductions = true;
if (grid_welford->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GridBroadcast* grid_broadcast) final {
summary_.has_cooperative_grid_reduction = true;
handle(grid_broadcast->broadcast_op());
}
void handle(BroadcastOp* bop) final {
const ParallelTypeBitmap parallel_types =
GpuLower::current()->threadPredMap().getParallelBroadcastDomains(
bop->out()->as<TensorIndex>()->view());
summary_.broadcast_parallel_types.emplace(bop, parallel_types);
// Do we have block broadcasts?
summary_.has_block_broadcasts =
summary_.has_block_broadcasts || parallel_types.hasTID();
// Do we have grid broadcasts?
summary_.has_grid_broadcasts =
summary_.has_grid_broadcasts || parallel_types.hasBID();
}
private:
size_t max_smem_type_size_ = 0;
KernelSummary summary_;
};
//! Make sure tensors have valid allocations even when parallelized
//! loops potentially have larger iteration counts than the number of
//! threads.
//!
//! When an IterDomain of a tensor is parallelized, the IterDomain
//! may not contribute to the allocation of the tensor. For example,
//! it is assumed that an allocation of a local-memory tensor does not
//! need to account for a parallelized IterDomain. This is true
//! when it is guaranteed that each thread only needs to execute the
//! loop body once. However, if not, the allocation is invalid as it
//! only has space for one value per thread.
//!
//! ValidateAllocation checks all tensor allocations and sees if any
//! tensor may have a parallelized loop whose iteration count may
//! be larger than the number of threads. If so, an error is thrown if
//! the tensor is not allocated on thread-shared memories. Note that
//! when allocated on a shared memory (i.e., MemoryType::Shared or
//! MemoryType::Global for tensors parallelized with threadIdx, or
//! MemoryType::Global for tensors parallelized with blockIdx), it is
//! assumed that allocation is properly extended for the iteration
//! count.
class ValidateAllocation : private OptOutConstDispatch {
public:
static void validate(const Kernel* kernel) {
ValidateAllocation validate_allocation(kernel);
}
private:
explicit ValidateAllocation(const Kernel* kernel) {
live_allocations_.emplace_back(std::vector<const Allocate*>());
for (const auto& expr : kernel->topLevelExprs()) {
OptOutConstDispatch::handle(expr);
}
live_allocations_.pop_back();
TORCH_INTERNAL_ASSERT(live_allocations_.empty());
}
void handle(const Allocate* allocate) final {
TORCH_INTERNAL_ASSERT(!live_allocations_.empty());
live_allocations_.back().push_back(allocate);
}
// for_loop is parallelized and its stop value is not guaranteed to
// be <= the number of threads, which breaks an assumption made
// during allocation lowering if it's thread-parallel and not
// allocated on shared or global memories, or if it's block-parallel
// and not allocated on global memory.
void validate(const ForLoop* for_loop) {
const auto loop_id = for_loop->iter_domain();
for (const auto& allocations : live_allocations_) {
for (const auto& allocate : allocations) {
const auto tv = dynamic_cast<TensorView*>(allocate->buffer());
if (tv == nullptr) {
continue;
}
for (const auto& axis : tv->domain()->domain()) {
if (!GpuLower::current()->caMap()->areMapped(
loop_id, axis, IdMappingMode::LOOP)) {
continue;
}
if (isParallelTypeThreadDim(loop_id->getParallelType())) {
TORCH_INTERNAL_ASSERT(
tv->getMemoryType() == MemoryType::Shared ||
tv->getMemoryType() == MemoryType::Global,
"Tensor t",
tv->name(),
" must be allocated on SMEM or GMEM.");
} else if (isParallelTypeBlockDim(loop_id->getParallelType())) {
TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global);
}
}
}
}
}
void handle(const ForLoop* for_loop) final {
if (for_loop->stop() != for_loop->iter_domain()->extent() &&
isParallelTypeThread(for_loop->iter_domain()->getParallelType())) {
validate(for_loop);
}
live_allocations_.emplace_back(std::vector<const Allocate*>());
for (const auto& expr : for_loop->body().exprs()) {
OptOutConstDispatch::handle(expr);
}
live_allocations_.pop_back();
}
void handle(const IfThenElse* ite) final {
for (const auto& expr : ite->thenBody().exprs()) {
OptOutConstDispatch::handle(expr);
}
for (const auto& expr : ite->elseBody().exprs()) {
OptOutConstDispatch::handle(expr);
}
}
private:
std::vector<std::vector<const Allocate*>> live_allocations_;
};
} // namespace
// TODO(kir): Kernel IR validation
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
profile_ = GpuLower::current()->profile();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses();
summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo();
summary_.sync_map = GpuLower::current()->syncMap();
summary_.parallel_dimension_map_ =
GpuLower::current()->parallelDimensionMap();
}
void Kernel::analyze() {
FUSER_PERF_SCOPE("Kernel::analyze");
const KernelIrScanner ir_scanner(this);
summary_ = ir_scanner.summary();
}
void Kernel::print() const {
IrPrinter ir_printer(std::cout);
ir_printer.handle(this);
}
//! Register the Val with this fusion
void Kernel::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
if (val->kernel()) {
TORCH_CHECK(
val->kernel() == this,
val->toString(),
" was not found in the active kernel.");
}
Fusion::registerVal(val);
}
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void Kernel::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
if (expr->kernel()) {
TORCH_CHECK(
expr->kernel() == this,
expr->toString(),
" was not found in the active kernel.");
}
for (Val* input : expr->inputs()) {
TORCH_INTERNAL_ASSERT(
inContainer(input),
"Input\n",
input->toString(),
" to expr,\n",
expr->toString(),
",\n is invalid because it is not in the same kernel.");
}
for (Val* output : expr->outputs()) {
TORCH_INTERNAL_ASSERT(
inContainer(output),
"Output\n",
output->toString(),
" to expr,\n",
expr->toString(),
",\n is invalid because it is not in the same kernel.");
}
// Registering an expr is explicitly non-SSA when coming from a kernel. This is
// detected inside Fusion::registerExpr.
Fusion::registerExpr(expr);
}
std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
return kernel_->top_level_exprs_;
}
void KernelPerformanceProfile::registerExpr(const Expr* expr) {
if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
return;
}
auto slot = getNewIndex();
expr_entry_map_.emplace(expr, slot);
}
int KernelPerformanceProfile::getNewIndex() {
return num_profile_entries_++;
}
bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
return expr_entry_map_.find(expr) != expr_entry_map_.end();
}
c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
auto it = expr_entry_map_.find(expr);
if (it == expr_entry_map_.end()) {
return c10::optional<int>();
} else {
return it->second;
}
}
std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
const Expr* expr) const {
TORCH_INTERNAL_ASSERT(
isProfiled(expr), "Not a profiled expression: ", expr->toString());
int cycle_index = getIndex(expr).value() * 2;
int count_index = cycle_index + 1;
return {cycle_index, count_index};
}
std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
std::stringstream ss;
ss << "Kernel performance profile:\n";
if (!buffer.defined()) {
ss << "No profile found\n";
return ss.str();
}
double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;
ss << std::setprecision(3) << std::fixed;
for (const auto& kv : expr_entry_map_) {
auto expr = kv.first;
auto index = kv.second;
auto out_tv = ir_utils::getTvOutput(expr);
double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
auto count = buffer[index][1].item<int64_t>();
auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
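// clockRate is reported in kHz, i.e. cycles per millisecond, so dividing by it
// yields milliseconds; the factor of 1000 below converts that to microseconds.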
auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
<< us_per_call << " us, " << count << "\n";
}
return ss.str();
}
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,257 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <fusion.h>
#include <ir_base_nodes.h>
#include <ir_builder.h>
#include <lower_sync_information.h>
#include <lower_warp_reduce.h>
#include <parallel_dimension_map.h>
#include <utils.h>
#include <vectorization_info.h>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
//! Summary of interesting facts about the kernel
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
//! Count of WAR (write-after-read) hazard barriers
int war_hazard_syncs_count = 0;
//! List of global buffers
std::vector<const kir::Allocate*> global_allocations;
//! List of dynamic shared memory buffers
std::vector<const kir::Allocate*> dynamic_smem_allocations;
//! List of static shared memory buffers
std::vector<const kir::Allocate*> static_smem_allocations;
//! Largest RNG offset seen; indicates the need to generate random numbers
int max_rng_offsets = -1;
//! Do we have any block reductions?
bool has_block_reductions = false;
//! Do we have any grid reductions?
bool has_grid_reductions = false;
//! Do we have any grid reduction in a loop, or grid reductions dependent on
//! grid reductions
bool has_cooperative_grid_reduction = false;
//! Do we have any block broadcasts?
bool has_block_broadcasts = false;
//! Do we have any grid broadcasts?
bool has_grid_broadcasts = false;
//! Do we have any welford op?
bool has_welford = false;
//! Do we have any block welford ops?
bool has_block_welford = false;
//! Do we have any grid welford ops?
bool has_grid_welford = false;
//! Largest shared memory buffer base type
DataType largest_smem_data_type = DataType::Null;
//! Do we have allocations of dynamic local memory?
bool has_dynamic_local_memory_allocations = false;
//! List of dynamic local memory buffers.
//! Only used for debugging.
std::vector<const kir::Allocate*> dynamic_lmem_allocations;
//! ceilDiv extents that must be divisible
std::vector<std::pair<const Val*, const Val*>> splits_to_validate;
//! Effective ParallelTypes of broadcast ops
std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
broadcast_parallel_types;
//! Track which tensor views are inputs or outputs of a vectorized operation
//! and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses;
// Sync map is needed to figure out if global memory buffers need to be marked
// as volatile because they're used for communication.
SyncMap sync_map;
// Parallel dimension map needed to set the correct properties of grid buffers
// (is a dim inactive)
ParallelDimensionMap parallel_dimension_map_;
//! Track information on vectorized set operations for runtime validation
std::vector<VectorizedSetInfo> vectorized_set_info;
};
class TORCH_CUDA_CU_API KernelPerformanceProfile {
public:
//! Register an expression to profile
void registerExpr(const Expr* expr);
//! Query if an expression is profiled
bool isProfiled(const Expr* expr) const;
//! Get the number of profiled expressions
int getNumberOfProfileEntries() const {
return num_profile_entries_;
}
//! Set the backing buffer of profile.
void setBuffer(TensorView* buffer) {
buffer_ = buffer;
}
//! Get the backing buffer
TensorView* getBuffer() const {
return buffer_;
}
//! Get the indices of the profile of an expression in the backing buffer
std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;
std::string toString(const at::Tensor& buffer) const;
private:
//! Get the new profile index
int getNewIndex();
//! Get the profile index
c10::optional<int> getIndex(const Expr* expr) const;
private:
int num_profile_entries_ = 0;
//! Backing buffer of Nx2 integer tensor, where N is the number of profiled
//! regions. Each region has two integer values, one representing
//! the cycles spent, and another the count.
TensorView* buffer_ = nullptr;
//! Map profiled expressions to profile entry offsets
std::unordered_map<const Expr*, int> expr_entry_map_;
// TODO: Allow profiling of ForLoops
//! Map profiled ForLoop to profile entry offsets
// std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};
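// Usage sketch (illustrative only; `expr` is assumed to be an Expr* from the
// kernel being profiled):
//
//   KernelPerformanceProfile profile;
//   profile.registerExpr(expr);
//   if (profile.isProfiled(expr)) {
//     // {cycle slot, count slot} within the Nx2 backing buffer
//     std::array<int, 2> slots = profile.getIndicesInProfileBuffer(expr);
//   }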
class KernelInternalProxy;
//! Container for a lowered Kernel IR
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
friend KernelInternalProxy;
public:
// Kernel starts by grabbing all the nodes from the provided fusion.
// Kernel is not SSA: if a definition is not set, we should update it, but
// not remove a previous definition if it is set. This is primarily because
// when we do something like generate an initialization statement for a
// reduction TV, we may want to continue to do fusion-like analysis on the
// original expression.
// TODO: Assert index type is int or int32
Kernel(Fusion* fusion, DataType index_type = DataType::Int)
: Fusion(*fusion), index_type_(index_type) {}
Kernel() = delete;
// No move or copy semantics
Kernel(const Kernel&) = delete;
Kernel& operator=(const Kernel&) = delete;
//! Finalize a kernel definition
//!
//! At this point we have a complete kernel definition and we can
//! run analysis passes to build a KernelSummary.
void finalize(std::vector<Expr*> top_level_exprs);
const std::vector<Expr*>& topLevelExprs() const {
return top_level_exprs_;
}
const KernelSummary& summary() const {
return summary_;
}
DataType indexType() const {
return index_type_;
}
//! Checks if parallel type is padded
bool isParallelTypePadded(ParallelType ptype) const {
return ptype == ParallelType::TIDx &&
warp_padded_parallel_info_.is_tidx_padded;
}
const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
return warp_padded_parallel_info_;
}
const KernelPerformanceProfile& profile() const {
return profile_;
}
//! Debug dump of the Kernel IR
void print() const;
protected:
//! Register the Val with this fusion
void registerVal(Val* val) override;
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void registerExpr(Expr* expr) override;
private:
// Analyze the kernel IR and caches the summary of interesting data
void analyze();
// Top level statements
std::vector<Expr*> top_level_exprs_;
// Summary of interesting kernel data
KernelSummary summary_;
// Is this kernel being compiled with int32 or int64 indexing. This
// information is required to resolve DataType::Index
DataType index_type_ = DataType::Int;
WarpPaddedParallelInfo warp_padded_parallel_info_;
KernelPerformanceProfile profile_;
};
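// Usage sketch (illustrative only; in practice construction and finalization
// are driven by GpuLower, which supplies the lowered top-level Exprs shown here
// as `lowered_exprs`):
//
//   Kernel kernel(&fusion);                    // copies nodes from `fusion`
//   kernel.finalize(std::move(lowered_exprs)); // runs analysis, builds summary
//   const KernelSummary& summary = kernel.summary();
//   bool needs_cooperative_launch = summary.has_cooperative_grid_reduction;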
//! A special debugging proxy for Kernel.
//!
//! Should not be used for other than testing and debugging.
class TORCH_CUDA_CU_API KernelInternalProxy {
public:
KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}
std::vector<Expr*>& topLevelExprs();
private:
Kernel* kernel_ = nullptr;
};
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

Some files were not shown because too many files have changed in this diff.