Remove cpp/tensorexpr benchmarks (#116868)

Summary: These benchmarks exercise a deprecated TorchScript backend (NNC/tensorexpr) that is no longer built in releases, and they require LLVM in order to build.

Test Plan:
```
python setup.py develop
```

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116868
Approved by: https://github.com/hl475, https://github.com/chenyang78, https://github.com/eellison, https://github.com/mikekgfb
Author: Bert Maher
Date: 2024-01-05 21:23:30 +00:00
Committed by: PyTorch MergeBot
Parent: 99ef47098d
Commit: 521dbbfaff
19 changed files with 0 additions and 3204 deletions


@@ -187,7 +187,6 @@ cmake_dependent_option(
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
option(BUILD_AOT_INDUCTOR_TEST "Build C++ test binaries for aot-inductor" OFF)
option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
option(BUILD_TENSOREXPR_BENCHMARK "Build C++ binaries for tensorexpr benchmarks (need gbenchmark)" OFF)
option(BUILD_MOBILE_BENCHMARK "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
option(BUILD_JNI "Build JNI bindings" OFF)
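The hunk above drops the BUILD_TENSOREXPR_BENCHMARK option, so the benchmark sources deleted below can no longer be built at all. For reference, before this change they would have been enabled roughly as follows (a sketch; it assumes the usual PyTorch pattern of forwarding BUILD_* environment variables to CMake):

```
# Hypothetical pre-removal invocation; this option no longer exists after this commit.
BUILD_TENSOREXPR_BENCHMARK=1 python setup.py develop
```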


@@ -1,2 +0,0 @@
add_executable(convolution_bench convolution.cpp)
target_link_libraries(convolution_bench PRIVATE torch_library benchmark)


@@ -1,313 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <benchmark/benchmark.h>
#include <c10/core/InferenceMode.h>
#include <sstream>
struct ConvParams {
std::vector<int64_t> input;
std::vector<int64_t> weight;
std::vector<int64_t> bias;
std::vector<int64_t> stride;
std::vector<int64_t> padding;
std::vector<int64_t> dilation;
int64_t groups;
};
struct xs {
explicit xs(const std::vector<int64_t>& v_) : v(v_) {}
const std::vector<int64_t>& v;
};
std::ostream& operator<<(std::ostream& os, const xs& x) {
bool first = true;
for (auto const& xx : x.v) {
if (!first) {
os << "x";
}
first = false;
os << xx;
}
return os;
}
std::ostream& operator<<(std::ostream& os, const ConvParams& params) {
os << "I" << xs(params.input) << "_W" << xs(params.weight) << "_B"
<< xs(params.bias) << "_S" << xs(params.stride) << "_P"
<< xs(params.padding) << "_D" << xs(params.dilation) << "_G"
<< params.groups;
return os;
}
std::vector<ConvParams> MobileNetV3Params = {
{{1, 3, 224, 224}, {16, 3, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 16, 112, 112}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 16, 112, 112}, {16, 1, 3, 3}, {16}, {2, 2}, {1, 1}, {1, 1}, 16},
{{1, 16, 56, 56}, {16, 16, 1, 1}, {16}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 16, 56, 56}, {72, 16, 1, 1}, {72}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 72, 56, 56}, {72, 1, 3, 3}, {72}, {2, 2}, {1, 1}, {1, 1}, 72},
{{1, 72, 28, 28}, {24, 72, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 24, 28, 28}, {88, 24, 1, 1}, {88}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 88, 28, 28}, {88, 1, 3, 3}, {88}, {1, 1}, {1, 1}, {1, 1}, 88},
{{1, 88, 28, 28}, {24, 88, 1, 1}, {24}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 24, 28, 28}, {96, 24, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 96, 28, 28}, {96, 1, 5, 5}, {96}, {2, 2}, {2, 2}, {1, 1}, 96},
{{1, 96, 14, 14}, {40, 96, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
{{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 40, 14, 14}, {240, 40, 1, 1}, {240}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 240, 14, 14}, {240, 1, 5, 5}, {240}, {1, 1}, {2, 2}, {1, 1}, 240},
{{1, 240, 14, 14}, {40, 240, 1, 1}, {40}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 40, 14, 14}, {120, 40, 1, 1}, {120}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 120, 14, 14}, {120, 1, 5, 5}, {120}, {1, 1}, {2, 2}, {1, 1}, 120},
{{1, 120, 14, 14}, {48, 120, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 48, 14, 14}, {144, 48, 1, 1}, {144}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 144, 14, 14}, {144, 1, 5, 5}, {144}, {1, 1}, {2, 2}, {1, 1}, 144},
{{1, 144, 14, 14}, {48, 144, 1, 1}, {48}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 48, 14, 14}, {288, 48, 1, 1}, {288}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 288, 14, 14}, {288, 1, 5, 5}, {288}, {2, 2}, {2, 2}, {1, 1}, 288},
{{1, 288, 7, 7}, {96, 288, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
{{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 576, 7, 7}, {576, 1, 5, 5}, {576}, {1, 1}, {2, 2}, {1, 1}, 576},
{{1, 576, 7, 7}, {96, 576, 1, 1}, {96}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 96, 7, 7}, {576, 96, 1, 1}, {576}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 576, 1, 1}, {1280, 576, 1, 1}, {1280}, {1, 1}, {0, 0}, {1, 1}, 1},
};
std::vector<ConvParams> ResNet18Params = {
{{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {128, 64, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {128, 64, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {256, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {256, 128, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {512, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
};
std::vector<ConvParams> ResNet50Params = {
{{1, 3, 224, 224}, {64, 3, 7, 7}, {}, {2, 2}, {3, 3}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 56, 56}, {64, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 64, 56, 56}, {64, 64, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 64, 56, 56}, {256, 64, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 56, 56}, {128, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 128, 56, 56}, {128, 128, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 56, 56}, {512, 256, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 28, 28}, {128, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 128, 28, 28}, {128, 128, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 128, 28, 28}, {512, 128, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 28, 28}, {256, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 28, 28}, {256, 256, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 28, 28}, {1024, 512, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {256, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 256, 14, 14}, {256, 256, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 256, 14, 14}, {1024, 256, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {512, 1024, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 14, 14}, {512, 512, 3, 3}, {}, {2, 2}, {1, 1}, {1, 1}, 1},
{{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 1024, 14, 14}, {2048, 1024, 1, 1}, {}, {2, 2}, {0, 0}, {1, 1}, 1},
{{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 2048, 7, 7}, {512, 2048, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
{{1, 512, 7, 7}, {512, 512, 3, 3}, {}, {1, 1}, {1, 1}, {1, 1}, 1},
{{1, 512, 7, 7}, {2048, 512, 1, 1}, {}, {1, 1}, {0, 0}, {1, 1}, 1},
};
struct EnableMklDnn {
explicit EnableMklDnn(bool enable)
: prev_(at::globalContext().userEnabledMkldnn()) {
at::globalContext().setUserEnabledMkldnn(enable);
}
~EnableMklDnn() {
at::globalContext().setUserEnabledMkldnn(prev_);
}
bool prev_;
};
template <bool WithMklDnn>
static void BM_conv2d_native(
benchmark::State& state,
const ConvParams& params) {
EnableMklDnn mkl(WithMklDnn);
auto input = at::randn(params.input);
auto weight = at::randn(params.weight);
auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
auto output = at::conv2d(
input,
weight,
bias,
params.stride,
params.padding,
params.dilation,
params.groups);
for (auto _ : state) {
output = at::conv2d(
input,
weight,
bias,
params.stride,
params.padding,
params.dilation,
params.groups);
}
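// FLOPs per convolution = 2 * output elements * (weight elements / output channels), i.e. two ops per multiply-accumulate, reported as a rate over all iterations.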
state.counters["GFLOPS/s"] = benchmark::Counter(
2.0f * output.numel() * weight.numel() / weight.size(0) *
state.iterations(),
benchmark::Counter::kIsRate);
state.counters["GB/s"] = benchmark::Counter(
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
benchmark::Counter::kIsRate);
}
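// How much layout conversion is done ahead of timing at::mkldnn_convolution:
// no reorder, reorder the weight only, or reorder both weight and input into MKL-DNN blocked formats.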
enum MklDnnReorder {
None,
WeightOnly,
WeightAndInput,
};
template <MklDnnReorder Reorder>
static void BM_conv2d_mkldnn(
benchmark::State& state,
const ConvParams& params) {
auto input = at::randn(params.input);
auto weight = at::randn(params.weight);
auto bias = params.bias.size() > 0 ? at::randn(params.bias) : at::Tensor{};
if (Reorder == WeightAndInput) {
auto it_input = at::native::itensor_from_mkldnn(input.to_mkldnn());
auto r = ideep::tensor(
params.input, ideep::data_type::f32, ideep::format_tag::aBcd16b);
it_input.reorder_to(r);
input = at::native::new_with_itensor_mkldnn(
std::move(r), at::kFloat, at::Device(at::kCPU));
}
if (Reorder == WeightOnly || Reorder == WeightAndInput) {
weight = at::mkldnn_reorder_conv2d_weight(
weight.to_mkldnn(),
params.padding,
params.stride,
params.dilation,
params.groups);
bias = params.bias.size() > 0 ? bias.to_mkldnn() : bias;
}
auto output = at::mkldnn_convolution(
input,
weight,
bias,
params.padding,
params.stride,
params.dilation,
params.groups);
for (auto _ : state) {
output = at::mkldnn_convolution(
input,
weight,
bias,
params.padding,
params.stride,
params.dilation,
params.groups);
}
state.counters["GFLOPS/s"] = benchmark::Counter(
2.0f * output.numel() * weight.numel() / weight.size(0) *
state.iterations(),
benchmark::Counter::kIsRate);
state.counters["GB/s"] = benchmark::Counter(
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
state.iterations() * (input.nbytes() + weight.nbytes() + output.nbytes()),
benchmark::Counter::kIsRate);
}
std::string name(
const char* base,
const char* suffix,
const ConvParams& params) {
std::ostringstream os;
os << base << "_" << suffix << "_" << params;
return os.str();
}
void registerOne(const char* base, const ConvParams& params) {
benchmark::RegisterBenchmark(
name(base, "native", params).data(), BM_conv2d_native<true>, params);
benchmark::RegisterBenchmark(
name(base, "native_nomkl", params).data(),
BM_conv2d_native<false>,
params);
benchmark::RegisterBenchmark(
name(base, "mkldnn_none", params).data(), BM_conv2d_mkldnn<None>, params);
benchmark::RegisterBenchmark(
name(base, "mkldnn_weight", params).data(),
BM_conv2d_mkldnn<WeightOnly>,
params);
benchmark::RegisterBenchmark(
name(base, "mkldnn_input", params).data(),
BM_conv2d_mkldnn<WeightAndInput>,
params);
}
int main(int argc, char** argv) {
c10::InferenceMode guard;
#define BENCH(x) \
for (auto const& params : x##Params) { \
registerOne(#x, params); \
}
BENCH(MobileNetV3);
BENCH(ResNet18);
BENCH(ResNet50);
#undef BENCH
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
}


@@ -1,24 +0,0 @@
find_package(AVX)
add_executable(
tensorexpr_bench
bench_approx.cpp
bench_batchnorm.cpp
bench_concat.cpp
bench_compile.cpp
bench_signed_log1p.cpp
bench_fuser_overhead.cpp
bench_gemm.cpp
bench_kernels.cpp
bench_parallel.cpp
bench_prefix_sum.cpp
bench_reduce.cpp
main.cpp)
if(C_AVX2_FOUND)
message(STATUS "AVX2 compiler support found")
target_compile_options(tensorexpr_bench PUBLIC -mavx2)
target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
endif()
target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)


@@ -1,433 +0,0 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include "caffe2/operators/logit_op.h"
#include "caffe2/operators/tanh_op.h"
using namespace torch::jit;
using namespace torch::jit::tensorexpr;
void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) {
auto loops = ln->getLoopStmtsFor(target);
ForPtr inner, tail;
ln->splitWithTail(loops[0], width, &inner, &tail);
ln->vectorize(inner);
}
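// Split the loop into 128-element chunks and vectorize each chunk, then split the chunk loop by 8 and fully unroll that inner factor.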
void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
ForPtr inner, tail;
ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
ForPtr outer = loops[0];
ln->vectorize(inner);
ln->splitWithTail(outer, 8, &inner, &tail);
StmtPtr unrolled;
LoopNest::fullUnroll(inner, &unrolled);
}
static void relu_nnc(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
auto clamp = 0;
torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
auto A_elem = [&]() {
auto elem = A.load(i);
auto min = FloatImm::make(clamp);
return CompareSelect::make(elem, min, min, elem, kLT);
}();
return A_elem;
});
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::randn({state.range(0)});
at::Tensor B_t = torch::randn(state.range(0));
auto B_ref = at::relu(A_t);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(B_t, B_ref));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["log/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void log_nnc_sleef(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
torch::jit::tensorexpr::Tensor B =
Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); });
LoopNest ln({B});
ln.prepareForCodegen();
vectorize(&ln, B, 8);
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::log(A_t);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(B_t, B_ref));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["log/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void log_nnc_fast(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
torch::jit::tensorexpr::Tensor B = Compute(
"B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); });
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::log(A_t);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(B_t, B_ref));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["log/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void log_nnc_vml(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
torch::jit::tensorexpr::Tensor B =
Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); });
LoopNest ln({B});
vectorize(&ln, B, 8);
ln.prepareForCodegen();
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::log(A_t);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(B_t, B_ref));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["log/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void log_aten(benchmark::State& state) {
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
for (auto _ : state) {
at::log_out(B_t, A_t);
}
state.counters["log/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void logit_nnc_sleef(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
auto clamp = 1e-6f;
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
auto A_elem = [&]() {
auto elem = A.load(i);
auto min = FloatImm::make(clamp);
auto max = FloatImm::make(1.0f - clamp);
elem = CompareSelect::make(elem, min, min, elem, kLT);
return CompareSelect::make(elem, max, max, elem, kGT);
}();
return log(A_elem / (FloatImm::make(1.0f) - A_elem));
});
LoopNest ln({B});
ln.prepareForCodegen();
optimizePointwise(&ln, B);
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::logit(A_t, clamp);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["logit/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void logit_nnc_fast(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
auto clamp = 1e-6f;
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
auto A_elem = [&]() {
auto elem = A.load(i);
auto min = FloatImm::make(clamp);
auto max = FloatImm::make(1.0f - clamp);
elem = CompareSelect::make(elem, min, min, elem, kLT);
return CompareSelect::make(elem, max, max, elem, kGT);
}();
return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem));
});
LoopNest ln({B});
ln.prepareForCodegen();
optimizePointwise(&ln, B);
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::logit(A_t, clamp);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["logit/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void logit_nnc_vml(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
auto clamp = 1e-6f;
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
auto A_elem = [&]() {
auto elem = A.load(i);
auto min = FloatImm::make(clamp);
auto max = FloatImm::make(1.0f - clamp);
elem = CompareSelect::make(elem, min, min, elem, kLT);
return CompareSelect::make(elem, max, max, elem, kGT);
}();
return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
});
LoopNest ln({B});
ln.prepareForCodegen();
vectorize(&ln, B, 16);
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::logit(A_t, clamp);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["logit/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void logit_aten(benchmark::State& state) {
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto clamp = 1e-6f;
for (auto _ : state) {
at::native::logit_out(A_t, clamp, B_t);
}
state.counters["logit/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
template <typename T>
void logit_caffe2_impl(int size, const T* X, T* Y, float eps_ = 1e-6f) {
using namespace caffe2;
ConstEigenVectorMap<T> X_vec(X, size);
EigenVectorMap<T> Y_vec(Y, size);
Y_vec = X_vec.array().min(static_cast<T>(1.0f - eps_));
Y_vec = Y_vec.array().max(eps_);
Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log();
}
static void logit_caffe2(benchmark::State& state) {
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
at::Tensor B_ref = torch::randn({state.range(0)});
auto N = state.range(0);
auto X = A_t.data_ptr<float>();
auto Y = B_t.data_ptr<float>();
auto clamp = 1e-6f;
at::native::logit_out(A_t, clamp, B_ref);
logit_caffe2_impl(N, X, Y, clamp);
TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
for (auto _ : state) {
logit_caffe2_impl(N, X, Y, clamp);
}
state.counters["logit/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void tanh_nnc_fast(benchmark::State& state) {
auto N = VarHandle("N", kInt);
BufHandle A("A", {N}, kFloat);
torch::jit::tensorexpr::Tensor B = Compute(
"B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); });
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
args.emplace_back(A);
args.emplace_back(N);
LLVMCodeGen cg(s, args);
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
auto B_ref = at::tanh(A_t);
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
TORCH_CHECK(at::allclose(B_t, B_ref, 1e-3f, 1e-6f));
for (auto _ : state) {
cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
}
state.counters["tanh/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void tanh_aten(benchmark::State& state) {
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
for (auto _ : state) {
at::tanh_out(A_t, B_t);
}
state.counters["tanh/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
}
static void tanh_caffe2(benchmark::State& state) {
#ifdef FBCODE_CAFFE2
at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
at::Tensor B_t = torch::randn({state.range(0)});
at::Tensor B_ref = torch::randn({state.range(0)});
auto N = state.range(0);
auto X = A_t.data_ptr<float>();
auto Y = B_t.data_ptr<float>();
caffe2::CPUContext c;
auto tanh = caffe2::TanhFunctor<caffe2::CPUContext>();
at::tanh_out(A_t, B_ref);
tanh(N, X, Y, &c);
TORCH_CHECK(at::native::allclose(B_t, B_ref, 1e-3f, 1e-6f));
for (auto _ : state) {
tanh(N, X, Y, &c);
}
state.counters["tanh/s"] = benchmark::Counter(
uint64_t(state.range(0) * state.iterations()),
benchmark::Counter::kIsRate);
#endif
}
BENCHMARK(relu_nnc)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
{2 << 14});
BENCHMARK(log_nnc_sleef)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(log_nnc_fast)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(log_nnc_vml)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(log_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
{2 << 14});
BENCHMARK(logit_nnc_sleef)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(logit_nnc_fast)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(logit_nnc_vml)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(logit_aten)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(logit_caffe2)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(tanh_nnc_fast)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});
BENCHMARK(tanh_aten)->Args({2 << 5})->Args({2 << 8})->Args({2 << 12})->Args(
{2 << 14});
BENCHMARK(tanh_caffe2)
->Args({2 << 5})
->Args({2 << 8})
->Args({2 << 12})
->Args({2 << 14});


@@ -1,216 +0,0 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
namespace {
class BatchNorm : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
N_ = state.range(0);
C_ = state.range(1);
H_ = state.range(2);
W_ = state.range(3);
input_ = torch::ones({N_, C_, H_, W_});
weight_ = torch::ones({C_});
bias_ = torch::ones({C_});
mean_ = torch::ones({C_}) * 0.5f;
var_ = torch::ones({C_}) * 0.1f;
ref_ = at::batch_norm(
input_,
weight_,
bias_,
mean_,
var_,
training_,
momentum_,
eps_,
cudnn_enabled_);
output_ = at::empty_like(ref_);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(ref_, output_));
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * (input_.nbytes() + ref_.nbytes()),
benchmark::Counter::kIsRate);
}
int N_;
int C_;
int H_;
int W_;
at::Tensor input_;
at::Tensor weight_;
at::Tensor bias_;
at::Tensor mean_;
at::Tensor var_;
at::Tensor output_;
at::Tensor ref_;
bool training_{false};
float momentum_{0.1};
float eps_{1.0e-5f};
bool cudnn_enabled_{false};
};
} // namespace
BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
for (auto _ : state) {
output_ = at::batch_norm(
input_,
weight_,
bias_,
mean_,
var_,
training_,
momentum_,
eps_,
cudnn_enabled_);
}
}
BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
BufHandle input("input", {N_, C_, H_, W_}, kFloat);
BufHandle weight("weight", {C_}, kFloat);
BufHandle bias("bias", {C_}, kFloat);
BufHandle mean("mean", {C_}, kFloat);
BufHandle var("var", {C_}, kFloat);
VarHandle eps("eps", kFloat);
using axis = const VarHandle&;
Tensor output =
Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
// Compute affine terms.
auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
auto weight_v = weight.load(c);
auto bias_v = bias.load(c);
auto alpha = inv_var * weight_v;
auto beta = bias_v - mean.load(c) * alpha;
return input.load(n, c, h, w) * alpha + beta;
});
LoopNest nest({output});
auto loops = nest.getLoopStmtsFor(output);
LoopNest::flatten({loops[2], loops[3]});
loops = nest.getLoopStmtsFor(output);
LoopNest::flatten({loops[0], loops[1]});
loops = nest.getLoopStmtsFor(output);
loops[0]->set_parallel();
nest.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
std::vector<CodeGen::CallArg> args;
for (auto _ : state) {
args.clear();
output_ = at::empty_like(input_);
for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
args.push_back(t.data_ptr<float>());
}
args.push_back(eps_);
cg.call(args);
}
}
BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
for (auto _ : state) {
output_ = at::batch_norm(
input_,
weight_,
bias_,
mean_,
var_,
training_,
momentum_,
eps_,
cudnn_enabled_);
output_.relu_();
}
}
BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
BufHandle input("input", {N_, C_, H_, W_}, kFloat);
BufHandle weight("weight", {C_}, kFloat);
BufHandle bias("bias", {C_}, kFloat);
BufHandle mean("mean", {C_}, kFloat);
BufHandle var("var", {C_}, kFloat);
VarHandle eps("eps", kFloat);
using axis = const VarHandle&;
Tensor output =
Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
// Compute affine terms.
auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
auto weight_v = weight.load(c);
auto bias_v = bias.load(c);
auto alpha = inv_var * weight_v;
auto beta = bias_v - mean.load(c) * alpha;
auto bn = input.load(n, c, h, w) * alpha + beta;
return CompareSelect::make(bn, 0.f, 0.f, bn, kLT);
});
LoopNest nest({output});
nest.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
std::vector<CodeGen::CallArg> args;
for (auto _ : state) {
args.clear();
output_ = at::empty_like(input_);
for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
args.push_back(t.data_ptr<float>());
}
args.push_back(eps_);
cg.call(args);
}
}
BENCHMARK_REGISTER_F(BatchNorm, ATen)
->Args({1, 64, 112, 112})
->Args({1, 256, 14, 14})
->Args({1, 128, 28, 28})
->Args({1, 64, 56, 56})
->Args({1, 512, 7, 7})
->Args({5, 64, 112, 112})
->Args({5, 256, 14, 14})
->Args({5, 128, 28, 28})
->Args({5, 64, 56, 56})
->Args({5, 512, 7, 7});
BENCHMARK_REGISTER_F(BatchNorm, NNC)
->Args({1, 64, 112, 112})
->Args({1, 256, 14, 14})
->Args({1, 128, 28, 28})
->Args({1, 64, 56, 56})
->Args({1, 512, 7, 7})
->Args({5, 64, 112, 112})
->Args({5, 256, 14, 14})
->Args({5, 128, 28, 28})
->Args({5, 64, 56, 56})
->Args({5, 512, 7, 7});
BENCHMARK_REGISTER_F(BatchNorm, ATenRelu)
->Args({1, 64, 112, 112})
->Args({1, 256, 14, 14})
->Args({1, 128, 28, 28})
->Args({1, 64, 56, 56})
->Args({1, 512, 7, 7})
->Args({5, 64, 112, 112})
->Args({5, 256, 14, 14})
->Args({5, 128, 28, 28})
->Args({5, 64, 56, 56})
->Args({5, 512, 7, 7});
BENCHMARK_REGISTER_F(BatchNorm, NNCRelu)
->Args({1, 64, 112, 112})
->Args({1, 256, 14, 14})
->Args({1, 128, 28, 28})
->Args({1, 64, 56, 56})
->Args({1, 512, 7, 7})
->Args({5, 64, 112, 112})
->Args({5, 256, 14, 14})
->Args({5, 128, 28, 28})
->Args({5, 64, 56, 56})
->Args({5, 512, 7, 7});


@@ -1,71 +0,0 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#ifdef TORCH_ENABLE_LLVM
namespace te = torch::jit::tensorexpr;
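// BM_CompileSwish times building and lowering the whole expression plus LLVM compilation on every iteration;
// BM_CompileSwishLLVMOnly prepares the simplified statement once and times only LLVMCodeGen construction.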
static void BM_CompileSwish(benchmark::State& state) {
for (auto _ : state) {
constexpr int N = 512;
te::VarHandle n("n", te::kInt);
te::BufHandle A("A", {N}, te::kFloat);
te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
return te::Max::make(A.load(i), 0.f, false);
});
te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
return te::Min::make(relu.load(i), 6.f, false);
});
te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) {
return min6.load(i) + 3.f;
});
te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
return A.load(i) * plus3.load(i);
});
te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
return times.load(i) * 1.f / 6.f;
});
te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
for (auto tensor : {relu, min6, plus3, times}) {
nest.computeInline(tensor.buf());
}
nest.prepareForCodegen();
te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
te::LLVMCodeGen cg(s, {A, sixth, n});
}
}
static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
constexpr int N = 512;
te::VarHandle n("n", te::kInt);
te::BufHandle A("A", {N}, te::kFloat);
te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
return te::Max::make(A.load(i), 0.f, false);
});
te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
return te::Min::make(relu.load(i), 6.f, false);
});
te::Tensor plus3 = te::Compute(
"plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; });
te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
return A.load(i) * plus3.load(i);
});
te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
return times.load(i) * 1.f / 6.f;
});
te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
for (auto tensor : {relu, min6, plus3, times}) {
nest.computeInline(tensor.buf());
}
nest.prepareForCodegen();
te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
for (auto _ : state) {
te::LLVMCodeGen cg(s, {A, sixth, n});
}
}
BENCHMARK(BM_CompileSwish);
BENCHMARK(BM_CompileSwishLLVMOnly);
#endif // TORCH_ENABLE_LLVM


@@ -1,293 +0,0 @@
#include <benchmark/benchmark.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
namespace {
class ConcatBench : public benchmark::Fixture {
public:
void init(const std::vector<std::vector<int>> input_sizes, int concat_dim) {
input_sizes_ = std::move(input_sizes);
concat_dim_ = concat_dim;
inputs_.resize(input_sizes_.size());
for (const auto i : c10::irange(input_sizes_.size())) {
inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
}
output_size_.resize(input_sizes_.front().size());
for (const auto i : c10::irange(output_size_.size())) {
if (i == static_cast<size_t>(concat_dim_)) {
output_size_[i] = 0;
for (const auto j : c10::irange(input_sizes_.size())) {
output_size_[i] += input_sizes_[j][i];
}
} else {
output_size_[i] = input_sizes_.front()[i];
}
}
ref_ = at::cat(inputs_, concat_dim_);
output_ = at::empty_like(ref_);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(ref_, output_));
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_.nbytes(),
benchmark::Counter::kIsRate);
}
void runATen(benchmark::State& state) {
for (auto _ : state) {
output_ = at::cat(inputs_, concat_dim_);
}
}
void runNNC(benchmark::State& state) {
size_t num_inputs = inputs_.size();
size_t num_dims = 2;
std::vector<BufHandle> inputs;
for (size_t i = 0; i < num_inputs; ++i) {
inputs.emplace_back(BufHandle(
"input" + std::to_string(i),
{input_sizes_[i][0], input_sizes_[i][1]},
kFloat));
}
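// Build the concatenated output as one Compute: a chain of ifThenElse selects picks which input buffer each output column reads from, using cumulative offsets along the concat dimension.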
Tensor output = Compute(
"aten_cat",
{output_size_[0], output_size_[1]},
[&](const VarHandle& m, const VarHandle& n) {
int d = 0;
std::vector<int> cumulative_concat_dim_sizes(num_inputs);
for (const auto i : c10::irange(num_inputs)) {
cumulative_concat_dim_sizes[i] = d;
d += input_sizes_[i][concat_dim_];
}
auto load =
inputs.back().load(m, n - cumulative_concat_dim_sizes.back());
for (size_t i = num_inputs - 1; i > 0; --i) {
load = ifThenElse(
CompareSelect::make(
n, IntImm::make(cumulative_concat_dim_sizes[i]), kLT),
inputs[i - 1].load(m, n - cumulative_concat_dim_sizes[i - 1]),
load);
}
return load;
});
LoopNest nest({output});
nest.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
buf_args.push_back(output);
LLVMCodeGen cg(s, buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
for (const auto& inp : inputs_) {
call_args.push_back(inp.data_ptr<float>());
}
call_args.push_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
void runNNCLoop(benchmark::State& state) {
size_t num_inputs = inputs_.size();
size_t num_dims = 2;
TORCH_INTERNAL_ASSERT(concat_dim_ == 1);
auto output_buf = alloc<Buf>(
alloc<Var>("aten_cat", kHandle),
std::vector<ExprPtr>(
{alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
kFloat);
std::vector<BufHandle> inputs;
std::vector<StmtPtr> for_stmts(num_inputs);
int cumulative_input_sizes = 0;
for (size_t i = 0; i < num_inputs; ++i) {
inputs.emplace_back(BufHandle(
"input" + std::to_string(i),
{input_sizes_[i][0], input_sizes_[i][1]},
kFloat));
std::vector<VarPtr> for_vars(num_inputs);
for (const auto d : c10::irange(num_dims)) {
for_vars[d] =
alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
}
auto store = alloc<Store>(
output_buf,
std::vector<ExprPtr>(
{for_vars[0],
alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
alloc<Load>(
inputs[i].node(),
std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
auto for_st = alloc<For>(
for_vars[0],
alloc<IntImm>(0),
alloc<IntImm>(input_sizes_[i][0]),
alloc<For>(
for_vars[1],
alloc<IntImm>(0),
alloc<IntImm>(input_sizes_[i][1]),
store));
for_stmts[i] = for_st;
cumulative_input_sizes += input_sizes_[i][1];
}
auto output = Tensor(output_buf, alloc<Block>(for_stmts));
LoopNest nest({output});
nest.prepareForCodegen();
nest.vectorizeInnerLoops();
StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
buf_args.push_back(output);
LLVMCodeGen cg(s, buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
for (const auto& inp : inputs_) {
call_args.push_back(inp.data_ptr<float>());
}
call_args.push_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
std::vector<std::vector<int>> input_sizes_;
int concat_dim_;
std::vector<at::Tensor> inputs_;
std::vector<int> output_size_;
at::Tensor output_;
at::Tensor ref_;
};
class Concat2D2Input : public ConcatBench {
public:
void SetUp(const benchmark::State& state) override {
init(
{{state.range(0), state.range(1)}, {state.range(2), state.range(3)}},
state.range(4));
}
};
} // namespace
BENCHMARK_DEFINE_F(Concat2D2Input, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(Concat2D2Input, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_DEFINE_F(Concat2D2Input, NNCLoop)(benchmark::State& state) {
runNNCLoop(state);
}
BENCHMARK_REGISTER_F(Concat2D2Input, ATen)
->Args({1, 160, 1, 14, 1})
->Args({1, 580, 1, 174, 1})
->Args({20, 160, 20, 14, 1})
->Args({20, 580, 20, 174, 1})
->Args({8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D2Input, NNC)
->Args({1, 160, 1, 14, 1})
->Args({1, 580, 1, 174, 1})
->Args({20, 160, 20, 14, 1})
->Args({20, 580, 20, 174, 1})
->Args({8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D2Input, NNCLoop)
->Args({1, 160, 1, 14, 1})
->Args({1, 580, 1, 174, 1})
->Args({20, 160, 20, 14, 1})
->Args({20, 580, 20, 174, 1})
->Args({8, 512, 8, 512, 1});
namespace {
class Concat2D3Input : public ConcatBench {
public:
void SetUp(const benchmark::State& state) override {
init(
{{state.range(0), state.range(1)},
{state.range(2), state.range(3)},
{state.range(4), state.range(5)}},
state.range(6));
}
};
} // namespace
BENCHMARK_DEFINE_F(Concat2D3Input, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(Concat2D3Input, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_DEFINE_F(Concat2D3Input, NNCLoop)(benchmark::State& state) {
runNNCLoop(state);
}
BENCHMARK_REGISTER_F(Concat2D3Input, ATen)->Args({8, 512, 8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D3Input, NNC)->Args({8, 512, 8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D3Input, NNCLoop)
->Args({8, 512, 8, 512, 8, 512, 1});
namespace {
class Concat2D7Input : public ConcatBench {
public:
void SetUp(const benchmark::State& state) override {
init(
{{state.range(0), state.range(1)},
{state.range(2), state.range(3)},
{state.range(4), state.range(5)},
{state.range(6), state.range(7)},
{state.range(8), state.range(9)},
{state.range(10), state.range(11)},
{state.range(12), state.range(13)}},
state.range(14));
}
};
} // namespace
BENCHMARK_DEFINE_F(Concat2D7Input, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(Concat2D7Input, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_DEFINE_F(Concat2D7Input, NNCLoop)(benchmark::State& state) {
runNNCLoop(state);
}
BENCHMARK_REGISTER_F(Concat2D7Input, ATen)
->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D7Input, NNC)
->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
BENCHMARK_REGISTER_F(Concat2D7Input, NNCLoop)
->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});


@@ -1,59 +0,0 @@
#include <benchmark/benchmark.h>
#include <c10/core/InferenceMode.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/codegen/fuser/interface.h>
#include <torch/torch.h>
using namespace torch::jit;
static const std::string two_adds = R"JIT(
def two_adds(self, x: Tensor, y: Tensor, z: Tensor) -> Tensor:
    return x + y + z
)JIT";
static void FusedOverhead(benchmark::State& state) {
c10::InferenceMode mode;
overrideCanFuseOnCPU(true);
Module m("m");
m.define(two_adds);
auto x = torch::ones({1});
auto y = torch::ones({1});
auto z = torch::ones({1});
// Warmup.
for (const auto i : c10::irange(8)) {
(void)i; // Suppress unused variable warning
m.run_method("two_adds", x, y, z);
}
for (auto _ : state) {
m.run_method("two_adds", x, y, z);
}
}
static void UnfusedOverhead(benchmark::State& state) {
c10::InferenceMode guard;
overrideCanFuseOnCPU(false);
Module m("m");
m.define(two_adds);
auto x = torch::ones({1});
auto y = torch::ones({1});
auto z = torch::ones({1});
// Warmup.
for (const auto i : c10::irange(8)) {
(void)i; // Suppress unused variable warning
m.run_method("two_adds", x, y, z);
}
for (auto _ : state) {
m.run_method("two_adds", x, y, z);
}
}
BENCHMARK(FusedOverhead);
BENCHMARK(UnfusedOverhead);


@@ -1,313 +0,0 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
namespace te = torch::jit::tensorexpr;
namespace {
class Gemm : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
M = state.range(0);
N = state.range(1);
K = state.range(2);
A = torch::randn({M, K});
B = torch::randn({K, N});
C = torch::mm(A, B);
}
void TearDown(benchmark::State& state) override {
state.counters["GFLOPS"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * M * N * K,
benchmark::Counter::kIsRate);
}
int M;
int N;
int K;
at::Tensor A;
at::Tensor B;
at::Tensor C;
};
} // namespace
BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) {
for (auto _ : state) {
torch::mm_out(C, A, B);
}
}
BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
te::BufHandle AP("A", {M, K}, te::kFloat);
te::BufHandle BP("B", {K, N}, te::kFloat);
te::Tensor CT = te::Reduce(
"gemm",
{M, N},
te::Sum(),
[&](const te::ExprHandle& m,
const te::ExprHandle& n,
const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
{K});
te::LoopNest loop({CT});
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
for (auto _ : state) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
}
}
BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
te::BufHandle AP("A", {M, K}, te::kFloat);
te::BufHandle BP("B", {K, N}, te::kFloat);
te::Tensor CT = te::Reduce(
"gemm",
{M, N},
te::Sum(),
[&](const te::ExprHandle& m,
const te::ExprHandle& n,
const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
{K});
te::LoopNest loop({CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr m = loops[0];
loop.splitWithMask(m, 32);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr n = loops[2];
loop.splitWithMask(n, 32);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[1];
te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr ni = loops[3];
te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[2];
te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
for (auto _ : state) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
}
}
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
te::BufHandle AP("A", {M, K}, te::kFloat);
te::BufHandle BP("B", {K, N}, te::kFloat);
te::Tensor CT = te::Reduce(
"gemm",
{M, N},
te::Sum(),
[&](const te::ExprHandle& m,
const te::ExprHandle& n,
const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
{K});
te::LoopNest loop({CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[1];
te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr ni = loops[3];
te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[2];
te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
for (auto _ : state) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
}
}
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
te::BufHandle AP("A", {M, K}, te::kFloat);
te::BufHandle BP("B", {K, N}, te::kFloat);
te::Tensor CT = te::Reduce(
"gemm",
{M, N},
te::Sum(),
[&](const te::ExprHandle& m,
const te::ExprHandle& n,
const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
{K});
te::LoopNest loop({CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[1];
te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr ni = loops[3];
te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[2];
te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[3];
te::ForPtr ni = loops[4];
te::StmtPtr unrolled;
loop.vectorize(ni);
loop.fullUnroll(mi, &unrolled);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
for (auto _ : state) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
}
}
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
te::BufHandle AP("A", {M, K}, te::kFloat);
te::BufHandle BP("B", {K, N}, te::kFloat);
te::Tensor CT = te::Reduce(
"gemm",
{M, N},
te::Sum(),
[&](const te::ExprHandle& m,
const te::ExprHandle& n,
const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
{K});
te::LoopNest loop({CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[1];
te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr ni = loops[3];
te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
te::ForPtr mi = loops[2];
te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
for (auto _ : state) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>()});
}
}
BENCHMARK_REGISTER_F(Gemm, Torch)->Args({128, 128, 128});
BENCHMARK_REGISTER_F(Gemm, TensorExprNoopt)->Args({128, 128, 128});
BENCHMARK_REGISTER_F(Gemm, TensorExprTile32x32)->Args({128, 128, 128});
BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16)->Args({128, 128, 128});
BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16VecUnroll)->Args({128, 128, 128});
BENCHMARK_REGISTER_F(Gemm, TensorExprTile4x16Cache)->Args({128, 128, 128});


@@ -1,101 +0,0 @@
#include <benchmark/benchmark.h>
#include <ATen/code_template.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/tensorexpr/kernel.h>
using namespace torch::jit;
using namespace torch::jit::tensorexpr;
static const std::string kernel_static_shapes_template = R"IR(
graph(%0 : Float(${dim}, strides=[1], device=cpu),
%1 : Float(${dim}, strides=[1], device=cpu)):
%2 : Float(${dim}, strides=[1]) = aten::mul(%0, %1)
%4 : Float(${dim}, strides=[1]) = aten::mul(%0, %2)
return (%4))IR";
static const std::string kernel_symbolic_shapes = R"IR(
graph(%0 : Float(SS(-2), strides=[1], device=cpu),
%1 : Float(SS(-2), strides=[1], device=cpu),
%SS_2 : int):
%2 : Float(SS(-2), strides=[1]) = aten::mul(%0, %1)
%4 : Float(SS(-2), strides=[1]) = aten::mul(%0, %2)
return (%4))IR";
class KernelBench : public benchmark::Fixture {
public:
void Eager(benchmark::State& state) {
auto dim = state.range(0);
auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
for (auto _ : state) {
auto o = at::mul(a, at::mul(a, b));
}
}
void GraphWithStaticShapes(benchmark::State& state) {
auto dim = state.range(0);
auto graph = std::make_shared<Graph>();
at::jit::TemplateEnv env;
env.d("dim", dim);
const auto kernel_static_shapes =
format(kernel_static_shapes_template, env);
parseIR(kernel_static_shapes, &*graph);
TensorExprKernel k(graph);
auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
std::vector<at::Tensor> inputs = {a, b};
for (auto _ : state) {
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
k.run(stack);
}
}
void GraphWithSymbolicShapes(benchmark::State& state) {
auto dim = state.range(0);
auto graph = std::make_shared<Graph>();
parseIR(kernel_symbolic_shapes, &*graph);
std::vector<torch::jit::StrideInput> input_desc = {
torch::jit::StrideInput::TENSOR_CONT};
std::unordered_map<
const torch::jit::Value*,
std::vector<torch::jit::StrideInput>>
symbolic_strides;
symbolic_strides[graph->inputs().at(0)] = input_desc;
symbolic_strides[graph->inputs().at(1)] = input_desc;
symbolic_strides[graph->outputs().at(0)] = input_desc;
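// The -2 entry names the symbolic dimension: it corresponds to SS(-2) and the %SS_2 input in the IR above.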
std::vector<int64_t> symbolic_shape_inputs = {-2};
TensorExprKernel k(
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
auto a = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto b = at::rand({dim}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
std::vector<at::Tensor> inputs = {a, b};
for (auto _ : state) {
std::vector<IValue> stack = at::fmap<at::IValue>(inputs);
stack.push_back(dim);
k.run(stack);
}
}
};
BENCHMARK_DEFINE_F(KernelBench, Eager)(benchmark::State& state) {
Eager(state);
}
BENCHMARK_DEFINE_F(KernelBench, StaticShapes)(benchmark::State& state) {
GraphWithStaticShapes(state);
}
BENCHMARK_DEFINE_F(KernelBench, SymbolicShapes)(benchmark::State& state) {
GraphWithSymbolicShapes(state);
}
BENCHMARK_REGISTER_F(KernelBench, Eager)->Range(32, 2048);
BENCHMARK_REGISTER_F(KernelBench, StaticShapes)->Range(32, 2048);
BENCHMARK_REGISTER_F(KernelBench, SymbolicShapes)->Range(32, 2048);


@@ -1,113 +0,0 @@
import timeit
import torch
import torch.nn.functional as F
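# Benchmark setup: enable the NNC CPU fuser, disable fusion-group inlining, and run single-threaded.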
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._debug_set_fusion_group_inlining(False)
torch.set_num_threads(1)
def hardswish(x):
    return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0
unary_ops = [
hardswish,
torch._C._nn.hardswish,
torch.sigmoid,
torch.reciprocal,
torch.neg,
torch.relu,
torch.isnan,
torch.log,
torch.log10,
torch.log1p,
torch.log2,
torch.exp,
torch.expm1,
torch.erf,
torch.erfc,
torch.cos,
torch.sin,
torch.tan,
torch.acos,
torch.asin,
torch.cosh,
torch.sinh,
torch.atan,
torch.tanh,
torch.sqrt,
torch.rsqrt,
torch.abs,
torch.ceil,
torch.floor,
torch.round,
torch.trunc,
torch.lgamma,
]
print(f"{'op':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
for op in unary_ops:
x = torch.rand((1024, 1024))
traced = torch.jit.trace(op, (x))
# Warmup.
warmup_iters = 8
for _ in range(warmup_iters):
op(x)
traced(x)
# Validate result.
torch.testing.assert_close(op(x), traced(x))
# Benchmark.
bench_iters = 100
teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
def test_batch_norm():
op = F.batch_norm
print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
batch_norm_shapes = [
[1, 64, 112, 112],
[1, 256, 14, 14],
[1, 128, 28, 28],
[1, 64, 56, 56],
[1, 512, 7, 7],
[5, 64, 112, 112],
[5, 256, 14, 14],
[5, 128, 28, 28],
[5, 64, 56, 56],
[5, 512, 7, 7],
]
for n, c, h, w in batch_norm_shapes:
x = torch.rand((n, c, h, w))
y = torch.rand(c)
z = torch.rand(c)
traced = torch.jit.trace(op, (x, y, z))
# Warmup.
warmup_iters = 8
for _ in range(warmup_iters):
op(x, y, z)
traced(x, y, z)
# Validate result.
torch.testing.assert_close(op(x, y, z), traced(x, y, z))
# Benchmark.
bench_iters = 100
teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters)
tjit = timeit.timeit(
stmt="traced(x, y, z)", globals=locals(), number=bench_iters
)
print(
f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}"
)
test_batch_norm()

View File

@ -1,71 +0,0 @@
#include <benchmark/benchmark.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <immintrin.h>
namespace torch {
namespace jit {
namespace tensorexpr {
class ParallelAdd : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
at::set_num_threads(4);
torch::manual_seed(0x12345678);
M = state.range(0);
A = torch::randn({M});
B = torch::randn({M});
C = torch::zeros({M});
}
void TearDown(benchmark::State& state) override {
state.counters["tasks"] = benchmark::Counter(
uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate);
}
int M;
at::Tensor A;
at::Tensor B;
at::Tensor C;
};
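// The Simple benchmark marks the lone loop as parallel before lowering, so the
// LLVM-compiled kernel distributes the M element-wise additions across the worker
// threads configured in SetUp (at::set_num_threads(4)).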
BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
BufHandle a_buf("a", {M}, kFloat);
BufHandle b_buf("b", {M}, kFloat);
Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) {
return a_buf.load(m) + b_buf.load(m);
});
LoopNest loop_nest({c_tensor});
auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
ForPtr m = loops[0];
m->set_parallel();
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});
float* a_ptr = A.data_ptr<float>();
float* b_ptr = B.data_ptr<float>();
float* c_ptr = C.data_ptr<float>();
std::vector<void*> args({c_ptr, a_ptr, b_ptr});
cg.value<int>(args);
for (const auto i : c10::irange(M)) {
float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
TORCH_CHECK(diff < 1e-5);
}
for (auto _ : state) {
cg.value<int>(args);
}
}
BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});
} // namespace tensorexpr
} // namespace jit
} // namespace torch

View File

@ -1,395 +0,0 @@
#include <benchmark/benchmark.h>
#include "ATen/Functions.h"
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/operators/operators.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <immintrin.h>
using namespace torch::jit::tensorexpr;
namespace {
#ifdef __AVX2__
#define _mm256_slli_si1(x) \
_mm256_blend_epi32( \
_mm256_permutevar8x32_ps(x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)), \
_mm256_setzero_si256(), \
1)
#define _mm256_slli_si2(x) \
_mm256_blend_epi32( \
_mm256_permutevar8x32_ps(x, _mm256_set_epi32(5, 4, 3, 2, 1, 0, 7, 6)), \
_mm256_setzero_si256(), \
3)
#define _mm256_slli_si4(x) \
_mm256_blend_epi32( \
_mm256_permutevar8x32_ps(x, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4)), \
_mm256_setzero_si256(), \
15)
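// The macros above shift a 256-bit vector left by 1, 2, or 4 32-bit lanes,
// zero-filling the vacated lanes. Adding the shifted copies in log2(8) = 3 steps
// turns a vector of 8 values into its in-register inclusive scan (Hillis-Steele),
// which PrefixSum/PrefixSumInt below rely on.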
__m256 PrefixSum(__m256 x) {
x = _mm256_add_ps(x, _mm256_slli_si1(x));
x = _mm256_add_ps(x, _mm256_slli_si2(x));
x = _mm256_add_ps(x, _mm256_slli_si4(x));
return x; // local prefix sums
}
__m256i PrefixSumInt(__m256i x) {
x = _mm256_add_epi32(x, _mm256_slli_si1(x));
x = _mm256_add_epi32(x, _mm256_slli_si2(x));
x = _mm256_add_epi32(x, _mm256_slli_si4(x));
return x; // local prefix sums
}
// Util function to log the given value. Not used during benchmarking.
template <class T>
inline void Log(const __m256i& value) {
const size_t n = sizeof(__m256i) / sizeof(T);
T buffer[n];
_mm256_storeu_si256((__m256i*)buffer, value);
for (int i = 0; i < n; i++)
std::cout << buffer[n - i - 1] << " ";
std::cout << std::endl;
}
#endif
#ifdef __AVX512F__
#define _mm512_slli_si512(x, k) \
_mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k)
__m512 PrefixSum(__m512 x) {
x = _mm512_add_ps(x, _mm512_slli_si512(x, 1));
x = _mm512_add_ps(x, _mm512_slli_si512(x, 2));
x = _mm512_add_ps(x, _mm512_slli_si512(x, 4));
x = _mm512_add_ps(x, _mm512_slli_si512(x, 8));
return x; // local prefix sums
}
__m512i PrefixSumInt(__m512i x) {
x = _mm512_add_epi32(x, _mm512_slli_si512(x, 1));
x = _mm512_add_epi32(x, _mm512_slli_si512(x, 2));
x = _mm512_add_epi32(x, _mm512_slli_si512(x, 4));
x = _mm512_add_epi32(x, _mm512_slli_si512(x, 8));
return x; // local prefix sums
}
template <int index>
float _mm512_extract_f32(__m512 target) {
return _mm512_cvtss_f32(_mm512_alignr_epi32(target, target, index));
}
// extract the last i32 from target
int _mm512_extract_epi32(__m512i target) {
__m256i x = _mm512_extracti32x8_epi32(target, 1);
return _mm256_extract_epi32(x, 7);
}
void PrefixSum(float* output_data, float* input_data, size_t input_size) {
float carry = 0.0f;
for (int i = 0; i < input_size / 16; i++) {
__m512 x = _mm512_loadu_ps(input_data + i * 16);
x = PrefixSum(x);
x = _mm512_add_ps(x, _mm512_set1_ps(carry));
carry = _mm512_extract_f32<15>(x);
_mm512_storeu_ps((__m512*)(output_data + i * 16), x);
}
}
void PrefixSum(int* output_data, int* input_data, size_t input_size) {
int carry = 0;
for (int i = 0; i < input_size / 16; i++) {
__m512i x = _mm512_loadu_epi32(input_data + i * 16);
x = PrefixSumInt(x);
x = _mm512_add_epi32(x, _mm512_set1_epi32(carry));
carry = _mm512_extract_epi32(x);
_mm512_storeu_epi32((__m512i*)(output_data + i * 16), x);
}
}
#endif
// PrefixSum: an inclusive scan (cumulative sum) over the input.
class PrefixSumBench : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
input_size_ = state.range(0);
input_ = torch::rand(input_size_);
ref_ = prefixSum(input_);
    // Avoid type promotion: at::cumsum promotes int to long by default, so request kInt explicitly.
input_int_ = torch::randint(1000, {input_size_}, at::kInt);
ref_int_ = at::cumsum(input_int_, 0, at::kInt);
}
void TearDown(benchmark::State& state) override {
if (output_.numel() > 0) {
if (output_.numel() == ref_.numel()) {
TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
}
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_.nbytes(),
benchmark::Counter::kIsRate);
} else {
if (output_int_.numel() == ref_int_.numel()) {
TORCH_CHECK(ref_int_.equal(output_int_));
}
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_int_.nbytes(),
benchmark::Counter::kIsRate);
}
}
at::Tensor prefixSum(const at::Tensor& inp) {
return at::cumsum(inp, 0);
}
void runATen(benchmark::State& state) {
output_ = prefixSum(input_);
for (auto _ : state) {
at::cumsum_out(output_, input_, 0);
}
}
void runLocal(benchmark::State& state) {
output_ = at::empty_like(ref_);
for (auto _ : state) {
auto input_data = input_.data_ptr<float>();
auto output_data = output_.data_ptr<float>();
float sum = 0.0f;
for (int i = 0; i < input_size_; ++i) {
sum = sum + input_data[i];
output_data[i] = sum;
}
}
}
// no type promotion
void runLocalInt(benchmark::State& state) {
output_int_ = at::empty_like(input_int_);
for (auto _ : state) {
auto input_data = input_int_.data_ptr<int>();
auto output_data = output_int_.data_ptr<int>();
int sum = 0;
for (int i = 0; i < input_size_; ++i) {
sum = sum + input_data[i];
output_data[i] = sum;
}
}
}
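  // runNNC expresses the same sequential scan directly in NNC IR: a one-element
  // buffer `s` holds the running sum, every iteration accumulates input[i] into s
  // and stores it to output[i], and the resulting loop is compiled with
  // LLVMCodeGen.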
void runNNC(benchmark::State& state) {
BufHandle input("input", {input_size_}, kFloat);
BufHandle output("output", {input_size_}, kFloat);
BufHandle s("s", {1}, kFloat);
VarHandle i("i", kInt);
auto allocS = Allocate::make(s);
auto initS = Store::make(s, {0}, 0.0f);
auto accumS = Store::make(
s, {0}, Add::make(Load::make(s, {0}), Load::make(input, {i})));
auto store = Store::make(output, {i}, Load::make(s, {0}));
auto forI = For::make(i, 0, input_size_, Block::make({accumS, store}));
auto freeS = Free::make(s);
auto par = Block::make({allocS, initS, forI, freeS});
LoopNest nest(par, {output.node()});
std::vector<CodeGen::BufferArg> buf_args;
buf_args.emplace_back(input);
buf_args.emplace_back(output);
LLVMCodeGen cg(nest.root_stmt(), buf_args);
std::vector<CodeGen::CallArg> call_args;
output_ = at::empty_like(ref_);
for (auto _ : state) {
call_args.clear();
call_args.emplace_back(input_.data_ptr<float>());
call_args.emplace_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
#ifdef __AVX2__
void runLocalAVX2(benchmark::State& state) {
output_ = at::empty_like(ref_);
for (auto _ : state) {
float* input_data = input_.data_ptr<float>();
float* output_data = output_.data_ptr<float>();
float carry = 0.0f;
for (int i = 0; i < input_size_ / 8; i++) {
__m256 x = _mm256_loadu_ps(input_data + i * 8);
x = PrefixSum(x);
x = _mm256_add_ps(x, _mm256_set1_ps(carry));
(reinterpret_cast<__m256*>(output_data))[i] = x;
carry = _mm256_cvtss_f32(_mm256_permutevar8x32_ps(
x, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)));
}
}
}
void runLocalIntAVX2(benchmark::State& state) {
output_int_ = at::empty_like(input_int_);
for (auto _ : state) {
auto input_data = input_int_.data_ptr<int>();
auto output_data = output_int_.data_ptr<int>();
int carry = 0;
for (size_t i = 0; i < input_size_ / 8; i++) {
__m256i x = _mm256_loadu_si256((__m256i*)(input_data + i * 8));
x = PrefixSumInt(x);
x = _mm256_add_epi32(x, _mm256_set1_epi32(carry));
_mm256_storeu_si256((__m256i*)(output_data + i * 8), x);
carry = _mm256_extract_epi32(x, 7);
}
}
}
#endif
#ifdef __AVX512F__
void runLocalAVX512(benchmark::State& state) {
output_ = at::empty_like(ref_);
for (auto _ : state) {
auto input_data = input_.data_ptr<float>();
auto output_data = output_.data_ptr<float>();
PrefixSum(output_data, input_data, input_size_);
}
}
void runLocalIntAVX512(benchmark::State& state) {
output_int_ = at::empty_like(input_int_);
for (auto _ : state) {
auto input_data = input_int_.data_ptr<int>();
auto output_data = output_int_.data_ptr<int>();
PrefixSum(output_data, input_data, input_size_);
}
}
void runExclusiveScanAVX512(benchmark::State& state) {
output_ = at::empty({input_size_ + 1}, at::kFloat);
for (auto _ : state) {
auto input_data = input_.data_ptr<float>();
auto output_data = output_.data_ptr<float>();
output_data[0] = 0.0f;
PrefixSum(output_data + 1, input_data, input_size_);
}
}
void runExclusiveScanIntAVX512(benchmark::State& state) {
output_int_ = at::empty({input_size_ + 1}, at::kInt);
for (auto _ : state) {
auto input_data = input_int_.data_ptr<int>();
auto output_data = output_int_.data_ptr<int>();
output_data[0] = 0;
PrefixSum(output_data + 1, input_data, input_size_);
}
}
#endif
private:
int input_size_;
at::Tensor input_;
at::Tensor output_;
at::Tensor ref_;
at::Tensor input_int_;
at::Tensor output_int_;
at::Tensor ref_int_; // no type promotion
};
} // namespace
BENCHMARK_DEFINE_F(PrefixSumBench, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, Local)(benchmark::State& state) {
runLocal(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, LocalInt)(benchmark::State& state) {
runLocalInt(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, NNC)(benchmark::State& state) {
runNNC(state);
}
#ifdef __AVX2__
BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX2)(benchmark::State& state) {
runLocalAVX2(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX2)(benchmark::State& state) {
runLocalIntAVX2(state);
}
#endif
#ifdef __AVX512F__
BENCHMARK_DEFINE_F(PrefixSumBench, LocalAVX512)(benchmark::State& state) {
runLocalAVX512(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, LocalIntAVX512)(benchmark::State& state) {
runLocalIntAVX512(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanAVX512)
(benchmark::State& state) {
runExclusiveScanAVX512(state);
}
BENCHMARK_DEFINE_F(PrefixSumBench, ExclusiveScanIntAVX512)
(benchmark::State& state) {
runExclusiveScanIntAVX512(state);
}
#endif
//---------- float benchmarks ----------//
BENCHMARK_REGISTER_F(PrefixSumBench, ATen)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
BENCHMARK_REGISTER_F(PrefixSumBench, NNC)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
BENCHMARK_REGISTER_F(PrefixSumBench, Local)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#ifdef __AVX2__
BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX2)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#endif
#ifdef __AVX512F__
BENCHMARK_REGISTER_F(PrefixSumBench, LocalAVX512)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanAVX512)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#endif
//---------- int benchmarks ----------//
BENCHMARK_REGISTER_F(PrefixSumBench, LocalInt)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#ifdef __AVX2__
BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX2)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#endif
#ifdef __AVX512F__
BENCHMARK_REGISTER_F(PrefixSumBench, LocalIntAVX512)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
BENCHMARK_REGISTER_F(PrefixSumBench, ExclusiveScanIntAVX512)
->RangeMultiplier(4)
->Ranges({{1 << 6, 1 << 20}});
#endif

View File

@ -1,621 +0,0 @@
#include <benchmark/benchmark.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/operators/operators.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <immintrin.h>
namespace te = torch::jit::tensorexpr;
namespace {
class Reduce1D : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
at::set_num_threads(1);
torch::manual_seed(0x12345678);
M = state.range(0);
A = torch::randn({M});
B = torch::zeros({});
ref = torch::sum(A, {0});
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-7));
state.counters["BYTES"] = benchmark::Counter(
uint64_t(state.iterations()) * M * sizeof(float),
benchmark::Counter::kIsRate);
}
int M;
at::Tensor A;
at::Tensor B;
at::Tensor ref;
};
} // namespace
BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
for (auto _ : state) {
B = torch::sum(A, {0});
}
}
BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))
template <typename Func>
void ValidateFunc(
Func func,
const std::string& func_name,
at::Tensor& A,
at::Tensor& B) {
func(A, B);
float* pB = B.data_ptr<float>();
at::Tensor B2 = torch::sum(A, {0});
float* pB2 = B2.data_ptr<float>();
int size = A.numel();
float size_sqrt = std::sqrt(size);
float natural_noise = size_sqrt * 1e-7;
if (!torch::allclose(B, B2, natural_noise)) {
std::ostringstream oss;
oss << func_name << " failed check: " << std::endl;
oss << "value: " << B << std::endl;
;
oss << "reference: " << B2 << std::endl;
oss << "threshold: " << natural_noise << std::endl;
throw std::runtime_error(oss.str());
}
}
static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
float* pA = A.data_ptr<float>();
float* pB = B.data_ptr<float>();
int size = A.numel();
TORCH_CHECK(B.numel() == 1);
*pB = 0.;
for (const auto i : c10::irange(size)) {
*pB += pA[i];
}
}
BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
VALIDATE(reduce1d_naive, A, B);
for (auto _ : state) {
reduce1d_naive(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
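// The rfactor variant keeps kChunkSize independent partial sums so the additions
// in the hot loop do not form a single serial dependency chain; the partials are
// folded into the final scalar only after the main loop.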
static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
float* pA = A.data_ptr<float>();
float* pB = B.data_ptr<float>();
int size = A.numel();
constexpr int kChunkSize = 16;
TORCH_CHECK(B.numel() == 1);
TORCH_CHECK(size % kChunkSize == 0);
*pB = 0.;
float temp[kChunkSize];
for (const auto j : c10::irange(kChunkSize)) {
temp[j] = 0;
}
int chunk_count = size / kChunkSize;
for (const auto i : c10::irange(chunk_count)) {
for (const auto j : c10::irange(kChunkSize)) {
temp[j] += pA[i * kChunkSize + j];
}
}
for (const auto j : c10::irange(kChunkSize)) {
*pB += temp[j];
}
}
BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
VALIDATE(reduce1d_native_rfactor, A, B);
for (auto _ : state) {
reduce1d_native_rfactor(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});
#ifdef USE_AVX2
// x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
inline float sum_f32x8(__m256 x) {
// hiQuad = ( x7, x6, x5, x4 )
const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
// loQuad = ( x3, x2, x1, x0 )
const __m128 loQuad = _mm256_castps256_ps128(x);
// sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
// loDual = ( -, -, x1 + x5, x0 + x4 )
const __m128 loDual = sumQuad;
// hiDual = ( -, -, x3 + x7, x2 + x6 )
const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
// sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
const __m128 sumDual = _mm_add_ps(loDual, hiDual);
// lo = ( -, -, -, x0 + x2 + x4 + x6 )
const __m128 lo = sumDual;
// hi = ( -, -, -, x1 + x3 + x5 + x7 )
const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
// sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
const __m128 sum = _mm_add_ss(lo, hi);
return _mm_cvtss_f32(sum);
}
static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
float* pA = A.data_ptr<float>();
float* pB = B.data_ptr<float>();
int size = A.numel();
constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
TORCH_CHECK(B.numel() == 1);
TORCH_CHECK(size % kChunkSize == 0);
*pB = 0.;
__m256 temp;
temp = _mm256_setzero_ps();
int tile_count = size / kChunkSize;
for (const auto i : c10::irange(tile_count)) {
__m256 data = _mm256_load_ps(pA + i * kChunkSize);
temp = _mm256_add_ps(temp, data);
}
float result = sum_f32x8(temp);
*pB = result;
}
BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) {
VALIDATE(reduce1d_native_vector, A, B);
for (auto _ : state) {
reduce1d_native_vector(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24});
static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
static constexpr int kTileSize = 4;
float* pA = A.data_ptr<float>();
float* pB = B.data_ptr<float>();
int size = A.numel();
constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1");
  TORCH_CHECK(
      size % kChunkSize == 0,
      "Invalid size: ",
      size,
      " % ",
      kChunkSize,
      " != 0");
__m256 t[kTileSize];
for (const auto j : c10::irange(kTileSize)) {
t[j] = _mm256_setzero_ps();
}
int tile_count = size / kChunkSize / kTileSize;
for (int i = 0; i < tile_count; i++) {
#pragma unroll
for (int j = 0; j < kTileSize; j++) {
float* p = pA + (i * kTileSize + j) * kChunkSize;
__m256 data = _mm256_loadu_ps(p);
t[j] = _mm256_add_ps(t[j], data);
}
}
float result = sum_f32x8(t[0]);
for (const auto j : c10::irange(1, kTileSize)) {
result += sum_f32x8(t[j]);
}
*pB = result;
}
BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) {
VALIDATE(reduce1d_native_tiled, A, B);
for (auto _ : state) {
reduce1d_native_tiled(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});
#endif // USE_AVX2
BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
int M = A.numel();
te::BufHandle AP("A", {M}, te::kFloat);
te::Tensor BT = te::Reduce(
"reduce_full",
{1},
te::Sum(),
[&](const te::ExprHandle& n, const te::ExprHandle& m) {
return AP.load(m);
},
{M});
te::LoopNest loop({BT});
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
auto func = [&](at::Tensor& A, at::Tensor& B) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
};
ValidateFunc(func, "reduce1d_te_naive", A, B);
for (auto _ : state) {
func(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});
BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
int M = A.numel();
te::BufHandle AP("A", {M}, te::kFloat);
te::Tensor BT = te::Reduce(
"reduce_full",
{1},
te::Sum(),
[&](const te::ExprHandle& n, const te::ExprHandle& m) {
return AP.load(m);
},
{M});
te::LoopNest loop({BT});
const int kChunkSize = 8;
{
auto const& loops = loop.getLoopStmtsFor(BT);
te::ForPtr m = loops[1];
loop.splitWithTail(m, kChunkSize);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
auto func = [&](at::Tensor& A, at::Tensor& B) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
};
ValidateFunc(func, "reduce1d_te_naive", A, B);
for (auto _ : state) {
func(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
int M = A.numel();
te::BufHandle AP("A", {M}, te::kFloat);
te::Tensor BT = te::Reduce(
"reduce_full",
{1},
te::Sum(),
[&](const te::ExprHandle& n, const te::ExprHandle& m) {
return AP.load(m);
},
{M});
te::LoopNest loop({BT});
const int kChunkSize = 8;
{
auto const& loops = loop.getLoopStmtsFor(BT);
te::ForPtr m = loops[1];
loop.splitWithMask(m, kChunkSize);
}
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
auto func = [&](at::Tensor& A, at::Tensor& B) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
};
ValidateFunc(func, "reduce1d_te_naive", A, B);
for (auto _ : state) {
func(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});
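// TeRfactorV1 rebuilds the hand-written rfactor schedule with NNC transforms:
// split the reduction axis by kChunkSize, reorder so the lane axis can be
// rfactored into a per-lane partial-sum buffer, then vectorize the innermost
// loops that write that buffer.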
BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
int M = A.numel();
const int kChunkSize = 8;
TORCH_CHECK(M % kChunkSize == 0);
te::BufHandle AP("A", {M}, te::kFloat);
te::Tensor BT = te::Reduce(
"reduce_full",
{},
te::Sum(),
[&](const te::ExprHandle& m) { return AP.load(m); },
{M});
te::LoopNest loop({BT});
te::BufPtr rfac_buf;
auto loops = loop.getLoopStmtsFor(BT);
TORCH_CHECK(loops.size() == 1);
te::ForPtr mi;
loop.splitWithMask(loops.at(0), kChunkSize, &mi);
te::ForPtr mo = loops.at(0);
loop.reorderAxis(mo, mi);
loops = loop.getLoopStmtsFor(BT);
auto bt_body = loop.getAllWritesToBuf(BT.buf())[1];
TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
loop.reorderAxis(loops.at(0), loops.at(1));
loops = loop.getAllInnermostLoopsWritingToBuf(rfac_buf);
TORCH_CHECK(loops.size() == 2);
loop.vectorize(loops.at(1));
loop.prepareForCodegen();
te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
auto func = [&](at::Tensor& A, at::Tensor& B) {
cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
};
ValidateFunc(func, "reduce1d_te_naive", A, B);
for (auto _ : state) {
func(A, B);
}
}
BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});
BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
const int M = A.numel();
const int kChunkSize = 8;
te::BufHandle a("A", {M}, te::kFloat);
te::Tensor b = te::computeSum(
{a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU);
te::LoopNest nest({b});
auto loops = nest.getLoopStmtsFor(b);
te::ForPtr mi, mo;
te::BufPtr rf;
nest.splitWithMask(loops[0], kChunkSize, &mi);
loops = nest.reorder({loops[0], mi}, {1, 0});
nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf);
nest.reorderAxis(loops[0], loops[1]);
for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
nest.vectorize(loop);
}
nest.prepareForCodegen();
nest.simplify();
te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
for (auto _ : state) {
cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
}
}
BENCHMARK_REGISTER_F(Reduce1D, Op)->Args({1 << 24});
class Reduce2DCol : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
at::set_num_threads(1);
torch::manual_seed(0x12345678);
M = state.range(0);
N = state.range(1);
A = torch::randn({M, N});
ref = torch::sum(A, {0});
B = torch::zeros_like(ref);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-5));
state.counters["BYTES"] = benchmark::Counter(
uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
benchmark::Counter::kIsRate);
}
int M;
int N;
at::Tensor A;
at::Tensor B;
at::Tensor ref;
};
BENCHMARK_DEFINE_F(Reduce2DCol, Torch)(benchmark::State& state) {
for (auto _ : state) {
B = torch::sum(A, {0});
}
}
BENCHMARK_REGISTER_F(Reduce2DCol, Torch)
->Args({1 << 3, 1 << 21})
->Args({1 << 6, 1 << 18})
->Args({1 << 12, 1 << 12});
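// OpSchedule runs the same column reduction under four loop schedules selected by
// state.range(2): 0 keeps computeSum's default nest, 1 reorders the two loops,
// 2 splits the outer loop with a cache-sized tail and reorders the inner pair,
// and 3 splits the second loop by 8 before reordering the outer pair.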
BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
constexpr int kCacheSize = 1 << 12;
te::BufHandle a("A", {M, N}, te::kFloat);
te::Tensor b = te::computeSum(
{a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU);
te::LoopNest nest({b});
auto sch = state.range(2);
if (sch == 0) {
} else if (sch == 1) {
auto loops = nest.getLoopStmtsFor(b);
nest.reorderAxis(loops[0], loops[1]);
} else if (sch == 2) {
auto loops = nest.getLoopStmtsFor(b);
nest.splitWithTail(loops[0], kCacheSize);
loops = nest.getLoopStmtsFor(b);
nest.reorderAxis(loops[1], loops[2]);
} else if (sch == 3) {
auto loops = nest.getLoopStmtsFor(b);
nest.splitWithTail(loops[1], 8);
loops = nest.getLoopStmtsFor(b);
nest.reorderAxis(loops[0], loops[1]);
}
nest.prepareForCodegen();
nest.simplify();
te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
for (auto _ : state) {
cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
}
}
BENCHMARK_REGISTER_F(Reduce2DCol, OpSchedule)
    ->Apply(
[](benchmark::internal::Benchmark* b) {
for (auto sch : {0, 1, 2, 3}) {
for (auto rows : {3, 6, 12}) {
auto cols = 24 - rows;
b->Args({1 << rows, 1 << cols, sch});
}
}
});
class Reduce2DRow : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
at::set_num_threads(1);
torch::manual_seed(0x12345678);
M = state.range(0);
N = state.range(1);
A = torch::randn({M, N});
ref = torch::sum(A, {1});
B = torch::zeros_like(ref);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(B, ref, std::sqrt(A.numel()) * 1e-4));
state.counters["BYTES"] = benchmark::Counter(
uint64_t(state.iterations()) * (A.nbytes() + B.nbytes()),
benchmark::Counter::kIsRate);
}
int M;
int N;
at::Tensor A;
at::Tensor B;
at::Tensor ref;
};
BENCHMARK_DEFINE_F(Reduce2DRow, Torch)(benchmark::State& state) {
for (auto _ : state) {
B = torch::sum(A, {1});
}
}
BENCHMARK_REGISTER_F(Reduce2DRow, Torch)
->Args({1 << 3, 1 << 21})
->Args({1 << 6, 1 << 18})
->Args({1 << 12, 1 << 12})
->Args({1 << 18, 1 << 6});
BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) {
auto a = A.data_ptr<float>();
auto b = B.data_ptr<float>();
constexpr int Mb = 4;
constexpr int Nb = 4;
auto fn = [&] {
for (int m_outer = 0; m_outer < M; m_outer += Mb) {
float bregs[Mb][Nb] = {0.0f};
for (int n_outer = 0; n_outer < N; n_outer += Nb) {
for (const auto m_inner : c10::irange(Mb)) {
for (const auto n_inner : c10::irange(Nb)) {
bregs[m_inner][n_inner] +=
a[(m_outer + m_inner) * N + n_outer + n_inner];
}
}
}
for (const auto m_inner : c10::irange(Mb)) {
b[m_outer + m_inner] = 0.f;
for (const auto n_inner : c10::irange(Nb)) {
b[m_outer + m_inner] += bregs[m_inner][n_inner];
}
}
}
};
for (auto _ : state) {
fn();
}
}
BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6});
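// Reduce2DRow::OpSchedule compares row-reduction schedules: 0 is the default
// lowering, 1 rfactors each row's reduction and vectorizes the rfactor buffer,
// 2 tiles both loops (8 and 4) and reorders, and 3 is schedule 1 plus
// compressBuffer on the rfactor buffer before vectorizing.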
BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
constexpr int kChunkSize = 8;
te::BufHandle a("A", {M, N}, te::kFloat);
te::Tensor b = te::computeSum(
{a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU);
te::LoopNest nest({b});
auto sch = state.range(2);
if (sch == 1) {
auto loops = nest.getLoopStmtsFor(b);
te::ForPtr mi, mo;
te::BufPtr rf;
nest.splitWithMask(loops[1], kChunkSize, &mi);
loops = nest.reorder({loops[1], mi}, {1, 0});
TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
nest.reorderAxis(loops[0], loops[1]);
for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
nest.vectorize(loop);
}
} else if (sch == 2) {
auto loops = nest.getLoopStmtsFor(b);
nest.splitWithMask(loops[1], 8);
nest.splitWithMask(loops[0], 4);
loops = nest.getLoopStmtsFor(b);
nest.reorderAxis(loops[1], loops[2]);
} else if (sch == 3) {
auto loops = nest.getLoopStmtsFor(b);
te::ForPtr mi, mo;
te::BufPtr rf;
nest.splitWithMask(loops[1], kChunkSize, &mi);
loops = nest.reorder({loops[1], mi}, {1, 0});
TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
nest.reorderAxis(loops[0], loops[1]);
te::LoopNest::compressBuffer(rf, nest.root_stmt());
for (auto const& loop : nest.getAllInnermostLoopsWritingToBuf(rf)) {
nest.vectorize(loop);
}
}
nest.prepareForCodegen();
nest.simplify();
te::LLVMCodeGen cg(nest.root_stmt(), {a, b});
for (auto _ : state) {
cg.call({A.data_ptr<float>(), B.data_ptr<float>()});
}
}
BENCHMARK_REGISTER_F(Reduce2DRow, OpSchedule)
    ->Apply(
[](benchmark::internal::Benchmark* b) {
for (auto sch : {0, 1, 2, 3}) {
for (auto rows : {3, 6, 12, 18}) {
auto cols = 24 - rows;
b->Args({1 << rows, 1 << cols, sch});
}
}
});

View File

@ -1,166 +0,0 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/operators/operators.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
namespace {
class SignedLog1pBench : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
input_size_ = {state.range(0), state.range(1)};
input_size_int_ = {state.range(0), state.range(1)};
input_ = torch::rand(input_size_);
ref_ = signedLog1p(input_);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_.nbytes(),
benchmark::Counter::kIsRate);
}
at::Tensor signedLog1p(const at::Tensor& inp) {
auto sign = at::sign(inp);
auto log1p = at::log1p(at::abs(inp));
return sign * log1p;
}
void runATen(benchmark::State& state) {
for (auto _ : state) {
output_ = signedLog1p(input_);
}
}
void runNNC(benchmark::State& state) {
BufHandle input_ph(
"input", {input_size_int_[0], input_size_int_[1]}, kFloat);
Tensor abs_result = Compute(
"aten_abs",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return abs(input_ph.load(m, n));
});
Tensor log1p_result = Compute(
"aten_log1p",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return log1p(abs_result.load(m, n));
});
Tensor sign_result =
computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
Tensor output = Compute(
"aten_mul",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return sign_result.load(m, n) * log1p_result.load(m, n);
});
LoopNest nest({output}, {abs_result, log1p_result, sign_result, output});
GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
nest.inlineIntermediateBufs(true);
nest.prepareForCodegen();
nest.simplify();
nest.vectorizeInnerLoops();
nest.simplify();
GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
// StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args;
buf_args.emplace_back(input_ph);
buf_args.emplace_back(output);
LLVMCodeGen cg(nest.root_stmt(), buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
call_args.emplace_back(input_.data_ptr<float>());
call_args.emplace_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
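  // runNNCLogVml is the same pipeline except that log1p(|x|) is computed as
  // log_vml(|x| + 1), swapping the log1p call for NNC's log_vml intrinsic
  // (intended to hit a vectorized log implementation).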
void runNNCLogVml(benchmark::State& state) {
BufHandle input_ph(
"input", {input_size_int_[0], input_size_int_[1]}, kFloat);
Tensor abs_result = Compute(
"aten_abs",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return abs(input_ph.load(m, n));
});
Tensor log_vml_result = Compute(
"aten_log1p",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return log_vml(abs_result.load(m, n) + ExprHandle(1));
});
Tensor sign_result =
computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
Tensor output = Compute(
"aten_mul",
{input_size_int_[0], input_size_int_[1]},
[&](const VarHandle& m, const VarHandle& n) {
return sign_result.load(m, n) * log_vml_result.load(m, n);
});
LoopNest nest({output}, {abs_result, log_vml_result, sign_result, output});
GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
nest.inlineIntermediateBufs(true);
nest.prepareForCodegen();
nest.simplify();
nest.vectorizeInnerLoops();
nest.simplify();
GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
// StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args;
buf_args.emplace_back(input_ph);
buf_args.emplace_back(output);
LLVMCodeGen cg(nest.root_stmt(), buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
call_args.emplace_back(input_.data_ptr<float>());
call_args.emplace_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
private:
std::vector<long> input_size_;
std::vector<int> input_size_int_;
at::Tensor input_;
at::Tensor output_;
at::Tensor ref_;
};
} // namespace
BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_DEFINE_F(SignedLog1pBench, NNCLogVml)(benchmark::State& state) {
runNNCLogVml(state);
}
BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
BENCHMARK_REGISTER_F(SignedLog1pBench, NNCLogVml)->Args({10, 1467});

View File

@ -1,3 +0,0 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@ -1660,14 +1660,6 @@ if(BUILD_STATIC_RUNTIME_BENCHMARK)
target_link_libraries(static_runtime_test torch_library gtest_main)
endif()
if(BUILD_TENSOREXPR_BENCHMARK)
add_subdirectory(${TORCH_ROOT}/benchmarks/cpp/tensorexpr ${CMAKE_BINARY_DIR}/tensorexpr_bench)
endif()
if(BUILD_CPP_BENCHMARKS)
add_subdirectory(${TORCH_ROOT}/benchmarks/cpp ${PROJECT_BINARY_DIR}/bin)
endif()
if(BUILD_MOBILE_BENCHMARK)
foreach(benchmark_src ${ATen_MOBILE_BENCHMARK_SRCS})
get_filename_component(benchmark_name ${benchmark_src} NAME_WE)

View File

@ -26,7 +26,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}")
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}") message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
if(${CAFFE2_LINK_LOCAL_PROTOBUF}) if(${CAFFE2_LINK_LOCAL_PROTOBUF})