/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <cstring>
#include <random>
#include <sstream>
#include <unordered_map>

#include "dnnl_debug.hpp"
#include "utils/dnnl_query.hpp"
#include "utils/fill.hpp"
#include "utils/numeric.hpp"
#include "utils/parallel.hpp"

fill_cfg_t::fill_cfg_t(dnnl_data_type_t dt, float range_min_val,
        float range_max_val, bool only_integer, attr_t::post_ops_t::kind_t alg,
        const std::string &name)
    : dt_(dt)
    , range_min_val_(MAX2(lowest_dt(dt_), range_min_val))
    , range_max_val_(MIN2(max_dt(dt_), range_max_val))
    , only_integer_(is_integral_dt(dt_) || only_integer)
    , name_(name) {
    if (alg == attr_t::post_ops_t::kind_t::SUB) {
        // Apply range inversion if `alg` is `sub`. This helps to keep the
        // output data positive if it was intended to be positive. In other
        // cases, act as for the binary `add` algorithm. If `attr` is
        // unavailable in the code, use `attr_t::post_ops_t::kind_t::ADD` as
        // a default value.
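        // E.g., a requested range of [1, 8] is inverted to [-8, -1], so
        // `dst = src - po_src1` stays positive when `src` is positive.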
        float sub_range_min_val_ = -range_min_val_;
        float sub_range_max_val_ = -range_max_val_;
        range_min_val_ = MIN2(sub_range_min_val_, sub_range_max_val_);
        range_max_val_ = MAX2(sub_range_min_val_, sub_range_max_val_);
    } else if (alg == attr_t::post_ops_t::kind_t::MUL) {
        // Reduce the range for multiplication to decrease the computational
        // error magnitude, which can lead to rounding to a different output
        // value for low-precision data types.
        // TODO: replace with using specific values instead.
        range_min_val_ /= 8.f;
        range_max_val_ /= 8.f;
    }
}

fill_cfg_t::fill_cfg_t(
        const std::vector<float> &user_set, const std::string &name)
    : dt_(dnnl_data_type_undef)
    , range_min_val_(-FLT_MAX)
    , range_max_val_(FLT_MAX)
    , predefined_set_(user_set)
    , only_integer_(false)
    , name_(name) {
    assert(!predefined_set_.empty());
}

std::string fill_cfg_t::print_verbose() const {
    std::stringstream ss;

    ss << "[FILL_CFG]";
    if (!name_.empty()) ss << " name:\'" << name_ << "\';";

    // Predefined set is mutually exclusive with a range setting.
    if (!predefined_set_.empty()) {
        ss << " set:[";
        for (const auto &e : predefined_set_) {
            ss << e << ";";
        }
        ss << "]";
    } else {
        ss << " dt:" << dt_;
        ss << " range:[" << range_min_val_ << ";" << range_max_val_ << "]";
        if (only_integer_) ss << " only_integer:true";
    }

    return ss.str();
}
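// Illustrative verbose line produced above (hypothetical values):
//   [FILL_CFG] name:'binary_po'; dt:f32 range:[-16;16]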

const fill_cfg_t &get_default_fill_cfg() {
    static const fill_cfg_t fill_cfg;
    return fill_cfg;
}

const fill_cfg_t &get_perf_fill_cfg(dnnl_data_type_t dt) {
    assert(has_bench_mode_bit(mode_bit_t::perf));

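    // The per-type ranges below appear to narrow with the data type
    // precision so that generated values stay exactly representable in the
    // target type.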
#define CASE(dt, low_end, high_end) \
    case dt: { \
        static const fill_cfg_t fill_cfg(dt, MAX2((low_end), lowest_dt(dt)), \
                MIN2((high_end), max_dt(dt)), /* only_int = */ false, \
                attr_t::post_ops_t::kind_t::ADD, "perf_mode_fill"); \
        return fill_cfg; \
    }

    switch (dt) {
        CASE(dnnl_f4_e2m1, -2.f, 2.f);
        CASE(dnnl_f4_e3m0, -2.f, 2.f);
        CASE(dnnl_e8m0, -2.f, 2.f);
        CASE(dnnl_f8_e5m2, -2.f, 2.f);
        CASE(dnnl_f8_e4m3, -2.f, 2.f);
        CASE(dnnl_bf16, -32.f, 32.f);
        CASE(dnnl_f16, -8.f, 8.f);
        CASE(dnnl_f32, -1024.f, 1024.f);
        CASE(dnnl_f64, -1024.f, 1024.f);
        CASE(dnnl_s32, -1024.f, 1024.f);
        CASE(dnnl_s8, -32, 32);
        CASE(dnnl_u8, 0, 64);
        CASE(dnnl_s4, -8, 7);
        CASE(dnnl_u4, 0, 15);
        default: {
            assert(!"bad data_type");
            SAFE_V(FAIL);
            static const fill_cfg_t dummy;
            return dummy;
        }
    }
#undef CASE
}

int fill_scales(
        const attr_t &attr, int arg, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp) {
    const auto &e = attr.scales.get(arg);
    return fill_scales(e, mem_dt, mem_fp);
}

int fill_scales(const attr_t::arg_scales_t::entry_t &e, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto nelems = mem_fp.nelems();
    if (nelems == 0) return OK;

    if (mem_dt) { assert(mem_dt.nelems() == mem_fp.nelems()); }

    if (e.policy == policy_t::COMMON || e.policy == policy_t::HOST_SCALAR) {
        assert(nelems == 1);
        mem_fp.set_f32_elem(0, e.scale);
        if (mem_dt) mem_dt.set_elem(0, e.scale);
    } else {
        /* Do fixed partitioning to have the same filling for any number of threads */
        static constexpr int64_t chunk_size = 64;
        const int64_t n_chunks = div_up(nelems, chunk_size);
        benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
            int64_t idx_start = idx_chunk * chunk_size;
            int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
            // Note: we use a different seed for each chunk to avoid
            // repeating patterns. We could use discard(idx_start) too, but
            // it has O(idx_start) complexity. We also add 1 to avoid
            // seeding with 0.
            std::minstd_rand int_seed(idx_start + 1);
            int_seed.discard(1);

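            // Draw an exponent in [-2, 2] and use the corresponding power of
            // two (0.25 ... 4) as the scale value; multiplying by a power of
            // two is exact in binary floating point, which helps keep
            // reference computations exact.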
            std::uniform_int_distribution<> gen(-2, 2);

            for (int64_t idx = idx_start; idx < idx_end; ++idx) {
                int pow2 = gen(int_seed);
                int pow2_shift = 1 << std::abs(pow2);
                const float gen_val
                        = pow2 < 0 ? (1.f / pow2_shift) : pow2_shift;
                const float val = gen_val;
                mem_fp.set_f32_elem(idx, val);
                if (mem_dt) mem_dt.set_elem(idx, val);
            }
        });
    }

    return OK;
}

int fill_zero_points(
        const attr_t &attr, int arg, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp) {
    const auto nelems = mem_fp.nelems();
    if (nelems == 0) return OK;

    assert(mem_dt.nelems() == mem_fp.nelems());

    const auto &e = attr.zero_points.get(arg);
    if (e.policy == policy_t::COMMON || e.policy == policy_t::HOST_SCALAR) {
        assert(nelems == 1);
        mem_fp.set_f32_elem(0, e.value);
        if (mem_dt) mem_dt.set_elem(0, e.value);
    } else {
        /* Do fixed partitioning to have the same filling for any number of threads */
        static constexpr int64_t chunk_size = 64;
        const int64_t n_chunks = div_up(nelems, chunk_size);
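        // Clamp the lower bound to the lowest value of the target data type,
        // e.g., unsigned types get a [0, 2] range instead of [-2, 2].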
        const int min_val = MAX2(-2, static_cast<int>(lowest_dt(mem_dt.dt())));
        benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
            int64_t idx_start = idx_chunk * chunk_size;
            int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
            // Note: we use a different seed for each chunk to avoid
            // repeating patterns. We could use discard(idx_start) too, but
            // it has O(idx_start) complexity. We also add 1 to avoid
            // seeding with 0.
            std::minstd_rand int_seed(idx_start + 1);
            int_seed.discard(1);

            std::uniform_int_distribution<> gen(min_val, 2);

            for (int64_t idx = idx_start; idx < idx_end; ++idx) {
                const float zp_val = gen(int_seed);
                mem_fp.set_f32_elem(idx, zp_val);
                if (mem_dt) mem_dt.set_elem(idx, zp_val);
            }
        });
    }

    return OK;
}

int fill_random_real_dense(dnn_mem_t &mem, dnn_mem_t &mem_ref, res_t *res,
        const fill_cfg_t &fill_cfg) {
    auto nelems = mem_ref.nelems();
    if (nelems == 0) return OK;

    BENCHDNN_PRINT(6, "%s\n", fill_cfg.print_verbose().c_str());

    // This function doesn't handle the predefined set yet.
    assert(fill_cfg.predefined_set_.empty());

    // The `nelems()` function returns a product of dims/pdims regardless of
    // whether the tensor is dense or sparse (this is by design). Because of
    // that, we need to adjust the `nelems` value for a sparse tensor, as the
    // number of elements to fill is equal to `nnz`.
    if (mem_ref.format_kind() == dnnl_format_kind_sparse)
        nelems = query_md_nnz(mem_ref.md_);

    // Note: fill_cfg_t drives the value distribution, but the final rounding
    // complies with the memory object the values are inserted into. Depending
    // on the case, it may or may not be beneficial to force the same data
    // type for filling and for the final memory object.
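    // Rounding values to `round_dt` up front means the `mem.reorder(mem_ref)`
    // call at the end of the function should not alter them again, keeping
    // `mem` and `mem_ref` in sync.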
    const dnnl_data_type_t round_dt = mem ? mem.dt() : mem_ref.dt();

    /* Do fixed partitioning to have the same filling for any number of threads */
    static constexpr int64_t chunk_size = 64;
    const int64_t n_chunks = div_up(nelems, chunk_size);

    benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
        int64_t idx_start = idx_chunk * chunk_size;
        int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
        // Note: we use a different seed for each chunk to avoid
        // repeating patterns. We could use discard(idx_start) too, but
        // it has O(idx_start) complexity. We also add 1 to avoid
        // seeding with 0.
        std::minstd_rand int_seed(nelems + idx_start + 1);
        int_seed.discard(1);

        std::uniform_real_distribution<> gen_real(
                fill_cfg.range_min_val_, fill_cfg.range_max_val_);
        std::uniform_int_distribution<> gen_int(
                fill_cfg.range_min_val_, fill_cfg.range_max_val_);

        const auto get_val = [&]() {
            return fill_cfg.only_integer_
                    ? static_cast<float>(gen_int(int_seed))
                    : gen_real(int_seed);
        };

        if (mem_ref.dt() == dnnl_f32) {
            for (int64_t idx = idx_start; idx < idx_end; ++idx) {
                float val = get_val();
                mem_ref.set_f32_elem(
                        idx, round_to_nearest_representable(round_dt, val));
            }
        } else {
            // There are some rare scenarios when mem_ref is not f32.
            for (int64_t idx = idx_start; idx < idx_end; ++idx) {
                float val = get_val();
                mem_ref.set_elem(
                        idx, round_to_nearest_representable(round_dt, val));
            }
        }
    });

    // Note: the `only_integer_` option is tricky. While it helps to avoid
    // triggering the cancellation effect, it doesn't allow validating loads
    // properly due to the values used. To ensure the library works properly,
    // add a fractional part of 0.5f for all floating-point types regardless
    // of the setting, and for all integral types use values not available in
    // other data types to trigger a potential overflow.
    if (fill_cfg.only_integer_) {
        const auto adjust_val = [&](float orig_val) {
            if (!is_integral_dt(round_dt)) {
                // Catch faulty integer loads instead of an fp type.
                return orig_val + 0.5f >= fill_cfg.range_max_val_
                        ? orig_val - 0.5f
                        : orig_val + 0.5f;
            } else if (round_dt == dnnl_s8) {
                ; // Using an s8 val of -128 leads to the binary_mul alg
                // magnifying the diff (at least for eltwise) when it used to
                // fit. Need a general solution for the problem.
            } else if (round_dt == dnnl_u8) {
                return 128.f; // catch faulty s8 loads instead of u8.
            } else if (round_dt == dnnl_s32) {
                return 256.f; // catch faulty int8 loads instead of s32.
            } else if (round_dt == dnnl_u4) {
                return 15.f;
            } else if (round_dt == dnnl_s4) {
                return -8.f;
            } else {
                assert(!"unexpected data type");
            }
            return orig_val;
        };

        // There are some rare scenarios when mem_ref is not f32. Since it's a
        // single element per tensor, a regular interface can be called.
        const float elem_first_val = adjust_val(mem_ref.get_elem(0));
        mem_ref.set_elem(
                0, round_to_nearest_representable(round_dt, elem_first_val));
    }

    if (mem) {
        // TODO: move `res` inside reorder.
        auto status = mem.reorder(mem_ref);
        if (status != OK) {
            if (res) res->state = FAILED;
            return status;
        }
    }

    return OK;
}

// Since a sparsity pattern affects performance, it's crucial to keep the
// pattern intact and only randomize tensor values. Thus, the function relies
// on the assumption that every sparse format contains three handles, where
// the second and the third are responsible for the sparsity pattern and are
// **already filled**.
int fill_random_real_sparse(const_dnnl_memory_t dnnl_memory, dnn_mem_t &mem,
        dnn_mem_t &mem_ref, res_t *res, const fill_cfg_t &fill_cfg) {
    auto orig_cc_mem_md = query_md(dnnl_memory);
    const int nhandles = query_md_num_handles(orig_cc_mem_md);
    assert(nhandles == 3);
    // Copy the content of the metadata buffers exactly. Let the data handle
    // go further to be filled by the dense routine.
    for (int idx = 1; idx < nhandles; idx++) {
        void *dst_ptr = mem_ref.get_mapped_pointer<void>(idx);
        void *src_ptr = nullptr;
        dnnl_memory_get_data_handle_v2(dnnl_memory, &src_ptr, idx);

        const size_t size = dnnl_memory_desc_get_size_v2(orig_cc_mem_md, idx);
        std::memcpy(dst_ptr, src_ptr, size);
    }

    return fill_random_real_dense(mem, mem_ref, res, fill_cfg);
}

int fill_random_real(dnn_mem_t &mem, dnn_mem_t &mem_ref, res_t *res,
        const fill_cfg_t &fill_cfg, const_dnnl_memory_t dnnl_memory) {
    if (mem_ref.format_kind() == dnnl_format_kind_sparse) {
        assert(dnnl_memory != nullptr);
        return fill_random_real_sparse(
                dnnl_memory, mem, mem_ref, res, fill_cfg);
    }
    return fill_random_real_dense(mem, mem_ref, res, fill_cfg);
}

int fill_random_real(dnn_mem_t &mem_ref, const fill_cfg_t &fill_cfg,
        const_dnnl_memory_t dnnl_memory) {
    dnn_mem_t dummy;
    return fill_random_real(dummy, mem_ref, nullptr, fill_cfg, dnnl_memory);
}
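// Illustrative call site (hypothetical names): fill the library memory and
// its f32 reference counterpart using the perf-mode config for the library
// memory's data type:
//   fill_random_real(mem_dt, mem_fp, res, get_perf_fill_cfg(mem_dt.dt()));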

std::string execarg2str(int exec_arg) {
    using arg_map_t = std::unordered_map<int, std::string>;
#define ARG(id) \
    { id, #id }
    static const arg_map_t ids = {
            ARG(DNNL_ARG_UNDEF),
            ARG(DNNL_ARG_SRC_0),
            ARG(DNNL_ARG_SRC_1),
            ARG(DNNL_ARG_SRC_2),
            ARG(DNNL_ARG_SRC_3),
            ARG(DNNL_ARG_DST_0),
            ARG(DNNL_ARG_DST_1),
            ARG(DNNL_ARG_DST_2),
            ARG(DNNL_ARG_WEIGHTS_0),
            ARG(DNNL_ARG_WEIGHTS_1),
            ARG(DNNL_ARG_WEIGHTS_2),
            ARG(DNNL_ARG_WEIGHTS_3),
            ARG(DNNL_ARG_BIAS),
            ARG(DNNL_ARG_REDUCE),
            ARG(DNNL_ARG_MEAN),
            ARG(DNNL_ARG_VARIANCE),
            ARG(DNNL_ARG_SCALE),
            ARG(DNNL_ARG_SHIFT),
            ARG(DNNL_ARG_WORKSPACE),
            ARG(DNNL_ARG_SCRATCHPAD),
            ARG(DNNL_ARG_DIFF_SRC_0),
            ARG(DNNL_ARG_DIFF_SRC_1),
            ARG(DNNL_ARG_DIFF_SRC_2),
            ARG(DNNL_ARG_DIFF_SRC_3),
            ARG(DNNL_ARG_DIFF_DST_0),
            ARG(DNNL_ARG_DIFF_DST_1),
            ARG(DNNL_ARG_DIFF_DST_2),
            ARG(DNNL_ARG_DIFF_WEIGHTS_0),
            ARG(DNNL_ARG_DIFF_WEIGHTS_1),
            ARG(DNNL_ARG_DIFF_WEIGHTS_2),
            ARG(DNNL_ARG_DIFF_WEIGHTS_3),
            ARG(DNNL_ARG_DIFF_BIAS),
            ARG(DNNL_ARG_DIFF_SCALE),
            ARG(DNNL_ARG_DIFF_SHIFT),
            ARG(DNNL_ARG_ATTR_ROUNDING_SEED),
            ARG(DNNL_ARG_ATTR_DROPOUT_MASK),
            ARG(DNNL_ARG_ATTR_DROPOUT_PROBABILITY),
            ARG(DNNL_ARG_ATTR_DROPOUT_SEED),
    };
    static const arg_map_t flags = {
            ARG(DNNL_ARG_ATTR_PRECOMPUTED_REDUCTIONS),
            ARG(DNNL_ARG_ATTR_SCALES),
            ARG(DNNL_ARG_ATTR_ZERO_POINTS),
            ARG(DNNL_ARG_ATTR_POST_OP_DW),
    };
    static const arg_map_t post_ops = {
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(0)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(1)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(2)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(3)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(4)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(5)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(6)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(7)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(8)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(9)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(10)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(11)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(12)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(13)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(14)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(15)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(16)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(17)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(18)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(19)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(20)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(21)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(22)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(23)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(24)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(25)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(26)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(27)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(28)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(29)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(30)),
            ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(31)),
    };
#undef ARG
#define SPACE " | "
    auto search = [](const arg_map_t &map, int i) {
        auto it = map.find(i);
        if (it != map.end()) return SPACE + it->second;
        if (!i) return std::string();
        std::ostringstream oss;
        oss << std::hex << i;
        return SPACE "0x" + oss.str();
    };
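    // Decode in three steps: first the post-op slot bits (at and above
    // DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE), then the single-bit attribute
    // flags, and finally the remaining base argument. For instance, an
    // argument of (DNNL_ARG_ATTR_MULTIPLE_POST_OP(1) | DNNL_ARG_ATTR_SCALES
    // | DNNL_ARG_SRC_1) is expected to print all three names joined by " | ".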
    std::string retn = search(
            post_ops, exec_arg & ~(DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE - 1));
    exec_arg &= DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE - 1;
    for (auto &flag : flags) {
        assert(((flag.first - 1) & flag.first) == 0);
        if (exec_arg & flag.first) {
            retn += SPACE + flag.second;
            exec_arg &= ~flag.first;
        }
    }
    if (exec_arg >= DNNL_ARG_MULTIPLE_DST) {
        retn += SPACE "DNNL_ARG_MULTIPLE_DST+"
                + std::to_string(exec_arg - DNNL_ARG_MULTIPLE_DST);
    } else if (exec_arg >= DNNL_ARG_MULTIPLE_SRC) {
        retn += SPACE "DNNL_ARG_MULTIPLE_SRC+"
                + std::to_string(exec_arg - DNNL_ARG_MULTIPLE_SRC);
    } else {
        retn += search(ids, exec_arg); // including DNNL_ARG_UNDEF
    }
    return retn.erase(0, sizeof(SPACE) - 1);
#undef SPACE
}

std::string buffer_prefix;

bool fill_from_file(int exec_arg, dnn_mem_t &mem, dnn_mem_t &ref_mem) {
    static const char format[] = "File %s %s; buffer not imported.\n";
    auto prefix = buffer_prefix;
    if (prefix.empty()) return false;

    // `mem` is not supposed to be filled.
    if (mem.nelems() == 0) return false;

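    // The expected file name is "<buffer_prefix>.<exec_arg>.bin".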
prefix += "." + std::to_string(exec_arg) + ".bin";
|
|
|
|
FILE *file = nullptr;
|
|
#ifdef _WIN32
|
|
if (fopen_s(&file, prefix.c_str(), "rb")) file = nullptr;
|
|
#else
|
|
file = fopen(prefix.c_str(), "rb");
|
|
#endif
|
|
prefix = "'" + prefix + "' [" + execarg2str(exec_arg) + "]";
|
|
if (!file) {
|
|
BENCHDNN_PRINT(2, format, prefix.c_str(), "not found");
|
|
return false;
|
|
}
|
|
fseek(file, 0, SEEK_END);
|
|
size_t total = 0, size = ftell(file);
|
|
if (mem.size() != size) {
|
|
fclose(file);
|
|
BENCHDNN_PRINT(0, format, prefix.c_str(),
|
|
"differs in size from the buffer's memory descriptor");
|
|
BENCHDNN_PRINT(0, "Actual size: %zu\nNeeded size: %zu (%zu x %s)\n",
|
|
size, mem.size(), mem.size() / mem.sizeof_dt(),
|
|
dt2str(mem.dt()));
|
|
SAFE_V(FAIL);
|
|
return false;
|
|
}
|
|
fseek(file, 0, SEEK_SET);
|
|
for (size_t read = ~0; read && (total < size); total += read)
|
|
read = fread(
|
|
static_cast<uint8_t *>(mem) + total, 1, size - total, file);
|
|
fclose(file);
|
|
if (total != size) {
|
|
BENCHDNN_PRINT(0, format, prefix.c_str(), "cannot be read correctly");
|
|
SAFE_V(FAIL);
|
|
return false;
|
|
}
|
|
if (ref_mem && (ref_mem.reorder(mem) != OK)) {
|
|
BENCHDNN_PRINT(0, format, prefix.c_str(), "cannot be reordered");
|
|
SAFE_V(FAIL);
|
|
return false;
|
|
}
|
|
BENCHDNN_PRINT(2, "File %s successfully processed; buffer imported.\n",
|
|
prefix.c_str());
|
|
return true;
|
|
}
|