// oneDNN/tests/benchdnn/utils/fill.cpp
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include <random>
#include <sstream>
#include <unordered_map>
#include "dnnl_debug.hpp"
#include "utils/dnnl_query.hpp"
#include "utils/fill.hpp"
#include "utils/numeric.hpp"
#include "utils/parallel.hpp"
fill_cfg_t::fill_cfg_t(dnnl_data_type_t dt, float range_min_val,
float range_max_val, bool only_integer, attr_t::post_ops_t::kind_t alg,
const std::string &name)
: dt_(dt)
, range_min_val_(MAX2(lowest_dt(dt_), range_min_val))
, range_max_val_(MIN2(max_dt(dt_), range_max_val))
, only_integer_(is_integral_dt(dt_) || only_integer)
, name_(name) {
if (alg == attr_t::post_ops_t::kind_t::SUB) {
// Apply range inversion if `alg` is `sub`. This helps to keep output
// data positive if it was intended to be positive. In all other cases,
// act as for the binary `add` algorithm. If `attr` is unavailable in
// the code, use `attr_t::post_ops_t::kind_t::ADD` as a default value.
float sub_range_min_val_ = -range_min_val_;
float sub_range_max_val_ = -range_max_val_;
range_min_val_ = MIN2(sub_range_min_val_, sub_range_max_val_);
range_max_val_ = MAX2(sub_range_min_val_, sub_range_max_val_);
} else if (alg == attr_t::post_ops_t::kind_t::MUL) {
// Reduce the range for multiplication to decrease the computational
// error magnitude, which can otherwise lead to rounding to a different
// output value for low-precision data types.
// TODO: replace with using specific values instead.
range_min_val_ /= 8.f;
range_max_val_ /= 8.f;
}
}
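
// Constructs a fill configuration from a user-provided set of values; the
// range members are left at their defaults. Note that
// `fill_random_real_dense()` below does not handle the predefined set yet.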
fill_cfg_t::fill_cfg_t(
const std::vector<float> &user_set, const std::string &name)
: dt_(dnnl_data_type_undef)
, range_min_val_(-FLT_MAX)
, range_max_val_(FLT_MAX)
, predefined_set_(user_set)
, only_integer_(false)
, name_(name) {
assert(!predefined_set_.empty());
}
std::string fill_cfg_t::print_verbose() const {
std::stringstream ss;
ss << "[FILL_CFG]";
if (!name_.empty()) ss << " name:\'" << name_ << "\';";
// A predefined set is mutually exclusive with a range setting.
if (!predefined_set_.empty()) {
ss << " set:[";
for (const auto &e : predefined_set_) {
ss << e << ";";
}
ss << "]";
} else {
ss << " dt:" << dt_;
ss << " range:[" << range_min_val_ << ";" << range_max_val_ << "]";
if (only_integer_) ss << " only_integer:true";
}
return ss.str();
}
const fill_cfg_t &get_default_fill_cfg() {
static const fill_cfg_t fill_cfg;
return fill_cfg;
}
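
// Returns a fill configuration with a narrow, data-type-specific value range
// used in performance benchmarking mode.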
const fill_cfg_t &get_perf_fill_cfg(dnnl_data_type_t dt) {
assert(has_bench_mode_bit(mode_bit_t::perf));
#define CASE(dt, low_end, high_end) \
case dt: { \
static const fill_cfg_t fill_cfg(dt, MAX2((low_end), lowest_dt(dt)), \
MIN2((high_end), max_dt(dt)), /* only_int = */ false, \
attr_t::post_ops_t::kind_t::ADD, "perf_mode_fill"); \
return fill_cfg; \
}
switch (dt) {
CASE(dnnl_f4_e2m1, -2.f, 2.f);
CASE(dnnl_f4_e3m0, -2.f, 2.f);
CASE(dnnl_e8m0, -2.f, 2.f);
CASE(dnnl_f8_e5m2, -2.f, 2.f);
CASE(dnnl_f8_e4m3, -2.f, 2.f);
CASE(dnnl_bf16, -32.f, 32.f);
CASE(dnnl_f16, -8.f, 8.f);
CASE(dnnl_f32, -1024.f, 1024.f);
CASE(dnnl_f64, -1024.f, 1024.f);
CASE(dnnl_s32, -1024.f, 1024.f);
CASE(dnnl_s8, -32, 32);
CASE(dnnl_u8, 0, 64);
CASE(dnnl_s4, -8, 7);
CASE(dnnl_u4, 0, 15);
default: {
assert(!"bad data_type");
SAFE_V(FAIL);
static const fill_cfg_t dummy;
return dummy;
}
}
#undef CASE
}
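
// Fills the scales memory for the given argument. For COMMON and HOST_SCALAR
// policies the single user-provided scale is used; otherwise values are
// generated as powers of two from the [0.25; 4] range.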
int fill_scales(
const attr_t &attr, int arg, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp) {
const auto &e = attr.scales.get(arg);
return fill_scales(e, mem_dt, mem_fp);
}
int fill_scales(const attr_t::arg_scales_t::entry_t &e, dnn_mem_t &mem_dt,
dnn_mem_t &mem_fp) {
const auto nelems = mem_fp.nelems();
if (nelems == 0) return OK;
if (mem_dt) { assert(mem_dt.nelems() == mem_fp.nelems()); }
if (e.policy == policy_t::COMMON || e.policy == policy_t::HOST_SCALAR) {
assert(nelems == 1);
mem_fp.set_f32_elem(0, e.scale);
if (mem_dt) mem_dt.set_elem(0, e.scale);
} else {
/* Do fixed partitioning to get the same filling regardless of the number of threads */
static constexpr int64_t chunk_size = 64;
const int64_t n_chunks = div_up(nelems, chunk_size);
benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
int64_t idx_start = idx_chunk * chunk_size;
int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
// Note: we use a different seed for each chunk to avoid
// repeating patterns. We could use discard(idx_start) instead,
// but its complexity is O(idx_start). We also add 1 to avoid
// seeding with 0.
std::minstd_rand int_seed(idx_start + 1);
int_seed.discard(1);
std::uniform_int_distribution<> gen(-2, 2);
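// Generate scale values as powers of two from {1/4, 1/2, 1, 2, 4}.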
for (int64_t idx = idx_start; idx < idx_end; ++idx) {
int pow2 = gen(int_seed);
int pow2_shift = 1 << std::abs(pow2);
const float gen_val
= pow2 < 0 ? (1.f / pow2_shift) : pow2_shift;
const float val = gen_val;
mem_fp.set_f32_elem(idx, val);
if (mem_dt) mem_dt.set_elem(idx, val);
}
});
}
return OK;
}
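
// Fills the zero-points memory for the given argument. For COMMON and
// HOST_SCALAR policies the single user-provided value is used; otherwise small
// integer values are generated, bounded from below by the lowest value
// representable in the target data type.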
int fill_zero_points(
const attr_t &attr, int arg, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp) {
const auto nelems = mem_fp.nelems();
if (nelems == 0) return OK;
assert(mem_dt.nelems() == mem_fp.nelems());
const auto &e = attr.zero_points.get(arg);
if (e.policy == policy_t::COMMON || e.policy == policy_t::HOST_SCALAR) {
assert(nelems == 1);
mem_fp.set_f32_elem(0, e.value);
if (mem_dt) mem_dt.set_elem(0, e.value);
} else {
/* Do fixed partitioning to get the same filling regardless of the number of threads */
static constexpr int64_t chunk_size = 64;
const int64_t n_chunks = div_up(nelems, chunk_size);
const int min_val = MAX2(-2, static_cast<int>(lowest_dt(mem_dt.dt())));
benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
int64_t idx_start = idx_chunk * chunk_size;
int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
// Note: we use a different seed for each chunk to avoid
// repeating patterns. We could use discard(idx_start) instead,
// but its complexity is O(idx_start). We also add 1 to avoid
// seeding with 0.
std::minstd_rand int_seed(idx_start + 1);
int_seed.discard(1);
std::uniform_int_distribution<> gen(min_val, 2);
for (int64_t idx = idx_start; idx < idx_end; ++idx) {
const float zp_val = gen(int_seed);
mem_fp.set_f32_elem(idx, zp_val);
if (mem_dt) mem_dt.set_elem(idx, zp_val);
}
});
}
return OK;
}
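
// Fills `mem_ref` (and `mem`, when provided) with random real values according
// to `fill_cfg`. Values are rounded to the data type of `mem` when it exists,
// otherwise to that of `mem_ref`, and `mem` is finally updated via a reorder
// from `mem_ref`.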
int fill_random_real_dense(dnn_mem_t &mem, dnn_mem_t &mem_ref, res_t *res,
const fill_cfg_t &fill_cfg) {
auto nelems = mem_ref.nelems();
if (nelems == 0) return OK;
BENCHDNN_PRINT(6, "%s\n", fill_cfg.print_verbose().c_str());
// This function doesn't handle the predefined set yet.
assert(fill_cfg.predefined_set_.empty());
// The `nelems()` function returns a product of dims/pdims regardless of
// whether the tensor is dense or sparse (this is by design). Because of
// that we need to adjust the `nelems` value for the sparse tensor as the
// number of elements to fill is equal to `nnz`.
if (mem_ref.format_kind() == dnnl_format_kind_sparse)
nelems = query_md_nnz(mem_ref.md_);
// Note: fill_cfg_t drives the value distribution, but the final rounding
// follows the data type of the memory object the values are inserted into.
// Depending on the case, it may or may not be beneficial to force the same
// data type for filling and for the final memory object.
const dnnl_data_type_t round_dt = mem ? mem.dt() : mem_ref.dt();
/* Do fixed partitioning to get the same filling regardless of the number of threads */
static constexpr int64_t chunk_size = 64;
const int64_t n_chunks = div_up(nelems, chunk_size);
benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
int64_t idx_start = idx_chunk * chunk_size;
int64_t idx_end = MIN2(idx_start + chunk_size, nelems);
// Note: we use a different seed for each chunk to avoid
// repeating patterns. We could use discard(idx_start) instead,
// but its complexity is O(idx_start). We also add 1 to avoid
// seeding with 0.
std::minstd_rand int_seed(nelems + idx_start + 1);
int_seed.discard(1);
std::uniform_real_distribution<> gen_real(
fill_cfg.range_min_val_, fill_cfg.range_max_val_);
std::uniform_int_distribution<> gen_int(
fill_cfg.range_min_val_, fill_cfg.range_max_val_);
const auto get_val = [&]() {
return fill_cfg.only_integer_
? static_cast<float>(gen_int(int_seed))
: gen_real(int_seed);
};
if (mem_ref.dt() == dnnl_f32) {
for (int64_t idx = idx_start; idx < idx_end; ++idx) {
float val = get_val();
mem_ref.set_f32_elem(
idx, round_to_nearest_representable(round_dt, val));
}
} else {
// There are some rare scenarios when mem_ref is not f32.
for (int64_t idx = idx_start; idx < idx_end; ++idx) {
float val = get_val();
mem_ref.set_elem(
idx, round_to_nearest_representable(round_dt, val));
}
}
});
// Note: the `only_integer_` option is tricky. While it helps to avoid
// triggering cancellation effects, it doesn't allow validating loads
// properly due to the values used. To ensure the library works properly,
// add a fractional part of 0.5f for all floating-point types regardless of
// the setting, and for all integral types use values not representable in
// other data types to trigger a potential overflow.
if (fill_cfg.only_integer_) {
const auto adjust_val = [&](float orig_val) {
if (!is_integral_dt(round_dt)) {
// Catch faulty integer loads instead of fp loads.
return orig_val + 0.5f >= fill_cfg.range_max_val_
? orig_val - 0.5f
: orig_val + 0.5f;
} else if (round_dt == dnnl_s8) {
; // Using an s8 value of -128 leads to the binary_mul algorithm
// magnifying the diff (at least for eltwise) when it used to fit.
// A general solution for the problem is needed.
} else if (round_dt == dnnl_u8) {
return 128.f; // catch faulty s8 loads instead of u8.
} else if (round_dt == dnnl_s32) {
return 256.f; // catch faulty int8 loads instead of s32.
} else if (round_dt == dnnl_u4) {
return 15.f;
} else if (round_dt == dnnl_s4) {
return -8.f;
} else {
assert(!"unexpected data type");
}
return orig_val;
};
// There are some rare scenarios when mem_ref is not f32. Since only a
// single element per tensor is updated, the regular interface can be used.
const float elem_first_val = adjust_val(mem_ref.get_elem(0));
mem_ref.set_elem(
0, round_to_nearest_representable(round_dt, elem_first_val));
}
if (mem) {
// TODO: move `res` inside reorder.
auto status = mem.reorder(mem_ref);
if (status != OK) {
if (res) res->state = FAILED;
return status;
}
}
return OK;
}
// Since a sparsity pattern affects performance, it's crucial to keep the
// pattern intact and only randomize tensor values. Thus, the function relies
// on the assumption that every sparse format contains three handles, where the
// second and the third are responsible for the sparsity pattern and are
// **already filled**.
int fill_random_real_sparse(const_dnnl_memory_t dnnl_memory, dnn_mem_t &mem,
dnn_mem_t &mem_ref, res_t *res, const fill_cfg_t &fill_cfg) {
auto orig_cc_mem_md = query_md(dnnl_memory);
const int nhandles = query_md_num_handles(orig_cc_mem_md);
assert(nhandles == 3);
// Copy the content of the metadata buffers verbatim; the data handle is filled further below.
for (int idx = 1; idx < nhandles; idx++) {
void *dst_ptr = mem_ref.get_mapped_pointer<void>(idx);
void *src_ptr = nullptr;
dnnl_memory_get_data_handle_v2(dnnl_memory, &src_ptr, idx);
const size_t size = dnnl_memory_desc_get_size_v2(orig_cc_mem_md, idx);
std::memcpy(dst_ptr, src_ptr, size);
}
return fill_random_real_dense(mem, mem_ref, res, fill_cfg);
}
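
// Dispatches to the sparse or dense filling routine based on the format kind
// of `mem_ref`. A minimal illustrative call from a driver (variable names here
// are hypothetical) could look like:
//     fill_cfg_t cfg(dnnl_f32, -16.f, 16.f, /* only_integer = */ false,
//             attr_t::post_ops_t::kind_t::ADD, "src_fill");
//     SAFE(fill_random_real(mem_dt, mem_fp, res, cfg, nullptr), WARN);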
int fill_random_real(dnn_mem_t &mem, dnn_mem_t &mem_ref, res_t *res,
const fill_cfg_t &fill_cfg, const_dnnl_memory_t dnnl_memory) {
if (mem_ref.format_kind() == dnnl_format_kind_sparse) {
assert(dnnl_memory != nullptr);
return fill_random_real_sparse(
dnnl_memory, mem, mem_ref, res, fill_cfg);
}
return fill_random_real_dense(mem, mem_ref, res, fill_cfg);
}
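
// Convenience overload that fills only the reference memory: no library memory
// to reorder into and no result object to update.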
int fill_random_real(dnn_mem_t &mem_ref, const fill_cfg_t &fill_cfg,
const_dnnl_memory_t dnnl_memory) {
dnn_mem_t dummy;
return fill_random_real(dummy, mem_ref, nullptr, fill_cfg, dnnl_memory);
}
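
// Decodes a DNNL_ARG_* execution argument value into a human-readable string:
// the post-op slot (if any), attribute flag bits such as scales or zero-points,
// and the base argument are printed, separated by " | ".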
std::string execarg2str(int exec_arg) {
using arg_map_t = std::unordered_map<int, std::string>;
#define ARG(id) \
{ id, #id }
static const arg_map_t ids = {
ARG(DNNL_ARG_UNDEF),
ARG(DNNL_ARG_SRC_0),
ARG(DNNL_ARG_SRC_1),
ARG(DNNL_ARG_SRC_2),
ARG(DNNL_ARG_SRC_3),
ARG(DNNL_ARG_DST_0),
ARG(DNNL_ARG_DST_1),
ARG(DNNL_ARG_DST_2),
ARG(DNNL_ARG_WEIGHTS_0),
ARG(DNNL_ARG_WEIGHTS_1),
ARG(DNNL_ARG_WEIGHTS_2),
ARG(DNNL_ARG_WEIGHTS_3),
ARG(DNNL_ARG_BIAS),
ARG(DNNL_ARG_REDUCE),
ARG(DNNL_ARG_MEAN),
ARG(DNNL_ARG_VARIANCE),
ARG(DNNL_ARG_SCALE),
ARG(DNNL_ARG_SHIFT),
ARG(DNNL_ARG_WORKSPACE),
ARG(DNNL_ARG_SCRATCHPAD),
ARG(DNNL_ARG_DIFF_SRC_0),
ARG(DNNL_ARG_DIFF_SRC_1),
ARG(DNNL_ARG_DIFF_SRC_2),
ARG(DNNL_ARG_DIFF_SRC_3),
ARG(DNNL_ARG_DIFF_DST_0),
ARG(DNNL_ARG_DIFF_DST_1),
ARG(DNNL_ARG_DIFF_DST_2),
ARG(DNNL_ARG_DIFF_WEIGHTS_0),
ARG(DNNL_ARG_DIFF_WEIGHTS_1),
ARG(DNNL_ARG_DIFF_WEIGHTS_2),
ARG(DNNL_ARG_DIFF_WEIGHTS_3),
ARG(DNNL_ARG_DIFF_BIAS),
ARG(DNNL_ARG_DIFF_SCALE),
ARG(DNNL_ARG_DIFF_SHIFT),
ARG(DNNL_ARG_ATTR_ROUNDING_SEED),
ARG(DNNL_ARG_ATTR_DROPOUT_MASK),
ARG(DNNL_ARG_ATTR_DROPOUT_PROBABILITY),
ARG(DNNL_ARG_ATTR_DROPOUT_SEED),
};
static const arg_map_t flags = {
ARG(DNNL_ARG_ATTR_PRECOMPUTED_REDUCTIONS),
ARG(DNNL_ARG_ATTR_SCALES),
ARG(DNNL_ARG_ATTR_ZERO_POINTS),
ARG(DNNL_ARG_ATTR_POST_OP_DW),
};
static const arg_map_t post_ops = {
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(0)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(1)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(2)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(3)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(4)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(5)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(6)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(7)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(8)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(9)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(10)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(11)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(12)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(13)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(14)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(15)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(16)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(17)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(18)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(19)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(20)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(21)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(22)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(23)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(24)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(25)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(26)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(27)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(28)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(29)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(30)),
ARG(DNNL_ARG_ATTR_MULTIPLE_POST_OP(31)),
};
#undef ARG
#define SPACE " | "
auto search = [](const arg_map_t &map, int i) {
auto it = map.find(i);
if (it != map.end()) return SPACE + it->second;
if (!i) return std::string();
std::ostringstream oss;
oss << std::hex << i;
return SPACE "0x" + oss.str();
};
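// Decompose `exec_arg`: the bits at and above
// DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE select the post-op slot, the flag bits
// mark attribute arguments, and the remainder is the base argument.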
std::string retn = search(
post_ops, exec_arg & ~(DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE - 1));
exec_arg &= DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE - 1;
for (auto &flag : flags) {
assert(((flag.first - 1) & flag.first) == 0);
if (exec_arg & flag.first) {
retn += SPACE + flag.second;
exec_arg &= ~flag.first;
}
}
if (exec_arg >= DNNL_ARG_MULTIPLE_DST) {
retn += SPACE "DNNL_ARG_MULTIPLE_DST+"
+ std::to_string(exec_arg - DNNL_ARG_MULTIPLE_DST);
} else if (exec_arg >= DNNL_ARG_MULTIPLE_SRC) {
retn += SPACE "DNNL_ARG_MULTIPLE_SRC+"
+ std::to_string(exec_arg - DNNL_ARG_MULTIPLE_SRC);
} else {
retn += search(ids, exec_arg); // including DNNL_ARG_UNDEF
}
return retn.erase(0, sizeof(SPACE) - 1);
#undef SPACE
}
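
// Path prefix for binary files used to fill buffers from disk; when empty,
// filling from files is disabled.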
std::string buffer_prefix;
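// Attempts to fill `mem` from a binary file named
// "<buffer_prefix>.<exec_arg>.bin" and to reorder it into `ref_mem` when the
// latter exists. Returns true when the buffer was imported, false when filling
// from files is disabled, the buffer is empty, or the file is missing; size,
// read, or reorder errors are reported as failures.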
bool fill_from_file(int exec_arg, dnn_mem_t &mem, dnn_mem_t &ref_mem) {
static const char format[] = "File %s %s; buffer not imported.\n";
auto prefix = buffer_prefix;
if (prefix.empty()) return false;
// `mem` is not supposed to be filled.
if (mem.nelems() == 0) return false;
prefix += "." + std::to_string(exec_arg) + ".bin";
FILE *file = nullptr;
#ifdef _WIN32
if (fopen_s(&file, prefix.c_str(), "rb")) file = nullptr;
#else
file = fopen(prefix.c_str(), "rb");
#endif
prefix = "'" + prefix + "' [" + execarg2str(exec_arg) + "]";
if (!file) {
BENCHDNN_PRINT(2, format, prefix.c_str(), "not found");
return false;
}
fseek(file, 0, SEEK_END);
size_t total = 0, size = ftell(file);
if (mem.size() != size) {
fclose(file);
BENCHDNN_PRINT(0, format, prefix.c_str(),
"differs in size from the buffer's memory descriptor");
BENCHDNN_PRINT(0, "Actual size: %zu\nNeeded size: %zu (%zu x %s)\n",
size, mem.size(), mem.size() / mem.sizeof_dt(),
dt2str(mem.dt()));
SAFE_V(FAIL);
return false;
}
fseek(file, 0, SEEK_SET);
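// Read the file in a loop until all `size` bytes are consumed or fread stops
// making progress.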
for (size_t read = ~0; read && (total < size); total += read)
read = fread(
static_cast<uint8_t *>(mem) + total, 1, size - total, file);
fclose(file);
if (total != size) {
BENCHDNN_PRINT(0, format, prefix.c_str(), "cannot be read correctly");
SAFE_V(FAIL);
return false;
}
if (ref_mem && (ref_mem.reorder(mem) != OK)) {
BENCHDNN_PRINT(0, format, prefix.c_str(), "cannot be reordered");
SAFE_V(FAIL);
return false;
}
BENCHDNN_PRINT(2, "File %s successfully processed; buffer imported.\n",
prefix.c_str());
return true;
}