/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifdef ENABLE_LLVM_WCONVERSION
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wimplicit-int-conversion"
#endif

#include "gpu/intel/jit/codegen/codegen.hpp"
#include "gpu/intel/jit/codegen/bank_conflict_allocation.hpp"
#include "gpu/intel/jit/codegen/kernel.hpp"
#include "gpu/intel/jit/codegen/reduce.hpp"
#include "gpu/intel/jit/codegen/register_scope.hpp"
#include "gpu/intel/jit/codegen/reorder.hpp"
#include "gpu/intel/jit/codegen/send.hpp"
#include "gpu/intel/jit/eltwise_injector.hpp"
#include "gpu/intel/jit/ir/eltwise.hpp"
#include "gpu/intel/jit/ir/fma.hpp"
#include "ngen.hpp"
#ifdef WITH_SYCL_RUNTIME
#include <optional>
#include "ngen_sycl.hpp"
#endif
#ifdef WITH_OPENCL_RUNTIME
#include "ngen_opencl.hpp"
#endif
#ifdef WITH_L0_RUNTIME
#include "ngen_level_zero.hpp"
#endif

namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace jit {

inline ngen::ConditionModifier cmp_op_to_ngen(op_kind_t op_kind) {
    gpu_assert(is_cmp_op(op_kind));
    switch (op_kind) {
        case op_kind_t::_eq: return ngen::ConditionModifier::eq;
        case op_kind_t::_ne: return ngen::ConditionModifier::ne;
        case op_kind_t::_ge: return ngen::ConditionModifier::ge;
        case op_kind_t::_gt: return ngen::ConditionModifier::gt;
        case op_kind_t::_le: return ngen::ConditionModifier::le;
        case op_kind_t::_lt: return ngen::ConditionModifier::lt;
        default: gpu_error_not_expected();
    }
    return ngen::ConditionModifier::none;
}

// Lowers IR to nGEN.
template <typename ngen_generator_t>
class ir_to_ngen_t : public ir_visitor_t {
public:
    ir_to_ngen_t(ngen_generator_t *host, const expr_binding_t &expr_binding)
        : host_(host)
        , expr_binding_(expr_binding)
        , simd_size_(host->getSIMD())
        , with_atomic_fp64_(host->hw_info().has_fp64_atomic_support()) {}

    ~ir_to_ngen_t() override
#ifdef DNNL_DEV_MODE
    {
        if (bank_conflicts_ > 0)
            gpu_warning() << "Found bank conflicts: " << bank_conflicts_;
        if (bundle_conflicts_ > 0)
            gpu_warning() << "Found bundle conflicts: " << bundle_conflicts_;
    }
#else
            = default;
#endif

    ngen::HW hw() const { return host_->getHardware(); }

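    // Binds an allocation to its buffer variable for the duration of the
    // body: GRF allocations come from the scope, a header heuristic, or a
    // shared bank-conflict-aware allocation, and the binding is released
    // once the body has been visited.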
    void _visit(const alloc_t &obj) override {
        auto scope = register_scope();
        bool do_alloc = (obj.kind == alloc_kind_t::grf);
        bool use_bc_alloc = false;
        if (do_alloc) {
            const int max_ngen_type_bits = 64;
            reg_buf_data_t rbd;
            if (obj.has_attr<bank_conflict_attr_t>()) {
                rbd = create_bank_conflict_allocation(obj);
                use_bc_alloc = true;
            } else if (obj.size * 8 <= max_ngen_type_bits) {
                rbd = scope.alloc_reg_data(type_t::u(obj.size * 8));
            } else {
                const int regs
                        = utils::div_up(obj.size, ngen::GRF::bytes(hw()));
                if (is_header(obj.buf)) {
                    rbd = alloc_header(scope, regs);
                } else {
                    rbd = scope.alloc_reg_buf(regs);
                }
            }
            if (obj.has_attr<grf_permute_attr_t>()) {
                auto &attr = obj.get_attr<grf_permute_attr_t>();
                rbd.set_grf_permutation(*attr.grf_perm);
            }
            expr_binding_.bind(obj.buf, rbd);
        }
        host_->comment(
                obj.line_str() + " -> " + expr_binding_.get(obj.buf).str());
        visit(obj.body);
        if (do_alloc) expr_binding_.unbind(obj.buf);
        if (use_bc_alloc) release_bank_conflict_allocation(obj);
    }

    void _visit(const for_t &obj) override {
        host_->comment(obj.line_str());
        auto scope = register_scope();
        auto var_op = scope.alloc_reg_data(obj.var.type());
        bool dynamic_loop = !is_const(obj.init) || !is_const(obj.bound);
        auto init_op = eval(obj.init, scope);
        auto bound_op = eval(obj.bound, scope);
        auto step_op = eval(obj.step, scope);

        expr_binding_.bind(obj.var, var_op);
        host_->comment(
                obj.var.str() + " -> " + expr_binding_.get(obj.var).str());

        host_->emov(1, var_op, init_op);

        // For dynamic loops, use the standard (check-before-body) format;
        // otherwise use the do-while format.
        if (dynamic_loop) {
            ngen::Label loop_end_label;
            ngen::Label loop_begin_label;
            host_->mark(loop_begin_label);
            host_->ecmp(1 | host_->ge | host_->f0[0], var_op, bound_op);
            host_->jmpi(1 | host_->f0[0], loop_end_label);
            visit(obj.body);

            host_->eadd(1, var_op, var_op, step_op);
            host_->jmpi(1, loop_begin_label);
            host_->mark(loop_end_label);
        } else {
            ngen::Label loop_label;
            host_->mark(loop_label);
            visit(obj.body);

            host_->eadd(1, var_op, var_op, step_op);
            host_->ecmp(1 | host_->lt | host_->f0[0], var_op, bound_op);
            host_->jmpi(1 | host_->f0[0], loop_label);
        }

        expr_binding_.unbind(obj.var);
        host_->comment("end " + obj.line_str());
    }

    void _visit(const func_call_t &obj) override {
        host_->comment(obj.line_str());
        auto scope = register_scope();

        auto &func = obj.func;
        if (func.is<dpas_t>()) {
            auto arg_ops = eval(obj.args, scope);
            dpas(func.as<dpas_t>(), arg_ops, obj.attr);
        } else if (func.is<mad_t>()) {
            auto arg_ops = eval(obj.args, scope);
            mad(scope, func.as<mad_t>(), arg_ops, obj.attr);
        } else if (func.is<send_t>()) {
            auto &send_func = func.as<send_t>();
            auto args = obj.args;
            auto &mask = send_t::arg_mask(args);
            // If all channels are disabled for writing, quick return.
            if (all_of(mask, expr_t(false))) {
                if (send_func.is_load() || send_func.is_load_2d()) {
                    auto reg_buf_op = eval(send_t::arg_reg_buf(args), scope);
                    auto pattern_op
                            = eval(send_t::arg_fill_pattern(args), scope);
                    fill_buf(reg_buf_op, send_func.payload_size(), pattern_op);
                }
                return;
            }
            // If all channels are enabled, do not use mask.
            if (all_of(mask, expr_t(true))) mask = expr_t();
            auto arg_ops = eval(args, scope);
            send(scope, func.as<send_t>(), arg_ops, obj.attr);
        } else if (func.is<reorder_t>()) {
            auto arg_ops = eval(obj.args, scope);
            gpu_assert(obj.attr.is_empty()) << "Unexpected attribute.";
            reorder(scope, func.as<reorder_t>(), arg_ops);
        } else if (func.is<reduce_t>()) {
            auto arg_ops = eval(obj.args, scope);
            gpu_assert(obj.attr.is_empty()) << "Unexpected attribute.";
            reduce(scope, func.as<reduce_t>(), arg_ops);
        } else if (func.is<eltwise_t>()) {
            auto &eltwise_func = func.as<eltwise_t>();
            auto arg_ops = eval(obj.args, scope);
            eltwise(scope, eltwise_func, arg_ops);
        } else if (func.is_same(funcs::barrier_func())) {
            barrier(obj.attr);
        } else if (func.is_same(funcs::barrier_wait_func())) {
            barrier_wait();
        } else if (func.is_same(funcs::signal_func())) {
            signal(obj.attr);
        } else if (func.is_same(funcs::slm_fence_func())) {
            slm_fence(obj.attr);
        } else if (func.is_same(funcs::zero_out_func())) {
            auto buf_op = eval(obj.args[0], scope);
            fill_buf(buf_op.reg_buf_data(), to_cpp<int>(obj.args[1]));
        } else {
            gpu_error_not_expected() << object_t(obj);
        }
    }

    void _visit(const if_t &obj) override {
        gpu_assert(obj.cond.type().elems() == simd_size_);
        host_->comment(obj.line_str());

        bool has_else = bool(obj.else_body);
        auto scope = register_scope();
        auto cond_op = eval(obj.cond, scope);

        ngen::Label l_else;
        ngen::Label l_endif;
        host_->if_(simd_size_ | cond_op.flag_register(),
                has_else ? l_else : l_endif, l_endif);
        visit(obj.body);
        if (has_else) {
            host_->comment("else // " + obj.line_str());
            host_->else_(simd_size_, l_endif, l_endif);
            host_->mark(l_else);
            visit(obj.else_body);
        }
        host_->mark(l_endif);
        host_->endif(simd_size_);
        host_->comment("end " + obj.line_str());
    }

    void _visit(const let_t &obj) override {
        if (obj.value.is_empty()) {
            auto var_op = expr_binding_.get(obj.var);
            host_->comment(obj.line_str() + " -> " + var_op.str());
            // External variable, must be already bound.
            gpu_assert(expr_binding_.is_bound(obj.var))
                    << "Variable is not defined: " << obj.var;
            visit(obj.body);
            return;
        }

        auto scope = register_scope();
        host_->comment(obj.line_str());
        if (is_const(obj.value) || is_shuffle_const(obj.value)
                || obj.var.type() != obj.value.type()) {
            auto &var_type = obj.var.type();
            auto var_op = (var_type.is_bool())
                    ? ngen_operand_t(scope.alloc_flag(var_type.elems()))
                    : ngen_operand_t(scope.alloc_reg_data(var_type));
            eval(obj.value, scope, ngen_operand_t(var_op, var_type.elems()));
            expr_binding_.bind(obj.var, var_op);
        } else {
            auto value_op = eval(obj.value, scope);
            expr_binding_.bind(obj.var, value_op);
        }

        auto var_op = expr_binding_.get(obj.var);
        host_->comment(obj.var.str() + " -> " + var_op.str());

        // At this point the scope contains allocations for temporary
        // expressions. We need to 1) query and later re-claim the allocation
        // for the let variable in a new scope and 2) release the current scope
        // allocations to reduce GRF consumption.
        ngen::GRFRange var_grf_range;
        ngen::Subregister var_sub;

        if (var_op.is_reg_data()) {
            auto var_rd = var_op.reg_data();
            var_grf_range = scope.find_grf_range(
                    var_rd.getBase(), var_rd.getByteOffset());
            var_sub = scope.find_sub(var_rd.getBase(), var_rd.getByteOffset());
        }

        // Release the current scope allocations.
        scope.clear();

        // Claim the let variable allocation.
        auto var_scope = register_scope();
        if (!var_grf_range.isInvalid()) {
            var_scope.claim(var_grf_range);
        } else if (!var_sub.isInvalid()) {
            var_scope.claim(var_sub);
        }

        visit(obj.body);
        expr_binding_.unbind(obj.var);
    }

    void _visit(const store_t &obj) override {
        host_->comment(obj.line_str());
        auto scope = register_scope();
        auto buf_op = eval(obj.buf, scope);
        auto off = to_cpp<int>(obj.off);
        auto mask_op = eval(obj.mask, scope);

        auto &type = obj.value.type();
        auto base_type = type.base();

        int stride;
        if (obj.has_default_stride()) {
            stride = 1;
        } else {
            gpu_assert(obj.stride % base_type.size() == 0);
            stride = obj.stride / base_type.size();
        }

        ngen::InstructionModifier mod = type.elems();
        if (!mask_op.is_invalid()) mod |= mask_op.flag_register_mod();
        auto dst_rbd = buf_op.reg_buf_data().format(off / base_type.size(),
                type.elems(), stride, to_ngen(base_type));
        ngen_operand_t dst(dst_rbd, mod);
        eval(obj.value, scope, dst, obj.fill_mask0 && !mask_op.is_invalid());
    }

    void _visit(const while_t &obj) override {
        host_->comment(obj.line_str());
        auto scope = register_scope();

        ngen::Label loop_end_label;
        ngen::Label loop_begin_label;

        host_->mark(loop_begin_label);
        auto cond_op = eval(obj.cond, scope);
        host_->jmpi(1 | ~cond_op.flag_register_mod(), loop_end_label);
        visit(obj.body);
        host_->jmpi(1, loop_begin_label);
        host_->mark(loop_end_label);
        host_->comment("end " + obj.line_str());
    }

private:
    bool is_header(const expr_t &buf) const {
        return buf.as<var_t>().name.find("h_") == 0;
    }

    // Allocates headers using heuristics to reduce back-to-back header reuse -
    // this helps to eliminate potential stalls caused by SWSB dependencies.
    reg_buf_t alloc_header(ngen_register_scope_t &scope, int regs) {
        auto is_used_recently = [&](const ngen::GRFRange &range) {
            if (range.isInvalid()) return false;
            for (int i = range.getBase(); i < range.getBase() + range.getLen();
                    i++) {
                for (auto &r : last_used_header_regs_)
                    if (i == r) return true;
            }
            return false;
        };
        auto record = [&](const ngen::GRFRange &range) {
            for (int i = range.getBase(); i < range.getBase() + range.getLen();
                    i++) {
                last_used_header_regs_.push_back(i);
            }
            // Remove old header registers from tracking.
            size_t cur_size = last_used_header_regs_.size();
            if (cur_size > max_tracked_header_regs) {
                last_used_header_regs_.erase(last_used_header_regs_.begin(),
                        last_used_header_regs_.begin() + cur_size
                                - max_tracked_header_regs);
            }
        };
        // Try to allocate/claim registers until we find two GRF ranges that
        // were not used recently. Registers are usually allocated
        // sequentially, and the first range may still be in use in SWSB
        // analysis: e.g. when a SIMD16 load instruction accesses one register
        // while SWSB analysis assumes it's a full SIMD32 accessing two
        // registers.
        std::vector<ngen::GRFRange> ranges;
        for (int found = 0; found < 2;) {
            auto r = scope.try_alloc_range(regs);
            ranges.push_back(r);
            if (!is_used_recently(r)) found++;
        }
        auto range = ranges.back();
        ranges.pop_back();
        for (auto &r : ranges)
            scope.safeRelease(r);
        // If no range was found, fall back to regular allocation, without
        // any heuristics.
        if (range.isInvalid()) range = scope.alloc_range(regs);
        record(range);
        return reg_buf_t(scope.hw(), range);
    }

    ngen_register_scope_t register_scope() {
        return ngen_register_scope_t(host_->ra());
    }

#ifdef DNNL_DEV_MODE
    void check_bank_conflicts(const ngen::InstructionModifier &mod,
            const ngen::RegData &_src0, const ngen::RegData &_src1,
            const ngen::RegData &_src2, bool is_dpas = false) {
        int esize = mod.getExecSize();
        int hw_simd = (hw() >= ngen::HW::XeHPC ? 16 : 8);
        auto shift = [this](const ngen::RegData &rd, int exec_off) {
            if (exec_off == 0 || rd.isNull()) return rd;
            int type_size = ngen::getBytes(rd.getType());
            int w = (exec_off % rd.getWidth());
            int h = (exec_off / rd.getWidth());
            int off = rd.getByteOffset()
                    + (w * rd.getHS() + h * rd.getVS()) * type_size;
            int grf_size = ngen::GRF::bytes(hw());
            int shifted_base = rd.getBase() + off / grf_size;
            int shifted_off = off % grf_size;
            auto ret = rd;
            ret.setBase(shifted_base);
            ret.setOffset(ir_utils::safe_divide(shifted_off, type_size));
            return ret;
        };
        for (int i = 0; i < esize; i += hw_simd) {
            auto src0 = shift(_src0, i);
            auto src1 = shift(_src1, i);
            auto src2 = shift(_src2, i);
            bool same_bank01 = ngen::Bundle::same_bank(hw(), src0, src1);
            bool same_bank02 = ngen::Bundle::same_bank(hw(), src0, src2);
            if (is_dpas) {
                if (same_bank02) bank_conflicts_++;
            } else {
                if (same_bank01 && same_bank02) bank_conflicts_++;
                if (ngen::Bundle::conflicts(hw(), src0, src1)
                        || ngen::Bundle::conflicts(hw(), src0, src2)
                        || ngen::Bundle::conflicts(hw(), src1, src2)) {
                    bundle_conflicts_++;
                }
            }
        }
    }
#else
    template <typename... ArgsT>
    void check_bank_conflicts(const ArgsT &...) {}
#endif

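    // Bank-conflict-aware allocations are shared between buffers that carry
    // the same bank_conflict_attr_t: the allocation is reference-counted and
    // erased only once every participating buffer has been released.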
    reg_buf_t create_bank_conflict_allocation(const alloc_t &alloc) {
        auto &bc_attr = alloc.get_attr<bank_conflict_attr_t>();
        auto it = bc_allocations_.find(bc_attr);
        if (it != bc_allocations_.end()) {
            it->second.retain();
            return it->second.get_reg_buf(alloc.buf);
        }
        auto bca = bank_conflict_allocation_t::create(host_->ra(), bc_attr);
        if (bca.is_empty()) return {};

        auto ret = bc_allocations_.emplace(bc_attr, std::move(bca));
        return ret.first->second.get_reg_buf(alloc.buf);
    }

    void release_bank_conflict_allocation(const alloc_t &alloc) {
        auto &bc_attr = alloc.get_attr<bank_conflict_attr_t>();
        auto it = bc_allocations_.find(bc_attr);
        gpu_assert(it != bc_allocations_.end());
        it->second.release(alloc.buf);
        if (it->second.refs() == 0) bc_allocations_.erase(bc_attr);
    }

    void signal(const func_call_attr_t &attr) {
        ngen::InstructionModifier mod;
        if (attr) mod = mod | attr.as<instruction_modifier_attr_t>().mod;
        host_->barriermsg(mod, host_->signal_header());
    }

    void barrier_wait() { host_->barrierwait(); }

    void slm_fence(const func_call_attr_t &attr) {
        auto scope = register_scope();
        auto tmp = scope.alloc();
        ngen::InstructionModifier mod;
        if (attr) mod = mod | attr.as<instruction_modifier_attr_t>().mod;

        host_->slmfence(mod, tmp, host_->r0);
        host_->fencewait();
    }

    void barrier(const func_call_attr_t &attr) {
        auto scope = register_scope();
        auto tmp = scope.alloc();
        ngen::InstructionModifier mod;
        if (attr) mod = mod | attr.as<instruction_modifier_attr_t>().mod;

        host_->slmfence(mod, tmp, host_->r0);
        host_->fencewait();
        host_->barriermsg(mod, host_->signal_header());
        host_->barrierwait();
    }

    void dpas(const dpas_t &dpas_func, const std::vector<ngen_operand_t> &args,
            const func_call_attr_t &attr) {
        auto dst = dpas_t::arg_dst(args).reg_buf_data();
        auto src1 = dpas_t::arg_src1(args).reg_buf_data();
        auto src2 = dpas_t::arg_src2(args).reg_buf_data();

        if (dpas_func.is_dpasw) dst = dst.unpermute();

        int esize = dpas_func.exec_size;

        ngen::RegData src0;
        auto &src0_op = dpas_t::arg_src0(args);
        if (!src0_op.is_immediate()) {
            auto src0_rbd = src0_op.reg_buf_data().format(
                    0, esize, 1, to_ngen(dpas_func.dst_type));
            if (dpas_func.is_dpasw) src0_rbd = src0_rbd.unpermute();
            src0 = src0_rbd;
        } else {
            gpu_assert(src0_op.is_immediate());
            gpu_assert(to_cpp<int32_t>(src0_op.immediate()) == 0);
            src0 = host_->null.retype(to_ngen(dpas_func.dst_type));
        }

        dst = dst.format(0, esize, 1, to_ngen(dpas_func.dst_type));
        src1 = src1.format(0, esize, 1, to_ngen(dpas_func.src1_type));
        int src2_width = (dpas_func.is_dp4a() ? 1 : esize);
        int src2_stride = (dpas_func.is_dp4a() ? 0 : 1);
        src2 = src2.format(
                0, src2_width, src2_stride, to_ngen(dpas_func.src2_type));

        ngen::InstructionModifier mod = esize;
        if (attr) mod = mod | attr.as<instruction_modifier_attr_t>().mod;
        check_bank_conflicts(mod, src0, src1, src2, /*is_dpas=*/true);
        if (dpas_func.is_dpasw) {
            host_->dpasw(mod, dpas_func.sdepth, dpas_func.rcount, dst, src0,
                    src1, src2);
        } else if (dpas_func.is_dp4a()) {
            if (src0.isNull()) {
                host_->dp4a(mod, dst, 0, src1, src2);
            } else {
                host_->dp4a(mod, dst, src0, src1, src2);
            }
        } else {
            host_->dpas(mod, dpas_func.sdepth, dpas_func.rcount, dst, src0,
                    src1, src2);
        }
    }

    void mad(ngen_register_scope_t &scope, const mad_t &mad_func,
            const std::vector<ngen_operand_t> &args,
            const func_call_attr_t &attr) {
        auto dst = mad_t::arg_dst(args).reg_buf_data();
        auto src1 = mad_t::arg_src1(args).reg_buf_data();
        auto src2 = mad_t::arg_src2(args).reg_buf_data();

        ngen::RegData src0;
        auto &src0_op = mad_t::arg_src0(args);
        if (!src0_op.is_immediate()) {
            src0 = src0_op.reg_buf_data()
                           .format(0, mad_func.exec_size, 1,
                                   to_ngen(mad_func.dst_type))
                           .reg_data();
        } else {
            gpu_assert(src0_op.is_immediate());
            gpu_assert(to_cpp<int32_t>(src0_op.immediate()) == 0);
            src0 = host_->null;
            src0.setType(to_ngen(mad_func.dst_type));
        }

        dst = dst.format(0, mad_func.exec_size, 1, to_ngen(mad_func.dst_type));

        int src1_width = (mad_func.src1_stride == 0 ? 1 : mad_func.exec_size);
        int src2_width = (mad_func.src2_stride == 0 ? 1 : mad_func.exec_size);
        src1 = src1.format(0, src1_width, mad_func.src1_stride,
                to_ngen(mad_func.src1_type));
        src2 = src2.format(0, src2_width, mad_func.src2_stride,
                to_ngen(mad_func.src2_type));

        ngen::InstructionModifier mod = mad_func.exec_size;
        if (attr) mod = mod | attr.as<instruction_modifier_attr_t>().mod;

        check_bank_conflicts(mod, src0, src1, src2, /*is_dpas=*/false);
        if (src0.isNull()) {
            host_->mul(mod, dst, src1, src2);
        } else {
            gpu_assert(dst.byte_offset() == src0.getByteOffset())
                    << "dst/src0 must be aligned to the same GRF offset.";
            align_src_dst_offset(host_, scope, mod, dst, src1, src2);
            if (mad_func.dst_type == type_t::f64()
                    && src1.reg_data().getHS() == 0
                    && src1.reg_data().getVS() == 0) {
                // Workaround for sporadic f64 mad errors with broadcast src1 on XeHPC.
                host_->mad(mod, dst, src0, src2, src1);
            } else {
                host_->mad(mod, dst, src0, src1, src2);
            }
        }
    }

    void fill_buf(const ngen_operand_t &buf_op, int size,
            const ngen_operand_t &pattern = {}) const {
        auto &rd = buf_op.reg_buf_data();
        type_t type = (pattern.is_invalid() ? type_t::f32() : type_t::u32());
        int grf_size = ngen::GRF::bytes(hw());
        int step = 2 * grf_size / type.size();
        int elems = size / type.size();
        for (int i = 0; i < elems; i += step) {
            step = std::min(step, elems - i);
            step = utils::rnd_down_pow2(step);
            auto sub_rd_mov = rd.format(i, step, 1, to_ngen(type)).reg_data();
            if (pattern.is_invalid()) {
                host_->emov(step, sub_rd_mov, ngen::Immediate(0));
            } else if (pattern.is_immediate()) {
                host_->emov(step, sub_rd_mov, pattern.immediate());
            } else if (pattern.is_reg_data()) {
                host_->emov(step, sub_rd_mov, pattern.reg_data());
            } else {
                gpu_error_not_expected();
            }
        }
    }

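    // Returns a send payload that is contiguous in the GRF. If the source
    // buffer is already dense it is used directly; otherwise (stores and
    // atomics only) the payload is copied into a dense temporary range.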
    ngen::RegData send_maybe_make_dense_payload(ngen_register_scope_t &scope,
            const send_t &send_func, const ngen_operand_t &op_buf) const {
        if (send_func.is_prefetch() || send_func.is_prefetch_2d())
            return ngen::RegData(host_->null);

        auto &buf = op_buf.reg_buf_data();
        int size = send_func.payload_size();
        bool is_dense = buf.is_dense(size);
        if (is_dense) return ngen::GRF(buf.base());

        if (send_func.is_load() || send_func.is_load_2d()) {
            gpu_error_not_expected()
                    << "Expected dense GRF region for load message.";
            return ngen::RegData();
        }

        gpu_assert(send_func.is_store() || send_func.is_store_2d()
                || send_func.is_atomic());

        // Reorder buffer to a dense buffer for store.
        int grf_size = ngen::GRF::bytes(hw());
        int grf_elems = grf_size / ngen::getBytes(buf.type());
        int regs = utils::div_up(size, grf_size);

        auto tmp = scope.alloc_range(regs);

        int dwords = ngen::GRF::bytes(hw()) / sizeof(int32_t);
        int max_step = 2;
        for (int i = 0; i < regs;) {
            auto sub_buf = buf.format(i * grf_elems);
            int step = std::min(max_step, regs - i);
            if (step > 1 && !sub_buf.is_dense(step * grf_size)) step = 1;
            int esize = step * dwords;
            auto src = sub_buf.subregister(ngen::DataType::ud)(1);
            auto dst = tmp[i].ud(0)(1);
            host_->emov(esize, dst, src);
            i += step;
        }
        return tmp[0];
    }

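    // Emulates an atomic floating-point add for targets without native
    // support: loads the current value, adds the payload, and retries with an
    // atomic compare-and-write (cmpwr) message until the stored value is
    // unchanged, with an extra NaN check to avoid looping forever.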
    void send_atomic_add_emu(ngen_register_scope_t &scope,
            const send_t &send_func, const ngen_operand_t &mask_op,
            ngen::InstructionModifier &mod, const ngen::RegData &mem_off_op,
            ngen::RegData &rd) const {
        int size = send_func.payload_size();
        gpu_assert((send_func.type.is_dword() || send_func.type.is_qword())
                && (size == 32 || size == 64))
                << "expected atomic message dwordx8 or qwordx8";
        auto load_func = send_t::make(send_func.hw, send_op_t::load,
                send_func.address, send_func.type, send_func.slots,
                send_func.fill_buf, send_func.cache_hint);
        auto &load_send = load_func.as<send_t>();
        send_impl_t load(load_send);
        auto cmpwr_func = send_t::make(send_func.hw, send_op_t::atomic_cmpwr,
                send_func.address, send_func.type, send_func.slots,
                send_func.fill_buf, send_func.cache_hint);
        auto &cmpwr_send = cmpwr_func.as<send_t>();
        send_impl_t cmpwr(cmpwr_send);
        bool is_df = send_func.type.is_qword();

        int grf_size = ngen::GRF::bytes(hw());
        int regs = utils::div_up(size, grf_size);

        auto new_val = scope.alloc_range(2 * regs);
        auto old_save = scope.alloc_range(regs);
        auto flag = scope.alloc_flag(send_func.slots);
        ngen::Label atomic_label;
        rd.setType(is_df ? ngen::DataType::df : ngen::DataType::f);

        load.emit(host_, scope, mod, mem_off_op,
                is_df ? new_val[0].df(0) : new_val[0].f(0));

        if (mask_op.is_invalid())
            host_->emov(1, flag, ngen::Immediate(uint16_t((1 << 8) - 1)));
        else
            host_->and_(1, flag, mod.getFlagReg(),
                    ngen::Immediate(uint16_t((1 << 8) - 1)));

        auto region
                = is_df ? new_val[2].df(0)(4, 4, 1) : new_val[1].f(0)(8, 8, 1);
        auto old_region
                = is_df ? new_val[0].df(0)(4, 4, 1) : new_val[0].f(0)(8, 8, 1);
        auto old_save_region = is_df ? old_save[0].df(0)(4, 4, 1)
                                     : old_save[0].f(0)(8, 8, 1);
        int esize = (is_df && size < 64) ? 4 : 8;
        host_->mark(atomic_label);
        host_->emov(esize, old_save_region, old_region);
        auto ne_mod = esize | flag | host_->ne | flag;
        auto eq_mod = esize | flag | host_->eq | flag;
        host_->add(esize, region, old_region, rd.setRegion(4, 4, 1));
        auto cmpwr_mod = mod;
        cmpwr_mod.setPredCtrl(ngen::PredCtrl::None);
        cmpwr_mod |= flag;
        cmpwr.emit(host_, scope, cmpwr_mod, old_region, mem_off_op, old_region);
        host_->cmp(ne_mod, old_save_region, old_region);
        // The previous comparison always fails for NaNs so check for NaNs
        // explicitly to prevent an infinite loop.
        host_->cmp(eq_mod, old_region, old_region);
        host_->while_(esize | flag, atomic_label);
    }

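    // Lowers a send IR call: applies the channel mask and slot mask, makes
    // the payload dense when required, and falls back to the emulated atomic
    // add on hardware without the needed native atomic support.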
    void send(ngen_register_scope_t &scope, const send_t &send_func,
            const std::vector<ngen_operand_t> &args,
            const func_call_attr_t &attr) const {
        send_impl_t spec_impl(send_func);
        auto &mem_off_op = send_t::arg_mem_off(args);
        auto &reg_buf_op = send_t::arg_reg_buf(args);
        auto &mask_op = send_t::arg_mask(args);
        auto &fill_pattern = send_t::arg_fill_pattern(args);

        ngen::InstructionModifier mod = send_func.nmasks();
        gpu_assert(math::is_pow2(mod.getExecSize()));
        if (attr) mod |= attr.as<instruction_modifier_attr_t>().mod;
        if (!mask_op.is_invalid()) mod |= mask_op.flag_register_mod();

        // Zero-out inactive channels unless told not to.
        if (send_func.fill_buf
                && (send_func.is_load() || send_func.is_load_2d())
                && mod.getPredCtrl() != ngen::PredCtrl::None) {
            fill_buf(reg_buf_op, send_func.payload_size(), fill_pattern);
        }

        // Emit send instruction.
        auto rd = send_maybe_make_dense_payload(scope, send_func, reg_buf_op);
        if (!send_func.has_default_slot_mask()) {
            if (mod.getPredCtrl() != ngen::PredCtrl::None) {
                auto flag = mod.getFlagReg();
                if (send_func.slots > 16)
                    flag = ngen::FlagRegister(flag.index() >> 1);
                host_->and_(1, flag, flag, send_func.slot_mask);
            } else {
                auto flag = scope.alloc_flag(send_func.slots);
                host_->emov(1, flag, send_func.slot_mask);
                mod |= flag;
            }
        }
        if ((hw() <= ngen::HW::XeLP && send_func.is_atomic())
                || (hw() == ngen::HW::XeHPG && send_func.is_atomic()
                        && send_func.type.is_qword() && !with_atomic_fp64_)) {
            send_atomic_add_emu(
                    scope, send_func, mask_op, mod, mem_off_op.reg_data(), rd);
        } else {
            spec_impl.emit(host_, scope, mod, mem_off_op.reg_data(), rd);
        }
    }

    void reorder(ngen_register_scope_t &scope, const reorder_t &reorder_func,
            const std::vector<ngen_operand_t> &args) const {
        auto &src_op = reorder_t::arg_src_buf(args);
        auto &dst_op = reorder_t::arg_dst_buf(args);

        reorder_impl_t reorder_impl(hw(), reorder_func);
        reorder_impl.emit(
                host_, scope, src_op.reg_buf_data(), dst_op.reg_buf_data());
    }

    void reduce(ngen_register_scope_t &scope, const reduce_t &reduce_func,
            const std::vector<ngen_operand_t> &args) const {
        auto &src_op = reduce_t::arg_src_buf(args);
        auto &dst_op = reduce_t::arg_dst_buf(args);

        reduce_impl_t reduce_impl(hw(), reduce_func, simd_size_);
        reduce_impl.emit(
                host_, scope, src_op.reg_buf_data(), dst_op.reg_buf_data());
    }

    void eltwise(ngen_register_scope_t &scope, const eltwise_t &func,
            const std::vector<ngen_operand_t> &args) {
        int elems = to_cpp<int>(hw(), eltwise_t::arg_elems(args));
        auto &data_op = eltwise_t::arg_data(args);
        const auto &data_rd = data_op.reg_buf_data();

        eltwise_injector_f32_t<typename ngen_generator_t::RootCodeGenerator>
                inj(host_, func.alg_kind, func.alpha, func.beta, func.scale);
        auto scratch = scope.alloc_range(inj.preferred_scratch_regs());
        inj.set_scratch(scratch);
        inj.prepare();

        int grf_size = ngen::GRF::bytes(hw());
        int f_size = sizeof(float);
        int step = 2 * grf_size / f_size;

        auto do_eltwise = [&](const reg_buf_data_t &r, const int count) {
            if (func.alg_kind == alg_kind::eltwise_stochastic_round) {
                gpu_assert(args.size() == 3);
                const auto &seed = args[2].reg_buf_data();
                inj.compute(ngen::GRFRange(r.base(), count),
                        seed.reg_data().getBase(), seed.reg_data().getOffset(),
                        func.dst_dt);
            } else {
                inj.compute(ngen::GRFRange(r.base(), count));
            }
        };
        for (int i = 0; i < elems; i += step) {
            ngen_register_scope_t i_scope(scope.register_allocator());
            step = std::min(step, elems - i);
            step = utils::rnd_down_pow2(step);
            int cur_elems = step;
            auto rd = data_rd.format(i, ngen::DataType::f);
            // Use temporary storage when needed to ensure:
            // - Eltwise is applied to full register
            // - Data is aligned to GRF boundary
            if ((cur_elems * f_size) % grf_size != 0 || rd.byte_offset() != 0) {
                int full_elems
                        = utils::rnd_up(cur_elems * f_size, grf_size) / f_size;
                auto tmp = i_scope.alloc_reg_data(type_t::f32(full_elems));
                emit_reorder_1d_tile(host_, i_scope, cur_elems, rd, 1, tmp, 1);
                do_eltwise(tmp, full_elems * f_size / grf_size);
                emit_reorder_1d_tile(host_, i_scope, cur_elems, tmp, 1, rd, 1);
            } else {
                do_eltwise(rd, cur_elems * f_size / grf_size);
            }
        }
    }

protected:
    ngen_operand_t eval(const expr_t &e, ngen_register_scope_t &scope,
            const ngen_operand_t &dst_operand = ngen_operand_t(),
            bool fill_mask0 = false) const {
        expr_evaluator_t<ngen_generator_t> expr_evaluator(
                host_, expr_binding_, scope);
        return expr_evaluator.eval(e, dst_operand, fill_mask0);
    }

    std::vector<ngen_operand_t> eval(const std::vector<expr_t> &exprs,
            ngen_register_scope_t &scope) const {
        expr_evaluator_t<ngen_generator_t> expr_evaluator(
                host_, expr_binding_, scope);
        return expr_evaluator.eval(exprs);
    }

private:
    ngen_generator_t *host_;
    expr_binding_t expr_binding_;
    int simd_size_;
    bool with_atomic_fp64_;

#ifdef DNNL_DEV_MODE
    int bank_conflicts_ = 0;
    int bundle_conflicts_ = 0;
#endif

    object_map_t<alloc_attr_t, bank_conflict_allocation_t> bc_allocations_;

    const size_t max_tracked_header_regs = 8;
    std::vector<int> last_used_header_regs_;
};

// Evaluates expression by emitting instructions with nGEN.
template <typename ngen_generator_t>
class expr_evaluator_t : public ir_visitor_t {
public:
    expr_evaluator_t(ngen_generator_t *host, const expr_binding_t &expr_binding,
            ngen_register_scope_t &scope)
        : host_(host), expr_binding_(expr_binding), scope_(scope) {}

    constexpr ngen::HW hw() const { return host_->getHardware(); }

    bool is_int_up_convert(const expr_t &e, type_t &type) const {
        auto it = int_up_converts_.find(e);
        if (it == int_up_converts_.end()) return false;
        type = it->second;
        return true;
    }

    // If `dst_operand` is not empty, use its pre-allocated location for the
    // result.
    ngen_operand_t eval(const expr_t &e,
            const ngen_operand_t &dst_operand = ngen_operand_t(),
            bool fill_mask0 = false) {
        if (!dst_operand.is_invalid()) {
            gpu_assert(dst_operand.mod().getExecSize() != 0);
        }
        if (expr_binding_.is_bound(e)) {
            if (!dst_operand.is_invalid()) {
                auto bind = expr_binding_.get(e);
                if (fill_mask0) {
                    gpu_assert(!bind.is_immediate());
                    host_->sel(dst_operand.mod(), dst_operand.reg_data(),
                            bind.reg_data(), 0);
                } else {
                    host_->emov(dst_operand.mod(), dst_operand, bind);
                }
                return dst_operand;
            }
        } else {
            if (dst_operand.is_invalid()) {
                visit(e);
            } else if (!fill_mask0) {
                expr_binding_.bind_dst(e, dst_operand);
                visit(e);
            } else {
                auto op = eval(e);
                gpu_assert(!op.is_immediate());
                host_->sel(dst_operand.mod(), dst_operand.reg_data(),
                        op.reg_data(), 0);
            }
        }

        return expr_binding_.get(e, /*allow_empty=*/true);
    }

    std::vector<ngen_operand_t> eval(const std::vector<expr_t> &exprs) {
        std::vector<ngen_operand_t> ret;
        for (auto &e : exprs) {
            if (!expr_binding_.is_bound(e)) visit(e);
            ret.push_back(expr_binding_.get(e));
        }
        return ret;
    }

    void _visit(const binary_op_t &obj) override {
        auto dst_op = alloc_dst_op(obj);
        auto mod = dst_op.mod();

        switch (obj.op_kind) {
            case op_kind_t::_and: {
                if (obj.type.is_bool()) {
                    auto has_and_only = [](const expr_t &bin_obj) {
                        auto bin_ops = find_objects<binary_op_t>(bin_obj);
                        for (auto &op : bin_ops) {
                            auto &bin = op.as<binary_op_t>();
                            if (is_cmp_op(bin.op_kind)
                                    && bin.op_kind != op_kind_t::_and)
                                return false;
                        }
                        return true;
                    };

                    auto a_is_var = has_and_only(obj.a);
                    auto b_is_var = has_and_only(obj.b);
                    auto a = b_is_var ? obj.b : obj.a;
                    auto b = b_is_var ? obj.a : obj.b;
                    auto flag_type = obj.type.elems() == 16
                            ? ngen::DataType::uw
                            : ngen::DataType::ud;
                    if (a_is_var && b_is_var) {
                        auto tmp0 = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);
                        auto tmp1 = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);

                        auto tmp_dst = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);
                        auto src0_op = eval(obj.a, tmp0);

                        auto src1_op = eval(obj.b, tmp1);

                        host_->eand(1, tmp_dst, src0_op, src1_op);
                        host_->emov(1, dst_op, tmp_dst);
                    } else if (a_is_var || b_is_var) {
                        auto tmp1 = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);

                        auto tmp0 = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);
                        auto tmp_dst = ngen_operand_t(
                                scope_.alloc_reg_data(to_ir(flag_type)), 1);
                        auto src0_op = eval(a, tmp0);
                        eval(b, ngen_operand_t(dst_op, mod));

                        host_->emov(1, tmp1, dst_op);
                        host_->eand(1, tmp_dst, src0_op, tmp1);
                        host_->emov(1, dst_op, tmp_dst);
                    } else {
                        eval(a, dst_op);
                        eval(b,
                                ngen_operand_t(dst_op,
                                        mod | dst_op.flag_register_mod()));
                    }
                    break;
                }
                // else fall through to the default label.
            }
            default: {
                // Some cases require pre-allocated register regions with
                // special strides for a/b.
                auto scope = ngen_register_scope_t(host_->ra());
                auto a_out_op = maybe_alloc_strided_op(obj.type, obj.a, scope);
                auto b_out_op = maybe_alloc_strided_op(obj.type, obj.b, scope);
                bool is_mul = obj.op_kind == op_kind_t::_mul;
                flag_setter_t no_vs(&allow_vert_stride_region_, !is_mul);
                auto src0_op = eval(obj.a, a_out_op);
                auto src1_op = eval(obj.b, b_out_op);

                if ((src0_op.is_reg_buf_data()
                            && src0_op.reg_buf_data().hs() != 0)
                        || (src1_op.is_reg_buf_data()
                                && src1_op.reg_buf_data().hs() != 0))
                    mod.setExecSize(obj.type.elems());

                ebinary(obj, mod, dst_op, src0_op, src1_op);
                break;
            }
        }

        bind(obj, dst_op);
    }

    void _visit(const bool_imm_t &obj) override {
        // Scalar booleans must never be directly lowered:
        // - Booleans are mapped to flag registers
        // - Flag register stores vector of boolean vectors
        // - All boolean values in IR must be expressed by shuffle_t objects
        // - _visit(shuffle_t *) must properly handle vector of booleans -> flag
        //   register lowering
        gpu_error_not_expected();
    }

    void _visit(const cast_t &obj) override {
        auto &from_type = obj.expr.type();
        auto &to_type = obj.type;

        gpu_assert(from_type != to_type) << "Equal types are not expected.";

        if (is_const(obj.expr) && !to_type.is_bool()) {
            if (obj.expr.type().is_bool()) {
                bind(obj,
                        to_ngen(expr_t(to_cpp<bool>(obj.expr) ? 1 : 0),
                                to_type));
            } else {
                bind(obj, to_ngen(obj.expr, to_type));
            }
            return;
        }

        auto dst_op = alloc_dst_op(obj);

        // Handle ptr -> u64 and u64 -> ptr casts.
        if (utils::one_of(
                    obj.type, type_t::u64(), type_t::byte(type::attr_t::ptr))
                && utils::one_of(obj.expr.type(), type_t::u64(),
                        type_t::byte(type::attr_t::ptr))) {
            eval(obj.expr, dst_op);
            bind(obj, dst_op);
            return;
        }

        // Handle integer (down-)conversion, assume bitwise equality in this
        // case. Examples: d <-> ud, d -> w, q -> d.
        bool is_int_convert = from_type.is_scalar() && to_type.is_scalar()
                && from_type.is_int() && to_type.is_int();
        bool is_int_down_convert
                = is_int_convert && from_type.size() >= to_type.size();
        bool is_int_up_convert
                = is_int_convert && from_type.size() < to_type.size();
        if (is_int_down_convert) {
            eval(obj.expr, dst_op.reinterpret(from_type));
            bind(obj, dst_op);
            return;
        }

        auto expr_op = eval(obj.expr);
        auto mod = dst_op.mod();
        if (obj.saturate) mod |= host_->sat;
        host_->emov(mod, dst_op, expr_op);
        if (is_int_up_convert) int_up_converts_.emplace(obj, from_type);
        bind(obj, dst_op);
    }

    void _visit(const float_imm_t &obj) override { bind(obj, to_ngen(obj)); }

    void _visit(const iif_t &obj) override {
        auto dst_op = alloc_dst_op(obj);
        auto cond_op = eval(obj.cond);
        auto true_expr_op = eval(obj.true_expr);
        auto false_expr_op = eval(obj.false_expr);
        auto mod = dst_op.mod();
        host_->esel(mod | cond_op.flag_register_mod(), dst_op, true_expr_op,
                false_expr_op);
        bind(obj, dst_op);
    }

    void _visit(const int_imm_t &obj) override { bind(obj, to_ngen(obj)); }

    void _visit(const load_t &obj) override {
        auto &type = obj.type;
        auto base_type = type.base();
        auto buf_op = eval(obj.buf);
        auto off_op = eval(obj.off);
        int stride;
        if (obj.has_default_stride()) {
            stride = 1;
        } else {
            gpu_assert(obj.stride % base_type.size() == 0);
            stride = obj.stride / base_type.size();
        }
        int off = to_cpp<int>(off_op.immediate());
        auto load_rbd = buf_op.reg_buf_data().format(off / base_type.size(),
                type.elems(), stride, to_ngen(base_type));
        bind(obj, load_rbd);
    }

    void _visit(const ref_t &obj) override {
        auto &type = obj.type;
        auto base_type = type.base();
        auto buf_op = eval(obj.var);
        int off = obj.off;
        auto load_rbd = buf_op.reg_buf_data().format(
                off, type.elems(), /*stride=*/1, to_ngen(base_type));
        bind(obj, load_rbd);
    }

    void _visit(const ptr_t &obj) override {
        auto base_op = eval(obj.base);

        if (is_zero(obj.off)) {
            bind(obj, base_op);
            return;
        }

        gpu_assert(base_op.is_reg_buf_data());

        int elem_off = to_cpp<int>(obj.off);
        int byte_off = ir_utils::safe_divide(
                elem_off * obj.type.base().bitsize(), 8);
        bind(obj, base_op.reg_buf_data().format(byte_off, ngen::DataType::ub));
    }

    void _visit(const shuffle_t &obj) override {
        int elems = obj.elems();
        if (obj.type.is_bool() && is_shuffle_const(obj)) {
            auto dst_op = alloc_dst_op(obj);
            auto e_shuffle = expr_t(obj);
            gpu_assert(dst_op.is_flag_register()
                    || dst_op.type() == ngen::DataType::uw
                    || dst_op.type() == ngen::DataType::ud)
                    << e_shuffle;
            gpu_assert(!dst_op.is_negated()) << e_shuffle;
            uint32_t flag_mask = 0;
            for (int i = elems - 1; i >= 0; i--) {
                flag_mask <<= 1;
                flag_mask |= (to_cpp<bool>(e_shuffle[i]) ? 1 : 0);
            }
            if (dst_op.mod().getPredCtrl() == ngen::PredCtrl::None) {
                host_->emov(1, dst_op, ngen::Immediate(flag_mask));
            } else {
                gpu_assert(dst_op.mod().getFlagReg().getARFBase()
                        == dst_op.flag_register().getARFBase());
                host_->and_(1, dst_op.flag_register(), dst_op.flag_register(),
                        ngen::Immediate(flag_mask));
            }
            bind(obj, dst_op);
            return;
        }

        if (obj.is_broadcast()) {
            if (obj.type.is_bool()) {
                auto dst_op = alloc_dst_op(obj);
                eval(obj.vec[0], dst_op);
                bind(obj, dst_op);
            } else {
                auto scalar_op = eval(obj.vec[0]);
                bind(obj, scalar_op);
            }
            return;
        }

        if (try_region_peephole(obj)) return;
        if (try_packed_int_peephole(obj)) return;

        // tuples: <offset, length, idx>
        std::vector<std::tuple<int, int, int>> chunks;
        for (int i = 0; i < elems; i++) {
            int idx = obj.idx[i];
            if (chunks.empty() || std::get<2>(chunks.back()) != idx) {
                chunks.emplace_back(i, 1, idx);
            } else {
                std::get<1>(chunks.back())++;
            }
        }

        auto dst_op = alloc_dst_op(obj);
        auto op = ngen_operand_t(scope_.alloc_reg_data(type_t::u16(1)), 1);
        for (auto &chunk : chunks) {
            int off = std::get<0>(chunk);
            int length = std::get<1>(chunk);
            int idx = std::get<2>(chunk);
            // Split length into powers of two.
            while (length > 0) {
                int exec_size = (1 << math::ilog2q(length));
                if (obj.type.is_bool()) {
                    gpu_assert(off % 8 == 0)
                            << "expected mask offset to be multiple of 8";
                    auto chunk_op = op.reg_buf_data().subregister(
                            off / 8, ngen::DataType::b)(1);
                    eval(obj.vec[idx], ngen_operand_t(dst_op, exec_size));
                    host_->emov(1, chunk_op, dst_op.flag_register().b(0));
                } else {
                    auto chunk_op = dst_op.sub_reg_data(off, exec_size);
                    eval(obj.vec[idx], ngen_operand_t(chunk_op, exec_size));
                }
                length -= exec_size;
                off += exec_size;
            }
        }
        if (obj.type.is_bool()) host_->emov(1, dst_op, op);
        bind(obj, dst_op);
    }

    void _visit(const ternary_op_t &obj) override {
        flag_setter_t no_vs(&allow_vert_stride_region_, false);
        auto dst_op = alloc_dst_op(obj);
        auto mod = dst_op.mod();
        auto src0_op = eval(obj.a);
        auto src1_op = eval(obj.b);
        auto src2_op = eval(obj.c);
        switch (obj.op_kind) {
            case op_kind_t::_add3:
                host_->eadd3(mod, dst_op, src0_op, src1_op, src2_op);
                break;
            case op_kind_t::_mad:
                host_->emad(mod, dst_op, src0_op, src1_op, src2_op);
                break;
            case op_kind_t::_idiv:
                host_->eidiv(mod, dst_op.reg_data(), ngen::Subregister(),
                        src0_op.reg_data(), src1_op.reg_data(),
                        src2_op.reg_data());
                break;
            case op_kind_t::_imod:
                host_->eidiv(mod, ngen::Subregister(), dst_op.reg_data(),
                        src0_op.reg_data(), src1_op.reg_data(),
                        src2_op.reg_data());
                break;
            default: gpu_error_not_expected();
        }
        bind(obj, dst_op);
    }

    void _visit(const unary_op_t &obj) override {
        gpu_assert(obj.op_kind == op_kind_t::_minus);
        ngen_operand_t a_op;
        a_op = eval(obj.a);
        bind(obj, -a_op);
    }

    void _visit(const var_t &obj) override {
        gpu_assert(expr_binding_.is_bound(obj))
                << "Variable is not defined: " << expr_t(obj);
    }

private:
    struct flag_setter_t {
        flag_setter_t(bool *flag, bool value) : flag(flag), old(*flag) {
            *flag = value;
        }
        ~flag_setter_t() { *flag = old; }

        bool *flag;
        bool old;
    };

    ngen_operand_t alloc_dst_op(const expr_t &e) {
        gpu_assert(!expr_binding_.is_bound(e)) << "Already evaluated: " << e;
        if (expr_binding_.is_dst_bound(e)) return expr_binding_.get_dst(e);

        // Expression is not bound yet, allocate new storage and bind.
        ngen_operand_t op;
        if (e.type().is_bool()) {
            int elems = std::max(
                    e.type().elems(), std::max(16, host_->getSIMD()));
            op = ngen_operand_t(scope_.alloc_flag(elems), elems);
        } else {
            op = ngen_operand_t(
                    scope_.alloc_reg_data(e.type()), e.type().elems());
        }
        expr_binding_.bind_dst(e, op);
        return op;
    }

    // Pre-allocates a strided register region for expression `e` if needed.
    ngen_operand_t maybe_alloc_strided_op(const type_t &res_type,
            const expr_t &e, ngen_register_scope_t &scope) {
        // Need q-strided region for `e` if res_type is q/uq and `e` is of a
        // sub-q data type and not a scalar.
        if (e.type().is_scalar()) return ngen_operand_t();
        if (!utils::one_of(res_type.base(), type_t::s64(), type_t::u64()))
            return ngen_operand_t();
        if (utils::one_of(e.type().base(), type_t::s64(), type_t::u64()))
            return ngen_operand_t();

        auto *shuffle = e.as_ptr<shuffle_t>();
        if (shuffle && shuffle->is_broadcast()) return ngen_operand_t();

        int stride = res_type.bitsize() / e.type().bitsize();
        return ngen_operand_t(
                scope.alloc_reg_data(e.type(), stride), e.type().elems());
    }

    void bind(const expr_t &e, const ngen_operand_t &op) {
        if (!expr_binding_.is_dst_bound(e)) {
            expr_binding_.bind(e, op);
            return;
        }
        auto dst_op = expr_binding_.get_dst(e);
        if (dst_op == op) {
            expr_binding_.bind(e, op);
            return;
        }
        // Expression is already bound, move to the location it was bound to.
        // This is required for immediate values - they are bound as is but
        // sometimes we need them to be moved to registers.
        host_->emov(dst_op.mod(), dst_op, op);
        expr_binding_.bind(e, dst_op);
    }

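    // Emits a single binary operation. Comparisons write their result into
    // the destination flag register; when src0 is not a register operand, the
    // sources are swapped and the comparison kind is adjusted via
    // negate_cmp_op().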
    void ebinary(const binary_op_t &obj, const ngen::InstructionModifier &mod,
            const ngen_operand_t &_dst, const ngen_operand_t &_src0,
            const ngen_operand_t &_src1) {
        auto &dst = _dst;
        auto src0 = _src0;
        auto src1 = _src1;
        align_src_dst_offset(host_, scope_, mod, dst, src0, src1);
        switch (obj.op_kind) {
            case op_kind_t::_add: host_->eadd(mod, dst, src0, src1); break;
            case op_kind_t::_sub: host_->eadd(mod, dst, src0, -src1); break;
            case op_kind_t::_mul: host_->emul(mod, dst, src0, src1); break;
            case op_kind_t::_div: host_->ediv(mod, dst, src0, src1); break;
            case op_kind_t::_mod: host_->emod(mod, dst, src0, src1); break;
            case op_kind_t::_shl: host_->eshl(mod, dst, src0, src1); break;
            case op_kind_t::_shr: host_->eshr(mod, dst, src0, src1); break;
            case op_kind_t::_min: host_->emin(mod, dst, src0, src1); break;
            case op_kind_t::_max: host_->emax(mod, dst, src0, src1); break;
            case op_kind_t::_ge:
            case op_kind_t::_gt:
            case op_kind_t::_le:
            case op_kind_t::_lt:
            case op_kind_t::_eq:
            case op_kind_t::_ne: {
                gpu_assert(!dst.is_negated())
                        << "Destination can't be negated.";
                ngen::InstructionModifier cmp_mod = mod;
                if (!src0.is_reg_data()) {
                    cmp_mod |= cmp_op_to_ngen(negate_cmp_op(obj.op_kind));
                    cmp_mod |= dst.flag_register();
                    host_->ecmp(cmp_mod, src1, src0);
                } else {
                    cmp_mod |= cmp_op_to_ngen(obj.op_kind);
                    cmp_mod |= dst.flag_register();
                    host_->ecmp(cmp_mod, src0, src1);
                }
                break;
            }
            case op_kind_t::_and: host_->eand(mod, dst, src0, src1); break;
            case op_kind_t::_prelu: {
                int grf_size = ngen::GRF::bytes(hw());
                int esize = mod.getExecSize();
                int off = src0.reg_data().getOffset();
                int regs = utils::div_up(
                        esize * int(sizeof(float)) + off, grf_size);
                auto temp = scope_.alloc_reg_buf_data(regs).format(
                        off, esize, 1, ngen::DataType::f);
                host_->emul(mod, temp, dst, src1);
                // Workaround for regioning restriction.
                if (esize == 2) {
                    host_->csel(mod | host_->le, dst.reg_data(),
                            temp.subregister(0)(1),
                            dst.reg_buf_data().subregister(0)(1),
                            dst.reg_buf_data().subregister(0)(1));
                } else {
                    host_->csel(mod | host_->le, dst.reg_data(), temp,
                            dst.reg_data(), dst.reg_data());
                }

                break;
            }
            default:
                gpu_error_not_expected()
                        << "Unknown kind: " << to_string(obj.op_kind);
        }
    }

    struct conjunct_t {
        conjunct_t(op_kind_t op, ngen_operand_t a, ngen_operand_t b)
            : op_(op), a_(std::move(a)), b_(std::move(b)) {}
        op_kind_t op_;
        ngen_operand_t a_, b_;
    };

    void split_by_and(const expr_t &e, std::vector<conjunct_t> &cv, type_t ty) {
        if (auto bin = e.as_ptr<binary_op_t>()) {
            if (bin->op_kind == op_kind_t::_and) {
                split_by_and(bin->a, cv, ty);
                split_by_and(bin->b, cv, ty);
            } else
                cv.emplace_back(bin->op_kind, eval(bin->a), eval(bin->b));
        } else {
            auto cast = cast_t::make(ty, e);
            cv.emplace_back(op_kind_t::undef, eval(cast), ngen_operand_t());
        }
    }

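    // Lowers a negated boolean cast: splits the condition into conjuncts
    // joined by '&&', emits one comparison (or move) per conjunct that
    // accumulates into a flag register, and finally negates the result.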
    ngen_operand_t try_process_negated_flags(const expr_t &e) {
        ngen_operand_t retn;
        auto cast = e.as<unary_op_t>().a.as_ptr<cast_t>();
        if (cast && cast->expr.type().is_bool()) {
            int elems = cast->expr.type().elems();
            ngen_operand_t flags(scope_.alloc_flag(elems), e.type().elems());
            retn = alloc_dst_op(e);
            auto mod = retn.mod();
            auto ar_op = [&](ngen::InstructionModifier m, const conjunct_t &c) {
                if (c.op_ != op_kind_t::undef)
                    host_->ecmp(m | cmp_op_to_ngen(c.op_), retn, c.a_, c.b_);
                else
                    host_->emov(m, retn, -c.a_);
            };
            std::vector<conjunct_t> cv;
            split_by_and(cast->expr, cv, cast->type);
            for (size_t i = 0; i < cv.size(); i++) {
                if (cv[i].op_ == op_kind_t::undef) {
                    gpu_assert(i == cv.size() - 1);
                }
            }

            ar_op(mod, cv[0]);
            mod |= flags.flag_register();
            for (size_t i = 1; i < cv.size(); i++)
                ar_op(mod, cv[i]);
            retn = -retn;
        }
        return retn;
    }

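    // Peephole for shuffles of loads: when the indices form an [xxyy] or
    // [xyxy] pattern and the sources are evenly spaced in the GRF, the
    // shuffle is expressed as a strided register region instead of moves.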
    bool try_region_peephole(const shuffle_t &obj) {
        int elems = obj.elems();
        if (elems % 2 != 0) return false;

        std::vector<ngen_operand_t> vec(obj.vec.size());
        ngen::DataType data_type = ngen::DataType::invalid;
        for (size_t i = 0; i < vec.size(); i++) {
            if (!obj.vec[i].is<load_t>()) return false;
            vec[i] = eval(obj.vec[i]);
            gpu_assert(vec[i].is_reg_buf_data()) << obj.vec[i];
            auto &rbd = vec[i].reg_buf_data();
            if (data_type == ngen::DataType::invalid) {
                data_type = rbd.type();
                continue;
            }
            if (data_type != rbd.type()) return false;
        }

        int grf_size = ngen::GRF::bytes(hw());
        auto diff_bytes = [&](const ngen_operand_t &a,
                                  const ngen_operand_t &b) {
            auto a_rd = a.reg_data();
            auto b_rd = b.reg_data();
            int a_off = a_rd.getBase() * grf_size + a_rd.getByteOffset();
            int b_off = b_rd.getBase() * grf_size + b_rd.getByteOffset();
            return b_off - a_off;
        };

        int type_size = ngen::getBytes(data_type);
        int stride_bytes = diff_bytes(vec[0], vec[1]);
        if (stride_bytes < 0 || stride_bytes % type_size != 0) return false;

        // Pattern 1: [xxyy]
        auto is_xxyy = [&]() {
            if (!allow_vert_stride_region_) return false;
            for (int i = 0; i < elems / 2; i++) {
                if (obj.idx[i] != 0) return false;
                if (obj.idx[i + elems / 2] != 1) return false;
            }
            return true;
        };
        if (is_xxyy()) {
            auto &rbd = vec[0].reg_buf_data();
            auto rd = rbd.reg_data();
            int regs = utils::div_up(stride_bytes * 2, grf_size);
            if (regs > 2) return false;
            rd.setRegion(stride_bytes / type_size, elems / 2, 0);
            reg_buf_t rb(hw(), ngen::GRFRange(rd.getBase(), regs));
            bind(obj, reg_buf_data_t(rb, rd));
            return true;
        }

        // Pattern 2: [xyxy]
        auto is_xyxy = [&]() {
            for (int i = 0; i < elems / 2; i++) {
                if (obj.idx[i] != i) return false;
                if (obj.idx[i] != obj.idx[i + elems / 2]) return false;
                if (i > 0 && diff_bytes(vec[i - 1], vec[i]) != stride_bytes)
                    return false;
            }
            return true;
        };
        if (is_xyxy()) {
            auto &rbd = vec[0].reg_buf_data();
            auto rd = rbd.reg_data();
            int regs = utils::div_up(stride_bytes * elems / 2, grf_size);
            if (regs > 2) return false;
            rd.setRegion(0, elems / 2, stride_bytes / type_size);
            reg_buf_t rb(hw(), ngen::GRFRange(rd.getBase(), regs));
            bind(obj, reg_buf_data_t(rb, rd));
            return true;
        }

        return false;
    }

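    // Peephole for shuffles of integer constants: when each constant can be
    // written as vec_min + factor * digit with the digit fitting a packed
    // :uv (0..15) or :v (-8..7) immediate, the vector is materialized with
    // packed immediate moves plus an optional multiply/add.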
    bool try_packed_int_peephole(const shuffle_t &obj) {
        if (!obj.type.is_x32()) return false;
        if (!utils::one_of(obj.elems(), 8, 16)) return false;

        int64_t int_min = std::numeric_limits<int>::min();
        int64_t int_max = std::numeric_limits<int>::max();
        int vec_size = (int)obj.vec.size();
        std::vector<int> vec(vec_size);
        for (int i = 0; i < vec_size; i++) {
            if (!is_const(obj.vec[i])) return false;
            int64_t value = to_cpp<int64_t>(obj.vec[i]);
            if (value < int_min || value > int_max) return false;
            vec[i] = (int)value;
        }

        const int esize = 8;

        auto half_same = [&](int off) {
            return std::equal(obj.idx.begin() + off + 1,
                    obj.idx.begin() + off + esize, obj.idx.begin() + off);
        };
        // If true, the case is too trivial for :v/:uv to justify the overhead
        if (half_same(0) && half_same(esize % obj.elems())) return false;

        int vec_min = *std::min_element(vec.begin(), vec.end());
        int vec_max = *std::max_element(vec.begin(), vec.end());

        int factor = vec_max - vec_min;
        for (int i = 0; i < vec_size; i++)
            factor = math::gcd(vec[i] - vec_min, factor);

        // XXX: Disabled due to an emulation limitation: vector multiplication
        // by dword constant is not implemented yet.
        int64_t s16_min = std::numeric_limits<int16_t>::min();
        int64_t s16_max = std::numeric_limits<int16_t>::max();
        if (factor < s16_min || factor > s16_max) return false;

        auto check_range = [&](int f, int m, int a, int b) {
            for (int i = 0; i < vec_size; i++) {
                int d = (vec[i] - m) / f;
                if (d < a || d > b) return false;
            }
            return true;
        };

        bool use_uv = false, use_v = false;
        for (int f : {1, factor, -factor}) {
            use_uv = check_range(f, vec_min, 0, 15);
            use_v = check_range(f, vec_min, -8, 7);
            if (use_uv || use_v) {
                factor = f;
                break;
            }
        }
        if (!use_uv && !use_v) return false;
        if (vec_min % factor == 0) {
            bool new_use_uv = check_range(factor, 0, 0, 15);
            bool new_use_v = check_range(factor, 0, -8, 7);
            if (new_use_uv || new_use_v) {
                vec_min = 0;
                use_uv = new_use_uv;
                use_v = new_use_v;
            }
        }

        auto set_packed = [](uint32_t &packed, int8_t value, int idx) {
            uint32_t v = (value >= 0 ? value : ((value & 0x7) | 0x8));
            packed = packed | (v << idx * 4);
        };

        auto dst = alloc_dst_op(obj);
        auto &dst_rbd = dst.reg_buf_data();
        int dst_stride = dst_rbd.hs();
        int w_size = sizeof(uint16_t);
        int grf_size = ngen::GRF::bytes(hw());
        auto tmp = scope_.alloc_reg_buf_data(1);
        auto w_type = (use_uv) ? ngen::DataType::uw : ngen::DataType::w;
        for (int i = 0; i < obj.elems(); i += esize) {
            uint32_t packed = 0;
            for (int j = 0; j < esize; j++)
                set_packed(packed, (vec[obj.idx[i + j]] - vec_min) / factor, j);
            auto t = tmp.format(i, esize, 1, w_type);
            host_->emov(esize, t,
                    (use_uv) ? ngen::Immediate::uv(packed)
                             : ngen::Immediate::v(packed));
        }
        auto d = dst_rbd.format(0, obj.elems(), dst_stride);
        auto t = tmp.format(0, obj.elems(), 1, w_type);
        reg_buf_data_t t_strided;
        bool align_with_dst = false;
        if (align_with_dst) {
            int w_stride = dst_stride * (ngen::getBytes(dst.type()) / w_size);
            int tmp_strided_regs
                    = utils::div_up(obj.elems() * w_size * w_stride, grf_size);
            auto tmp_strided = scope_.alloc_reg_buf_data(tmp_strided_regs);
            t_strided = tmp_strided.format(0, obj.elems(), w_stride, w_type);
            host_->emov(obj.elems(), t_strided, t);
        } else {
            t_strided = std::move(t);
        }
        if (factor != 1) {
            host_->emul(obj.elems(), d, t_strided, ngen::Immediate(factor));
        }
        if (factor == 1 || vec_min != 0) {
            host_->eadd(obj.elems(), d, (factor == 1) ? t_strided : d,
                    ngen::Immediate(vec_min));
        }
        bind(obj, dst);
        return true;
    }

    ngen_generator_t *host_;
    expr_binding_t expr_binding_;
    ngen_register_scope_t &scope_;
    bool allow_vert_stride_region_ = true;

    object_eq_map_t<expr_t, type_t> int_up_converts_;
};

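// Scans the kernel IR for features that affect kernel setup: dpas usage,
// atomic sends, and barrier/signal calls that need a signal header.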
class setup_visitor_t : public ir_visitor_t {
public:
    void _visit(const func_call_t &obj) override {
        auto &func = obj.func;
        auto *dpas = func.as_ptr<dpas_t>();
        auto *send = func.as_ptr<send_t>();
        if (dpas)
            flags.has_dpas = true;
        else if (send && send->is_atomic())
            flags.has_send_atomics = true;
        else if (func.is_same(funcs::signal_func()))
            flags.has_signal_header = true;
        else if (func.is_same(funcs::barrier_func()))
            flags.has_signal_header = true;
    }

    setup_flags_t flags = {};
};

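// Convenience wrapper that collects setup flags for a statement tree.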
setup_flags_t get_setup_flags(const stmt_t &s) {
    setup_visitor_t visitor;
    visitor.visit(s);
    return visitor.flags;
}

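// Lowers kernel IR to nGEN code on the given generator: emits the prologue,
// binds external variables (and the kernel grid walk order, if provided),
// translates the IR body, and emits the epilogue.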
template <typename GeneratorT>
void convert_ir_to_ngen(const stmt_t &body, GeneratorT &host,
        const walk_order_t *kernel_grid_walk_order = nullptr) {
    expr_binding_t expr_binding(host.getHardware());
    host.comment("Prologue");
    host.generate_prologue();

    host.bind_external_vars(body, expr_binding, kernel_grid_walk_order);
    if (kernel_grid_walk_order)
        host.bind_kernel_grid_walk_order(*kernel_grid_walk_order, expr_binding);

    host.comment("IR");
    ir_to_ngen_t<GeneratorT> visitor(&host, expr_binding);
    visitor.visit(body);

    host.comment("Epilogue");
    host.generate_epilogue();
}

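// Returns the generated kernel as assembly text for tracing. Returns an empty
// string when NGEN_ASM is not available, or the exception message if code
// generation fails.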
template <typename GeneratorT>
std::string get_ngen_str(const stmt_t &body, GeneratorT *host,
        const walk_order_t *kernel_grid_walk_order) {
#ifdef NGEN_ASM
    ir_to_ngen_generator_t<ngen_asm_code_generator_with_interface_t> host_asm(
            host->kernel_iface(), host->options(), {});
    host_asm.set_interface(host->getInterface());

    try {
        convert_ir_to_ngen(body, host_asm, kernel_grid_walk_order);
        return host_asm.str();
    } catch (std::runtime_error &e) {
        return "IR to nGEN Exception: " + std::string(e.what());
    }
#else
    return "";
#endif
}

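// Generates nGEN code for the kernel body on the host generator, tracing the
// assembly and recording peak register usage in dev mode.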
template <typename GeneratorT>
void generate_from_ir(const stmt_t &kernel_body, GeneratorT *host,
        const walk_order_t *kernel_grid_walk_order, int &peak_regs) {
    gpu_trace() << get_ngen_str(kernel_body, host, kernel_grid_walk_order);
    convert_ir_to_ngen(kernel_body, *host, kernel_grid_walk_order);
#ifdef DNNL_DEV_MODE
    peak_regs = host->ra().get_peak_regs();
#endif
}

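// Builds the nGEN kernel interface: kernel name, local IDs/size, GRF count,
// SIMD width, barrier, dpas/atomics requirements, kernel arguments, and SLM
// size.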
ngen::NEOInterfaceHandler generate_ngen_interface(
        const kernel::iface_t &kernel_iface, const kernel::options_t &options,
        bool require_dpas, const stmt_t &kernel_body) {

    ngen::NEOInterfaceHandler interface(options.hw());
    interface.externalName(kernel_iface.kernel_name());
    interface.requireLocalID(3);
    interface.requireLocalSize();
    interface.requireGRF(options.regs());
    interface.requireSIMD(options.simd());
    interface.requireBarrier();
    auto setup_flags = get_setup_flags(kernel_body);

    // Allow dpas override to avoid context switch overhead on XeHPG
    if (setup_flags.has_dpas || require_dpas) interface.requireDPAS();
    if (setup_flags.has_send_atomics) interface.requireGlobalAtomics();

    for (size_t i = 0; i < kernel_iface.nargs(); i++) {
        auto &name = kernel_iface[i].as<var_t>().name;
        auto &type = kernel_iface[i].type();
        if (type.is_ptr()) {
            interface.newArgument(name, ngen::ExternalArgumentType::GlobalPtr,
                    ngen::GlobalAccessType::Stateless);
        } else {
            interface.newArgument(name, to_ngen(type));
        }
    }

    int slm_size = alloc_manager_t(kernel_body).slm_size();
    interface.requireSLM(slm_size);

    interface.finalize();
    return interface;
}

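// Instantiates a hardware-specific generator, checks the SLM size against the
// device limit when the local range is known, and generates the kernel from
// the IR.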
void ir_kernel_t::generate_from_ir(
        const stmt_t &kernel_body, const walk_order_t *kernel_grid_walk_order) {
    gpu_assert(!generator_)
            << "ir_kernel_t::generate_from_ir() was called already.";

    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            kernel_iface_, options_, require_dpas_, kernel_body);

    if (local_range_) {
        size_t max_slm_size = compute::device_info_t::max_slm_size_per_tg(
                convert_ngen_arch_to_dnnl(options_.hw()), thread_group_size(),
                options_.regs() > 128);
        if (interface.getSLMSize() > max_slm_size) {
            gpu_trace() << "SLM size limit exceeded: " << interface.getSLMSize()
                        << " > " << max_slm_size;
            gpu_except_not_implemented("SLM size limit is exceeded.");
        }
    }

#define GPU_HW_CASE(hw) \
    using gen_type = ir_to_ngen_generator_t<generator_t<(hw)>>; \
    generator_ = utils::make_unique<gen_type>( \
            kernel_iface_, options_, debug_config_); \
    auto *gen = static_cast<gen_type *>(generator_.get()); \
    gen->setInterface(generate_ngen_interface( \
            kernel_iface_, options_, require_dpas_, kernel_body)); \
    if (force_emulate64_) gen->force_emulate64(); \
    jit::generate_from_ir(kernel_body, gen, kernel_grid_walk_order, peak_regs_);

    GPU_HW_SWITCH(options_.hw().ngen_hw());

#undef GPU_HW_CASE
}

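// Runtime-specific entry points: each overload generates nGEN code for the
// target hardware and returns a kernel object for the SYCL, OpenCL, or Level
// Zero runtime.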
#ifdef WITH_SYCL_RUNTIME
::sycl::kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
        const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
        ::sycl::context ctx, ::sycl::device dev) {

    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            iface, options, false, body);
    std::optional<::sycl::kernel> kernel;

#define GPU_HW_CASE(hw) \
    ir_to_ngen_generator_t<ngen::SYCLCodeGenerator<(hw)>> g( \
            iface, options, debug_cfg); \
    g.setInterface(interface); \
    convert_ir_to_ngen(body, g); \
    kernel = g.getKernel(ctx, dev); \
    break;

    GPU_HW_SWITCH(options.hw().ngen_hw());
#undef GPU_HW_CASE
    return kernel.value();
}
#endif
#ifdef WITH_OPENCL_RUNTIME
cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
        const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
        cl_context ctx, cl_device_id dev) {
    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            iface, options, false, body);

#define GPU_HW_CASE(hw) \
    ir_to_ngen_generator_t<ngen::OpenCLCodeGenerator<(hw)>> g( \
            iface, options, debug_cfg); \
    g.setInterface(std::move(interface)); \
    convert_ir_to_ngen(body, g); \
    return g.getKernel(ctx, dev);

    GPU_HW_SWITCH(options.hw().ngen_hw());
#undef GPU_HW_CASE
    return {};
}
#endif

#ifdef WITH_L0_RUNTIME
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
        const kernel::iface_t &iface, const stmt_t &body,
        const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
        ze_context_handle_t ctx, ze_device_handle_t dev) {
    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            iface, options, false, body);

#define GPU_HW_CASE(hw) \
    ir_to_ngen_generator_t<ngen::LevelZeroCodeGenerator<(hw)>> g( \
            iface, options, debug_cfg); \
    g.setInterface(std::move(interface)); \
    convert_ir_to_ngen(body, g); \
    auto module = g.getModule(ctx, dev); \
    auto kernel = g.getKernel(module); \
    return std::make_pair(module, kernel);

    GPU_HW_SWITCH(options.hw().ngen_hw());
#undef GPU_HW_CASE
    return {};
}
#endif

} // namespace jit
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

#ifdef ENABLE_LLVM_WCONVERSION
#pragma clang diagnostic pop
#endif