Files
oneDNN/third_party/ngen/ngen_interface.hpp
2025-08-27 17:52:51 -07:00

844 lines
30 KiB
C++

/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef NGEN_INTERFACE_HPP
#define NGEN_INTERFACE_HPP
#include "ngen_core.hpp"
#include "ngen_asm.hpp"
#include <sstream>
namespace NGEN_NAMESPACE {
template <HW hw> class OpenCLCodeGenerator;
template <HW hw> class LevelZeroCodeGenerator;
// Exceptions.
#ifdef NGEN_SAFE
class illegal_simd_exception : public std::runtime_error {
public:
illegal_simd_exception() : std::runtime_error("Illegal SIMD size (subgroup size)") {}
};
class unknown_argument_exception : public std::runtime_error {
public:
unknown_argument_exception() : std::runtime_error("Argument not found") {}
};
class bad_argument_type_exception : public std::runtime_error {
public:
bad_argument_type_exception() : std::runtime_error("Bad argument type") {}
};
class interface_not_finalized : public std::runtime_error {
public:
interface_not_finalized() : std::runtime_error("Interface has not been finalized") {}
};
class use_simd1_local_id_exception : public std::runtime_error {
public:
use_simd1_local_id_exception() : std::runtime_error("Use getSIMD1LocalID for SIMD1 kernels") {}
};
class unsupported_argument_location_override : public std::runtime_error {
public:
unsupported_argument_location_override() : std::runtime_error("Argument register location is invalid") {}
};
#endif
enum class ExternalArgumentType { Scalar, GlobalPtr, LocalPtr, Hidden };
enum class GlobalAccessType { None = 0, Stateless = 1, Surface = 2, All = 3, Default = 4 };
static inline GlobalAccessType operator|(GlobalAccessType access1, GlobalAccessType access2)
{
return static_cast<GlobalAccessType>(static_cast<int>(access1) | static_cast<int>(access2));
}
enum class ThreadArbitrationMode { Default, OldestFirst, RoundRobin, RoundRobinOnStall };
class InterfaceHandler
{
template <HW hw> friend class OpenCLCodeGenerator;
template <HW hw> friend class LevelZeroCodeGenerator;
public:
InterfaceHandler(HW hw_) : hw(hw_)
, simd(GRF::bytes(hw_) >> 2)
, requestedInlineBytes(defaultInlineBytes(hw))
{}
inline void externalName(const std::string &name) { kernelName = name; }
template <typename DT>
inline void newArgument(const std::string &name) { newArgument(name, getDataType<DT>()); }
inline void newArgument(const std::string &name, DataType type, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
inline void newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
inline void newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access = GlobalAccessType::Default);
void allowArgumentRearrangement(bool allow) { rearrangeArgs = allow; }
inline Subregister getArgument(const std::string &name) const;
inline Subregister getArgumentIfExists(const std::string &name) const;
inline int getArgumentSurface(const std::string &name) const;
inline int getArgumentSurfaceIfExists(const std::string &name) const;
inline GRF getLocalID(int dim) const;
inline Subregister getSIMD1LocalID(int dim) const;
inline Subregister getLocalSize(int dim) const;
inline Subregister getGroupID(int dim) const;
const std::string &getExternalName() const { return kernelName; }
int getSIMD() const { return simd; }
int getBarrierCount() const { return barrierCount; }
int getGRFCount() const { return needGRF; }
size_t getSLMSize() const { return slmSize; }
void require32BitBuffers() { allow64BitBuffers = false; }
void requireArbitrationMode(ThreadArbitrationMode m) { arbitrationMode = m; }
void requireBarrier() { barrierCount = 1; }
void requireBarriers(int nBarriers) { barrierCount = nBarriers; }
void requireDPAS() { needDPAS = true; }
void requireGlobalAtomics() { needGlobalAtomics = true; }
void requireGRF(int grfs) { needGRF = grfs; }
void requireLocalID(int dimensions) { needLocalID = dimensions; }
void requireLocalSize() { needLocalSize = true; }
void requireNonuniformWGs() { needNonuniformWGs = true; }
void requireNoPreemption() { needNoPreemption = true; }
void requirePartitionDim(int dim) { needPartitionDim = dim; }
void requireScratch(size_t bytes = 1) { scratchSize = bytes; }
inline void requireSIMD(int simd_);
void requireSLM(size_t bytes) { slmSize = bytes; }
void requireStatelessWrites(bool req = true) { needStatelessWrites = req; }
inline void requireType(DataType type);
template <typename T> void requireType() { requireType(getDataType<T>()); }
void requireWalkOrder(int o1, int o2) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = -1; }
void requireWalkOrder(int o1, int o2, int o3) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = o3; }
void requireWorkgroup(size_t x, size_t y = 1,
size_t z = 1) { wg[0] = x; wg[1] = y; wg[2] = z; }
void setArgumentBase(RegData base) { baseOverride = base; }
void setInlineGRFCount(int grfs) { requestedInlineBytes = grfs * GRF::bytes(hw); }
int32_t getSkipCrossThreadOffset() const { return offsetSkipCrossThread; }
std::array<int32_t, 2> getCTPatchOffsets() const { return offsetCTPatches; }
inline Register getCrossthreadBase(bool effective = true) const;
inline Register getArgLoadBase() const;
inline void finalize();
template <typename CodeGenerator>
inline void generatePrologue(CodeGenerator &generator, const GRF &temp = GRF(127)) const;
inline void setPrologueLabels(InterfaceLabels &labels, LabelManager &man);
inline void generateDummyCL(std::ostream &stream) const;
inline std::string generateZeInfo() const;
#ifdef NGEN_ASM
inline void dumpAssignments(std::ostream &stream) const;
#endif
static constexpr int noSurface = 0x80; // Returned by getArgumentSurfaceIfExists in case of no surface assignment
struct Assignment {
std::string name;
DataType type;
ExternalArgumentType exttype;
GlobalAccessType access;
Subregister reg;
int surface;
int index;
bool globalSurfaceAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Surface)); }
bool globalStatelessAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Stateless)); }
};
const Assignment &getAssignment(int idx) const { return assignments[idx]; }
size_t numAssignments() const { return assignments.size(); }
protected:
HW hw;
std::vector<Assignment> assignments;
std::string kernelName = "default_kernel";
int nextArgIndex = 0;
bool finalized = false;
bool hasArgLocOverride = false;
bool rearrangeArgs = true;
bool allow64BitBuffers = false;
ThreadArbitrationMode arbitrationMode = ThreadArbitrationMode::Default;
int barrierCount = 0;
RegData baseOverride;
bool needDPAS = false;
bool needGlobalAtomics = false;
int32_t needGRF = 128;
int needLocalID = 0;
bool needLocalSize = false;
bool needNonuniformWGs = false;
bool needNoPreemption = false;
int needPartitionDim = -1;
bool needHalf = false;
bool needDouble = false;
bool needStatelessWrites = true;
int32_t offsetSkipPerThread = 0;
int32_t offsetSkipCrossThread = 0;
std::array<int32_t, 2> offsetCTPatches = {0, 0};
size_t scratchSize = 0;
int simd = 8;
size_t slmSize = 0;
int walkOrder[3] = {-1, -1, -1};
size_t wg[3] = {0, 0, 0};
int crossthreadBytes = 0;
int crossthreadRegs = 0;
int requestedInlineBytes = 0;
inline int inlineBytes() const;
inline int inlineRegs() const;
inline int getCrossthreadRegs() const;
inline int getCrossthreadBytes() const;
int grfsPerLID() const { return (simd > 16 && GRF::bytes(hw) < 64) ? 2 : 1; }
static inline GlobalAccessType defaultGlobalAccess(HW hw);
static inline int defaultInlineBytes(HW hw);
};
using NEOInterfaceHandler = InterfaceHandler; /* Deprecated -- do not use in new code. */
void InterfaceHandler::newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype, GlobalAccessType access)
{
auto type = reg.getType();
if (reg.isNull())
reg.invalidate();
else
hasArgLocOverride = true;
if (exttype != ExternalArgumentType::GlobalPtr)
access = GlobalAccessType::None;
if (access == GlobalAccessType::Default)
access = defaultGlobalAccess(hw);
assignments.push_back({name, type, exttype, access, reg, noSurface, nextArgIndex++});
}
void InterfaceHandler::newArgument(const std::string &name, DataType type, ExternalArgumentType exttype, GlobalAccessType access)
{
newArgument(name, NullRegister().sub(0, type), exttype, access);
}
void InterfaceHandler::newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access)
{
DataType type = DataType::invalid;
switch (exttype) {
case ExternalArgumentType::GlobalPtr: type = DataType::uq; break;
case ExternalArgumentType::LocalPtr: type = DataType::ud; break;
default:
#ifdef NGEN_SAFE
throw bad_argument_type_exception();
#else
break;
#endif
}
newArgument(name, type, exttype, access);
}
Subregister InterfaceHandler::getArgumentIfExists(const std::string &name) const
{
for (auto &assignment : assignments) {
if (assignment.name == name)
return assignment.reg;
}
return Subregister{};
}
Subregister InterfaceHandler::getArgument(const std::string &name) const
{
Subregister arg = getArgumentIfExists(name);
#ifdef NGEN_SAFE
if (arg.isInvalid())
throw unknown_argument_exception();
#endif
return arg;
}
int InterfaceHandler::getArgumentSurfaceIfExists(const std::string &name) const
{
for (auto &assignment : assignments)
if (assignment.name == name)
return assignment.surface;
return noSurface;
}
int InterfaceHandler::getArgumentSurface(const std::string &name) const
{
int surface = getArgumentSurfaceIfExists(name);
#ifdef NGEN_SAFE
if (surface == noSurface)
throw unknown_argument_exception();
#endif
return surface;
}
Subregister InterfaceHandler::getSIMD1LocalID(int dim) const
{
#ifdef NGEN_SAFE
if (dim > needLocalID || simd != 1) throw unknown_argument_exception();
#endif
return GRF(1).uw(dim);
}
GRF InterfaceHandler::getLocalID(int dim) const
{
#ifdef NGEN_SAFE
if (dim > needLocalID) throw unknown_argument_exception();
if (simd == 1) throw use_simd1_local_id_exception();
#endif
if (simd == 1)
return GRF(1).uw();
else
return GRF(1 + dim * grfsPerLID()).uw();
}
Subregister InterfaceHandler::getGroupID(int dim) const
{
switch (dim) {
case 0: return GRF(0).ud(1);
case 1: return GRF(0).ud(6);
case 2: return GRF(0).ud(7);
}
return Subregister();
}
void InterfaceHandler::requireSIMD(int simd_)
{
simd = simd_;
#ifdef NGEN_SAFE
if (simd > 32 || !utils::is_zero_or_pow2(simd))
throw illegal_simd_exception();
if (simd != 1 && simd < (GRF::bytes(hw) >> 2))
throw illegal_simd_exception();
#endif
}
void InterfaceHandler::requireType(DataType type)
{
switch (type) {
case DataType::hf: needHalf = true; break;
case DataType::df: needDouble = true; break;
default: break;
}
}
static inline const char *getCLDataType(DataType type)
{
static const char *_ = "INVALID";
static const char *names[32] = {"uint", "int", "ushort", "short", "uchar", "char", "double", "float",
"ulong", "long", "half", "ushort", "uchar", _, _, _,
"float", "uchar", _, _, _, _, _, _,
_, _, _, _, _, _, _, _};
return names[static_cast<uint8_t>(type) & 0x1F];
}
void InterfaceHandler::generateDummyCL(std::ostream &stream) const
{
#ifdef NGEN_SAFE
if (!finalized) throw interface_not_finalized();
if (hasArgLocOverride || !rearrangeArgs) throw unsupported_argument_location_override();
#endif
const char *dpasDummy = " int __builtin_IB_sub_group_idpas_s8_s8_8_1(int, int, int8) __attribute__((const));\n"
" int z = __builtin_IB_sub_group_idpas_s8_s8_8_1(0, ____[0], 1);\n"
" for (int i = 0; i < z; i++) (void) ____[0];\n";
if (needHalf) stream << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
if (needDouble) stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0)
stream << "__attribute__((reqd_work_group_size(" << wg[0] << ',' << wg[1] << ',' << wg[2] << ")))\n";
if (walkOrder[0] >= 0) {
stream << "__attribute__((intel_reqd_workgroup_walk_order(" << walkOrder[0] << ',' << walkOrder[1];
if (walkOrder[2] >= 0)
stream << ',' << walkOrder[2];
stream << ")))\n";
}
stream << "__attribute__((intel_reqd_sub_group_size(" << simd << ")))\n";
stream << "kernel void " << kernelName << '(';
bool firstArg = true;
for (const auto &assignment : assignments) {
if (assignment.exttype == ExternalArgumentType::Hidden) continue;
if (!firstArg) stream << ", ";
switch (assignment.exttype) {
case ExternalArgumentType::GlobalPtr: stream << "global uint *"; break;
case ExternalArgumentType::LocalPtr: stream << "local uint *"; break;
case ExternalArgumentType::Scalar: stream << getCLDataType(assignment.type) << ' '; break;
default: break;
}
stream << assignment.name;
firstArg = false;
}
stream << ") {\n";
stream << " global volatile int *____;\n";
if (needLocalID) stream << " (void) ____[get_local_id(0)];\n";
if (needLocalSize) stream << " (void) ____[get_enqueued_local_size(0)];\n";
if (barrierCount > 0) stream << " __asm__ volatile(\"barrier\");\n";
for (int i = 1; i < barrierCount; i++) {
stream << " local NamedBarrier_t *bar" << i << ";\n"
" bar" << i << " = named_barrier_init(1);\n"
" work_group_named_barrier(bar" << i << ", 0);\n";
}
if (needDPAS) stream << dpasDummy;
if (needGlobalAtomics) stream << " atomic_inc(____);\n";
if (scratchSize > 0) stream << " volatile char scratch[" << scratchSize << "] = {0};\n";
if (slmSize > 0) stream << " volatile local char slm[" << slmSize << "]; slm[0]++;\n";
if (needNoPreemption) {
if (hw == HW::Gen9)
stream << " volatile double *__df; *__df = 1.1 / *__df;\n"; // IEEE macro causes IGC to disable MTP.
/* To do: Gen11 */
}
if (hw >= HW::XeHP) for (const auto &assignment : assignments) {
// Force IGC to assume stateless accesses could occur if necessary.
if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalStatelessAccess())
stream << " __asm__ volatile(\"\" :: \"rw.u\"(" << assignment.name << "));\n";
}
for (const auto &assignment : assignments) {
// Force IGC to assume surface accesses could occur if necessary.
if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalSurfaceAccess())
stream << " { volatile uchar __load = ((global uchar *) " << assignment.name << ")[get_local_id(0)];}\n";
}
stream << "}\n";
}
Subregister InterfaceHandler::getLocalSize(int dim) const
{
static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
return getArgument(localSizeArgs[dim]);
}
void InterfaceHandler::finalize()
{
// Make assignments, following NEO rules:
// - all inputs are naturally aligned
// - all sub-DWord inputs are DWord-aligned
// - first register is
// r3 (no local IDs)
// r5 (SIMD8/16, local IDs)
// r8 (SIMD32, local IDs)
// - assign local ptr arguments left-to-right
// - assign global pointer arguments left-to-right
// - assign scalar arguments left-to-right
// - assign surface indices left-to-right for global pointers
// - no arguments can cross a GRF boundary. Arrays like work size count
// as 1 argument for this rule.
static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
static const std::string scratchSizeArg = "__scratch_size";
Register base;
int offset;
int nextSurface = 0;
int regSize = GRF::bytes(hw);
if (baseOverride.isValid()) {
base = GRF(baseOverride.getBase());
offset = baseOverride.getByteOffset();
} else {
base = getCrossthreadBase();
offset = 32;
}
auto assignArgsOfType = [&](ExternalArgumentType which) {
for (auto &assignment : assignments) {
auto exttype = assignment.exttype;
if (!rearrangeArgs)
exttype = ExternalArgumentType::Scalar;
if (exttype != which) continue;
auto bytes = getBytes(assignment.type);
auto size = getDwords(assignment.type) << 2;
if (assignment.reg.isInvalid()) {
if (assignment.name == localSizeArgs[0]) {
// Move to next GRF if local size arguments won't fit in this one.
if (offset > regSize - (3 * 4)) {
offset = 0;
base++;
}
}
offset = (offset + size - 1) & -size;
if (offset >= regSize) {
base += offset / regSize;
offset = 0;
}
assignment.reg = base.sub(offset / bytes, assignment.type);
} else {
int obase = assignment.reg.getBase();
int ooffset = assignment.reg.getByteOffset();
if (base.getBase() < obase) {
base = GRF(obase);
offset = ooffset;
} else if (base.getBase() == obase)
offset = std::max(offset, ooffset);
}
offset += size;
if (assignment.exttype == ExternalArgumentType::GlobalPtr) {
if (!assignment.globalStatelessAccess())
assignment.reg = Subregister{};
if (assignment.globalSurfaceAccess())
assignment.surface = nextSurface;
nextSurface++;
} else if (assignment.exttype == ExternalArgumentType::Scalar)
requireType(assignment.type);
}
};
assignArgsOfType(ExternalArgumentType::LocalPtr);
assignArgsOfType(ExternalArgumentType::GlobalPtr);
assignArgsOfType(ExternalArgumentType::Scalar);
// Add private memory size arguments.
if (scratchSize > 0)
newArgument(scratchSizeArg, DataType::uq, ExternalArgumentType::Hidden);
// Add enqueued local size arguments.
if (needLocalSize && needNonuniformWGs)
for (int dim = 0; dim < 3; dim++)
newArgument(localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden);
assignArgsOfType(ExternalArgumentType::Hidden);
{
crossthreadBytes = (base.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw)
+ ((offset + 31) & -32);
crossthreadRegs = GRF::bytesToGRFs(hw, crossthreadBytes);
}
// Manually add regular local size arguments.
if (needLocalSize && !needNonuniformWGs) {
for (int dim = 0; dim < 3; dim++) {
Subregister loc = GRF(getCrossthreadBase().getBase()).ud(dim + 3);
assignments.push_back({localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden,
GlobalAccessType::None, loc, noSurface, -1});
}
}
finalized = true;
}
int InterfaceHandler::inlineBytes() const
{
return requestedInlineBytes;
}
int InterfaceHandler::inlineRegs() const
{
return GRF::bytesToGRFs(hw, inlineBytes());
}
Register InterfaceHandler::getCrossthreadBase(bool effective) const
{
if (!needLocalID)
return GRF((!effective || (hw >= HW::XeHP)) ? 1 : 2);
else if (simd == 1)
return GRF(2);
else
return GRF(1 + 3 * grfsPerLID());
}
Register InterfaceHandler::getArgLoadBase() const
{
return getCrossthreadBase().advance(inlineRegs());
}
int InterfaceHandler::getCrossthreadBytes() const
{
#ifdef NGEN_SAFE
if (!finalized) throw interface_not_finalized();
#endif
return crossthreadBytes;
}
int InterfaceHandler::getCrossthreadRegs() const
{
#ifdef NGEN_SAFE
if (!finalized) throw interface_not_finalized();
#endif
return crossthreadRegs;
}
GlobalAccessType InterfaceHandler::defaultGlobalAccess(HW hw)
{
if (hw >= HW::XeHPC) return GlobalAccessType::Stateless;
return GlobalAccessType::All;
}
int InterfaceHandler::defaultInlineBytes(HW hw)
{
if (hw == HW::XeHP || hw == HW::XeHPG) return 32;
return 0;
}
template <typename CodeGenerator>
void InterfaceHandler::generatePrologue(CodeGenerator &generator, const GRF &temp) const
{
if (needLocalID)
generator.loadlid(getCrossthreadBytes(), needLocalID, simd, temp, -1);
generator.loadargs(getArgLoadBase(), getCrossthreadRegs() - inlineRegs(), temp);
}
void InterfaceHandler::setPrologueLabels(InterfaceLabels &labels, LabelManager &man)
{
auto setOffset = [&](Label &label, int32_t &out, int32_t off = 0) {
auto id = label.getID(man);
if (man.hasTarget(id))
out = man.getTarget(id) + off;
};
int immOffset = 0xC;
setOffset(labels.localIDsLoaded, offsetSkipPerThread);
setOffset(labels.argsLoaded, offsetSkipCrossThread);
for (int i = 0; i < 2; i++)
setOffset(labels.crossThreadPatches[i], offsetCTPatches[i], immOffset);
}
std::string InterfaceHandler::generateZeInfo() const
{
#ifdef NGEN_SAFE
if (!finalized) throw interface_not_finalized();
#endif
std::stringstream md;
md.imbue(std::locale::classic());
const char *version = "1.8";
md << "version: " << version << "\n"
"kernels: \n"
" - name: \"" << kernelName << "\"\n"
" execution_env: \n"
" grf_count: " << needGRF << "\n"
" simd_size: " << simd << "\n";
if (simd > 1)
md << " required_sub_group_size: " << simd << "\n";
if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0) {
md << " required_work_group_size:\n"
<< " - " << wg[0] << "\n"
<< " - " << wg[1] << "\n"
<< " - " << wg[2] << "\n";
}
if (walkOrder[0] >= 0) {
md << " work_group_walk_order_dimensions:\n"
<< " - " << walkOrder[0] << "\n"
<< " - " << walkOrder[1] << "\n"
<< " - " << std::max(walkOrder[2], 0) << "\n";
}
if (offsetSkipPerThread > 0)
md << " offset_to_skip_per_thread_data_load: " << offsetSkipPerThread << '\n';
if (barrierCount > 0)
md << " barrier_count: " << utils::roundup_pow2(barrierCount) << '\n';
if (allow64BitBuffers)
md << " has_4gb_buffers: true\n";
if (needDPAS)
md << " has_dpas: true\n";
if (needGlobalAtomics)
md << " has_global_atomics: true\n";
if (slmSize > 0)
md << " slm_size: " << slmSize << '\n';
if (!needStatelessWrites)
md << " has_no_stateless_write: true\n";
if (needNoPreemption)
md << " disable_mid_thread_preemption: true\n";
if (arbitrationMode != ThreadArbitrationMode::Default) {
md << " thread_scheduling_mode: ";
switch (arbitrationMode) {
case ThreadArbitrationMode::OldestFirst: md << "age_based\n"; break;
case ThreadArbitrationMode::RoundRobin: md << "round_robin\n"; break;
case ThreadArbitrationMode::RoundRobinOnStall: md << "round_robin_stall\n"; break;
default: break;
}
}
if (inlineBytes() > 0)
md << " inline_data_payload_size: " << inlineBytes() << "\n";
if (!assignments.empty()) {
md << "\n"
" payload_arguments: \n";
}
if (scratchSize > 0) {
md << " - arg_type: scratch_pointer\n"
" offset: 8\n"
" size: 8\n";
}
for (auto &assignment : assignments) {
uint32_t size = 0;
bool skipArg = false;
bool explicitArg = true;
if (assignment.globalSurfaceAccess()) {
md << " - arg_type: arg_bypointer\n"
" arg_index: " << assignment.index << "\n"
" offset: 0\n"
" size: 0\n"
" addrmode: stateful\n"
" addrspace: global\n"
" access_type: readwrite\n"
"\n";
}
switch (assignment.exttype) {
case ExternalArgumentType::Scalar:
md << " - arg_type: arg_byvalue\n";
size = (assignment.reg.getDwords() << 2);
break;
case ExternalArgumentType::GlobalPtr:
if (!assignment.globalStatelessAccess()) {
skipArg = true;
break;
}
// fall through
case ExternalArgumentType::LocalPtr:
md << " - arg_type: arg_bypointer\n";
size = (assignment.reg.getDwords() << 2);
break;
case ExternalArgumentType::Hidden: {
explicitArg = false;
if (assignment.name == "__local_size0") {
// from Zebin spec : local_size Argument size : int32x3
// may need refining to allow
// either int32x1, int32x2, int32x3 (x, xy, xyz)
// or fine grain : local_size_x, local_size_y, local_size_z
md << " - arg_type: "
<< (needNonuniformWGs ? "enqueued_local_size\n" : "local_size\n");
size = (assignment.reg.getDwords() << 2) * 3;
} else
skipArg = true;
break;
}
}
if (skipArg)
continue;
auto offset = assignment.reg.getBase() - getCrossthreadBase().getBase();
offset = offset * GRF::bytes(hw) + assignment.reg.getByteOffset();
#ifdef NGEN_SAFE
if (offset < 0) throw unsupported_argument_location_override();
#endif
if (explicitArg)
md << " arg_index: " << assignment.index << "\n";
md << " offset: " << offset << "\n"
" size: " << size << '\n';
if (assignment.globalStatelessAccess()) {
md << " addrmode: stateless\n"
" addrspace: global\n"
" access_type: readwrite\n";
} else if (assignment.exttype == ExternalArgumentType::LocalPtr) {
md << " addrmode: slm\n"
" addrspace: local\n"
" access_type: readwrite\n";
}
md << "\n";
}
bool firstSurface = true;
for (auto &assignment : assignments) {
if (assignment.globalSurfaceAccess()) {
if (firstSurface) {
md << "\n"
" binding_table_indices: \n";
firstSurface = false;
}
md << " - bti_value: " << assignment.surface << "\n"
" arg_index: " << assignment.index << "\n"
" \n";
}
}
if (needLocalID) {
md << "\n"
" per_thread_payload_arguments: \n";
if (simd == 1) {
md << " - arg_type: packed_local_ids\n"
" offset: 0\n"
" size: 6\n"
" \n";
} else {
auto localIDBytes = grfsPerLID() * GRF::bytes(hw);
localIDBytes *= 3; // runtime currently supports 0 or 3 localId channels in per thread data
md << " - arg_type: local_id\n"
" offset: 0\n"
" size: " << localIDBytes << "\n"
" \n";
}
}
md << "\n"; // ensure file ends with newline
#ifdef NGEN_DUMP_ZE_INFO
std::cerr << md.str();
#endif
return md.str();
}
#ifdef NGEN_ASM
void InterfaceHandler::dumpAssignments(std::ostream &stream) const
{
LabelManager manager;
for (auto &assignment : assignments) {
stream << ' ';
if (assignment.reg.isValid())
assignment.reg.outputText(stream, PrintDetail::sub, manager);
else
stream << "(none)";
stream << '\t' << assignment.name;
if (assignment.surface != noSurface)
stream << "\t(BTI " << assignment.surface << ')';
stream << std::endl;
}
}
#endif
} /* namespace NGEN_NAMESPACE */
#endif /* header guard */