mirror of
https://github.com/uxlfoundation/oneDNN.git
synced 2025-10-20 18:43:49 +08:00
844 lines
30 KiB
C++
844 lines
30 KiB
C++
/*******************************************************************************
|
|
* Copyright 2019-2025 Intel Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*******************************************************************************/
|
|
|
|
#ifndef NGEN_INTERFACE_HPP
|
|
#define NGEN_INTERFACE_HPP
|
|
|
|
|
|
#include "ngen_core.hpp"
|
|
#include "ngen_asm.hpp"
|
|
|
|
#include <sstream>
|
|
|
|
|
|
namespace NGEN_NAMESPACE {
|
|
|
|
template <HW hw> class OpenCLCodeGenerator;
|
|
template <HW hw> class LevelZeroCodeGenerator;
|
|
|
|
// Exceptions.
|
|
#ifdef NGEN_SAFE
|
|
class illegal_simd_exception : public std::runtime_error {
|
|
public:
|
|
illegal_simd_exception() : std::runtime_error("Illegal SIMD size (subgroup size)") {}
|
|
};
|
|
|
|
class unknown_argument_exception : public std::runtime_error {
|
|
public:
|
|
unknown_argument_exception() : std::runtime_error("Argument not found") {}
|
|
};
|
|
|
|
class bad_argument_type_exception : public std::runtime_error {
|
|
public:
|
|
bad_argument_type_exception() : std::runtime_error("Bad argument type") {}
|
|
};
|
|
|
|
class interface_not_finalized : public std::runtime_error {
|
|
public:
|
|
interface_not_finalized() : std::runtime_error("Interface has not been finalized") {}
|
|
};
|
|
|
|
class use_simd1_local_id_exception : public std::runtime_error {
|
|
public:
|
|
use_simd1_local_id_exception() : std::runtime_error("Use getSIMD1LocalID for SIMD1 kernels") {}
|
|
};
|
|
|
|
class unsupported_argument_location_override : public std::runtime_error {
|
|
public:
|
|
unsupported_argument_location_override() : std::runtime_error("Argument register location is invalid") {}
|
|
};
|
|
|
|
#endif
|
|
|
|
enum class ExternalArgumentType { Scalar, GlobalPtr, LocalPtr, Hidden };
|
|
enum class GlobalAccessType { None = 0, Stateless = 1, Surface = 2, All = 3, Default = 4 };
|
|
|
|
static inline GlobalAccessType operator|(GlobalAccessType access1, GlobalAccessType access2)
|
|
{
|
|
return static_cast<GlobalAccessType>(static_cast<int>(access1) | static_cast<int>(access2));
|
|
}
|
|
|
|
enum class ThreadArbitrationMode { Default, OldestFirst, RoundRobin, RoundRobinOnStall };
|
|
|
|
class InterfaceHandler
|
|
{
|
|
template <HW hw> friend class OpenCLCodeGenerator;
|
|
template <HW hw> friend class LevelZeroCodeGenerator;
|
|
|
|
public:
|
|
InterfaceHandler(HW hw_) : hw(hw_)
|
|
, simd(GRF::bytes(hw_) >> 2)
|
|
, requestedInlineBytes(defaultInlineBytes(hw))
|
|
{}
|
|
|
|
inline void externalName(const std::string &name) { kernelName = name; }
|
|
|
|
template <typename DT>
|
|
inline void newArgument(const std::string &name) { newArgument(name, getDataType<DT>()); }
|
|
inline void newArgument(const std::string &name, DataType type, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
|
|
inline void newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
|
|
inline void newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access = GlobalAccessType::Default);
|
|
|
|
void allowArgumentRearrangement(bool allow) { rearrangeArgs = allow; }
|
|
|
|
inline Subregister getArgument(const std::string &name) const;
|
|
inline Subregister getArgumentIfExists(const std::string &name) const;
|
|
inline int getArgumentSurface(const std::string &name) const;
|
|
inline int getArgumentSurfaceIfExists(const std::string &name) const;
|
|
inline GRF getLocalID(int dim) const;
|
|
inline Subregister getSIMD1LocalID(int dim) const;
|
|
inline Subregister getLocalSize(int dim) const;
|
|
inline Subregister getGroupID(int dim) const;
|
|
|
|
const std::string &getExternalName() const { return kernelName; }
|
|
int getSIMD() const { return simd; }
|
|
int getBarrierCount() const { return barrierCount; }
|
|
int getGRFCount() const { return needGRF; }
|
|
size_t getSLMSize() const { return slmSize; }
|
|
|
|
void require32BitBuffers() { allow64BitBuffers = false; }
|
|
void requireArbitrationMode(ThreadArbitrationMode m) { arbitrationMode = m; }
|
|
void requireBarrier() { barrierCount = 1; }
|
|
void requireBarriers(int nBarriers) { barrierCount = nBarriers; }
|
|
void requireDPAS() { needDPAS = true; }
|
|
void requireGlobalAtomics() { needGlobalAtomics = true; }
|
|
void requireGRF(int grfs) { needGRF = grfs; }
|
|
void requireLocalID(int dimensions) { needLocalID = dimensions; }
|
|
void requireLocalSize() { needLocalSize = true; }
|
|
void requireNonuniformWGs() { needNonuniformWGs = true; }
|
|
void requireNoPreemption() { needNoPreemption = true; }
|
|
void requirePartitionDim(int dim) { needPartitionDim = dim; }
|
|
void requireScratch(size_t bytes = 1) { scratchSize = bytes; }
|
|
inline void requireSIMD(int simd_);
|
|
void requireSLM(size_t bytes) { slmSize = bytes; }
|
|
void requireStatelessWrites(bool req = true) { needStatelessWrites = req; }
|
|
inline void requireType(DataType type);
|
|
template <typename T> void requireType() { requireType(getDataType<T>()); }
|
|
void requireWalkOrder(int o1, int o2) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = -1; }
|
|
void requireWalkOrder(int o1, int o2, int o3) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = o3; }
|
|
void requireWorkgroup(size_t x, size_t y = 1,
|
|
size_t z = 1) { wg[0] = x; wg[1] = y; wg[2] = z; }
|
|
|
|
void setArgumentBase(RegData base) { baseOverride = base; }
|
|
void setInlineGRFCount(int grfs) { requestedInlineBytes = grfs * GRF::bytes(hw); }
|
|
int32_t getSkipCrossThreadOffset() const { return offsetSkipCrossThread; }
|
|
std::array<int32_t, 2> getCTPatchOffsets() const { return offsetCTPatches; }
|
|
|
|
inline Register getCrossthreadBase(bool effective = true) const;
|
|
inline Register getArgLoadBase() const;
|
|
|
|
inline void finalize();
|
|
|
|
template <typename CodeGenerator>
|
|
inline void generatePrologue(CodeGenerator &generator, const GRF &temp = GRF(127)) const;
|
|
|
|
inline void setPrologueLabels(InterfaceLabels &labels, LabelManager &man);
|
|
|
|
inline void generateDummyCL(std::ostream &stream) const;
|
|
inline std::string generateZeInfo() const;
|
|
|
|
#ifdef NGEN_ASM
|
|
inline void dumpAssignments(std::ostream &stream) const;
|
|
#endif
|
|
|
|
static constexpr int noSurface = 0x80; // Returned by getArgumentSurfaceIfExists in case of no surface assignment
|
|
|
|
struct Assignment {
|
|
std::string name;
|
|
DataType type;
|
|
ExternalArgumentType exttype;
|
|
GlobalAccessType access;
|
|
Subregister reg;
|
|
int surface;
|
|
int index;
|
|
|
|
bool globalSurfaceAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Surface)); }
|
|
bool globalStatelessAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Stateless)); }
|
|
};
|
|
|
|
const Assignment &getAssignment(int idx) const { return assignments[idx]; }
|
|
size_t numAssignments() const { return assignments.size(); }
|
|
|
|
protected:
|
|
HW hw;
|
|
|
|
std::vector<Assignment> assignments;
|
|
std::string kernelName = "default_kernel";
|
|
|
|
int nextArgIndex = 0;
|
|
bool finalized = false;
|
|
bool hasArgLocOverride = false;
|
|
bool rearrangeArgs = true;
|
|
|
|
bool allow64BitBuffers = false;
|
|
ThreadArbitrationMode arbitrationMode = ThreadArbitrationMode::Default;
|
|
int barrierCount = 0;
|
|
RegData baseOverride;
|
|
bool needDPAS = false;
|
|
bool needGlobalAtomics = false;
|
|
int32_t needGRF = 128;
|
|
int needLocalID = 0;
|
|
bool needLocalSize = false;
|
|
bool needNonuniformWGs = false;
|
|
bool needNoPreemption = false;
|
|
int needPartitionDim = -1;
|
|
bool needHalf = false;
|
|
bool needDouble = false;
|
|
bool needStatelessWrites = true;
|
|
int32_t offsetSkipPerThread = 0;
|
|
int32_t offsetSkipCrossThread = 0;
|
|
std::array<int32_t, 2> offsetCTPatches = {0, 0};
|
|
size_t scratchSize = 0;
|
|
int simd = 8;
|
|
size_t slmSize = 0;
|
|
int walkOrder[3] = {-1, -1, -1};
|
|
size_t wg[3] = {0, 0, 0};
|
|
|
|
int crossthreadBytes = 0;
|
|
int crossthreadRegs = 0;
|
|
int requestedInlineBytes = 0;
|
|
inline int inlineBytes() const;
|
|
inline int inlineRegs() const;
|
|
inline int getCrossthreadRegs() const;
|
|
inline int getCrossthreadBytes() const;
|
|
int grfsPerLID() const { return (simd > 16 && GRF::bytes(hw) < 64) ? 2 : 1; }
|
|
|
|
static inline GlobalAccessType defaultGlobalAccess(HW hw);
|
|
static inline int defaultInlineBytes(HW hw);
|
|
};
|
|
|
|
using NEOInterfaceHandler = InterfaceHandler; /* Deprecated -- do not use in new code. */
|
|
|
|
void InterfaceHandler::newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype, GlobalAccessType access)
|
|
{
|
|
auto type = reg.getType();
|
|
if (reg.isNull())
|
|
reg.invalidate();
|
|
else
|
|
hasArgLocOverride = true;
|
|
if (exttype != ExternalArgumentType::GlobalPtr)
|
|
access = GlobalAccessType::None;
|
|
if (access == GlobalAccessType::Default)
|
|
access = defaultGlobalAccess(hw);
|
|
assignments.push_back({name, type, exttype, access, reg, noSurface, nextArgIndex++});
|
|
}
|
|
|
|
void InterfaceHandler::newArgument(const std::string &name, DataType type, ExternalArgumentType exttype, GlobalAccessType access)
|
|
{
|
|
newArgument(name, NullRegister().sub(0, type), exttype, access);
|
|
}
|
|
|
|
void InterfaceHandler::newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access)
|
|
{
|
|
DataType type = DataType::invalid;
|
|
|
|
switch (exttype) {
|
|
case ExternalArgumentType::GlobalPtr: type = DataType::uq; break;
|
|
case ExternalArgumentType::LocalPtr: type = DataType::ud; break;
|
|
default:
|
|
#ifdef NGEN_SAFE
|
|
throw bad_argument_type_exception();
|
|
#else
|
|
break;
|
|
#endif
|
|
}
|
|
|
|
newArgument(name, type, exttype, access);
|
|
}
|
|
|
|
Subregister InterfaceHandler::getArgumentIfExists(const std::string &name) const
|
|
{
|
|
for (auto &assignment : assignments) {
|
|
if (assignment.name == name)
|
|
return assignment.reg;
|
|
}
|
|
|
|
return Subregister{};
|
|
}
|
|
|
|
Subregister InterfaceHandler::getArgument(const std::string &name) const
|
|
{
|
|
Subregister arg = getArgumentIfExists(name);
|
|
|
|
#ifdef NGEN_SAFE
|
|
if (arg.isInvalid())
|
|
throw unknown_argument_exception();
|
|
#endif
|
|
|
|
return arg;
|
|
}
|
|
|
|
int InterfaceHandler::getArgumentSurfaceIfExists(const std::string &name) const
|
|
{
|
|
for (auto &assignment : assignments)
|
|
if (assignment.name == name)
|
|
return assignment.surface;
|
|
return noSurface;
|
|
}
|
|
|
|
int InterfaceHandler::getArgumentSurface(const std::string &name) const
|
|
{
|
|
int surface = getArgumentSurfaceIfExists(name);
|
|
|
|
#ifdef NGEN_SAFE
|
|
if (surface == noSurface)
|
|
throw unknown_argument_exception();
|
|
#endif
|
|
return surface;
|
|
}
|
|
|
|
Subregister InterfaceHandler::getSIMD1LocalID(int dim) const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (dim > needLocalID || simd != 1) throw unknown_argument_exception();
|
|
#endif
|
|
|
|
return GRF(1).uw(dim);
|
|
}
|
|
|
|
GRF InterfaceHandler::getLocalID(int dim) const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (dim > needLocalID) throw unknown_argument_exception();
|
|
if (simd == 1) throw use_simd1_local_id_exception();
|
|
#endif
|
|
|
|
if (simd == 1)
|
|
return GRF(1).uw();
|
|
else
|
|
return GRF(1 + dim * grfsPerLID()).uw();
|
|
}
|
|
|
|
Subregister InterfaceHandler::getGroupID(int dim) const
|
|
{
|
|
|
|
switch (dim) {
|
|
case 0: return GRF(0).ud(1);
|
|
case 1: return GRF(0).ud(6);
|
|
case 2: return GRF(0).ud(7);
|
|
}
|
|
|
|
return Subregister();
|
|
}
|
|
|
|
void InterfaceHandler::requireSIMD(int simd_)
|
|
{
|
|
simd = simd_;
|
|
|
|
#ifdef NGEN_SAFE
|
|
if (simd > 32 || !utils::is_zero_or_pow2(simd))
|
|
throw illegal_simd_exception();
|
|
if (simd != 1 && simd < (GRF::bytes(hw) >> 2))
|
|
throw illegal_simd_exception();
|
|
#endif
|
|
}
|
|
|
|
void InterfaceHandler::requireType(DataType type)
|
|
{
|
|
switch (type) {
|
|
case DataType::hf: needHalf = true; break;
|
|
case DataType::df: needDouble = true; break;
|
|
default: break;
|
|
}
|
|
}
|
|
|
|
static inline const char *getCLDataType(DataType type)
|
|
{
|
|
static const char *_ = "INVALID";
|
|
static const char *names[32] = {"uint", "int", "ushort", "short", "uchar", "char", "double", "float",
|
|
"ulong", "long", "half", "ushort", "uchar", _, _, _,
|
|
"float", "uchar", _, _, _, _, _, _,
|
|
_, _, _, _, _, _, _, _};
|
|
return names[static_cast<uint8_t>(type) & 0x1F];
|
|
}
|
|
|
|
void InterfaceHandler::generateDummyCL(std::ostream &stream) const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (!finalized) throw interface_not_finalized();
|
|
if (hasArgLocOverride || !rearrangeArgs) throw unsupported_argument_location_override();
|
|
#endif
|
|
const char *dpasDummy = " int __builtin_IB_sub_group_idpas_s8_s8_8_1(int, int, int8) __attribute__((const));\n"
|
|
" int z = __builtin_IB_sub_group_idpas_s8_s8_8_1(0, ____[0], 1);\n"
|
|
" for (int i = 0; i < z; i++) (void) ____[0];\n";
|
|
|
|
if (needHalf) stream << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
|
|
if (needDouble) stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
|
|
|
|
if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0)
|
|
stream << "__attribute__((reqd_work_group_size(" << wg[0] << ',' << wg[1] << ',' << wg[2] << ")))\n";
|
|
if (walkOrder[0] >= 0) {
|
|
stream << "__attribute__((intel_reqd_workgroup_walk_order(" << walkOrder[0] << ',' << walkOrder[1];
|
|
if (walkOrder[2] >= 0)
|
|
stream << ',' << walkOrder[2];
|
|
stream << ")))\n";
|
|
}
|
|
stream << "__attribute__((intel_reqd_sub_group_size(" << simd << ")))\n";
|
|
stream << "kernel void " << kernelName << '(';
|
|
|
|
bool firstArg = true;
|
|
for (const auto &assignment : assignments) {
|
|
if (assignment.exttype == ExternalArgumentType::Hidden) continue;
|
|
|
|
if (!firstArg) stream << ", ";
|
|
|
|
switch (assignment.exttype) {
|
|
case ExternalArgumentType::GlobalPtr: stream << "global uint *"; break;
|
|
case ExternalArgumentType::LocalPtr: stream << "local uint *"; break;
|
|
case ExternalArgumentType::Scalar: stream << getCLDataType(assignment.type) << ' '; break;
|
|
default: break;
|
|
}
|
|
|
|
stream << assignment.name;
|
|
firstArg = false;
|
|
}
|
|
stream << ") {\n";
|
|
stream << " global volatile int *____;\n";
|
|
|
|
if (needLocalID) stream << " (void) ____[get_local_id(0)];\n";
|
|
if (needLocalSize) stream << " (void) ____[get_enqueued_local_size(0)];\n";
|
|
if (barrierCount > 0) stream << " __asm__ volatile(\"barrier\");\n";
|
|
for (int i = 1; i < barrierCount; i++) {
|
|
stream << " local NamedBarrier_t *bar" << i << ";\n"
|
|
" bar" << i << " = named_barrier_init(1);\n"
|
|
" work_group_named_barrier(bar" << i << ", 0);\n";
|
|
}
|
|
if (needDPAS) stream << dpasDummy;
|
|
if (needGlobalAtomics) stream << " atomic_inc(____);\n";
|
|
if (scratchSize > 0) stream << " volatile char scratch[" << scratchSize << "] = {0};\n";
|
|
if (slmSize > 0) stream << " volatile local char slm[" << slmSize << "]; slm[0]++;\n";
|
|
if (needNoPreemption) {
|
|
if (hw == HW::Gen9)
|
|
stream << " volatile double *__df; *__df = 1.1 / *__df;\n"; // IEEE macro causes IGC to disable MTP.
|
|
/* To do: Gen11 */
|
|
}
|
|
|
|
if (hw >= HW::XeHP) for (const auto &assignment : assignments) {
|
|
// Force IGC to assume stateless accesses could occur if necessary.
|
|
if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalStatelessAccess())
|
|
stream << " __asm__ volatile(\"\" :: \"rw.u\"(" << assignment.name << "));\n";
|
|
}
|
|
|
|
for (const auto &assignment : assignments) {
|
|
// Force IGC to assume surface accesses could occur if necessary.
|
|
if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalSurfaceAccess())
|
|
stream << " { volatile uchar __load = ((global uchar *) " << assignment.name << ")[get_local_id(0)];}\n";
|
|
}
|
|
|
|
stream << "}\n";
|
|
}
|
|
|
|
Subregister InterfaceHandler::getLocalSize(int dim) const
|
|
{
|
|
static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
|
|
return getArgument(localSizeArgs[dim]);
|
|
}
|
|
|
|
void InterfaceHandler::finalize()
|
|
{
|
|
// Make assignments, following NEO rules:
|
|
// - all inputs are naturally aligned
|
|
// - all sub-DWord inputs are DWord-aligned
|
|
// - first register is
|
|
// r3 (no local IDs)
|
|
// r5 (SIMD8/16, local IDs)
|
|
// r8 (SIMD32, local IDs)
|
|
// - assign local ptr arguments left-to-right
|
|
// - assign global pointer arguments left-to-right
|
|
// - assign scalar arguments left-to-right
|
|
// - assign surface indices left-to-right for global pointers
|
|
// - no arguments can cross a GRF boundary. Arrays like work size count
|
|
// as 1 argument for this rule.
|
|
|
|
static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
|
|
static const std::string scratchSizeArg = "__scratch_size";
|
|
|
|
Register base;
|
|
int offset;
|
|
int nextSurface = 0;
|
|
int regSize = GRF::bytes(hw);
|
|
|
|
if (baseOverride.isValid()) {
|
|
base = GRF(baseOverride.getBase());
|
|
offset = baseOverride.getByteOffset();
|
|
} else {
|
|
base = getCrossthreadBase();
|
|
offset = 32;
|
|
}
|
|
|
|
auto assignArgsOfType = [&](ExternalArgumentType which) {
|
|
for (auto &assignment : assignments) {
|
|
auto exttype = assignment.exttype;
|
|
if (!rearrangeArgs)
|
|
exttype = ExternalArgumentType::Scalar;
|
|
if (exttype != which) continue;
|
|
|
|
auto bytes = getBytes(assignment.type);
|
|
auto size = getDwords(assignment.type) << 2;
|
|
|
|
if (assignment.reg.isInvalid()) {
|
|
if (assignment.name == localSizeArgs[0]) {
|
|
// Move to next GRF if local size arguments won't fit in this one.
|
|
if (offset > regSize - (3 * 4)) {
|
|
offset = 0;
|
|
base++;
|
|
}
|
|
}
|
|
|
|
offset = (offset + size - 1) & -size;
|
|
if (offset >= regSize) {
|
|
base += offset / regSize;
|
|
offset = 0;
|
|
}
|
|
|
|
assignment.reg = base.sub(offset / bytes, assignment.type);
|
|
} else {
|
|
int obase = assignment.reg.getBase();
|
|
int ooffset = assignment.reg.getByteOffset();
|
|
if (base.getBase() < obase) {
|
|
base = GRF(obase);
|
|
offset = ooffset;
|
|
} else if (base.getBase() == obase)
|
|
offset = std::max(offset, ooffset);
|
|
}
|
|
|
|
offset += size;
|
|
|
|
if (assignment.exttype == ExternalArgumentType::GlobalPtr) {
|
|
if (!assignment.globalStatelessAccess())
|
|
assignment.reg = Subregister{};
|
|
if (assignment.globalSurfaceAccess())
|
|
assignment.surface = nextSurface;
|
|
nextSurface++;
|
|
} else if (assignment.exttype == ExternalArgumentType::Scalar)
|
|
requireType(assignment.type);
|
|
}
|
|
};
|
|
|
|
assignArgsOfType(ExternalArgumentType::LocalPtr);
|
|
assignArgsOfType(ExternalArgumentType::GlobalPtr);
|
|
assignArgsOfType(ExternalArgumentType::Scalar);
|
|
|
|
// Add private memory size arguments.
|
|
if (scratchSize > 0)
|
|
newArgument(scratchSizeArg, DataType::uq, ExternalArgumentType::Hidden);
|
|
|
|
// Add enqueued local size arguments.
|
|
if (needLocalSize && needNonuniformWGs)
|
|
for (int dim = 0; dim < 3; dim++)
|
|
newArgument(localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden);
|
|
|
|
assignArgsOfType(ExternalArgumentType::Hidden);
|
|
|
|
{
|
|
crossthreadBytes = (base.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw)
|
|
+ ((offset + 31) & -32);
|
|
crossthreadRegs = GRF::bytesToGRFs(hw, crossthreadBytes);
|
|
}
|
|
|
|
// Manually add regular local size arguments.
|
|
if (needLocalSize && !needNonuniformWGs) {
|
|
for (int dim = 0; dim < 3; dim++) {
|
|
Subregister loc = GRF(getCrossthreadBase().getBase()).ud(dim + 3);
|
|
assignments.push_back({localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden,
|
|
GlobalAccessType::None, loc, noSurface, -1});
|
|
}
|
|
}
|
|
finalized = true;
|
|
}
|
|
|
|
int InterfaceHandler::inlineBytes() const
|
|
{
|
|
return requestedInlineBytes;
|
|
}
|
|
|
|
int InterfaceHandler::inlineRegs() const
|
|
{
|
|
return GRF::bytesToGRFs(hw, inlineBytes());
|
|
}
|
|
|
|
Register InterfaceHandler::getCrossthreadBase(bool effective) const
|
|
{
|
|
if (!needLocalID)
|
|
return GRF((!effective || (hw >= HW::XeHP)) ? 1 : 2);
|
|
else if (simd == 1)
|
|
return GRF(2);
|
|
else
|
|
return GRF(1 + 3 * grfsPerLID());
|
|
}
|
|
|
|
Register InterfaceHandler::getArgLoadBase() const
|
|
{
|
|
return getCrossthreadBase().advance(inlineRegs());
|
|
}
|
|
|
|
int InterfaceHandler::getCrossthreadBytes() const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (!finalized) throw interface_not_finalized();
|
|
#endif
|
|
return crossthreadBytes;
|
|
}
|
|
|
|
int InterfaceHandler::getCrossthreadRegs() const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (!finalized) throw interface_not_finalized();
|
|
#endif
|
|
return crossthreadRegs;
|
|
}
|
|
|
|
GlobalAccessType InterfaceHandler::defaultGlobalAccess(HW hw)
|
|
{
|
|
if (hw >= HW::XeHPC) return GlobalAccessType::Stateless;
|
|
return GlobalAccessType::All;
|
|
}
|
|
|
|
int InterfaceHandler::defaultInlineBytes(HW hw)
|
|
{
|
|
if (hw == HW::XeHP || hw == HW::XeHPG) return 32;
|
|
return 0;
|
|
}
|
|
|
|
template <typename CodeGenerator>
|
|
void InterfaceHandler::generatePrologue(CodeGenerator &generator, const GRF &temp) const
|
|
{
|
|
if (needLocalID)
|
|
generator.loadlid(getCrossthreadBytes(), needLocalID, simd, temp, -1);
|
|
generator.loadargs(getArgLoadBase(), getCrossthreadRegs() - inlineRegs(), temp);
|
|
}
|
|
|
|
void InterfaceHandler::setPrologueLabels(InterfaceLabels &labels, LabelManager &man)
|
|
{
|
|
auto setOffset = [&](Label &label, int32_t &out, int32_t off = 0) {
|
|
auto id = label.getID(man);
|
|
if (man.hasTarget(id))
|
|
out = man.getTarget(id) + off;
|
|
};
|
|
|
|
int immOffset = 0xC;
|
|
|
|
setOffset(labels.localIDsLoaded, offsetSkipPerThread);
|
|
setOffset(labels.argsLoaded, offsetSkipCrossThread);
|
|
for (int i = 0; i < 2; i++)
|
|
setOffset(labels.crossThreadPatches[i], offsetCTPatches[i], immOffset);
|
|
}
|
|
|
|
std::string InterfaceHandler::generateZeInfo() const
|
|
{
|
|
#ifdef NGEN_SAFE
|
|
if (!finalized) throw interface_not_finalized();
|
|
#endif
|
|
|
|
std::stringstream md;
|
|
md.imbue(std::locale::classic());
|
|
|
|
const char *version = "1.8";
|
|
|
|
md << "version: " << version << "\n"
|
|
"kernels: \n"
|
|
" - name: \"" << kernelName << "\"\n"
|
|
" execution_env: \n"
|
|
" grf_count: " << needGRF << "\n"
|
|
" simd_size: " << simd << "\n";
|
|
if (simd > 1)
|
|
md << " required_sub_group_size: " << simd << "\n";
|
|
if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0) {
|
|
md << " required_work_group_size:\n"
|
|
<< " - " << wg[0] << "\n"
|
|
<< " - " << wg[1] << "\n"
|
|
<< " - " << wg[2] << "\n";
|
|
}
|
|
if (walkOrder[0] >= 0) {
|
|
md << " work_group_walk_order_dimensions:\n"
|
|
<< " - " << walkOrder[0] << "\n"
|
|
<< " - " << walkOrder[1] << "\n"
|
|
<< " - " << std::max(walkOrder[2], 0) << "\n";
|
|
}
|
|
if (offsetSkipPerThread > 0)
|
|
md << " offset_to_skip_per_thread_data_load: " << offsetSkipPerThread << '\n';
|
|
if (barrierCount > 0)
|
|
md << " barrier_count: " << utils::roundup_pow2(barrierCount) << '\n';
|
|
if (allow64BitBuffers)
|
|
md << " has_4gb_buffers: true\n";
|
|
if (needDPAS)
|
|
md << " has_dpas: true\n";
|
|
if (needGlobalAtomics)
|
|
md << " has_global_atomics: true\n";
|
|
if (slmSize > 0)
|
|
md << " slm_size: " << slmSize << '\n';
|
|
if (!needStatelessWrites)
|
|
md << " has_no_stateless_write: true\n";
|
|
if (needNoPreemption)
|
|
md << " disable_mid_thread_preemption: true\n";
|
|
if (arbitrationMode != ThreadArbitrationMode::Default) {
|
|
md << " thread_scheduling_mode: ";
|
|
switch (arbitrationMode) {
|
|
case ThreadArbitrationMode::OldestFirst: md << "age_based\n"; break;
|
|
case ThreadArbitrationMode::RoundRobin: md << "round_robin\n"; break;
|
|
case ThreadArbitrationMode::RoundRobinOnStall: md << "round_robin_stall\n"; break;
|
|
default: break;
|
|
}
|
|
}
|
|
if (inlineBytes() > 0)
|
|
md << " inline_data_payload_size: " << inlineBytes() << "\n";
|
|
if (!assignments.empty()) {
|
|
md << "\n"
|
|
" payload_arguments: \n";
|
|
}
|
|
if (scratchSize > 0) {
|
|
md << " - arg_type: scratch_pointer\n"
|
|
" offset: 8\n"
|
|
" size: 8\n";
|
|
}
|
|
for (auto &assignment : assignments) {
|
|
uint32_t size = 0;
|
|
bool skipArg = false;
|
|
bool explicitArg = true;
|
|
|
|
if (assignment.globalSurfaceAccess()) {
|
|
md << " - arg_type: arg_bypointer\n"
|
|
" arg_index: " << assignment.index << "\n"
|
|
" offset: 0\n"
|
|
" size: 0\n"
|
|
" addrmode: stateful\n"
|
|
" addrspace: global\n"
|
|
" access_type: readwrite\n"
|
|
"\n";
|
|
}
|
|
|
|
switch (assignment.exttype) {
|
|
case ExternalArgumentType::Scalar:
|
|
md << " - arg_type: arg_byvalue\n";
|
|
size = (assignment.reg.getDwords() << 2);
|
|
break;
|
|
case ExternalArgumentType::GlobalPtr:
|
|
if (!assignment.globalStatelessAccess()) {
|
|
skipArg = true;
|
|
break;
|
|
}
|
|
// fall through
|
|
case ExternalArgumentType::LocalPtr:
|
|
md << " - arg_type: arg_bypointer\n";
|
|
size = (assignment.reg.getDwords() << 2);
|
|
break;
|
|
case ExternalArgumentType::Hidden: {
|
|
explicitArg = false;
|
|
if (assignment.name == "__local_size0") {
|
|
// from Zebin spec : local_size Argument size : int32x3
|
|
// may need refining to allow
|
|
// either int32x1, int32x2, int32x3 (x, xy, xyz)
|
|
// or fine grain : local_size_x, local_size_y, local_size_z
|
|
md << " - arg_type: "
|
|
<< (needNonuniformWGs ? "enqueued_local_size\n" : "local_size\n");
|
|
size = (assignment.reg.getDwords() << 2) * 3;
|
|
} else
|
|
skipArg = true;
|
|
break;
|
|
}
|
|
}
|
|
if (skipArg)
|
|
continue;
|
|
|
|
auto offset = assignment.reg.getBase() - getCrossthreadBase().getBase();
|
|
offset = offset * GRF::bytes(hw) + assignment.reg.getByteOffset();
|
|
|
|
#ifdef NGEN_SAFE
|
|
if (offset < 0) throw unsupported_argument_location_override();
|
|
#endif
|
|
|
|
if (explicitArg)
|
|
md << " arg_index: " << assignment.index << "\n";
|
|
md << " offset: " << offset << "\n"
|
|
" size: " << size << '\n';
|
|
|
|
if (assignment.globalStatelessAccess()) {
|
|
md << " addrmode: stateless\n"
|
|
" addrspace: global\n"
|
|
" access_type: readwrite\n";
|
|
} else if (assignment.exttype == ExternalArgumentType::LocalPtr) {
|
|
md << " addrmode: slm\n"
|
|
" addrspace: local\n"
|
|
" access_type: readwrite\n";
|
|
}
|
|
md << "\n";
|
|
}
|
|
|
|
bool firstSurface = true;
|
|
for (auto &assignment : assignments) {
|
|
if (assignment.globalSurfaceAccess()) {
|
|
if (firstSurface) {
|
|
md << "\n"
|
|
" binding_table_indices: \n";
|
|
firstSurface = false;
|
|
}
|
|
md << " - bti_value: " << assignment.surface << "\n"
|
|
" arg_index: " << assignment.index << "\n"
|
|
" \n";
|
|
}
|
|
}
|
|
|
|
if (needLocalID) {
|
|
md << "\n"
|
|
" per_thread_payload_arguments: \n";
|
|
|
|
if (simd == 1) {
|
|
md << " - arg_type: packed_local_ids\n"
|
|
" offset: 0\n"
|
|
" size: 6\n"
|
|
" \n";
|
|
} else {
|
|
auto localIDBytes = grfsPerLID() * GRF::bytes(hw);
|
|
localIDBytes *= 3; // runtime currently supports 0 or 3 localId channels in per thread data
|
|
md << " - arg_type: local_id\n"
|
|
" offset: 0\n"
|
|
" size: " << localIDBytes << "\n"
|
|
" \n";
|
|
}
|
|
}
|
|
|
|
md << "\n"; // ensure file ends with newline
|
|
|
|
#ifdef NGEN_DUMP_ZE_INFO
|
|
std::cerr << md.str();
|
|
#endif
|
|
|
|
return md.str();
|
|
}
|
|
|
|
#ifdef NGEN_ASM
|
|
void InterfaceHandler::dumpAssignments(std::ostream &stream) const
|
|
{
|
|
LabelManager manager;
|
|
|
|
for (auto &assignment : assignments) {
|
|
stream << ' ';
|
|
if (assignment.reg.isValid())
|
|
assignment.reg.outputText(stream, PrintDetail::sub, manager);
|
|
else
|
|
stream << "(none)";
|
|
stream << '\t' << assignment.name;
|
|
if (assignment.surface != noSurface)
|
|
stream << "\t(BTI " << assignment.surface << ')';
|
|
stream << std::endl;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
} /* namespace NGEN_NAMESPACE */
|
|
|
|
#endif /* header guard */
|