mirror of
https://github.com/uxlfoundation/oneDNN.git
synced 2025-10-20 18:43:49 +08:00
x64: cpu: conv: hybrid cache topology win bug fix for non-hybrid cpu
This commit is contained in:
@ -256,7 +256,7 @@ unsigned get_per_core_cache_size(int level) {
|
||||
#if DNNL_X64
|
||||
using namespace x64;
|
||||
if (cpu().getDataCacheLevels() == 0) return guess(level);
|
||||
|
||||
|
||||
if (level > 0 && (unsigned)level <= cpu().getDataCacheLevels()) {
|
||||
unsigned l = level - 1;
|
||||
return cpu().getDataCacheSize(l) / cpu().getCoresSharingDataCache(l);
|
||||
@ -440,6 +440,13 @@ inline unsigned count_bits_in_mask(KAFFINITY mask) {
|
||||
|
||||
// Helper function to determine core type from processor info
|
||||
// Returns p_core for performance cores, e_core for efficiency cores
|
||||
//
|
||||
// **Helper function assumes that is_hybrid() has already returned true**
|
||||
//
|
||||
// If not hybrid, this function is unreliable: it can return either core type
|
||||
// depending on if the EfficiencyClass field is present or not.
|
||||
// TODO: revisit this helper function considering that EfficiencyClass can be
|
||||
// `0` for P-cores when there are only P-cores present in the system.
|
||||
core_type get_core_type_from_processor_info(
|
||||
const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *info) {
|
||||
// For hybrid CPUs, we need to distinguish between P-cores and E-cores
|
||||
@ -456,7 +463,7 @@ core_type get_core_type_from_processor_info(
|
||||
return core_type::p_core;
|
||||
#endif
|
||||
}
|
||||
// Default to p_core for non-hybrid or if we can't determine
|
||||
// Default to p_core if we can't determine the core_type
|
||||
return core_type::p_core;
|
||||
}
|
||||
|
||||
@ -541,7 +548,10 @@ void init_cache_topology_windows(cache_topology_t &cache_topology) {
|
||||
auto *info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
|
||||
reinterpret_cast<BYTE*>(info_base_ptr) + offset);
|
||||
if (info->Relationship == RelationProcessorCore) {
|
||||
core_type ctype = get_core_type_from_processor_info(info);
|
||||
core_type ctype = core_type::default_core;
|
||||
if (cache_topology.is_hybrid)
|
||||
ctype = get_core_type_from_processor_info(info);
|
||||
|
||||
// Store each group mask for this core type
|
||||
for (WORD i = 0; i < info->Processor.GroupCount; i++) {
|
||||
core_info.push_back({ctype, info->Processor.GroupMask[i]});
|
||||
@ -885,7 +895,7 @@ void init_cache_topology_linux(cache_topology_t &cache_topology) {
|
||||
cache_topology.caches[idx] = cache_info;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// If not hybrid, copy P-core data to E-core slots
|
||||
if (!cache_topology.is_hybrid) {
|
||||
for (size_t level = 0; level < cache_topology_t::max_cache_levels; level++) {
|
||||
@ -918,25 +928,31 @@ void init_cache_topology() {
|
||||
unsigned get_per_core_cache_size(int level, behavior_t btype) {
|
||||
#if DNNL_X64
|
||||
init_cache_topology();
|
||||
|
||||
// Convert 1-based level to 0-based for array access
|
||||
|
||||
if (level < 1 || level > (int)cache_topology_t::max_cache_levels) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
auto pcore_cache = global_cache_topology.get_cache(level, core_type::p_core);
|
||||
auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
|
||||
|
||||
|
||||
// Calculate effective cache size per core (considering sharing)
|
||||
auto pcore_size = pcore_cache.size;
|
||||
if (pcore_cache.num_sharing_cores > 0) {
|
||||
pcore_size /= pcore_cache.num_sharing_cores;
|
||||
}
|
||||
|
||||
|
||||
if (!global_cache_topology.is_hybrid) {
|
||||
// For non-hybrid systems, all cores are assumed to be P-cores
|
||||
return pcore_size;
|
||||
}
|
||||
|
||||
auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
|
||||
|
||||
auto ecore_size = ecore_cache.size;
|
||||
if (ecore_cache.num_sharing_cores > 0) {
|
||||
ecore_size /= ecore_cache.num_sharing_cores;
|
||||
}
|
||||
|
||||
switch (btype)
|
||||
{
|
||||
case behavior_t::p_core:
|
||||
|
@ -184,19 +184,21 @@ int get_vector_register_size();
|
||||
|
||||
size_t get_timestamp();
|
||||
|
||||
// using the P-core E-core terminology from Intel
|
||||
// P-core: Performance core (high performance, high power consumption)
|
||||
// E-core: Efficiency core (low performance, low power consumption)
|
||||
// However the naming in the SDM is different: using core (P-core) and atom (E-core)
|
||||
// However the naming in the Intel SDM is different:
|
||||
// - core for P-core
|
||||
// - atom for E-core
|
||||
enum class core_type : int {
|
||||
p_core = 0, // Performance core
|
||||
e_core = 1, // Efficiency core
|
||||
default_core = p_core // Default core (used for non-hybrid CPUs)
|
||||
};
|
||||
|
||||
// Assumption each core type on a system is homogeneous in terms of cache topology
|
||||
// e.g. all P-cores have the same cache topology, all E-cores have the same cache topology
|
||||
// this is true for all Intel hybrid CPUs so far (Alder Lake, Raptor Lake, Lunar Lake)
|
||||
// Assumption: each core type on a system is homogeneous in terms of cache
|
||||
// topology e.g. all P-cores have the same cache topology, all E-cores have the
|
||||
// same cache topology this is true for all Intel hybrid CPUs so far
|
||||
// (Alder Lake, Raptor Lake, Lunar Lake)
|
||||
struct cache_info_t {
|
||||
uint8_t level; // cache level (0 - L1i, 1 - L1d, 2 - L2, 3 - L3, etc)
|
||||
uint32_t size; // cache size in bytes
|
||||
@ -236,18 +238,27 @@ enum class behavior_t {
|
||||
p_core, // Performance core
|
||||
e_core, // Efficiency core
|
||||
current, // Current core
|
||||
min, // used to select the smallest value for the core_type
|
||||
max, // used to select the largest value for the core_type
|
||||
min, // used to select the smallest value for all the cores
|
||||
max, // used to select the largest value for all the cores
|
||||
unknown
|
||||
};
|
||||
// called in place of get_per_core_cache_size when dealing with hybrid CPUs
|
||||
// the core_type argument specifies the type of core to query
|
||||
// if behavior_t is current, the function returns the cache size of the core the calling
|
||||
// thread is running on.
|
||||
// Use OS specific methods to determine the per-core cache size.
|
||||
//
|
||||
// if core_type is min/max, the function returns the min/max cache size among all cores
|
||||
// Examples:
|
||||
// This avoids using CPUID-based methods, which can result in inaccurate values
|
||||
//
|
||||
// Expected to be called in place of get_per_core_cache_size(level); the
|
||||
// behavior_t argument specifies the behavior of the query on hybrid CPUs.
|
||||
//
|
||||
// - if behavior_t is p_core/e_core, the function returns the per-core cache
|
||||
// size for that core type.
|
||||
// - if behavior_t is min/max, the function returns the min/max per-core cache
|
||||
// size among all cores
|
||||
// - if behavior_t is current, the function returns the cache size of the core
|
||||
// the calling thread is running on.
|
||||
// - if behavior_t is unknown, the function behaves like legacy
|
||||
// get_per_core_cache_size(level) function.
|
||||
//
|
||||
// Examples (KB and MB values are shown for clarity; the actual function returns bytes):
|
||||
// for a hybrid CPU with (e.g. Alder Lake)
|
||||
// 48KB L1d cache on P-cores (with hyperthreading) and
|
||||
// 32KB L1d cache on E-cores
|
||||
@ -278,16 +289,15 @@ enum class behavior_t {
|
||||
// get_per_core_cache_size(3, behavior_t::unknown)
|
||||
// uses the get_per_core_cache_size(int level) function.
|
||||
//
|
||||
// for non-hybrid CPUs (e.g. SRF/CWF), get_per_core_cache_size(int level, core_type ctype)
|
||||
// behaves like get_per_core_cache_size(int level) unless core_type is min/max/current
|
||||
// in which case it will consider the cache topology to return the appropriate value.
|
||||
// this can be used for CPUs with non-uniform cache topology.
|
||||
// TODO: Test This behavior on non-hybrid CPUs.
|
||||
//
|
||||
// Note: for non-hybrid CPUs, the behavior_t argument is ignored and the function
|
||||
// behaves like get_per_core_cache_size(int level)
|
||||
// TODO: Test behavior on non-hybrid CPUs.
|
||||
unsigned DNNL_API get_per_core_cache_size(int level, behavior_t btype);
|
||||
|
||||
// returns true if the CPU is a hybrid CPU
|
||||
// (e.g. Alder Lake, Raptor Lake, Lunar Lake)
|
||||
bool is_hybrid();
|
||||
|
||||
// get the core_type of the core the calling thread is running on
|
||||
// to get the core_type of a specific core, set thread affinity to that core
|
||||
core_type get_core_type();
|
||||
|
||||
} // namespace platform
|
||||
|
Reference in New Issue
Block a user