x64: cpu: conv: hybrid cache topology win bug fix for non-hybrid cpu

This commit is contained in:
George Nash
2025-10-17 14:18:46 -07:00
parent f3eedec7a4
commit 4d0b907709
2 changed files with 57 additions and 31 deletions

View File

@ -256,7 +256,7 @@ unsigned get_per_core_cache_size(int level) {
#if DNNL_X64
using namespace x64;
if (cpu().getDataCacheLevels() == 0) return guess(level);
if (level > 0 && (unsigned)level <= cpu().getDataCacheLevels()) {
unsigned l = level - 1;
return cpu().getDataCacheSize(l) / cpu().getCoresSharingDataCache(l);
@ -440,6 +440,13 @@ inline unsigned count_bits_in_mask(KAFFINITY mask) {
// Helper function to determine core type from processor info
// Returns p_core for performance cores, e_core for efficiency cores
//
// **Helper function assumes that is_hybrid() has already returned true**
//
// If not hybrid, this function is unreliable: it can return either core type
// depending on whether the EfficiencyClass field is present or not.
// TODO: revisit this helper function considering that EfficiencyClass can be
// `0` for P-cores when there are only P-cores present in the system.
core_type get_core_type_from_processor_info(
const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *info) {
// For hybrid CPUs, we need to distinguish between P-cores and E-cores
@ -456,7 +463,7 @@ core_type get_core_type_from_processor_info(
return core_type::p_core;
#endif
}
// Default to p_core for non-hybrid or if we can't determine
// Default to p_core if we can't determine the core_type
return core_type::p_core;
}
@ -541,7 +548,10 @@ void init_cache_topology_windows(cache_topology_t &cache_topology) {
auto *info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
reinterpret_cast<BYTE*>(info_base_ptr) + offset);
if (info->Relationship == RelationProcessorCore) {
core_type ctype = get_core_type_from_processor_info(info);
core_type ctype = core_type::default_core;
if (cache_topology.is_hybrid)
ctype = get_core_type_from_processor_info(info);
// Store each group mask for this core type
for (WORD i = 0; i < info->Processor.GroupCount; i++) {
core_info.push_back({ctype, info->Processor.GroupMask[i]});
@ -885,7 +895,7 @@ void init_cache_topology_linux(cache_topology_t &cache_topology) {
cache_topology.caches[idx] = cache_info;
}
}
// If not hybrid, copy P-core data to E-core slots
if (!cache_topology.is_hybrid) {
for (size_t level = 0; level < cache_topology_t::max_cache_levels; level++) {
@ -918,25 +928,31 @@ void init_cache_topology() {
unsigned get_per_core_cache_size(int level, behavior_t btype) {
#if DNNL_X64
init_cache_topology();
// Convert 1-based level to 0-based for array access
if (level < 1 || level > (int)cache_topology_t::max_cache_levels) {
return 0;
}
auto pcore_cache = global_cache_topology.get_cache(level, core_type::p_core);
auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
// Calculate effective cache size per core (considering sharing)
auto pcore_size = pcore_cache.size;
if (pcore_cache.num_sharing_cores > 0) {
pcore_size /= pcore_cache.num_sharing_cores;
}
if (!global_cache_topology.is_hybrid) {
// For Non-hybrid system, all cores are assumed to be p-core
return pcore_size;
}
auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
auto ecore_size = ecore_cache.size;
if (ecore_cache.num_sharing_cores > 0) {
ecore_size /= ecore_cache.num_sharing_cores;
}
switch (btype)
{
case behavior_t::p_core:

View File

@ -184,19 +184,21 @@ int get_vector_register_size();
size_t get_timestamp();
// using the P-core E-core terminology from Intel
// P-core: Performance core (high performance, high power consumption)
// E-core: Efficiency core (low performance, low power consumption)
// However the naming in the SDM is different: using core (P-core) and atom (E-core)
// However the naming in the Intel SDM is different:
// - core for P-core
// - atom for E-core
// Kind of core on a hybrid CPU. The underlying integer values are fixed:
// p_core is 0 and e_core is 1.
enum class core_type : int {
    // Performance core
    p_core = 0,
    // Efficiency core
    e_core = 1,
    // Alias of p_core; the value used for non-hybrid CPUs
    default_core = p_core
};
// Assumption each core type on a system is homogeneous in terms of cache topology
// e.g. all P-cores have the same cache topology, all E-cores have the same cache topology
// this is true for all Intel hybrid CPUs so far (Alder Lake, Raptor Lake, Lunar Lake)
// Assumption: each core type on a system is homogeneous in terms of cache
// topology, e.g. all P-cores have the same cache topology and all E-cores have
// the same cache topology. This is true for all Intel hybrid CPUs so far
// (Alder Lake, Raptor Lake, Lunar Lake).
struct cache_info_t {
uint8_t level; // cache level (0 - L1i, 1 - L1d, 2 - L2, 3 - L3, etc)
uint32_t size; // cache size in bytes
@ -236,18 +238,27 @@ enum class behavior_t {
p_core, // Performance core
e_core, // Efficiency core
current, // Current core
min, // used to select the smallest value for the core_type
max, // used to select the largest value for the core_type
min, // used to select the smallest value for all the cores
max, // used to select the largest value for all the cores
unknown
};
// called in place of get_per_core_cache_size when dealing with hybrid CPUs
// the core_type argument specifies the type of core to query
// if behavior_t is current, the function returns the cache size of the core the calling
// thread is running on.
// Use OS specific methods to determine the per-core cache size.
//
// if core_type is min/max, the function returns the min/max cache size among all cores
// Examples:
// This avoids using CPUID-based methods, which can result in inaccurate values.
//
// Expected to be called in place of get_per_core_cache_size(level). The
// behavior_t argument specifies the behavior of the query on hybrid CPUs.
//
// - if behavior_t is p_core/e_core, the function returns the per-core cache
// size for that core type.
// - if behavior_t is min/max, the function returns the min/max per-core cache
// size among all cores
// - if behavior_t is current, the function returns the cache size of the core
// the calling thread is running on.
// - if behavior_t is unknown, the function behaves like legacy
// get_per_core_cache_size(level) function.
//
// Examples: (showing KB and MB values for clarity; the actual function returns bytes)
// for a hybrid CPU with (e.g. Alder Lake)
// 48KB L1d cache on P-cores (with hyperthreading) and
// 32KB L1d cache on E-cores
@ -278,16 +289,15 @@ enum class behavior_t {
// get_per_core_cache_size(3, behavior_t::unknown)
// uses the get_per_core_cache_size(int level) function.
//
// for non-hybrid CPUs (e.g. SRF/CWF), get_per_core_cache_size(int level, core_type ctype)
// behaves like get_per_core_cache_size(int level) unless core_type is min/max/current
// in which case it will consider the cache topology to return the appropriate value.
// this can be used for CPUs with non-uniform cache topology.
// TODO: Test This behavior on non-hybrid CPUs.
//
// Note: for non-hybrid CPUs, the behavior_t argument is ignored and the
// function behaves like get_per_core_cache_size(int level).
// TODO: Test behavior on non-hybrid CPUs.
unsigned DNNL_API get_per_core_cache_size(int level, behavior_t btype);
// returns true if the CPU is a hybrid CPU
// (e.g. Alder Lake, Raptor Lake, Lunar Lake)
bool is_hybrid();
// get the core_type of the core the calling thread is running on
// to get the core_type of a specific core, set thread affinity to that core
core_type get_core_type();
} // namespace platform