x64: cpu: conv: hybrid cache topology win bug fix for non-hybrid cpu

2025-10-20 18:43:49 +08:00 · 2025-10-17 14:18:46 -07:00
parent f3eedec7a4
commit 4d0b907709
2 changed files with 57 additions and 31 deletions
--- a/src/cpu/platform.cpp
+++ b/src/cpu/platform.cpp
@ -256,7 +256,7 @@ unsigned get_per_core_cache_size(int level) {
 #if DNNL_X64
    using namespace x64;
    if (cpu().getDataCacheLevels() == 0) return guess(level);
-    
+
    if (level > 0 && (unsigned)level <= cpu().getDataCacheLevels()) {
        unsigned l = level - 1;
        return cpu().getDataCacheSize(l) / cpu().getCoresSharingDataCache(l);
@ -440,6 +440,13 @@ inline unsigned count_bits_in_mask(KAFFINITY mask) {

 // Helper function to determine core type from processor info
 // Returns p_core for performance cores, e_core for efficiency cores
+//
+// **Helper function assumes that is_hybrid() has already returned true**
+//
+// If not hybrid, this function is unreliable: it can return either core type
+// depending on whether the EfficiencyClass field is present or not.
+// TODO: revisit this helper function considering that EfficiencyClass can be
+// `0` for P-cores when there are only P-cores present in the system.
 core_type get_core_type_from_processor_info(
        const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *info) {
    // For hybrid CPUs, we need to distinguish between P-cores and E-cores
@ -456,7 +463,7 @@ core_type get_core_type_from_processor_info(
            return core_type::p_core;
 #endif
    }
-    // Default to p_core for non-hybrid or if we can't determine
+    // Default to p_core if we can't determine the core_type
    return core_type::p_core;
 }

@ -541,7 +548,10 @@ void init_cache_topology_windows(cache_topology_t &cache_topology) {
        auto *info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
            reinterpret_cast<BYTE*>(info_base_ptr) + offset);
        if (info->Relationship == RelationProcessorCore) {
-            core_type ctype = get_core_type_from_processor_info(info);
+            core_type ctype = core_type::default_core;
+            if (cache_topology.is_hybrid)
+                ctype = get_core_type_from_processor_info(info);
+
            // Store each group mask for this core type
            for (WORD i = 0; i < info->Processor.GroupCount; i++) {
                core_info.push_back({ctype, info->Processor.GroupMask[i]});
@ -885,7 +895,7 @@ void init_cache_topology_linux(cache_topology_t &cache_topology) {
            cache_topology.caches[idx] = cache_info;
        }
    }
-    
+
    // If not hybrid, copy P-core data to E-core slots
    if (!cache_topology.is_hybrid) {
        for (size_t level = 0; level < cache_topology_t::max_cache_levels; level++) {
@ -918,25 +928,31 @@ void init_cache_topology() {
 unsigned get_per_core_cache_size(int level, behavior_t btype) {
 #if DNNL_X64
    init_cache_topology();
-    
-    // Convert 1-based level to 0-based for array access
+
    if (level < 1 || level > (int)cache_topology_t::max_cache_levels) {
        return 0;
    }
-    
+
    auto pcore_cache = global_cache_topology.get_cache(level, core_type::p_core);
-    auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
-    
+
    // Calculate effective cache size per core (considering sharing)
    auto pcore_size = pcore_cache.size;
    if (pcore_cache.num_sharing_cores > 0) {
        pcore_size /= pcore_cache.num_sharing_cores;
    }
-    
+
+    if (!global_cache_topology.is_hybrid) {
+        // For Non-hybrid system, all cores are assumed to be p-core
+        return pcore_size;
+    }
+
+    auto ecore_cache = global_cache_topology.get_cache(level, core_type::e_core);
+
    auto ecore_size = ecore_cache.size;
    if (ecore_cache.num_sharing_cores > 0) {
        ecore_size /= ecore_cache.num_sharing_cores;
    }
+
    switch (btype)
    {
    case behavior_t::p_core:
--- a/src/cpu/platform.hpp
+++ b/src/cpu/platform.hpp
@ -184,19 +184,21 @@ int get_vector_register_size();

 size_t get_timestamp();

-// using the P-core E-core terminology from Intel
 // P-core: Performance core (high performance, high power consumption)
 // E-core: Efficiency core (low performance, low power consumption)
-// However the naming in the SDM is different: using core (P-core) and atom (E-core)
+// However the naming in the Intel SDM is different:
+//  - core for P-core
+//  - atom for E-core
 enum class core_type : int {
    p_core = 0, // Performance core
    e_core = 1, // Efficiency core
    default_core = p_core // Default core (used for non-hybrid CPUs)
 };

-// Assumption each core type on a system is homogeneous in terms of cache topology
-// e.g. all P-cores have the same cache topology, all E-cores have the same cache topology
-// this is true for all Intel hybrid CPUs so far (Alder Lake, Raptor Lake, Lunar Lake)
+// Assumption: each core type on a system is homogeneous in terms of cache
+// topology, e.g. all P-cores have the same cache topology and all E-cores
+// have the same cache topology. This is true for all Intel hybrid CPUs so far
+// (Alder Lake, Raptor Lake, Lunar Lake).
 struct cache_info_t {
    uint8_t level; // cache level (0 - L1i, 1 - L1d, 2 - L2, 3 - L3, etc)
    uint32_t size; // cache size in bytes
@ -236,18 +238,27 @@ enum class behavior_t {
    p_core, // Performance core
    e_core, // Efficiency core
    current, // Current core
-    min,    // used to select the smallest value for the core_type
-    max,    // used to select the largest value for the core_type
+    min,    // used to select the smallest value for all the cores
+    max,    // used to select the largest value for all the cores
    unknown
 };
-// called in place of get_per_core_cache_size when dealing with hybrid CPUs
-// the core_type argument specifies the type of core to query
-// if behavior_t is current, the function returns the cache size of the core the calling
-// thread is running on.
+// Use OS specific methods to determine the per-core cache size.
 //
-// if core_type is min/max, the function returns the min/max cache size among all cores
-// Examples:
+// This avoids CPUID-based methods, which can result in inaccurate values.
 //
+// Expected to be called in place of get_per_core_cache_size(level). The
+// behavior_t argument specifies the behavior of the query on hybrid CPUs.
+//
+// - if behavior_t is p_core/e_core, the function returns the per-core cache
+//   size for that core type.
+// - if behavior_t is min/max, the function returns the min/max per-core cache
+//   size among all cores
+// - if behavior_t is current, the function returns the cache size of the core
+//   the calling thread is running on.
+// - if behavior_t is unknown, the function behaves like legacy
+//   get_per_core_cache_size(level) function.
+//
+// Examples: (KB and MB values shown for clarity; the function returns bytes)
 // for a hybrid CPU with (e.g. Alder Lake)
 //   48KB L1d cache on P-cores (with hyperthreading) and
 //   32KB L1d cache on E-cores
@ -278,16 +289,15 @@ enum class behavior_t {
 // get_per_core_cache_size(3, core_type::unknown)
 //   uses the get_per_core_cache_size(int level) function.
 //
-// for non-hybrid CPUs (e.g. SRF/CWF), get_per_core_cache_size(int level, core_type ctype)
-// behaves like get_per_core_cache_size(int level) unless core_type is min/max/current
-// in which case it will consider the cache topology to return the appropriate value.
-// this can be used for CPUs with non-uniform cache topology.
-// TODO: Test This behavior on non-hybrid CPUs.
-//
-// Note: for non-hybrid CPUs, the core_type argument is ignored and the function
-// behaves like get_per_core_cache_size(int level)
+// TODO: Test behavior on non-hybrid CPUs.
 unsigned DNNL_API get_per_core_cache_size(int level, behavior_t btype);
+
+// returns true if the CPU is a hybrid CPU
+// (e.g. Alder Lake, Raptor Lake, Lunar Lake)
 bool is_hybrid();
+
+// Get the core_type of the core the calling thread is running on.
+// To get the core_type of a specific core, set thread affinity to that core.
 core_type get_core_type();

 } // namespace platform