From 0fd976b65c6daf3799a501d9202e4f50144446d1 Mon Sep 17 00:00:00 2001
From: Murray Steele <murray.steele@arm.com>
Date: Thu, 9 Oct 2025 20:49:46 +0000
Subject: [PATCH] Enable mimalloc on non-Windows platforms and make default for
 AArch64 builds (#164741)

This change removes the Windows requirement for mimalloc builds, and makes mimalloc the default c10 system allocator for AArch64 builds (excluding Apple platforms). This significantly improves the performance of AArch64 builds of PyTorch, as large allocations are better cached by mimalloc than by glibc.

**Updated Results**

Torchbench FP32 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-fp32-diff" src="https://github.com/user-attachments/assets/7fe3ea0c-3b52-42e7-879b-612444479c90" />

Torchbench BF16 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-bf16-diff" src="https://github.com/user-attachments/assets/56469a72-9e06-4d57-ae2a-aeb139ca79a3" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164741
Approved by: https://github.com/fadara01, https://github.com/aditew01, https://github.com/malfet
---
 CMakeLists.txt              |  4 ++--
 c10/core/impl/alloc_cpu.cpp | 15 ++++++++-------
 setup.py                    |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d78d2613251c..0b88247df27a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
 
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
-# on Windows.
+# on Windows and on AArch64 (excluding Apple platforms).
 option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
-if(WIN32)
+if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
   set(USE_MIMALLOC ON)
 
   # Not enable USE_MIMALLOC_ON_MKL due to it caused issue:
diff --git a/c10/core/impl/alloc_cpu.cpp b/c10/core/impl/alloc_cpu.cpp
index 791104690cd8..c1b7ca858632 100644
--- a/c10/core/impl/alloc_cpu.cpp
+++ b/c10/core/impl/alloc_cpu.cpp
@@ -108,12 +108,15 @@ void* alloc_cpu(size_t nbytes) {
       "DefaultCPUAllocator: not enough memory: you tried to allocate ",
       nbytes,
       " bytes.");
-#elif defined(_MSC_VER)
-#ifdef USE_MIMALLOC
+#elif defined(USE_MIMALLOC)
   data = mi_malloc_aligned(nbytes, gAlignment);
-#else
+  CAFFE_ENFORCE(
+      data,
+      "DefaultCPUAllocator: not enough memory: you tried to allocate ",
+      nbytes,
+      " bytes.");
+#elif defined(_MSC_VER)
   data = _aligned_malloc(nbytes, gAlignment);
-#endif
   CAFFE_ENFORCE(
       data,
       "DefaultCPUAllocator: not enough memory: you tried to allocate ",
@@ -160,12 +163,10 @@ void* alloc_cpu(size_t nbytes) {
 }
 
 void free_cpu(void* data) {
-#ifdef _MSC_VER
 #ifdef USE_MIMALLOC
   mi_free(data);
-#else
+#elif defined(_MSC_VER)
   _aligned_free(data);
-#endif
 #else
   // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
   free(data);
diff --git a/setup.py b/setup.py
index 9f2eae6f6a90..bdfab24a0b32 100644
--- a/setup.py
+++ b/setup.py
@@ -225,7 +225,7 @@
 #
 #   USE_MIMALLOC
 #      Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
-#      By default, It is only enabled on Windows.
+#      By default, it is only enabled on Windows and on non-Apple AArch64.
 #
 #   BUILD_LIBTORCH_WHL
 #      Builds libtorch.so and its dependencies as a wheel