From a06ec54d40013c97fbffc174ea8f524ea5a95715 Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Wed, 13 Aug 2025 14:57:18 -0700
Subject: [PATCH] [MPS] Add API to query GPU core count (#160414)

Use good old IOKit to get the `gpu-core-count` property from the device
implementing the `AGXAccelerator` service.

Expose it as `torch.backends.mps.get_core_count()` and make it accessible
to the inductor via `MpsInterface`.

Test Plan: Run
`python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())"`
and compare the output to `system_profiler SPDisplaysDataType|head -n10`:
```
% python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())"
Apple M1 Pro 16
% system_profiler SPDisplaysDataType|head -n10
Graphics/Displays:

    Apple M1 Pro:

      Chipset Model: Apple M1 Pro
      Type: GPU
      Bus: Built-In
      Total Number of Cores: 16
      Vendor: Apple (0x106b)
      Metal Support: Metal 3
```

This should significantly improve occupancy for torch.compile-generated
kernels.
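As a quick sanity check of the inductor plumbing, here is a minimal sketch
(not part of this patch), assuming an MPS-enabled build; it relies only on
the `MpsInterface.Worker.get_device_properties()` hook this change wires up:

```python
import torch
from torch._dynamo.device_interface import MpsInterface

# get_device_properties() now returns a namedtuple whose
# multi_processor_count field is the physical GPU core count.
props = MpsInterface.Worker.get_device_properties()
assert props.multi_processor_count == torch.backends.mps.get_core_count()
print(props.multi_processor_count)
```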
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160414
Approved by: https://github.com/dcci
---
 CMakeLists.txt                    |  2 +-
 aten/src/ATen/mps/MPSDevice.h     | 11 +++++++++++
 aten/src/ATen/mps/MPSDevice.mm    | 28 +++++++++++++++++++++++++++-
 torch/_C/__init__.pyi.in          |  2 ++
 torch/_dynamo/device_interface.py |  7 +++++--
 torch/_inductor/runtime/hints.py  |  3 ---
 torch/backends/mps/__init__.py    | 26 +++++++++++++++++++++++++-
 torch/csrc/Module.cpp             |  1 -
 torch/csrc/mps/Module.cpp         |  6 ++++++
 9 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc9476bb001a..9fe3855242e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1196,7 +1196,7 @@ if(APPLE)
   string(
     APPEND
     CMAKE_SHARED_LINKER_FLAGS
-    " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal"
+    " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal -weak_framework IOKit"
   )
   # To suppress MPSGraph availability warnings
   append_cxx_flag_if_supported("-Wno-unguarded-availability-new"
diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 87c820430c98..9b5847710497 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -55,6 +55,17 @@ class TORCH_API MPSDevice {
    */
   bool isMacOS13Plus(MacOSVersion version) const;
 
+  /**
+   * Returns the device name.
+   */
+  std::string getName() const;
+
+  /**
+   * Returns the number of GPU cores.
+   * 1 core = 16 Execution Units x 8 ALUs x 24 threads
+   */
+  unsigned getCoreCount() const;
+
   ~MPSDevice();
 
  private:
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index 72a066c69450..5a37490c0240 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -85,10 +85,36 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
   }
 }
 
+std::string MPSDevice::getName() const {
+  @autoreleasepool {
+    return [[_mtl_device name] UTF8String];
+  }
+}
+
+unsigned MPSDevice::getCoreCount() const {
+  io_iterator_t iterator = 0;
+  io_registry_entry_t entry = 0;
+  int core_count = 0;
+  auto matchingDict = IOServiceMatching("AGXAccelerator");
+  TORCH_INTERNAL_ASSERT(matchingDict, "Failed to create matching dict");
+  const auto status = IOServiceGetMatchingServices(kIOMainPortDefault, matchingDict, &iterator);
+  TORCH_INTERNAL_ASSERT(status == KERN_SUCCESS);
+  while ((entry = IOIteratorNext(iterator)) != 0) {
+    auto property = IORegistryEntryCreateCFProperty(entry, CFSTR("gpu-core-count"), kCFAllocatorDefault, 0);
+    auto found = CFNumberGetValue(static_cast<CFNumberRef>(property), kCFNumberIntType, &core_count);
+    CFRelease(property);
+    IOObjectRelease(entry);
+    if (found) {
+      break;
+    }
+  }
+  IOObjectRelease(iterator);
+  return core_count;
+}
+
 at::Allocator* GetMPSAllocator(bool useSharedAllocator) {
   return getIMPSAllocator(useSharedAllocator);
 }
-
 bool is_available() {
   return MPSDevice::getInstance()->device() != nil;
 }
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index d03650a85f37..94567b08a5cc 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1979,7 +1979,9 @@ def _mtia_resetPeakMemoryStats(device: _int) -> None: ...
 
 # Defined in torch/csrc/mps/Module.cpp
 def _mps_deviceSynchronize() -> None: ...
+def _mps_get_core_count() -> _int: ...
 def _mps_get_default_generator() -> Generator: ...
+def _mps_get_name() -> _str: ...
 def _mps_emptyCache() -> None: ...
 def _mps_setMemoryFraction(fraction: _float) -> None: ...
 def _mps_currentAllocatedMemory() -> _int: ...
diff --git a/torch/_dynamo/device_interface.py b/torch/_dynamo/device_interface.py
index 9ea53c900b05..26cf4796fd07 100644
--- a/torch/_dynamo/device_interface.py
+++ b/torch/_dynamo/device_interface.py
@@ -17,6 +17,7 @@ specialized implementations for each hardware backend's unique features.
 
 import inspect
 import time
+from collections import namedtuple
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, Callable, Literal, Optional, Union
@@ -544,8 +545,10 @@ class MpsInterface(DeviceInterface):
 
     class Worker:
         @staticmethod
-        def get_device_properties(device: torch.types.Device = None) -> dict[str, Any]:
-            return {}
+        def get_device_properties(device: torch.types.Device = None) -> Any:
+            return namedtuple("MPSProperties", ["multi_processor_count"])(
+                torch.backends.mps.get_core_count()  # type: ignore[arg-type]
+            )
 
         @staticmethod
         def current_device() -> int:
diff --git a/torch/_inductor/runtime/hints.py b/torch/_inductor/runtime/hints.py
index 2732b9cecfb2..15b86b1b3d1a 100644
--- a/torch/_inductor/runtime/hints.py
+++ b/torch/_inductor/runtime/hints.py
@@ -153,9 +153,6 @@ class DeviceProperties(typing.NamedTuple):
         except AttributeError:
             if device_type == "xpu":
                 multi_processor_count = props.gpu_subslice_count
-            elif device_type == "mps":
-                # TODO: Fetch the actual value from ioreg
-                multi_processor_count = 8
             elif device_type == "mtia":
                 multi_processor_count = 64
             else:
diff --git a/torch/backends/mps/__init__.py b/torch/backends/mps/__init__.py
index 2fe445bfcb0e..5c3c507428cf 100644
--- a/torch/backends/mps/__init__.py
+++ b/torch/backends/mps/__init__.py
@@ -5,7 +5,14 @@
 import torch
 from torch.library import Library as _Library
 
-__all__ = ["is_built", "is_available", "is_macos13_or_newer", "is_macos_or_newer"]
+__all__ = [
+    "get_core_count",
+    "get_name",
+    "is_built",
+    "is_available",
+    "is_macos13_or_newer",
+    "is_macos_or_newer",
+]
 
 
 def is_built() -> bool:
@@ -36,6 +43,23 @@ def is_macos13_or_newer(minor: int = 0) -> bool:
     return torch._C._mps_is_on_macos_or_newer(13, minor)
 
 
+@_lru_cache
+def get_name() -> str:
+    r"""Return the Metal device name."""
+    return torch._C._mps_get_name()
+
+
+@_lru_cache
+def get_core_count() -> int:
+    r"""Return the GPU core count.
+
+    According to Apple's documentation, one core comprises 16 Execution Units,
+    each Execution Unit has 8 ALUs, and each ALU can run 24 threads, i.e. one
+    core can execute up to 3072 threads concurrently.
+    """
+    return torch._C._mps_get_core_count()
+
+
 _lib: Optional[_Library] = None
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 1aa8a8b6df8a..8a8bd79653c8 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/torch/csrc/mps/Module.cpp b/torch/csrc/mps/Module.cpp
index 95263f108c82..51c77aba6d76 100644
--- a/torch/csrc/mps/Module.cpp
+++ b/torch/csrc/mps/Module.cpp
@@ -501,6 +501,12 @@ void initModule(PyObject* module) {
         at::mps::getMPSProfiler().startCapture(fileName);
       });
   m.def("_mps_stopCapture", []() { at::mps::getMPSProfiler().stopCapture(); });
+  m.def("_mps_get_name", []() {
+    return at::mps::MPSDevice::getInstance()->getName();
+  });
+  m.def("_mps_get_core_count", []() {
+    return at::mps::MPSDevice::getInstance()->getCoreCount();
+  });
 }
 
 #endif /* USE_MPS */
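For illustration, a minimal sketch (not part of the patch) of how a launch
heuristic could consume the new query; the helper name, the
threads-per-group default, and the fallback core count are all hypothetical:

```python
import torch


def suggested_threadgroups(threads_per_group: int = 1024) -> int:
    # Hypothetical fallback when MPS (and hence the IOKit query) is
    # unavailable; not a value taken from the patch.
    cores = 8
    if torch.backends.mps.is_available():
        cores = torch.backends.mps.get_core_count()
    # Per the comment in MPSDevice.h, one core can execute up to
    # 16 EU x 8 ALU x 24 threads = 3072 threads concurrently.
    max_concurrent_threads = cores * 3072
    return max(1, max_concurrent_threads // threads_per_group)


print(suggested_threadgroups())  # e.g. 48 on an M1 Pro with 16 cores
```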