Skip slow tests for aarch64-inductor-benchmarks (#158842)

This PR suggests adding some models to `cpu_skip_list` which are currently being run in TIMM and Torchbench.
The suggested models take a long time, which leads to the benchmark runs timing out. [benchmark runs for aarch64](https://github.com/pytorch/pytorch/actions/workflows/inductor-perf-test-nightly-aarch64.yml)

•	The issue stems from unoptimized groupwise convolution (BF16/F16 dtype) kernels for aarch64 platforms, which significantly slow down execution, leading to the timeout.
**Action:**
•	An optimized BF16 groupwise convolution kernel is currently being developed in oneDNN, targeted for release in Q4 2025.

To maintain dashboard consistency and signal clarity, I’ve skipped the affected tests in:
      * timm benchmarks
      * torchbench benchmarks

 As suggested, the skip is applied at the CPU-arch level, explicitly branching for aarch64 and adding the models which need to be skipped. This keeps the logic clean, but:
•	An alternative considered was increasing shard counts for aarch64 runners, but given the known performance bottleneck, skipping avoids wasted compute cycles. Suggestions around this will be appreciated.

Benchmark does not timeout after the suggested change: https://github.com/pytorch/pytorch/actions/runs/16447200138

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158842
Approved by: https://github.com/malfet
This commit is contained in:
Aditya Tewari
2025-07-24 00:21:33 +00:00
committed by PyTorch MergeBot
parent 0118931e27
commit 7001d6fbc9
6 changed files with 43 additions and 0 deletions

View File

@ -14,6 +14,7 @@ import itertools
import json import json
import logging import logging
import os import os
import platform
import random import random
import shutil import shutil
import signal import signal
@ -1763,6 +1764,10 @@ class BenchmarkRunner:
def skip_models_for_cpu(self): def skip_models_for_cpu(self):
return set() return set()
@property
def skip_models_for_cpu_aarch64(self):
return set()
@property @property
def skip_models_for_freezing_cpu(self): def skip_models_for_freezing_cpu(self):
return set() return set()
@ -3715,7 +3720,10 @@ def run(runner, args, original_dir=None):
runner.skip_models.update(runner.slow_models) runner.skip_models.update(runner.slow_models)
if args.devices == ["cpu"]: if args.devices == ["cpu"]:
arch = platform.machine()
runner.skip_models.update(runner.skip_models_for_cpu) runner.skip_models.update(runner.skip_models_for_cpu)
if arch == "aarch64":
runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
elif args.devices == ["cuda"]: elif args.devices == ["cuda"]:
runner.skip_models.update(runner.skip_models_for_cuda) runner.skip_models.update(runner.skip_models_for_cuda)

View File

@ -32,6 +32,7 @@ import io
import itertools import itertools
import logging import logging
import os import os
import platform
import re import re
import shutil import shutil
import subprocess import subprocess
@ -374,6 +375,7 @@ def get_skip_tests(suite, device, is_training: bool):
original_dir = abspath(os.getcwd()) original_dir = abspath(os.getcwd())
module = importlib.import_module(suite) module = importlib.import_module(suite)
os.chdir(original_dir) os.chdir(original_dir)
arch = platform.machine()
if suite == "torchbench": if suite == "torchbench":
skip_tests.update(module.TorchBenchmarkRunner().skip_models) skip_tests.update(module.TorchBenchmarkRunner().skip_models)
@ -383,6 +385,10 @@ def get_skip_tests(suite, device, is_training: bool):
) )
if device == "cpu": if device == "cpu":
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu) skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu)
if arch == "aarch64":
skip_tests.update(
module.TorchBenchmarkRunner().skip_models_for_cpu_aarch64
)
elif device == "cuda": elif device == "cuda":
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda) skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda)

View File

@ -230,6 +230,14 @@ class TimmRunner(BenchmarkRunner):
def _skip(self): def _skip(self):
return self._config["skip"] return self._config["skip"]
@property
def skip_models_for_cpu(self):
return self._skip["device"]["cpu"]
@property
def skip_models_for_cpu_aarch64(self):
return self._skip["device"]["cpu_aarch64"]
@property @property
def skip_models(self): def skip_models(self):
return self._skip["all"] return self._skip["all"]

View File

@ -2,3 +2,14 @@
skip: skip:
all: all:
- ~ - ~
device:
cpu:
- ~
# Skip these additional models when running on aarch64
cpu_aarch64:
# timeout on aarch64
- dm_nfnet_f0
- nfnet_l0
- resnest101e
- swsl_resnext101_32x16d
- visformer_small

View File

@ -138,6 +138,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
def skip_models_for_cpu(self): def skip_models_for_cpu(self):
return self._skip["device"]["cpu"] return self._skip["device"]["cpu"]
@property
def skip_models_for_cpu_aarch64(self):
return self._skip["device"]["cpu_aarch64"]
@property @property
def skip_models_for_cuda(self): def skip_models_for_cuda(self):
return self._skip["device"]["cuda"] return self._skip["device"]["cuda"]

View File

@ -213,6 +213,12 @@ skip:
- llava - llava
- moco - moco
# Skip these additional models when running on aarch64
cpu_aarch64:
# timeout on aarch64
- timm_regnet
- timm_nfnet
cuda: [] cuda: []
test: test: