mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156077 Approved by: https://github.com/Skylion007, https://github.com/malfet ghstack dependencies: #156069
295 lines
6.0 KiB
YAML
295 lines
6.0 KiB
YAML
# Some models have large datasets that don't fit in memory. Lower the batch
|
|
# size to test the accuracy.
|
|
batch_size:
|
|
training:
|
|
demucs: 4
|
|
dlrm: 1024
|
|
densenet121: 4
|
|
hf_Reformer: 4
|
|
hf_T5_base: 4
|
|
timm_efficientdet: 1
|
|
llama_v2_7b_16h: 1
|
|
# reduced from 16 due to cudagraphs OOM in TorchInductor dashboard
|
|
yolov3: 8
|
|
|
|
inference:
|
|
timm_efficientdet: 32
|
|
|
|
|
|
dont_change_batch_size:
|
|
- demucs
|
|
- pytorch_struct
|
|
- pyhpc_turbulent_kinetic_energy
|
|
# https://github.com/pytorch/benchmark/pull/1656
|
|
- vision_maskrcnn
|
|
|
|
|
|
tolerance:
|
|
# Need higher tolerance on GPU. GPU kernels are non-deterministic for these models.
|
|
higher:
|
|
- alexnet
|
|
- attention_is_all_you_need_pytorch
|
|
- densenet121
|
|
- hf_Albert
|
|
- vgg16
|
|
- mobilenet_v3_large
|
|
- nvidia_deeprecommender
|
|
|
|
# These models need >1e-3 tolerance
|
|
even_higher:
|
|
- soft_actor_critic
|
|
- tacotron2
|
|
- yolov3
|
|
- timm_efficientdet
|
|
- timm_efficientnet
|
|
- squeezenet1_1
|
|
|
|
higher_fp16:
|
|
- doctr_reco_predictor
|
|
- drq
|
|
- hf_Whisper
|
|
- phlippe_resnet
|
|
|
|
higher_bf16:
|
|
- doctr_reco_predictor
|
|
- drq
|
|
- hf_Whisper
|
|
|
|
freezing:
|
|
# Similar logic to timm_models.py:get_tolerance_and_cosine_flag
|
|
# the conv-batchnorm fusion used under freezing may cause relatively
|
|
# large numerical difference. We need a larger tolerance.
|
|
# Check https://github.com/pytorch/pytorch/issues/120545 for context
|
|
even_higher:
|
|
- mobilenet_v2
|
|
|
|
cosine: []
|
|
|
|
require_larger_multiplier_for_smaller_tensor:
|
|
- yolov3
|
|
- timm_efficientnet
|
|
|
|
# These benchmarks took >600s on an i9-11900K CPU
|
|
very_slow: &VERY_SLOW_MODELS
|
|
# 3339s
|
|
- hf_BigBird
|
|
# 3062s
|
|
- hf_Longformer
|
|
# 930s
|
|
- hf_T5
|
|
|
|
|
|
# These benchmarks took >60s on an i9-11900K CPU
|
|
slow:
|
|
- *VERY_SLOW_MODELS
|
|
# 137s
|
|
- BERT_pytorch
|
|
# 116s
|
|
- demucs
|
|
# 242s
|
|
- fastNLP_Bert
|
|
# 221s
|
|
- hf_Albert
|
|
# 400s
|
|
- hf_Bart
|
|
# 334s
|
|
- hf_Bert
|
|
# 187s
|
|
- hf_DistilBert
|
|
# 470s
|
|
- hf_GPT2
|
|
# 141s
|
|
- hf_Reformer
|
|
# 317s
|
|
- speech_transformer
|
|
# 99s
|
|
- vision_maskrcnn
|
|
|
|
|
|
non_deterministic:
|
|
# https://github.com/pytorch/pytorch/issues/98355
|
|
- mobilenet_v3_large
|
|
- sam_fast
|
|
|
|
|
|
disable_cudagraph:
|
|
# tts_angular is flaky with cudagraphs. Its speedup
|
|
# oscillates from .05 to 1.05
|
|
- tts_angular
|
|
|
|
|
|
dtype:
|
|
force_amp_for_fp16_bf16_models:
|
|
- DALLE2_pytorch
|
|
- doctr_det_predictor
|
|
- doctr_reco_predictor
|
|
- Super_SloMo
|
|
- tts_angular
|
|
- pyhpc_turbulent_kinetic_energy
|
|
- detectron2_fcos_r_50_fpn
|
|
|
|
force_fp16_for_bf16_models:
|
|
- vision_maskrcnn
|
|
|
|
|
|
# models in canary_models that we should run anyway
|
|
canary_models:
|
|
- torchrec_dlrm
|
|
|
|
|
|
detectron2_models: &DETECTRON2_MODELS
|
|
- detectron2_fasterrcnn_r_101_c4
|
|
- detectron2_fasterrcnn_r_101_dc5
|
|
- detectron2_fasterrcnn_r_101_fpn
|
|
- detectron2_fasterrcnn_r_50_c4
|
|
- detectron2_fasterrcnn_r_50_dc5
|
|
- detectron2_fasterrcnn_r_50_fpn
|
|
- detectron2_maskrcnn_r_101_c4
|
|
- detectron2_maskrcnn_r_101_fpn
|
|
- detectron2_maskrcnn_r_50_fpn
|
|
|
|
|
|
# These models support only train mode. So accuracy checking can't be done in
|
|
# eval mode.
|
|
only_training:
|
|
- *DETECTRON2_MODELS
|
|
- tts_angular
|
|
- tacotron2
|
|
- demucs
|
|
- hf_Reformer
|
|
- pytorch_struct
|
|
- yolov3
|
|
|
|
|
|
trt_not_yet_working:
|
|
- alexnet
|
|
- resnet18
|
|
- resnet50
|
|
- mobilenet_v2
|
|
- mnasnet1_0
|
|
- squeezenet1_1
|
|
- shufflenetv2_x1_0
|
|
- vgg16
|
|
- resnext50_32x4d
|
|
|
|
|
|
skip:
|
|
all:
|
|
# OOMs (A100 40G)
|
|
- detectron2_maskrcnn
|
|
# TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
|
|
- tacotron2
|
|
# Failing in eager mode
|
|
- hf_clip
|
|
# multi gpu not always available in benchmark runners
|
|
- simple_gpt_tp_manual
|
|
|
|
device:
|
|
cpu:
|
|
# OOMs
|
|
- hf_T5_generate
|
|
# model is CUDA only
|
|
- cm3leon_generate
|
|
# timeout
|
|
- nanogpt
|
|
# timeout
|
|
- sam
|
|
# model is CUDA only
|
|
- sam_fast
|
|
# model is CUDA only
|
|
- llama_v2_7b_16h
|
|
# flaky
|
|
- stable_diffusion
|
|
# requires FBGEMM, CUDA only
|
|
- torchrec_dlrm
|
|
- simple_gpt
|
|
# works on cuda, accuracy failure on cpu
|
|
- hf_Whisper
|
|
- stable_diffusion_text_encoder
|
|
- llava
|
|
- moco
|
|
|
|
cuda: []
|
|
|
|
test:
|
|
training:
|
|
- *DETECTRON2_MODELS
|
|
# not designed for training
|
|
- pyhpc_equation_of_state
|
|
- pyhpc_isoneutral_mixing
|
|
- pyhpc_turbulent_kinetic_energy
|
|
- maml
|
|
- llama
|
|
- llama_v2_7b_16h
|
|
- simple_gpt
|
|
- sam_fast
|
|
# Model's DEFAULT_TRAIN_BSIZE is not implemented
|
|
- cm3leon_generate
|
|
- hf_T5_generate
|
|
- doctr_det_predictor
|
|
- doctr_reco_predictor
|
|
- moondream
|
|
# doesn't fit in memory
|
|
- phi_1_5
|
|
- detectron2_fcos_r_50_fpn
|
|
|
|
control_flow:
|
|
- cm3leon_generate
|
|
- detectron2_fcos_r_50_fpn
|
|
- fastNLP_Bert
|
|
- hf_Longformer
|
|
- hf_Reformer
|
|
- hf_T5_generate
|
|
- opacus_cifar10
|
|
- speech_transformer
|
|
|
|
export_not_supported:
|
|
- doctr_reco_predictor
|
|
- doctr_det_predictor
|
|
- drq
|
|
- llama
|
|
- sam_fast
|
|
- soft_actor_critic
|
|
- timm_efficientdet
|
|
- vision_maskrcnn
|
|
|
|
# Models that should only run in --multiprocess mode
|
|
multiprocess:
|
|
- simple_gpt
|
|
|
|
# for these models, conv-batchnorm fusing causes big numerical churn.
|
|
# Skip them
|
|
# mnasnet1_0 and shufflenet_v2_x1_0 can pass on cpu; moco is cuda only.
|
|
freezing:
|
|
cuda:
|
|
- mnasnet1_0
|
|
- moco
|
|
- shufflenet_v2_x1_0
|
|
cpu: []
|
|
|
|
|
|
|
|
|
|
accuracy:
|
|
skip:
|
|
large_models:
|
|
# Models too large to have eager, dynamo and fp64_numbers simultaneously
|
|
# even for a 40 GB machine. We have tested accuracy for smaller versions of
|
|
# these models
|
|
- hf_GPT2_large
|
|
- hf_T5_large
|
|
- timm_vision_transformer_large
|
|
# accuracy https://github.com/pytorch/pytorch/issues/93847
|
|
- maml
|
|
- llama_v2_7b_16h
|
|
- Background_Matting
|
|
- stable_diffusion_unet
|
|
eager_not_deterministic:
|
|
# Models for which deterministic algorithms cannot be turned on in eager mode.
|
|
- Background_Matting
|
|
- pytorch_unet
|
|
|
|
max_batch_size:
|
|
hf_GPT2: 2
|
|
pytorch_unet: 2
|