# Per-model configuration for the benchmark suite: batch-size overrides,
# accuracy tolerances, slow/flaky model lists, dtype overrides and skip lists.
#
# NOTE: several lists below splice in another list through a YAML alias
# (e.g. `- *DETECTRON2_MODELS`). An alias used as a sequence item yields a
# nested list, so the consumer is presumably expected to flatten these lists;
# verify against the loader before adding new aliases.

# Some models have large datasets that don't fit in memory. Lower the batch
# size to test the accuracy.
batch_size:
  training:
    demucs: 4
    dlrm: 1024
    densenet121: 4
    timm_efficientdet: 1
    llama_v2_7b_16h: 1
    # reduced from 16 due to cudagraphs OOM in TorchInductor dashboard
    yolov3: 8

  inference:
    timm_efficientdet: 32

dont_change_batch_size:
  - demucs
  - pytorch_struct
  - pyhpc_turbulent_kinetic_energy
  # https://github.com/pytorch/benchmark/pull/1656
  - vision_maskrcnn

tolerance:
  # Need a higher tolerance on GPU. GPU kernels are non-deterministic for
  # these models.
  higher:
    - alexnet
    - attention_is_all_you_need_pytorch
    - densenet121
    - vgg16
    - mobilenet_v3_large
    - nvidia_deeprecommender

  # These models need >1e-3 tolerance
  even_higher:
    - soft_actor_critic
    - tacotron2
    - yolov3
    - squeezenet1_1

  higher_fp16:
    - doctr_reco_predictor
    - drq
    - phlippe_resnet

  higher_bf16:
    - doctr_reco_predictor
    - drq

  # These models need higher tolerance for xpu devices with bf16
  higher_bf16_xpu:
    - squeezenet1_1

  freezing:
    # Similar logic to timm_models.py:get_tolerance_and_cosine_flag
    # The conv-batchnorm fusion used under freezing may cause a relatively
    # large numerical difference. We need a larger tolerance.
    # Check https://github.com/pytorch/pytorch/issues/120545 for context
    even_higher:
      - mobilenet_v2

  cosine: []

require_larger_multiplier_for_smaller_tensor:
  - yolov3

# These benchmarks took >600s on an i9-11900K CPU
# Currently empty. Written as an explicit empty list (like the other empty
# lists in this file) so the alias in `slow` resolves to a list, not null.
very_slow: &VERY_SLOW_MODELS []

# These benchmarks took >60s on an i9-11900K CPU
slow:
  - *VERY_SLOW_MODELS
  # 137s
  - BERT_pytorch
  # 116s
  - demucs
  # 242s
  - fastNLP_Bert
  # 317s
  - speech_transformer
  # 99s
  - vision_maskrcnn

non_deterministic:
  # https://github.com/pytorch/pytorch/issues/98355
  - mobilenet_v3_large
  - sam_fast

disable_cudagraph:
  # tts_angular is flaky with cudagraphs. Its speedup
  # oscillates from .05 to 1.05
  - tts_angular

dtype:
  force_amp_for_fp16_bf16_models:
    - DALLE2_pytorch
    - doctr_det_predictor
    - doctr_reco_predictor
    - Super_SloMo
    - tts_angular
    - pyhpc_turbulent_kinetic_energy
    - detectron2_fcos_r_50_fpn

  force_fp16_for_bf16_models:
    - vision_maskrcnn

# models in canary_models that we should run anyway
canary_models:
  - torchrec_dlrm

# Anchored so the same list can be reused below via *DETECTRON2_MODELS
# (in `only_training` and `skip.test.training`).
detectron2_models: &DETECTRON2_MODELS
  - detectron2_fasterrcnn_r_101_c4
  - detectron2_fasterrcnn_r_101_dc5
  - detectron2_fasterrcnn_r_101_fpn
  - detectron2_fasterrcnn_r_50_c4
  - detectron2_fasterrcnn_r_50_dc5
  - detectron2_fasterrcnn_r_50_fpn
  - detectron2_maskrcnn_r_101_c4
  - detectron2_maskrcnn_r_101_fpn
  - detectron2_maskrcnn_r_50_fpn

# These models support only train mode. So accuracy checking can't be done in
# eval mode.
only_training:
  - *DETECTRON2_MODELS
  - tts_angular
  - tacotron2
  - demucs
  - hf_Reformer
  - pytorch_struct
  - yolov3

trt_not_yet_working:
  - alexnet
  - resnet18
  - resnet50
  - mobilenet_v2
  - mnasnet1_0
  - squeezenet1_1
  - shufflenetv2_x1_0
  - vgg16
  - resnext50_32x4d

skip:
  all:
    # OOMs (A100 40G)
    - detectron2_maskrcnn
    # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
    - tacotron2
    # Failing in eager mode
    - hf_clip
    # multi gpu not always available in benchmark runners
    - simple_gpt_tp_manual
    # skip hf and timm models in torchbench since
    # there are already separate benchmarks for them
    - hf_Albert
    - hf_Bart
    - hf_Bert
    - hf_BigBird
    - hf_DistilBert
    - hf_GPT2
    - hf_Longformer
    - hf_Reformer
    - hf_T5
    - timm_efficientdet
    - timm_efficientnet
    - timm_nfnet
    - timm_regnet
    - timm_resnest
    - timm_vision_transformer
    - timm_vovnet
    - hf_Bert_large
    - hf_GPT2_large
    - hf_Roberta_base
    - hf_T5_base
    - hf_T5_generate
    - hf_T5_large
    - hf_Whisper
    - hf_distil_whisper
    - timm_vision_transformer_large

  device:
    cpu:
      # model is CUDA only
      - cm3leon_generate
      # timeout
      - nanogpt
      # timeout
      - sam
      # model is CUDA only
      - sam_fast
      # model is CUDA only
      - llama_v2_7b_16h
      # flaky
      - stable_diffusion
      # requires FBGEMM, CUDA only
      - torchrec_dlrm
      - simple_gpt
      # works on cuda, accuracy failure on cpu
      - stable_diffusion_text_encoder
      - llava
      - moco

    # Skip these additional models when running on aarch64
    cpu_aarch64: []

    cuda: []

  test:
    training:
      - *DETECTRON2_MODELS
      # not designed for training
      - pyhpc_equation_of_state
      - pyhpc_isoneutral_mixing
      - pyhpc_turbulent_kinetic_energy
      - maml
      - llama
      - llama_v2_7b_16h
      - simple_gpt
      - sam_fast
      # Model's DEFAULT_TRAIN_BSIZE is not implemented
      - cm3leon_generate
      - doctr_det_predictor
      - doctr_reco_predictor
      - moondream
      # doesn't fit in memory
      - phi_1_5
      - detectron2_fcos_r_50_fpn

  control_flow:
    - cm3leon_generate
    - detectron2_fcos_r_50_fpn
    - fastNLP_Bert
    - opacus_cifar10
    - speech_transformer

  export_not_supported:
    - doctr_reco_predictor
    - doctr_det_predictor
    - drq
    - llama
    - sam_fast
    - soft_actor_critic
    - timm_efficientdet
    - vision_maskrcnn

  # Models that should only run in --multiprocess mode
  multiprocess:
    - simple_gpt

  # For these models, conv-batchnorm fusing causes big numerical churn.
  # Skip them.
  # mnasnet1_0 and shufflenet_v2_x1_0 can pass on cpu, moco cuda only.
  freezing:
    cuda:
      - mnasnet1_0
      - moco
      - shufflenet_v2_x1_0
    cpu: []

accuracy:
  skip:
    large_models:
      # Models too large to have eager, dynamo and fp64_numbers simultaneously
      # even for a 40 GB machine. We have tested accuracy for smaller versions
      # of these models.
      # accuracy https://github.com/pytorch/pytorch/issues/93847
      - maml
      - llama_v2_7b_16h
      - Background_Matting
      - stable_diffusion_unet

    eager_not_deterministic:
      # Models for which deterministic algorithms cannot be turned on in
      # eager mode.
      - Background_Matting
      - pytorch_unet

  max_batch_size:
    pytorch_unet: 2