Compare commits

...

14 Commits

Author SHA1 Message Date
34e28963d0 [no ci] Turn off TORCHINDUCTOR_GRAPH_PARTITION
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-25 21:43:47 -07:00
f0fe616cd5 [no ci] Reinstall detectron2
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-25 13:01:53 -07:00
00d3694059 [no ci] Some models are not fit to run on CI
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-25 11:02:08 -07:00
34e8518bfc [no ci] Rerun with all LLM models
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:52:25 -07:00
f94ebe2599 Merge branch 'main' into prepare-perf-number-2.9
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-24 20:51:31 -07:00
e0ba333f4f [no ci] RC4
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-23 23:53:11 -07:00
400d1e9777 Skip LLM training and increase the number of H100 shard for HF
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-23 04:19:32 -07:00
52d9bd3c93 [no ci] Silly mistake
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-22 23:03:01 -07:00
aa22b4fa50 2.9 RC3
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-22 17:45:51 -07:00
c969b47090 [no ci] Rebuild torchrec and fbgemm
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-19 22:18:05 -07:00
9218a4716e Does this work?
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-19 03:33:40 -07:00
a22515f573 Add a debug
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-19 03:30:27 -07:00
7e16093692 [no ci ] Merge branch 'main' into prepare-perf-baseline-number-2.8
2025-09-19 03:27:42 -07:00
d4456bde3b [no ci] Run TorchInductor benchmark on PyTorch 2.8
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-18 17:18:13 -07:00
5 changed files with 40 additions and 8 deletions

View File

@@ -826,6 +826,24 @@ test_dynamo_benchmark() {
   local shard_id="$1"
   shift

+  ### Perf benchmark 2.9 RC4, need to reinstall detectron2 to avoid crashing when importing it
+  pip_uninstall torch torchvision torchaudio torchrec fbgemm-gpu triton pytorch-triton detectron2
+  pip_install torch==2.9.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+
+  # Rebuild torchrec and fbgemm because they don't have RC for 2.9 yet
+  if [[ "${TEST_CONFIG}" == *torchbench* ]] && [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    rm -rf dist/torchrec
+    rm -rf dist/fbgemm_gpu
+    install_torchrec_and_fbgemm
+  fi
+
+  # Same pinned commit as used in TorchBench
+  pip_install git+https://github.com/facebookresearch/detectron2.git@0df2d73d0013db7de629602c23cc120219b4f2b8
+  pip freeze
+
+  # Control TORCHINDUCTOR_GRAPH_PARTITION
+  export TORCHINDUCTOR_GRAPH_PARTITION=0
+
   if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
     test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
   elif [[ "${TEST_CONFIG}" == *perf* ]]; then

View File

@@ -100,11 +100,12 @@ jobs:
       cuda-arch-list: '9.0'
       test-matrix: |
         { include: [
-          { config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
-          { config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
-          { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
-          { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
-          { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
           { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
           { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
           { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },

View File

@@ -3355,7 +3355,7 @@ def parse_args(args=None):
     parser.add_argument(
         "--timeout",
         type=int,
-        default=2000,
+        default=3600,
         help="timeout (second) for benchmarking.",
     )

View File

@@ -373,6 +373,10 @@ class HuggingfaceRunner(BenchmarkRunner):
     def skip_models_due_to_control_flow(self):
         return self._skip["control_flow"]

+    @property
+    def skip_not_suitable_for_training_models(self):
+        return self._skip["test"]["training"]
+
     def use_larger_multiplier_for_smaller_tensor(self, name):
         return name in [
             "ElectraForQuestionAnswering",

View File

@@ -9,10 +9,9 @@ skip:
     # Fails with even batch size = 1
     - GPTJForCausalLM
    - GPTJForQuestionAnswering
-    # Model too big
+    # Model too big or the benchmark is taking too long (timeout)
     - google/gemma-3-4b-it
     - openai/gpt-oss-20b
-    - mistralai/Mistral-7B-Instruct-v0.3

   device:
     cpu:
@@ -27,6 +26,16 @@ skip:
   control_flow:
     - AllenaiLongformerBase

+  test:
+    training:
+      - meta-llama/Llama-3.2-1B
+      - google/gemma-2-2b
+      - google/gemma-3-4b-it
+      - openai/whisper-tiny
+      - Qwen/Qwen3-0.6B
+      - mistralai/Mistral-7B-Instruct-v0.3
+      - openai/gpt-oss-20b
+
 batch_size:
   # TODO - Fails even after fake tensors
   divisors:
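To close the loop on the config change, a minimal sketch of reading the new skip -> test -> training list back out; it assumes PyYAML is available, and the path benchmarks/dynamo/huggingface.yaml is an assumption about where this Hugging Face benchmark config lives.

# Minimal read-back sketch, not part of the PR; the file path is an assumption.
import yaml

with open("benchmarks/dynamo/huggingface.yaml") as f:
    cfg = yaml.safe_load(f)

# These are the models the runner reports via skip_not_suitable_for_training_models,
# i.e. they are skipped only when the benchmark runs in training mode.
training_skips = set(cfg["skip"]["test"]["training"])
print(sorted(training_skips))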