Install torchrec/fbgemm from source in CI (#106808)

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/106808
Approved by: https://github.com/malfet, https://github.com/xuzhao9
This commit is contained in:
Edward Z. Yang
2023-08-11 12:47:51 -04:00
committed by PyTorch MergeBot
parent 9858edd99f
commit 5b04e9b6ce
10 changed files with 42 additions and 12 deletions

View File

@ -158,6 +158,19 @@ function install_torchvision() {
fi
}
# Swap the torchrec / fbgemm-gpu nightly packages for source installs taken
# from GitHub at the commits reported by get_pinned_commit.
function install_torchrec_and_fbgemm() {
  local torchrec_sha fbgemm_sha nightly_pkg
  torchrec_sha=$(get_pinned_commit torchrec)
  fbgemm_sha=$(get_pinned_commit fbgemm)

  # Drop the nightly packages before installing from source.
  for nightly_pkg in torchrec-nightly fbgemm-gpu-nightly; do
    pip_uninstall "${nightly_pkg}"
  done

  # Extra packages installed ahead of the two source installs.
  pip_install setuptools-git-versioning scikit-build pyre-extensions

  # Flags shared by both source installs.
  local -a source_flags=(--no-use-pep517 --user)
  local fbgemm_url="git+https://github.com/pytorch/FBGEMM.git@${fbgemm_sha}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
  local torchrec_url="git+https://github.com/pytorch/torchrec.git@${torchrec_sha}"

  # See https://github.com/pytorch/pytorch/issues/106971
  # CUDA_PATH is set only for the duration of the FBGEMM install.
  CUDA_PATH=/usr/local/cuda-12.1 pip_install "${source_flags[@]}" "${fbgemm_url}"
  pip_install "${source_flags[@]}" "${torchrec_url}"
}
function install_numpy_pytorch_interop() {
local commit
commit=$(get_pinned_commit numpy_pytorch_interop)

View File

@ -1028,6 +1028,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then

1
.github/ci_commit_pins/fbgemm.txt vendored Normal file
View File

@ -0,0 +1 @@
1b2746f642cc2c99fe9d1a0c34359c0de45341c2

View File

@ -1 +1 @@
8a0f5e3678bef55148743ab987baa3c89f8dfb5e
9371b9e13c826f3930e54346b4d619cb59182f68

1
.github/ci_commit_pins/torchrec.txt vendored Normal file
View File

@ -0,0 +1 @@
6cd9fd362514d14ebb9ed51314c62ac1e1e2bbf2

View File

@ -12,6 +12,7 @@ basic_gnn_sage,pass,0
clip,pass,0
cm3leon_generate,pass,6
dcgan,pass,0
dlrm,pass,0
doctr_det_predictor,pass,2
doctr_reco_predictor,fail_accuracy,4
drq,pass,0

1 name accuracy graph_breaks
12 clip pass 0
13 cm3leon_generate pass 6
14 dcgan pass 0
15 dlrm pass 0
16 doctr_det_predictor pass 2
17 doctr_reco_predictor fail_accuracy 4
18 drq pass 0

View File

@ -1,4 +1,5 @@
name,accuracy,graph_breaks
torchrec_dlrm,infra_error,0
BERT_pytorch,pass,8
LearningToPaint,pass,8
Super_SloMo,pass,8
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
basic_gnn_sage,pass,8
clip,pass,8
dcgan,pass,8
dlrm,pass,8
drq,pass,7
fastNLP_Bert,pass,12
functorch_dp_cifar10,pass,8

1 name accuracy graph_breaks
2 torchrec_dlrm infra_error 0
3 BERT_pytorch pass 8
4 LearningToPaint pass 8
5 Super_SloMo pass 8
11 basic_gnn_sage pass 8
12 clip pass 8
13 dcgan pass 8
14 dlrm pass 8
15 drq pass 7
16 fastNLP_Bert pass 12
17 functorch_dp_cifar10 pass 8

View File

@ -1,4 +1,5 @@
name,accuracy,graph_breaks
torchrec_dlrm,pass,8
BERT_pytorch,pass,8
LearningToPaint,pass,8
Super_SloMo,pass,8
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
basic_gnn_sage,pass,8
clip,pass,8
dcgan,pass,8
dlrm,pass,8
drq,pass,7
fastNLP_Bert,pass,12
functorch_dp_cifar10,pass,8

1 name accuracy graph_breaks
2 torchrec_dlrm pass 8
3 BERT_pytorch pass 8
4 LearningToPaint pass 8
5 Super_SloMo pass 8
11 basic_gnn_sage pass 8
12 clip pass 8
13 dcgan pass 8
14 dlrm pass 8
15 drq pass 7
16 fastNLP_Bert pass 12
17 functorch_dp_cifar10 pass 8

View File

@ -95,8 +95,6 @@ CI_SKIP[CI("eager", training=False)] = [
"hf_BigBird", # fail_accuracy
# TypeError: pad_center() takes 1 positional argument but 2 were given
"tacotron2",
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
"torchrec_dlrm",
# Huggingface
"DebertaV2ForQuestionAnswering", # OOM
]
@ -170,8 +168,6 @@ CI_SKIP[CI("aot_eager", training=True)] = [
CI_SKIP[CI("inductor", training=False)] = [
# TorchBench
"DALLE2_pytorch", # AttributeError: text_encodings
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
"torchrec_dlrm",
"demucs", # OOM
"detectron2_fasterrcnn_r_101_c4",
"detectron2_fasterrcnn_r_101_dc5",
@ -223,8 +219,6 @@ CI_SKIP[CI("inductor", training=False, device="cpu")] = [
"pyhpc_turbulent_kinetic_energy",
"resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double)
"sage", # does not work with fp32
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
"torchrec_dlrm",
# Huggingface
"MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793
"PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794
@ -239,7 +233,6 @@ CI_SKIP[CI("inductor", training=True)] = [
*CI_SKIP[CI("inductor", training=False)],
# TorchBench
"Background_Matting", # fp64_OOM
"dlrm", # Fails on CI - unable to repro locally
"hf_T5_base", # accuracy
"mobilenet_v3_large", # accuracy
"resnet50_quantized_qat", # Eager model failed to run
@ -260,7 +253,6 @@ CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [
*CI_SKIP[CI("aot_eager", training=False)],
"vision_maskrcnn", # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093
# https://github.com/pytorch/pytorch/issues/103760
"dlrm",
"hf_T5_generate",
"hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4)
]
@ -269,6 +261,7 @@ CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
*CI_SKIP[CI("aot_eager", training=True)],
*CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
"llama", # AssertionError: cannot compute free_symbols of True
"torchrec_dlrm", # RuntimeError: mat1 and mat2 must have the same dtype, but got Float and BFloat16
]
CI_SKIP[CI("inductor", training=False, dynamic=True)] = [
@ -296,8 +289,6 @@ CI_SKIP_OPTIMIZER = {
# TIMM
"convmixer_768_32", # accuracy
"hrnet_w18", # Stack issue in fx
# TorchBench
"dlrm", # symbolic shapes error
# HF
"pnasnet5large", # Stack issue in fx
"MobileBertForMaskedLM", # Stack issue in fx
@ -311,6 +302,7 @@ CI_SKIP_DYNAMIC_BATCH_ONLY = {
# It iterates over the batch, which is dynamic, and dynamo chokes
# We should be able to graphbreak there.
"doctr_det_predictor",
"dlrm",
}

View File

@ -90,6 +90,7 @@ SKIP_FOR_CPU = {
"sam", # timeout
"llama_v2_7b_16h", # model is CUDA only
"stable_diffusion", # flaky
"torchrec_dlrm", # requires FBGEMM, CUDA only
}
SKIP_FOR_CUDA = {
@ -228,6 +229,11 @@ FORCE_AMP_FOR_FP16_BF16_MODELS = {
"tts_angular",
}
# Models from torchbench's canary model list that we should run anyway.
# iter_model_names adds a path from _list_canary_model_paths() to the regular
# model list only when the path's basename appears in this set.
CANARY_MODELS = {
"torchrec_dlrm",
}
class TorchBenchmarkRunner(BenchmarkRunner):
def __init__(self):
@ -394,9 +400,16 @@ class TorchBenchmarkRunner(BenchmarkRunner):
return device, benchmark.name, model, example_inputs, batch_size
def iter_model_names(self, args):
from torchbenchmark import _list_model_paths
from torchbenchmark import _list_canary_model_paths, _list_model_paths
models = _list_model_paths()
models += [
f
for f in _list_canary_model_paths()
if os.path.basename(f) in CANARY_MODELS
]
models.sort()
start, end = self.get_benchmark_indices(len(models))
for index, model_path in enumerate(models):
if index < start or index >= end: