mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Install torchrec/fbgemm from source in CI (#106808)
Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/106808 Approved by: https://github.com/malfet, https://github.com/xuzhao9
This commit is contained in:
committed by
PyTorch MergeBot
parent
9858edd99f
commit
5b04e9b6ce
@ -158,6 +158,19 @@ function install_torchvision() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Install torchrec and FBGEMM (fbgemm-gpu) from their GitHub sources at the
# commits returned by get_pinned_commit, after uninstalling the nightly
# packages of both. Takes no arguments.
function install_torchrec_and_fbgemm() {
|
||||
# `local` and the assignment are kept on separate lines so the exit status
# of the command substitution is not replaced by the exit status of `local`.
local torchrec_commit
|
||||
torchrec_commit=$(get_pinned_commit torchrec)
|
||||
local fbgemm_commit
|
||||
fbgemm_commit=$(get_pinned_commit fbgemm)
|
||||
# Remove the nightly packages before installing the source builds below.
pip_uninstall torchrec-nightly
|
||||
pip_uninstall fbgemm-gpu-nightly
|
||||
# Installed ahead of the source builds; presumably build-time requirements
# of FBGEMM/torchrec -- TODO confirm.
pip_install setuptools-git-versioning scikit-build pyre-extensions
|
||||
# See https://github.com/pytorch/pytorch/issues/106971
|
||||
# CUDA_PATH is set only for this one command. The pip URL selects the pinned
# commit (@...) and the fbgemm_gpu subdirectory of the FBGEMM repository.
CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
|
||||
# torchrec is installed after FBGEMM, from the repository root at its pinned commit.
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
|
||||
}
|
||||
|
||||
function install_numpy_pytorch_interop() {
|
||||
local commit
|
||||
commit=$(get_pinned_commit numpy_pytorch_interop)
|
||||
|
@ -1028,6 +1028,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
|
||||
else
|
||||
checkout_install_torchbench
|
||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||
# nightlies that torchbench may pull in
|
||||
if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
|
||||
install_torchrec_and_fbgemm
|
||||
fi
|
||||
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
|
||||
|
1
.github/ci_commit_pins/fbgemm.txt
vendored
Normal file
1
.github/ci_commit_pins/fbgemm.txt
vendored
Normal file
@ -0,0 +1 @@
|
||||
1b2746f642cc2c99fe9d1a0c34359c0de45341c2
|
2
.github/ci_commit_pins/torchbench.txt
vendored
2
.github/ci_commit_pins/torchbench.txt
vendored
@ -1 +1 @@
|
||||
8a0f5e3678bef55148743ab987baa3c89f8dfb5e
|
||||
9371b9e13c826f3930e54346b4d619cb59182f68
|
||||
|
1
.github/ci_commit_pins/torchrec.txt
vendored
Normal file
1
.github/ci_commit_pins/torchrec.txt
vendored
Normal file
@ -0,0 +1 @@
|
||||
6cd9fd362514d14ebb9ed51314c62ac1e1e2bbf2
|
@ -12,6 +12,7 @@ basic_gnn_sage,pass,0
|
||||
clip,pass,0
|
||||
cm3leon_generate,pass,6
|
||||
dcgan,pass,0
|
||||
dlrm,pass,0
|
||||
doctr_det_predictor,pass,2
|
||||
doctr_reco_predictor,fail_accuracy,4
|
||||
drq,pass,0
|
||||
|
|
@ -1,4 +1,5 @@
|
||||
name,accuracy,graph_breaks
|
||||
torchrec_dlrm,infra_error,0
|
||||
BERT_pytorch,pass,8
|
||||
LearningToPaint,pass,8
|
||||
Super_SloMo,pass,8
|
||||
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
|
||||
basic_gnn_sage,pass,8
|
||||
clip,pass,8
|
||||
dcgan,pass,8
|
||||
dlrm,pass,8
|
||||
drq,pass,7
|
||||
fastNLP_Bert,pass,12
|
||||
functorch_dp_cifar10,pass,8
|
||||
|
|
@ -1,4 +1,5 @@
|
||||
name,accuracy,graph_breaks
|
||||
torchrec_dlrm,pass,8
|
||||
BERT_pytorch,pass,8
|
||||
LearningToPaint,pass,8
|
||||
Super_SloMo,pass,8
|
||||
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
|
||||
basic_gnn_sage,pass,8
|
||||
clip,pass,8
|
||||
dcgan,pass,8
|
||||
dlrm,pass,8
|
||||
drq,pass,7
|
||||
fastNLP_Bert,pass,12
|
||||
functorch_dp_cifar10,pass,8
|
||||
|
|
@ -95,8 +95,6 @@ CI_SKIP[CI("eager", training=False)] = [
|
||||
"hf_BigBird", # fail_accuracy
|
||||
# TypeError: pad_center() takes 1 positional argument but 2 were given
|
||||
"tacotron2",
|
||||
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
|
||||
"torchrec_dlrm",
|
||||
# Huggingface
|
||||
"DebertaV2ForQuestionAnswering", # OOM
|
||||
]
|
||||
@ -170,8 +168,6 @@ CI_SKIP[CI("aot_eager", training=True)] = [
|
||||
CI_SKIP[CI("inductor", training=False)] = [
|
||||
# TorchBench
|
||||
"DALLE2_pytorch", # AttributeError: text_encodings
|
||||
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
|
||||
"torchrec_dlrm",
|
||||
"demucs", # OOM
|
||||
"detectron2_fasterrcnn_r_101_c4",
|
||||
"detectron2_fasterrcnn_r_101_dc5",
|
||||
@ -223,8 +219,6 @@ CI_SKIP[CI("inductor", training=False, device="cpu")] = [
|
||||
"pyhpc_turbulent_kinetic_energy",
|
||||
"resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double)
|
||||
"sage", # does not work with fp32
|
||||
# torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
|
||||
"torchrec_dlrm",
|
||||
# Huggingface
|
||||
"MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793
|
||||
"PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794
|
||||
@ -239,7 +233,6 @@ CI_SKIP[CI("inductor", training=True)] = [
|
||||
*CI_SKIP[CI("inductor", training=False)],
|
||||
# TorchBench
|
||||
"Background_Matting", # fp64_OOM
|
||||
"dlrm", # Fails on CI - unable to repro locally
|
||||
"hf_T5_base", # accuracy
|
||||
"mobilenet_v3_large", # accuracy
|
||||
"resnet50_quantized_qat", # Eager model failed to run
|
||||
@ -260,7 +253,6 @@ CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [
|
||||
*CI_SKIP[CI("aot_eager", training=False)],
|
||||
"vision_maskrcnn", # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093
|
||||
# https://github.com/pytorch/pytorch/issues/103760
|
||||
"dlrm",
|
||||
"hf_T5_generate",
|
||||
"hf_Bert", # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4)
|
||||
]
|
||||
@ -269,6 +261,7 @@ CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
|
||||
*CI_SKIP[CI("aot_eager", training=True)],
|
||||
*CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
|
||||
"llama", # AssertionError: cannot compute free_symbols of True
|
||||
"torchrec_dlrm", # RuntimeError: mat1 and mat2 must have the same dtype, but got Float and BFloat16
|
||||
]
|
||||
|
||||
CI_SKIP[CI("inductor", training=False, dynamic=True)] = [
|
||||
@ -296,8 +289,6 @@ CI_SKIP_OPTIMIZER = {
|
||||
# TIMM
|
||||
"convmixer_768_32", # accuracy
|
||||
"hrnet_w18", # Stack issue in fx
|
||||
# TorchBench
|
||||
"dlrm", # symbolic shapes error
|
||||
# HF
|
||||
"pnasnet5large", # Stack issue in fx
|
||||
"MobileBertForMaskedLM", # Stack issue in fx
|
||||
@ -311,6 +302,7 @@ CI_SKIP_DYNAMIC_BATCH_ONLY = {
|
||||
# It iterates over the batch, which is dynamic, and dynamo chokes
|
||||
# We should be able to graphbreak there.
|
||||
"doctr_det_predictor",
|
||||
"dlrm",
|
||||
}
|
||||
|
||||
|
||||
|
@ -90,6 +90,7 @@ SKIP_FOR_CPU = {
|
||||
"sam", # timeout
|
||||
"llama_v2_7b_16h", # model is CUDA only
|
||||
"stable_diffusion", # flaky
|
||||
"torchrec_dlrm", # requires FBGEMM, CUDA only
|
||||
}
|
||||
|
||||
SKIP_FOR_CUDA = {
|
||||
@ -228,6 +229,11 @@ FORCE_AMP_FOR_FP16_BF16_MODELS = {
|
||||
"tts_angular",
|
||||
}
|
||||
|
||||
# Canary models (as listed by torchbenchmark's _list_canary_model_paths) that we should run anyway
|
||||
CANARY_MODELS = {
|
||||
"torchrec_dlrm",
|
||||
}
|
||||
|
||||
|
||||
class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
def __init__(self):
|
||||
@ -394,9 +400,16 @@ class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
return device, benchmark.name, model, example_inputs, batch_size
|
||||
|
||||
def iter_model_names(self, args):
|
||||
from torchbenchmark import _list_model_paths
|
||||
from torchbenchmark import _list_canary_model_paths, _list_model_paths
|
||||
|
||||
models = _list_model_paths()
|
||||
models += [
|
||||
f
|
||||
for f in _list_canary_model_paths()
|
||||
if os.path.basename(f) in CANARY_MODELS
|
||||
]
|
||||
models.sort()
|
||||
|
||||
start, end = self.get_benchmark_indices(len(models))
|
||||
for index, model_path in enumerate(models):
|
||||
if index < start or index >= end:
|
||||
|
Reference in New Issue
Block a user