Install torchrec/fbgemm from source in CI (#106808)

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106808
Approved by: https://github.com/malfet, https://github.com/xuzhao9
2025-10-20 21:14:14 +08:00 · 2023-08-11 12:47:51 -04:00
parent 9858edd99f
commit 5b04e9b6ce
10 changed files with 42 additions and 12 deletions
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -158,6 +158,19 @@ function install_torchvision() {
  fi
 }

+# Replace any torchrec/fbgemm nightly packages with source installs at the
+# commits returned by get_pinned_commit (presumably read from
+# .github/ci_commit_pins/{torchrec,fbgemm}.txt -- confirm against the helper).
+function install_torchrec_and_fbgemm() {
+  local torchrec_commit
+  torchrec_commit=$(get_pinned_commit torchrec)
+  local fbgemm_commit
+  fbgemm_commit=$(get_pinned_commit fbgemm)
+  # Remove the nightly packages before installing from source.
+  pip_uninstall torchrec-nightly
+  pip_uninstall fbgemm-gpu-nightly
+  # NOTE(review): presumably prerequisites of the source installs below -- confirm.
+  pip_install setuptools-git-versioning scikit-build pyre-extensions
+  # See https://github.com/pytorch/pytorch/issues/106971
+  # FBGEMM is installed first, from the fbgemm_gpu subdirectory of its repo at
+  # the pinned commit, with CUDA_PATH pointed at /usr/local/cuda-12.1.
+  CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
+  # torchrec is then installed from its repo at the pinned commit.
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+}
+
 function install_numpy_pytorch_interop() {
  local commit
  commit=$(get_pinned_commit numpy_pytorch_interop)
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1028,6 +1028,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
  else
    checkout_install_torchbench
+    # Do this after checkout_install_torchbench to ensure we clobber any
+    # nightlies that torchbench may pull in
+    if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
+      install_torchrec_and_fbgemm
+    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
--- a/.github/ci_commit_pins/fbgemm.txt
+++ b/.github/ci_commit_pins/fbgemm.txt
@ -0,0 +1 @@
+1b2746f642cc2c99fe9d1a0c34359c0de45341c2
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-8a0f5e3678bef55148743ab987baa3c89f8dfb5e
+9371b9e13c826f3930e54346b4d619cb59182f68
--- a/.github/ci_commit_pins/torchrec.txt
+++ b/.github/ci_commit_pins/torchrec.txt
@ -0,0 +1 @@
+6cd9fd362514d14ebb9ed51314c62ac1e1e2bbf2
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_inference.csv
@ -12,6 +12,7 @@ basic_gnn_sage,pass,0
 clip,pass,0
 cm3leon_generate,pass,6
 dcgan,pass,0
+dlrm,pass,0
 doctr_det_predictor,pass,2
 doctr_reco_predictor,fail_accuracy,4
 drq,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_training.csv
@ -1,4 +1,5 @@
 name,accuracy,graph_breaks
+torchrec_dlrm,infra_error,0
 BERT_pytorch,pass,8
 LearningToPaint,pass,8
 Super_SloMo,pass,8
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
 basic_gnn_sage,pass,8
 clip,pass,8
 dcgan,pass,8
+dlrm,pass,8
 drq,pass,7
 fastNLP_Bert,pass,12
 functorch_dp_cifar10,pass,8
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
@ -1,4 +1,5 @@
 name,accuracy,graph_breaks
+torchrec_dlrm,pass,8
 BERT_pytorch,pass,8
 LearningToPaint,pass,8
 Super_SloMo,pass,8
@ -10,6 +11,7 @@ basic_gnn_gin,pass,8
 basic_gnn_sage,pass,8
 clip,pass,8
 dcgan,pass,8
+dlrm,pass,8
 drq,pass,7
 fastNLP_Bert,pass,12
 functorch_dp_cifar10,pass,8
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -95,8 +95,6 @@ CI_SKIP[CI("eager", training=False)] = [
    "hf_BigBird",  # fail_accuracy
    # TypeError: pad_center() takes 1 positional argument but 2 were given
    "tacotron2",
-    # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
-    "torchrec_dlrm",
    # Huggingface
    "DebertaV2ForQuestionAnswering",  # OOM
 ]
@ -170,8 +168,6 @@ CI_SKIP[CI("aot_eager", training=True)] = [
 CI_SKIP[CI("inductor", training=False)] = [
    # TorchBench
    "DALLE2_pytorch",  # AttributeError: text_encodings
-    # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
-    "torchrec_dlrm",
    "demucs",  # OOM
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
@ -223,8 +219,6 @@ CI_SKIP[CI("inductor", training=False, device="cpu")] = [
    "pyhpc_turbulent_kinetic_energy",
    "resnet50_quantized_qat",  # Eager model failed to run(Quantize only works on Float Tensor, got Double)
    "sage",  # does not work with fp32
-    # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
-    "torchrec_dlrm",
    # Huggingface
    "MBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94793
    "PLBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94794
@ -239,7 +233,6 @@ CI_SKIP[CI("inductor", training=True)] = [
    *CI_SKIP[CI("inductor", training=False)],
    # TorchBench
    "Background_Matting",  # fp64_OOM
-    "dlrm",  # Fails on CI - unable to repro locally
    "hf_T5_base",  # accuracy
    "mobilenet_v3_large",  # accuracy
    "resnet50_quantized_qat",  # Eager model failed to run
@ -260,7 +253,6 @@ CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [
    *CI_SKIP[CI("aot_eager", training=False)],
    "vision_maskrcnn",  # accuracy failure on boxes, after https://github.com/pytorch/pytorch/issues/101093
    # https://github.com/pytorch/pytorch/issues/103760
-    "dlrm",
    "hf_T5_generate",
    "hf_Bert",  # Error: RelaxedUnspecConstraint(L['input_ids'].size()[0]) - inferred constant (4)
 ]
@ -269,6 +261,7 @@ CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
    *CI_SKIP[CI("aot_eager", training=True)],
    *CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
    "llama",  # AssertionError: cannot compute free_symbols of True
+    "torchrec_dlrm",  # RuntimeError: mat1 and mat2 must have the same dtype, but got Float and BFloat16
 ]

 CI_SKIP[CI("inductor", training=False, dynamic=True)] = [
@ -296,8 +289,6 @@ CI_SKIP_OPTIMIZER = {
    # TIMM
    "convmixer_768_32",  # accuracy
    "hrnet_w18",  # Stack issue in fx
-    # TorchBench
-    "dlrm",  # symbolic shapes error
    # HF
    "pnasnet5large",  # Stack issue in fx
    "MobileBertForMaskedLM",  # Stack issue in fx
@ -311,6 +302,7 @@ CI_SKIP_DYNAMIC_BATCH_ONLY = {
    # It iterates over the batch, which is dynamic, and dynamo chokes
    # We should be able to graphbreak there.
    "doctr_det_predictor",
+    "dlrm",
 }


--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@ -90,6 +90,7 @@ SKIP_FOR_CPU = {
    "sam",  # timeout
    "llama_v2_7b_16h",  # model is CUDA only
    "stable_diffusion",  # flaky
+    "torchrec_dlrm",  # requires FBGEMM, CUDA only
 }

 SKIP_FOR_CUDA = {
@ -228,6 +229,11 @@ FORCE_AMP_FOR_FP16_BF16_MODELS = {
    "tts_angular",
 }

+# Names of canary models (entries of torchbenchmark's _list_canary_model_paths)
+# that iter_model_names adds to the regular model list so they are run anyway.
+CANARY_MODELS = {
+    "torchrec_dlrm",
+}
+

 class TorchBenchmarkRunner(BenchmarkRunner):
    def __init__(self):
@ -394,9 +400,16 @@ class TorchBenchmarkRunner(BenchmarkRunner):
        return device, benchmark.name, model, example_inputs, batch_size

    def iter_model_names(self, args):
-        from torchbenchmark import _list_model_paths
+        from torchbenchmark import _list_canary_model_paths, _list_model_paths

        models = _list_model_paths()
+        models += [
+            f
+            for f in _list_canary_model_paths()
+            if os.path.basename(f) in CANARY_MODELS
+        ]
+        models.sort()
+
        start, end = self.get_benchmark_indices(len(models))
        for index, model_path in enumerate(models):
            if index < start or index >= end: