Enable inductor CI for huggingface (#86792)

Summary: Unit tests will be enabled once they are fixed in trunk. TorchBench and TIMM need more setup and are coming later. Pull Request resolved: https://github.com/pytorch/pytorch/pull/86792 Approved by: https://github.com/jansel, https://github.com/huydhn
2025-10-20 21:14:14 +08:00 · 2022-10-20 22:37:07 +00:00
parent 9ba632253a
commit b1cf377cce
10 changed files with 155 additions and 0 deletions
--- a/.github/ci_commit_pins/huggingface.txt
+++ b/.github/ci_commit_pins/huggingface.txt
@ -0,0 +1 @@
+ebee0a27940adfbb30444d83387b9ea0f1173f40
--- a/.github/ci_commit_pins/timm.txt
+++ b/.github/ci_commit_pins/timm.txt
@ -0,0 +1 @@
+ebee0a27940adfbb30444d83387b9ea0f1173f40
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -0,0 +1 @@
+24b95f2f627bf07a61cefed653419389a7586357
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -7,3 +7,8 @@
 "module: inductor":
 - torch/_inductor/**
 - test/inductor/**
+
+"ciflow/inductor":
+- torch/_dynamo/**
+- torch/_inductor/**
+- benchmarks/dynamo/**
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -22,6 +22,7 @@ VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in {
    "dynamo",
    "force_on_cpu",
    "functorch",
+    "inductor",
    "jit_legacy",
    "multigpu",
    "nogpu_AVX512",
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -0,0 +1,36 @@
+# Inductor CI workflow. Triggered by pushes to master, by pushes of
+# ciflow/inductor/* tags, and by manual dispatch.
+name: inductor
+
+on:
+  push:
+    branches:
+      - master
+    tags:
+      - ciflow/inductor/*
+  workflow_dispatch:
+
+# The group key combines the workflow name, the PR number (or the ref name
+# when no PR number is available), the commit sha for branch refs, and whether
+# the run was started manually. With cancel-in-progress enabled, a new run in
+# the same group cancels the one already running.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  # Build job for CUDA arch 8.6. Its test-matrix is passed on to the test job
+  # through this job's outputs. num_shards must equal the number of shard
+  # entries listed in the matrix (two here).
+  linux-bionic-cuda11_6-py3_10-gcc7-inductor-build:
+    name: cuda11.6-py3.10-gcc7-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
+      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  # Test job: waits for the inductor build job, then runs the reusable
+  # _linux-test.yml workflow with the docker image and test matrix taken from
+  # that build job's outputs.
+  linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
+    name: cuda11.6-py3.10-gcc7-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_6-py3_10-gcc7-inductor-build
+    with:
+      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
+      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.test-matrix }}
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@ -170,6 +170,33 @@ function test_torch_deploy(){
 popd
 }

+# Install pandas, scipy, and the HuggingFace transformers package from GitHub
+# at the commit returned by `get_pinned_commit huggingface`.
+# NOTE(review): presumably that pin is .github/ci_commit_pins/huggingface.txt;
+# confirm against get_pinned_commit.
+function install_huggingface() {
+  # Declared separately from the assignment so `local` does not mask the
+  # exit status of the command substitution.
+  local commit
+  commit=$(get_pinned_commit huggingface)
+  pip_install pandas
+  pip_install scipy
+  pip_install "git+https://github.com/huggingface/transformers.git@${commit}#egg=transformers"
+}
+
+# Install pandas, scipy, and pytorch-image-models (timm) from GitHub at the
+# commit returned by `get_pinned_commit timm`.
+# NOTE(review): presumably that pin is .github/ci_commit_pins/timm.txt;
+# confirm against get_pinned_commit.
+function install_timm() {
+  # Declared separately from the assignment so `local` does not mask the
+  # exit status of the command substitution.
+  local commit
+  commit=$(get_pinned_commit timm)
+  pip_install pandas
+  pip_install scipy
+  pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
+}
+
+# Clone pytorch/benchmark into ./torchbench, check out the commit returned by
+# `get_pinned_commit torchbench`, and run its install.py from inside the clone.
+# The caller's working directory is restored before returning.
+function checkout_install_torchbench() {
+  # Declared separately from the assignment so `local` does not mask the
+  # exit status of the command substitution.
+  local commit
+  commit=$(get_pinned_commit torchbench)
+  git clone https://github.com/pytorch/benchmark torchbench
+  pushd torchbench
+  git checkout "${commit}"
+  python install.py
+  pip_install gym==0.25.2  # workaround issue in 0.26.0
+  popd
+}
+
 function test_functorch() {
  python test/run_test.py --functorch --verbose
 }
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@ -109,6 +109,10 @@ if [[ "$TEST_CONFIG" == *dynamo* ]]; then
  export PYTORCH_TEST_WITH_DYNAMO=1
 fi

+if [[ "$TEST_CONFIG" == *inductor* ]]; then
+  export PYTORCH_TEST_WITH_INDUCTOR=1
+fi
+
 # TODO: this condition is never true, need to fix this.
 if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then
  # skip expensive checks when on PR and CI_MASTER flag is not set
@ -249,6 +253,30 @@ test_dynamo_shard() {
  assert_git_not_dirty
 }

+
+test_inductor() {
+  # Placeholder: for now this only prints a TODO. The commented-out commands
+  # below are kept for when the inductor unit tests are enabled.
+  echo "TODO: enable inductor unit tests"
+  # time python test/run_test.py --core --exclude test_autograd --continue-through-error --verbose
+
+  # PYTORCH_TEST_WITH_DYNAMO and PYTORCH_TEST_WITH_INDUCTOR are only needed for PyTorch tests that were not
+  # written to use dynamo/inductor. For dynamo/inductor unit tests, specifying them will trigger an error like
+  # "Detected two calls to `torchdynamo.optimize(...)` with a different backend compiler arguments."
+  # PYTORCH_TEST_WITH_DYNAMO=0 PYTORCH_TEST_WITH_INDUCTOR=0 pytest test/inductor
+}
+
+test_inductor_huggingface_shard() {
+  if [[ -z "$NUM_TEST_SHARDS" ]]; then
+    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
+    exit 1
+  fi
+  TEST_REPORTS_DIR=/tmp/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  python benchmarks/dynamo/huggingface.py --ci --training --accuracy \
+    --device cuda --inductor --float32 --total-partitions 1 --partition-id "$1" \
+    --output "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv
+}
+
 test_python_gloo_with_tls() {
  source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
  assert_git_not_dirty
@ -699,6 +727,17 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
  install_filelock
  install_triton
  test_dynamo_shard 2
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  test_inductor
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  install_huggingface
+  test_inductor_huggingface_shard 0
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_without_numpy
  install_torchvision
--- a/benchmarks/dynamo/check_csv.py
+++ b/benchmarks/dynamo/check_csv.py
@ -0,0 +1,40 @@
+import argparse
+import sys
+import textwrap
+
+import pandas as pd
+
+
+def check_csv(filename):
+    """
+    Basic accuracy checking.
+    """
+
+    df = pd.read_csv(filename)
+
+    failed = []
+    for _, row in df.iterrows():
+        model_name = row["name"]
+        status = row["accuracy"]
+        if "pass" not in status:
+            failed.append(model_name)
+
+        print(f"{model_name:34} {status}")
+
+    if failed:
+        print(
+            textwrap.dedent(
+                f"""
+                Error {len(failed)} models failed
+                    {' '.join(failed)}
+                """
+            )
+        )
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file", "-f", type=str, help="csv file name")
+    args = parser.parse_args()
+    check_csv(args.file)
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -147,6 +147,10 @@ CI_SKIP_INDUCTOR_TRAINING = [
    "cait_m36_384",  # fp64_OOM
    "coat_lite_mini",  # time out
    "convit_base",  # fp64_OOM
+    "gernet_l",  # accuracy
+    "gluon_xception65",
+    "lcnet_0500",  # accuracy
+    "levit_128",  # levit_128
    "rexnet_100",  # accuracy
    "swin_base_patch4_window7_224",
    "twins_pcpvt_base",  # time out