try

2025-10-21 09:44:02 +08:00 · 2024-03-21 17:15:47 +01:00 · 2024-03-21 17:12:12 +01:00 · 2024-03-21 17:10:53 +01:00 · 2024-03-21 17:10:19 +01:00 · 2024-03-21 17:09:39 +01:00
5 changed files with 482 additions and 652 deletions
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -3,7 +3,7 @@ name: Build docker images (scheduled)
 on:
  push:
    branches:
-      - build_ci_docker_image*
+      - check_docker_i
  repository_dispatch:
  workflow_call:
    inputs:
@ -42,286 +42,4 @@ jobs:
          build-args: |
            REF=main
          push: true
-          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
+          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
      # Push CI images still need to be re-built daily
      -
        name: Build and push (for Push CI) in a daily basis
        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
        # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
        if: inputs.image_postfix != '-push-ci'
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-all-latest-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-all-latest-gpu-push-ci
  latest-torch-deepspeed-docker:
    # Builds the image from ./docker/transformers-pytorch-deepspeed-latest-gpu
    # and pushes it under a tag that has `inputs.image_postfix` appended.
    name: 'Latest PyTorch + DeepSpeed'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      # Lists and measures /usr/local/lib and /usr/share, deletes the Android
      # and .NET directories, then measures again (presumably to free disk
      # space for the image build).
      - name: Cleanup disk
        run: |
          sudo ls -l /usr/local/lib/
          sudo ls -l /usr/share/
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # `REF=main` is forwarded to the Dockerfile as a build argument.
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
  latest-torch-deepspeed-docker-for-push-ci-daily-build:
    # Builds the same DeepSpeed Docker context as the regular job, but always
    # pushes it under the fixed `-push-ci` tag.
    name: 'Latest PyTorch + DeepSpeed (Push CI - Daily Build)'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      # Lists and measures /usr/local/lib and /usr/share, deletes the Android
      # and .NET directories, then measures again (presumably to free disk
      # space for the image build).
      - name: Cleanup disk
        run: |
          sudo ls -l /usr/local/lib/
          sudo ls -l /usr/share/
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # Push CI images still need to be re-built daily.
      # The `if` below lets the step run for `schedule` events and for `push`
      # events that reach this workflow directly rather than via
      # `workflow_call`. The latter is useful when building an image manually
      # for debugging purposes; use another tag in that case!
      - name: Build and push (for Push CI) in a daily basis
        if: inputs.image_postfix != '-push-ci'
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
  doc-builder:
    # Builds and pushes the documentation-builder image.
    # Skipped when the caller asks for `-push-ci` images: Push CI doesn't
    # need this one.
    name: 'Doc builder'
    if: inputs.image_postfix != '-push-ci'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # No build arguments and no postfix: the tag is fixed.
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-doc-builder
          push: true
          tags: huggingface/transformers-doc-builder
  latest-pytorch:
    # Builds and pushes the PyTorch GPU image under a fixed tag.
    # Skipped when the caller asks for `-push-ci` images: Push CI doesn't
    # need this one.
    name: 'Latest PyTorch [dev]'
    if: inputs.image_postfix != '-push-ci'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      # Lists and measures /usr/local/lib and /usr/share, deletes the Android
      # and .NET directories, then measures again (presumably to free disk
      # space for the image build).
      - name: Cleanup disk
        run: |
          sudo ls -l /usr/local/lib/
          sudo ls -l /usr/share/
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo du -sh /usr/local/lib/
          sudo du -sh /usr/share/

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # `REF=main` is forwarded to the Dockerfile as a build argument.
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-pytorch-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-pytorch-gpu
 # Needs to be fixed with help from Guillaume.
 #  latest-pytorch-amd:
 #    name: "Latest PyTorch (AMD) [dev]"
 #    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
 #    steps:
 #      - name: Set up Docker Buildx
 #        uses: docker/setup-buildx-action@v3
 #      - name: Check out code
 #        uses: actions/checkout@v3
 #      - name: Login to DockerHub
 #        uses: docker/login-action@v3
 #        with:
 #          username: ${{ secrets.DOCKERHUB_USERNAME }}
 #          password: ${{ secrets.DOCKERHUB_PASSWORD }}
 #      - name: Build and push
 #        uses: docker/build-push-action@v5
 #        with:
 #          context: ./docker/transformers-pytorch-amd-gpu
 #          build-args: |
 #            REF=main
 #          push: true
 #          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
 #      # Push CI images still need to be re-built daily
 #      -
 #        name: Build and push (for Push CI) in a daily basis
 #        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
 #        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
 #        if: inputs.image_postfix != '-push-ci'
 #        uses: docker/build-push-action@v5
 #        with:
 #          context: ./docker/transformers-pytorch-amd-gpu
 #          build-args: |
 #            REF=main
 #          push: true
 #          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
  latest-tensorflow:
    # Builds and pushes the TensorFlow GPU image under a fixed tag.
    # Skipped when the caller asks for `-push-ci` images: Push CI doesn't
    # need this one.
    name: 'Latest TensorFlow [dev]'
    if: inputs.image_postfix != '-push-ci'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # `REF=main` is forwarded to the Dockerfile as a build argument.
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-tensorflow-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-tensorflow-gpu
  # latest-pytorch-deepspeed-amd:
  #   name: "PyTorch + DeepSpeed (AMD) [dev]"
  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
  #   steps:
  #     - name: Set up Docker Buildx
  #       uses: docker/setup-buildx-action@v3
  #     - name: Check out code
  #       uses: actions/checkout@v3
  #     - name: Login to DockerHub
  #       uses: docker/login-action@v3
  #       with:
  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
  #     - name: Build and push
  #       uses: docker/build-push-action@v5
  #       with:
  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
  #         build-args: |
  #           REF=main
  #         push: true
  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
  #     # Push CI images still need to be re-built daily
  #     -
  #       name: Build and push (for Push CI) in a daily basis
  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
  #       if: inputs.image_postfix != '-push-ci'
  #       uses: docker/build-push-action@v5
  #       with:
  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
  #         build-args: |
  #           REF=main
  #         push: true
  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
  latest-quantization-torch-docker:
    # Builds the quantization image and pushes it under a tag that has
    # `inputs.image_postfix` appended.
    # Skipped when the caller asks for `-push-ci` images: Push CI doesn't
    # need this one.
    name: 'Latest Pytorch + Quantization [dev]'
    if: inputs.image_postfix != '-push-ci'
    runs-on:
      - intel-cpu
      - 8-cpu
      - ci
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Check out code
        uses: actions/checkout@v3

      # Credentials come from repository secrets.
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      # `REF=main` is forwarded to the Dockerfile as a build argument.
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-quantization-latest-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }}
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -0,0 +1,37 @@
 name: Self-hosted runner (scheduled)
 # Note that each job's dependencies go into a corresponding docker file.
 #
 # For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
 # `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
 # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
 # Events that start this caller workflow.
 on:
  # External `repository_dispatch` API events (no event-type filter is set).
  repository_dispatch:
  schedule:
    # Daily, at minute 17 of hour 2 (GitHub evaluates cron schedules in UTC).
    - cron: '17 2 * * *'
  # Pushes to the branches below also start a run.
  push:
    branches:
      - 'run_scheduled_ci*'
      - 'move_jobs_from_daily_ci'
 jobs:
  model-ci:
    # Calls the reusable scheduled workflow and passes `run_tests_gpu` as its
    # `job` input.
    name: 'Model CI'
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: 'run_tests_gpu'
    secrets:
      # NOTE(review): ACCESS_REPO_INFO_TOKEN is fed the Slack bot token here;
      # confirm this is intended rather than a dedicated repository token.
      ACCESS_REPO_INFO_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
      CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
      CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
  quantization-ci:
    # Calls the reusable scheduled workflow and passes
    # `run_tests_quantization_torch_gpu` as its `job` input.
    name: Quantization CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_tests_quantization_torch_gpu
    secrets:
      CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
      # Must be a Slack channel ID. This was previously wired to the bot token
      # (`secrets.CI_SLACK_BOT_TOKEN`), which is not a channel identifier.
      # NOTE(review): switch to a dedicated quantization channel secret if one
      # exists; the daily channel is the only channel secret used in this file.
      CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
      # NOTE(review): ACCESS_REPO_INFO_TOKEN is fed the Slack bot token here;
      # confirm this is intended rather than a dedicated repository token.
      ACCESS_REPO_INFO_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -7,12 +7,18 @@ name: Self-hosted runner (scheduled)
 # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
 on:
-  repository_dispatch:
+  workflow_call:
-  schedule:
+    inputs:
-    - cron: "17 2 * * *"
+      job:
-  push:
+        required: true
-    branches:
+        type: string
-      - run_scheduled_ci*
+    secrets:
      CI_SLACK_BOT_TOKEN:
        required: true
      CI_SLACK_REPORT_CHANNEL_ID:
        required: true
      ACCESS_REPO_INFO_TOKEN:
        required: true
 env:
  HF_HOME: /mnt/cache
@ -30,274 +36,277 @@ env:
  NUM_SLICES: 2
 jobs:
-  setup:
+#  setup:
-    name: Setup
+#    if: ${{ inputs.job == 'run_tests_gpu' }}
-    strategy:
+#    name: Setup
-      matrix:
+#    strategy:
-        machine_type: [single-gpu, multi-gpu]
+#      matrix:
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+#        machine_type: [single-gpu, multi-gpu]
-    container:
+#    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-      image: huggingface/transformers-all-latest-gpu
+#    container:
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#      image: huggingface/transformers-all-latest-gpu
-    outputs:
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+#    outputs:
-      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+#      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
-    steps:
+#      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
-      - name: Update clone
+#    steps:
-        working-directory: /transformers
+#      - name: Update clone
-        run: |
+#        working-directory: /transformers
-          git fetch && git checkout ${{ github.sha }}
+#        run: |
 #          git fetch && git checkout ${{ github.sha }}
 #
 #      - name: Cleanup
 #        working-directory: /transformers
 #        run: |
 #          rm -rf tests/__pycache__
 #          rm -rf tests/models/__pycache__
 #          rm -rf reports
 #
 #      - name: Show installed libraries and their versions
 #        working-directory: /transformers
 #        run: pip freeze
 #
 #      - id: set-matrix
 #        name: Identify models to test
 #        working-directory: /transformers/tests
 #        run: |
 #          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
 #          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
 #
 #      - name: NVIDIA-SMI
 #        run: |
 #          nvidia-smi
 #
 #  run_tests_gpu:
 #    if: ${{ inputs.job == 'run_tests_gpu' }}
 #    name: " "
 #    needs: setup
 #    strategy:
 #      fail-fast: false
 #      matrix:
 #        machine_type: [single-gpu, multi-gpu]
 #        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
 #    uses: ./.github/workflows/model_jobs.yml
 #    with:
 #      folder_slices: ${{ needs.setup.outputs.folder_slices }}
 #      machine_type: ${{ matrix.machine_type }}
 #      slice_id: ${{ matrix.slice_id }}
 #    secrets: inherit
-      - name: Cleanup
+#  run_examples_gpu:
-        working-directory: /transformers
+#    name: Examples directory
-        run: |
+#    strategy:
-          rm -rf tests/__pycache__
+#      fail-fast: false
-          rm -rf tests/models/__pycache__
+#      matrix:
-          rm -rf reports
+#        machine_type: [single-gpu]
-
+#    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-      - name: Show installed libraries and their versions
+#    container:
-        working-directory: /transformers
+#      image: huggingface/transformers-all-latest-gpu
-        run: pip freeze
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
+#    needs: setup
-      - id: set-matrix
+#    steps:
-        name: Identify models to test
+#      - name: Update clone
-        working-directory: /transformers/tests
+#        working-directory: /transformers
-        run: |
+#        run: git fetch && git checkout ${{ github.sha }}
-          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+#
-          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+#      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-
+#        working-directory: /transformers
-      - name: NVIDIA-SMI
+#        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-        run: |
+#
-          nvidia-smi
+#      - name: NVIDIA-SMI
-
+#        run: |
-  run_tests_gpu:
+#          nvidia-smi
-    name: " "
+#
-    needs: setup
+#      - name: Environment
-    strategy:
+#        working-directory: /transformers
-      fail-fast: false
+#        run: |
-      matrix:
+#          python3 utils/print_env.py
-        machine_type: [single-gpu, multi-gpu]
+#
-        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+#      - name: Show installed libraries and their versions
-    uses: ./.github/workflows/model_jobs.yml
+#        working-directory: /transformers
-    with:
+#        run: pip freeze
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+#
-      machine_type: ${{ matrix.machine_type }}
+#      - name: Run examples tests on GPU
-      slice_id: ${{ matrix.slice_id }}
+#        working-directory: /transformers
-    secrets: inherit
+#        run: |
-
+#          pip install -r examples/pytorch/_tests_requirements.txt
-  run_examples_gpu:
+#          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-    name: Examples directory
+#
-    strategy:
+#      - name: Failure short reports
-      fail-fast: false
+#        if: ${{ failure() }}
-      matrix:
+#        continue-on-error: true
-        machine_type: [single-gpu]
+#        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+#
-    container:
+#      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
-      image: huggingface/transformers-all-latest-gpu
+#        if: ${{ always() }}
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#        uses: actions/upload-artifact@v3
-    needs: setup
+#        with:
-    steps:
+#          name: ${{ matrix.machine_type }}_run_examples_gpu
-      - name: Update clone
+#          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-        working-directory: /transformers
+#
-        run: git fetch && git checkout ${{ github.sha }}
+#  run_pipelines_torch_gpu:
-
+#    name: PyTorch pipelines
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+#    strategy:
-        working-directory: /transformers
+#      fail-fast: false
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+#      matrix:
-
+#        machine_type: [single-gpu, multi-gpu]
-      - name: NVIDIA-SMI
+#    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-        run: |
+#    container:
-          nvidia-smi
+#      image: huggingface/transformers-pytorch-gpu
-
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-      - name: Environment
+#    needs: setup
-        working-directory: /transformers
+#    steps:
-        run: |
+#      - name: Update clone
-          python3 utils/print_env.py
+#        working-directory: /transformers
-
+#        run: git fetch && git checkout ${{ github.sha }}
-      - name: Show installed libraries and their versions
+#
-        working-directory: /transformers
+#      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        run: pip freeze
+#        working-directory: /transformers
-
+#        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-      - name: Run examples tests on GPU
+#
-        working-directory: /transformers
+#      - name: NVIDIA-SMI
-        run: |
+#        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
+#          nvidia-smi
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+#
-
+#      - name: Environment
-      - name: Failure short reports
+#        working-directory: /transformers
-        if: ${{ failure() }}
+#        run: |
-        continue-on-error: true
+#          python3 utils/print_env.py
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+#
-
+#      - name: Show installed libraries and their versions
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
+#        working-directory: /transformers
-        if: ${{ always() }}
+#        run: pip freeze
-        uses: actions/upload-artifact@v3
+#
-        with:
+#      - name: Run all pipeline tests on GPU
-          name: ${{ matrix.machine_type }}_run_examples_gpu
+#        working-directory: /transformers
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+#        run: |
-
+#          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-  run_pipelines_torch_gpu:
+#
-    name: PyTorch pipelines
+#      - name: Failure short reports
-    strategy:
+#        if: ${{ failure() }}
-      fail-fast: false
+#        continue-on-error: true
-      matrix:
+#        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-        machine_type: [single-gpu, multi-gpu]
+#
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+#      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
-    container:
+#        if: ${{ always() }}
-      image: huggingface/transformers-pytorch-gpu
+#        uses: actions/upload-artifact@v3
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#        with:
-    needs: setup
+#          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-    steps:
+#          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
-      - name: Update clone
+#
-        working-directory: /transformers
+#  run_pipelines_tf_gpu:
-        run: git fetch && git checkout ${{ github.sha }}
+#    name: TensorFlow pipelines
-
+#    strategy:
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+#      fail-fast: false
-        working-directory: /transformers
+#      matrix:
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+#        machine_type: [single-gpu, multi-gpu]
-
+#    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-      - name: NVIDIA-SMI
+#    container:
-        run: |
+#      image: huggingface/transformers-tensorflow-gpu
-          nvidia-smi
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
+#    needs: setup
-      - name: Environment
+#    steps:
-        working-directory: /transformers
+#      - name: Update clone
-        run: |
+#        working-directory: /transformers
-          python3 utils/print_env.py
+#        run: |
-
+#          git fetch && git checkout ${{ github.sha }}
-      - name: Show installed libraries and their versions
+#
-        working-directory: /transformers
+#      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        run: pip freeze
+#        working-directory: /transformers
-
+#        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-      - name: Run all pipeline tests on GPU
+#
-        working-directory: /transformers
+#      - name: NVIDIA-SMI
-        run: |
+#        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+#          nvidia-smi
-
+#
-      - name: Failure short reports
+#      - name: Environment
-        if: ${{ failure() }}
+#        working-directory: /transformers
-        continue-on-error: true
+#        run: |
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+#          python3 utils/print_env.py
-
+#
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
+#      - name: Show installed libraries and their versions
-        if: ${{ always() }}
+#        working-directory: /transformers
-        uses: actions/upload-artifact@v3
+#        run: pip freeze
-        with:
+#
-          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+#      - name: Run all pipeline tests on GPU
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+#        working-directory: /transformers
-
+#        run: |
-  run_pipelines_tf_gpu:
+#          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
-    name: TensorFlow pipelines
+#
-    strategy:
+#      - name: Failure short reports
-      fail-fast: false
+#        if: ${{ always() }}
-      matrix:
+#        run: |
-        machine_type: [single-gpu, multi-gpu]
+#          cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+#
-    container:
+#      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
-      image: huggingface/transformers-tensorflow-gpu
+#        if: ${{ always() }}
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#        uses: actions/upload-artifact@v3
-    needs: setup
+#        with:
-    steps:
+#          name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
-      - name: Update clone
+#          path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
-        working-directory: /transformers
+#
-        run: |
+#  run_all_tests_torch_cuda_extensions_gpu:
-          git fetch && git checkout ${{ github.sha }}
+#    name: Torch CUDA extension tests
-
+#    strategy:
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+#      fail-fast: false
-        working-directory: /transformers
+#      matrix:
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+#        machine_type: [single-gpu, multi-gpu]
-
+#    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-      - name: NVIDIA-SMI
+#    needs: setup
-        run: |
+#    container:
-          nvidia-smi
+#      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
-
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-      - name: Environment
+#    steps:
-        working-directory: /transformers
+#      - name: Update clone
-        run: |
+#        working-directory: /workspace/transformers
-          python3 utils/print_env.py
+#        run: git fetch && git checkout ${{ github.sha }}
-
+#
-      - name: Show installed libraries and their versions
+#      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
+#        working-directory: /workspace/transformers
-        run: pip freeze
+#        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
+#
-      - name: Run all pipeline tests on GPU
+#      - name: Remove cached torch extensions
-        working-directory: /transformers
+#        run: rm -rf /github/home/.cache/torch_extensions/
-        run: |
+#
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
+#      # To avoid unknown test failures
-
+#      - name: Pre build DeepSpeed *again*
-      - name: Failure short reports
+#        working-directory: /workspace
-        if: ${{ always() }}
+#        run: |
-        run: |
+#          python3 -m pip uninstall -y deepspeed
-          cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
+#          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
-
+#
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
+#      - name: NVIDIA-SMI
-        if: ${{ always() }}
+#        run: |
-        uses: actions/upload-artifact@v3
+#          nvidia-smi
-        with:
+#
-          name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
+#      - name: Environment
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
+#        working-directory: /workspace/transformers
-
+#        run: |
-  run_all_tests_torch_cuda_extensions_gpu:
+#          python utils/print_env.py
-    name: Torch CUDA extension tests
+#
-    strategy:
+#      - name: Show installed libraries and their versions
-      fail-fast: false
+#        working-directory: /workspace/transformers
-      matrix:
+#        run: pip freeze
-        machine_type: [single-gpu, multi-gpu]
+#
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+#      - name: Run all tests on GPU
-    needs: setup
+#        working-directory: /workspace/transformers
-    container:
+#        run: |
-      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
+#          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#
-    steps:
+#      - name: Failure short reports
-      - name: Update clone
+#        if: ${{ failure() }}
-        working-directory: /workspace/transformers
+#        continue-on-error: true
-        run: git fetch && git checkout ${{ github.sha }}
+#        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
-
+#
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+#      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports"
-        working-directory: /workspace/transformers
+#        if: ${{ always() }}
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+#        uses: actions/upload-artifact@v3
-
+#        with:
-      - name: Remove cached torch extensions
+#          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
-        run: rm -rf /github/home/.cache/torch_extensions/
+#          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /workspace/transformers
        run: pip freeze
      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
  run_tests_quantization_torch_gpu:
    if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
    name: Quantization tests
    strategy:
      fail-fast: false
@ -307,7 +316,6 @@ jobs:
    container:
      image: huggingface/transformers-quantization-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@ -347,101 +355,113 @@ jobs:
          name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu
          path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu
-  run_extract_warnings:
+#
-    name: Extract warnings in CI artifacts
+#  run_extract_warnings:
-    runs-on: ubuntu-22.04
+#    name: Extract warnings in CI artifacts
-    if: always()
+#    runs-on: ubuntu-22.04
-    needs: [
+#    if: always()
-      setup,
+#    needs: [
-      run_tests_gpu,
+#      setup,
-      run_examples_gpu,
+#      run_tests_gpu,
-      run_pipelines_tf_gpu,
+#      run_examples_gpu,
-      run_pipelines_torch_gpu,
+#      run_pipelines_tf_gpu,
-      run_all_tests_torch_cuda_extensions_gpu,
+#      run_pipelines_torch_gpu,
-      run_tests_quantization_torch_gpu,
+#      run_all_tests_torch_cuda_extensions_gpu,
-    ]
+#      run_tests_quantization_torch_gpu,
-    steps:
+#    ]
-      - name: Checkout transformers
+#    steps:
-        uses: actions/checkout@v3
+#      - name: Checkout transformers
-        with:
+#        uses: actions/checkout@v3
-          fetch-depth: 2
+#        with:
-
+#          fetch-depth: 2
-      - name: Install transformers
+#
-        run: pip install transformers
+#      - name: Install transformers
-
+#        run: pip install transformers
-      - name: Show installed libraries and their versions
+#
-        run: pip freeze
+#      - name: Show installed libraries and their versions
-
+#        run: pip freeze
-      - name: Create output directory
+#
-        run: mkdir warnings_in_ci
+#      - name: Create output directory
-
+#        run: mkdir warnings_in_ci
-      - uses: actions/download-artifact@v3
+#
-        with:
+#      - uses: actions/download-artifact@v3
-          path: warnings_in_ci
+#        with:
-
+#          path: warnings_in_ci
-      - name: Show artifacts
+#
-        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
+#      - name: Show artifacts
-        working-directory: warnings_in_ci
+#        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
-
+#        working-directory: warnings_in_ci
-      - name: Extract warnings in CI artifacts
+#
-        run: |
+#      - name: Extract warnings in CI artifacts
-          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
+#        run: |
-          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
+#          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
-
+#          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
-      - name: Upload artifact
+#
-        if: ${{ always() }}
+#      - name: Upload artifact
-        uses: actions/upload-artifact@v3
+#        if: ${{ always() }}
-        with:
+#        uses: actions/upload-artifact@v3
-          name: warnings_in_ci
+#        with:
-          path: warnings_in_ci/selected_warnings.json
+#          name: warnings_in_ci
 #          path: warnings_in_ci/selected_warnings.json
 #
 #  send_results:
 #    name: Send results to webhook
 #    runs-on: ubuntu-22.04
 #    if: always()
 #    needs: [
 #      setup,
 #      run_tests_gpu,
 #      run_examples_gpu,
 #      run_pipelines_tf_gpu,
 #      run_pipelines_torch_gpu,
 #      run_all_tests_torch_cuda_extensions_gpu,
 #      run_tests_quantization_torch_gpu,
 #      run_extract_warnings
 #    ]
 #    steps:
 #      - name: Preliminary job status
 #        shell: bash
 #        # For the meaning of these environment variables, see the job `Setup`
 #        run: |
 #          echo "Setup status: ${{ needs.setup.result }}"
 #
 #      - uses: actions/checkout@v3
 #      - uses: actions/download-artifact@v3
 #      - name: Send message to Slack
 #        env:
 #          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
 #          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
 #          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
 #          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
 #          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
 #          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
 #          CI_EVENT: scheduled
 #          CI_SHA: ${{ github.sha }}
 #          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
 #          SETUP_STATUS: ${{ needs.setup.result }}
 #        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
 #        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
 #        run: |
 #          sudo apt-get install -y curl
 #          pip install slack_sdk
 #          pip show slack_sdk
 #          python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
 #
 #      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
 #      - name: Failure table artifacts
 #        if: ${{ always() }}
 #        uses: actions/upload-artifact@v3
 #        with:
 #          name: prev_ci_results
 #          path: prev_ci_results
  send_results:
-    name: Send results to webhook
+    name: Slack Report
-    runs-on: ubuntu-22.04
+    needs: "${{ inputs.job }}"
-    if: always()
+    uses: ./.github/workflows/slack-report.yml
-    needs: [
+    with:
-      setup,
+      job: ${{ inputs.job }}
-      run_tests_gpu,
+    secrets:
-      run_examples_gpu,
+      # Forward each secret under its own name: the report channel id and the repo-info token are
+      # separate secrets and must not be filled with the Slack bot token.
+      CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-      run_pipelines_tf_gpu,
+      CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-      run_pipelines_torch_gpu,
+      ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
      run_all_tests_torch_cuda_extensions_gpu,
      run_tests_quantization_torch_gpu,
      run_extract_warnings
    ]
    steps:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
          echo "Setup status: ${{ needs.setup.result }}"
      - uses: actions/checkout@v3
      - uses: actions/download-artifact@v3
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: scheduled
          CI_SHA: ${{ github.sha }}
          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          sudo apt-get install -y curl
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: prev_ci_results
          path: prev_ci_results
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -0,0 +1,54 @@
 # Reusable workflow (its only trigger is `workflow_call`) intended to post a CI report to Slack.
 name: CI slack report
 on:
  workflow_call:
    inputs:
      # Declared as required, but no step below reads `inputs.job` yet.
      job:
        required: true
        type: string
    # Secrets the caller has to forward; the Slack step exposes them as environment variables.
    secrets:
      CI_SLACK_BOT_TOKEN:
        required: true
      CI_SLACK_REPORT_CHANNEL_ID:
        required: true
      ACCESS_REPO_INFO_TOKEN:
        required: true
 jobs:
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-22.04
    # Run the report job regardless of the outcome of what came before it.
    if: always()
    steps:
 #      - name: Preliminary job status
 #        shell: bash
 #        # For the meaning of these environment variables, see the job `Setup`
 #        run: |
 #          echo "Setup status: ${{ needs.setup.result }}"
      - uses: actions/checkout@v3
      # No `name` is given, so this is expected to fetch every artifact of the run.
      - uses: actions/download-artifact@v3
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_REPORT_CHANNEL_ID }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          # Hard-coded event label.
          CI_EVENT: scheduled
          CI_SHA: ${{ github.sha }}
          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          # SETUP_STATUS: ${{ needs.setup.result }}
        # NOTE(review): the `notification_service.py` call is currently commented out below, so for
        # now this step only installs curl and nothing is sent to Slack.
        # When re-enabled, that call receives `needs.setup.outputs.folder_slices` as its argument. A processing
        # in `notification_service.py` to change `models/bert` to `models_bert` is required, as the artifact
        # names use `_` instead of `/`.
        run: |
          sudo apt-get install -y curl
 #          pip install slack_sdk
 #          pip show slack_sdk
 #          python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
 #      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
 #      - name: Failure table artifacts
 #        if: ${{ always() }}
 #        uses: actions/upload-artifact@v3
 #        with:
 #          name: prev_ci_results
 #          path: prev_ci_results
--- a/utils/split_model_tests.py
+++ b/utils/split_model_tests.py
@ -62,4 +62,5 @@ if __name__ == "__main__":
        start = end
        end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
        model_splits.append(d[start:end])
    model_splits = [["models/bert"], ["models/gpt2"]]
    print(model_splits)
Author	SHA1	Message	Date
ydshieh	c555694cbf	try	2024-03-21 17:15:47 +01:00
ydshieh	f8ee240b40	try	2024-03-21 17:12:12 +01:00
ydshieh	9bd4c17b1c	try	2024-03-21 17:10:53 +01:00
ydshieh	76fe733664	try	2024-03-21 17:10:19 +01:00
ydshieh	96d2333111	try	2024-03-21 17:09:39 +01:00
ydshieh	8a301ac0e1	try	2024-03-21 17:08:25 +01:00
ydshieh	01ab42b2f2	try	2024-03-21 17:07:02 +01:00
ydshieh	f670d84979	try	2024-03-21 17:03:55 +01:00
ydshieh	ad044b12b5	try	2024-03-21 16:44:56 +01:00
ydshieh	0554a9c760	try	2024-03-21 16:39:44 +01:00
ydshieh	ecbff23436	try	2024-03-21 16:29:54 +01:00
ydshieh	9133fe5172	try	2024-03-21 16:27:21 +01:00
ydshieh	2a897f5074	try	2024-03-21 16:16:45 +01:00
ydshieh	0dd8ed476a	try	2024-03-21 16:05:54 +01:00
ydshieh	81c9996dbc	try	2024-03-21 16:04:22 +01:00
ydshieh	36b492993f	try	2024-03-21 15:38:31 +01:00
ydshieh	b228d36d35	try	2024-03-21 15:30:53 +01:00
ydshieh	919ca7a8fa	try	2024-03-21 15:21:33 +01:00
ydshieh	67cf3d28a6	try	2024-03-21 14:39:40 +01:00
ydshieh	fc506756e8	try	2024-03-21 14:38:08 +01:00
ydshieh	554801884e	try	2024-03-21 14:35:35 +01:00
ydshieh	2f9eddeca8	try	2024-03-21 14:33:17 +01:00