fix

enable back other tests
2025-10-20 17:13:56 +08:00 · 2023-12-07 12:35:03 +01:00 · 2023-12-06 17:43:01 +01:00 · 2023-12-06 15:59:49 +01:00 · 2023-12-06 12:53:01 +01:00 · 2023-12-06 10:42:57 +01:00
12 changed files with 142 additions and 268 deletions
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -3,7 +3,7 @@ name: Build docker images (scheduled)
 on:
  push:
    branches:
-      - build_ci_docker_image*
+      - trigger_run_amd_scheduled_ci_caller_deepspeed_test
  repository_dispatch:
  workflow_call:
    inputs:
@ -18,256 +18,25 @@ concurrency:
  cancel-in-progress: false

 jobs:
-  latest-docker:
-    name: "Latest PyTorch + TensorFlow [dev]"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu-push-ci
+   latest-pytorch-deepspeed-amd:
+     name: "PyTorch + DeepSpeed (AMD) [dev]"

-  latest-torch-deepspeed-docker:
-    name: "Latest PyTorch + DeepSpeed"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
-
-  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
-  latest-torch-deepspeed-docker-for-push-ci-daily-build:
-    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-
-  doc-builder:
-    name: "Doc builder"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-doc-builder
-          push: true
-          tags: huggingface/transformers-doc-builder
-
-  latest-pytorch:
-    name: "Latest PyTorch [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-gpu
-
-# Need to be fixed with the help from Guillaume.
-#  latest-pytorch-amd:
-#    name: "Latest PyTorch (AMD) [dev]"
-#    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
-#    steps:
-#      - name: Set up Docker Buildx
-#        uses: docker/setup-buildx-action@v3
-#      - name: Check out code
-#        uses: actions/checkout@v3
-#      - name: Login to DockerHub
-#        uses: docker/login-action@v3
-#        with:
-#          username: ${{ secrets.DOCKERHUB_USERNAME }}
-#          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-#      - name: Build and push
-#        uses: docker/build-push-action@v5
-#        with:
-#          context: ./docker/transformers-pytorch-amd-gpu
-#          build-args: |
-#            REF=main
-#          push: true
-#          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
-#      # Push CI images still need to be re-built daily
-#      -
-#        name: Build and push (for Push CI) in a daily basis
-#        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-#        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-#        if: inputs.image_postfix != '-push-ci'
-#        uses: docker/build-push-action@v5
-#        with:
-#          context: ./docker/transformers-pytorch-amd-gpu
-#          build-args: |
-#            REF=main
-#          push: true
-#          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
-
-  latest-tensorflow:
-    name: "Latest TensorFlow [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-tensorflow-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-tensorflow-gpu
+     runs-on: [yih-dar-shieh-debug-daily,multi-gpu]
+     steps:
+       - name: Set up Docker Buildx
+         uses: docker/setup-buildx-action@v3
+       - name: Check out code
+         uses: actions/checkout@v3
+       - name: Login to DockerHub
+         uses: docker/login-action@v3
+         with:
+           username: ${{ secrets.DOCKERHUB_USERNAME }}
+           password: ${{ secrets.DOCKERHUB_PASSWORD }}
+       - name: Build and push
+         uses: docker/build-push-action@v5
+         with:
+           context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+           build-args: |
+             REF=main
+           push: true
+           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@ -212,7 +212,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -286,4 +286,4 @@ jobs:
        with:
          name: |
              single-*
-              multi-*
+              multi-*
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@ -267,7 +267,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -353,4 +353,4 @@ jobs:
        with:
          name: |
              single-*
-              multi-*
+              multi-*
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -366,7 +366,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -456,7 +456,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@ -7,7 +7,7 @@ on:
    types: [completed]
  push:
    branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*

 jobs:
  run_amd_ci:
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@ -7,7 +7,7 @@ on:
    types: [completed]
  push:
    branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*

 jobs:
  run_amd_ci:
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@ -356,6 +356,62 @@ jobs:
          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu

+  # Runs the DeepSpeed and extended test suites inside a ROCm container,
+  # once per machine type in the matrix (single-gpu and multi-gpu).
+  run_tests_torch_deepspeed_gpu:
+    name: Torch ROCm deepspeed tests
+    strategy:
+      # Let the other matrix entry keep running when one machine type fails.
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+
+    # Runner labels: the matrix machine type plus the GPU flavor passed in by the calling workflow.
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    needs: setup
+    container:
+      image: echarlaix/amd-deepspeed-test # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu
+      # Pass /dev/kfd and /dev/dri into the container, forward ROCR_VISIBLE_DEVICES from the runner,
+      # and mount the runner's Hugging Face cache directory at /mnt/cache/.
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      # The image ships its own /transformers clone; move it to the commit that triggered this run.
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      # Diagnostic steps: record GPU and environment details in the job log.
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      # Reports go to a directory named after the machine type; the two steps below read from it.
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended
+
+      # Only runs after a failure; continue-on-error keeps a missing report file from adding a second failure.
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt
+
+      # Upload the reports whether or not the tests passed.
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu
+
  run_extract_warnings:
    name: Extract warnings in CI artifacts
    runs-on: ubuntu-22.04
@ -368,7 +424,7 @@ jobs:
      run_tests_multi_gpu,
      run_examples_gpu,
      run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu
+      run_tests_torch_deepspeed_gpu
    ]
    steps:
      - name: Checkout transformers
@ -417,7 +473,7 @@ jobs:
      run_tests_multi_gpu,
      run_examples_gpu,
      run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu,
+      run_tests_torch_deepspeed_gpu,
      run_extract_warnings
    ]
    steps:
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -366,7 +366,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt

 ARG REF=main
 WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

 RUN python3 -m pip uninstall -y tensorflow flax
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@ -0,0 +1,45 @@
+# ROCm image with pinned PyTorch wheels, a pre-built DeepSpeed and a transformers checkout.
+FROM rocm/dev-ubuntu-22.04:5.6
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+# Versions of the PyTorch wheels installed below; ROCM selects the wheel index (.../whl/rocm$ROCM).
+ARG PYTORCH='2.1.1'
+ARG TORCH_VISION='0.16.1'
+ARG TORCH_AUDIO='2.1.1'
+ARG ROCM='5.6'
+
+RUN apt update && \
+    apt install -y --no-install-recommends \
+    libaio-dev \
+    git \
+    # These are required to build deepspeed.
+    python3-dev \
+    python-is-python3 \
+    rocrand-dev \
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev \
+    rocblas-dev && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): pydantic is pinned below 2, presumably for DeepSpeed compatibility - TODO confirm.
+RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
+# Remove any existing apex/torch packages before installing the pinned ROCm wheels.
+RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
+
+# Pre-build DeepSpeed so that it is ready for testing (to avoid timeouts).
+# The DS_BUILD_* flags request the CPU Adam and fused Adam ops to be compiled at install time.
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+
+ARG REF=main
+WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+# NOTE(review): the URL below always tracks `heads/main`, while the checkout uses $REF - confirm this is intended.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
+RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# This line must be added in order for Python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
+
+# Fail the image build if the DeepSpeed launcher cannot be imported.
+RUN python3 -c "from deepspeed.launcher.runner import main"
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt

 # recompile apex
 RUN python3 -m pip uninstall -y apex
-RUN git clone https://github.com/NVIDIA/apex
+# RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
 # TODO: check if there is alternative way to install latest apex
 # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@ -561,8 +561,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
        self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
        self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)

-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
+        # Relative difference. See the note above how to get identical loss on a small bs
+        self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)

    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
        # adapted from TrainerIntegrationCommon.check_saved_checkpoints
Author	SHA1	Message	Date
ydshieh	6df3e84f4d	fix	2023-12-07 12:35:03 +01:00
ydshieh	8fbab29be4	fix	2023-12-06 17:43:01 +01:00
Ella Charlaix	cfcc312b4e	enable back other tests	2023-12-06 15:59:49 +01:00
Ella Charlaix	ecb92392c6	add sklearn dependency to fix slow tests	2023-12-06 12:53:01 +01:00
Ella Charlaix	fa82a9c747	trigger	2023-12-06 10:42:57 +01:00
Ella Charlaix	92c402d9e4	comment tests	2023-12-05 18:04:02 +01:00
Ella Charlaix	df00cff6ca	trigger deepspeed tests with new image	2023-12-05 17:56:47 +01:00
Ella Charlaix	fc6d8909d4	Merge branch 'main' into run_amd_scheduled_ci_caller_deepspeed_test	2023-12-05 16:28:40 +01:00
Ella Charlaix	84a7a3398d	fix comment	2023-12-05 16:13:11 +01:00
Ella Charlaix	9696cc4ef7	precompile deepspeed to avoid timeout during tests	2023-12-05 16:11:55 +01:00
Ella Charlaix	3332cd2eb5	tmp disable test	2023-12-05 11:47:57 +01:00
Ella Charlaix	40398b9a0f	fix	2023-12-05 11:47:45 +01:00
fxmarty	f0f931e2fc	Update docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile	2023-12-05 19:34:58 +09:00
Félix Marty	ba8cc9f74a	Merge branch 'main' into run_amd_scheduled_ci_caller_deepspeed_test	2023-12-04 16:54:29 +01:00
Felix Marty	785b63aed4	update docker & make tests pass	2023-12-04 15:30:47 +00:00
Ella Charlaix	f846b80bed	upgrade torch	2023-11-30 18:20:53 +01:00
Ella Charlaix	407cfe9722	remove deprecated deepspeed build option	2023-11-30 16:21:43 +01:00
Ella Charlaix	09fee9eac7	comment until docker image build scheduled test fix	2023-11-30 15:57:12 +01:00
Ella Charlaix	508ae294fd	fix	2023-11-30 15:37:41 +01:00
Ella Charlaix	090b88e204	add amd tests back	2023-11-30 11:26:04 +01:00
Ella Charlaix	70c3580feb	fix typo	2023-11-30 11:25:49 +01:00
Ella Charlaix	e16c271403	add back amd tests	2023-11-30 11:21:38 +01:00
Ella Charlaix	cbe995ff20	Trigger	2023-11-30 11:19:46 +01:00
Ella Charlaix	da4774c048	push new image	2023-11-29 19:07:15 +01:00
Ella Charlaix	971ba80a8c	replace test docker image with original image	2023-11-29 00:05:02 +01:00
Ella Charlaix	233bd7f07a	remove test suffix from docker image tag	2023-11-28 23:45:52 +01:00
Ella Charlaix	a47ac2ca1f	use new docker image	2023-11-28 17:13:25 +01:00
Ella Charlaix	a7033499d8	change runner env to trigger the docker build image test	2023-11-28 13:55:21 +01:00
Ella Charlaix	4cb9d6f54f	trigger	2023-11-28 12:15:49 +01:00
Ella Charlaix	a0c3dafbee	remove trigger for this branch	2023-11-27 19:23:02 +01:00
Ella Charlaix	c29d2492a5	trigger	2023-11-27 18:58:01 +01:00
Ella Charlaix	af46e872c6	enable tests	2023-11-23 19:25:14 +01:00
Ella Charlaix	5a9a5296ad	add comment	2023-11-23 19:18:13 +01:00
Ella Charlaix	2cfb53d1d5	add dockerfile	2023-11-23 19:16:33 +01:00
Ella Charlaix	bf276ed036	fix image	2023-11-22 00:32:51 +01:00
Ella Charlaix	1e8ce6607b	add deepspeed scheduled test for amd	2023-11-21 16:36:14 +01:00