Update Gaudi Runners (#3593)

* test

* fix

* push

* in the morning

* fix backend

* run first

* set habana modules

* dynamo backend

* trigger

* remove on pr

* remove on file change

Author: Ilyas Moutawwakil
Date: 2025-06-03 12:36:56 +02:00
Committed by: GitHub
Parent: 791055b484
Commit: 682691deac
3 changed files with 34 additions and 19 deletions

@@ -1,23 +1,22 @@
name: Gaudi1 tests (scheduled)
name: Gaudi3 tests (scheduled)
on:
workflow_dispatch:
schedule:
- cron: "0 2 * * *"
schedule: # every day at 6 AM UTC
- cron: "0 6 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
run_gaudi1_tests:
name: Test on Gaudi1
run-gaudi3-tests:
runs-on:
group: aws-dl1-24xlarge
group: itac-bm-emr-gaudi3-dell-2gaudi
container:
image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
env:
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_ENABLE_INT64_SUPPORT: 1
@@ -50,28 +49,34 @@ jobs:
run: |
pip install -e .[testing] \
git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \
git+https://github.com/huggingface/transformers.git@hpu-support
git+https://github.com/huggingface/transformers.git
- name: Run CLI tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_cli
- name: Run Core tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_core
- name: Run Big Modeling tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_big_modeling
- name: Run FSDP integration tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_fsdp
- name: Run DeepSpeed integration tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_deepspeed
- name: Run Examples tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_examples
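
Note: the updated workflow targets the itac-bm-emr-gaudi3-dell-2gaudi runner group, runs daily at 6 AM UTC, passes HABANA_VISIBLE_DEVICES through from the host instead of pinning it to 0,1, and guards each test step with !cancelled() && (success() || failure()) so later suites still run after an earlier failure but not after a cancellation. The snippet below is a minimal sketch, not part of this PR, for checking that the Gaudi runtime these jobs expect is actually visible before invoking the make targets locally; it assumes accelerate and habana_frameworks.torch are installed as in the container image above and that the Habana plugin exposes torch.hpu.device_count().

import torch
from accelerate.utils.imports import is_hpu_available

# is_hpu_available() attempts to import habana_frameworks.torch, so this also
# runs harmlessly on machines without Gaudi hardware.
if is_hpu_available():
    # torch.hpu is registered by the Habana plugin (assumed API: device_count()).
    print(f"HPU devices visible: {torch.hpu.device_count()}")
else:
    print("No HPU runtime detected; the Gaudi-specific suites would be skipped here.")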

@@ -16,7 +16,7 @@ import unittest
import torch
from torch.utils.benchmark import Timer
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, torch_device
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory
@@ -28,7 +28,13 @@ INFERENCE_ITERS = 100
INFRENCE_STMT = "model(input_ids, use_cache=False)"
COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"
if torch_device == "hpu":
backend = "hpu_backend"
else:
backend = "inductor"
@require_non_hpu
@require_huggingface_suite
class RegionalCompilationTester(unittest.TestCase):
def _get_model_and_inputs(self):
@@ -43,7 +49,7 @@ class RegionalCompilationTester(unittest.TestCase):
def test_regions_are_compiled(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model, mode="reduce-overhead")
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
# Check that the compiled model keeps a reference to the original model
assert hasattr(compiled_model, "_orig_mod")
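
Note: the tests now pass an explicit Dynamo backend, hpu_backend (registered by habana_frameworks.torch) when the test device is HPU and the default inductor otherwise. Below is a minimal sketch of the same selection applied to a plain torch.compile call; the toy model and shapes are illustrative and not taken from the test suite.

import torch
from accelerate.test_utils import torch_device

# Same selection as in the hunk above: Habana's Dynamo backend on HPU, inductor elsewhere.
backend = "hpu_backend" if torch_device == "hpu" else "inductor"

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4))
compiled = torch.compile(model, mode="reduce-overhead", backend=backend)
print(compiled(torch.randn(2, 16)).shape)
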
@@ -55,20 +61,20 @@ class RegionalCompilationTester(unittest.TestCase):
def test_extract_model_keep_torch_compile(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model)
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
distributed_model = torch.nn.parallel.DataParallel(model)
distributed_compiled_model = compile_regions(distributed_model)
distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=True)
assert compiled_model._orig_mod is compiled_model_unwrapped._orig_mod
def test_extract_model_remove_torch_compile(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model)
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
distributed_model = torch.nn.parallel.DataParallel(model)
distributed_compiled_model = compile_regions(distributed_model)
distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=False)
assert compiled_model._orig_mod is compiled_model_unwrapped
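
Note: these assertions pin down how extract_model_from_parallel treats a torch.compile wrapper around a DataParallel model. Below is a rough standalone sketch of the same behaviour, using plain torch.compile instead of compile_regions and a toy single-layer module: with keep_torch_compile=True only the DataParallel layer is stripped, with False the original module comes back.

import torch
from accelerate.utils import extract_model_from_parallel

model = torch.nn.Linear(8, 8)
parallel_model = torch.nn.parallel.DataParallel(model)
compiled_parallel_model = torch.compile(parallel_model)  # compile wrapper outermost, as in the tests above

kept = extract_model_from_parallel(compiled_parallel_model, keep_torch_compile=True)
assert kept._orig_mod is model  # DataParallel stripped, compile wrapper kept

plain = extract_model_from_parallel(compiled_parallel_model, keep_torch_compile=False)
assert plain is model  # both wrappers stripped
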
@@ -78,14 +84,14 @@ class RegionalCompilationTester(unittest.TestCase):
def test_regional_compilation_cold_start(self):
model, input_ids = self._get_model_and_inputs()
regional_compilation_model = compile_regions(model)
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
regional_compilation_cold_start = (
Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
.timeit(COMPILE_ITERS)
.median
)
full_compilation_model = torch.compile(model)
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
full_compilation_cold_start = (
Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
.timeit(COMPILE_ITERS)
@@ -109,14 +115,14 @@ class RegionalCompilationTester(unittest.TestCase):
Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
)
regional_compilation_model = compile_regions(model)
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
regional_compilation_inference_latency = (
Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
.timeit(INFERENCE_ITERS)
.median
)
full_compilation_model = torch.compile(model)
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
full_compilation_inference_latency = (
Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
.timeit(INFERENCE_ITERS)

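Note: the latency tests above compare regional compilation (accelerate's compile_regions) against full compilation using torch.utils.benchmark.Timer. Below is a stripped-down sketch of the same measurement pattern on a toy repeated-block model; the sizes, iteration counts, and the assumption that compile_regions accepts this toy module are illustrative only.

import torch
from torch.utils.benchmark import Timer
from accelerate.test_utils import torch_device
from accelerate.utils import compile_regions

backend = "hpu_backend" if torch_device == "hpu" else "inductor"

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A ModuleList of identical blocks is the structure regional compilation targets.
        self.blocks = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(4))

    def forward(self, x):
        for block in self.blocks:
            x = torch.nn.functional.relu(block(x))
        return x

model = ToyModel()
x = torch.randn(8, 64)

regional = compile_regions(model, mode="reduce-overhead", backend=backend)
full = torch.compile(model, mode="reduce-overhead", backend=backend)

regional(x)  # trigger compilation once so the timings below measure steady state
full(x)
stmt = "m(x)"
regional_latency = Timer(stmt=stmt, globals={"m": regional, "x": x}).timeit(20).median
full_latency = Timer(stmt=stmt, globals={"m": full, "x": x}).timeit(20).median
print(f"regional: {regional_latency:.6f}s  full: {full_latency:.6f}s")
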
@@ -33,10 +33,11 @@ from accelerate.test_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_multi_device,
run_first,
torch_device,
)
from accelerate.test_utils.testing import require_torch_min_version, require_transformers
from accelerate.utils.imports import is_transformers_available, is_xccl_available
from accelerate.utils.imports import is_hpu_available, is_transformers_available, is_xccl_available
if is_transformers_available():
@@ -53,6 +54,8 @@ def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
# pytorch built-in xccl will be available from PyTorch 2.9, will remove this after we have xccl
if torch_device == "xpu" and not is_xccl_available():
dist.init_process_group(backend="ccl", world_size=torch_accelerator_module.device_count())
elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
else:
dist.init_process_group(world_size=torch_accelerator_module.device_count())
try:
@@ -188,6 +191,7 @@ def load_checkpoint_and_dispatch_ddp():
@require_torch_min_version(version="2.4.0")
@require_transformers
@require_multi_device
@run_first
class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
def setUp(self):
self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
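
Note: on Gaudi, the distributed tests now create the process group with the hccl backend, and is_hpu_available(init_hccl=True) loads Habana's collective plugin first. Below is a minimal single-process sketch of the same branching, not part of this PR; the MASTER_ADDR/MASTER_PORT values and the world size of 1 are placeholders for local experimentation.

import os
import torch.distributed as dist
from accelerate.test_utils import torch_device
from accelerate.utils.imports import is_hpu_available

# env:// rendezvous for a single local process.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

if torch_device == "hpu" and is_hpu_available(init_hccl=True):
    dist.init_process_group(backend="hccl", world_size=1)
else:
    dist.init_process_group(world_size=1)  # PyTorch picks its default backend (e.g. gloo/nccl)

dist.destroy_process_group()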