Update Gaudi Runners (#3593)

* test

* fix

* push

* in the morning

* fix backend

* run first

* set habana modules

* dynamo backend

* trigger

* remove on pr

* remove on file change

Author: Ilyas Moutawwakil
Date: 2025-06-03 12:36:56 +02:00
Committed by: GitHub
Parent: 791055b484
Commit: 682691deac
3 changed files with 34 additions and 19 deletions

@@ -1,23 +1,22 @@
name: Gaudi1 tests (scheduled)
name: Gaudi3 tests (scheduled)
on:
workflow_dispatch:
schedule:
- cron: "0 2 * * *"
schedule: # every day at 6 AM UTC
- cron: "0 6 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
run_gaudi1_tests:
name: Test on Gaudi1
run-gaudi3-tests:
runs-on:
group: aws-dl1-24xlarge
group: itac-bm-emr-gaudi3-dell-2gaudi
container:
image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
env:
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_ENABLE_INT64_SUPPORT: 1
@@ -50,28 +49,34 @@ jobs:
run: |
pip install -e .[testing] \
git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \
git+https://github.com/huggingface/transformers.git@hpu-support
git+https://github.com/huggingface/transformers.git
- name: Run CLI tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_cli
- name: Run Core tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_core
- name: Run Big Modeling tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_big_modeling
- name: Run FSDP integration tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_fsdp
- name: Run DeepSpeed integration tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_deepspeed
- name: Run Examples tests
if: ${{ !cancelled() && (success() || failure()) }}
run: |
make test_examples
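
Note: the updated workflow targets the itac-bm-emr-gaudi3-dell-2gaudi runner group, runs daily at 6 AM UTC, passes HABANA_VISIBLE_DEVICES through from the host instead of pinning it to 0,1, and guards each test step with !cancelled() && (success() || failure()) so later suites still run after an earlier failure but not after a cancellation. The snippet below is a minimal sketch, not part of this PR, for checking that the Gaudi runtime these jobs expect is actually visible before invoking the make targets locally; it assumes accelerate and habana_frameworks.torch are installed as in the container image above and that the Habana plugin exposes torch.hpu.device_count().

import torch
from accelerate.utils.imports import is_hpu_available

# is_hpu_available() attempts to import habana_frameworks.torch, so this also
# runs harmlessly on machines without Gaudi hardware.
if is_hpu_available():
    # torch.hpu is registered by the Habana plugin (assumed API: device_count()).
    print(f"HPU devices visible: {torch.hpu.device_count()}")
else:
    print("No HPU runtime detected; the Gaudi-specific suites would be skipped here.")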

@@ -16,7 +16,7 @@ import unittest
import torch
from torch.utils.benchmark import Timer
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, torch_device
from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory
@@ -28,7 +28,13 @@ INFERENCE_ITERS = 100
INFRENCE_STMT = "model(input_ids, use_cache=False)"
COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"
if torch_device == "hpu":
backend = "hpu_backend"
else:
backend = "inductor"
@require_non_hpu
@require_huggingface_suite
class RegionalCompilationTester(unittest.TestCase):
def _get_model_and_inputs(self):
@@ -43,7 +49,7 @@ class RegionalCompilationTester(unittest.TestCase):
def test_regions_are_compiled(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model, mode="reduce-overhead")
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
# Check that the compiled model keeps a reference to the original model
assert hasattr(compiled_model, "_orig_mod")
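
Note: the tests now pass an explicit Dynamo backend, hpu_backend (registered by habana_frameworks.torch) when the test device is HPU and the default inductor otherwise. Below is a minimal sketch of the same selection applied to a plain torch.compile call; the toy model and shapes are illustrative and not taken from the test suite.

import torch
from accelerate.test_utils import torch_device

# Same selection as in the hunk above: Habana's Dynamo backend on HPU, inductor elsewhere.
backend = "hpu_backend" if torch_device == "hpu" else "inductor"

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4))
compiled = torch.compile(model, mode="reduce-overhead", backend=backend)
print(compiled(torch.randn(2, 16)).shape)
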
@@ -55,20 +61,20 @@ class RegionalCompilationTester(unittest.TestCase):
def test_extract_model_keep_torch_compile(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model)
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
distributed_model = torch.nn.parallel.DataParallel(model)
distributed_compiled_model = compile_regions(distributed_model)
distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=True)
assert compiled_model._orig_mod is compiled_model_unwrapped._orig_mod
def test_extract_model_remove_torch_compile(self):
model, _ = self._get_model_and_inputs()
compiled_model = compile_regions(model)
compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
distributed_model = torch.nn.parallel.DataParallel(model)
distributed_compiled_model = compile_regions(distributed_model)
distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=False)
assert compiled_model._orig_mod is compiled_model_unwrapped
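
Note: these assertions pin down how extract_model_from_parallel treats a torch.compile wrapper around a DataParallel model. Below is a rough standalone sketch of the same behaviour, using plain torch.compile instead of compile_regions and a toy single-layer module: with keep_torch_compile=True only the DataParallel layer is stripped, with False the original module comes back.

import torch
from accelerate.utils import extract_model_from_parallel

model = torch.nn.Linear(8, 8)
parallel_model = torch.nn.parallel.DataParallel(model)
compiled_parallel_model = torch.compile(parallel_model)  # compile wrapper outermost, as in the tests above

kept = extract_model_from_parallel(compiled_parallel_model, keep_torch_compile=True)
assert kept._orig_mod is model  # DataParallel stripped, compile wrapper kept

plain = extract_model_from_parallel(compiled_parallel_model, keep_torch_compile=False)
assert plain is model  # both wrappers stripped
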
@@ -78,14 +84,14 @@ class RegionalCompilationTester(unittest.TestCase):
def test_regional_compilation_cold_start(self):
model, input_ids = self._get_model_and_inputs()
regional_compilation_model = compile_regions(model)
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
regional_compilation_cold_start = (
Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
.timeit(COMPILE_ITERS)
.median
)
full_compilation_model = torch.compile(model)
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
full_compilation_cold_start = (
Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
.timeit(COMPILE_ITERS)
@@ -109,14 +115,14 @@ class RegionalCompilationTester(unittest.TestCase):
Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
)
regional_compilation_model = compile_regions(model)
regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
regional_compilation_inference_latency = (
Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
.timeit(INFERENCE_ITERS)
.median
)
full_compilation_model = torch.compile(model)
full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
full_compilation_inference_latency = (
Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
.timeit(INFERENCE_ITERS)

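Note: the latency tests above compare regional compilation (accelerate's compile_regions) against full compilation using torch.utils.benchmark.Timer. Below is a stripped-down sketch of the same measurement pattern on a toy repeated-block model; the sizes, iteration counts, and the assumption that compile_regions accepts this toy module are illustrative only.

import torch
from torch.utils.benchmark import Timer
from accelerate.test_utils import torch_device
from accelerate.utils import compile_regions

backend = "hpu_backend" if torch_device == "hpu" else "inductor"

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A ModuleList of identical blocks is the structure regional compilation targets.
        self.blocks = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(4))

    def forward(self, x):
        for block in self.blocks:
            x = torch.nn.functional.relu(block(x))
        return x

model = ToyModel()
x = torch.randn(8, 64)

regional = compile_regions(model, mode="reduce-overhead", backend=backend)
full = torch.compile(model, mode="reduce-overhead", backend=backend)

regional(x)  # trigger compilation once so the timings below measure steady state
full(x)
stmt = "m(x)"
regional_latency = Timer(stmt=stmt, globals={"m": regional, "x": x}).timeit(20).median
full_latency = Timer(stmt=stmt, globals={"m": full, "x": x}).timeit(20).median
print(f"regional: {regional_latency:.6f}s  full: {full_latency:.6f}s")
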
@@ -33,10 +33,11 @@ from accelerate.test_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_multi_device,
run_first,
torch_device,
)
from accelerate.test_utils.testing import require_torch_min_version, require_transformers
from accelerate.utils.imports import is_transformers_available, is_xccl_available
from accelerate.utils.imports import is_hpu_available, is_transformers_available, is_xccl_available
if is_transformers_available():
@@ -53,6 +54,8 @@ def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
# pytorch built-in xccl will be available from PyTorch 2.9, will remove this after we have xccl
if torch_device == "xpu" and not is_xccl_available():
dist.init_process_group(backend="ccl", world_size=torch_accelerator_module.device_count())
elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
else:
dist.init_process_group(world_size=torch_accelerator_module.device_count())
try:
@@ -188,6 +191,7 @@ def load_checkpoint_and_dispatch_ddp():
@require_torch_min_version(version="2.4.0")
@require_transformers
@require_multi_device
@run_first
class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
def setUp(self):
self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
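
Note: on Gaudi, the distributed tests now create the process group with the hccl backend, and is_hpu_available(init_hccl=True) loads Habana's collective plugin first. Below is a minimal single-process sketch of the same branching, not part of this PR; the MASTER_ADDR/MASTER_PORT values and the world size of 1 are placeholders for local experimentation.

import os
import torch.distributed as dist
from accelerate.test_utils import torch_device
from accelerate.utils.imports import is_hpu_available

# env:// rendezvous for a single local process.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

if torch_device == "hpu" and is_hpu_available(init_hccl=True):
    dist.init_process_group(backend="hccl", world_size=1)
else:
    dist.init_process_group(world_size=1)  # PyTorch picks its default backend (e.g. gloo/nccl)

dist.destroy_process_group()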