Update Gaudi Runners (#3593)
* test
* fix
* push
* in the morning
* fix backend
* run first
* set habana modules
* dynamo backend
* trigger
* remove on pr
* remove on file change
Commit 682691deac (parent 791055b484), committed via GitHub.
@@ -1,23 +1,22 @@
-name: Gaudi1 tests (scheduled)
+name: Gaudi3 tests (scheduled)

 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 2 * * *"
+  schedule: # every day at 6 AM UTC
+    - cron: "0 6 * * *"

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true

 jobs:
-  run_gaudi1_tests:
-    name: Test on Gaudi1
+  run-gaudi3-tests:
     runs-on:
-      group: aws-dl1-24xlarge
+      group: itac-bm-emr-gaudi3-dell-2gaudi

     container:
       image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
+      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
       env:
         OMPI_MCA_btl_vader_single_copy_mechanism: none
         PT_ENABLE_INT64_SUPPORT: 1
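The new container options pass --env HABANA_VISIBLE_DEVICES without a value, so the device list configured on the Gaudi3 host is forwarded as-is instead of being pinned to devices 0,1. A minimal sketch (standard library only, hypothetical check, not part of this commit) of how code inside the container could inspect that variable:

import os

# HABANA_VISIBLE_DEVICES is forwarded from the host runner; if it is unset,
# we assume every Gaudi device on the machine is visible (illustrative only).
visible = os.environ.get("HABANA_VISIBLE_DEVICES")
if visible is None:
    print("HABANA_VISIBLE_DEVICES not set: all devices visible")
else:
    device_ids = [int(part) for part in visible.split(",") if part.strip()]
    print(f"Gaudi devices exposed to this container: {device_ids}")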
@@ -50,28 +49,34 @@ jobs:
         run: |
           pip install -e .[testing] \
             git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \
-            git+https://github.com/huggingface/transformers.git@hpu-support
+            git+https://github.com/huggingface/transformers.git

       - name: Run CLI tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_cli

       - name: Run Core tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_core

       - name: Run Big Modeling tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_big_modeling

       - name: Run FSDP integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_fsdp

       - name: Run DeepSpeed integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_deepspeed

       - name: Run Examples tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_examples
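Each test step gains if: ${{ !cancelled() && (success() || failure()) }}, so every suite still runs when an earlier suite fails, while a cancelled workflow stops immediately. A rough Python analogy of that behaviour (hypothetical script, not part of the workflow; the make targets are the ones invoked by the steps above):

import subprocess
import sys

# Run every suite even if an earlier one failed, then report an overall failure,
# mirroring the workflow's per-step `success() || failure()` condition.
TARGETS = ["test_cli", "test_core", "test_big_modeling", "test_fsdp", "test_deepspeed", "test_examples"]

failed = []
for target in TARGETS:
    if subprocess.run(["make", target]).returncode != 0:
        failed.append(target)

if failed:
    print("Failed suites:", ", ".join(failed))
    sys.exit(1)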
@@ -16,7 +16,7 @@ import unittest
 import torch
 from torch.utils.benchmark import Timer

-from accelerate.test_utils import require_huggingface_suite, require_non_cpu, torch_device
+from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
 from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory


@@ -28,7 +28,13 @@ INFERENCE_ITERS = 100
 INFRENCE_STMT = "model(input_ids, use_cache=False)"
 COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"

+if torch_device == "hpu":
+    backend = "hpu_backend"
+else:
+    backend = "inductor"
+

+@require_non_hpu
 @require_huggingface_suite
 class RegionalCompilationTester(unittest.TestCase):
     def _get_model_and_inputs(self):
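The module-level switch selects hpu_backend on Gaudi and falls back to inductor elsewhere, and that string is forwarded to every compile_regions/torch.compile call in the tests below. A self-contained sketch of the same pattern on a toy model (the hasattr(torch, "hpu") probe stands in for accelerate's torch_device helper and is an assumption):

import torch
import torch.nn as nn

# Pick the dynamo backend the same way the test module does: HPU gets its own
# backend, everything else uses inductor.
if getattr(torch, "hpu", None) is not None and torch.hpu.is_available():
    backend = "hpu_backend"
else:
    backend = "inductor"

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
compiled = torch.compile(model, mode="reduce-overhead", backend=backend)

x = torch.randn(4, 8)
print(backend, compiled(x).shape)  # e.g. "inductor torch.Size([4, 2])"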
@@ -43,7 +49,7 @@ class RegionalCompilationTester(unittest.TestCase):

     def test_regions_are_compiled(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model, mode="reduce-overhead")
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)

         # Check that the compiled model keeps a reference to the original model
         assert hasattr(compiled_model, "_orig_mod")
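The assertion relies on the compiled wrapper keeping the original module reachable through _orig_mod, which is also how plain torch.compile behaves. A tiny sketch of that check (illustrative; compile_regions additionally wraps submodules region by region):

import torch
import torch.nn as nn

model = nn.Linear(4, 4)
compiled = torch.compile(model, backend="inductor")

# The wrapper returned by torch.compile keeps a reference to the original module.
assert hasattr(compiled, "_orig_mod")
assert compiled._orig_mod is model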
@@ -55,20 +61,20 @@ class RegionalCompilationTester(unittest.TestCase):

     def test_extract_model_keep_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)

         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=True)

         assert compiled_model._orig_mod is compiled_model_unwrapped._orig_mod

     def test_extract_model_remove_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)

         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=False)

         assert compiled_model._orig_mod is compiled_model_unwrapped
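These two tests pin down the keep_torch_compile contract of extract_model_from_parallel: with True, unwrapping DataParallel returns a compiled wrapper whose _orig_mod is the plain model; with False, the plain model itself comes back. A minimal CPU-only sketch of the same flow (toy model; DataParallel degrades to a passthrough without GPUs):

import torch
import torch.nn as nn

from accelerate.utils import compile_regions, extract_model_from_parallel

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 4))
compiled = compile_regions(model)

wrapped = torch.nn.parallel.DataParallel(model)
wrapped_compiled = compile_regions(wrapped)

# keep_torch_compile=True: the compile wrapper survives unwrapping.
kept = extract_model_from_parallel(wrapped_compiled, keep_torch_compile=True)
assert kept._orig_mod is compiled._orig_mod

# keep_torch_compile=False: the underlying module is returned directly.
plain = extract_model_from_parallel(wrapped_compiled, keep_torch_compile=False)
assert plain is compiled._orig_mod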
@@ -78,14 +84,14 @@ class RegionalCompilationTester(unittest.TestCase):
     def test_regional_compilation_cold_start(self):
         model, input_ids = self._get_model_and_inputs()

-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
             .median
         )

-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
@@ -109,14 +115,14 @@ class RegionalCompilationTester(unittest.TestCase):
             Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
         )

-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
             .median
         )

-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
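All of these benchmarks use torch.utils.benchmark.Timer and compare medians over a fixed iteration count, which is far less noisy than timing a single call. A cut-down sketch of the same measurement pattern on a toy module (model, shapes and iteration count are illustrative):

import torch
import torch.nn as nn
from torch.utils.benchmark import Timer

model = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))
compiled_model = torch.compile(model, backend="inductor")
x = torch.randn(8, 64)

# `globals` supplies the names referenced in `stmt`; `.timeit(n).median` returns
# the median per-call latency in seconds over n runs.
eager = Timer(stmt="model(x)", globals={"model": model, "x": x}).timeit(100).median
compiled = Timer(stmt="model(x)", globals={"model": compiled_model, "x": x}).timeit(100).median

print(f"eager: {eager * 1e6:.1f} us | compiled: {compiled * 1e6:.1f} us")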
@@ -33,10 +33,11 @@ from accelerate.test_utils import (
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_multi_device,
+    run_first,
     torch_device,
 )
 from accelerate.test_utils.testing import require_torch_min_version, require_transformers
-from accelerate.utils.imports import is_transformers_available, is_xccl_available
+from accelerate.utils.imports import is_hpu_available, is_transformers_available, is_xccl_available


 if is_transformers_available():
@@ -53,6 +54,8 @@ def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
     # pytorch built-in xccl will be available from PyTorch 2.9, will remove this after we have xccl
     if torch_device == "xpu" and not is_xccl_available():
         dist.init_process_group(backend="ccl", world_size=torch_accelerator_module.device_count())
+    elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
+        dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
     else:
         dist.init_process_group(world_size=torch_accelerator_module.device_count())
     try:
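The decorator now chooses the process-group backend per device: ccl for XPU builds that lack native XCCL, hccl for Gaudi once is_hpu_available(init_hccl=True) has loaded the HPU runtime, and PyTorch's default otherwise. A self-contained, single-process sketch of explicit backend selection (gloo and the rendezvous values are illustrative so it runs on any machine):

import os
import torch.distributed as dist

# Minimal single-process group; real runs get rank/world size from the launcher.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29511")

# Explicit backend selection, analogous to picking ccl/hccl per accelerator.
dist.init_process_group(backend="gloo", rank=0, world_size=1)
try:
    print(dist.get_backend(), dist.get_world_size())
finally:
    dist.destroy_process_group()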
@@ -188,6 +191,7 @@ def load_checkpoint_and_dispatch_ddp():
 @require_torch_min_version(version="2.4.0")
 @require_transformers
 @require_multi_device
+@run_first
 class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
     def setUp(self):
         self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)