# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
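# This file does double duty. Collected by the test runner (e.g.
# `python -m pytest tests/test_fp8.py`), the classes below shell out to
# `accelerate launch -m tests.test_fp8`; executed as that module, the
# `__main__` block at the bottom runs the FP8 conversion checks in-process
# inside the launched (possibly distributed) environment.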
import json
import os
import unittest

import torch

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.test_utils import (
    get_launch_command,
    require_cuda,
    require_cuda_or_hpu,
    require_huggingface_suite,
    require_multi_device,
    require_multi_gpu,
    require_torchao,
    require_transformer_engine,
    run_first,
)
from accelerate.test_utils.testing import require_deepspeed, run_command
from accelerate.utils import (
    AORecipeKwargs,
    FP8RecipeKwargs,
    has_ao_layers,
    has_transformer_engine_layers,
    is_torchao_available,
    is_transformer_engine_available,
)

def can_convert_te_model():
    # Build an FP8-enabled Accelerator with the TransformerEngine ("TE") backend.
    accelerator_kwargs = {"mixed_precision": "fp8", "kwargs_handlers": [FP8RecipeKwargs(backend="TE")]}
    accelerator = Accelerator(**accelerator_kwargs)
    dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
    model = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.Linear(32, 16))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
    # `prepare` should have swapped the plain `torch.nn.Linear` layers for their
    # TransformerEngine equivalents.
    assert has_transformer_engine_layers(model)


def maintain_proper_deepspeed_config(expected_version):
    # Sanity-check that the ZeRO stage requested through the launcher survived
    # into the active DeepSpeed plugin.
    assert AcceleratorState().deepspeed_plugin.zero_stage == expected_version, (
        f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
    )


def can_convert_ao_model():
    from transformers import AutoModelForSequenceClassification

    # Same check as above, but through the torchao backend via `AORecipeKwargs`.
    accelerator_kwargs = {"mixed_precision": "fp8", "kwargs_handlers": [AORecipeKwargs()]}
    accelerator = Accelerator(**accelerator_kwargs)
    dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
    assert has_ao_layers(model)

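# For reference, a training step with the prepared objects would follow the
# usual Accelerate pattern (a sketch for the small sequential model above;
# the conversion checks themselves never take a step):
#
#     for batch in dataloader:
#         optimizer.zero_grad()
#         loss = model(batch).sum()
#         accelerator.backward(loss)
#         optimizer.step()
#         scheduler.step()
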
@run_first
@require_cuda_or_hpu
@require_transformer_engine
class TestTransformerEngine(unittest.TestCase):
    def test_can_prepare_model_single_gpu(self):
        command = get_launch_command(num_processes=1, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8"]
        run_command(command)

    @require_multi_device
    def test_can_prepare_model_multi_gpu(self):
        command = get_launch_command(num_processes=2, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8"]
        run_command(command)

    @require_deepspeed
    @require_multi_device
    def test_can_prepare_model_multigpu_deepspeed(self):
        for zero_stage in [1, 2, 3]:
            # Propagate the stage to the launched subprocess so the `__main__`
            # block can verify it via `maintain_proper_deepspeed_config`.
            os.environ["ZERO_STAGE"] = str(zero_stage)
            ds_config = {
                "bf16": {"enabled": True},
                "zero_optimization": {
                    "stage": zero_stage,
                    "allgather_partitions": True,
                    "allgather_bucket_size": 2e8,
                    "overlap_comm": True,
                    "reduce_scatter": True,
                    "reduce_bucket_size": 2e8,
                    "contiguous_gradients": True,
                },
                "gradient_accumulation_steps": 1,
                "gradient_clipping": "auto",
                "steps_per_print": 2000,
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "wall_clock_breakdown": False,
            }

            ds_config = json.dumps(ds_config)

            command = get_launch_command(
                num_processes=2, monitor_interval=0.1, use_deepspeed=True, deepspeed_config_file=ds_config
            )
            command += ["-m", "tests.test_fp8"]
            run_command(command)

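# The torchao suite below mirrors the TransformerEngine tests above, with
# CUDA-only requirements; in the launched module the model conversion goes
# through the `AORecipeKwargs` path instead.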
@require_torchao
@require_huggingface_suite
class TestTorchAO(unittest.TestCase):
    @require_cuda
    def test_can_prepare_model_single_gpu(self):
        command = get_launch_command(num_processes=1, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8"]
        run_command(command)

    @require_multi_gpu
    def test_can_prepare_model_multi_gpu(self):
        command = get_launch_command(num_processes=2, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8"]
        run_command(command)

    @require_deepspeed
    @require_multi_gpu
    def test_can_prepare_model_multigpu_deepspeed(self):
        for zero_stage in [1, 2, 3]:
            os.environ["ZERO_STAGE"] = str(zero_stage)
            ds_config = {
                "bf16": {"enabled": True},
                "zero_optimization": {
                    "stage": zero_stage,
                    "allgather_partitions": True,
                    "allgather_bucket_size": 2e8,
                    "overlap_comm": True,
                    "reduce_scatter": True,
                    "reduce_bucket_size": 2e8,
                    "contiguous_gradients": True,
                },
                "gradient_accumulation_steps": 1,
                "gradient_clipping": "auto",
                "steps_per_print": 2000,
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "wall_clock_breakdown": False,
            }

            ds_config = json.dumps(ds_config)

            command = get_launch_command(
                num_processes=2, monitor_interval=0.1, use_deepspeed=True, deepspeed_config_file=ds_config
            )
            command += ["-m", "tests.test_fp8"]
            run_command(command)

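# Everything below runs only inside the processes spawned by the tests above
# (or when this module is launched by hand with
# `accelerate launch -m tests.test_fp8`).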
if __name__ == "__main__":
    # TE suite
    if is_transformer_engine_available():
        can_convert_te_model()
        if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
            # ZERO_STAGE is set by the parent test process before launching.
            maintain_proper_deepspeed_config(int(os.environ.get("ZERO_STAGE")))
    # AO suite
    if is_torchao_available():
        can_convert_ao_model()