Mirror of https://github.com/huggingface/accelerate.git (synced 2025-10-21 10:56:33 +08:00)
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle

import torch

from accelerate import Accelerator
from accelerate.test_utils import require_cpu, require_fp16, require_non_cpu
from accelerate.test_utils.testing import AccelerateTestCase
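
# `Accelerator.prepare` wraps a bare optimizer in an `AcceleratedOptimizer`;
# this test guards against the wrapper breaking pickling of training state.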
@require_cpu
class CPUOptimizerTester(AccelerateTestCase):
    def test_accelerated_optimizer_pickling(self):
        model = torch.nn.Linear(10, 10)
        optimizer = torch.optim.SGD(model.parameters(), 0.1)
        accelerator = Accelerator()
        optimizer = accelerator.prepare(optimizer)
        try:
            pickle.loads(pickle.dumps(optimizer))
        except Exception as e:
            self.fail(f"Accelerated optimizer pickling failed with {e}")
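
# Under fp16 mixed precision, the gradient scaler skips `optimizer.step()`
# whenever it finds inf/NaN gradients; `AcceleratedOptimizer` reports this
# through its `step_was_skipped` attribute, which is what we exercise here.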
@require_fp16
@require_non_cpu
class OptimizerTester(AccelerateTestCase):
    def test_accelerated_optimizer_step_was_skipped(self):
        model = torch.nn.Linear(5, 5)
        optimizer = torch.optim.SGD(model.parameters(), 0.1)
        accelerator = Accelerator(mixed_precision="fp16")
        model, optimizer = accelerator.prepare(model, optimizer)

        loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
        accelerator.backward(loss)
        for p in model.parameters():
            # Fake the gradients, as if there's no overflow
            p.grad.fill_(0.01)

        optimizer.step()
        assert optimizer.step_was_skipped is False

        loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
        accelerator.backward(loss)
        for p in model.parameters():
            p.grad.fill_(0.01)
            # Manually set the gradients to be NaN, as if there's an overflow
            p.grad[0] = torch.tensor(float("nan"))

        optimizer.step()
        assert optimizer.step_was_skipped is True

        loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
        accelerator.backward(loss)
        for p in model.parameters():
            p.grad.fill_(0.01)
            # Manually set the gradients to be NaN, as if there's an overflow
            p.grad[0] = torch.tensor(float("nan"))

        optimizer.step()
        assert optimizer.step_was_skipped is True

        loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
        accelerator.backward(loss)
        for p in model.parameters():
            # Fake the gradients, as if there's no overflow
            p.grad.fill_(0.01)

        optimizer.step()
        assert optimizer.step_was_skipped is False
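
# The sketch below is illustrative only and is never invoked by the test
# suite: a minimal training loop, assuming an fp16-capable device, that only
# advances the LR scheduler when the scaler actually applied the step. The
# helper name `_training_loop_sketch` is hypothetical, not part of the suite.
def _training_loop_sketch():
    accelerator = Accelerator(mixed_precision="fp16")
    model = torch.nn.Linear(5, 5)
    optimizer = torch.optim.SGD(model.parameters(), 0.1)
    # Build the scheduler on the underlying optimizer before `prepare`.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
    model, optimizer = accelerator.prepare(model, optimizer)

    for _ in range(3):
        optimizer.zero_grad()
        loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
        accelerator.backward(loss)
        optimizer.step()
        # Only step the scheduler on steps the scaler accepted, similar to
        # what accelerate's prepared schedulers do internally.
        if not optimizer.step_was_skipped:
            scheduler.step()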