mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 15:33:51 +08:00
* Abstract accelerator (step 2)
* more flex op_builder path for both installation and runtime
* add SpatialInferenceBuilder into cuda_accelerator.py
* use reflection to make cuda_accelerator adapt to CUDA op builder change automatically
* clean up deepspeed/__init__.py
* add comments in cuda_accelerator for no torch path
* Update deepspeed/env_report.py according to suggestion (Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>)
* reduce the range of try...except for better code clarity
* Add porting for deepspeed/ops/random_ltd/dropping_utils.py
* move accelerator to top directory and create symlink under deepspeed

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
141 lines
6.1 KiB
Python
Executable File
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''

import torch
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder
from deepspeed.utils.logging import should_log_le

class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
    optimizer_id = 0

    def __init__(self,
                 model_params,
                 lr=1e-2,
                 eps=1e-10,
                 weight_decay=0,
                 amsgrad=False,
                 fp32_optimizer_states=True):

        default_args = dict(lr=lr, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)
        super(DeepSpeedCPUAdagrad, self).__init__(model_params, default_args)

        # each instance gets a unique id so the C++ backend can keep per-optimizer state
        self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
        DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
        self.fp32_optimizer_states = fp32_optimizer_states
        # load the compiled CPU Adagrad op through the accelerator abstraction
        self.ds_opt_adagrad = get_accelerator().create_op_builder(
            CPUAdagradBuilder).load()

        self.ds_opt_adagrad.create_adagrad(self.opt_id,
                                           lr,
                                           eps,
                                           weight_decay,
                                           should_log_le("info"))

    def __del__(self):
        # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
        # is used multiple times in the same process (notebook or pytest worker)
        self.ds_opt_adagrad.destroy_adagrad(self.opt_id)

    def __setstate__(self, state):
        super(DeepSpeedCPUAdagrad, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None, fp16_param_groups=None):
        """Update the model parameters.

        .. note::
            This method will be called internally by ZeRO-Offload. DeepSpeed
            users should still use ``engine.step()`` as shown in the
            `Getting Started
            <https://www.deepspeed.ai/getting-started/#training>`_ guide.

        Args:
            closure (callable, optional): closure to compute the loss.
                Defaults to ``None``.
            fp16_param_groups: FP16 GPU parameters to update, indexed as
                ``fp16_param_groups[group_id][param_id]`` so that it mirrors
                ``param_groups``. Performing the copy here reduces
                communication time. Defaults to ``None``.

        Returns:
            loss: if ``closure`` is provided. Otherwise ``None``.
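
        Example (illustrative sketch, not from the original docstring; in
        normal DeepSpeed training prefer ``engine.step()`` as noted above,
        and ``model``/``inputs`` below are placeholders)::

            optimizer = DeepSpeedCPUAdagrad(model.parameters(), lr=1e-2,
                                            weight_decay=0.01)
            loss = model(inputs).pow(2).mean()  # any scalar loss on CPU tensors
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()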
        """

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # intended device for step
        device = torch.device('cpu')

        for group_id, group in enumerate(self.param_groups):
            for param_id, p in enumerate(group['params']):

                if p.grad is None:
                    continue

                assert p.device == device, f"CPUAdagrad param is on {p.device} and must be 'cpu', make " \
                    "sure you enabled 'offload_optimizer': 'cpu' in your ZeRO config."
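                # For reference, a hypothetical ZeRO config fragment that routes
                # optimizer state to the CPU (illustrative only; the exact schema
                # may differ between DeepSpeed versions):
                #   "zero_optimization": {
                #       "offload_optimizer": {"device": "cpu"}
                #   }
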
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    #print(f'group {group_id} param {param_id} = {p.numel()}')
                    state['step'] = 0

                    #use full precision by default unless self.fp32_optimizer_states is off
                    state_dtype = torch.float if self.fp32_optimizer_states else p.dtype

                    #memory_format=torch.preserve_format)
                    # gradient variances
                    state['exp_avg_sq'] = torch.zeros_like(p.data,
                                                           dtype=state_dtype,
                                                           device='cpu')
                    #memory_format=torch.preserve_format)

                state['step'] += 1

                if p.grad.is_sparse:
                    sparse_param = p.sparse_mask(p.grad)
                    sparse_exp_avg_sq = state['exp_avg_sq'].sparse_mask(p.grad)
                    self.ds_opt_adagrad.adagrad_update(self.opt_id,
                                                       state['step'],
                                                       group['lr'],
                                                       group['eps'],
                                                       group['weight_decay'],
                                                       sparse_param.values(),
                                                       p.grad.values(),
                                                       sparse_exp_avg_sq.values())
                    p[sparse_param.indices()] = sparse_param.values()
                    state['exp_avg_sq'][
                        sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values()
                    if fp16_param_groups is not None:
                        fp16_param_groups[group_id][param_id][
                            sparse_param.indices()] = sparse_param.values()
                else:
                    if fp16_param_groups is not None:
                        self.ds_opt_adagrad.adagrad_update_copy(
                            self.opt_id,
                            state['step'],
                            group['lr'],
                            group['eps'],
                            group['weight_decay'],
                            p.data,
                            p.grad.data,
                            state['exp_avg_sq'],
                            fp16_param_groups[group_id][param_id].data)
                    else:
                        self.ds_opt_adagrad.adagrad_update(self.opt_id,
                                                           state['step'],
                                                           group['lr'],
                                                           group['eps'],
                                                           group['weight_decay'],
                                                           p.data,
                                                           p.grad.data,
                                                           state['exp_avg_sq'])
        return loss
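
# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original file).
# Assumes the CPUAdagrad extension can be built/loaded on this machine and that
# the model lives on the CPU. In normal DeepSpeed training the optimizer is
# created via deepspeed.initialize() with optimizer offload enabled, and
# engine.step() is called instead of calling step() directly.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model = torch.nn.Linear(8, 4)  # tiny CPU model, purely for demonstration
    optimizer = DeepSpeedCPUAdagrad(model.parameters(), lr=1e-2, weight_decay=0.01)

    loss = model(torch.randn(2, 8)).pow(2).mean()
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

    # dropping the reference triggers __del__, which frees the C++ optimizer state
    del optimizer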