Files
DeepSpeed/deepspeed/ops/adagrad/cpu_adagrad.py
Ma, Guokai 9548d48f48 Abstract accelerator (step 2) (#2560)
* Abstract accelerator (step 2)

* more flex op_builder path for both installation and runtime

* add SpatialInferenceBuilder into cuda_accelerator.py

* use reflection to make cuda_accelerator adapt to CUDA op builder change automatically

* clean up deepspeed/__init__.py

* add comments in cuda_accelerator for no torch path

* Update deepspeed/env_report.py

Change env_report.py according to suggestion

Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>

* reduce the range of try...except for better code clarity

* Add porting for deepspeed/ops/random_ltd/dropping_utils.py

* move accelerator to top directory and create symlink under deepspeed

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
2023-01-06 23:40:58 -05:00


'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder
from deepspeed.utils.logging import should_log_le
class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
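    """Adagrad optimizer whose parameter update runs on the host CPU via
    DeepSpeed's native CPU Adagrad extension. This is the optimizer driven by
    ZeRO-Offload when ``'offload_optimizer': 'cpu'`` is enabled in the ZeRO
    config (see the device assertion in ``step``).
    """
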
optimizer_id = 0
def __init__(self,
model_params,
lr=1e-2,
eps=1e-10,
weight_decay=0,
amsgrad=False,
fp32_optimizer_states=True):
default_args = dict(lr=lr, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)
super(DeepSpeedCPUAdagrad, self).__init__(model_params, default_args)
self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
self.fp32_optimizer_states = fp32_optimizer_states
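        # Load the native CPU Adagrad extension through the current accelerator's
        # op builder and register this optimizer instance with it under opt_id
        # (released again in __del__).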
self.ds_opt_adagrad = get_accelerator().create_op_builder(
CPUAdagradBuilder).load()
self.ds_opt_adagrad.create_adagrad(self.opt_id,
lr,
eps,
weight_decay,
should_log_le("info"))
def __del__(self):
# need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
# is used multiple times in the same process (notebook or pytest worker)
self.ds_opt_adagrad.destroy_adagrad(self.opt_id)
def __setstate__(self, state):
super(DeepSpeedCPUAdagrad, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
@torch.no_grad()
def step(self, closure=None, fp16_param_groups=None):
"""Update the model parameters.
.. note::
This method will be called internally by ZeRO-Offload. DeepSpeed
users should still use ``engine.step()`` as shown in the
`Getting Started
<https://www.deepspeed.ai/getting-started/#training>`_ guide.
Args:
closure (callable, optional): closure to compute the loss.
Defaults to ``None``.
fp16_param_groups: FP16 GPU parameters to update. Performing the
copy here reduces communication time. Defaults to ``None``.
Returns:
loss: if ``closure`` is provided. Otherwise ``None``.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
# intended device for step
device = torch.device('cpu')
for group_id, group in enumerate(self.param_groups):
for param_id, p in enumerate(group['params']):
if p.grad is None:
continue
assert p.device == device, f"CPUAdagrad param is on {p.device} and must be 'cpu', make " \
"sure you enabled 'offload_optimizer': 'cpu' in your ZeRO config."
state = self.state[p]
# State initialization
if len(state) == 0:
#print(f'group {group_id} param {param_id} = {p.numel()}')
state['step'] = 0
                    # use full precision by default unless self.fp32_optimizer_states is off
                    state_dtype = torch.float if self.fp32_optimizer_states else p.dtype

                    # gradient variances (running sum of squared gradients)
state['exp_avg_sq'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
state['step'] += 1
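                # Sparse gradients: pull out only the entries the gradient touches,
                # run the native update on those values, then write the updated
                # values back into the dense parameter and state tensors.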
                if p.grad.is_sparse:
sparse_param = p.sparse_mask(p.grad)
sparse_exp_avg_sq = state['exp_avg_sq'].sparse_mask(p.grad)
self.ds_opt_adagrad.adagrad_update(self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
sparse_param.values(),
p.grad.values(),
sparse_exp_avg_sq.values())
p[sparse_param.indices()] = sparse_param.values()
state['exp_avg_sq'][
sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values()
if fp16_param_groups is not None:
fp16_param_groups[group_id][param_id][
sparse_param.indices()] = sparse_param.values()
else:
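                    # Dense gradients: update the FP32 master parameters in place and,
                    # if an FP16 copy of the parameters was passed in, write the updated
                    # values into it as part of the same native call (this fused copy is
                    # what the docstring's note on fp16_param_groups refers to).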
if fp16_param_groups is not None:
self.ds_opt_adagrad.adagrad_update_copy(
self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
p.data,
p.grad.data,
state['exp_avg_sq'],
fp16_param_groups[group_id][param_id].data)
else:
self.ds_opt_adagrad.adagrad_update(self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
p.data,
p.grad.data,
state['exp_avg_sq'])
return loss
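
A minimal usage sketch, not part of the file above. It assumes the class is re-exported from deepspeed.ops.adagrad and that the native CPU Adagrad op can be built on the machine. The optimizer is constructed over CPU-resident parameters and stepped like any torch.optim optimizer; in real DeepSpeed runs ZeRO-Offload calls it internally and users keep calling engine.step() as the docstring notes.

import torch
from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad  # assumed re-export of the class above

# Parameters stay on the CPU, matching the device assertion in step().
model = torch.nn.Linear(16, 4)
optimizer = DeepSpeedCPUAdagrad(model.parameters(), lr=1e-2, weight_decay=1e-4)

x, y = torch.randn(8, 16), torch.randn(8, 4)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()       # runs the native CPU Adagrad update
optimizer.zero_grad()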