mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 23:53:48 +08:00
Fix invalid f-strings detected by ruff. --------- Signed-off-by: cyy <cyyever@outlook.com> Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com> Co-authored-by: Michael Wyatt <michael.wyatt@snowflake.com>
154 lines
5.7 KiB
Python
Executable File
154 lines
5.7 KiB
Python
Executable File
# Copyright (c) Microsoft Corporation.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
# DeepSpeed Team
|
|
|
|
import torch
|
|
from deepspeed.utils import log_dist
|
|
import numpy as np
|
|
import logging
|
|
from deepspeed.utils.torch import required_torch_version
|
|
|
|
|
|
class Eigenvalue(object):
    """Estimate, per model block, the dominant eigenvalue of the Hessian via power iteration.

    The blocks are located by following ``layer_name`` (a dotted attribute path)
    from the module passed to :meth:`compute_eigenvalue` and indexing the result
    with ``0 .. layer_num - 1``.

    Args:
        verbose: When true, log the per-block result and the post-processed list.
        max_iter: Upper bound on power-iteration steps per block.
        tol: Relative change of the eigenvalue estimate below which iteration stops.
        stability: Constant added to the vector norm in :meth:`normalize`.
        gas_boundary_resolution: Stored on the instance; not used inside this class.
        layer_name: Dotted attribute path to the indexable container of blocks.
        layer_num: Number of blocks to process.

    Raises:
        AssertionError: If ``layer_name`` is empty or ``layer_num`` is not positive.
    """

    def __init__(self,
                 verbose=False,
                 max_iter=100,
                 tol=1e-2,
                 stability=0,
                 gas_boundary_resolution=1,
                 layer_name='',
                 layer_num=0):
        super().__init__()

        self.verbose = verbose
        self.max_iter = max_iter
        self.tol = tol
        self.stability = stability
        self.gas_boundary_resolution = gas_boundary_resolution
        self.layer_name = layer_name
        self.layer_num = layer_num

        assert len(self.layer_name) > 0 and layer_num > 0, \
            "layer_name must be non-empty and layer_num must be positive."

        log_dist(
            f'enabled eigenvalue with verbose={verbose}, max_iter={max_iter}, tol={tol}, stability={stability}, gas_boundary_resolution={gas_boundary_resolution}, layer_name={layer_name}, layer_num={layer_num}',
            ranks=[0])

    def nan_to_num(self, x):
        """Return ``x`` with every NaN, +inf and -inf entry replaced by zero."""
        if required_torch_version(min_version=1.8):
            return torch.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
        else:
            # Fallback to numpy based implementation for backwards-compatibility with PyTorch 1.7 or older versions.
            device = x.device
            x = x.cpu().numpy()
            x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
            return torch.from_numpy(x).to(device)

    def normalize(self, v):
        """Scale a list of tensors by their joint L2 norm (plus ``self.stability``).

        Non-finite entries produced by the division are zeroed via :meth:`nan_to_num`.

        Args:
            v: List of tensors treated together as one vector.

        Returns:
            A new list of tensors, one per input tensor.
        """
        norm = self.inner_product(v, v)**0.5 + self.stability
        return [self.nan_to_num(vector / norm) for vector in v]

    def inner_product(self, xs, ys):
        """Return the sum over paired tensors of ``torch.sum(x * y)``.

        For empty inputs the result is the integer ``0``.
        """
        return sum(torch.sum(x * y) for x, y in zip(xs, ys))

    def get_layers(self, module):
        """Resolve ``self.layer_name`` as a dotted attribute path starting at ``module``.

        Raises:
            AssertionError: If any path component is not an attribute of the object reached so far.
        """
        # str.split always yields at least one component, so no emptiness check is needed.
        m = module
        for name in self.layer_name.split('.'):
            assert hasattr(m, name), "layer_name configuration is invalid."
            m = getattr(m, name)

        return m

    def compute_eigenvalue(self, module, device=None, scale=1.0):
        """Run power iteration on every block and return the normalized eigenvalues.

        For each block, only parameters whose ``.grad`` exists and carries a
        ``grad_fn`` are used; Hessian-vector products are obtained by
        differentiating those gradients with ``torch.autograd.grad``.

        Args:
            module: Root object from which ``self.layer_name`` is resolved.
            device: Device for the random start vectors; ``None`` uses the torch default.
            scale: The iterate is divided by ``scale`` after each step and the final
                estimate is multiplied by ``scale``.

        Returns:
            A dict ``{id(param): (eigenvalue, block_index)}`` covering every parameter
            of every block, with eigenvalues post-processed by :meth:`post_process`.
            Returns an empty list as soon as a block has no usable parameter.
        """
        block_eigenvalue = []
        param_keys = []
        layers = self.get_layers(module)

        for block in range(self.layer_num):
            model_block = layers[block]

            # Single pass over the parameters; grads and start vectors are derived from it.
            params = [
                param for param in model_block.parameters()
                if param.grad is not None and param.grad.grad_fn is not None
            ]

            # Disable eigenvalue if the model doesn't support second order gradients computation,
            # e.g. when enabling DS transformer kernel.
            if not params:
                log_dist('The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING)
                return []

            grads = [param.grad for param in params]

            # We found this randn() has obvious accuracy impact in some cases, save/recover random state here.
            # NOTE(review): torch.random.get_rng_state/set_rng_state presumably cover only the
            # default CPU generator; if `device` is an accelerator its generator state would not
            # be restored here - confirm.
            rng_state = torch.random.get_rng_state()
            v = [torch.randn(param.size(), device=device) for param in params]
            torch.random.set_rng_state(rng_state)

            # Keys cover all parameters of the block, not only the ones used above.
            param_keys.append([id(p) for p in model_block.parameters()])

            v = self.normalize(v)

            iteration = 0
            eigenvalue_current, eigenvalue_previous = 1., 0.

            # Stop on the iteration cap, on a zero estimate (avoids dividing by zero below),
            # or once the relative change drops under the tolerance.
            while (iteration < self.max_iter and abs(eigenvalue_current) > 0
                   and abs((eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= self.tol):
                eigenvalue_previous = eigenvalue_current

                Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True)
                Hv = [self.nan_to_num(hv).float() for hv in Hv]

                eigenvalue_current = self.inner_product(Hv, v).item()

                v = self.normalize(Hv)
                v = [x / scale for x in v]
                iteration += 1

            eigenvalue_current *= scale
            block_eigenvalue.append(eigenvalue_current)

            if self.verbose:
                log_dist(f'block: {block}, power iteration: {iteration}, eigenvalue: {eigenvalue_current}',
                         ranks=[0])

        block_eigenvalue = self.post_process(block_eigenvalue)

        if self.verbose:
            log_dist(f'post processed block_eigenvalue: {block_eigenvalue}', ranks=[0])

        # {param_id: (eigenvalue, layer_id)}
        ev_dict = {}
        for layer_id, (layer_keys, value) in enumerate(zip(param_keys, block_eigenvalue)):
            ev_dict.update(dict.fromkeys(layer_keys, (value, layer_id)))

        return ev_dict

    # 1. Map all eigenvalues to [0, 1.0].
    # 2. Some layers can't generate valid eigenvalues on fp16 precision, use 1.0 instead.
    def post_process(self, value_list):
        """Normalize eigenvalues by the largest magnitude; zeros are replaced by 1.0.

        Args:
            value_list: List of raw eigenvalue estimates.

        Returns:
            A list of the same length; an empty input yields an empty list.
        """
        if not value_list:
            return []
        max_value = abs(max(value_list, key=abs))
        return [abs(v) / max_value if v != 0.0 else 1.0 for v in value_list]
|