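"""Unit tests for DeepSpeedCPUAdagrad.

Compares the DeepSpeed CPU Adagrad optimizer against torch.optim.Adagrad on
dense gradients (test_cpu_adagrad_opt) and on sparse, embedding-style
gradients (test_cpu_adagrad_opt_sparse_embedding).
"""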
import torch
import numpy as np
import pytest

import deepspeed
from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad
from deepspeed.ops.op_builder import CPUAdagradBuilder
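
# Skip the whole module when the CPU Adagrad extension cannot be built or
# loaded on this system.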
if not deepspeed.ops.__compatible_ops__[CPUAdagradBuilder.NAME]:
    pytest.skip("cpu-adagrad is not compatible", allow_module_level=True)


def check_equal(first, second, atol=1e-2, verbose=False):
    """Assert that two parameter tensors match element-wise within `atol`."""
    x = first.detach().numpy()
    y = second.detach().numpy()
    if verbose:
        print("x = {}".format(x.flatten()))
        print("y = {}".format(y.flatten()))
        print('-' * 80)
    np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)


@pytest.mark.parametrize('model_size',
                         [64, 22, 55, 127, 1024, 1048576, 30000000])  # yapf: disable
def test_cpu_adagrad_opt(model_size):
    """Compare DeepSpeedCPUAdagrad and torch.optim.Adagrad on dense gradients."""
    device = 'cpu'
    # Reset the RNG state before each randn() so both parameters start identical.
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)

    optimizer = DeepSpeedCPUAdagrad([param])
    optimizer1 = torch.optim.Adagrad([param1])

    for i in range(10):
        # Feed both optimizers the same random gradient on every step.
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        optimizer.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)


@pytest.mark.parametrize('model_size,vocabulary_size,dim',
                         [
                             (16 * 2, 16 * 4, 16),
                             (16 * 32, 16 * 256, 16),
                             (16 * 256, 16 * 16384, 16),
                         ])  # yapf: disable
def test_cpu_adagrad_opt_sparse_embedding(model_size, vocabulary_size, dim):
    """Compare DeepSpeedCPUAdagrad and torch.optim.Adagrad on sparse embedding gradients."""
    device = 'cpu'
    rng_state = torch.get_rng_state()

    def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device):
        """Build a 1-D sparse gradient matching a flattened (vocabulary_size * dim,) embedding."""
        # Pick random embedding rows and random values for those rows.
        i = torch.randint(vocabulary_size, size=(1, num_indices), dtype=torch.int64, device=device)
        v = torch.randn(num_indices, dim, dtype=dtype, device=device)
        t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device)
        t = t.coalesce()
        # Expand each row index into the `dim` flat element indices it covers.
        new_i = (t.indices().view(-1, 1).repeat(1, dim) * dim +
                 torch.tensor(range(dim))).flatten().unsqueeze(0)
        new_v = t.values().flatten()
        new_t = torch.sparse_coo_tensor(new_i, new_v, (vocabulary_size * dim, ), device=device)
        new_t = new_t.coalesce()
        new_t.requires_grad = False
        return new_t

    voc_size = vocabulary_size
    num_indices = int(model_size // dim)
    dtype = torch.float32

    # Identical starting parameters for both optimizers.
    param = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device),
                               requires_grad=True)
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device),
                                requires_grad=True)
    torch.set_rng_state(rng_state)

    optimizer = DeepSpeedCPUAdagrad([param])
    optimizer1 = torch.optim.Adagrad([param1])

    for i in range(10):
        # Reset the RNG state so both optimizers see the same sparse gradient.
        torch.set_rng_state(rng_state)
        param.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device)
        optimizer.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)
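

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test suite): the same checks
# can be run directly without pytest for a quick manual sanity check. The
# sizes below are taken from the parametrize lists above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    test_cpu_adagrad_opt(1024)
    test_cpu_adagrad_opt_sparse_embedding(16 * 32, 16 * 256, 16)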