pytorch/caffe2/python/operator_test/adagrad_test_helper.py
Jongsoo Park 42e9a619b3 add decay parameter in ref_adagrad (#15329)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15329

Add a decay parameter to match the C++ Adagrad implementation.

Reviewed By: chocjy

Differential Revision: D13300991

fbshipit-source-id: db734df0202d8f5fd156f2742207d0b5a3aa7348
2019-05-07 18:58:58 -07:00
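
For context, the decay factor rescales the accumulated squared-gradient state before each step: decay=1.0 reproduces standard Adagrad, while smaller values let old gradient history fade. Below is a minimal NumPy sketch of the dense update that ref_adagrad in this file implements; the variable names and toy values are chosen only for illustration.

import numpy as np

param = np.array([1.0, -2.0], dtype=np.float32)   # toy parameters
moment = np.array([0.5, 0.5], dtype=np.float32)   # accumulated squared gradients
grad = np.array([0.1, -0.3], dtype=np.float32)    # toy gradient
lr, epsilon, decay = 0.01, 1e-5, 0.95

moment_new = decay * moment + np.square(grad)        # decayed accumulator; decay=1.0 is plain Adagrad
effective_lr = lr / (np.sqrt(moment_new) + epsilon)  # per-element step size
param_new = param + effective_lr * grad              # caffe2 applies the adjusted gradient with "+"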


from __future__ import absolute_import, division, print_function, unicode_literals

from functools import partial

import caffe2.python.hypothesis_test_util as hu
import numpy as np
from caffe2.python import core


def ref_adagrad(
    param_in,
    mom_in,
    grad,
    lr,
    epsilon,
    using_fp16=False,
    output_effective_lr=False,
    output_effective_lr_and_update=False,
    decay=1.0,
    row_wise=False,
):
    # Compute the reference update in fp32 even when the inputs are fp16.
    mom_in_f32 = mom_in
    param_in_f32 = param_in
    if using_fp16:
        mom_in_f32 = mom_in.astype(np.float32)
        param_in_f32 = param_in.astype(np.float32)

    # Decayed accumulation of squared gradients; decay=1.0 is plain Adagrad.
    # Row-wise Adagrad keeps a single accumulator per row (mean of squares).
    if row_wise:
        mom_out = decay * mom_in_f32 + np.mean(np.square(grad))
    else:
        mom_out = decay * mom_in_f32 + np.square(grad)
    effective_lr = lr / (np.sqrt(mom_out) + epsilon)
    grad_adj = effective_lr * grad
    param_out = param_in_f32 + grad_adj

    # Cast the outputs back to the input precision before returning.
    if output_effective_lr_and_update:
        if using_fp16:
            return (
                param_out.astype(np.float16),
                mom_out.astype(np.float16),
                effective_lr.astype(np.float16),
                grad_adj.astype(np.float16),
            )
        else:
            return (
                param_out.astype(np.float32),
                mom_out.astype(np.float32),
                effective_lr.astype(np.float32),
                grad_adj.astype(np.float32),
            )
    elif output_effective_lr:
        if using_fp16:
            return (
                param_out.astype(np.float16),
                mom_out.astype(np.float16),
                effective_lr.astype(np.float16),
            )
        else:
            return (
                param_out.astype(np.float32),
                mom_out.astype(np.float32),
                effective_lr.astype(np.float32),
            )

    if using_fp16:
        return (param_out.astype(np.float16), mom_out.astype(np.float16))
    else:
        return (param_out.astype(np.float32), mom_out.astype(np.float32))


def adagrad_sparse_test_helper(
    parent_test, inputs, lr, epsilon, engine, ref_adagrad, gc, dc, row_wise=False
):
    param, momentum, grad = inputs
    if row_wise:
        # For row-wise adagrad, only take the first element of each row
        momentum = momentum.reshape(momentum.shape[0], -1)[:, 0]
    # The momentum (accumulated squared gradients) must be non-negative.
    momentum = np.abs(momentum)
    lr = np.array([lr], dtype=np.float32)

    # Create an indexing array containing values that are lists of indices,
    # which index into grad
    if grad.size == 0:
        indices = np.empty(shape=(0,), dtype=np.int64)
    else:
        indices = np.random.choice(
            np.arange(grad.shape[0]),
            size=np.random.randint(grad.shape[0]),
            replace=False,
        )

    # Sparsify grad
    grad = grad[indices]

    op = core.CreateOperator(
        "RowWiseSparseAdagrad" if row_wise else "SparseAdagrad",
        ["param", "momentum", "indices", "grad", "lr"],
        ["param", "momentum"],
        epsilon=epsilon,
        engine=engine,
        device_option=gc,
    )

    def ref_sparse(param, momentum, indices, grad, lr, ref_using_fp16=False):
        param_out = np.copy(param)
        momentum_out = np.copy(momentum)
        # Need to do this because it's possible ref_adagrad's using_fp16 could
        # have been already specialized.
        ref_adagrad_temp = (
            partial(ref_adagrad, using_fp16=ref_using_fp16)
            if ref_using_fp16
            else ref_adagrad
        )
        # Apply the dense reference update only to the sparsely selected rows.
        for i, index in enumerate(indices):
            param_out[index], momentum_out[index] = ref_adagrad_temp(
                param[index],
                momentum[index],
                grad[i],
                lr,
                epsilon,
            )
        return (param_out, momentum_out)

    # On GPU (and not row-wise), also exercise the half-precision embedding path.
    ref_using_fp16_values = [False]
    if gc == hu.gpu_do and not row_wise:
        ref_using_fp16_values.append(True)

    for ref_using_fp16 in ref_using_fp16_values:
        if ref_using_fp16:
            print("test_sparse_adagrad with half precision embedding")
            momentum_i = momentum.astype(np.float16)
            param_i = param.astype(np.float16)
        else:
            print("test_sparse_adagrad with full precision embedding")
            momentum_i = momentum.astype(np.float32)
            param_i = param.astype(np.float32)

        parent_test.assertReferenceChecks(
            gc, op, [param_i, momentum_i, indices, grad, lr, ref_using_fp16], ref_sparse
        )
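
For reference, a hedged sketch of how a test case might invoke adagrad_sparse_test_helper; the test-class name, tensor shapes, and hyperparameters below are illustrative assumptions rather than part of this file.

import numpy as np

import caffe2.python.hypothesis_test_util as hu
from caffe2.python.operator_test.adagrad_test_helper import (
    adagrad_sparse_test_helper,
    ref_adagrad,
)


class TestSparseAdagrad(hu.HypothesisTestCase):  # hypothetical test class
    def test_sparse_adagrad(self):
        # Illustrative shapes; real tests draw these from hypothesis strategies.
        param = np.random.rand(8, 4).astype(np.float32)
        momentum = np.random.rand(8, 4).astype(np.float32)
        grad = np.random.rand(8, 4).astype(np.float32)
        adagrad_sparse_test_helper(
            self,
            (param, momentum, grad),
            lr=0.01,
            epsilon=1e-5,
            engine="",
            ref_adagrad=ref_adagrad,
            gc=hu.cpu_do,
            dc=hu.device_options,
        )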