## @package tt_core
# Module caffe2.python.tt_core

import numpy as np

"""
The following methods are various utility methods for using the Tensor-Train
decomposition, or TT-decomposition, introduced by I. V. Oseledets (2011) in his
paper (http://epubs.siam.org/doi/abs/10.1137/090752286).

Broadly speaking, these methods are used to replace fully connected layers in
neural networks with Tensor-Train layers introduced by A. Novikov et al. (2015)
in their paper (http://arxiv.org/abs/1509.06569). More details about each of
the methods are provided in each respective docstring.
"""


def init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234):
    """
    Initialize randomized orthogonalized TT-cores.

    This method should be used when a TT-layer is trained from scratch. The
    sizes of each of the cores are specified by inp_sizes and out_sizes, and
    the respective tt_ranks dictate the ranks of each of the cores. Note that
    larger tt_ranks result in slower computation but more accurate
    approximations. The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs is
    expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    Args:
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores
        seed: integer to seed the random number generator

    Returns:
        cores: one-dimensional numpy array of the cores concatenated along
            an axis
    """
    np.random.seed(seed)

    # Assert that the lengths of the inputs are consistent
    assert len(inp_sizes) == len(out_sizes), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert len(tt_ranks) == len(inp_sizes) + 1, \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Allocate a flat buffer large enough to hold every core back to back
    cores_len = np.sum(
        inp_sizes * out_sizes * tt_ranks[1:] * tt_ranks[:-1])
    cores = np.zeros(cores_len)
    cores_idx = 0
    rv = 1

    # Compute the full list of cores by computing each individual one
    for i in range(inp_sizes.shape[0]):
        shape = [tt_ranks[i],
                 inp_sizes[i],
                 out_sizes[i],
                 tt_ranks[i + 1]]

        # Precompute the "tall" 2D shape of each core
        tall_shape = (np.prod(shape[:3]), shape[3])

        # Randomly initialize the current core using a normal distribution,
        # propagating the R factor (rv) from the previous core's QR step
        curr_core = np.dot(rv, np.random.normal(
            0, 1, size=(shape[0], np.prod(shape[1:]))))
        curr_core = curr_core.reshape(tall_shape)

        # Orthogonalize the initialized current core and append to cores list
        if i < inp_sizes.shape[0] - 1:
            curr_core, rv = np.linalg.qr(curr_core)
        cores[cores_idx:cores_idx +
              curr_core.size] = curr_core.flatten()
        cores_idx += curr_core.size

    # Normalize the list of arrays using this Glorot-style trick
    glarot_style = (np.prod(inp_sizes) *
                    np.prod(tt_ranks))**(1.0 / inp_sizes.shape[0])

    return (0.1 / glarot_style) * np.array(cores).astype(np.float32)


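# A minimal usage sketch for init_tt_cores (illustrative, not part of the
# original module). The factorization of a 784 x 1024 weight matrix and the
# ranks [1, 3, 3, 3, 1] below are assumed example values, not prescribed ones;
# the boundary ranks are conventionally 1 for a TT matrix representation.
#
#     inp_sizes = [4, 7, 4, 7]      # np.prod(inp_sizes) == 784
#     out_sizes = [4, 8, 4, 8]      # np.prod(out_sizes) == 1024
#     tt_ranks = [1, 3, 3, 3, 1]
#     cores = init_tt_cores(inp_sizes, out_sizes, tt_ranks)
#     # cores is a flat float32 array holding all four cores back to back:
#     # cores.size == sum(r_i * m_i * n_i * r_{i+1}) == 48 + 504 + 144 + 168
#     #            == 864

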
def matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks):
    """
    Convert a matrix into the TT-format.

    This method will consume a 2D weight matrix such as those used in fully
    connected layers in a neural network and will compute the TT-decomposition
    of the weight matrix and return the TT-cores of the resulting computation.
    This method should be used when converting a trained fully connected layer
    into a TT-layer for increased speed and decreased parameter size. The size
    of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs is
    expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    We also require that np.prod(inp_sizes) == W.shape[0] and that
    np.prod(out_sizes) == W.shape[1].

    Args:
        W: two-dimensional weight matrix numpy array representing a fully
            connected layer to be converted to TT-format; note that the weight
            matrix is transposed before being decomposed because we want to
            emulate the X * W^T operation that the FC layer performs.
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        new_cores: one-dimensional numpy array of the cores concatenated
            along an axis
    """
    # Assert that the lengths and shapes of the inputs are consistent
    assert len(inp_sizes) == len(out_sizes), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert len(tt_ranks) == len(inp_sizes) + 1, \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert W.shape[0] == np.prod(inp_sizes), \
        "The product of the input sizes (" + str(np.prod(inp_sizes)) + \
        ") must be equal to the first dimension of W (" + \
        str(W.shape[0]) + ")."

    assert W.shape[1] == np.prod(out_sizes), \
        "The product of the output sizes (" + str(np.prod(out_sizes)) + \
        ") must be equal to the second dimension of W (" + \
        str(W.shape[1]) + ")."

    # W is transposed so that the multiplication X * W^T can be computed, just
    # as it is in the FC layer.
    W = W.transpose()

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Copy the original weight matrix in order to permute and reshape it. In
    # addition, inp_sizes and out_sizes are combined into a single sizes array
    # to use the tt_svd helper method, which only consumes a single sizes
    # array.
    W_copy = W.copy()
    total_inp_size = inp_sizes.size
    W_copy = np.reshape(W_copy, np.concatenate((inp_sizes, out_sizes)))
    order = np.repeat(np.arange(0, total_inp_size), 2) + \
        np.tile([0, total_inp_size], total_inp_size)
    W_copy = np.transpose(W_copy, axes=order)
    W_copy = np.reshape(W_copy, inp_sizes * out_sizes)

    # Use the helper method to convert the W matrix copy into the preliminary
    # cores array.
    cores = tt_svd(W_copy, inp_sizes * out_sizes, tt_ranks)

    # Permute the dimensions of each of the cores to be compatible with the
    # TT-layer.
    new_cores = np.zeros(cores.shape).astype(np.float32)
    idx = 0
    for i in range(len(inp_sizes)):
        shape = (tt_ranks[i], inp_sizes[i], out_sizes[i], tt_ranks[i + 1])
        current_core = cores[idx:idx + np.prod(shape)].reshape(shape)
        current_core = current_core.transpose((1, 3, 0, 2))
        new_cores[new_cores.shape[0] - idx - np.prod(shape):
                  new_cores.shape[0] - idx] \
            = current_core.flatten()
        idx += np.prod(shape)

    return new_cores


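# A minimal usage sketch for matrix_to_tt (illustrative, not part of the
# original module), converting a trained 784 x 1024 FC weight matrix into
# TT-format with the same assumed sizes and ranks as in the init_tt_cores
# example above:
#
#     W = np.random.randn(784, 1024).astype(np.float32)
#     inp_sizes = [4, 7, 4, 7]      # np.prod(inp_sizes) == W.shape[0]
#     out_sizes = [4, 8, 4, 8]      # np.prod(out_sizes) == W.shape[1]
#     tt_ranks = [1, 3, 3, 3, 1]
#     new_cores = matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks)
#     # new_cores encodes (an approximation of) W^T and has the same flat
#     # layout as the output of init_tt_cores, so it can be used in place of
#     # a freshly initialized TT-layer weight blob.

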
def tt_svd(W, sizes, tt_ranks):
    """
    Helper method for the matrix_to_tt() method performing the TT-SVD
    decomposition.

    Uses the TT-decomposition algorithm to convert a matrix to TT-format using
    multiple reduced SVD operations.

    Args:
        W: the weight matrix of a fully connected layer to be converted to
            TT-format, after the permutation and reshaping performed by the
            matrix_to_tt() method.
        sizes: list of the dimensions of each of the cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        cores: one-dimensional numpy array of the cores concatenated along
            an axis
    """
    assert len(tt_ranks) == len(sizes) + 1, \
        "The number of tt-ranks must be one more than the number of sizes."

    C = W.copy()
    total_size = sizes.size
    core = np.zeros(np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:]),
                    dtype='float32')

    # Compute iterative reduced SVD operations and store each resulting U
    # matrix as an individual core.
    pos = 0
    for i in range(0, total_size - 1):
        shape = tt_ranks[i] * sizes[i]
        C = np.reshape(C, [shape, -1])
        U, S, V = np.linalg.svd(C, full_matrices=False)
        U = U[:, 0:tt_ranks[i + 1]]
        S = S[0:tt_ranks[i + 1]]
        V = V[0:tt_ranks[i + 1], :]

        core[pos:pos + tt_ranks[i] * sizes[i] * tt_ranks[i + 1]] = U.ravel()
        pos += tt_ranks[i] * sizes[i] * tt_ranks[i + 1]

        # Fold the truncated singular values into the remainder, which the
        # next iteration (or the final core) consumes.
        C = np.dot(np.diag(S), V)

    # The remainder after the last SVD becomes the final core.
    core[pos:pos + tt_ranks[total_size - 1] *
         sizes[total_size - 1] * tt_ranks[total_size]] = C.ravel()
    return core


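# A minimal usage sketch for tt_svd (illustrative, not part of the original
# module). tt_svd expects the already permuted and reshaped array produced
# inside matrix_to_tt, with sizes == inp_sizes * out_sizes elementwise, so
# calling it directly only makes sense on such a preprocessed array. The
# sizes and ranks below are assumed example values:
#
#     sizes = np.array([6, 6, 6])
#     tt_ranks = np.array([1, 3, 3, 1])
#     W = np.random.randn(216).astype(np.float32)  # np.prod(sizes) == 216
#     cores = tt_svd(W, sizes, tt_ranks)
#     # cores.size == 1*6*3 + 3*6*3 + 3*6*1 == 90

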
# TODO(Surya) Write a method to convert an entire network where all fully
# connected layers are replaced by a TT-layer.
def fc_net_to_tt_net(net):
    pass