mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
520 lines
17 KiB
Python
520 lines
17 KiB
Python
# Copyright 2019 by Michiel de Hoon.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
|
|
"""Substitution matrices."""
|
|
|
|
import os
|
|
import string
|
|
|
|
import numpy as np
|
|
|
|
from Bio.File import as_handle
|
|
|
|
|
|
class Array(np.ndarray):
|
|
"""numpy array subclass indexed by integers and by letters."""
|
|
|
|
def __new__(cls, alphabet=None, dims=None, data=None, dtype=float):
|
|
"""Create a new Array instance."""
|
|
if isinstance(data, dict):
|
|
if alphabet is not None:
|
|
raise ValueError("alphabet should be None if data is a dict")
|
|
if dims is not None:
|
|
raise ValueError("dims should be None if data is a dict")
|
|
alphabet = []
|
|
single_letters = True
|
|
for key in data:
|
|
if isinstance(key, str):
|
|
if dims is None:
|
|
dims = 1
|
|
elif dims != 1:
|
|
raise ValueError("inconsistent dimensions in data")
|
|
alphabet.append(key)
|
|
elif isinstance(key, tuple):
|
|
if dims is None:
|
|
dims = len(key)
|
|
elif dims != len(key):
|
|
raise ValueError("inconsistent dimensions in data")
|
|
if dims == 1:
|
|
if not isinstance(key, str):
|
|
raise ValueError("expected string")
|
|
if len(key) > 1:
|
|
single_letters = False
|
|
alphabet.append(key)
|
|
elif dims == 2:
|
|
for letter in key:
|
|
if not isinstance(letter, str):
|
|
raise ValueError("expected string")
|
|
if len(letter) > 1:
|
|
single_letters = False
|
|
alphabet.append(letter)
|
|
else:
|
|
raise ValueError(
|
|
"data array should be 1- or 2- dimensional "
|
|
"(found %d dimensions) in key" % dims
|
|
)
|
|
alphabet = sorted(set(alphabet))
|
|
if single_letters:
|
|
alphabet = "".join(alphabet)
|
|
else:
|
|
alphabet = tuple(alphabet)
|
|
n = len(alphabet)
|
|
if dims == 1:
|
|
shape = (n,)
|
|
elif dims == 2:
|
|
shape = (n, n)
|
|
else: # dims is None
|
|
raise ValueError("data is an empty dictionary")
|
|
obj = super().__new__(cls, shape, dtype)
|
|
if dims == 1:
|
|
for i, key in enumerate(alphabet):
|
|
obj[i] = data.get(letter, 0.0)
|
|
elif dims == 2:
|
|
for i1, letter1 in enumerate(alphabet):
|
|
for i2, letter2 in enumerate(alphabet):
|
|
key = (letter1, letter2)
|
|
value = data.get(key, 0.0)
|
|
obj[i1, i2] = value
|
|
obj._alphabet = alphabet
|
|
return obj
|
|
if alphabet is None:
|
|
alphabet = string.ascii_uppercase
|
|
elif not (isinstance(alphabet, (str, tuple))):
|
|
raise ValueError("alphabet should be a string or a tuple")
|
|
n = len(alphabet)
|
|
if data is None:
|
|
if dims is None:
|
|
dims = 1
|
|
elif dims not in (1, 2):
|
|
raise ValueError("dims should be 1 or 2 (found %s)" % dims)
|
|
shape = (n,) * dims
|
|
else:
|
|
if dims is None:
|
|
shape = data.shape
|
|
dims = len(shape)
|
|
if dims == 1:
|
|
pass
|
|
elif dims == 2:
|
|
if shape[0] != shape[1]:
|
|
raise ValueError("data array is not square")
|
|
else:
|
|
raise ValueError(
|
|
"data array should be 1- or 2- dimensional "
|
|
"(found %d dimensions) " % dims
|
|
)
|
|
else:
|
|
shape = (n,) * dims
|
|
if data.shape != shape:
|
|
raise ValueError(
|
|
"data shape has inconsistent shape (expected (%s), found (%s))"
|
|
% (shape, data.shape)
|
|
)
|
|
obj = super().__new__(cls, shape, dtype)
|
|
if data is None:
|
|
obj[:] = 0.0
|
|
else:
|
|
obj[:] = data
|
|
obj._alphabet = alphabet
|
|
return obj
|
|
|
|
def __array_finalize__(self, obj):
|
|
if obj is None:
|
|
return
|
|
self._alphabet = getattr(obj, "_alphabet", None)
|
|
|
|
def _convert_key(self, key):
|
|
if isinstance(key, tuple):
|
|
indices = []
|
|
for index in key:
|
|
if isinstance(index, str):
|
|
try:
|
|
index = self._alphabet.index(index)
|
|
except ValueError:
|
|
raise IndexError("'%s'" % index) from None
|
|
indices.append(index)
|
|
key = tuple(indices)
|
|
elif isinstance(key, str):
|
|
try:
|
|
key = self._alphabet.index(key)
|
|
except ValueError:
|
|
raise IndexError("'%s'" % key) from None
|
|
return key
|
|
|
|
def __getitem__(self, key):
|
|
key = self._convert_key(key)
|
|
value = np.ndarray.__getitem__(self, key)
|
|
if value.ndim == 2:
|
|
if self.ndim == 2:
|
|
if value.shape != self.shape:
|
|
raise IndexError("Requesting truncated array")
|
|
elif self.ndim == 1:
|
|
length = self.shape[0]
|
|
if value.shape[0] == length and value.shape[1] == 1:
|
|
pass
|
|
elif value.shape[0] == 1 and value.shape[1] == length:
|
|
pass
|
|
else:
|
|
raise IndexError("Requesting truncated array")
|
|
elif value.ndim == 1:
|
|
if value.shape[0] != self.shape[0]:
|
|
value._alphabet = self.alphabet[key]
|
|
elif value.ndim == 0:
|
|
return value.item()
|
|
return value.view(Array)
|
|
|
|
def __setitem__(self, key, value):
|
|
key = self._convert_key(key)
|
|
np.ndarray.__setitem__(self, key, value)
|
|
|
|
def __contains__(self, key):
|
|
# Follow dict definition of __contains__
|
|
return key in self.keys()
|
|
|
|
def __array_prepare__(self, out_arr, context=None):
|
|
# needed for numpy older than 1.13.0
|
|
ufunc, inputs, i = context
|
|
alphabet = self.alphabet
|
|
for arg in inputs:
|
|
if isinstance(arg, Array):
|
|
if arg.alphabet != alphabet:
|
|
raise ValueError("alphabets are inconsistent")
|
|
return np.ndarray.__array_prepare__(self, out_arr, context)
|
|
|
|
def __array_wrap__(self, out_arr, context=None):
|
|
if len(out_arr) == 1:
|
|
return out_arr[0]
|
|
return np.ndarray.__array_wrap__(self, out_arr, context)
|
|
|
|
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
|
|
args = []
|
|
alphabet = self._alphabet
|
|
for arg in inputs:
|
|
if isinstance(arg, Array):
|
|
if arg.alphabet != alphabet:
|
|
raise ValueError("alphabets are inconsistent")
|
|
args.append(arg.view(np.ndarray))
|
|
else:
|
|
args.append(arg)
|
|
|
|
outputs = kwargs.pop("out", None)
|
|
if outputs:
|
|
out_args = []
|
|
for arg in outputs:
|
|
if isinstance(arg, Array):
|
|
if arg.alphabet != alphabet:
|
|
raise ValueError("alphabets are inconsistent")
|
|
out_args.append(arg.view(np.ndarray))
|
|
else:
|
|
out_args.append(arg)
|
|
kwargs["out"] = tuple(out_args)
|
|
else:
|
|
outputs = (None,) * ufunc.nout
|
|
|
|
raw_results = super().__array_ufunc__(ufunc, method, *args, **kwargs)
|
|
if raw_results is NotImplemented:
|
|
return NotImplemented
|
|
|
|
if method == "at":
|
|
return
|
|
|
|
if ufunc.nout == 1:
|
|
raw_results = (raw_results,)
|
|
|
|
results = []
|
|
for raw_result, output in zip(raw_results, outputs):
|
|
if raw_result.ndim == 0:
|
|
result = raw_result
|
|
elif output is None:
|
|
result = np.asarray(raw_result).view(Array)
|
|
result._alphabet = self._alphabet
|
|
else:
|
|
result = output
|
|
result._alphabet = self._alphabet
|
|
results.append(result)
|
|
|
|
return results[0] if len(results) == 1 else results
|
|
|
|
def __reduce__(self):
|
|
import pickle
|
|
|
|
values = np.array(self)
|
|
state = pickle.dumps(values)
|
|
alphabet = self._alphabet
|
|
dims = len(self.shape)
|
|
dtype = self.dtype
|
|
arguments = (Array, alphabet, dims, None, dtype)
|
|
return (Array.__new__, arguments, state)
|
|
|
|
def __setstate__(self, state):
|
|
import pickle
|
|
|
|
self[:, :] = pickle.loads(state)
|
|
|
|
def transpose(self, axes=None):
|
|
"""Transpose the array."""
|
|
other = np.ndarray.transpose(self, axes)
|
|
other._alphabet = self._alphabet
|
|
return other
|
|
|
|
@property
|
|
def alphabet(self):
|
|
"""Return the alphabet property."""
|
|
return self._alphabet
|
|
|
|
def copy(self):
|
|
"""Create and return a copy of the array."""
|
|
other = Array(alphabet=self._alphabet, data=self)
|
|
return other
|
|
|
|
def get(self, key, value=None):
|
|
"""Return the value of the key if found; return value otherwise."""
|
|
try:
|
|
return self[key]
|
|
except IndexError:
|
|
return value
|
|
|
|
def items(self):
|
|
"""Return an iterator of (key, value) pairs in the array."""
|
|
dims = len(self.shape)
|
|
if dims == 1:
|
|
for index, key in enumerate(self._alphabet):
|
|
value = np.ndarray.__getitem__(self, index)
|
|
yield key, value
|
|
elif dims == 2:
|
|
for i1, c1 in enumerate(self._alphabet):
|
|
for i2, c2 in enumerate(self._alphabet):
|
|
key = (c1, c2)
|
|
value = np.ndarray.__getitem__(self, (i1, i2))
|
|
yield key, value
|
|
else:
|
|
raise RuntimeError("array has unexpected shape %s" % self.shape)
|
|
|
|
def keys(self):
|
|
"""Return a tuple with the keys associated with the array."""
|
|
dims = len(self.shape)
|
|
alphabet = self._alphabet
|
|
if dims == 1:
|
|
return tuple(alphabet)
|
|
elif dims == 2:
|
|
return tuple((c1, c2) for c2 in alphabet for c1 in alphabet)
|
|
else:
|
|
raise RuntimeError("array has unexpected shape %s" % self.shape)
|
|
|
|
def values(self):
|
|
"""Return a tuple with the values stored in the array."""
|
|
dims = len(self.shape)
|
|
alphabet = self._alphabet
|
|
if dims == 1:
|
|
return tuple(self)
|
|
elif dims == 2:
|
|
n1, n2 = self.shape
|
|
return tuple(
|
|
np.ndarray.__getitem__(self, (i1, i2))
|
|
for i2 in range(n2)
|
|
for i1 in range(n1)
|
|
)
|
|
else:
|
|
raise RuntimeError("array has unexpected shape %s" % self.shape)
|
|
|
|
def update(self, E=None, **F):
|
|
"""Update the array from dict/iterable E and F."""
|
|
if E is not None:
|
|
try:
|
|
alphabet = E.keys()
|
|
except AttributeError:
|
|
for key, value in E:
|
|
self[key] = value
|
|
else:
|
|
for key in E:
|
|
self[key] = E[key]
|
|
for key in F:
|
|
self[key] = F[key]
|
|
|
|
def select(self, alphabet):
|
|
"""Subset the array by selecting the letters from the specified alphabet."""
|
|
ii = []
|
|
jj = []
|
|
for i, key in enumerate(alphabet):
|
|
try:
|
|
j = self._alphabet.index(key)
|
|
except ValueError:
|
|
continue
|
|
ii.append(i)
|
|
jj.append(j)
|
|
dims = len(self.shape)
|
|
a = Array(alphabet, dims=dims)
|
|
ii = np.ix_(*[ii] * dims)
|
|
jj = np.ix_(*[jj] * dims)
|
|
a[ii] = np.ndarray.__getitem__(self, jj)
|
|
return a
|
|
|
|
def _format_1D(self, fmt):
|
|
_alphabet = self._alphabet
|
|
n = len(_alphabet)
|
|
words = [None] * n
|
|
lines = []
|
|
try:
|
|
header = self.header
|
|
except AttributeError:
|
|
pass
|
|
else:
|
|
for line in header:
|
|
line = "# %s\n" % line
|
|
lines.append(line)
|
|
maxwidth = 0
|
|
for i, key in enumerate(_alphabet):
|
|
value = self[key]
|
|
word = fmt % value
|
|
width = len(word)
|
|
if width > maxwidth:
|
|
maxwidth = width
|
|
words[i] = word
|
|
fmt2 = " %" + str(maxwidth) + "s"
|
|
for letter, word in zip(_alphabet, words):
|
|
word = fmt2 % word
|
|
line = letter + word + "\n"
|
|
lines.append(line)
|
|
text = "".join(lines)
|
|
return text
|
|
|
|
def _format_2D(self, fmt):
|
|
alphabet = self.alphabet
|
|
n = len(alphabet)
|
|
words = [[None] * n for _ in range(n)]
|
|
lines = []
|
|
try:
|
|
header = self.header
|
|
except AttributeError:
|
|
pass
|
|
else:
|
|
for line in header:
|
|
line = "# %s\n" % line
|
|
lines.append(line)
|
|
keywidth = max(len(c) for c in alphabet)
|
|
keyfmt = "%" + str(keywidth) + "s"
|
|
line = " " * keywidth
|
|
for j, c2 in enumerate(alphabet):
|
|
maxwidth = 0
|
|
for i, c1 in enumerate(alphabet):
|
|
key = (c1, c2)
|
|
value = self[key]
|
|
word = fmt % value
|
|
width = len(word)
|
|
if width > maxwidth:
|
|
maxwidth = width
|
|
words[i][j] = word
|
|
fmt2 = " %" + str(maxwidth) + "s"
|
|
word = fmt2 % c2
|
|
line += word
|
|
for i, c1 in enumerate(alphabet):
|
|
word = words[i][j]
|
|
words[i][j] = fmt2 % word
|
|
line = line.rstrip() + "\n"
|
|
lines.append(line)
|
|
for letter, row in zip(alphabet, words):
|
|
key = keyfmt % letter
|
|
line = key + "".join(row) + "\n"
|
|
lines.append(line)
|
|
text = "".join(lines)
|
|
return text
|
|
|
|
def __format__(self, fmt):
|
|
return self.format(fmt)
|
|
|
|
def format(self, fmt=""):
|
|
"""Return a string representation of the array.
|
|
|
|
The argument ``fmt`` specifies the number format to be used.
|
|
By default, the number format is "%i" if the array contains integer
|
|
numbers, and "%.1f" otherwise.
|
|
|
|
"""
|
|
if fmt == "":
|
|
if np.issubdtype(self.dtype, np.integer):
|
|
fmt = "%i"
|
|
else:
|
|
fmt = "%.1f"
|
|
n = len(self.shape)
|
|
if n == 1:
|
|
return self._format_1D(fmt)
|
|
elif n == 2:
|
|
return self._format_2D(fmt)
|
|
else:
|
|
raise RuntimeError("Array has unexpected rank %d" % n)
|
|
|
|
def __str__(self):
|
|
return self.format()
|
|
|
|
def __repr__(self):
|
|
text = np.ndarray.__repr__(self)
|
|
alphabet = self._alphabet
|
|
if isinstance(alphabet, str):
|
|
assert text.endswith(")")
|
|
text = text[:-1] + ",\n alphabet='%s')" % self._alphabet
|
|
return text
|
|
|
|
|
|
def read(handle, dtype=float):
|
|
"""Parse the file and return an Array object."""
|
|
with as_handle(handle) as fp:
|
|
lines = fp.readlines()
|
|
|
|
header = []
|
|
for i, line in enumerate(lines):
|
|
if not line.startswith("#"):
|
|
break
|
|
header.append(line[1:].strip())
|
|
rows = [line.split() for line in lines[i:]]
|
|
if len(rows[0]) == len(rows[1]) == 2:
|
|
alphabet = [key for key, value in rows]
|
|
for key in alphabet:
|
|
if len(key) > 1:
|
|
alphabet = tuple(alphabet)
|
|
break
|
|
else:
|
|
alphabet = "".join(alphabet)
|
|
matrix = Array(alphabet=alphabet, dims=1, dtype=dtype)
|
|
matrix.update(rows)
|
|
else:
|
|
alphabet = rows.pop(0)
|
|
for key in alphabet:
|
|
if len(key) > 1:
|
|
alphabet = tuple(alphabet)
|
|
break
|
|
else:
|
|
alphabet = "".join(alphabet)
|
|
matrix = Array(alphabet=alphabet, dims=2, dtype=dtype)
|
|
for letter1, row in zip(alphabet, rows):
|
|
assert letter1 == row.pop(0)
|
|
for letter2, word in zip(alphabet, row):
|
|
matrix[letter1, letter2] = float(word)
|
|
matrix.header = header
|
|
return matrix
|
|
|
|
|
|
def load(name=None):
|
|
"""Load and return a precalculated substitution matrix.
|
|
|
|
>>> from Bio.Align import substitution_matrices
|
|
>>> names = substitution_matrices.load()
|
|
"""
|
|
path = os.path.realpath(__file__)
|
|
directory = os.path.dirname(path)
|
|
subdirectory = os.path.join(directory, "data")
|
|
if name is None:
|
|
filenames = os.listdir(subdirectory)
|
|
try:
|
|
filenames.remove("README.txt")
|
|
# The README.txt file is not present in usual Biopython
|
|
# installations, but is included in a development install.
|
|
except ValueError:
|
|
pass
|
|
return sorted(filenames)
|
|
path = os.path.join(subdirectory, name)
|
|
matrix = read(path)
|
|
return matrix
|