Files
biopython/Bio/Align/substitution_matrices/__init__.py
ruff-isort de0bb21fb3 Apply isort (forcing single lines, not sorting by type) via ruff
$ ruff check --fix --select=I \
  --config=lint.isort.force-single-line=true \
  --config=lint.isort.order-by-type=false \
  BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py

Using ruff version 0.4.10
2024-06-26 15:31:39 +09:00

520 lines
17 KiB
Python

# Copyright 2019 by Michiel de Hoon.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Substitution matrices."""
import os
import string
import numpy as np
from Bio.File import as_handle
class Array(np.ndarray):
"""numpy array subclass indexed by integers and by letters."""
def __new__(cls, alphabet=None, dims=None, data=None, dtype=float):
"""Create a new Array instance."""
if isinstance(data, dict):
if alphabet is not None:
raise ValueError("alphabet should be None if data is a dict")
if dims is not None:
raise ValueError("dims should be None if data is a dict")
alphabet = []
single_letters = True
for key in data:
if isinstance(key, str):
if dims is None:
dims = 1
elif dims != 1:
raise ValueError("inconsistent dimensions in data")
alphabet.append(key)
elif isinstance(key, tuple):
if dims is None:
dims = len(key)
elif dims != len(key):
raise ValueError("inconsistent dimensions in data")
if dims == 1:
if not isinstance(key, str):
raise ValueError("expected string")
if len(key) > 1:
single_letters = False
alphabet.append(key)
elif dims == 2:
for letter in key:
if not isinstance(letter, str):
raise ValueError("expected string")
if len(letter) > 1:
single_letters = False
alphabet.append(letter)
else:
raise ValueError(
"data array should be 1- or 2- dimensional "
"(found %d dimensions) in key" % dims
)
alphabet = sorted(set(alphabet))
if single_letters:
alphabet = "".join(alphabet)
else:
alphabet = tuple(alphabet)
n = len(alphabet)
if dims == 1:
shape = (n,)
elif dims == 2:
shape = (n, n)
else: # dims is None
raise ValueError("data is an empty dictionary")
obj = super().__new__(cls, shape, dtype)
if dims == 1:
for i, key in enumerate(alphabet):
obj[i] = data.get(letter, 0.0)
elif dims == 2:
for i1, letter1 in enumerate(alphabet):
for i2, letter2 in enumerate(alphabet):
key = (letter1, letter2)
value = data.get(key, 0.0)
obj[i1, i2] = value
obj._alphabet = alphabet
return obj
if alphabet is None:
alphabet = string.ascii_uppercase
elif not (isinstance(alphabet, (str, tuple))):
raise ValueError("alphabet should be a string or a tuple")
n = len(alphabet)
if data is None:
if dims is None:
dims = 1
elif dims not in (1, 2):
raise ValueError("dims should be 1 or 2 (found %s)" % dims)
shape = (n,) * dims
else:
if dims is None:
shape = data.shape
dims = len(shape)
if dims == 1:
pass
elif dims == 2:
if shape[0] != shape[1]:
raise ValueError("data array is not square")
else:
raise ValueError(
"data array should be 1- or 2- dimensional "
"(found %d dimensions) " % dims
)
else:
shape = (n,) * dims
if data.shape != shape:
raise ValueError(
"data shape has inconsistent shape (expected (%s), found (%s))"
% (shape, data.shape)
)
obj = super().__new__(cls, shape, dtype)
if data is None:
obj[:] = 0.0
else:
obj[:] = data
obj._alphabet = alphabet
return obj
def __array_finalize__(self, obj):
if obj is None:
return
self._alphabet = getattr(obj, "_alphabet", None)
def _convert_key(self, key):
if isinstance(key, tuple):
indices = []
for index in key:
if isinstance(index, str):
try:
index = self._alphabet.index(index)
except ValueError:
raise IndexError("'%s'" % index) from None
indices.append(index)
key = tuple(indices)
elif isinstance(key, str):
try:
key = self._alphabet.index(key)
except ValueError:
raise IndexError("'%s'" % key) from None
return key
def __getitem__(self, key):
key = self._convert_key(key)
value = np.ndarray.__getitem__(self, key)
if value.ndim == 2:
if self.ndim == 2:
if value.shape != self.shape:
raise IndexError("Requesting truncated array")
elif self.ndim == 1:
length = self.shape[0]
if value.shape[0] == length and value.shape[1] == 1:
pass
elif value.shape[0] == 1 and value.shape[1] == length:
pass
else:
raise IndexError("Requesting truncated array")
elif value.ndim == 1:
if value.shape[0] != self.shape[0]:
value._alphabet = self.alphabet[key]
elif value.ndim == 0:
return value.item()
return value.view(Array)
def __setitem__(self, key, value):
key = self._convert_key(key)
np.ndarray.__setitem__(self, key, value)
def __contains__(self, key):
# Follow dict definition of __contains__
return key in self.keys()
def __array_prepare__(self, out_arr, context=None):
# needed for numpy older than 1.13.0
ufunc, inputs, i = context
alphabet = self.alphabet
for arg in inputs:
if isinstance(arg, Array):
if arg.alphabet != alphabet:
raise ValueError("alphabets are inconsistent")
return np.ndarray.__array_prepare__(self, out_arr, context)
def __array_wrap__(self, out_arr, context=None):
if len(out_arr) == 1:
return out_arr[0]
return np.ndarray.__array_wrap__(self, out_arr, context)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
args = []
alphabet = self._alphabet
for arg in inputs:
if isinstance(arg, Array):
if arg.alphabet != alphabet:
raise ValueError("alphabets are inconsistent")
args.append(arg.view(np.ndarray))
else:
args.append(arg)
outputs = kwargs.pop("out", None)
if outputs:
out_args = []
for arg in outputs:
if isinstance(arg, Array):
if arg.alphabet != alphabet:
raise ValueError("alphabets are inconsistent")
out_args.append(arg.view(np.ndarray))
else:
out_args.append(arg)
kwargs["out"] = tuple(out_args)
else:
outputs = (None,) * ufunc.nout
raw_results = super().__array_ufunc__(ufunc, method, *args, **kwargs)
if raw_results is NotImplemented:
return NotImplemented
if method == "at":
return
if ufunc.nout == 1:
raw_results = (raw_results,)
results = []
for raw_result, output in zip(raw_results, outputs):
if raw_result.ndim == 0:
result = raw_result
elif output is None:
result = np.asarray(raw_result).view(Array)
result._alphabet = self._alphabet
else:
result = output
result._alphabet = self._alphabet
results.append(result)
return results[0] if len(results) == 1 else results
def __reduce__(self):
import pickle
values = np.array(self)
state = pickle.dumps(values)
alphabet = self._alphabet
dims = len(self.shape)
dtype = self.dtype
arguments = (Array, alphabet, dims, None, dtype)
return (Array.__new__, arguments, state)
def __setstate__(self, state):
import pickle
self[:, :] = pickle.loads(state)
def transpose(self, axes=None):
"""Transpose the array."""
other = np.ndarray.transpose(self, axes)
other._alphabet = self._alphabet
return other
@property
def alphabet(self):
"""Return the alphabet property."""
return self._alphabet
def copy(self):
"""Create and return a copy of the array."""
other = Array(alphabet=self._alphabet, data=self)
return other
def get(self, key, value=None):
"""Return the value of the key if found; return value otherwise."""
try:
return self[key]
except IndexError:
return value
def items(self):
"""Return an iterator of (key, value) pairs in the array."""
dims = len(self.shape)
if dims == 1:
for index, key in enumerate(self._alphabet):
value = np.ndarray.__getitem__(self, index)
yield key, value
elif dims == 2:
for i1, c1 in enumerate(self._alphabet):
for i2, c2 in enumerate(self._alphabet):
key = (c1, c2)
value = np.ndarray.__getitem__(self, (i1, i2))
yield key, value
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def keys(self):
"""Return a tuple with the keys associated with the array."""
dims = len(self.shape)
alphabet = self._alphabet
if dims == 1:
return tuple(alphabet)
elif dims == 2:
return tuple((c1, c2) for c2 in alphabet for c1 in alphabet)
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def values(self):
"""Return a tuple with the values stored in the array."""
dims = len(self.shape)
alphabet = self._alphabet
if dims == 1:
return tuple(self)
elif dims == 2:
n1, n2 = self.shape
return tuple(
np.ndarray.__getitem__(self, (i1, i2))
for i2 in range(n2)
for i1 in range(n1)
)
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def update(self, E=None, **F):
"""Update the array from dict/iterable E and F."""
if E is not None:
try:
alphabet = E.keys()
except AttributeError:
for key, value in E:
self[key] = value
else:
for key in E:
self[key] = E[key]
for key in F:
self[key] = F[key]
def select(self, alphabet):
"""Subset the array by selecting the letters from the specified alphabet."""
ii = []
jj = []
for i, key in enumerate(alphabet):
try:
j = self._alphabet.index(key)
except ValueError:
continue
ii.append(i)
jj.append(j)
dims = len(self.shape)
a = Array(alphabet, dims=dims)
ii = np.ix_(*[ii] * dims)
jj = np.ix_(*[jj] * dims)
a[ii] = np.ndarray.__getitem__(self, jj)
return a
def _format_1D(self, fmt):
_alphabet = self._alphabet
n = len(_alphabet)
words = [None] * n
lines = []
try:
header = self.header
except AttributeError:
pass
else:
for line in header:
line = "# %s\n" % line
lines.append(line)
maxwidth = 0
for i, key in enumerate(_alphabet):
value = self[key]
word = fmt % value
width = len(word)
if width > maxwidth:
maxwidth = width
words[i] = word
fmt2 = " %" + str(maxwidth) + "s"
for letter, word in zip(_alphabet, words):
word = fmt2 % word
line = letter + word + "\n"
lines.append(line)
text = "".join(lines)
return text
def _format_2D(self, fmt):
alphabet = self.alphabet
n = len(alphabet)
words = [[None] * n for _ in range(n)]
lines = []
try:
header = self.header
except AttributeError:
pass
else:
for line in header:
line = "# %s\n" % line
lines.append(line)
keywidth = max(len(c) for c in alphabet)
keyfmt = "%" + str(keywidth) + "s"
line = " " * keywidth
for j, c2 in enumerate(alphabet):
maxwidth = 0
for i, c1 in enumerate(alphabet):
key = (c1, c2)
value = self[key]
word = fmt % value
width = len(word)
if width > maxwidth:
maxwidth = width
words[i][j] = word
fmt2 = " %" + str(maxwidth) + "s"
word = fmt2 % c2
line += word
for i, c1 in enumerate(alphabet):
word = words[i][j]
words[i][j] = fmt2 % word
line = line.rstrip() + "\n"
lines.append(line)
for letter, row in zip(alphabet, words):
key = keyfmt % letter
line = key + "".join(row) + "\n"
lines.append(line)
text = "".join(lines)
return text
def __format__(self, fmt):
return self.format(fmt)
def format(self, fmt=""):
"""Return a string representation of the array.
The argument ``fmt`` specifies the number format to be used.
By default, the number format is "%i" if the array contains integer
numbers, and "%.1f" otherwise.
"""
if fmt == "":
if np.issubdtype(self.dtype, np.integer):
fmt = "%i"
else:
fmt = "%.1f"
n = len(self.shape)
if n == 1:
return self._format_1D(fmt)
elif n == 2:
return self._format_2D(fmt)
else:
raise RuntimeError("Array has unexpected rank %d" % n)
def __str__(self):
return self.format()
def __repr__(self):
text = np.ndarray.__repr__(self)
alphabet = self._alphabet
if isinstance(alphabet, str):
assert text.endswith(")")
text = text[:-1] + ",\n alphabet='%s')" % self._alphabet
return text
def read(handle, dtype=float):
"""Parse the file and return an Array object."""
with as_handle(handle) as fp:
lines = fp.readlines()
header = []
for i, line in enumerate(lines):
if not line.startswith("#"):
break
header.append(line[1:].strip())
rows = [line.split() for line in lines[i:]]
if len(rows[0]) == len(rows[1]) == 2:
alphabet = [key for key, value in rows]
for key in alphabet:
if len(key) > 1:
alphabet = tuple(alphabet)
break
else:
alphabet = "".join(alphabet)
matrix = Array(alphabet=alphabet, dims=1, dtype=dtype)
matrix.update(rows)
else:
alphabet = rows.pop(0)
for key in alphabet:
if len(key) > 1:
alphabet = tuple(alphabet)
break
else:
alphabet = "".join(alphabet)
matrix = Array(alphabet=alphabet, dims=2, dtype=dtype)
for letter1, row in zip(alphabet, rows):
assert letter1 == row.pop(0)
for letter2, word in zip(alphabet, row):
matrix[letter1, letter2] = float(word)
matrix.header = header
return matrix
def load(name=None):
"""Load and return a precalculated substitution matrix.
>>> from Bio.Align import substitution_matrices
>>> names = substitution_matrices.load()
"""
path = os.path.realpath(__file__)
directory = os.path.dirname(path)
subdirectory = os.path.join(directory, "data")
if name is None:
filenames = os.listdir(subdirectory)
try:
filenames.remove("README.txt")
# The README.txt file is not present in usual Biopython
# installations, but is included in a development install.
except ValueError:
pass
return sorted(filenames)
path = os.path.join(subdirectory, name)
matrix = read(path)
return matrix