remove deprecated functionality from Bio.Align.AlignInfo (#4907)

Authored-by: Michiel de Hoon <mdehoon@tkx288.genome.gsc.riken.jp>
This commit is contained in:
mdehoon
2025-01-16 03:38:28 +09:00
committed by GitHub
parent e297041e3a
commit 4ea2126ec0
6 changed files with 42 additions and 2240 deletions

View File

@ -11,14 +11,6 @@ functions which return summary type information about alignments should
be put into classes in this module.
"""
import math
import sys
import warnings
from collections import Counter
from Bio import BiopythonDeprecationWarning
from Bio.Seq import Seq
class SummaryInfo:
"""Calculate summary info about the alignment.
@ -36,749 +28,7 @@ class SummaryInfo:
self.alignment = alignment
self.ic_vector = []
def dumb_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False):
    """Output a fast consensus sequence of the alignment (deprecated).

    For every column, the residues of all records are counted, skipping
    the gap characters "-" and ".". The most common residue is added to
    the consensus if it is the only residue with the highest count and its
    fraction of the counted residues is at least the threshold; otherwise
    the ambiguous character is added.

    This does not take a substitution matrix into account; it is meant as
    a quick and dirty consensus only.

    Arguments:
     - threshold - The minimum fraction (compared with ``>=``) of counted
       residues that the most common residue must reach to be used.
     - ambiguous - The ambiguous character to be added when the threshold
       is not reached, when several residues tie for the highest count, or
       when a column contains no counted residues.
     - require_multiple - If set as True, a column in which only a single
       residue was counted (ie. just 1 sequence and gaps) gets the
       ambiguous character.

    Returns a Seq object. Emits a BiopythonDeprecationWarning when called.
    """
    warnings.warn(
        "The `dumb_consensus` method is deprecated and will be removed "
        "in a future release of Biopython. As an alternative, you can "
        "convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. You can then use the `.consensus` or "
        "`.degenerate_consensus` property of the Motif object to get a "
        "consensus sequence. For more control over how the consensus "
        "sequence is calculated, you can call the `calculate_consensus` "
        "method on the `.counts` property of the Motif object. This is an "
        "example for a multiple sequence alignment `msa` of DNA "
        "nucleotides:"
        "\n"
        ">>> from Bio.Seq import Seq\n"
        ">>> from Bio.SeqRecord import SeqRecord\n"
        ">>> from Bio.Align import MultipleSeqAlignment\n"
        ">>> from Bio.Align.AlignInfo import SummaryInfo\n"
        ">>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),\n"
        "... SeqRecord(Seq('ATGT')),\n"
        "... SeqRecord(Seq('ATGT'))])\n"
        ">>> summary = SummaryInfo(msa)\n"
        ">>> dumb_consensus = summary.dumb_consensus(ambiguous='N')\n"
        ">>> print(dumb_consensus)\n"
        "ANGT\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> print(motif.consensus)\n"
        "ATGT\n"
        ">>> print(motif.degenerate_consensus)\n"
        "AYGT\n"
        ">>> counts = motif.counts\n"
        ">>> consensus = counts.calculate_consensus(identity=0.7)\n"
        ">>> print(consensus)\n"
        "ANGT\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
    # Collect one character per column and join once at the end.
    consensus = []
    con_len = self.alignment.get_alignment_length()
    for n in range(con_len):
        # counts of the different residues seen in this column
        atom_dict = Counter()
        for record in self.alignment:
            # records may be shorter than the alignment length; skip those
            try:
                c = record[n]
            except IndexError:
                continue
            if c != "-" and c != ".":
                atom_dict[c] += 1
        num_atoms = sum(atom_dict.values())
        max_size = max(atom_dict.values(), default=0)
        max_atoms = [atom for atom, count in atom_dict.items() if count == max_size]

        if require_multiple and num_atoms == 1:
            consensus.append(ambiguous)
        elif len(max_atoms) == 1 and max_size / num_atoms >= threshold:
            # len(max_atoms) == 1 implies num_atoms > 0, so no division by zero
            consensus.append(max_atoms[0])
        else:
            consensus.append(ambiguous)
    return Seq("".join(consensus))
def gap_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False):
    """Output a fast consensus sequence of the alignment, allowing gaps (deprecated).

    Same counting scheme as dumb_consensus(), except that every character
    of a column is counted, so a gap character can become the consensus
    character of a column.

    Arguments:
     - threshold - The minimum fraction (compared with ``>=``) of counted
       characters that the most common character must reach to be used.
     - ambiguous - The ambiguous character to be added when the threshold
       is not reached or when several characters tie for the highest count.
     - require_multiple - If set as True, a column in which only a single
       character was counted gets the ambiguous character.

    Returns a Seq object. Emits a BiopythonDeprecationWarning when called.

    Things to do:
     - Let the user define that with only one gap, the result
       character in consensus is gap.
     - Let the user select gap character, now
       it takes the same as input.
    """
    warnings.warn(
        "The `gap_consensus` method is deprecated and will be removed "
        "in a future release of Biopython. As an alternative, you can "
        "convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. You can then use the `.consensus` or "
        "`.degenerate_consensus` property of the Motif object to get a "
        "consensus sequence. For more control over how the consensus "
        "sequence is calculated, you can call the `calculate_consensus` "
        "method on the `.counts` property of the Motif object. This is an "
        "example for a multiple sequence alignment `msa` of DNA "
        "nucleotides:"
        "\n"
        ">>> from Bio.Seq import Seq\n"
        ">>> from Bio.SeqRecord import SeqRecord\n"
        ">>> from Bio.Align import MultipleSeqAlignment\n"
        ">>> from Bio.Align.AlignInfo import SummaryInfo\n"
        ">>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),\n"
        "... SeqRecord(Seq('AT-T')),\n"
        "... SeqRecord(Seq('CT-T')),\n"
        "... SeqRecord(Seq('GT-T'))])\n"
        ">>> summary = SummaryInfo(msa)\n"
        ">>> gap_consensus = summary.gap_consensus(ambiguous='N')\n"
        ">>> print(gap_consensus)\n"
        "NT-T\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT-', alignment) # include '-' in alphabet\n"
        ">>> print(motif.consensus)\n"
        "AT-T\n"
        ">>> print(motif.degenerate_consensus)\n"
        "VT-T\n"
        ">>> counts = motif.counts\n"
        ">>> consensus = counts.calculate_consensus(identity=0.7)\n"
        ">>> print(consensus)\n"
        "NT-T\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    # Collect one character per column and join once at the end.
    consensus = []
    con_len = self.alignment.get_alignment_length()
    for n in range(con_len):
        # counts of the different characters (gaps included) in this column
        atom_dict = Counter()
        for record in self.alignment:
            # records may be shorter than the alignment length; skip those
            try:
                c = record[n]
            except IndexError:
                continue
            atom_dict[c] += 1
        num_atoms = sum(atom_dict.values())
        max_size = max(atom_dict.values(), default=0)
        max_atoms = [atom for atom, count in atom_dict.items() if count == max_size]

        if require_multiple and num_atoms == 1:
            consensus.append(ambiguous)
        elif len(max_atoms) == 1 and max_size / num_atoms >= threshold:
            # len(max_atoms) == 1 implies num_atoms > 0, so no division by zero
            consensus.append(max_atoms[0])
        else:
            consensus.append(ambiguous)
    return Seq("".join(consensus))
def replacement_dictionary(self, skip_chars=None, letters=None):
    """Generate a replacement dictionary to plug into a substitution matrix (deprecated).

    This looks at every pair of records in the alignment and accumulates,
    for each aligned pair of residues, the product of the weights of the
    two records. The result is a dictionary such as::

        {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}

    Each key is ordered as (residue in the earlier record, residue in the
    later record). Record weights are read from the "weight" entry of the
    record annotations and default to 1.0. Given the following weighted
    multiple sequence alignment::

        GTATC  0.5
        AT--C  0.8
        CTGTC  1.0

    the first column contributes::

        ('G', 'A') : 0.5 * 0.8 = 0.4
        ('G', 'C') : 0.5 * 1.0 = 0.5
        ('A', 'C') : 0.8 * 1.0 = 0.8

    This is repeated for all columns, summing the contributions for each
    key.

    Arguments:
     - skip_chars - Not used; setting it to anything other than None
       will raise a ValueError
     - letters - An iterable (e.g. a string or list) of characters to
       include. Pairs in which either residue is not one of these letters
       are skipped. If None, all characters that occur in the alignment
       (gap characters included) are used.

    Emits a BiopythonDeprecationWarning when called.
    """
    warnings.warn(
        "The `replacement_dictionary` method is deprecated and will be "
        "removed in a future release of Biopython. As an alternative, you "
        "can convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "use the `.substitutions` property of the `Alignment` object. "
        "For example, for a multiple sequence alignment `msa` of DNA "
        "nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> dictionary = alignment.substitutions\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    if skip_chars is not None:
        raise ValueError(
            "argument skip_chars has been deprecated; instead, please use 'letters' to specify the characters you want to include"
        )
    if letters is None:
        # The default used to fail with "NoneType is not iterable";
        # fall back to every character present in the alignment.
        letters = self._get_all_letters()
    rep_dict = {(letter1, letter2): 0 for letter1 in letters for letter2 in letters}
    num_records = len(self.alignment)
    for rec_num1 in range(num_records):
        # fetch the first record of the pair once, not once per partner
        record1 = self.alignment[rec_num1]
        weight1 = record1.annotations.get("weight", 1.0)
        # pair it with every record that comes after it
        for rec_num2 in range(rec_num1 + 1, num_records):
            record2 = self.alignment[rec_num2]
            self._pair_replacement(
                record1.seq,
                record2.seq,
                weight1,
                record2.annotations.get("weight", 1.0),
                rep_dict,
                letters,
            )
    return rep_dict
def _pair_replacement(self, seq1, seq2, weight1, weight2, dictionary, letters):
"""Compare two sequences and generate info on the replacements seen (PRIVATE).
Arguments:
- seq1, seq2 - The two sequences to compare.
- weight1, weight2 - The relative weights of seq1 and seq2.
- dictionary - The dictionary containing the starting replacement
info that we will modify.
- letters - A list of characters to include when calculating replacements.
"""
# loop through each residue in the sequences
for residue1, residue2 in zip(seq1, seq2):
if residue1 in letters and residue2 in letters:
dictionary[(residue1, residue2)] += weight1 * weight2
def _get_all_letters(self):
"""Return a string containing the expected letters in the alignment (PRIVATE)."""
set_letters = set()
for record in self.alignment:
set_letters.update(record.seq)
list_letters = sorted(set_letters)
all_letters = "".join(list_letters)
return all_letters
def pos_specific_score_matrix(self, axis_seq=None, chars_to_ignore=None):
    """Create a position specific score matrix object for the alignment (deprecated).

    This creates a position specific score matrix (pssm) which is an
    alternative method to look at a consensus sequence. For every column
    the weights of the records (the "weight" annotation, default 1.0) are
    summed per residue.

    Arguments:
     - chars_to_ignore - A list of all characters not to include in
       the pssm. The gap character "-" is always ignored in addition.
       The list passed in is not modified.
     - axis_seq - An optional argument specifying the sequence to
       put on the axis of the PSSM. This should be a Seq object. If nothing
       is specified, the consensus sequence, calculated with default
       parameters, will be used.

    Returns:
     - A PSSM (position specific score matrix) object.

    Raises:
     - ValueError if the alignment contains no letters, if axis_seq does
       not have the alignment length, or if a residue is encountered that
       has no entry in the score dictionary.
     - TypeError if chars_to_ignore is not a list.

    Emits a BiopythonDeprecationWarning when called.
    """
    warnings.warn(
        "The `pos_specific_score_matrix` method is deprecated and will be "
        "removed in a future release of Biopython. As an alternative, you "
        "can convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. For example, for a multiple sequence "
        "alignment `msa` of DNA nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> counts = motif.counts\n"
        "\n"
        "The `counts` object contains the same information as the PSSM "
        "returned by `pos_specific_score_matrix`, but note that the "
        "indices are reversed:\n"
        "\n"
        ">>> counts[letter][i] == pssm[index][letter]\n"
        "True\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    # determine all of the letters we have to deal with
    all_letters = self._get_all_letters()
    if not all_letters:
        raise ValueError("_get_all_letters returned empty string")

    if chars_to_ignore is None:
        chars_to_ignore = []
    if not isinstance(chars_to_ignore, list):
        raise TypeError("chars_to_ignore should be a list.")

    gap_char = "-"
    # Build a new list instead of appending, so that the list owned by the
    # caller does not grow a gap character on every call.
    chars_to_ignore = [*chars_to_ignore, gap_char]

    for char in chars_to_ignore:
        all_letters = all_letters.replace(char, "")

    if axis_seq:
        left_seq = axis_seq
        if len(axis_seq) != self.alignment.get_alignment_length():
            raise ValueError(
                "Axis sequence length does not equal the get_alignment_length"
            )
    else:
        left_seq = self.dumb_consensus()

    pssm_info = []
    # one (axis letter, score dictionary) entry per column
    for residue_num in range(len(left_seq)):
        score_dict = dict.fromkeys(all_letters, 0)
        for record in self.alignment:
            try:
                this_residue = record.seq[residue_num]
            except IndexError:
                # we've run out of sequence for this record, so it
                # contributes nothing to this column
                this_residue = None

            if this_residue and this_residue not in chars_to_ignore:
                weight = record.annotations.get("weight", 1.0)
                try:
                    score_dict[this_residue] += weight
                except KeyError:
                    raise ValueError(
                        "Residue %s not found" % this_residue
                    ) from None

        pssm_info.append((left_seq[residue_num], score_dict))

    return PSSM(pssm_info)
def information_content(
    self,
    start=0,
    end=None,
    e_freq_table=None,
    log_base=2,
    chars_to_ignore=None,
    pseudo_count=0,
):
    """Calculate the information content for each residue along an alignment (deprecated).

    Arguments:
     - start, end - The starting an ending points to calculate the
       information content. These points should be relative to the first
       sequence in the alignment, starting at zero (ie. even if the 'real'
       first position in the seq is 203 in the initial sequence, for
       the info content, we need to use zero). This defaults to the entire
       length of the first sequence.
     - e_freq_table - A dictionary specifying the expected frequencies
       for each letter (e.g. {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}).
       Gap characters should not be included, since these should not have
       expected frequencies.
       NOTE(review): no default background frequency is derived in this
       method (random_expected is always passed on as None), so callers
       appear to need to supply this table - confirm.
     - log_base - The base of the logarithm to use in calculating the
       information content. This defaults to 2 so the info is in bits.
     - chars_to_ignore - A listing of characters which should be ignored
       in calculating the info content. Defaults to none.
     - pseudo_count - Pseudo count passed on to the per-column frequency
       calculation. Defaults to 0.

    Returns:
     - A number representing the info content for the specified region.

    Side effect: the per-column values are stored, in column order, in the
    ic_vector attribute, replacing its previous contents.

    Raises ValueError if start is negative or end exceeds the length of
    the first sequence. Emits a BiopythonDeprecationWarning when called.

    Please see the Biopython manual for more information on how information
    content is calculated.
    """
    warnings.warn(
        "The `information_content` method and `ic_vector` attribute of the "
        "`SummaryInfo` class are deprecated and will be removed in a "
        "future release of Biopython. As an alternative, you can convert "
        "the multiple sequence alignment object to a new-style Alignment "
        "object via its `.alignment` property, create a Motif object "
        "from it, and use the `relative_entropy` attribute of the Motif "
        "object. "
        "For example, for a multiple sequence alignment `msa` of "
        "DNA nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> relative_entropy = motif.relative_entropy\n"
        "\n"
        "The `relative_entropy` array contains the same values as the "
        "`ic_vector` attribute of the `SummaryInfo` object. Its sum is "
        "equal to the value returned by the `information_content` method. "
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    first_length = len(self.alignment[0].seq)
    # if no end was specified, then we default to the end of the sequence
    if end is None:
        end = first_length
    if chars_to_ignore is None:
        chars_to_ignore = []

    if start < 0 or end > first_length:
        raise ValueError(
            "Start (%s) and end (%s) are not in the range %s to %s"
            % (start, end, 0, first_length)
        )
    # no random expected frequency is available; see the note on e_freq_table
    random_expected = None
    # determine all of the letters we have to deal with
    all_letters = self._get_all_letters()
    for char in chars_to_ignore:
        all_letters = all_letters.replace(char, "")

    column_scores = []
    for residue_num in range(start, end):
        freq_dict = self._get_letter_freqs(
            residue_num,
            self.alignment,
            all_letters,
            chars_to_ignore,
            pseudo_count,
            e_freq_table,
            random_expected,
        )
        column_scores.append(
            self._get_column_info_content(
                freq_dict, e_freq_table, log_base, random_expected
            )
        )
    total_info = sum(column_scores)
    # ic_vector holds the IC for each column; it is only replaced once all
    # columns were calculated successfully
    self.ic_vector = column_scores
    return total_info
def _get_letter_freqs(
self,
residue_num,
all_records,
letters,
to_ignore,
pseudo_count=0,
e_freq_table=None,
random_expected=None,
):
"""Determine the frequency of specific letters in the alignment (PRIVATE).
Arguments:
- residue_num - The number of the column we are getting frequencies
from.
- all_records - All of the SeqRecords in the alignment.
- letters - The letters we are interested in getting the frequency
for.
- to_ignore - Letters we are specifically supposed to ignore.
- pseudo_count - Optional argument specifying the Pseudo count (k)
to add in order to prevent a frequency of 0 for a letter.
- e_freq_table - An optional argument specifying a dictionary with
the expected frequencies for each letter.
- random_expected - Optional argument that specify the frequency to use
when e_freq_table is not defined.
This will calculate the frequencies of each of the specified letters
in the alignment at the given frequency, and return this as a
dictionary where the keys are the letters and the values are the
frequencies. Pseudo count can be added to prevent a null frequency
"""
freq_info = dict.fromkeys(letters, 0)
total_count = 0
gap_char = "-"
if pseudo_count < 0:
raise ValueError(
"Positive value required for pseudo_count, %s provided" % (pseudo_count)
)
# collect the count info into the dictionary for all the records
for record in all_records:
try:
if record.seq[residue_num] not in to_ignore:
weight = record.annotations.get("weight", 1.0)
freq_info[record.seq[residue_num]] += weight
total_count += weight
except KeyError:
raise ValueError(
"Residue %s not found in letters %s"
% (record.seq[residue_num], letters)
) from None
if e_freq_table:
# check if all the residus in freq_info are in e_freq_table
for key in freq_info:
if key != gap_char and key not in e_freq_table:
raise ValueError("%s not found in expected frequency table" % key)
if total_count == 0:
# This column must be entirely ignored characters
for letter in freq_info:
if freq_info[letter] != 0:
raise ValueError("freq_info[letter] is not 0")
# TODO - Map this to NA or NaN?
else:
# now convert the counts into frequencies
for letter in freq_info:
if pseudo_count and (random_expected or e_freq_table):
# use either the expected random freq or the
if e_freq_table:
ajust_freq = e_freq_table[letter]
else:
ajust_freq = random_expected
ajusted_letter_count = freq_info[letter] + ajust_freq * pseudo_count
ajusted_total = total_count + pseudo_count
freq_info[letter] = ajusted_letter_count / ajusted_total
else:
freq_info[letter] = freq_info[letter] / total_count
return freq_info
def _get_column_info_content(
self, obs_freq, e_freq_table, log_base, random_expected
):
"""Calculate the information content for a column (PRIVATE).
Arguments:
- obs_freq - The frequencies observed for each letter in the column.
- e_freq_table - An optional argument specifying a dictionary with
the expected frequencies for each letter.
- log_base - The base of the logarithm to use in calculating the
info content.
"""
gap_char = "-"
if e_freq_table:
# check the expected freq information to make sure it is good
for key in obs_freq:
if key != gap_char and key not in e_freq_table:
raise ValueError(
f"Frequency table provided does not contain observed letter {key}"
)
total_info = 0.0
for letter in obs_freq:
inner_log = 0.0
# if we have expected frequencies, modify the log value by them
# gap characters do not have expected frequencies, so they
# should just be the observed frequency.
if letter != gap_char:
if e_freq_table:
inner_log = obs_freq[letter] / e_freq_table[letter]
else:
inner_log = obs_freq[letter] / random_expected
# if the observed frequency is zero, we don't add any info to the
# total information content
if inner_log > 0:
letter_info = (
obs_freq[letter] * math.log(inner_log) / math.log(log_base)
)
total_info += letter_info
return total_info
def get_column(self, col):
    """Return column of alignment.

    The wrapped alignment is indexed with a full row slice and ``col``;
    whatever that indexing yields is returned unchanged.
    """
    # TODO - Deprecate this and implement slicing?
    every_row = slice(None)
    return self.alignment[every_row, col]
class PSSM:
    """Represent a position specific score matrix (deprecated).

    This class is meant to make it easy to access the info within a PSSM
    and also make it easy to print out the information in a nice table.

    Let's say you had an alignment like this::

        GTATC
        AT--C
        CTGTC

    With the first sequence on the axis, the position specific score
    matrix (when printed) has one row per position and one column per
    residue, with the residues in sorted order and the scores printed
    with one decimal (column spacing shown approximately)::

            A   C   G   T
        G  1.0 1.0 1.0 0.0
        T  0.0 0.0 0.0 3.0
        A  1.0 0.0 1.0 0.0
        T  0.0 0.0 0.0 2.0
        C  0.0 3.0 0.0 0.0

    You can access a single element of the PSSM using the following::

        your_pssm[sequence_number][residue_count_name]

    For instance, to get the 'T' residue for the second element in the
    above alignment you would need to do:

    your_pssm[1]['T']
    """

    def __init__(self, pssm):
        """Initialize with pssm data to represent.

        The pssm passed should be a list of items with the following
        structure:

        item[0] - The letter of the residue being represented (for instance,
        from the example above, the first few item[0]s would be GTAT...
        item[1] - A dictionary with the letter substitutions and counts.

        Emits a BiopythonDeprecationWarning.
        """
        warnings.warn(
            "The `PSSM` class is deprecated and will be removed in a future "
            "release of Biopython. As an alternative, you can convert the "
            "multiple sequence alignment object to a new-style Alignment "
            "object by via its `.alignment` property, and then create a Motif "
            "object. For example, for a multiple sequence alignment `msa` of "
            "DNA nucleotides, you would do: "
            "\n"
            ">>> alignment = msa.alignment\n"
            ">>> from Bio.motifs import Motif\n"
            ">>> motif = Motif('ACGT', alignment)\n"
            ">>> counts = motif.counts\n"
            "\n"
            "The `counts` object contains the same information as the PSSM "
            "returned by `pos_specific_score_matrix`, but note that the "
            "indices are reversed:\n"
            "\n"
            ">>> counts[letter][i] == pssm[index][letter]\n"
            "True\n"
            "\n"
            "If your multiple sequence alignment object was obtained using "
            "Bio.AlignIO, then you can obtain a new-style Alignment object "
            "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
            "or Bio.Align.parse instead of Bio.AlignIO.parse.",
            BiopythonDeprecationWarning,
            # Attribute the warning to the caller rather than to this class.
            stacklevel=2,
        )
        self.pssm = pssm

    def __getitem__(self, pos):
        """Return the score dictionary stored for position pos."""
        return self.pssm[pos][1]

    def __str__(self):
        """Return the matrix as a table with one line per position."""
        # The column order is taken from the first position; an empty
        # matrix has no columns and prints as a bare header line.
        all_residues = sorted(self.pssm[0][1]) if self.pssm else []

        # first the top header, then one line per item with its scores
        lines = [" " + "".join(" %s" % res for res in all_residues)]
        for item in self.pssm:
            scores = "".join(" %.1f" % item[1][res] for res in all_residues)
            lines.append("%s " % item[0] + scores)
        return "\n".join(lines) + "\n"

    def get_residue(self, pos):
        """Return the residue letter at the specified position."""
        return self.pssm[pos][0]
def print_info_content(summary_info, fout=None, rep_record=0):
    """3 column output: position, aa in representative sequence, ic_vector value.

    Arguments:
     - summary_info - Object providing the alignment and ic_vector
       attributes and the information_content method. If its ic_vector is
       empty, information_content() is called (with default arguments)
       first to fill it.
     - fout - Object with a write method to send the output to; defaults
       to sys.stdout.
     - rep_record - Index of the record in the alignment to use as the
       representative sequence.

    One line is written per position, pairing the representative sequence
    with ic_vector and stopping at the end of the shorter of the two.
    Emits a BiopythonDeprecationWarning when called.
    """
    warnings.warn(
        "The `print_info_content` function is deprecated and will be removed "
        "in a future release of Biopython.",
        BiopythonDeprecationWarning,
        # Attribute the warning to the caller rather than to this module.
        stacklevel=2,
    )
    fout = fout or sys.stdout
    if not summary_info.ic_vector:
        summary_info.information_content()
    rep_sequence = summary_info.alignment[rep_record]
    for pos, (aa, ic) in enumerate(zip(rep_sequence, summary_info.ic_vector)):
        fout.write("%d %s %.3f\n" % (pos, aa, ic))

View File

@ -97,10 +97,10 @@ wrapped is no longer available since SCOP moved to the EBI website.
Bio.AlignInfo
-------------
The ``pos_specific_score_matrix`` method of the ``SummaryInfo`` class and the
``PSSM`` class were deprecated in release 1.82. As an alternative, please use
the ``alignment`` property of a ``MultipleSeqAlignment`` object to obtain a
new-style ``Alignment`` object, and use it to create a ``Bio.motifs.Motif``
object. For example,
``PSSM`` class were deprecated in release 1.82, and removed in release 1.85. As
an alternative, please use the ``alignment`` property of a ``MultipleSeqAlignment``
object to obtain a new-style ``Alignment`` object, and use it to create a
``Bio.motifs.Motif`` object. For example,
>>> alignment = msa.alignment
>>> from Bio.motifs import Motif
@ -114,17 +114,17 @@ The ``counts`` object contains the same information as the PSSM returned by
True
The ``information_content`` method and the ``ic_vector`` attribute of the
``SummaryInfo`` class were deprecated in release 1.82. As an alternative,
please use the ``relative_entropy`` attribute of the ``motif`` instance (see
above); it contains the same values as the ``ic_vector`` attribute, while
``sum(relative_entropy)`` is equal to the value returned by
``SummaryInfo`` class were deprecated in release 1.82, and removed in release 1.85.
As an alternative, please use the ``relative_entropy`` attribute of the ``motif``
instance (see above); it contains the same values as the ``ic_vector`` attribute,
while ``sum(relative_entropy)`` is equal to the value returned by
``information_content``.
The ``replacement_dictionary`` method of the ``SummaryInfo`` class was
deprecated in release 1.82. As an alternative, please use the ``alignment``
property of the ``MultipleSeqAlignment`` object to obtain a new-style
``Alignment`` object, and use its ``substitutions`` attribute to obtain the
replacement dictionary:
deprecated in release 1.82, and removed in release 1.85. As an alternative, please
use the ``alignment`` property of the ``MultipleSeqAlignment`` object to obtain a
new-style ``Alignment`` object, and use its ``substitutions`` attribute to obtain
the replacement dictionary:
>>> alignment = msa.alignment
>>> dictionary = alignment.substitutions
@ -135,10 +135,10 @@ by using ``Bio.Align.read`` instead of ``Bio.AlignIO.read``, or
``Bio.Align.parse`` instead of ``Bio.AlignIO.parse``.
The ``dumb_consensus`` and ``gap_consensus`` methods of the ``SummaryInfo``
class were deprecated in Release 1.82.
class were deprecated in release 1.82, and removed in release 1.85.
The ``print_info_content`` function in ``Bio.Align.AlignInfo`` was deprecated
in Release 1.82.
in release 1.82, and removed in release 1.85.
Bio.kNN
-------

View File

@ -1098,364 +1098,6 @@ add entries for missing letters, for example
This also allows you to change the order of letters in the alphabet.
.. _`sec:summary_info`:
Calculating summary information
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Once you have an alignment, you are very likely going to want to find
out information about it. Instead of trying to have all of the functions
that can generate information about an alignment in the alignment object
itself, we've tried to separate out the functionality into separate
classes, which act on the alignment.
Getting ready to calculate summary information about an object is quick
to do. Let's say we've got an alignment object called ``alignment``, for
example read in using ``Bio.AlignIO.read(...)`` as described in
Chapter :ref:`chapter:msa`. All we need to do to get an object that
will calculate summary information is:
.. cont-doctest
.. code:: pycon
>>> from Bio.Align import AlignInfo
>>> summary_align = AlignInfo.SummaryInfo(msa)
The ``summary_align`` object is very useful, and will do the following
neat things for you:
#. Calculate a quick consensus sequence - see
section :ref:`sec:consensus`
#. Get a position specific score matrix for the alignment - see
section :ref:`sec:pssm`
#. Calculate the information content for the alignment - see
section :ref:`sec:getting_info_content`
#. Generate information on substitutions in the alignment -
section :ref:`sec:substitution_matrices`
details using this to generate a substitution matrix.
.. _`sec:consensus`:
Calculating a quick consensus sequence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``SummaryInfo`` object, described in
section :ref:`sec:summary_info`, provides functionality to
calculate a quick consensus of an alignment. Assuming we've got a
``SummaryInfo`` object called ``summary_align`` we can calculate a
consensus by doing:
.. cont-doctest
.. code:: pycon
>>> consensus = summary_align.dumb_consensus()
>>> consensus
Seq('XCTXCTX')
As the name suggests, this is a really simple consensus calculator, and
will just add up all of the residues at each point in the consensus, and
if the most common value is higher than some threshold value will add
the common residue to the consensus. If it doesn't reach the threshold,
it adds an ambiguity character to the consensus. The returned consensus
object is a ``Seq`` object.
You can adjust how ``dumb_consensus`` works by passing optional
parameters:
the threshold
This is the threshold specifying how common a particular residue has
to be at a position before it is added. The default is :math:`0.7`
(meaning :math:`70\%`).
the ambiguous character
This is the ambiguity character to use. The default is ``"X"``.
Alternatively, you can convert the multiple sequence alignment object
``msa`` to a new-style ``Alignment`` object (see section
:ref:`sec:alignmentobject`) by using the
``alignment`` attribute (see section :ref:`sec:alignment_newstyle`):
.. cont-doctest
.. code:: pycon
>>> alignment = msa.alignment
You can then create a ``Motif`` object (see section
:ref:`sec:motif_object`):
.. cont-doctest
.. code:: pycon
>>> from Bio.motifs import Motif
>>> motif = Motif("ACGT", alignment)
and obtain a quick consensus sequence:
.. cont-doctest
.. code:: pycon
>>> motif.consensus
Seq('ACTCCTA')
The ``motif.counts.calculate_consensus`` method (see section
:ref:`sec:motif_consensus`) lets you specify in
detail how the consensus sequence should be calculated. For example,
.. cont-doctest
.. code:: pycon
>>> motif.counts.calculate_consensus(identity=0.7)
'NCTNCTN'
.. _`sec:pssm`:
Position Specific Score Matrices
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Position specific score matrices (PSSMs) summarize the alignment
information in a different way than a consensus, and may be useful for
different tasks. Basically, a PSSM is a count matrix. For each column in
the alignment, the number of occurrences of each letter of the alphabet
is counted and totaled. The totals are displayed relative to some
representative sequence along the left axis. This sequence may be the
consensus sequence, but can also be any sequence in the alignment.
For instance for the alignment above:
.. cont-doctest
.. code:: pycon
>>> print(msa)
Alignment with 4 rows and 7 columns
ACTCCTA seq1
AAT-CTA seq2
CCTACT- seq3
TCTCCTC seq4
we get a PSSM with the consensus sequence along the side using
.. cont-doctest
.. code:: pycon
>>> my_pssm = summary_align.pos_specific_score_matrix(consensus, chars_to_ignore=["N"])
>>> print(my_pssm)
A C T
X 2.0 1.0 1.0
C 1.0 3.0 0.0
T 0.0 0.0 4.0
X 1.0 2.0 0.0
C 0.0 4.0 0.0
T 0.0 0.0 4.0
X 2.0 1.0 0.0
<BLANKLINE>
where we ignore any ``N`` ambiguity residues when calculating the PSSM.
Two notes should be made about this:
#. To maintain strictness with the alphabets, you can only include
characters along the top of the PSSM that are in the alphabet of the
alignment object. Gaps are not included along the top axis of the
PSSM.
#. The sequence passed to be displayed along the left side of the axis
does not need to be the consensus. For instance, if you wanted to
display the second sequence in the alignment along this axis, you
would need to do:
.. cont-doctest
.. code:: pycon
>>> second_seq = msa[1]
>>> my_pssm = summary_align.pos_specific_score_matrix(second_seq, chars_to_ignore=["N"])
>>> print(my_pssm)
A C T
A 2.0 1.0 1.0
A 1.0 3.0 0.0
T 0.0 0.0 4.0
- 1.0 2.0 0.0
C 0.0 4.0 0.0
T 0.0 0.0 4.0
A 2.0 1.0 0.0
<BLANKLINE>
The command above returns a ``PSSM`` object. You can access any element
of the PSSM by subscripting like
``your_pssm[sequence_number][residue_count_name]``. For instance, to get
the counts for the 'T' residue in the sixth element (index 5) of the above
PSSM you would do:
.. cont-doctest
.. code:: pycon
>>> print(my_pssm[5]["T"])
4.0
The structure of the PSSM class hopefully makes it easy both to access
elements and to pretty print the matrix.
Alternatively, you can convert the multiple sequence alignment object
``msa`` to a new-style ``Alignment`` object (see section
:ref:`sec:alignmentobject`) by using the
``alignment`` attribute (see section :ref:`sec:alignment_newstyle`):
.. cont-doctest
.. code:: pycon
>>> alignment = msa.alignment
You can then create a ``Motif`` object (see section
:ref:`sec:motif_object`):
.. cont-doctest
.. code:: pycon
>>> from Bio.motifs import Motif
>>> motif = Motif("ACGT", alignment)
and obtain the counts of each nucleotide in each position:
.. cont-doctest
.. code:: pycon
>>> counts = motif.counts
>>> print(counts)
0 1 2 3 4 5 6
A: 2.00 1.00 0.00 1.00 0.00 0.00 2.00
C: 1.00 3.00 0.00 2.00 4.00 0.00 1.00
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00
T: 1.00 0.00 4.00 0.00 0.00 4.00 0.00
<BLANKLINE>
>>> print(counts["T"][5])
4.0
.. _`sec:getting_info_content`:
Information Content
~~~~~~~~~~~~~~~~~~~
A potentially useful measure of evolutionary conservation is the
information content of a sequence.
A useful introduction to information theory targeted towards molecular
biologists can be found at
http://www.lecb.ncifcrf.gov/~toms/paper/primer/. For our purposes, we
will be looking at the information content of a consensus sequence, or a
portion of a consensus sequence. We calculate information content at a
particular column in a multiple sequence alignment using the following
formula:
.. math:: IC_{j} = \sum_{i=1}^{N_{a}} P_{ij} \mathrm{log}\left(\frac{P_{ij}}{Q_{i}}\right)
where:
- :math:`IC_{j}` - The information content for the :math:`j`-th column
in an alignment.
- :math:`N_{a}` - The number of letters in the alphabet.
- :math:`P_{ij}` - The frequency of a particular letter :math:`i` in
the :math:`j`-th column (i.e. if G occurred 3 out of 6 times in an
alignment column, this would be 0.5).
- :math:`Q_{i}` - The expected frequency of a letter :math:`i`. This is
an optional argument, usage of which is left at the user's
discretion. By default, it is automatically assigned to
:math:`0.05 = 1/20` for a protein alphabet, and :math:`0.25 = 1/4`
for a nucleic acid alphabet. This is for getting the information
content without any assumption of prior distributions. When assuming
priors, or when using a non-standard alphabet, you should supply the
values for :math:`Q_{i}`.
Well, now that we have an idea what information content is being
calculated in Biopython, let's look at how to get it for a particular
region of the alignment.
First, we need to use our alignment to get an alignment summary object,
which we'll assume is called ``summary_align`` (see
section :ref:`sec:summary_info` for instructions on how to get
this). Once we've got this object, calculating the information content
for a region is as easy as:
.. cont-doctest
.. code:: pycon
>>> e_freq_table = {"A": 0.3, "G": 0.2, "T": 0.3, "C": 0.2}
>>> info_content = summary_align.information_content(
... 2, 6, e_freq_table=e_freq_table, chars_to_ignore=["N"]
... )
>>> info_content # doctest:+ELLIPSIS
6.3910647...
Now, ``info_content`` will contain the relative information content over
the region [2:6] in relation to the expected frequencies.
The returned value is calculated using base 2 as the logarithm base in the
formula above. You can modify this by passing the parameter ``log_base``
as the base you want:
.. cont-doctest
.. code:: pycon
>>> info_content = summary_align.information_content(
... 2, 6, e_freq_table=e_freq_table, log_base=10, chars_to_ignore=["N"]
... )
>>> info_content # doctest:+ELLIPSIS
1.923902...
By default nucleotide or amino acid residues with a frequency of 0 in a
column are not taken into account when the relative information content
for that column is computed. If this is not the desired result, you can
use ``pseudo_count`` instead.
.. cont-doctest
.. code:: pycon
>>> info_content = summary_align.information_content(
... 2, 6, e_freq_table=e_freq_table, chars_to_ignore=["N", "-"], pseudo_count=1
... )
>>> info_content # doctest:+ELLIPSIS
4.299651...
In this case, the observed frequency :math:`P_{ij}` of a particular
letter :math:`i` in the :math:`j`-th column is computed as follows:
.. math:: P_{ij} = \frac{n_{ij} + k\times Q_{i}}{N_{j} + k}
where:
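- :math:`n_{ij}` - the number of times the letter :math:`i` occurs in
the :math:`j`-th column.
- :math:`N_{j}` - the total number of letters counted in the
:math:`j`-th column.
- :math:`k` - the pseudo count you pass as argument.
- :math:`Q_{i}` - The expected frequency of the letter :math:`i` as
described above.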
Well, now you are ready to calculate information content. If you want to
try applying this to some real life problems, it would probably be best
to dig into the literature on information content to get an idea of how
it is used. Hopefully your digging won't reveal any mistakes made in
coding this function!
.. _`sec:alignment_newstyle`:
Getting a new-style Alignment object

View File

@ -242,141 +242,6 @@ class TestAlignIO_reading(unittest.TestCase):
if alignment_len > 5:
self.assertEqual(alignment[:, -1], columns[-1])
def check_summary_simple(self, msa):
    """Check the deprecated dumb consensus against the Motif-based consensus.

    The deprecated ``SummaryInfo.dumb_consensus`` (which must emit a
    ``BiopythonDeprecationWarning``) is computed for ``msa`` with a
    threshold of 0.7, the letters of columns made up solely of ``-`` are
    dropped from it, and the result must equal
    ``Motif.counts.calculate_consensus(identity=0.7)`` built from the
    new-style alignment.
    """
    info = AlignInfo.SummaryInfo(msa)
    with self.assertWarns(BiopythonDeprecationWarning):
        legacy_consensus = info.dumb_consensus(threshold=0.7)
    # The motif alphabet is every letter seen in the alignment except the gap.
    motif_alphabet = info._get_all_letters().replace("-", "")
    new_style = msa.alignment
    motif = Motif(motif_alphabet, new_style)
    # A column string equal to this one contains nothing but gap characters.
    gap_only_column = "-" * len(new_style)
    retained = []
    for column, residue in enumerate(legacy_consensus):
        if msa[:, column] != gap_only_column:
            retained.append(residue)
    legacy_consensus = "".join(retained)
    motif_consensus = motif.counts.calculate_consensus(identity=0.7)
    self.assertEqual(legacy_consensus, motif_consensus)
def check_summary(self, msa, molecule_type):
    """Cross-check deprecated ``SummaryInfo`` results against ``Motif`` results.

    Arguments:
     - msa - the multiple sequence alignment to summarise; its records are
       upper-cased in place.
     - molecule_type - one of "DNA", "RNA" or "protein"; selects the
       alphabet and the ambiguity character used for the consensus.

    Raises ValueError for any other ``molecule_type``.

    Every deprecated ``SummaryInfo`` call is wrapped in ``assertWarns`` so
    the test also verifies that a ``BiopythonDeprecationWarning`` is issued.
    """
    # Check AlignInfo.SummaryInfo likes the alignment; smoke test only
    if molecule_type == "DNA":
        letters = IUPACData.unambiguous_dna_letters
        ambiguous_letters = IUPACData.ambiguous_dna_letters
        ambiguous = "N"
    elif molecule_type == "RNA":
        letters = IUPACData.unambiguous_rna_letters
        ambiguous_letters = IUPACData.ambiguous_rna_letters
        ambiguous = "N"
    elif molecule_type == "protein":
        letters = IUPACData.protein_letters
        ambiguous_letters = IUPACData.protein_letters
        ambiguous = "X"
    else:
        raise ValueError(f"Unknown molecule type '{molecule_type}'")
    # Ignore the gap and every uppercase ASCII letter outside the alphabet.
    chars_to_ignore = set("-" + string.ascii_uppercase).difference(letters)
    # Upper-case both the old-style records and the new-style sequences so
    # the two representations are compared on the same letters.
    for record in msa:
        record.seq = record.seq.upper()
    summary = AlignInfo.SummaryInfo(msa)
    alignment = msa.alignment  # New-style alignment
    alignment.sequences = [sequence.upper() for sequence in alignment.sequences]
    all_letters = summary._get_all_letters()
    # Motif alphabet: the standard letters plus anything seen in the
    # alignment, minus the gap character.
    motif_letters = "".join(set(all_letters).union(letters))
    motif_letters = motif_letters.replace("-", "")
    # NOTE(review): special case for an alphabet of exactly C, G, Y, T, W,
    # A, R; the reason "X" is needed here is not visible in this block.
    if set(motif_letters) == set("CGYTWAR"):
        ambiguous = "X"
    motif = Motif(motif_letters, alignment)
    counts = motif.counts
    with self.assertWarns(BiopythonDeprecationWarning):
        dumb_consensus = summary.dumb_consensus(ambiguous=ambiguous)
    consensus = counts.calculate_consensus(identity=0.7)
    # skip columns consisting of gaps only:
    gaps = "-" * len(alignment)
    dumb_consensus = "".join(
        [
            letter
            for index, letter in enumerate(dumb_consensus)
            if msa[:, index] != gaps
        ]
    )
    self.assertEqual(consensus, dumb_consensus)
    with self.assertWarns(BiopythonDeprecationWarning):
        pssm = summary.pos_specific_score_matrix()
    all_letters = summary._get_all_letters()
    # i walks the columns of the new-style alignment, j the columns of msa;
    # j is advanced past gap-only msa columns before each comparison
    # (presumably because the new-style alignment does not contain them).
    j = 0
    for i in range(alignment.length):
        while set(msa[:, j]) == set("-"):
            j += 1
        for letter in letters:
            count = counts[letter][i]
            if letter in all_letters:
                self.assertAlmostEqual(count, pssm[j][letter])
            else:
                # Letter never occurs in the alignment, so it is not looked
                # up in the PSSM; the motif count must be zero.
                self.assertAlmostEqual(count, 0.0)
        j += 1
    # Only the deprecation warning is checked here; the returned dictionary
    # is immediately replaced by the new-style substitutions.
    with self.assertWarns(BiopythonDeprecationWarning):
        rep_dict = summary.replacement_dictionary(skip_chars=None, letters=letters)
    rep_dict = alignment.substitutions
    # Uniform expected frequency, assigned to both cases of every letter.
    e_freq = 1.0 / len(letters)
    ambiguous_letters = ambiguous_letters.upper() + ambiguous_letters.lower()
    motif = Motif(letters, alignment)
    e_freq_table = dict.fromkeys(ambiguous_letters, e_freq)
    with self.assertWarns(BiopythonDeprecationWarning):
        info_content = summary.information_content(
            e_freq_table=e_freq_table, chars_to_ignore=chars_to_ignore
        )
    # The summed relative entropy of the motif must match the deprecated
    # information content when the same background is used.
    motif.background = e_freq_table
    relative_entropy = sum(motif.relative_entropy)
    self.assertAlmostEqual(info_content, relative_entropy)
def check_summary_pir(self, msa):
    """Cross-check deprecated ``SummaryInfo`` results for a DNA alignment.

    Uses the unambiguous DNA alphabet. The deprecated consensus, PSSM and
    information content (each expected to emit a
    ``BiopythonDeprecationWarning``) are compared with the consensus,
    counts and summed relative entropy of a ``Motif`` built from the
    new-style alignment.
    """
    dna = IUPACData.unambiguous_dna_letters
    info = AlignInfo.SummaryInfo(msa)
    seen_letters = info._get_all_letters()
    new_style = msa.alignment
    motif = Motif(dna, new_style)
    motif_counts = motif.counts

    # Consensus: drop letters of gap-only columns from the deprecated result.
    with self.assertWarns(BiopythonDeprecationWarning):
        legacy_consensus = info.dumb_consensus(ambiguous="N")
    gap_only_column = "-" * len(new_style)
    legacy_consensus = "".join(
        residue
        for column, residue in enumerate(legacy_consensus)
        if msa[:, column] != gap_only_column
    )
    motif_consensus = motif_counts.calculate_consensus(identity=0.7)
    self.assertEqual(motif_consensus, legacy_consensus)

    # Counts: msa_column skips over gap-only columns of the old-style
    # alignment while motif_column walks the new-style alignment.
    with self.assertWarns(BiopythonDeprecationWarning):
        legacy_pssm = info.pos_specific_score_matrix()
    msa_column = 0
    for motif_column in range(new_style.length):
        while set(msa[:, msa_column]) == {"-"}:
            msa_column += 1
        for nucleotide in dna:
            observed = motif_counts[nucleotide][motif_column]
            # Letters absent from the alignment are not looked up in the PSSM.
            expected = (
                legacy_pssm[msa_column][nucleotide]
                if nucleotide in seen_letters
                else 0.0
            )
            self.assertAlmostEqual(observed, expected)
        msa_column += 1

    # Substitutions: only the deprecation warning is checked.
    with self.assertWarns(BiopythonDeprecationWarning):
        substitutions = info.replacement_dictionary(skip_chars=None, letters=dna)
    substitutions = new_style.substitutions

    # Information content with a uniform background over both letter cases.
    uniform = 1.0 / len(dna)
    background = dict.fromkeys(dna.upper() + dna.lower(), uniform)
    with self.assertWarns(BiopythonDeprecationWarning):
        legacy_ic = info.information_content(
            e_freq_table=background, chars_to_ignore=["-"]
        )
    motif_ic = sum(motif.relative_entropy)
    self.assertAlmostEqual(legacy_ic, motif_ic)
def test_reading_alignments_clustal1(self):
path = "Clustalw/clustalw.aln"
self.check_iterator_for_loop_handle(path, "clustal", 1, 2)
@ -402,7 +267,6 @@ class TestAlignIO_reading(unittest.TestCase):
"clustal_consensus": " * *: :: :. :* : :. : . :* :: .: ** **:... *.*** .. .:* * *: .* :* : :* .* *::. . .:: :*..* :* .* .. . : . : *. .:: : . .* . : *.: ..:: * . :: : .*. :. :. . . .* **.*.. :.. *.. . . ::* :.: .*: : * :: *** . * :. . . : *: .:: ::: .. . : : :: * * : .. :.* . ::. :: * : : * * :.. * .. * :** . .:. .. :*. ..: :. . .:* * : : * . ..*:. .** *.*... : :: :* .* ::* : :. :. : "
},
)
self.check_summary(alignment, "protein")
def test_reading_alignments_clustal2(self):
path = "Clustalw/opuntia.aln"
@ -417,7 +281,6 @@ class TestAlignIO_reading(unittest.TestCase):
alignment,
["TTTTTTT", "AAAAAAA", "TTTTTTT", "AAAAAAA", "CCCCCCC", "AAAAAAA"],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_clustal3(self):
path = "Clustalw/hedgehog.aln"
@ -431,7 +294,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignment, ["M----", "F----", "N----", "L----", "V----", "---SS"]
)
self.check_summary(alignment, "protein")
def test_reading_alignments_clustal4(self):
path = "Clustalw/odd_consensus.aln"
@ -452,7 +314,6 @@ class TestAlignIO_reading(unittest.TestCase):
"clustal_consensus": " * * *** ***** * * ** *******************************************************************************************************************************************************************************"
},
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_clustal5(self):
path = "Clustalw/protein.aln"
@ -474,7 +335,6 @@ class TestAlignIO_reading(unittest.TestCase):
"-------------------T",
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_clustal6(self):
path = "Clustalw/promals3d.aln"
@ -496,7 +356,6 @@ class TestAlignIO_reading(unittest.TestCase):
"-T------------------",
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_fasta(self):
path = "GFF/multi.fna" # Trivial nucleotide alignment
@ -511,7 +370,6 @@ class TestAlignIO_reading(unittest.TestCase):
alignment,
[("test1", "ACGTCGCG"), ("test2", "GGGGCCCC"), ("test3", "AAACACAC")],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_nexus1(self):
path = "Nexus/test_Nexus_input.nex"
@ -532,7 +390,6 @@ class TestAlignIO_reading(unittest.TestCase):
"tt--?ag?c",
],
)
self.check_summary_simple(alignment)
def test_reading_alignments_nexus2(self):
path = "Nexus/codonposset.nex"
@ -549,7 +406,6 @@ class TestAlignIO_reading(unittest.TestCase):
("Aerodramus", "?????????TTGTGGTGGGAAT"),
],
)
self.check_summary_simple(alignment)
def test_reading_alignments_msf1(self):
path = "msf/DOA_prot.msf"
@ -586,7 +442,6 @@ class TestAlignIO_reading(unittest.TestCase):
"LLLLLL----L",
],
)
self.check_summary_simple(alignment)
def test_reading_alignments_stockholm1(self):
path = "Stockholm/simple.sth"
@ -607,7 +462,6 @@ class TestAlignIO_reading(unittest.TestCase):
"secondary_structure": ".................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>..............."
},
)
self.check_summary(alignment, "RNA")
def test_reading_alignments_stockholm2(self):
path = "Stockholm/funny.sth"
@ -621,7 +475,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignment, ["MMMEEE", "TQIVVV", "CHEMMM", "RVALLL", "ASDTTT", "SYSEEE"]
)
self.check_summary(alignment, "protein")
def test_reading_alignments_phylip1(self):
path = "Phylip/reference_dna.phy"
@ -635,7 +488,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignment, ["CCTTCG", "GGAAAG", "ATAAAC", "TTTTAA", "GAGGAG", "CTTTTC"]
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip2(self):
path = "Phylip/reference_dna2.phy"
@ -649,7 +501,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignment, ["CCTTCG", "GGAAAG", "ATAAAC", "TTTTAA", "GAGGAG", "CTTTTC"]
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip3(self):
path = "Phylip/hennigian.phy"
@ -671,7 +522,6 @@ class TestAlignIO_reading(unittest.TestCase):
"AAAAAAAAAA",
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip4(self):
path = "Phylip/horses.phy"
@ -693,7 +543,6 @@ class TestAlignIO_reading(unittest.TestCase):
"AAAAAAAAAA",
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip5(self):
path = "Phylip/random.phy"
@ -715,7 +564,6 @@ class TestAlignIO_reading(unittest.TestCase):
"AAAAAAAAAA",
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip6(self):
path = "Phylip/interlaced.phy"
@ -734,7 +582,6 @@ class TestAlignIO_reading(unittest.TestCase):
("CYS1_DICDI", "-----MKVILLFVLAVFTVFVSS-----------...I--"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_phylip7(self):
path = "Phylip/interlaced2.phy"
@ -754,7 +601,6 @@ class TestAlignIO_reading(unittest.TestCase):
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_phylip8(self):
path = "ExtendedPhylip/primates.phyx"
@ -776,7 +622,6 @@ class TestAlignIO_reading(unittest.TestCase):
"TTTTTTTTTTTT",
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_phylip9(self):
path = "Phylip/sequential.phy"
@ -795,7 +640,6 @@ class TestAlignIO_reading(unittest.TestCase):
("CYS1_DICDI", "-----MKVILLFVLAVFTVFVSS-----------...I--"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_phylip10(self):
path = "Phylip/sequential2.phy"
@ -815,7 +659,6 @@ class TestAlignIO_reading(unittest.TestCase):
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_emboss1(self):
path = "Emboss/alignret.txt"
@ -834,7 +677,6 @@ class TestAlignIO_reading(unittest.TestCase):
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_emboss2(self):
path = "Emboss/needle.txt"
@ -885,7 +727,6 @@ class TestAlignIO_reading(unittest.TestCase):
("ref_rec", "KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQ...---"),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_emboss3(self):
@ -903,7 +744,6 @@ class TestAlignIO_reading(unittest.TestCase):
("asis", "TATTTTTTGGATTTTTTTCTAGATTTTCTAGGTT...GAA"),
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_emboss4(self):
path = "Emboss/water.txt"
@ -920,7 +760,6 @@ class TestAlignIO_reading(unittest.TestCase):
("IXI_235", "TSPASIRPPAGPSSR---------RPSPPGPRRP...SHE"),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_emboss5(self):
path = "Emboss/water2.txt"
@ -933,7 +772,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_rows(
alignment, [("asis", "CGTTTGAGT-CTGGGATG"), ("asis", "CGTTTGAGTACTGGGATG")]
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_emboss6(self):
path = "Emboss/matcher_simple.txt"
@ -947,7 +785,6 @@ class TestAlignIO_reading(unittest.TestCase):
alignment,
[("AF069992_1", "GPPPQSPDENRAGESS"), ("CAA85685.1", "GVPPEEAGAAVAAESS")],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_emboss7(self):
path = "Emboss/matcher_pair.txt"
@ -983,7 +820,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_rows(
alignments[4], [("HBA_HUMAN", "VKAAWGKVGA"), ("HBB_HUMAN", "VQAAYQKVVA")]
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_emboss8(self):
@ -1007,7 +843,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_fasta_m10_1(self):
path = "Fasta/output001.m10"
@ -1073,7 +908,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_fasta_m10_2(self):
@ -1140,7 +974,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_fasta_m10_3(self):
@ -1187,7 +1020,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_fasta_m10_4(self):
@ -1211,7 +1043,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_fasta_m10_5(self):
path = "Fasta/output005.m10"
@ -1234,7 +1065,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_fasta_m10_6(self):
path = "Fasta/output006.m10"
@ -1254,7 +1084,6 @@ class TestAlignIO_reading(unittest.TestCase):
("query", "GCAACGCTTCAAGAACTGGAATTAGGAACCGTGA...CAT"),
],
)
self.check_summary(alignment, "DNA")
def test_reading_alignments_fasta_m10_7(self):
path = "Fasta/output007.m10"
@ -1320,7 +1149,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_fasta_m10_8(self):
@ -1372,7 +1200,6 @@ class TestAlignIO_reading(unittest.TestCase):
("sp|P08100|OPSD_HUMAN", "AQQQESATTQKAEKEVTRMVIIMVIAFLICW"),
],
)
self.check_summary(alignments[0], "protein")
self.check_reverse_write_read(alignments)
def test_reading_alignments_ig(self):
@ -1395,7 +1222,6 @@ class TestAlignIO_reading(unittest.TestCase):
"HHHHHHH-AAAAL-R-",
],
)
self.check_summary(alignment, "protein")
def test_reading_alignments_pir(self):
path = "NBRF/clustalw.pir"
@ -1419,7 +1245,6 @@ class TestAlignIO_reading(unittest.TestCase):
),
],
)
self.check_summary_pir(alignment)
def test_reading_alignments_maf1(self):
path = "MAF/humor.maf"
@ -1447,7 +1272,6 @@ class TestAlignIO_reading(unittest.TestCase):
("rn3", "tttgtccatgttggtcaggctggtctcgaactcc...GGT"),
],
)
self.check_summary(alignments[1], "DNA")
self.check_reverse_write_read(alignments)
def test_reading_alignments_maf2(self):
@ -1479,7 +1303,6 @@ class TestAlignIO_reading(unittest.TestCase):
("panTro1.chr6", "gcagctgaaaaca"),
],
)
self.check_summary(alignments[1], "DNA")
self.check_reverse_write_read(alignments)
def test_reading_alignments_maf3(self):
@ -1505,7 +1328,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignments[2], ["gggA", "cccC", "aaaA", "gggG", "cccC", "aaaA"]
)
self.check_summary(alignments[2], "DNA")
self.check_reverse_write_read(alignments)
def test_reading_alignments_maf4(self):
@ -1546,7 +1368,6 @@ class TestAlignIO_reading(unittest.TestCase):
self.check_alignment_columns(
alignments[47], ["TTTTTT", "GGGGGG", "TTTTTT", "TTTTTT", "TGGGAT", "tTTTT-"]
)
self.check_summary(alignments[47], "DNA")
self.check_reverse_write_read(alignments)
def test_reading_alignments_mauve(self):
@ -1589,7 +1410,6 @@ class TestAlignIO_reading(unittest.TestCase):
alignments[4],
[("2/11410-12880", "ATTCGCACATAAGAATGTACCTTGCTGTAATTTA...ATA")],
)
self.check_summary(alignments[4], "DNA")
self.check_reverse_write_read(alignments)

View File

@ -1,225 +0,0 @@
# Copyright 2016 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Bio.Align.AlignInfo related tests."""
import math
import unittest
from Bio import AlignIO
from Bio import BiopythonDeprecationWarning
from Bio.Align import MultipleSeqAlignment
from Bio.Align.AlignInfo import SummaryInfo
from Bio.Data import IUPACData
from Bio.motifs import Motif
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
class AlignInfoTests(unittest.TestCase):
    """Test basic usage.

    Each test runs the deprecated ``SummaryInfo`` methods (asserting that a
    ``BiopythonDeprecationWarning`` is raised) and compares the results
    with hard-coded values and with the ``Bio.motifs.Motif`` equivalents.
    """

    def assertAlmostEqualList(self, list1, list2, **kwargs):
        """Assert two sequences have equal length and almost-equal items.

        Extra keyword arguments are forwarded to ``assertAlmostEqual``.
        """
        self.assertEqual(len(list1), len(list2))
        for v1, v2 in zip(list1, list2):
            self.assertAlmostEqual(v1, v2, **kwargs)

    def test_nucleotides(self):
        """Check consensus, PSSM and information content for a DNA alignment."""
        filename = "GFF/multi.fna"
        fmt = "fasta"
        msa = AlignIO.read(filename, fmt)
        summary = SummaryInfo(msa)
        alignment = msa.alignment
        motif = Motif("ACGT", alignment)
        with self.assertWarns(BiopythonDeprecationWarning):
            c = summary.dumb_consensus(threshold=0.1, ambiguous="N")
        # dumb_consensus uses ambiguous if multiple letters have the same score
        self.assertEqual(c, "ANGNCCCC")
        c = motif.counts.calculate_consensus(identity=0.1)
        # Instead, EMBOSS uses the first letter it encounters
        self.assertEqual(c, "AaGcCCCC")
        with self.assertWarns(BiopythonDeprecationWarning):
            c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(c, "NNNNNNNN")
        c = motif.counts.calculate_consensus(identity=0.7)
        self.assertEqual(c, "NNNNNNNN")
        with self.assertWarns(BiopythonDeprecationWarning):
            c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(c, "NNNNNNNN")
        # Uniform background frequencies, used for the information content below.
        expected = {"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}
        with self.assertWarns(BiopythonDeprecationWarning):
            m = summary.pos_specific_score_matrix(chars_to_ignore=["-"], axis_seq=c)
        # The deprecated PSSM must agree with the motif counts in every cell.
        counts = motif.counts
        for i in range(alignment.length):
            for letter in "ACGT":
                self.assertAlmostEqual(counts[letter][i], m[i][letter])
        self.assertEqual(
            str(m),
            """ A C G T
N 2.0 0.0 1.0 0.0
N 1.0 1.0 1.0 0.0
N 1.0 0.0 2.0 0.0
N 0.0 1.0 1.0 1.0
N 1.0 2.0 0.0 0.0
N 0.0 2.0 1.0 0.0
N 1.0 2.0 0.0 0.0
N 0.0 2.0 1.0 0.0
""",
        )
        # provide the frequencies and chars to ignore explicitly.
        with self.assertWarns(BiopythonDeprecationWarning):
            ic = summary.information_content(
                e_freq_table=expected, chars_to_ignore=["-"]
            )
        self.assertAlmostEqual(ic, 7.32029999423075)
        # Same value from the new-style API: the summed relative entropy.
        ic = sum(motif.relative_entropy)
        self.assertAlmostEqual(ic, 7.32029999423075)

    def test_proteins(self):
        """Check consensus, PSSM and information content for a protein alignment."""
        letters = IUPACData.protein_letters
        a = MultipleSeqAlignment(
            [
                SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
                SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
                SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003"),
            ]
        )
        self.assertEqual(32, a.get_alignment_length())
        s = SummaryInfo(a)
        alignment = a.alignment
        # "*" occurs in the sequences, so it is added to the motif alphabet.
        motif = Motif(letters + "*", alignment)
        counts = motif.counts
        with self.assertWarns(BiopythonDeprecationWarning):
            dumb_consensus = s.dumb_consensus()
        self.assertEqual(dumb_consensus, "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
        consensus = counts.calculate_consensus(identity=0.7)
        self.assertEqual(consensus, dumb_consensus)
        with self.assertWarns(BiopythonDeprecationWarning):
            c = s.gap_consensus(ambiguous="X")
        self.assertEqual(c, "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
        with self.assertWarns(BiopythonDeprecationWarning):
            m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
        # j advances in lockstep with i; it indexes the rows of the PSSM.
        j = 0
        all_letters = s._get_all_letters()
        for i in range(alignment.length):
            for letter in letters:
                count = counts[letter][i]
                if letter in all_letters:
                    self.assertAlmostEqual(count, m[j][letter])
                else:
                    # Letter absent from the alignment: not looked up in the
                    # PSSM; its motif count must be zero.
                    self.assertAlmostEqual(count, 0.0)
            j += 1
        self.assertEqual(
            str(m),
            """ A D E F G H I K L M N P Q R S W Y
M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""",
        )
        # Uniform background over the protein alphabet.
        base_freq = 1.0 / len(letters)
        e_freq_table = {letter: base_freq for letter in letters}
        with self.assertWarns(BiopythonDeprecationWarning):
            ic = s.information_content(
                e_freq_table=e_freq_table, chars_to_ignore=["-", "*"]
            )
        self.assertAlmostEqual(ic, 133.061475107)
        # Rebuild the motif without "*" so it matches chars_to_ignore above.
        motif = Motif(letters, alignment)
        ic = sum(motif.relative_entropy)
        self.assertAlmostEqual(ic, 133.061475107)

    def test_pseudo_count(self):
        """Check information content when a pseudo count is applied."""
        # use example from
        # http://biologie.univ-mrs.fr/upload/p202/01.4.PSSM_theory.pdf
        msa = MultipleSeqAlignment(
            [
                SeqRecord(Seq("AACCACGTTTAA"), id="ID001"),
                SeqRecord(Seq("CACCACGTGGGT"), id="ID002"),
                SeqRecord(Seq("CACCACGTTCGC"), id="ID003"),
                SeqRecord(Seq("GCGCACGTGGGG"), id="ID004"),
                SeqRecord(Seq("TCGCACGTTGTG"), id="ID005"),
                SeqRecord(Seq("TGGCACGTGTTT"), id="ID006"),
                SeqRecord(Seq("TGACACGTGGGA"), id="ID007"),
                SeqRecord(Seq("TTACACGTGCGC"), id="ID008"),
            ]
        )
        summary = SummaryInfo(msa)
        # Non-uniform background frequencies.
        expected = {"A": 0.325, "G": 0.175, "T": 0.325, "C": 0.175}
        # log_base=e: the information content is computed with natural logs.
        with self.assertWarns(BiopythonDeprecationWarning):
            ic = summary.information_content(
                e_freq_table=expected, log_base=math.exp(1), pseudo_count=1
            )
        self.assertAlmostEqual(ic, 7.546369561463767)
        # Expected per-column values stored by information_content.
        ic_vector = [
            0.11112361,
            0.08677812,
            0.35598044,
            1.29445419,
            0.80272907,
            1.29445419,
            1.29445419,
            0.80272907,
            0.60929642,
            0.39157892,
            0.46539767,
            0.03739368,
        ]
        self.assertAlmostEqualList(summary.ic_vector, ic_vector)
        # One more time, now using a new-style Alignment object:
        alignment = msa.alignment
        motif = Motif("ACGT", alignment)
        motif.background = expected
        motif.pseudocounts = expected
        # Multiplying by ln(2) rescales to natural-log units (presumably
        # relative_entropy is reported in base 2 - confirm against Bio.motifs).
        self.assertAlmostEqualList(motif.relative_entropy * math.log(2), ic_vector)
        # NOTE(review): this sums the hard-coded ic_vector above, not the
        # motif's relative entropy.
        ic = sum(ic_vector)
        self.assertAlmostEqual(ic, 7.546369561463767)
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)

View File

@ -341,35 +341,6 @@ gi|671626|emb|CAA85685.1| -
"TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
)
self.assertEqual(msa.get_alignment_length(), 156)
align_info = AlignInfo.SummaryInfo(msa)
with self.assertWarns(BiopythonDeprecationWarning):
consensus = align_info.dumb_consensus(ambiguous="N")
self.assertIsInstance(consensus, Seq)
self.assertEqual(
consensus,
"TATACATTAAAGNAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTNCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
)
with self.assertWarns(BiopythonDeprecationWarning):
dictionary = align_info.replacement_dictionary(
skip_chars=None, letters="ACGT"
)
self.assertEqual(len(dictionary), 16)
self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
alignment = msa.alignment
dictionary = alignment.substitutions
self.assertEqual(len(dictionary), 4)
@ -391,544 +362,60 @@ gi|671626|emb|CAA85685.1| -
self.assertAlmostEqual(dictionary[("T", "C")], 12)
self.assertAlmostEqual(dictionary[("T", "G")], 0)
self.assertAlmostEqual(dictionary[("T", "T")], 874)
with self.assertWarns(BiopythonDeprecationWarning):
matrix = align_info.pos_specific_score_matrix(consensus, ["N", "-"])
motif = motifs.Motif("ACGT", alignment)
counts = motif.counts
for i in range(alignment.length):
for letter in "ACGT":
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
self.assertEqual(counts.calculate_consensus(identity=0.7), consensus)
self.assertEqual(
str(matrix),
counts.calculate_consensus(identity=0.7),
"TATACATTAAAGNAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTNCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
)
self.assertEqual(
str(counts),
"""\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
N 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
T 0.0 0.0 0.0 3.0
A 3.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
N 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
""",
)
with self.assertWarns(BiopythonDeprecationWarning):
matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N", "-"])
alignment = msa.alignment
motif = motifs.Motif("ACGT", alignment)
counts = motif.counts
for i in range(alignment.length):
for letter in "ACGT":
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
self.assertEqual(
str(matrix),
str(counts),
"""\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
X 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
T 0.0 0.0 0.0 3.0
A 3.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
X 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
""",
)
second_seq = msa[1].seq
with self.assertWarns(BiopythonDeprecationWarning):
matrix = align_info.pos_specific_score_matrix(second_seq, ["N", "-"])
align_info = AlignInfo.SummaryInfo(msa)
self.assertEqual(align_info.get_column(1), "AAAAAAA")
self.assertEqual(align_info.get_column(7), "TTTATTT")
alignment = msa.alignment
motif = motifs.Motif("ACGT", alignment)
counts = motif.counts
for i in range(alignment.length):
for letter in "ACGT":
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
self.assertEqual(
str(matrix),
str(counts),
"""\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
- 0.0 0.0 0.0 3.0
- 3.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
""",
)
e_freq_table = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
with self.assertWarns(BiopythonDeprecationWarning):
value = align_info.information_content(
5, 50, chars_to_ignore=["N"], e_freq_table=e_freq_table
)
self.assertAlmostEqual(value, 88.42309908538343) # MultipleSeqAlignment
value = sum(motif[5:50].relative_entropy)
self.assertAlmostEqual(value, 88.42309908538343) # Alignment
with self.assertWarns(BiopythonDeprecationWarning):
value = align_info.information_content(
e_freq_table=e_freq_table, chars_to_ignore=["N", "-"]
)
self.assertAlmostEqual(value, 306.2080592664532) # MultipleSeqAlignment
relative_entropy = motif.relative_entropy
value = sum(relative_entropy)
self.assertAlmostEqual(value, 306.2080592664532) # Alignment
self.assertEqual(align_info.get_column(1), "AAAAAAA")
self.assertAlmostEqual(align_info.ic_vector[1], 2.00)
self.assertEqual(align_info.get_column(7), "TTTATTT")
self.assertAlmostEqual(align_info.ic_vector[7], 1.4083272214176725)
self.assertAlmostEqual(relative_entropy[0], 2.0)
self.assertAlmostEqual(relative_entropy[1], 2.0)
self.assertAlmostEqual(relative_entropy[2], 2.0)
@ -1085,180 +572,11 @@ A 7.0 0.0 0.0 0.0
self.assertAlmostEqual(relative_entropy[153], 2.0)
self.assertAlmostEqual(relative_entropy[154], 2.0)
self.assertAlmostEqual(relative_entropy[155], 2.0)
handle = StringIO()
with self.assertWarns(BiopythonDeprecationWarning):
AlignInfo.print_info_content(align_info, fout=handle)
self.assertEqual(
handle.getvalue(),
"""\
0 T 2.000
1 A 2.000
2 T 2.000
3 A 2.000
4 C 2.000
5 A 2.000
6 T 2.000
7 T 1.408
8 A 2.000
9 A 2.000
10 A 2.000
11 G 2.000
12 A 1.015
13 A 2.000
14 G 2.000
15 G 2.000
16 G 2.000
17 G 2.000
18 G 2.000
19 A 2.000
20 T 2.000
21 G 2.000
22 C 2.000
23 G 2.000
24 G 2.000
25 A 2.000
26 T 2.000
27 A 2.000
28 A 2.000
29 A 2.000
30 T 2.000
31 G 2.000
32 G 2.000
33 A 2.000
34 A 2.000
35 A 2.000
36 G 2.000
37 G 2.000
38 C 2.000
39 G 2.000
40 A 2.000
41 A 2.000
42 A 2.000
43 G 2.000
44 A 2.000
45 A 2.000
46 A 2.000
47 G 2.000
48 A 2.000
49 A 2.000
50 T 2.000
51 A 2.000
52 T 2.000
53 A 2.000
54 T 2.000
55 A 2.000
56 - 2.000
57 - 2.000
58 - 2.000
59 - 2.000
60 - 2.000
61 - 2.000
62 - 2.000
63 - 2.000
64 - 2.000
65 - 2.000
66 A 2.000
67 T 2.000
68 A 2.000
69 T 2.000
70 A 2.000
71 T 2.000
72 T 2.000
73 T 2.000
74 C 1.408
75 A 1.408
76 A 2.000
77 A 2.000
78 T 2.000
79 T 2.000
80 T 1.015
81 C 2.000
82 C 2.000
83 T 2.000
84 T 2.000
85 A 2.000
86 T 2.000
87 A 2.000
88 T 2.000
89 A 2.000
90 C 1.137
91 C 2.000
92 C 2.000
93 A 2.000
94 A 2.000
95 A 2.000
96 T 2.000
97 A 2.000
98 T 2.000
99 A 2.000
100 A 2.000
101 A 2.000
102 A 2.000
103 A 2.000
104 T 2.000
105 A 2.000
106 T 2.000
107 C 2.000
108 T 2.000
109 A 2.000
110 A 2.000
111 T 2.000
112 A 2.000
113 A 2.000
114 A 2.000
115 T 2.000
116 T 2.000
117 A 2.000
118 G 2.000
119 A 2.000
120 T 2.000
121 G 2.000
122 A 2.000
123 A 2.000
124 T 2.000
125 A 2.000
126 T 2.000
127 C 2.000
128 A 2.000
129 A 2.000
130 A 2.000
131 G 2.000
132 A 2.000
133 A 2.000
134 T 2.000
135 C 2.000
136 C 1.408
137 A 2.000
138 T 2.000
139 T 2.000
140 G 2.000
141 A 2.000
142 T 2.000
143 T 2.000
144 T 2.000
145 A 2.000
146 G 2.000
147 T 2.000
148 G 1.408
149 T 2.000
150 A 2.000
151 C 2.000
152 C 2.000
153 A 2.000
154 G 2.000
155 A 2.000
""",
)
# create a new-style Alignment object
del seq_record
del align_info
del consensus
del dictionary
del matrix
del second_seq
del e_freq_table
del value
del handle
alignment = msa.alignment
self.assertEqual(len(alignment), 7)
seq_record = alignment.sequences[0]
@ -1560,11 +878,6 @@ XX
self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_443_348")
self.assertEqual(seq_record.seq, "GTTGCTTCTGGCGTGGGTGGGGGGG")
self.assertEqual(msa.get_alignment_length(), 25)
align_info = AlignInfo.SummaryInfo(msa)
with self.assertWarns(BiopythonDeprecationWarning):
consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
self.assertIsInstance(consensus, Seq)
self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")
self.assertEqual(
str(msa),
"""\
@ -1596,7 +909,9 @@ EAS54_6_R 0 TTGGCAGGCCAAGGCCGATGGATCA 25
EAS54_6_R 0 GTTGCTTCTGGCGTGGGTGGGGGGG 25
""",
)
self.assertEqual(motif.counts.calculate_consensus(identity=0.6), consensus)
self.assertEqual(
motif.counts.calculate_consensus(identity=0.6), "NTNGCNTNNNNNGNNGGNTGGNTCN"
)
if __name__ == "__main__":