mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
remove deprecated functionality from Bio.Align.AlignInfo (#4907)
Authored-by: Michiel de Hoon <mdehoon@tkx288.genome.gsc.riken.jp>
This commit is contained in:
@ -11,14 +11,6 @@ functions which return summary type information about alignments should
|
||||
be put into classes in this module.
|
||||
"""
|
||||
|
||||
import math
|
||||
import sys
|
||||
import warnings
|
||||
from collections import Counter
|
||||
|
||||
from Bio import BiopythonDeprecationWarning
|
||||
from Bio.Seq import Seq
|
||||
|
||||
|
||||
class SummaryInfo:
|
||||
"""Calculate summary info about the alignment.
|
||||
@ -36,749 +28,7 @@ class SummaryInfo:
|
||||
self.alignment = alignment
|
||||
self.ic_vector = []
|
||||
|
||||
def dumb_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False):
    """Output a fast consensus sequence of the alignment.

    For every column the residues of all records are tallied, skipping
    the gap characters "-" and ".". The most common residue is written to
    the consensus when it is the unique maximum and its share of the
    counted (non-gap) residues is at least ``threshold``; otherwise the
    ``ambiguous`` character is written.

    This is a quick and dirty consensus; no substitution matrix is taken
    into account.

    Arguments:
     - threshold - The minimum fraction (compared with ``>=``) that the
       most common residue must reach to be added to the consensus.
     - ambiguous - The character added when the threshold is not reached,
       when several residues tie for most common, or when the column
       holds only gaps.
     - require_multiple - If True, a column in which exactly one non-gap
       residue was counted (ie. 1 sequence and gaps) yields the ambiguous
       character.

    Returns:
     - A Seq object holding one character per alignment column.

    """
    warnings.warn(
        "The `dumb_consensus` method is deprecated and will be removed "
        "in a future release of Biopython. As an alternative, you can "
        "convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. You can then use the `.consensus` or "
        "`.degenerate_consensus` property of the Motif object to get a "
        "consensus sequence. For more control over how the consensus "
        "sequence is calculated, you can call the `calculate_consensus` "
        "method on the `.counts` property of the Motif object. This is an "
        "example for a multiple sequence alignment `msa` of DNA "
        "nucleotides:"
        "\n"
        ">>> from Bio.Seq import Seq\n"
        ">>> from Bio.SeqRecord import SeqRecord\n"
        ">>> from Bio.Align import MultipleSeqAlignment\n"
        ">>> from Bio.Align.AlignInfo import SummaryInfo\n"
        ">>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),\n"
        "... SeqRecord(Seq('ATGT')),\n"
        "... SeqRecord(Seq('ATGT'))])\n"
        ">>> summary = SummaryInfo(msa)\n"
        ">>> dumb_consensus = summary.dumb_consensus(ambiguous='N')\n"
        ">>> print(dumb_consensus)\n"
        "ANGT\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> print(motif.consensus)\n"
        "ATGT\n"
        ">>> print(motif.degenerate_consensus)\n"
        "AYGT\n"
        ">>> counts = motif.counts\n"
        ">>> consensus = counts.calculate_consensus(identity=0.7)\n"
        ">>> print(consensus)\n"
        "ANGT\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
    )
    # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"

    # One character per column is collected here and joined once at the
    # end, instead of growing a string with += inside the loop.
    consensus = []

    # find the length of the consensus we are creating
    con_len = self.alignment.get_alignment_length()

    # go through each seq item
    for n in range(con_len):
        # keep track of the counts of the different atoms we get
        atom_dict = Counter()

        for record in self.alignment:
            # make sure we haven't run past the end of any sequences
            # if they are of different lengths
            try:
                c = record[n]
            except IndexError:
                continue
            if c != "-" and c != ".":
                atom_dict[c] += 1

        # number of non-gap residues counted in this column
        num_atoms = sum(atom_dict.values())

        # highest count in the column and every residue that reaches it;
        # an all-gap column gives max_size 0 and an empty list
        max_size = max(atom_dict.values(), default=0)
        max_atoms = [atom for atom, count in atom_dict.items() if count == max_size]

        if require_multiple and num_atoms == 1:
            consensus.append(ambiguous)
        elif len(max_atoms) == 1 and max_size / num_atoms >= threshold:
            # the length check short-circuits, so num_atoms is never 0 here
            consensus.append(max_atoms[0])
        else:
            consensus.append(ambiguous)

    return Seq("".join(consensus))
|
||||
|
||||
def gap_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False):
    """Output a fast consensus sequence of the alignment, allowing gaps.

    Same as dumb_consensus(), but every character of a column is tallied,
    gap characters included, so a gap can be written to the consensus
    when it is the unique most common character and its share of the
    column is at least ``threshold``.

    Arguments:
     - threshold - The minimum fraction (compared with ``>=``) that the
       most common character must reach to be added to the consensus.
     - ambiguous - The character added when the threshold is not reached
       or when several characters tie for most common.
     - require_multiple - If True, a column in which exactly one character
       was counted yields the ambiguous character.

    Returns:
     - A Seq object holding one character per alignment column.

    Things to do:
     - Let the user define that with only one gap, the result
       character in consensus is gap.
     - Let the user select gap character, now
       it takes the same as input.

    """
    warnings.warn(
        "The `gap_consensus` method is deprecated and will be removed "
        "in a future release of Biopython. As an alternative, you can "
        "convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. You can then use the `.consensus` or "
        "`.degenerate_consensus` property of the Motif object to get a "
        "consensus sequence. For more control over how the consensus "
        "sequence is calculated, you can call the `calculate_consensus` "
        "method on the `.counts` property of the Motif object. This is an "
        "example for a multiple sequence alignment `msa` of DNA "
        "nucleotides:"
        "\n"
        ">>> from Bio.Seq import Seq\n"
        ">>> from Bio.SeqRecord import SeqRecord\n"
        ">>> from Bio.Align import MultipleSeqAlignment\n"
        ">>> from Bio.Align.AlignInfo import SummaryInfo\n"
        ">>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),\n"
        "... SeqRecord(Seq('AT-T')),\n"
        "... SeqRecord(Seq('CT-T')),\n"
        "... SeqRecord(Seq('GT-T'))])\n"
        ">>> summary = SummaryInfo(msa)\n"
        ">>> gap_consensus = summary.gap_consensus(ambiguous='N')\n"
        ">>> print(gap_consensus)\n"
        "NT-T\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT-', alignment) # include '-' in alphabet\n"
        ">>> print(motif.consensus)\n"
        "AT-T\n"
        ">>> print(motif.degenerate_consensus)\n"
        "VT-T\n"
        ">>> counts = motif.counts\n"
        ">>> consensus = counts.calculate_consensus(identity=0.7)\n"
        ">>> print(consensus)\n"
        "NT-T\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
    )

    # One character per column is collected here and joined once at the
    # end, instead of growing a string with += inside the loop.
    consensus = []

    # find the length of the consensus we are creating
    con_len = self.alignment.get_alignment_length()

    # go through each seq item
    for n in range(con_len):
        # keep track of the counts of the different atoms we get
        atom_dict = Counter()

        for record in self.alignment:
            # make sure we haven't run past the end of any sequences
            # if they are of different lengths
            try:
                c = record[n]
            except IndexError:
                continue
            # gaps are counted like any other character
            atom_dict[c] += 1

        # number of characters counted in this column
        num_atoms = sum(atom_dict.values())

        # highest count in the column and every character that reaches it
        max_size = max(atom_dict.values(), default=0)
        max_atoms = [atom for atom, count in atom_dict.items() if count == max_size]

        if require_multiple and num_atoms == 1:
            consensus.append(ambiguous)
        elif len(max_atoms) == 1 and max_size / num_atoms >= threshold:
            # the length check short-circuits, so num_atoms is never 0 here
            consensus.append(max_atoms[0])
        else:
            consensus.append(ambiguous)

    return Seq("".join(consensus))
|
||||
|
||||
def replacement_dictionary(self, skip_chars=None, letters=None):
    """Generate a replacement dictionary to plug into a substitution matrix.

    This should look at an alignment, and be able to generate the number
    of substitutions of different residues for each other in the
    aligned object.

    Will then return a dictionary with this information::

        {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}

    This also treats weighted sequences. The following example shows how
    we calculate the replacement dictionary. Given the following
    multiple sequence alignment::

        GTATC  0.5
        AT--C  0.8
        CTGTC  1.0

    For the first column we have::

        ('A', 'G') : 0.5 * 0.8 = 0.4
        ('C', 'G') : 0.5 * 1.0 = 0.5
        ('A', 'C') : 0.8 * 1.0 = 0.8

    We then continue this for all of the columns in the alignment, summing
    the information for each substitution in each column, until we end
    up with the replacement dictionary.

    Arguments:
     - skip_chars - Not used; setting it to anything other than None
       will raise a ValueError
     - letters - An iterable (e.g. a string or list) of characters to
       include. Required; it is read once, so a one-shot iterable such
       as a generator is accepted.

    Returns:
     - A dictionary keyed by every ordered pair of ``letters``, holding
       the summed product of record weights for each observed pair.

    Raises:
     - ValueError - if ``skip_chars`` is not None.
     - TypeError - if ``letters`` is None.

    """
    warnings.warn(
        "The `replacement_dictionary` method is deprecated and will be "
        "removed in a future release of Biopython. As an alternative, you "
        "can convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "use the `.substitutions` property of the `Alignment` object. "
        "For example, for a multiple sequence alignment `msa` of DNA "
        "nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> dictionary = alignment.substitutions\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
    )

    if skip_chars is not None:
        raise ValueError(
            "argument skip_chars has been deprecated; instead, please use 'letters' to specify the characters you want to include"
        )
    if letters is None:
        # Previously this surfaced as "'NoneType' object is not iterable"
        # from the comprehension below; keep the exception type but say why.
        raise TypeError(
            "argument 'letters' is required: pass an iterable of the "
            "characters you want to include"
        )
    # Materialise once: the letters are iterated twice below and then used
    # for membership tests, which would exhaust a one-shot iterable.
    letters = tuple(letters)
    rep_dict = {(letter1, letter2): 0 for letter1 in letters for letter2 in letters}

    records = list(self.alignment)
    # visit every unordered pair of records exactly once
    for rec_num1, record1 in enumerate(records):
        for record2 in records[rec_num1 + 1 :]:
            # for each pair of records, compare the sequences and add
            # the pertinent info to the dictionary
            self._pair_replacement(
                record1.seq,
                record2.seq,
                record1.annotations.get("weight", 1.0),
                record2.annotations.get("weight", 1.0),
                rep_dict,
                letters,
            )

    return rep_dict
|
||||
|
||||
def _pair_replacement(self, seq1, seq2, weight1, weight2, dictionary, letters):
|
||||
"""Compare two sequences and generate info on the replacements seen (PRIVATE).
|
||||
|
||||
Arguments:
|
||||
- seq1, seq2 - The two sequences to compare.
|
||||
- weight1, weight2 - The relative weights of seq1 and seq2.
|
||||
- dictionary - The dictionary containing the starting replacement
|
||||
info that we will modify.
|
||||
- letters - A list of characters to include when calculating replacements.
|
||||
|
||||
"""
|
||||
# loop through each residue in the sequences
|
||||
for residue1, residue2 in zip(seq1, seq2):
|
||||
if residue1 in letters and residue2 in letters:
|
||||
dictionary[(residue1, residue2)] += weight1 * weight2
|
||||
|
||||
def _get_all_letters(self):
|
||||
"""Return a string containing the expected letters in the alignment (PRIVATE)."""
|
||||
set_letters = set()
|
||||
for record in self.alignment:
|
||||
set_letters.update(record.seq)
|
||||
list_letters = sorted(set_letters)
|
||||
all_letters = "".join(list_letters)
|
||||
return all_letters
|
||||
|
||||
def pos_specific_score_matrix(self, axis_seq=None, chars_to_ignore=None):
    """Create a position specific score matrix object for the alignment.

    This creates a position specific score matrix (pssm) which is an
    alternative method to look at a consensus sequence. For every column
    the weight of each record (its "weight" annotation, default 1.0) is
    added to the score of the residue that record has in the column.

    Arguments:
     - chars_to_ignore - A list of all characters not to include in
       the pssm. The gap character "-" is always ignored in addition.
       The list passed in is not modified.
     - axis_seq - An optional argument specifying the sequence to
       put on the axis of the PSSM. This should be a Seq object. If nothing
       is specified, the consensus sequence, calculated with default
       parameters, will be used.

    Returns:
     - A PSSM (position specific score matrix) object.

    Raises:
     - TypeError - if ``chars_to_ignore`` is given but is not a list.
     - ValueError - if the alignment contains no letters, if ``axis_seq``
       does not have the alignment length, or if a residue is met that is
       not among the letters of the alignment.

    """
    warnings.warn(
        "The `pos_specific_score_matrix` method is deprecated and will be "
        "removed in a future release of Biopython. As an alternative, you "
        "can convert the multiple sequence alignment object to a new-style "
        "Alignment object by via its `.alignment` property, and then "
        "create a Motif object. For example, for a multiple sequence "
        "alignment `msa` of DNA nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> counts = motif.counts\n"
        "\n"
        "The `counts` object contains the same information as the PSSM "
        "returned by `pos_specific_score_matrix`, but note that the "
        "indices are reversed:\n"
        "\n"
        ">>> counts[letter][i] == pssm[index][letter]\n"
        "True\n"
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse.",
        BiopythonDeprecationWarning,
    )

    # determine all of the letters we have to deal with
    all_letters = self._get_all_letters()
    if not all_letters:
        raise ValueError("_get_all_letters returned empty string")

    if chars_to_ignore is None:
        chars_to_ignore = []
    if not isinstance(chars_to_ignore, list):
        raise TypeError("chars_to_ignore should be a list.")

    gap_char = "-"
    # Build a new list rather than appending in place, so that the list
    # object supplied by the caller does not grow on every call.
    chars_to_ignore = [*chars_to_ignore, gap_char]

    for char in chars_to_ignore:
        all_letters = all_letters.replace(char, "")

    if axis_seq:
        left_seq = axis_seq
        if len(axis_seq) != self.alignment.get_alignment_length():
            raise ValueError(
                "Axis sequence length does not equal the get_alignment_length"
            )
    else:
        left_seq = self.dumb_consensus()

    pssm_info = []
    # now start looping through all of the sequences and getting info
    for residue_num, axis_residue in enumerate(left_seq):
        score_dict = dict.fromkeys(all_letters, 0)
        for record in self.alignment:
            try:
                this_residue = record.seq[residue_num]
            # if we hit an index error we've run out of sequence and
            # should not add new residues
            except IndexError:
                this_residue = None

            if this_residue and this_residue not in chars_to_ignore:
                weight = record.annotations.get("weight", 1.0)
                try:
                    score_dict[this_residue] += weight
                except KeyError:
                    raise ValueError(
                        "Residue %s not found" % this_residue
                    ) from None

        pssm_info.append((axis_residue, score_dict))

    return PSSM(pssm_info)
|
||||
|
||||
def information_content(
    self,
    start=0,
    end=None,
    e_freq_table=None,
    log_base=2,
    chars_to_ignore=None,
    pseudo_count=0,
):
    """Calculate the information content for each residue along an alignment.

    The per-column values are stored, in column order, in the
    ``ic_vector`` attribute (replacing whatever it held before), and
    their sum is returned.

    Arguments:
     - start, end - The starting and ending points to calculate the
       information content. These points should be relative to the first
       sequence in the alignment, starting at zero (ie. even if the 'real'
       first position in the seq is 203 in the initial sequence, for
       the info content, we need to use zero). This defaults to the entire
       length of the first sequence.
     - e_freq_table - A dictionary specifying the expected frequencies
       for each letter (e.g. {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}).
       Gap characters should not be included, since these should not have
       expected frequencies.
     - log_base - The base of the logarithm to use in calculating the
       information content. This defaults to 2 so the info is in bits.
     - chars_to_ignore - A listing of characters which should be ignored
       in calculating the info content. Defaults to none.
     - pseudo_count - Pseudo count handed on to the per-column frequency
       calculation. Defaults to 0.

    Returns:
     - A number representing the info content for the specified region.

    Raises:
     - ValueError - if ``start`` is negative or ``end`` lies beyond the
       length of the first sequence.

    Please see the Biopython manual for more information on how information
    content is calculated.

    """
    message = (
        "The `information_content` method and `ic_vector` attribute of the "
        "`SummaryInfo` class are deprecated and will be removed in a "
        "future release of Biopython. As an alternative, you can convert "
        "the multiple sequence alignment object to a new-style Alignment "
        "object by via its `.alignment` property, and use the "
        "`information_content` attribute of the Alignment obecjt. "
        "For example, for a multiple sequence alignment `msa` of "
        "DNA nucleotides, you would do: "
        "\n"
        ">>> alignment = msa.alignment\n"
        ">>> from Bio.motifs import Motif\n"
        ">>> motif = Motif('ACGT', alignment)\n"
        ">>> information_content = motif.information_content\n"
        "\n"
        "The `information_content` object contains the same values as the "
        "`ic_vector` attribute of the `SummaryInfo` object. Its sum is "
        "equal to the value return by the `information_content` method. "
        "\n"
        "If your multiple sequence alignment object was obtained using "
        "Bio.AlignIO, then you can obtain a new-style Alignment object "
        "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
        "or Bio.Align.parse instead of Bio.AlignIO.parse."
    )
    warnings.warn(message, BiopythonDeprecationWarning)

    # the first sequence defines the valid coordinate range
    first_length = len(self.alignment[0].seq)
    if end is None:
        end = first_length
    if chars_to_ignore is None:
        chars_to_ignore = []

    if start < 0 or end > first_length:
        raise ValueError(
            "Start (%s) and end (%s) are not in the range %s to %s"
            % (start, end, 0, first_length)
        )

    # no random expected frequency is derived here; None is handed on
    random_expected = None

    # letters to score: everything seen in the alignment minus the
    # characters the caller asked to ignore
    letters = self._get_all_letters()
    for ignored in chars_to_ignore:
        letters = letters.replace(ignored, "")

    # information content of each requested column, in column order
    column_scores = []
    for column in range(start, end):
        frequencies = self._get_letter_freqs(
            column,
            self.alignment,
            letters,
            chars_to_ignore,
            pseudo_count,
            e_freq_table,
            random_expected,
        )
        column_scores.append(
            self._get_column_info_content(
                frequencies, e_freq_table, log_base, random_expected
            )
        )

    total_info = sum(column_scores)
    # ic_vector is replaced on every call and holds the IC of each column
    self.ic_vector = column_scores
    return total_info
|
||||
|
||||
def _get_letter_freqs(
|
||||
self,
|
||||
residue_num,
|
||||
all_records,
|
||||
letters,
|
||||
to_ignore,
|
||||
pseudo_count=0,
|
||||
e_freq_table=None,
|
||||
random_expected=None,
|
||||
):
|
||||
"""Determine the frequency of specific letters in the alignment (PRIVATE).
|
||||
|
||||
Arguments:
|
||||
- residue_num - The number of the column we are getting frequencies
|
||||
from.
|
||||
- all_records - All of the SeqRecords in the alignment.
|
||||
- letters - The letters we are interested in getting the frequency
|
||||
for.
|
||||
- to_ignore - Letters we are specifically supposed to ignore.
|
||||
- pseudo_count - Optional argument specifying the Pseudo count (k)
|
||||
to add in order to prevent a frequency of 0 for a letter.
|
||||
- e_freq_table - An optional argument specifying a dictionary with
|
||||
the expected frequencies for each letter.
|
||||
- random_expected - Optional argument that specify the frequency to use
|
||||
when e_freq_table is not defined.
|
||||
|
||||
This will calculate the frequencies of each of the specified letters
|
||||
in the alignment at the given frequency, and return this as a
|
||||
dictionary where the keys are the letters and the values are the
|
||||
frequencies. Pseudo count can be added to prevent a null frequency
|
||||
"""
|
||||
freq_info = dict.fromkeys(letters, 0)
|
||||
|
||||
total_count = 0
|
||||
|
||||
gap_char = "-"
|
||||
|
||||
if pseudo_count < 0:
|
||||
raise ValueError(
|
||||
"Positive value required for pseudo_count, %s provided" % (pseudo_count)
|
||||
)
|
||||
|
||||
# collect the count info into the dictionary for all the records
|
||||
for record in all_records:
|
||||
try:
|
||||
if record.seq[residue_num] not in to_ignore:
|
||||
weight = record.annotations.get("weight", 1.0)
|
||||
freq_info[record.seq[residue_num]] += weight
|
||||
total_count += weight
|
||||
except KeyError:
|
||||
raise ValueError(
|
||||
"Residue %s not found in letters %s"
|
||||
% (record.seq[residue_num], letters)
|
||||
) from None
|
||||
|
||||
if e_freq_table:
|
||||
# check if all the residus in freq_info are in e_freq_table
|
||||
for key in freq_info:
|
||||
if key != gap_char and key not in e_freq_table:
|
||||
raise ValueError("%s not found in expected frequency table" % key)
|
||||
|
||||
if total_count == 0:
|
||||
# This column must be entirely ignored characters
|
||||
for letter in freq_info:
|
||||
if freq_info[letter] != 0:
|
||||
raise ValueError("freq_info[letter] is not 0")
|
||||
# TODO - Map this to NA or NaN?
|
||||
else:
|
||||
# now convert the counts into frequencies
|
||||
for letter in freq_info:
|
||||
if pseudo_count and (random_expected or e_freq_table):
|
||||
# use either the expected random freq or the
|
||||
if e_freq_table:
|
||||
ajust_freq = e_freq_table[letter]
|
||||
else:
|
||||
ajust_freq = random_expected
|
||||
|
||||
ajusted_letter_count = freq_info[letter] + ajust_freq * pseudo_count
|
||||
ajusted_total = total_count + pseudo_count
|
||||
freq_info[letter] = ajusted_letter_count / ajusted_total
|
||||
|
||||
else:
|
||||
freq_info[letter] = freq_info[letter] / total_count
|
||||
|
||||
return freq_info
|
||||
|
||||
def _get_column_info_content(
|
||||
self, obs_freq, e_freq_table, log_base, random_expected
|
||||
):
|
||||
"""Calculate the information content for a column (PRIVATE).
|
||||
|
||||
Arguments:
|
||||
- obs_freq - The frequencies observed for each letter in the column.
|
||||
- e_freq_table - An optional argument specifying a dictionary with
|
||||
the expected frequencies for each letter.
|
||||
- log_base - The base of the logarithm to use in calculating the
|
||||
info content.
|
||||
|
||||
"""
|
||||
gap_char = "-"
|
||||
|
||||
if e_freq_table:
|
||||
# check the expected freq information to make sure it is good
|
||||
for key in obs_freq:
|
||||
if key != gap_char and key not in e_freq_table:
|
||||
raise ValueError(
|
||||
f"Frequency table provided does not contain observed letter {key}"
|
||||
)
|
||||
|
||||
total_info = 0.0
|
||||
|
||||
for letter in obs_freq:
|
||||
inner_log = 0.0
|
||||
# if we have expected frequencies, modify the log value by them
|
||||
# gap characters do not have expected frequencies, so they
|
||||
# should just be the observed frequency.
|
||||
if letter != gap_char:
|
||||
if e_freq_table:
|
||||
inner_log = obs_freq[letter] / e_freq_table[letter]
|
||||
else:
|
||||
inner_log = obs_freq[letter] / random_expected
|
||||
# if the observed frequency is zero, we don't add any info to the
|
||||
# total information content
|
||||
if inner_log > 0:
|
||||
letter_info = (
|
||||
obs_freq[letter] * math.log(inner_log) / math.log(log_base)
|
||||
)
|
||||
total_info += letter_info
|
||||
return total_info
|
||||
|
||||
def get_column(self, col):
    """Return column ``col`` of the alignment.

    The alignment is indexed with a full row slice and the given column
    index, and whatever that lookup yields is returned unchanged.
    """
    # TODO - Deprecate this and implement slicing?
    every_row = slice(None)
    column = self.alignment[every_row, col]
    return column
|
||||
|
||||
|
||||
class PSSM:
    """Represent a position specific score matrix.

    This class is meant to make it easy to access the info within a PSSM
    and also make it easy to print out the information in a nice table.

    Let's say you had an alignment like this::

        GTATC
        AT--C
        CTGTC

    The position specific score matrix built from it, with the first
    sequence on the axis and gaps ignored, prints with the residues
    sorted along the top and one row per position::

            A   C   G   T
        G  1.0 1.0 1.0 0.0
        T  0.0 0.0 0.0 3.0
        A  1.0 0.0 1.0 0.0
        T  0.0 0.0 0.0 2.0
        C  0.0 3.0 0.0 0.0

    You can access a single element of the PSSM using the following::

        your_pssm[sequence_number][residue_count_name]

    For instance, to get the 'T' residue for the second element in the
    above alignment you would need to do:

    your_pssm[1]['T']
    """

    def __init__(self, pssm):
        """Initialize with pssm data to represent.

        The pssm passed should be a list with one entry per position,
        each entry having the following structure:

        entry[0] - The letter of the residue being represented (for
        instance, from the example above, the first few would be GTAT...
        entry[1] - A dictionary with the letter substitutions and counts.
        """
        warnings.warn(
            "The `PSSM` class is deprecated and will be removed in a future "
            "release of Biopython. As an alternative, you can convert the "
            "multiple sequence alignment object to a new-style Alignment "
            "object by via its `.alignment` property, and then create a Motif "
            "object. For example, for a multiple sequence alignment `msa` of "
            "DNA nucleotides, you would do: "
            "\n"
            ">>> alignment = msa.alignment\n"
            ">>> from Bio.motifs import Motif\n"
            ">>> motif = Motif('ACGT', alignment)\n"
            ">>> counts = motif.counts\n"
            "\n"
            "The `counts` object contains the same information as the PSSM "
            "returned by `pos_specific_score_matrix`, but note that the "
            "indices are reversed:\n"
            "\n"
            ">>> counts[letter][i] == pssm[index][letter]\n"
            "True\n"
            "\n"
            "If your multiple sequence alignment object was obtained using "
            "Bio.AlignIO, then you can obtain a new-style Alignment object "
            "directly by using Bio.Align.read instead of Bio.AlignIO.read, "
            "or Bio.Align.parse instead of Bio.AlignIO.parse.",
            BiopythonDeprecationWarning,
        )
        self.pssm = pssm

    def __getitem__(self, pos):
        """Return the dictionary of residue counts at position pos."""
        return self.pssm[pos][1]

    def __str__(self):
        """Return the matrix as a text table, one line per position."""
        if not self.pssm:
            # no positions means no residues to list: emit the bare
            # header line instead of failing on self.pssm[0]
            return " \n"

        # the residues of the first position define the columns
        all_residues = sorted(self.pssm[0][1])

        # header: each letter is padded to the 4-character width of the
        # " %.1f" cells below so that it lines up with its column
        lines = [" " + "".join("   %s" % res for res in all_residues)]

        # one row per position: axis residue followed by its counts
        for item in self.pssm:
            cells = "".join(" %.1f" % item[1][res] for res in all_residues)
            lines.append("%s " % item[0] + cells)

        return "\n".join(lines) + "\n"

    def get_residue(self, pos):
        """Return the residue letter at the specified position."""
        return self.pssm[pos][0]
|
||||
|
||||
|
||||
def print_info_content(summary_info, fout=None, rep_record=0):
    """Write 3 columns: position, aa in representative sequence, ic_vector value.

    Arguments:
     - summary_info - The object whose ``alignment`` and ``ic_vector``
       are reported; when its ``ic_vector`` is empty,
       ``information_content()`` is called first to fill it.
     - fout - Object with a ``write`` method receiving the output;
       ``sys.stdout`` is used when nothing (or a falsy value) is given.
     - rep_record - Index of the alignment record used as the
       representative sequence.

    One line is written per position, pairing the representative sequence
    with ``ic_vector`` and stopping at the shorter of the two.
    """
    warnings.warn(
        "The `print_info_content` function is deprecated and will be removed "
        "in a future release of Biopython.",
        BiopythonDeprecationWarning,
    )
    if not fout:
        fout = sys.stdout
    if not summary_info.ic_vector:
        summary_info.information_content()
    representative = summary_info.alignment[rep_record]
    paired = zip(representative, summary_info.ic_vector)
    for position, (residue, score) in enumerate(paired):
        row = "%d %s %.3f\n" % (position, residue, score)
        fout.write(row)
|
||||
|
@ -97,10 +97,10 @@ wrapped is no longer available since SCOP moved to the EBI website.
|
||||
Bio.AlignInfo
|
||||
-------------
|
||||
The ``pos_specific_score_matrix`` method of the ``SummaryInfo`` class and the
|
||||
``PSSM`` class were deprecated in release 1.82. As an alternative, please use
|
||||
the ``alignment`` property of a ``MultipleSeqAlignment`` object to obtain a
|
||||
new-style ``Alignment`` object, and use it to create a ``Bio.motifs.Motif``
|
||||
object. For example,
|
||||
``PSSM`` class were deprecated in release 1.82, and removed in release 1.85. As
|
||||
an alternative, please use the ``alignment`` property of a ``MultipleSeqAlignment``
|
||||
object to obtain a new-style ``Alignment`` object, and use it to create a
|
||||
``Bio.motifs.Motif`` object. For example,
|
||||
|
||||
>>> alignment = msa.alignment
|
||||
>>> from Bio.motifs import Motif
|
||||
@ -114,17 +114,17 @@ The ``counts`` object contains the same information as the PSSM returned by
|
||||
True
|
||||
|
||||
The ``information_content`` method and the ``ic_vector`` attribute of the
|
||||
``SummaryInfo`` class were deprecated in release 1.82. As an alternative,
|
||||
please use the ``relative_entropy`` attribute of the ``motif`` instance (see
|
||||
above); it contains the same values as the ``ic_vector`` attribute, while
|
||||
``sum(relative_entropy)`` is equal to the value returned by
|
||||
``SummaryInfo`` class were deprecated in release 1.82, and removed in release 1.85.
|
||||
As an alternative, please use the ``relative_entropy`` attribute of the ``motif``
|
||||
instance (see above); it contains the same values as the ``ic_vector`` attribute,
|
||||
while ``sum(relative_entropy)`` is equal to the value returned by
|
||||
``information_content``.
|
||||
|
||||
The ``replacement_dictionary`` method of the ``SummaryInfo`` class was
|
||||
deprecated in release 1.82. As an alternative, please use the ``alignment``
|
||||
property of the ``MultipleSeqAlignment`` object to obtain a new-style
|
||||
``Alignment`` object, and use its ``substitutions`` attribute to obtain the
|
||||
replacement dictionary:
|
||||
deprecated in release 1.82, and removed in release 1.85. As an alternative, please
|
||||
use the ``alignment`` property of the ``MultipleSeqAlignment`` object to obtain a
|
||||
new-style ``Alignment`` object, and use its ``substitutions`` attribute to obtain
|
||||
the replacement dictionary:
|
||||
|
||||
>>> alignment = msa.alignment
|
||||
>>> dictionary = alignment.substitutions
|
||||
@ -135,10 +135,10 @@ by using ``Bio.Align.read`` instead of ``Bio.AlignIO.read``, or
|
||||
``Bio.Align.parse`` instead of ``Bio.AlignIO.parse``.
|
||||
|
||||
The ``dumb_consensus`` and ``gap_consensus`` methods of the ``SummaryInfo``
|
||||
class were deprecated in Release 1.82.
|
||||
class were deprecated in release 1.82, and removed in release 1.85.
|
||||
|
||||
The ``print_info_content`` function in ``Bio.Align.AlignInfo`` was deprecated
|
||||
in Release 1.82.
|
||||
in release 1.82, and removed in release 1.85.
|
||||
|
||||
Bio.kNN
|
||||
-------
|
||||
|
@ -1098,364 +1098,6 @@ add entries for missing letters, for example
|
||||
|
||||
This also allows you to change the order of letters in the alphabet.
|
||||
|
||||
.. _`sec:summary_info`:
|
||||
|
||||
Calculating summary information
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Once you have an alignment, you are very likely going to want to find
|
||||
out information about it. Instead of trying to have all of the functions
|
||||
that can generate information about an alignment in the alignment object
|
||||
itself, we’ve tried to separate out the functionality into separate
|
||||
classes, which act on the alignment.
|
||||
|
||||
Getting ready to calculate summary information about an object is quick
|
||||
to do. Let’s say we’ve got an alignment object called ``alignment``, for
|
||||
example read in using ``Bio.AlignIO.read(...)`` as described in
|
||||
Chapter :ref:`chapter:msa`. All we need to do to get an object that
|
||||
will calculate summary information is:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> from Bio.Align import AlignInfo
|
||||
>>> summary_align = AlignInfo.SummaryInfo(msa)
|
||||
|
||||
The ``summary_align`` object is very useful, and will do the following
|
||||
neat things for you:
|
||||
|
||||
#. Calculate a quick consensus sequence – see
|
||||
section :ref:`sec:consensus`
|
||||
|
||||
#. Get a position specific score matrix for the alignment – see
|
||||
section :ref:`sec:pssm`
|
||||
|
||||
#. Calculate the information content for the alignment – see
|
||||
section :ref:`sec:getting_info_content`
|
||||
|
||||
#. Generate information on substitutions in the alignment –
|
||||
section :ref:`sec:substitution_matrices`
|
||||
details using this to generate a substitution matrix.
|
||||
|
||||
.. _`sec:consensus`:
|
||||
|
||||
Calculating a quick consensus sequence
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``SummaryInfo`` object, described in
|
||||
section :ref:`sec:summary_info`, provides functionality to
|
||||
calculate a quick consensus of an alignment. Assuming we’ve got a
|
||||
``SummaryInfo`` object called ``summary_align`` we can calculate a
|
||||
consensus by doing:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> consensus = summary_align.dumb_consensus()
|
||||
>>> consensus
|
||||
Seq('XCTXCTX')
|
||||
|
||||
As the name suggests, this is a really simple consensus calculator, and
|
||||
will just add up all of the residues at each point in the consensus, and
|
||||
if the most common value is higher than some threshold value will add
|
||||
the common residue to the consensus. If it doesn’t reach the threshold,
|
||||
it adds an ambiguity character to the consensus. The returned consensus
|
||||
object is a ``Seq`` object.
|
||||
|
||||
You can adjust how ``dumb_consensus`` works by passing optional
|
||||
parameters:
|
||||
|
||||
the threshold
|
||||
This is the threshold specifying how common a particular residue has
|
||||
to be at a position before it is added. The default is :math:`0.7`
|
||||
(meaning :math:`70\%`).
|
||||
|
||||
the ambiguous character
|
||||
This is the ambiguity character to use. The default is ’X’.
|
||||
|
||||
Alternatively, you can convert the multiple sequence alignment object
|
||||
``msa`` to a new-style ``Alignment`` object (see section
|
||||
:ref:`sec:alignmentobject`) by using the
|
||||
``alignment`` attribute (see section :ref:`sec:alignment_newstyle`):
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> alignment = msa.alignment
|
||||
|
||||
You can then create a ``Motif`` object (see section
|
||||
:ref:`sec:motif_object`):
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> from Bio.motifs import Motif
|
||||
>>> motif = Motif("ACGT", alignment)
|
||||
|
||||
and obtain a quick consensus sequence:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> motif.consensus
|
||||
Seq('ACTCCTA')
|
||||
|
||||
The ``motif.counts.calculate_consensus`` method (see section
|
||||
:ref:`sec:motif_consensus`) lets you specify in
|
||||
detail how the consensus sequence should be calculated. For example,
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> motif.counts.calculate_consensus(identity=0.7)
|
||||
'NCTNCTN'
|
||||
|
||||
.. _`sec:pssm`:
|
||||
|
||||
Position Specific Score Matrices
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Position specific score matrices (PSSMs) summarize the alignment
|
||||
information in a different way than a consensus, and may be useful for
|
||||
different tasks. Basically, a PSSM is a count matrix. For each column in
|
||||
the alignment, the number of occurrences of each alphabet letter is counted and
|
||||
totaled. The totals are displayed relative to some representative
|
||||
sequence along the left axis. This sequence may be the consensus
|
||||
sequence, but can also be any sequence in the alignment.
|
||||
|
||||
For instance for the alignment above:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> print(msa)
|
||||
Alignment with 4 rows and 7 columns
|
||||
ACTCCTA seq1
|
||||
AAT-CTA seq2
|
||||
CCTACT- seq3
|
||||
TCTCCTC seq4
|
||||
|
||||
we get a PSSM with the consensus sequence along the side using
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> my_pssm = summary_align.pos_specific_score_matrix(consensus, chars_to_ignore=["N"])
|
||||
>>> print(my_pssm)
|
||||
A C T
|
||||
X 2.0 1.0 1.0
|
||||
C 1.0 3.0 0.0
|
||||
T 0.0 0.0 4.0
|
||||
X 1.0 2.0 0.0
|
||||
C 0.0 4.0 0.0
|
||||
T 0.0 0.0 4.0
|
||||
X 2.0 1.0 0.0
|
||||
<BLANKLINE>
|
||||
|
||||
where we ignore any ``N`` ambiguity residues when calculating the PSSM.
|
||||
|
||||
Two notes should be made about this:
|
||||
|
||||
#. To maintain strictness with the alphabets, you can only include
|
||||
characters along the top of the PSSM that are in the alphabet of the
|
||||
alignment object. Gaps are not included along the top axis of the
|
||||
PSSM.
|
||||
|
||||
#. The sequence passed to be displayed along the left side of the axis
|
||||
does not need to be the consensus. For instance, if you wanted to
|
||||
display the second sequence in the alignment along this axis, you
|
||||
would need to do:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> second_seq = msa[1]
|
||||
>>> my_pssm = summary_align.pos_specific_score_matrix(second_seq, chars_to_ignore=["N"])
|
||||
>>> print(my_pssm)
|
||||
A C T
|
||||
A 2.0 1.0 1.0
|
||||
A 1.0 3.0 0.0
|
||||
T 0.0 0.0 4.0
|
||||
- 1.0 2.0 0.0
|
||||
C 0.0 4.0 0.0
|
||||
T 0.0 0.0 4.0
|
||||
A 2.0 1.0 0.0
|
||||
<BLANKLINE>
|
||||
|
||||
The command above returns a ``PSSM`` object. You can access any element
|
||||
of the PSSM by subscripting like
|
||||
``your_pssm[sequence_number][residue_count_name]``. For instance, to get
|
||||
the counts for the ’T’ residue in the sixth element of the above PSSM
|
||||
you would do:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> print(my_pssm[5]["T"])
|
||||
4.0
|
||||
|
||||
The structure of the PSSM class hopefully makes it easy both to access
|
||||
elements and to pretty print the matrix.
|
||||
|
||||
Alternatively, you can convert the multiple sequence alignment object
|
||||
``msa`` to a new-style ``Alignment`` object (see section
|
||||
:ref:`sec:alignmentobject`) by using the
|
||||
``alignment`` attribute (see section :ref:`sec:alignment_newstyle`):
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> alignment = msa.alignment
|
||||
|
||||
You can then create a ``Motif`` object (see section
|
||||
:ref:`sec:motif_object`):
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> from Bio.motifs import Motif
|
||||
>>> motif = Motif("ACGT", alignment)
|
||||
|
||||
and obtain the counts of each nucleotide in each position:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> counts = motif.counts
|
||||
>>> print(counts)
|
||||
0 1 2 3 4 5 6
|
||||
A: 2.00 1.00 0.00 1.00 0.00 0.00 2.00
|
||||
C: 1.00 3.00 0.00 2.00 4.00 0.00 1.00
|
||||
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00
|
||||
T: 1.00 0.00 4.00 0.00 0.00 4.00 0.00
|
||||
<BLANKLINE>
|
||||
>>> print(counts["T"][5])
|
||||
4.0
|
||||
|
||||
.. _`sec:getting_info_content`:
|
||||
|
||||
Information Content
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A potentially useful measure of evolutionary conservation is the
|
||||
information content of a sequence.
|
||||
|
||||
A useful introduction to information theory targeted towards molecular
|
||||
biologists can be found at
|
||||
http://www.lecb.ncifcrf.gov/~toms/paper/primer/. For our purposes, we
|
||||
will be looking at the information content of a consensus sequence, or a
|
||||
portion of a consensus sequence. We calculate information content at a
|
||||
particular column in a multiple sequence alignment using the following
|
||||
formula:
|
||||
|
||||
.. math:: IC_{j} = \sum_{i=1}^{N_{a}} P_{ij} \mathrm{log}\left(\frac{P_{ij}}{Q_{i}}\right)
|
||||
|
||||
where:
|
||||
|
||||
- :math:`IC_{j}` – The information content for the :math:`j`-th column
|
||||
in an alignment.
|
||||
|
||||
- :math:`N_{a}` – The number of letters in the alphabet.
|
||||
|
||||
- :math:`P_{ij}` – The frequency of a particular letter :math:`i` in
|
||||
the :math:`j`-th column (i. e. if G occurred 3 out of 6 times in an
|
||||
alignment column, this would be 0.5)
|
||||
|
||||
- :math:`Q_{i}` – The expected frequency of a letter :math:`i`. This is
|
||||
an optional argument, usage of which is left at the user’s
|
||||
discretion. By default, it is automatically assigned to
|
||||
:math:`0.05 = 1/20` for a protein alphabet, and :math:`0.25 = 1/4`
|
||||
for a nucleic acid alphabet. This is for getting the information
|
||||
content without any assumption of prior distributions. When assuming
|
||||
priors, or when using a non-standard alphabet, you should supply the
|
||||
values for :math:`Q_{i}`.
|
||||
|
||||
Well, now that we have an idea what information content is being
|
||||
calculated in Biopython, let’s look at how to get it for a particular
|
||||
region of the alignment.
|
||||
|
||||
First, we need to use our alignment to get an alignment summary object,
|
||||
which we’ll assume is called ``summary_align`` (see
|
||||
section :ref:`sec:summary_info`) for instructions on how to get
|
||||
this. Once we’ve got this object, calculating the information content
|
||||
for a region is as easy as:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> e_freq_table = {"A": 0.3, "G": 0.2, "T": 0.3, "C": 0.2}
|
||||
>>> info_content = summary_align.information_content(
|
||||
... 2, 6, e_freq_table=e_freq_table, chars_to_ignore=["N"]
|
||||
... )
|
||||
>>> info_content # doctest:+ELLIPSIS
|
||||
6.3910647...
|
||||
|
||||
Now, ``info_content`` will contain the relative information content over
|
||||
the region [2:6] in relation to the expected frequencies.
|
||||
|
||||
The value returned is calculated using base 2 as the logarithm base in the
|
||||
formula above. You can modify this by passing the parameter ``log_base``
|
||||
as the base you want:
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> info_content = summary_align.information_content(
|
||||
... 2, 6, e_freq_table=e_freq_table, log_base=10, chars_to_ignore=["N"]
|
||||
... )
|
||||
>>> info_content # doctest:+ELLIPSIS
|
||||
1.923902...
|
||||
|
||||
By default nucleotide or amino acid residues with a frequency of 0 in a
|
||||
column are not taken into account when the relative information content
|
||||
for that column is computed. If this is not the desired result, you can
|
||||
use ``pseudo_count`` instead.
|
||||
|
||||
.. cont-doctest
|
||||
|
||||
.. code:: pycon
|
||||
|
||||
>>> info_content = summary_align.information_content(
|
||||
... 2, 6, e_freq_table=e_freq_table, chars_to_ignore=["N", "-"], pseudo_count=1
|
||||
... )
|
||||
>>> info_content # doctest:+ELLIPSIS
|
||||
4.299651...
|
||||
|
||||
In this case, the observed frequency :math:`P_{ij}` of a particular
|
||||
letter :math:`i` in the :math:`j`-th column is computed as follows:
|
||||
|
||||
.. math:: P_{ij} = \frac{n_{ij} + k\times Q_{i}}{N_{j} + k}
|
||||
|
||||
where:
|
||||
|
||||
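- :math:`n_{ij}` – the number of times letter :math:`i` occurs in the :math:`j`-th column, and :math:`N_{j}` – the total number of letters counted in the :math:`j`-th column.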
|
||||
|
||||
- :math:`k` – the pseudo count you pass as argument.
|
||||
|
||||
- :math:`Q_{i}` – The expected frequency of the letter :math:`i` as
|
||||
described above.
|
||||
|
||||
Well, now you are ready to calculate information content. If you want to
|
||||
try applying this to some real life problems, it would probably be best
|
||||
to dig into the literature on information content to get an idea of how
|
||||
it is used. Hopefully your digging won’t reveal any mistakes made in
|
||||
coding this function!
|
||||
|
||||
.. _`sec:alignment_newstyle`:
|
||||
|
||||
Getting a new-style Alignment object
|
||||
|
@ -242,141 +242,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
if alignment_len > 5:
|
||||
self.assertEqual(alignment[:, -1], columns[-1])
|
||||
|
||||
def check_summary_simple(self, msa):
|
||||
summary = AlignInfo.SummaryInfo(msa)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
dumb_consensus = summary.dumb_consensus(threshold=0.7)
|
||||
all_letters = summary._get_all_letters()
|
||||
letters = all_letters.replace("-", "")
|
||||
alignment = msa.alignment
|
||||
motif = Motif(letters, alignment)
|
||||
gaps = "-" * len(alignment)
|
||||
dumb_consensus = "".join(
|
||||
[
|
||||
letter
|
||||
for index, letter in enumerate(dumb_consensus)
|
||||
if msa[:, index] != gaps
|
||||
]
|
||||
)
|
||||
consensus = motif.counts.calculate_consensus(identity=0.7)
|
||||
self.assertEqual(dumb_consensus, consensus)
|
||||
|
||||
def check_summary(self, msa, molecule_type):
|
||||
# Check AlignInfo.SummaryInfo likes the alignment; smoke test only
|
||||
if molecule_type == "DNA":
|
||||
letters = IUPACData.unambiguous_dna_letters
|
||||
ambiguous_letters = IUPACData.ambiguous_dna_letters
|
||||
ambiguous = "N"
|
||||
elif molecule_type == "RNA":
|
||||
letters = IUPACData.unambiguous_rna_letters
|
||||
ambiguous_letters = IUPACData.ambiguous_rna_letters
|
||||
ambiguous = "N"
|
||||
elif molecule_type == "protein":
|
||||
letters = IUPACData.protein_letters
|
||||
ambiguous_letters = IUPACData.protein_letters
|
||||
ambiguous = "X"
|
||||
else:
|
||||
raise ValueError(f"Unknown molecule type '{molecule_type}'")
|
||||
chars_to_ignore = set("-" + string.ascii_uppercase).difference(letters)
|
||||
for record in msa:
|
||||
record.seq = record.seq.upper()
|
||||
summary = AlignInfo.SummaryInfo(msa)
|
||||
alignment = msa.alignment # New-style alignment
|
||||
alignment.sequences = [sequence.upper() for sequence in alignment.sequences]
|
||||
all_letters = summary._get_all_letters()
|
||||
motif_letters = "".join(set(all_letters).union(letters))
|
||||
motif_letters = motif_letters.replace("-", "")
|
||||
if set(motif_letters) == set("CGYTWAR"):
|
||||
ambiguous = "X"
|
||||
motif = Motif(motif_letters, alignment)
|
||||
counts = motif.counts
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
dumb_consensus = summary.dumb_consensus(ambiguous=ambiguous)
|
||||
consensus = counts.calculate_consensus(identity=0.7)
|
||||
# skip columns consisting of gaps only:
|
||||
gaps = "-" * len(alignment)
|
||||
dumb_consensus = "".join(
|
||||
[
|
||||
letter
|
||||
for index, letter in enumerate(dumb_consensus)
|
||||
if msa[:, index] != gaps
|
||||
]
|
||||
)
|
||||
self.assertEqual(consensus, dumb_consensus)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
pssm = summary.pos_specific_score_matrix()
|
||||
all_letters = summary._get_all_letters()
|
||||
j = 0
|
||||
for i in range(alignment.length):
|
||||
while set(msa[:, j]) == set("-"):
|
||||
j += 1
|
||||
for letter in letters:
|
||||
count = counts[letter][i]
|
||||
if letter in all_letters:
|
||||
self.assertAlmostEqual(count, pssm[j][letter])
|
||||
else:
|
||||
self.assertAlmostEqual(count, 0.0)
|
||||
j += 1
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
rep_dict = summary.replacement_dictionary(skip_chars=None, letters=letters)
|
||||
rep_dict = alignment.substitutions
|
||||
e_freq = 1.0 / len(letters)
|
||||
ambiguous_letters = ambiguous_letters.upper() + ambiguous_letters.lower()
|
||||
motif = Motif(letters, alignment)
|
||||
e_freq_table = dict.fromkeys(ambiguous_letters, e_freq)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
info_content = summary.information_content(
|
||||
e_freq_table=e_freq_table, chars_to_ignore=chars_to_ignore
|
||||
)
|
||||
motif.background = e_freq_table
|
||||
relative_entropy = sum(motif.relative_entropy)
|
||||
self.assertAlmostEqual(info_content, relative_entropy)
|
||||
|
||||
def check_summary_pir(self, msa):
|
||||
letters = IUPACData.unambiguous_dna_letters
|
||||
summary = AlignInfo.SummaryInfo(msa)
|
||||
all_letters = summary._get_all_letters()
|
||||
alignment = msa.alignment
|
||||
motif = Motif(letters, alignment)
|
||||
counts = motif.counts
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
dumb_consensus = summary.dumb_consensus(ambiguous="N")
|
||||
gaps = "-" * len(alignment)
|
||||
dumb_consensus = "".join(
|
||||
[
|
||||
letter
|
||||
for index, letter in enumerate(dumb_consensus)
|
||||
if msa[:, index] != gaps
|
||||
]
|
||||
)
|
||||
consensus = counts.calculate_consensus(identity=0.7)
|
||||
self.assertEqual(consensus, dumb_consensus)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
pssm = summary.pos_specific_score_matrix()
|
||||
j = 0
|
||||
for i in range(alignment.length):
|
||||
while set(msa[:, j]) == set("-"):
|
||||
j += 1
|
||||
for letter in letters:
|
||||
count = counts[letter][i]
|
||||
if letter in all_letters:
|
||||
self.assertAlmostEqual(count, pssm[j][letter])
|
||||
else:
|
||||
self.assertAlmostEqual(count, 0.0)
|
||||
j += 1
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
rep_dict = summary.replacement_dictionary(skip_chars=None, letters=letters)
|
||||
rep_dict = alignment.substitutions
|
||||
e_freq = 1.0 / len(letters)
|
||||
all_letters = letters.upper() + letters.lower()
|
||||
e_freq_table = dict.fromkeys(all_letters, e_freq)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
info_content = summary.information_content(
|
||||
e_freq_table=e_freq_table, chars_to_ignore=["-"]
|
||||
)
|
||||
relative_entropy = sum(motif.relative_entropy)
|
||||
self.assertAlmostEqual(info_content, relative_entropy)
|
||||
|
||||
def test_reading_alignments_clustal1(self):
|
||||
path = "Clustalw/clustalw.aln"
|
||||
self.check_iterator_for_loop_handle(path, "clustal", 1, 2)
|
||||
@ -402,7 +267,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"clustal_consensus": " * *: :: :. :* : :. : . :* :: .: ** **:... *.*** .. .:* * *: .* :* : :* .* *::. . .:: :*..* :* .* .. . : . : *. .:: : . .* . : *.: ..:: * . :: : .*. :. :. . . .* **.*.. :.. *.. . . ::* :.: .*: : * :: *** . * :. . . : *: .:: ::: .. . : : :: * * : .. :.* . ::. :: * : : * * :.. * .. * :** . .:. .. :*. ..: :. . .:* * : : * . ..*:. .** *.*... : :: :* .* ::* : :. :. : "
|
||||
},
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_clustal2(self):
|
||||
path = "Clustalw/opuntia.aln"
|
||||
@ -417,7 +281,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
alignment,
|
||||
["TTTTTTT", "AAAAAAA", "TTTTTTT", "AAAAAAA", "CCCCCCC", "AAAAAAA"],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_clustal3(self):
|
||||
path = "Clustalw/hedgehog.aln"
|
||||
@ -431,7 +294,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignment, ["M----", "F----", "N----", "L----", "V----", "---SS"]
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_clustal4(self):
|
||||
path = "Clustalw/odd_consensus.aln"
|
||||
@ -452,7 +314,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"clustal_consensus": " * * *** ***** * * ** *******************************************************************************************************************************************************************************"
|
||||
},
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_clustal5(self):
|
||||
path = "Clustalw/protein.aln"
|
||||
@ -474,7 +335,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"-------------------T",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_clustal6(self):
|
||||
path = "Clustalw/promals3d.aln"
|
||||
@ -496,7 +356,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"-T------------------",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_fasta(self):
|
||||
path = "GFF/multi.fna" # Trivial nucleotide alignment
|
||||
@ -511,7 +370,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
alignment,
|
||||
[("test1", "ACGTCGCG"), ("test2", "GGGGCCCC"), ("test3", "AAACACAC")],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_nexus1(self):
|
||||
path = "Nexus/test_Nexus_input.nex"
|
||||
@ -532,7 +390,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"tt--?ag?c",
|
||||
],
|
||||
)
|
||||
self.check_summary_simple(alignment)
|
||||
|
||||
def test_reading_alignments_nexus2(self):
|
||||
path = "Nexus/codonposset.nex"
|
||||
@ -549,7 +406,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("Aerodramus", "?????????TTGTGGTGGGAAT"),
|
||||
],
|
||||
)
|
||||
self.check_summary_simple(alignment)
|
||||
|
||||
def test_reading_alignments_msf1(self):
|
||||
path = "msf/DOA_prot.msf"
|
||||
@ -586,7 +442,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"LLLLLL----L",
|
||||
],
|
||||
)
|
||||
self.check_summary_simple(alignment)
|
||||
|
||||
def test_reading_alignments_stockholm1(self):
|
||||
path = "Stockholm/simple.sth"
|
||||
@ -607,7 +462,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"secondary_structure": ".................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>..............."
|
||||
},
|
||||
)
|
||||
self.check_summary(alignment, "RNA")
|
||||
|
||||
def test_reading_alignments_stockholm2(self):
|
||||
path = "Stockholm/funny.sth"
|
||||
@ -621,7 +475,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignment, ["MMMEEE", "TQIVVV", "CHEMMM", "RVALLL", "ASDTTT", "SYSEEE"]
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_phylip1(self):
|
||||
path = "Phylip/reference_dna.phy"
|
||||
@ -635,7 +488,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignment, ["CCTTCG", "GGAAAG", "ATAAAC", "TTTTAA", "GAGGAG", "CTTTTC"]
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip2(self):
|
||||
path = "Phylip/reference_dna2.phy"
|
||||
@ -649,7 +501,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignment, ["CCTTCG", "GGAAAG", "ATAAAC", "TTTTAA", "GAGGAG", "CTTTTC"]
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip3(self):
|
||||
path = "Phylip/hennigian.phy"
|
||||
@ -671,7 +522,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"AAAAAAAAAA",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip4(self):
|
||||
path = "Phylip/horses.phy"
|
||||
@ -693,7 +543,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"AAAAAAAAAA",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip5(self):
|
||||
path = "Phylip/random.phy"
|
||||
@ -715,7 +564,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"AAAAAAAAAA",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip6(self):
|
||||
path = "Phylip/interlaced.phy"
|
||||
@ -734,7 +582,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("CYS1_DICDI", "-----MKVILLFVLAVFTVFVSS-----------...I--"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_phylip7(self):
|
||||
path = "Phylip/interlaced2.phy"
|
||||
@ -754,7 +601,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_phylip8(self):
|
||||
path = "ExtendedPhylip/primates.phyx"
|
||||
@ -776,7 +622,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"TTTTTTTTTTTT",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_phylip9(self):
|
||||
path = "Phylip/sequential.phy"
|
||||
@ -795,7 +640,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("CYS1_DICDI", "-----MKVILLFVLAVFTVFVSS-----------...I--"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_phylip10(self):
|
||||
path = "Phylip/sequential2.phy"
|
||||
@ -815,7 +659,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_emboss1(self):
|
||||
path = "Emboss/alignret.txt"
|
||||
@ -834,7 +677,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("IXI_237", "TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRP...SHE"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_emboss2(self):
|
||||
path = "Emboss/needle.txt"
|
||||
@ -885,7 +727,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("ref_rec", "KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQ...---"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_emboss3(self):
|
||||
@ -903,7 +744,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("asis", "TATTTTTTGGATTTTTTTCTAGATTTTCTAGGTT...GAA"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_emboss4(self):
|
||||
path = "Emboss/water.txt"
|
||||
@ -920,7 +760,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("IXI_235", "TSPASIRPPAGPSSR---------RPSPPGPRRP...SHE"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_emboss5(self):
|
||||
path = "Emboss/water2.txt"
|
||||
@ -933,7 +772,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_rows(
|
||||
alignment, [("asis", "CGTTTGAGT-CTGGGATG"), ("asis", "CGTTTGAGTACTGGGATG")]
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_emboss6(self):
|
||||
path = "Emboss/matcher_simple.txt"
|
||||
@ -947,7 +785,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
alignment,
|
||||
[("AF069992_1", "GPPPQSPDENRAGESS"), ("CAA85685.1", "GVPPEEAGAAVAAESS")],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_emboss7(self):
|
||||
path = "Emboss/matcher_pair.txt"
|
||||
@ -983,7 +820,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_rows(
|
||||
alignments[4], [("HBA_HUMAN", "VKAAWGKVGA"), ("HBB_HUMAN", "VQAAYQKVVA")]
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_emboss8(self):
|
||||
@ -1007,7 +843,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_fasta_m10_1(self):
|
||||
path = "Fasta/output001.m10"
|
||||
@ -1073,7 +908,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_fasta_m10_2(self):
|
||||
@ -1140,7 +974,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_fasta_m10_3(self):
|
||||
@ -1187,7 +1020,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_fasta_m10_4(self):
|
||||
@ -1211,7 +1043,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_fasta_m10_5(self):
|
||||
path = "Fasta/output005.m10"
|
||||
@ -1234,7 +1065,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_fasta_m10_6(self):
|
||||
path = "Fasta/output006.m10"
|
||||
@ -1254,7 +1084,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("query", "GCAACGCTTCAAGAACTGGAATTAGGAACCGTGA...CAT"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "DNA")
|
||||
|
||||
def test_reading_alignments_fasta_m10_7(self):
|
||||
path = "Fasta/output007.m10"
|
||||
@ -1320,7 +1149,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_fasta_m10_8(self):
|
||||
@ -1372,7 +1200,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("sp|P08100|OPSD_HUMAN", "AQQQESATTQKAEKEVTRMVIIMVIAFLICW"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[0], "protein")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_ig(self):
|
||||
@ -1395,7 +1222,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
"HHHHHHH-AAAAL-R-",
|
||||
],
|
||||
)
|
||||
self.check_summary(alignment, "protein")
|
||||
|
||||
def test_reading_alignments_pir(self):
|
||||
path = "NBRF/clustalw.pir"
|
||||
@ -1419,7 +1245,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
),
|
||||
],
|
||||
)
|
||||
self.check_summary_pir(alignment)
|
||||
|
||||
def test_reading_alignments_maf1(self):
|
||||
path = "MAF/humor.maf"
|
||||
@ -1447,7 +1272,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("rn3", "tttgtccatgttggtcaggctggtctcgaactcc...GGT"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[1], "DNA")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_maf2(self):
|
||||
@ -1479,7 +1303,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
("panTro1.chr6", "gcagctgaaaaca"),
|
||||
],
|
||||
)
|
||||
self.check_summary(alignments[1], "DNA")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_maf3(self):
|
||||
@ -1505,7 +1328,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignments[2], ["gggA", "cccC", "aaaA", "gggG", "cccC", "aaaA"]
|
||||
)
|
||||
self.check_summary(alignments[2], "DNA")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_maf4(self):
|
||||
@ -1546,7 +1368,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
self.check_alignment_columns(
|
||||
alignments[47], ["TTTTTT", "GGGGGG", "TTTTTT", "TTTTTT", "TGGGAT", "tTTTT-"]
|
||||
)
|
||||
self.check_summary(alignments[47], "DNA")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
def test_reading_alignments_mauve(self):
|
||||
@ -1589,7 +1410,6 @@ class TestAlignIO_reading(unittest.TestCase):
|
||||
alignments[4],
|
||||
[("2/11410-12880", "ATTCGCACATAAGAATGTACCTTGCTGTAATTTA...ATA")],
|
||||
)
|
||||
self.check_summary(alignments[4], "DNA")
|
||||
self.check_reverse_write_read(alignments)
|
||||
|
||||
|
||||
|
@ -1,225 +0,0 @@
|
||||
# Copyright 2016 by Peter Cock. All rights reserved.
|
||||
# This code is part of the Biopython distribution and governed by its
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
|
||||
"""Bio.Align.AlignInfo related tests."""
|
||||
import math
|
||||
import unittest
|
||||
|
||||
from Bio import AlignIO
|
||||
from Bio import BiopythonDeprecationWarning
|
||||
from Bio.Align import MultipleSeqAlignment
|
||||
from Bio.Align.AlignInfo import SummaryInfo
|
||||
from Bio.Data import IUPACData
|
||||
from Bio.motifs import Motif
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
|
||||
class AlignInfoTests(unittest.TestCase):
|
||||
"""Test basic usage."""
|
||||
|
||||
def assertAlmostEqualList(self, list1, list2, **kwargs):
|
||||
self.assertEqual(len(list1), len(list2))
|
||||
for v1, v2 in zip(list1, list2):
|
||||
self.assertAlmostEqual(v1, v2, **kwargs)
|
||||
|
||||
def test_nucleotides(self):
|
||||
filename = "GFF/multi.fna"
|
||||
fmt = "fasta"
|
||||
msa = AlignIO.read(filename, fmt)
|
||||
summary = SummaryInfo(msa)
|
||||
alignment = msa.alignment
|
||||
motif = Motif("ACGT", alignment)
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
c = summary.dumb_consensus(threshold=0.1, ambiguous="N")
|
||||
# dumb_consensus uses ambiguous if multiple letters have the same score
|
||||
self.assertEqual(c, "ANGNCCCC")
|
||||
c = motif.counts.calculate_consensus(identity=0.1)
|
||||
# Instead, EMBOSS uses the first letter it encounters
|
||||
self.assertEqual(c, "AaGcCCCC")
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
c = summary.dumb_consensus(ambiguous="N")
|
||||
self.assertEqual(c, "NNNNNNNN")
|
||||
c = motif.counts.calculate_consensus(identity=0.7)
|
||||
self.assertEqual(c, "NNNNNNNN")
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
c = summary.gap_consensus(ambiguous="N")
|
||||
self.assertEqual(c, "NNNNNNNN")
|
||||
|
||||
expected = {"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
m = summary.pos_specific_score_matrix(chars_to_ignore=["-"], axis_seq=c)
|
||||
|
||||
counts = motif.counts
|
||||
|
||||
for i in range(alignment.length):
|
||||
for letter in "ACGT":
|
||||
self.assertAlmostEqual(counts[letter][i], m[i][letter])
|
||||
|
||||
self.assertEqual(
|
||||
str(m),
|
||||
""" A C G T
|
||||
N 2.0 0.0 1.0 0.0
|
||||
N 1.0 1.0 1.0 0.0
|
||||
N 1.0 0.0 2.0 0.0
|
||||
N 0.0 1.0 1.0 1.0
|
||||
N 1.0 2.0 0.0 0.0
|
||||
N 0.0 2.0 1.0 0.0
|
||||
N 1.0 2.0 0.0 0.0
|
||||
N 0.0 2.0 1.0 0.0
|
||||
""",
|
||||
)
|
||||
|
||||
# provide the frequencies and chars to ignore explicitly.
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
ic = summary.information_content(
|
||||
e_freq_table=expected, chars_to_ignore=["-"]
|
||||
)
|
||||
self.assertAlmostEqual(ic, 7.32029999423075)
|
||||
ic = sum(motif.relative_entropy)
|
||||
self.assertAlmostEqual(ic, 7.32029999423075)
|
||||
|
||||
def test_proteins(self):
|
||||
letters = IUPACData.protein_letters
|
||||
a = MultipleSeqAlignment(
|
||||
[
|
||||
SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
|
||||
SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
|
||||
SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003"),
|
||||
]
|
||||
)
|
||||
self.assertEqual(32, a.get_alignment_length())
|
||||
|
||||
s = SummaryInfo(a)
|
||||
|
||||
alignment = a.alignment
|
||||
motif = Motif(letters + "*", alignment)
|
||||
counts = motif.counts
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
dumb_consensus = s.dumb_consensus()
|
||||
self.assertEqual(dumb_consensus, "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
|
||||
consensus = counts.calculate_consensus(identity=0.7)
|
||||
self.assertEqual(consensus, dumb_consensus)
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
c = s.gap_consensus(ambiguous="X")
|
||||
self.assertEqual(c, "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
|
||||
j = 0
|
||||
all_letters = s._get_all_letters()
|
||||
for i in range(alignment.length):
|
||||
for letter in letters:
|
||||
count = counts[letter][i]
|
||||
if letter in all_letters:
|
||||
self.assertAlmostEqual(count, m[j][letter])
|
||||
else:
|
||||
self.assertAlmostEqual(count, 0.0)
|
||||
j += 1
|
||||
self.assertEqual(
|
||||
str(m),
|
||||
""" A D E F G H I K L M N P Q R S W Y
|
||||
M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
|
||||
X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
|
||||
Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
|
||||
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
|
||||
X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
|
||||
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
|
||||
L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
|
||||
G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
|
||||
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
|
||||
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
|
||||
I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
|
||||
S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
|
||||
P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
|
||||
E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
|
||||
X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
|
||||
X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
""",
|
||||
)
|
||||
|
||||
base_freq = 1.0 / len(letters)
|
||||
e_freq_table = {letter: base_freq for letter in letters}
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
ic = s.information_content(
|
||||
e_freq_table=e_freq_table, chars_to_ignore=["-", "*"]
|
||||
)
|
||||
self.assertAlmostEqual(ic, 133.061475107)
|
||||
motif = Motif(letters, alignment)
|
||||
ic = sum(motif.relative_entropy)
|
||||
self.assertAlmostEqual(ic, 133.061475107)
|
||||
|
||||
def test_pseudo_count(self):
|
||||
# use example from
|
||||
# http://biologie.univ-mrs.fr/upload/p202/01.4.PSSM_theory.pdf
|
||||
msa = MultipleSeqAlignment(
|
||||
[
|
||||
SeqRecord(Seq("AACCACGTTTAA"), id="ID001"),
|
||||
SeqRecord(Seq("CACCACGTGGGT"), id="ID002"),
|
||||
SeqRecord(Seq("CACCACGTTCGC"), id="ID003"),
|
||||
SeqRecord(Seq("GCGCACGTGGGG"), id="ID004"),
|
||||
SeqRecord(Seq("TCGCACGTTGTG"), id="ID005"),
|
||||
SeqRecord(Seq("TGGCACGTGTTT"), id="ID006"),
|
||||
SeqRecord(Seq("TGACACGTGGGA"), id="ID007"),
|
||||
SeqRecord(Seq("TTACACGTGCGC"), id="ID008"),
|
||||
]
|
||||
)
|
||||
|
||||
summary = SummaryInfo(msa)
|
||||
expected = {"A": 0.325, "G": 0.175, "T": 0.325, "C": 0.175}
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
ic = summary.information_content(
|
||||
e_freq_table=expected, log_base=math.exp(1), pseudo_count=1
|
||||
)
|
||||
self.assertAlmostEqual(ic, 7.546369561463767)
|
||||
ic_vector = [
|
||||
0.11112361,
|
||||
0.08677812,
|
||||
0.35598044,
|
||||
1.29445419,
|
||||
0.80272907,
|
||||
1.29445419,
|
||||
1.29445419,
|
||||
0.80272907,
|
||||
0.60929642,
|
||||
0.39157892,
|
||||
0.46539767,
|
||||
0.03739368,
|
||||
]
|
||||
self.assertAlmostEqualList(summary.ic_vector, ic_vector)
|
||||
# One more time, now using a new-style Alignment object:
|
||||
alignment = msa.alignment
|
||||
motif = Motif("ACGT", alignment)
|
||||
motif.background = expected
|
||||
motif.pseudocounts = expected
|
||||
self.assertAlmostEqualList(motif.relative_entropy * math.log(2), ic_vector)
|
||||
ic = sum(ic_vector)
|
||||
self.assertAlmostEqual(ic, 7.546369561463767)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = unittest.TextTestRunner(verbosity=2)
|
||||
unittest.main(testRunner=runner)
|
@ -341,35 +341,6 @@ gi|671626|emb|CAA85685.1| -
|
||||
"TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
|
||||
)
|
||||
self.assertEqual(msa.get_alignment_length(), 156)
|
||||
align_info = AlignInfo.SummaryInfo(msa)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
consensus = align_info.dumb_consensus(ambiguous="N")
|
||||
self.assertIsInstance(consensus, Seq)
|
||||
self.assertEqual(
|
||||
consensus,
|
||||
"TATACATTAAAGNAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTNCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
|
||||
)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
dictionary = align_info.replacement_dictionary(
|
||||
skip_chars=None, letters="ACGT"
|
||||
)
|
||||
self.assertEqual(len(dictionary), 16)
|
||||
self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
|
||||
self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
|
||||
alignment = msa.alignment
|
||||
dictionary = alignment.substitutions
|
||||
self.assertEqual(len(dictionary), 4)
|
||||
@ -391,544 +362,60 @@ gi|671626|emb|CAA85685.1| -
|
||||
self.assertAlmostEqual(dictionary[("T", "C")], 12)
|
||||
self.assertAlmostEqual(dictionary[("T", "G")], 0)
|
||||
self.assertAlmostEqual(dictionary[("T", "T")], 874)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
matrix = align_info.pos_specific_score_matrix(consensus, ["N", "-"])
|
||||
|
||||
motif = motifs.Motif("ACGT", alignment)
|
||||
counts = motif.counts
|
||||
for i in range(alignment.length):
|
||||
for letter in "ACGT":
|
||||
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
|
||||
self.assertEqual(counts.calculate_consensus(identity=0.7), consensus)
|
||||
self.assertEqual(
|
||||
str(matrix),
|
||||
counts.calculate_consensus(identity=0.7),
|
||||
"TATACATTAAAGNAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTNCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA",
|
||||
)
|
||||
self.assertEqual(
|
||||
str(counts),
|
||||
"""\
|
||||
A C G T
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 1.0 0.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
N 4.0 0.0 3.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 4.0
|
||||
A 4.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 3.0
|
||||
A 3.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 1.0 6.0 0.0 0.0
|
||||
A 6.0 0.0 0.0 1.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
N 0.0 3.0 0.0 4.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 2.0 0.0 5.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 1.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 1.0 0.0 6.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
||||
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
|
||||
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
|
||||
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
|
||||
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
|
||||
""",
|
||||
)
|
||||
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N", "-"])
|
||||
|
||||
alignment = msa.alignment
|
||||
motif = motifs.Motif("ACGT", alignment)
|
||||
counts = motif.counts
|
||||
for i in range(alignment.length):
|
||||
for letter in "ACGT":
|
||||
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
|
||||
|
||||
self.assertEqual(
|
||||
str(matrix),
|
||||
str(counts),
|
||||
"""\
|
||||
A C G T
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 1.0 0.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
X 4.0 0.0 3.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 4.0
|
||||
A 4.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 3.0
|
||||
A 3.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 1.0
|
||||
A 1.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 1.0 6.0 0.0 0.0
|
||||
A 6.0 0.0 0.0 1.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
X 0.0 3.0 0.0 4.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 2.0 0.0 5.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 1.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 1.0 0.0 6.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
||||
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
|
||||
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
|
||||
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
|
||||
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
|
||||
""",
|
||||
)
|
||||
|
||||
second_seq = msa[1].seq
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
matrix = align_info.pos_specific_score_matrix(second_seq, ["N", "-"])
|
||||
align_info = AlignInfo.SummaryInfo(msa)
|
||||
self.assertEqual(align_info.get_column(1), "AAAAAAA")
|
||||
self.assertEqual(align_info.get_column(7), "TTTATTT")
|
||||
|
||||
alignment = msa.alignment
|
||||
motif = motifs.Motif("ACGT", alignment)
|
||||
counts = motif.counts
|
||||
for i in range(alignment.length):
|
||||
for letter in "ACGT":
|
||||
self.assertAlmostEqual(counts[letter][i], matrix[i][letter])
|
||||
|
||||
self.assertEqual(
|
||||
str(matrix),
|
||||
str(counts),
|
||||
"""\
|
||||
A C G T
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 1.0 0.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 4.0 0.0 3.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 4.0
|
||||
A 4.0 0.0 0.0 0.0
|
||||
- 0.0 0.0 0.0 3.0
|
||||
- 3.0 0.0 0.0 0.0
|
||||
- 0.0 0.0 0.0 1.0
|
||||
- 1.0 0.0 0.0 0.0
|
||||
- 0.0 0.0 0.0 1.0
|
||||
- 1.0 0.0 0.0 0.0
|
||||
- 0.0 0.0 0.0 1.0
|
||||
- 1.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 1.0 6.0 0.0 0.0
|
||||
A 6.0 0.0 0.0 1.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 3.0 0.0 4.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 2.0 0.0 5.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
T 0.0 1.0 0.0 6.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
G 1.0 0.0 6.0 0.0
|
||||
T 0.0 0.0 0.0 7.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
C 0.0 7.0 0.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
G 0.0 0.0 7.0 0.0
|
||||
A 7.0 0.0 0.0 0.0
|
||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
||||
A: 0.00 7.00 0.00 7.00 0.00 7.00 0.00 1.00 7.00 7.00 7.00 0.00 4.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 1.00 6.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 1.00 0.00 7.00 0.00 0.00 7.00 0.00 7.00
|
||||
C: 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 3.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00
|
||||
G: 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 3.00 0.00 7.00 7.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00
|
||||
T: 7.00 0.00 7.00 0.00 0.00 0.00 7.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 4.00 0.00 3.00 0.00 1.00 0.00 1.00 0.00 1.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 7.00 7.00 0.00 1.00 0.00 0.00 7.00 7.00 4.00 0.00 0.00 7.00 7.00 0.00 7.00 0.00 7.00 0.00 5.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 7.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 7.00 0.00 0.00 0.00 7.00 0.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.00 0.00 6.00 0.00 7.00 7.00 0.00 0.00 7.00 7.00 7.00 0.00 0.00 7.00 0.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00
|
||||
""",
|
||||
)
|
||||
e_freq_table = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
value = align_info.information_content(
|
||||
5, 50, chars_to_ignore=["N"], e_freq_table=e_freq_table
|
||||
)
|
||||
self.assertAlmostEqual(value, 88.42309908538343) # MultipleSeqAlignment
|
||||
value = sum(motif[5:50].relative_entropy)
|
||||
self.assertAlmostEqual(value, 88.42309908538343) # Alignment
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
value = align_info.information_content(
|
||||
e_freq_table=e_freq_table, chars_to_ignore=["N", "-"]
|
||||
)
|
||||
self.assertAlmostEqual(value, 306.2080592664532) # MultipleSeqAlignment
|
||||
relative_entropy = motif.relative_entropy
|
||||
value = sum(relative_entropy)
|
||||
self.assertAlmostEqual(value, 306.2080592664532) # Alignment
|
||||
self.assertEqual(align_info.get_column(1), "AAAAAAA")
|
||||
self.assertAlmostEqual(align_info.ic_vector[1], 2.00)
|
||||
self.assertEqual(align_info.get_column(7), "TTTATTT")
|
||||
self.assertAlmostEqual(align_info.ic_vector[7], 1.4083272214176725)
|
||||
self.assertAlmostEqual(relative_entropy[0], 2.0)
|
||||
self.assertAlmostEqual(relative_entropy[1], 2.0)
|
||||
self.assertAlmostEqual(relative_entropy[2], 2.0)
|
||||
@ -1085,180 +572,11 @@ A 7.0 0.0 0.0 0.0
|
||||
self.assertAlmostEqual(relative_entropy[153], 2.0)
|
||||
self.assertAlmostEqual(relative_entropy[154], 2.0)
|
||||
self.assertAlmostEqual(relative_entropy[155], 2.0)
|
||||
handle = StringIO()
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
AlignInfo.print_info_content(align_info, fout=handle)
|
||||
self.assertEqual(
|
||||
handle.getvalue(),
|
||||
"""\
|
||||
0 T 2.000
|
||||
1 A 2.000
|
||||
2 T 2.000
|
||||
3 A 2.000
|
||||
4 C 2.000
|
||||
5 A 2.000
|
||||
6 T 2.000
|
||||
7 T 1.408
|
||||
8 A 2.000
|
||||
9 A 2.000
|
||||
10 A 2.000
|
||||
11 G 2.000
|
||||
12 A 1.015
|
||||
13 A 2.000
|
||||
14 G 2.000
|
||||
15 G 2.000
|
||||
16 G 2.000
|
||||
17 G 2.000
|
||||
18 G 2.000
|
||||
19 A 2.000
|
||||
20 T 2.000
|
||||
21 G 2.000
|
||||
22 C 2.000
|
||||
23 G 2.000
|
||||
24 G 2.000
|
||||
25 A 2.000
|
||||
26 T 2.000
|
||||
27 A 2.000
|
||||
28 A 2.000
|
||||
29 A 2.000
|
||||
30 T 2.000
|
||||
31 G 2.000
|
||||
32 G 2.000
|
||||
33 A 2.000
|
||||
34 A 2.000
|
||||
35 A 2.000
|
||||
36 G 2.000
|
||||
37 G 2.000
|
||||
38 C 2.000
|
||||
39 G 2.000
|
||||
40 A 2.000
|
||||
41 A 2.000
|
||||
42 A 2.000
|
||||
43 G 2.000
|
||||
44 A 2.000
|
||||
45 A 2.000
|
||||
46 A 2.000
|
||||
47 G 2.000
|
||||
48 A 2.000
|
||||
49 A 2.000
|
||||
50 T 2.000
|
||||
51 A 2.000
|
||||
52 T 2.000
|
||||
53 A 2.000
|
||||
54 T 2.000
|
||||
55 A 2.000
|
||||
56 - 2.000
|
||||
57 - 2.000
|
||||
58 - 2.000
|
||||
59 - 2.000
|
||||
60 - 2.000
|
||||
61 - 2.000
|
||||
62 - 2.000
|
||||
63 - 2.000
|
||||
64 - 2.000
|
||||
65 - 2.000
|
||||
66 A 2.000
|
||||
67 T 2.000
|
||||
68 A 2.000
|
||||
69 T 2.000
|
||||
70 A 2.000
|
||||
71 T 2.000
|
||||
72 T 2.000
|
||||
73 T 2.000
|
||||
74 C 1.408
|
||||
75 A 1.408
|
||||
76 A 2.000
|
||||
77 A 2.000
|
||||
78 T 2.000
|
||||
79 T 2.000
|
||||
80 T 1.015
|
||||
81 C 2.000
|
||||
82 C 2.000
|
||||
83 T 2.000
|
||||
84 T 2.000
|
||||
85 A 2.000
|
||||
86 T 2.000
|
||||
87 A 2.000
|
||||
88 T 2.000
|
||||
89 A 2.000
|
||||
90 C 1.137
|
||||
91 C 2.000
|
||||
92 C 2.000
|
||||
93 A 2.000
|
||||
94 A 2.000
|
||||
95 A 2.000
|
||||
96 T 2.000
|
||||
97 A 2.000
|
||||
98 T 2.000
|
||||
99 A 2.000
|
||||
100 A 2.000
|
||||
101 A 2.000
|
||||
102 A 2.000
|
||||
103 A 2.000
|
||||
104 T 2.000
|
||||
105 A 2.000
|
||||
106 T 2.000
|
||||
107 C 2.000
|
||||
108 T 2.000
|
||||
109 A 2.000
|
||||
110 A 2.000
|
||||
111 T 2.000
|
||||
112 A 2.000
|
||||
113 A 2.000
|
||||
114 A 2.000
|
||||
115 T 2.000
|
||||
116 T 2.000
|
||||
117 A 2.000
|
||||
118 G 2.000
|
||||
119 A 2.000
|
||||
120 T 2.000
|
||||
121 G 2.000
|
||||
122 A 2.000
|
||||
123 A 2.000
|
||||
124 T 2.000
|
||||
125 A 2.000
|
||||
126 T 2.000
|
||||
127 C 2.000
|
||||
128 A 2.000
|
||||
129 A 2.000
|
||||
130 A 2.000
|
||||
131 G 2.000
|
||||
132 A 2.000
|
||||
133 A 2.000
|
||||
134 T 2.000
|
||||
135 C 2.000
|
||||
136 C 1.408
|
||||
137 A 2.000
|
||||
138 T 2.000
|
||||
139 T 2.000
|
||||
140 G 2.000
|
||||
141 A 2.000
|
||||
142 T 2.000
|
||||
143 T 2.000
|
||||
144 T 2.000
|
||||
145 A 2.000
|
||||
146 G 2.000
|
||||
147 T 2.000
|
||||
148 G 1.408
|
||||
149 T 2.000
|
||||
150 A 2.000
|
||||
151 C 2.000
|
||||
152 C 2.000
|
||||
153 A 2.000
|
||||
154 G 2.000
|
||||
155 A 2.000
|
||||
""",
|
||||
)
|
||||
# create a new-style Alignment object
|
||||
del seq_record
|
||||
del align_info
|
||||
del consensus
|
||||
del dictionary
|
||||
del matrix
|
||||
del second_seq
|
||||
del e_freq_table
|
||||
del value
|
||||
del handle
|
||||
alignment = msa.alignment
|
||||
self.assertEqual(len(alignment), 7)
|
||||
seq_record = alignment.sequences[0]
|
||||
@ -1560,11 +878,6 @@ XX
|
||||
self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_443_348")
|
||||
self.assertEqual(seq_record.seq, "GTTGCTTCTGGCGTGGGTGGGGGGG")
|
||||
self.assertEqual(msa.get_alignment_length(), 25)
|
||||
align_info = AlignInfo.SummaryInfo(msa)
|
||||
with self.assertWarns(BiopythonDeprecationWarning):
|
||||
consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
|
||||
self.assertIsInstance(consensus, Seq)
|
||||
self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")
|
||||
self.assertEqual(
|
||||
str(msa),
|
||||
"""\
|
||||
@ -1596,7 +909,9 @@ EAS54_6_R 0 TTGGCAGGCCAAGGCCGATGGATCA 25
|
||||
EAS54_6_R 0 GTTGCTTCTGGCGTGGGTGGGGGGG 25
|
||||
""",
|
||||
)
|
||||
self.assertEqual(motif.counts.calculate_consensus(identity=0.6), consensus)
|
||||
self.assertEqual(
|
||||
motif.counts.calculate_consensus(identity=0.6), "NTNGCNTNNNNNGNNGGNTGGNTCN"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Reference in New Issue
Block a user