Files
biopython/Tests/test_align.py
Martin Thoma d2e7b6baa5 Remove executable flags and shebang (#2826)
Remove executable flags and hashbangs from tests; make run_tests.py executable
2020-05-26 13:09:54 +01:00

956 lines
28 KiB
Python

# Copyright 2000-2001 by Brad Chapman. All rights reserved.
# Revisions copyright 2007-2003 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test alignment stuff.
Right now we've got tests for:
- Reading and Writing clustal format
- Reading and Writing fasta format
- Converting between formats
"""
# standard library
import os
import unittest
from io import StringIO
# biopython
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
opuntia_clustal = """\
CLUSTAL X (1.81) multiple sequence alignment
gi|6273285|gb|AF191659.1|AF191 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273284|gb|AF191658.1|AF191 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273287|gb|AF191661.1|AF191 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273286|gb|AF191660.1|AF191 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273290|gb|AF191664.1|AF191 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273289|gb|AF191663.1|AF191 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
gi|6273291|gb|AF191665.1|AF191 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAA
******* **** *************************************
gi|6273285|gb|AF191659.1|AF191 TATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATA
gi|6273284|gb|AF191658.1|AF191 TATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATA
gi|6273287|gb|AF191661.1|AF191 TATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATA
gi|6273286|gb|AF191660.1|AF191 TATATA----------ATATATTTATAATTTCCTTATATATCCAAATATA
gi|6273290|gb|AF191664.1|AF191 TATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATA
gi|6273289|gb|AF191663.1|AF191 TATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATA
gi|6273291|gb|AF191665.1|AF191 TATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATA
****** ******** **** ********* *********
gi|6273285|gb|AF191659.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGT
gi|6273284|gb|AF191658.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGT
gi|6273287|gb|AF191661.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGT
gi|6273286|gb|AF191660.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGT
gi|6273290|gb|AF191664.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGT
gi|6273289|gb|AF191663.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTAT
gi|6273291|gb|AF191665.1|AF191 AAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGT
************************************ *********** *
gi|6273285|gb|AF191659.1|AF191 ACCAGA
gi|6273284|gb|AF191658.1|AF191 ACCAGA
gi|6273287|gb|AF191661.1|AF191 ACCAGA
gi|6273286|gb|AF191660.1|AF191 ACCAGA
gi|6273290|gb|AF191664.1|AF191 ACCAGA
gi|6273289|gb|AF191663.1|AF191 ACCAGA
gi|6273291|gb|AF191665.1|AF191 ACCAGA
******
""" # noqa : W291
cw02_clustal = """\
CLUSTAL X (1.81) multiple sequence alignment
gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN
gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR
* *: :: :. :* : :. : . :* :: .
gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW
gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT---------
: ** **:... *.*** ..
gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT
gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------
.:* * *: .* :* : :* .*
gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE
gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP
*::. . .:: :*..* :* .* .. . : . :
gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRARSRSPEHRRTRARAERSMSPLQPTSEIPRRAPTLEQSSEN
gi|671626|emb|CAA85685.1| VAYVKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYE
*. .:: : . .* . : *.: ..:: * . :: :
gi|4959044|gb|AAD34209.1|AF069 EPEGSSRTRHHVTLRQQISGPELLGRGLFAASGSRNPSQGTSSSDTGSNS
gi|671626|emb|CAA85685.1| CLRGGLDFTKDDENVNSQPFMRWRDRFLFCAEAIYKAQAETGEIKGHYLN
.*. :. :. . . .* **.*.. :.. *.. . .
gi|4959044|gb|AAD34209.1|AF069 ESSGSGQRPPTIVLDLQVRRVRPGEYRQRDSIASRTRSRSQAPNNTVTYE
gi|671626|emb|CAA85685.1| ATAG-----------------------TCEEMIKRAIFARELGVPIVMHD
::* :.: .*: : * ::
gi|4959044|gb|AAD34209.1|AF069 SERGGFRRTFSRSERAGVRTYVSTIRIPIRRILNTGLSETTSVAIQTMLR
gi|671626|emb|CAA85685.1| YLTGGFTANTSLAHYCRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKAL
*** . * :. . . : *: .:: ::: .. . : :
gi|4959044|gb|AAD34209.1|AF069 QIMTGFGELSYFMYSDSDSEPSASVSSRNVERVESRNGRGSSGGGNSSGS
gi|671626|emb|CAA85685.1| RLSGGDHIHSGTVVGKLEGERDITLGFVDLLRDDFIEKDRSRGIYFTQDW
:: * * : .. :.* . ::. :: * : : * * :..
gi|4959044|gb|AAD34209.1|AF069 SSSSSPSPSSSGESSESSSKMFEGSSEGGSSGPSRKDGRHRAPVTFDESG
gi|671626|emb|CAA85685.1| VSLPGVIPVASG-----------------------------GIHVWHMPA
* .. * :** . .:. ..
gi|4959044|gb|AAD34209.1|AF069 SLPFFSLAQFFLLNEDDEDQPRGLTKEQIDNLAMRSFGENDALKTCSVCI
gi|671626|emb|CAA85685.1| LTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVA-----------VEACV
:*. ..: :. . .:* * : : * . ..*:
gi|4959044|gb|AAD34209.1|AF069 TEYTEGDKLRKLPCSHEFHVHCIDRWLSE-NSTCPICRRAVLSSGNRESV
gi|671626|emb|CAA85685.1| KARNEG---RDLAAEGNAIIREACKWSPELAAACEVWKEIKFEFPAMD--
. .** *.*... : :: :* .* ::* : :. :. :
gi|4959044|gb|AAD34209.1|AF069 V
gi|671626|emb|CAA85685.1| -
""" # noqa : W291
opuntia_fasta = """\
>gi|6273285|gb|AF191659.1|AF191
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----
------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCCATTGATTTAGTGTACCAGA
>gi|6273284|gb|AF191658.1|AF191
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--
------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTGTACCAGA
>gi|6273287|gb|AF191661.1|AF191
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----
------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTGTACCAGA
>gi|6273286|gb|AF191660.1|AF191
TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----
------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTGTACCAGA
>gi|6273290|gb|AF191664.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA
------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTGTACCAGA
>gi|6273289|gb|AF191663.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA
------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTATACCAGA
>gi|6273291|gb|AF191665.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA
TATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGA
TGAATATCAAAGAATCTATTGATTTAGTGTACCAGA
"""
class TestBasics(unittest.TestCase):
def test_empty_alignment(self):
"""Very simple tests on an empty alignment."""
alignment = MultipleSeqAlignment([])
self.assertEqual(alignment.get_alignment_length(), 0)
self.assertEqual(len(alignment), 0)
def test_basic_alignment(self):
"""Basic tests on a simple alignment of three sequences."""
alignment = MultipleSeqAlignment([])
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.append(SeqRecord(Seq(letters), id="mixed"))
alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
self.assertEqual(alignment.get_alignment_length(), 26)
self.assertEqual(len(alignment), 3)
self.assertEqual(str(alignment[0].seq), letters)
self.assertEqual(str(alignment[1].seq), letters.lower())
self.assertEqual(str(alignment[2].seq), letters.upper())
self.assertEqual(alignment[0].id, "mixed")
self.assertEqual(alignment[1].id, "lower")
self.assertEqual(alignment[2].id, "upper")
for (col, letter) in enumerate(letters):
self.assertEqual(alignment[:, col], letter + letter.lower() + letter.upper())
# Check row extractions:
self.assertEqual(alignment[0].id, "mixed")
self.assertEqual(alignment[-1].id, "upper")
# Check sub-alignment extraction by row slicing:
self.assertIsInstance(alignment[::-1], MultipleSeqAlignment)
self.assertEqual(alignment[::-1][0].id, "upper")
self.assertEqual(alignment[::-1][2].id, "mixed")
class TestReading(unittest.TestCase):
def test_read_clustal1(self):
"""Parse an alignment file and get an aligment object."""
path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
alignment = AlignIO.read(path, "clustal")
self.assertEqual(format(alignment, "clustal"), opuntia_clustal)
def test_read_clustal2(self):
"""Parse an alignment file and get an aligment object."""
path = os.path.join(os.curdir, "Clustalw", "cw02.aln")
alignment = AlignIO.read(path, "clustal")
self.assertEqual(format(alignment, "clustal"), cw02_clustal)
def test_read_write_clustal(self):
"""Test the base alignment stuff."""
path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
alignment = AlignIO.read(path, "clustal", alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
self.assertEqual(len(alignment), 7)
seq_record = alignment[0]
self.assertEqual(seq_record.description, "gi|6273285|gb|AF191659.1|AF191")
self.assertEqual(seq_record.seq, Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA"))
seq_record = alignment[1]
self.assertEqual(seq_record.description, "gi|6273284|gb|AF191658.1|AF191")
self.assertEqual(seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
seq_record = alignment[2]
self.assertEqual(seq_record.description, "gi|6273287|gb|AF191661.1|AF191")
self.assertEqual(seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
seq_record = alignment[3]
self.assertEqual(seq_record.description, "gi|6273286|gb|AF191660.1|AF191")
self.assertEqual(seq_record.seq, "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
seq_record = alignment[4]
self.assertEqual(seq_record.description, "gi|6273290|gb|AF191664.1|AF191")
self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
seq_record = alignment[5]
self.assertEqual(seq_record.description, "gi|6273289|gb|AF191663.1|AF191")
self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA")
seq_record = alignment[6]
self.assertEqual(seq_record.description, "gi|6273291|gb|AF191665.1|AF191")
self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
self.assertEqual(alignment.get_alignment_length(), 156)
align_info = AlignInfo.SummaryInfo(alignment)
consensus = align_info.dumb_consensus()
self.assertIsInstance(consensus, Seq)
self.assertEqual(consensus, "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
dictionary = align_info.replacement_dictionary(["N"])
self.assertEqual(len(dictionary), 16)
self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
matrix = align_info.pos_specific_score_matrix(consensus, ["N"])
self.assertEqual(str(matrix), """\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
X 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
T 0.0 0.0 0.0 3.0
A 3.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
X 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
""")
matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"])
self.assertEqual(str(matrix), """\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
X 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
T 0.0 0.0 0.0 3.0
A 3.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
T 0.0 0.0 0.0 1.0
A 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
X 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
""")
second_seq = alignment[1].seq
matrix = align_info.pos_specific_score_matrix(second_seq, ["N"])
self.assertEqual(str(matrix), """\
A C G T
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 1.0 0.0 0.0 6.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 4.0 0.0 3.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
G 0.0 0.0 7.0 0.0
C 0.0 7.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 4.0
A 4.0 0.0 0.0 0.0
- 0.0 0.0 0.0 3.0
- 3.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
- 0.0 0.0 0.0 1.0
- 1.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
C 1.0 6.0 0.0 0.0
A 6.0 0.0 0.0 1.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 3.0 0.0 4.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 2.0 0.0 5.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
C 0.0 7.0 0.0 0.0
T 0.0 1.0 0.0 6.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
T 0.0 0.0 0.0 7.0
G 1.0 0.0 6.0 0.0
T 0.0 0.0 0.0 7.0
A 7.0 0.0 0.0 0.0
C 0.0 7.0 0.0 0.0
C 0.0 7.0 0.0 0.0
A 7.0 0.0 0.0 0.0
G 0.0 0.0 7.0 0.0
A 7.0 0.0 0.0 0.0
""")
value = align_info.information_content(5, 50, chars_to_ignore=["N"])
self.assertAlmostEqual(value, 88.42, places=2)
value = align_info.information_content(chars_to_ignore=["N"])
self.assertAlmostEqual(value, 287.55, places=2)
e_freq_table = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
value = align_info.information_content(e_freq_table=e_freq_table,
chars_to_ignore=["N"])
self.assertAlmostEqual(value, 287.55, places=2)
self.assertEqual(align_info.get_column(1), "AAAAAAA")
self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2)
self.assertEqual(align_info.get_column(7), "TTTATTT")
self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2)
handle = StringIO()
AlignInfo.print_info_content(align_info, fout=handle)
self.assertEqual(handle.getvalue(), """\
0 T 2.000
1 A 2.000
2 T 2.000
3 A 2.000
4 C 2.000
5 A 2.000
6 T 2.000
7 T 1.408
8 A 2.000
9 A 2.000
10 A 2.000
11 G 2.000
12 A 1.015
13 A 2.000
14 G 2.000
15 G 2.000
16 G 2.000
17 G 2.000
18 G 2.000
19 A 2.000
20 T 2.000
21 G 2.000
22 C 2.000
23 G 2.000
24 G 2.000
25 A 2.000
26 T 2.000
27 A 2.000
28 A 2.000
29 A 2.000
30 T 2.000
31 G 2.000
32 G 2.000
33 A 2.000
34 A 2.000
35 A 2.000
36 G 2.000
37 G 2.000
38 C 2.000
39 G 2.000
40 A 2.000
41 A 2.000
42 A 2.000
43 G 2.000
44 A 2.000
45 A 2.000
46 A 2.000
47 G 2.000
48 A 2.000
49 A 2.000
50 T 2.000
51 A 2.000
52 T 2.000
53 A 2.000
54 T 2.000
55 A 2.000
56 - 0.682
57 - 0.682
58 - 0.333
59 - 0.333
60 - -0.115
61 - -0.115
62 - -0.115
63 - -0.115
64 - -0.115
65 - -0.115
66 A 2.000
67 T 2.000
68 A 2.000
69 T 2.000
70 A 2.000
71 T 2.000
72 T 2.000
73 T 2.000
74 C 1.408
75 A 1.408
76 A 2.000
77 A 2.000
78 T 2.000
79 T 2.000
80 T 1.015
81 C 2.000
82 C 2.000
83 T 2.000
84 T 2.000
85 A 2.000
86 T 2.000
87 A 2.000
88 T 2.000
89 A 2.000
90 C 1.137
91 C 2.000
92 C 2.000
93 A 2.000
94 A 2.000
95 A 2.000
96 T 2.000
97 A 2.000
98 T 2.000
99 A 2.000
100 A 2.000
101 A 2.000
102 A 2.000
103 A 2.000
104 T 2.000
105 A 2.000
106 T 2.000
107 C 2.000
108 T 2.000
109 A 2.000
110 A 2.000
111 T 2.000
112 A 2.000
113 A 2.000
114 A 2.000
115 T 2.000
116 T 2.000
117 A 2.000
118 G 2.000
119 A 2.000
120 T 2.000
121 G 2.000
122 A 2.000
123 A 2.000
124 T 2.000
125 A 2.000
126 T 2.000
127 C 2.000
128 A 2.000
129 A 2.000
130 A 2.000
131 G 2.000
132 A 2.000
133 A 2.000
134 T 2.000
135 C 2.000
136 C 1.408
137 A 2.000
138 T 2.000
139 T 2.000
140 G 2.000
141 A 2.000
142 T 2.000
143 T 2.000
144 T 2.000
145 A 2.000
146 G 2.000
147 T 2.000
148 G 1.408
149 T 2.000
150 A 2.000
151 C 2.000
152 C 2.000
153 A 2.000
154 G 2.000
155 A 2.000
""")
def test_read_fasta(self):
path = os.path.join(os.curdir, "Quality", "example.fasta")
alignment = AlignIO.read(path, "fasta",
alphabet=Alphabet.Gapped(IUPAC.ambiguous_dna))
self.assertEqual(len(alignment), 3)
seq_record = alignment[0]
self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_413_324")
self.assertEqual(seq_record.seq, "CCCTTCTTGTCTTCAGCGTTTCTCC")
seq_record = alignment[1]
self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_540_792")
self.assertEqual(seq_record.seq, "TTGGCAGGCCAAGGCCGATGGATCA")
seq_record = alignment[2]
self.assertEqual(seq_record.description, "EAS54_6_R1_2_1_443_348")
self.assertEqual(seq_record.seq, "GTTGCTTCTGGCGTGGGTGGGGGGG")
self.assertEqual(alignment.get_alignment_length(), 25)
align_info = AlignInfo.SummaryInfo(alignment)
consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
self.assertIsInstance(consensus, Seq)
self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")
self.assertEqual(str(alignment), """\
Alignment with 3 rows and 25 columns
CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324
TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792
GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348""")
def test_format_conversion(self):
"""Parse the alignment file and get an aligment object."""
path = os.path.join(os.curdir, "Clustalw", "opuntia.aln")
alignment = AlignIO.read(path, "clustal")
self.assertEqual(format(alignment, "fasta"), opuntia_fasta)
self.assertEqual(format(alignment, "clustal"), opuntia_clustal)
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)