Files
biopython/Tests/test_AlignIO_ClustalIO.py
ruff-isort de0bb21fb3 Apply isort (forcing single lines, not sorting by type) via ruff
$ ruff check --fix --select=I \
  --config=lint.isort.force-single-line=true \
  --config=lint.isort.order-by-type=false \
  BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py

Using ruff version 0.4.10
2024-06-26 15:31:39 +09:00

291 lines
14 KiB
Python

# Copyright 2006-2014 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for Bio.AlignIO.ClustalIO module."""
import unittest
from io import StringIO
from Bio.AlignIO.ClustalIO import ClustalIterator
from Bio.AlignIO.ClustalIO import ClustalWriter
# This is a truncated version of the example in Tests/clustalw.aln
# Notice the inclusion of sequence numbers (right hand side)
# Two-record protein alignment with a "CLUSTAL W (1.81)" header; the tests
# below expect the parsed version to be "1.81" and the alignment to span
# 210 columns.
aln_example1 = """\
CLUSTAL W (1.81) multiple sequence alignment
gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
* *: :: :. :* : :. : . :* :: .
gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
: ** **:... *.*** ..
gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
.:* * *: .* :* : :* .*
gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
*::. . .:: :*..* :* .* .. . : . :
gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
*. .:: : .
""" # noqa : W291
# This example is a truncated version of the dataset used here:
# http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm
# with the last record repeated twice (deliberate torture test)
# The tests below expect nine records, version "1.83" and an alignment
# length of 137 columns from this example.
aln_example2 = """\
CLUSTAL X (1.83) multiple sequence alignment
V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
: . : :.
V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
** .: *::::. : :. . ..:
V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
*.: . * . * *: :
""" # noqa : W291
# Three-record nucleotide alignment whose header reads "CLUSTAL 2.0.9", i.e.
# the version is not wrapped in parentheses as in the 1.8x examples above.
# test_three expects the parser to report the version as "2.0.9".
aln_example3 = """\
CLUSTAL 2.0.9 multiple sequence alignment
Test1seq ------------------------------------------------------------
AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-CDS ------------------------------------------------------ATGAAC
*
Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
* *** ***** * * ** ****************************
Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
******* ** * **** ***************************************
Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
**************************************** *******************
Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
*************************
""" # noqa : W291
# Minimal two-record alignment whose first line is a Kalign header rather
# than a CLUSTAL one; test_kalign_header checks that it still parses.
aln_example4 = """\
Kalign (2.0) alignment in ClustalW format
Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
"""
# Two-record protein alignment whose first line is a "Biopython 1.80.dev0"
# header; test_biopython_header expects it to parse with the version
# reported as "1.80.dev0".
aln_example5 = """\
Biopython 1.80.dev0 multiple sequence alignment
gi|4959044|gb|AAD34209.1|AF069 ------------MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNL
gi|671626|emb|CAA85685.1| MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFRVTPQPGVPP
gi|4959044|gb|AAD34209.1|AF069 SEEDYRLMRDNNLLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESS
gi|671626|emb|CAA85685.1| -EEAGAAVAAESSTGTWTTVWTDGLTS-LDRYK-GRCYHI--EPVPGEKD
gi|4959044|gb|AAD34209.1|AF069 DDVTNSDSIIDWLNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSL
gi|671626|emb|CAA85685.1| QCICYVAYPLDLFEEGSVTNMFT-SIV-GNVFGFKALRALRLE-DLRIPV
gi|4959044|gb|AAD34209.1|AF069 EINVNRNNGSQTSENESEPSTRRLSVENMESSSQRQMENSASESASARPS
gi|671626|emb|CAA85685.1| AY-VKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYE
gi|4959044|gb|AAD34209.1|AF069 RAERNSTEAVTEVPTTRAQRRARSRSPEHRRTRARAERSMSPLQPTSEIP
gi|671626|emb|CAA85685.1| CL-RGGLDFTKDDENVNSQPFMRWRD---RFLFC-AEAIYKAQAETGEIK
gi|4959044|gb|AAD34209.1|AF069 RRAPTLEQSSENEPEGSSRTRHHVTLRQQISGPELLGRGLFAASGSRNPS
gi|671626|emb|CAA85685.1| GHYLNATAGTC-E-EMIKRAIFARELGVPIVMHDYLTGG-FTANTSLAHY
gi|4959044|gb|AAD34209.1|AF069 QGTSSSDTGSNSESSGSGQRPPTIVLDLQVRRVRPGEYRQRDSIASRTRS
gi|671626|emb|CAA85685.1| CRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKALRLSGG-DHIHSGTVV
gi|4959044|gb|AAD34209.1|AF069 RSQAPNNTVTYESERGGFRRTFSRSERAGVRTYVSTIRIPIRRILNTGLS
gi|671626|emb|CAA85685.1| GKLEGERDITLGFVDLL-RDDFIEKDRSRGI-YF-TQDWVSLPGVIPVAS
gi|4959044|gb|AAD34209.1|AF069 ETTSVAIQTMLRQIMTGFGELSYFMYSDSDSEPSAS--VSSRN-VER-VE
gi|671626|emb|CAA85685.1| GGIHVWHMPALTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVAVEACVK
gi|4959044|gb|AAD34209.1|AF069 SRN-GRGSSGGGNSSGSSSSS-SPSPSSSGESSESSSKMFEGSSEGGSSG
gi|671626|emb|CAA85685.1| ARNEGRDLAAEGNAIIREACKWSPELAAACEVWKEIKFEFPAMD------
gi|4959044|gb|AAD34209.1|AF069 PSRKDGRHRAPVTFDESGSLPFFSLAQFFLLNEDDEDQPRGLTKEQIDNL
gi|671626|emb|CAA85685.1| --------------------------------------------------
gi|4959044|gb|AAD34209.1|AF069 AMRSFGENDALKTCSVCITEYTEGDKLRKLPCSHEFHVHCIDRWLSENST
gi|671626|emb|CAA85685.1| --------------------------------------------------
gi|4959044|gb|AAD34209.1|AF069 CPICRRAVLSSGNRESVV
gi|671626|emb|CAA85685.1| ------------------
"""
class TestClustalIO(unittest.TestCase):
    """Parsing and write/read round-trip tests for Bio.AlignIO.ClustalIO."""

    def test_one(self):
        """Checking the CLUSTAL W 1.81 example with two records."""
        alignments = list(ClustalIterator(StringIO(aln_example1)))
        self.assertEqual(1, len(alignments))
        alignment = alignments[0]
        self.assertEqual(alignment._version, "1.81")
        self.assertEqual(2, len(alignment))
        self.assertEqual(alignment[0].id, "gi|4959044|gb|AAD34209.1|AF069")
        self.assertEqual(alignment[1].id, "gi|671626|emb|CAA85685.1|")
        self.assertEqual(
            alignment[0].seq,
            "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN"
            "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW"
            "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT"
            "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE"
            "VPTTRAQRRA",
        )

    def test_two(self):
        """Checking the CLUSTAL X 1.83 example with a repeated record."""
        alignments = list(ClustalIterator(StringIO(aln_example2)))
        self.assertEqual(1, len(alignments))
        alignment = alignments[0]
        self.assertEqual(alignment._version, "1.83")
        self.assertEqual(9, len(alignment))
        self.assertEqual(alignment[-1].id, "HISJ_E_COLI")
        self.assertEqual(
            alignment[-1].seq,
            "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG"
            "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS"
            "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV",
        )

    def test_cat_one_two(self):
        """Checking two concatenated alignments in one handle."""
        alignments = list(ClustalIterator(StringIO(aln_example2 + aln_example1)))
        self.assertEqual(2, len(alignments))
        self.assertEqual(9, len(alignments[0]))
        self.assertEqual(137, alignments[0].get_alignment_length())
        self.assertEqual(2, len(alignments[1]))
        self.assertEqual(210, alignments[1].get_alignment_length())

    def test_empty(self):
        """Checking empty file."""
        self.assertEqual(0, len(list(ClustalIterator(StringIO("")))))

    def test_write_read(self):
        """Checking write/read."""
        alignments = (
            list(ClustalIterator(StringIO(aln_example1)))
            + list(ClustalIterator(StringIO(aln_example2))) * 2
        )
        handle = StringIO()
        self.assertEqual(3, ClustalWriter(handle).write_file(alignments))
        handle.seek(0)
        # Materialise what was read back and check the count first: asserting
        # only inside a loop over the iterator would pass vacuously if the
        # reader yielded fewer alignments than were written (or none at all).
        reloaded = list(ClustalIterator(handle))
        self.assertEqual(len(alignments), len(reloaded))
        for expected, observed in zip(alignments, reloaded):
            self.assertEqual(
                observed.get_alignment_length(), expected.get_alignment_length()
            )

    def test_write_read_single(self):
        """Testing write/read when there is only one sequence."""
        alignment = next(ClustalIterator(StringIO(aln_example1)))
        # Now take just the first row as a new alignment:
        alignment = alignment[0:1]
        handle = StringIO()
        ClustalWriter(handle).write_file([alignment])
        handle.seek(0)
        # Require exactly one alignment back so the checks below cannot be
        # skipped by an empty iterator.
        reloaded = list(ClustalIterator(handle))
        self.assertEqual(1, len(reloaded))
        observed = reloaded[0]
        self.assertEqual(
            observed.get_alignment_length(), alignment.get_alignment_length()
        )
        self.assertEqual(len(observed), 1)

    def test_three(self):
        """Checking the CLUSTAL 2.0.9 example (version without parentheses)."""
        alignments = list(ClustalIterator(StringIO(aln_example3)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "2.0.9")

    def test_kalign_header(self):
        """Make sure we can parse the Kalign header."""
        alignment = next(ClustalIterator(StringIO(aln_example4)))
        self.assertEqual(2, len(alignment))

    def test_biopython_header(self):
        """Make sure we can parse the Biopython header."""
        alignment = next(ClustalIterator(StringIO(aln_example5)))
        self.assertEqual(2, len(alignment))
        self.assertEqual(alignment._version, "1.80.dev0")
if __name__ == "__main__":
    # Run this module's tests directly, reporting each test verbosely.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))