diff --git a/Bio/Align/a2m.py b/Bio/Align/a2m.py new file mode 100644 index 000000000..546a35856 --- /dev/null +++ b/Bio/Align/a2m.py @@ -0,0 +1,130 @@ +# Copyright 2022 by Michiel de Hoon. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.Align support for A2M files. + +A2M files are alignment files created by align2model or hmmscore in the SAM +Sequence Alignment and Modeling Software System. +""" +from Bio.Align import Alignment +from Bio.Align import interfaces +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio import BiopythonExperimentalWarning + +import warnings + +warnings.warn( + "Bio.Align.a2m is an experimental module which may undergo " + "significant changes prior to its future official release.", + BiopythonExperimentalWarning, +) + + +class AlignmentWriter(interfaces.AlignmentWriter): + """Alignment file writer for the A2M file format.""" + + def __init__(self, target): + """Create an AlignmentWriter object. + + Arguments: + - target - output stream or file name + + """ + super().__init__(target, mode="w") + + def format_alignment(self, alignment): + """Return a string with the alignment in the A2M file format.""" + if not isinstance(alignment, Alignment): + raise TypeError("Expected an Alignment object") + lines = [] + state = alignment.column_annotations["state"] + for sequence, line in zip(alignment.sequences, alignment): + lines.append(f">{sequence.id} {sequence.description}") + s = "" + for c, m in zip(line, state): + if m == "D": + s += c.upper() + elif m == "I": + if c == "-": + s += "." + else: + s += c.lower() + lines.append(s) + return "\n".join(lines) + + +class AlignmentIterator(interfaces.AlignmentIterator): + """Alignment iterator for files in the A2M file format. + + An A2M file contains one multiple alignment. Matches are represented by + upper case letters and deletions by dashes in alignment columns containing + matches or deletions only. Insertions are represented by lower case letters, + with gaps aligned to the insertion shown as periods. Header lines start + with '>' followed by the name of the sequence, and optionally a description. + """ + + def __init__(self, source): + """Create an AlignmentIterator object. + + Arguments: + - source - input data or file name + + """ + super().__init__(source, mode="t", fmt="A2M") + + def parse(self, stream): + """Parse the next alignment from the stream.""" + names = [] + descriptions = [] + lines = [] + for line in stream: + if line.startswith(">"): + parts = line[1:].rstrip().split(None, 1) + try: + name, description = parts + except ValueError: + name = parts[0] + description = None + names.append(name) + descriptions.append(description) + lines.append("") + else: + lines[-1] += line.strip() + if not lines: + raise ValueError("Empty file.") + state = "" + for c in lines[0]: + if c == "-" or c.isupper(): + state += "D" # Match/deletion state + elif c == "." or c.islower(): + state += "I" # Insertion state + else: + raise Exception("Unexpected letter '%s' in alignment" % c) + for line in lines[1:]: + for c, m in zip(line, state): + if m == "D": # Match/deletion state + assert c == "-" or c.isupper() + elif m == "I": # Insertion state + assert c == "." or c.islower() + else: + raise Exception("Unexpected letter '%s' in alignment" % c) + for i, line in enumerate(lines): + lines[i] = line.upper().replace(".", "-") + coordinates = Alignment.infer_coordinates(lines) + records = [] + for name, description, line in zip(names, descriptions, lines): + line = line.replace("-", "") + sequence = Seq(line) + if description is None: + record = SeqRecord(sequence, name) + else: + record = SeqRecord(sequence, name, description=description) + records.append(record) + alignment = Alignment(records, coordinates) + alignment.column_annotations = {} + alignment.column_annotations["state"] = state + yield alignment diff --git a/Tests/Clustalw/clustalw.a2m b/Tests/Clustalw/clustalw.a2m new file mode 100644 index 000000000..77b0b81cb --- /dev/null +++ b/Tests/Clustalw/clustalw.a2m @@ -0,0 +1,16 @@ +>gi|4959044|gb|AAD34209.1|AF069 +MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNNLLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW +LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQTSENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE +VPTTRAQRRARSRSPEHRRTRARAERSMSPLQPTSEIPRRAPTLEQSSENEPEGSSRTRHHVTLRQQISGPELLGRGLFAASGSRNPSQGTSSSDTGSNS +ESSGSGQRPPTIVLDLQVRRVRPGEYRQRDSIASRTRSRSQAPNNTVTYESERGGFRRTFSRSERAGVRTYVSTIRIPIRRILNTGLSETTSVAIQTMLR +QIMTGFGELSYFMYSDSDSEPSASVSSRNVERVESRNGRGSSGGGNSSGSSSSSSPSPSSSGESSESSSKMFEGSSEGGSSGPSRKDGRHRAPVTFDESG +SLPFFSLAQFFLLNEDDEDQPRGLTKEQIDNLAMRSFGENDALKTCSVCITEYTEGDKLRKLPCSHEFHVHCIDRWLSE.NSTCPICRRAVLSSGNRESV +V +>gi|671626|emb|CAA85685.1| +---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFRVTPQPG-----------------VPPEEAGAAVAAESSTGT--------- +WTTVWTDGLTSLDRYKG-----RCYHIEPVPG-------------------EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP +VAYVKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYECLRGGLDFTKDDENVNSQPFMRWRDRFLFCAEAIYKAQAETGEIKGHYLN +ATAG-----------------------TCEEMIKRAIFARELGVPIVMHDYLTGGFTANTSLAHYCRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKAL +RLSGGDHIHSGTVVGKLEGERDITLGFVDLLRDDFIEKDRSRGIYFTQDWVSLPGVIPVASG-----------------------------GIHVWHMPA +LTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVA-----------VEACVKARNEG---RDLAAEGNAIIREACKWSPElAAACEVWKEIKFEFPAMD-- +- diff --git a/Tests/Clustalw/kalign.a2m b/Tests/Clustalw/kalign.a2m new file mode 100644 index 000000000..3f7c948ef --- /dev/null +++ b/Tests/Clustalw/kalign.a2m @@ -0,0 +1,4 @@ +>Test1seq +GCTGGGGATGGAGAGGGAACAGAGT.T +>AT3G20900 +GCTGGGGATGGAGAGGGAACAGAGTaG diff --git a/Tests/Clustalw/msaprobs.a2m b/Tests/Clustalw/msaprobs.a2m new file mode 100644 index 000000000..522f34e22 --- /dev/null +++ b/Tests/Clustalw/msaprobs.a2m @@ -0,0 +1,32 @@ +>V_Harveyi_PATH +MKNW........IKV....AVAAI.A..LSAA...................TVQAATEVKVGMSGRYFPFTFVK..QDKLQGFEVDMWDEIGKRNDYKIE +YVTANFSGLFGLLETGRIDTISNQITMTDARKAKYLFADPYVVDGAQITVRK.GNDSIQGVEDLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT..G +IEHDVALGRADAFIMDRLSALE.LIKKTG.LPLQLAGEPFE.....TIQNAWPFVDNEKGRKLQAEVNKALAEMRADGTVEKISVKWFGADITK.... +>B_subtilis_YXEM +MKMKkw......TVL....VVAALlA.vLSACgn............g.nssSKEDDNVLHVGATGQSYPFAYKE..NGKLTGFDVEVMEAVAKKIDMKLD +WKLLEFSGLMGELQTGKLDTISNQVAVTDERKETYNFTKPYAYAGTQIVVKK.DNTDIKSVDDLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETqeG +TLKDVAYGRVDAYVNSRTVLIA.QIKKTG.LPLKLAGDPIV.....YEQVAFPFAKDDAHDKLRKKVNKALDELRKDGTLKKLSEKYFNEDITVeqkh +>FLIY_ECOLI +MKLAhlgrqalmGVM....AVALVaG..MSVKsf.........adeg.llnKVKERGTLLVGLEGTYPPFSFQGd.DGKLTGFEVEFAQQLAKHLGVEAS +LKPTKWDGMLASLDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKgNEGTIKTADDLKGKKVGVGLGTNYEEWLRQN--VQGVDVRTYDDdpT +KYQDLRVGRIDAILVDRLAALD.LVKKTN.DTLAVTGEAFS.....RQESGVALRK--GNEDLLKAVNDAIAEMQKDGTLQALSEKWFGADVTK.... +>Deinococcus_radiodurans +MKKSll......SLKlsglLVPSVlAlsLSACss...............psSTLNQGTLKIAMEGTYPPFTSKNe.QGELVGFDVDIAKAVAQKLNLKPE +FVLTEWSGILAGLQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKnNTFNPQSLADLKGKRVGSTLGSNYEKQLI-D--TGDIKIVTYPGapE +ILADLVAGRIDAAYNDRLVVNY.IIND-QkLPVRGAGQIGD.....AAPVGIALKK--GNSALKDQIDKALTEMRSDGTFEKISQKWFGQDVGQ...p +>B_subtilis_GlnH_homo_YCKK +MKKAll......ALF....MVVSIaA..LAACgagndnqskdnakdgdlwaSIKKKGVLTVGTEGTYEPFTYHDkdTDKLTGYDVEVITEVAKRLGLKVD +FKETQWGSMFAGLNSKRFDVVANQVGKTD-REDKYDFSDKYTTSRAVVVTKK.DNNDIKSEADVKGKTSAQSLTSNYNKLAT-N--A-GAKVEGVEGmaQ +ALQMIQQARVDMTYNDKLAVLN.YLKTSGnKNVKIAFETGE.....PQSTYFTFRK--GSGEVVDQVNKALKEMKEDGTLSKISKKWFGEDVSK.... +>YA80_HAEIN +MKKLlf......TTA....LLTGAiA..FSTFs...........hageiadRVEKTKTLLVGTEGTYAPFTFHDk.SGKLTGFDVEVIRKVAEKLGLKVE +FKETQWDAMYAGLNAKRFDVIANQTNPSPERLKKYSFTTPYNYSGGVIVTKS.SDNSIKSFEDLKGRKSAQSATSNWGKDAK-A--A-GAQILVVDGlaQ +SLELIKQGRAEATINDKLAVLD.YFKQHPnSGLKIAYDRGD.....KTPTAFAFLQ--GEDALITKFNQVLEALRQDGTLKQISIEWFGYDITQ.... +>E_coli_GlnH +MKSVl.......KVS....LAALTlA..FAVSsh.........a.......---ADKKLVVATDTAFVPFEFKQ..GDKYVGFDVDLWAAIAKELKLDYE +LKPMDFSGIIPALQTKNVDLALAGITITDERKKAIDFSDGYYKSGLLVMVKAn-NNDVKSVKDLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNidN +AYMELGTNRADAVLHDTPNILY.FIKTAGnGQFKAVGDSLE.....AQQYGIAFPK--GSDELRDKVNGALKTLRENGTYNEIYKKWFGTEP-K.... +>HISJ_E_COLI +MKKLvl......SLS....LV---lA..FSSAta...............a.FAAIPQNIRIGTDPTYAPFESKNs.QGELVGFDIDLAKELCKRINTQCT +FVENPLDALIPSLKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAK.NSDIQPTVESLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGqdN +IYSDLTAGRIDAAFQDEVAASEgFLKQPVgKDYKFGGPSVKdeklfGVGTGMGLRK--EDNELREALNKAFAEMRADGTYEKLAKKYFDFDVYG...g diff --git a/Tests/Clustalw/muscle.a2m b/Tests/Clustalw/muscle.a2m new file mode 100644 index 000000000..c71741776 --- /dev/null +++ b/Tests/Clustalw/muscle.a2m @@ -0,0 +1,24 @@ +>Test1seq +.................................................................AGTTACAATAACTGACGAAGCTAAGTAGGCTACTA +ATTAACGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATC +GTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA +AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTA +ATACTACTTTCTGTATTGATCCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTTCGTTGACGAAGAAGAGCTC +TATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC +GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTT. +>AT3G20900.1-SEQ +atgaacaaagtagcgaggaagaacaaaacatcaggtgaacaaaaaaaaaactcaatccacatcaaAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAA +ATTAAAGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATA +GTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA +CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTA +ATACTACTTTCTGTATTGATCCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTGCGTTGACGAAGAAGAGCTC +TATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC +GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAg +>AT3G20900.1-CDS +.................................................................----------------------------------- +---------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------ATGAACAAAGTAGCGAGGAAGA +A------------------------------CAAAACATC------------------------------------------------------------ +---------------------------------------------------------------------------------------------------- +------------AGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC +GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAg diff --git a/Tests/Clustalw/probcons.a2m b/Tests/Clustalw/probcons.a2m new file mode 100644 index 000000000..08d50c754 --- /dev/null +++ b/Tests/Clustalw/probcons.a2m @@ -0,0 +1,15 @@ +>plas_horvu +D.VLLGANGGVLVFEPNDFSVKAGETITFKNNAGYPHNVVFDEDAVPSG.VD.VSKISQEEYLTAPGETFSVTLTV...PGTYGFYCEPHAGAGMVGKVT +V +>plas_chlre +-.VKLGADSGALEFVPKTLTIKSGETVNFVNNAGFPHNIVFDEDAIPSG.VN.ADAISRDDYLNAPGETYSVKLTA...AGEYGYYCEPHQGAGMVGKII +V +>plas_anava +-.VKLGSDKGLLVFEPAKLTIKPGDTVEFLNNKVPPHNVVFDAALNPAKsADlAKSLSHKQLLMSPGQSTSTTFPAdapAGEYTFYCEPHRGAGMVGKIT +V +>plas_proho +VqIKMGTDKYAPLYEPKALSISAGDTVEFVMNKVGPHNVIFDK--VPAG.ES.APALSNTKLRIAPGSFYSVTLGT...PGTYSFYCTPHRGAGMVGTIT +V +>azup_achcy +VhMLNKGKDGAMVFEPASLKVAPGDTVTFIPTDK-GHNVETIKGMIPDG.AE.A-------FKSKINENYKVTFTA...PGVYGVKCTPHYGMGMVGVVE +V diff --git a/Tests/test_Align_a2m.py b/Tests/test_Align_a2m.py new file mode 100644 index 000000000..5db13cf5f --- /dev/null +++ b/Tests/test_Align_a2m.py @@ -0,0 +1,321 @@ +# Copyright 2006-2014 by Peter Cock. All rights reserved. +# Copyright 2022 by Michiel de Hoon. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Tests for Bio.Align.a2m module.""" +import unittest +import warnings + +from io import StringIO + +from Bio import BiopythonExperimentalWarning + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", BiopythonExperimentalWarning) + from Bio.Align.a2m import AlignmentIterator + from Bio.Align.a2m import AlignmentWriter + + +class TestA2MReadingWriting(unittest.TestCase): + def check_reading_writing(self, path): + alignments = AlignmentIterator(path) + stream = StringIO() + writer = AlignmentWriter(stream) + n = writer.write_file(alignments, mincount=1, maxcount=1) + self.assertEqual(n, 1) + alignments = AlignmentIterator(path) + alignment = next(alignments) + stream.seek(0) + saved_alignments = AlignmentIterator(stream) + saved_alignment = next(saved_alignments) + with self.assertRaises(StopIteration): + next(saved_alignments) + self.assertEqual(len(alignment), len(saved_alignment)) + for i, (sequence, saved_sequence) in enumerate( + zip(alignment.sequences, saved_alignment.sequences) + ): + self.assertEqual(sequence.id, saved_sequence.id) + self.assertEqual(sequence.seq, saved_sequence.seq) + self.assertEqual(alignment[i], saved_alignment[i]) + + def test_clustalw(self): + path = "Clustalw/clustalw.a2m" + with open(path) as stream: + alignments = AlignmentIterator(stream) + alignment = next(alignments) + with self.assertRaises(StopIteration): + next(alignments) + self.assertEqual( + repr(alignment), + "" + % id(alignment), + ) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.sequences[0].id, "gi|4959044|gb|AAD34209.1|AF069") + self.assertEqual(alignment.sequences[1].id, "gi|671626|emb|CAA85685.1|") + self.assertEqual( + alignment.sequences[0].seq, + "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNNLLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDWLNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQTSENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTEVPTTRAQRRARSRSPEHRRTRARAERSMSPLQPTSEIPRRAPTLEQSSENEPEGSSRTRHHVTLRQQISGPELLGRGLFAASGSRNPSQGTSSSDTGSNSESSGSGQRPPTIVLDLQVRRVRPGEYRQRDSIASRTRSRSQAPNNTVTYESERGGFRRTFSRSERAGVRTYVSTIRIPIRRILNTGLSETTSVAIQTMLRQIMTGFGELSYFMYSDSDSEPSASVSSRNVERVESRNGRGSSGGGNSSGSSSSSSPSPSSSGESSESSSKMFEGSSEGGSSGPSRKDGRHRAPVTFDESGSLPFFSLAQFFLLNEDDEDQPRGLTKEQIDNLAMRSFGENDALKTCSVCITEYTEGDKLRKLPCSHEFHVHCIDRWLSENSTCPICRRAVLSSGNRESVV", + ) + self.assertEqual( + alignment.sequences[1].seq, + "MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFRVTPQPGVPPEEAGAAVAAESSTGTWTTVWTDGLTSLDRYKGRCYHIEPVPGEKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIPVAYVKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYECLRGGLDFTKDDENVNSQPFMRWRDRFLFCAEAIYKAQAETGEIKGHYLNATAGTCEEMIKRAIFARELGVPIVMHDYLTGGFTANTSLAHYCRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKALRLSGGDHIHSGTVVGKLEGERDITLGFVDLLRDDFIEKDRSRGIYFTQDWVSLPGVIPVASGGIHVWHMPALTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVAVEACVKARNEGRDLAAEGNAIIREACKWSPELAAACEVWKEIKFEFPAMD", + ) + self.assertEqual( + alignment[0], + "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNNLLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDWLNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQTSENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTEVPTTRAQRRARSRSPEHRRTRARAERSMSPLQPTSEIPRRAPTLEQSSENEPEGSSRTRHHVTLRQQISGPELLGRGLFAASGSRNPSQGTSSSDTGSNSESSGSGQRPPTIVLDLQVRRVRPGEYRQRDSIASRTRSRSQAPNNTVTYESERGGFRRTFSRSERAGVRTYVSTIRIPIRRILNTGLSETTSVAIQTMLRQIMTGFGELSYFMYSDSDSEPSASVSSRNVERVESRNGRGSSGGGNSSGSSSSSSPSPSSSGESSESSSKMFEGSSEGGSSGPSRKDGRHRAPVTFDESGSLPFFSLAQFFLLNEDDEDQPRGLTKEQIDNLAMRSFGENDALKTCSVCITEYTEGDKLRKLPCSHEFHVHCIDRWLSE-NSTCPICRRAVLSSGNRESVV", + ) + self.assertEqual( + alignment[1], + "---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFRVTPQPG-----------------VPPEEAGAAVAAESSTGT---------WTTVWTDGLTSLDRYKG-----RCYHIEPVPG-------------------EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIPVAYVKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYECLRGGLDFTKDDENVNSQPFMRWRDRFLFCAEAIYKAQAETGEIKGHYLNATAG-----------------------TCEEMIKRAIFARELGVPIVMHDYLTGGFTANTSLAHYCRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKALRLSGGDHIHSGTVVGKLEGERDITLGFVDLLRDDFIEKDRSRGIYFTQDWVSLPGVIPVASG-----------------------------GIHVWHMPALTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVA-----------VEACVKARNEG---RDLAAEGNAIIREACKWSPELAAACEVWKEIKFEFPAMD---", + ) + self.assertEqual( + alignment.column_annotations["state"], + "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDIDDDDDDDDDDDDDDDDDDDDD", + ) + self.check_reading_writing(path) + + def test_msaprobs(self): + path = "Clustalw/msaprobs.a2m" + # This example was obtained from + # http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm + # and converted to A2M format. + with open(path) as stream: + alignments = AlignmentIterator(stream) + alignment = next(alignments) + with self.assertRaises(StopIteration): + next(alignments) + self.assertEqual( + repr(alignment), + "" + % id(alignment), + ) + self.assertEqual(len(alignment), 8) + self.assertEqual(alignment.shape, (8, 298)) + self.assertEqual(alignment.sequences[0].id, "V_Harveyi_PATH") + self.assertEqual(alignment.sequences[1].id, "B_subtilis_YXEM") + self.assertEqual(alignment.sequences[2].id, "FLIY_ECOLI") + self.assertEqual(alignment.sequences[3].id, "Deinococcus_radiodurans") + self.assertEqual(alignment.sequences[4].id, "B_subtilis_GlnH_homo_YCKK") + self.assertEqual(alignment.sequences[5].id, "YA80_HAEIN") + self.assertEqual(alignment.sequences[6].id, "E_coli_GlnH") + self.assertEqual(alignment.sequences[7].id, "HISJ_E_COLI") + self.assertEqual( + alignment.sequences[0].seq, + "MKNWIKVAVAAIALSAATVQAATEVKVGMSGRYFPFTFVKQDKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGLLETGRIDTISNQITMTDARKAKYLFADPYVVDGAQITVRKGNDSIQGVEDLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDTGIEHDVALGRADAFIMDRLSALELIKKTGLPLQLAGEPFETIQNAWPFVDNEKGRKLQAEVNKALAEMRADGTVEKISVKWFGADITK", + ) + self.assertEqual( + alignment.sequences[1].seq, + "MKMKKWTVLVVAALLAVLSACGNGNSSSKEDDNVLHVGATGQSYPFAYKENGKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGELQTGKLDTISNQVAVTDERKETYNFTKPYAYAGTQIVVKKDNTDIKSVDDLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDAYVNSRTVLIAQIKKTGLPLKLAGDPIVYEQVAFPFAKDDAHDKLRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH", + ) + self.assertEqual( + alignment.sequences[2].seq, + "MKLAHLGRQALMGVMAVALVAGMSVKSFADEGLLNKVKERGTLLVGLEGTYPPFSFQGDDGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLASLDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTADDLKGKKVGVGLGTNYEEWLRQNVQGVDVRTYDDDPTKYQDLRVGRIDAILVDRLAALDLVKKTNDTLAVTGEAFSRQESGVALRKGNEDLLKAVNDAIAEMQKDGTLQALSEKWFGADVTK", + ) + self.assertEqual( + alignment.sequences[3].seq, + "MKKSLLSLKLSGLLVPSVLALSLSACSSPSSTLNQGTLKIAMEGTYPPFTSKNEQGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAGLQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLADLKGKRVGSTLGSNYEKQLIDTGDIKIVTYPGAPEILADLVAGRIDAAYNDRLVVNYIINDQKLPVRGAGQIGDAAPVGIALKKGNSALKDQIDKALTEMRSDGTFEKISQKWFGQDVGQP", + ) + self.assertEqual( + alignment.sequences[4].seq, + "MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVGTEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAGLNSKRFDVVANQVGKTDREDKYDFSDKYTTSRAVVVTKKDNNDIKSEADVKGKTSAQSLTSNYNKLATNAGAKVEGVEGMAQALQMIQQARVDMTYNDKLAVLNYLKTSGNKNVKIAFETGEPQSTYFTFRKGSGEVVDQVNKALKEMKEDGTLSKISKKWFGEDVSK", + ) + self.assertEqual( + alignment.sequences[5].seq, + "MKKLLFTTALLTGAIAFSTFSHAGEIADRVEKTKTLLVGTEGTYAPFTFHDKSGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAGLNAKRFDVIANQTNPSPERLKKYSFTTPYNYSGGVIVTKSSDNSIKSFEDLKGRKSAQSATSNWGKDAKAAGAQILVVDGLAQSLELIKQGRAEATINDKLAVLDYFKQHPNSGLKIAYDRGDKTPTAFAFLQGEDALITKFNQVLEALRQDGTLKQISIEWFGYDITQ", + ) + self.assertEqual( + alignment.sequences[6].seq, + "MKSVLKVSLAALTLAFAVSSHAADKKLVVATDTAFVPFEFKQGDKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPALQTKNVDLALAGITITDERKKAIDFSDGYYKSGLLVMVKANNNDVKSVKDLDGKVVAVKSGTGSVDYAKANIKTKDLRQFPNIDNAYMELGTNRADAVLHDTPNILYFIKTAGNGQFKAVGDSLEAQQYGIAFPKGSDELRDKVNGALKTLRENGTYNEIYKKWFGTEPK", + ) + self.assertEqual( + alignment.sequences[7].seq, + "MKKLVLSLSLVLAFSSATAAFAAIPQNIRIGTDPTYAPFESKNSQGELVGFDIDLAKELCKRINTQCTFVENPLDALIPSLKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQPTVESLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDAAFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKEDNELREALNKAFAEMRADGTYEKLAKKYFDFDVYGG", + ) + self.assertEqual( + alignment[0], + "MKNW--------IKV----AVAAI-A--LSAA-------------------TVQAATEVKVGMSGRYFPFTFVK--QDKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGLLETGRIDTISNQITMTDARKAKYLFADPYVVDGAQITVRK-GNDSIQGVEDLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT--GIEHDVALGRADAFIMDRLSALE-LIKKTG-LPLQLAGEPFE-----TIQNAWPFVDNEKGRKLQAEVNKALAEMRADGTVEKISVKWFGADITK----", + ) + self.assertEqual( + alignment[1], + "MKMKKW------TVL----VVAALLA-VLSACGN------------G-NSSSKEDDNVLHVGATGQSYPFAYKE--NGKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGELQTGKLDTISNQVAVTDERKETYNFTKPYAYAGTQIVVKK-DNTDIKSVDDLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDAYVNSRTVLIA-QIKKTG-LPLKLAGDPIV-----YEQVAFPFAKDDAHDKLRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH", + ) + self.assertEqual( + alignment[2], + "MKLAHLGRQALMGVM----AVALVAG--MSVKSF---------ADEG-LLNKVKERGTLLVGLEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLASLDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTADDLKGKKVGVGLGTNYEEWLRQN--VQGVDVRTYDDDPTKYQDLRVGRIDAILVDRLAALD-LVKKTN-DTLAVTGEAFS-----RQESGVALRK--GNEDLLKAVNDAIAEMQKDGTLQALSEKWFGADVTK----", + ) + self.assertEqual( + alignment[3], + "MKKSLL------SLKLSGLLVPSVLALSLSACSS---------------PSSTLNQGTLKIAMEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAGLQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLADLKGKRVGSTLGSNYEKQLI-D--TGDIKIVTYPGAPEILADLVAGRIDAAYNDRLVVNY-IIND-QKLPVRGAGQIGD-----AAPVGIALKK--GNSALKDQIDKALTEMRSDGTFEKISQKWFGQDVGQ---P", + ) + self.assertEqual( + alignment[4], + "MKKALL------ALF----MVVSIAA--LAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVGTEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAGLNSKRFDVVANQVGKTD-REDKYDFSDKYTTSRAVVVTKK-DNNDIKSEADVKGKTSAQSLTSNYNKLAT-N--A-GAKVEGVEGMAQALQMIQQARVDMTYNDKLAVLN-YLKTSGNKNVKIAFETGE-----PQSTYFTFRK--GSGEVVDQVNKALKEMKEDGTLSKISKKWFGEDVSK----", + ) + self.assertEqual( + alignment[5], + "MKKLLF------TTA----LLTGAIA--FSTFS-----------HAGEIADRVEKTKTLLVGTEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAGLNAKRFDVIANQTNPSPERLKKYSFTTPYNYSGGVIVTKS-SDNSIKSFEDLKGRKSAQSATSNWGKDAK-A--A-GAQILVVDGLAQSLELIKQGRAEATINDKLAVLD-YFKQHPNSGLKIAYDRGD-----KTPTAFAFLQ--GEDALITKFNQVLEALRQDGTLKQISIEWFGYDITQ----", + ) + self.assertEqual( + alignment[6], + "MKSVL-------KVS----LAALTLA--FAVSSH---------A----------ADKKLVVATDTAFVPFEFKQ--GDKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPALQTKNVDLALAGITITDERKKAIDFSDGYYKSGLLVMVKAN-NNDVKSVKDLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNIDNAYMELGTNRADAVLHDTPNILY-FIKTAGNGQFKAVGDSLE-----AQQYGIAFPK--GSDELRDKVNGALKTLRENGTYNEIYKKWFGTEP-K----", + ) + self.assertEqual( + alignment[7], + "MKKLVL------SLS----LV---LA--FSSATA---------------A-FAAIPQNIRIGTDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPSLKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAK-NSDIQPTVESLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDAAFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRK--EDNELREALNKAFAEMRADGTYEKLAKKYFDFDVYG---G", + ) + self.assertEqual( + alignment.column_annotations["state"], + "DDDDIIIIIIIIDDDIIIIDDDDDIDIIDDDDIIIIIIIIIIIIIIIIIIIDDDDDDDDDDDDDDDDDDDDDDDIIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDIIDDDDDDDDDDDDDDDDDDDDDDDIDDDDDDIDDDDDDDDDDDIIIIIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDIIII", + ) + self.check_reading_writing(path) + + def test_muscle(self): + path = "Clustalw/muscle.a2m" + with open(path) as stream: + alignments = AlignmentIterator(stream) + alignment = next(alignments) + with self.assertRaises(StopIteration): + next(alignments) + self.assertEqual( + repr(alignment), + "" + % id(alignment), + ) + self.assertEqual(len(alignment), 3) + self.assertEqual(alignment.sequences[0].id, "Test1seq") + self.assertEqual(alignment.sequences[1].id, "AT3G20900.1-SEQ") + self.assertEqual(alignment.sequences[2].id, "AT3G20900.1-CDS") + self.assertEqual( + alignment.sequences[0].seq, + "AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACAAAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGATCCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTTCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTT", + ) + self.assertEqual( + alignment.sequences[1].seq, + "ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCACATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAACAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGATCCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTGCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAG", + ) + self.assertEqual( + alignment.sequences[2].seq, + "ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAG", + ) + self.assertEqual( + alignment[0], + "-----------------------------------------------------------------AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACAAAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGATCCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTTCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTT-", + ) + self.assertEqual( + alignment[1], + "ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCACATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAATACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATCAATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGACTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAACAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATTGTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGATCCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGTGCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAG", + ) + self.assertEqual( + alignment[2], + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ATGAACAAAGTAGCGAGGAAGAA------------------------------CAAAACATC----------------------------------------------------------------------------------------------------------------------------------------------------------------------------AGCAAAGAAAACGATCTGTCTCCGTCGTAACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCCGGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCTGCTGGGGATGGAGAGGGAACAGAGTAG", + ) + self.assertEqual( + alignment.column_annotations["state"], + "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDI", + ) + self.check_reading_writing(path) + + def test_kalign(self): + path = "Clustalw/kalign.a2m" + with open(path) as stream: + alignments = AlignmentIterator(stream) + alignment = next(alignments) + with self.assertRaises(StopIteration): + next(alignments) + self.assertEqual( + repr(alignment), + "" + % id(alignment), + ) + self.assertEqual(len(alignment), 2) + self.assertEqual(alignment.sequences[0].id, "Test1seq") + self.assertEqual(alignment.sequences[1].id, "AT3G20900") + self.assertEqual(alignment.sequences[0].seq, "GCTGGGGATGGAGAGGGAACAGAGTT") + self.assertEqual(alignment.sequences[1].seq, "GCTGGGGATGGAGAGGGAACAGAGTAG") + self.assertEqual(alignment[0], "GCTGGGGATGGAGAGGGAACAGAGT-T") + self.assertEqual(alignment[1], "GCTGGGGATGGAGAGGGAACAGAGTAG") + self.assertEqual( + alignment.column_annotations["state"], + "DDDDDDDDDDDDDDDDDDDDDDDDDID", + ) + self.check_reading_writing(path) + + def test_probcons(self): + path = "Clustalw/probcons.a2m" + # example taken from the PROBCONS documentation, + # and converted to aligned A2M format. + with open(path) as stream: + alignments = AlignmentIterator(stream) + alignment = next(alignments) + with self.assertRaises(StopIteration): + next(alignments) + self.assertEqual( + repr(alignment), + "" + % id(alignment), + ) + self.assertEqual(len(alignment), 5) + self.assertEqual(alignment.sequences[0].id, "plas_horvu") + self.assertEqual(alignment.sequences[1].id, "plas_chlre") + self.assertEqual(alignment.sequences[2].id, "plas_anava") + self.assertEqual(alignment.sequences[3].id, "plas_proho") + self.assertEqual(alignment.sequences[4].id, "azup_achcy") + self.assertEqual( + alignment.sequences[0].seq, + "DVLLGANGGVLVFEPNDFSVKAGETITFKNNAGYPHNVVFDEDAVPSGVDVSKISQEEYLTAPGETFSVTLTVPGTYGFYCEPHAGAGMVGKVTV", + ) + self.assertEqual( + alignment.sequences[1].seq, + "VKLGADSGALEFVPKTLTIKSGETVNFVNNAGFPHNIVFDEDAIPSGVNADAISRDDYLNAPGETYSVKLTAAGEYGYYCEPHQGAGMVGKIIV", + ) + self.assertEqual( + alignment.sequences[2].seq, + "VKLGSDKGLLVFEPAKLTIKPGDTVEFLNNKVPPHNVVFDAALNPAKSADLAKSLSHKQLLMSPGQSTSTTFPADAPAGEYTFYCEPHRGAGMVGKITV", + ) + self.assertEqual( + alignment.sequences[3].seq, + "VQIKMGTDKYAPLYEPKALSISAGDTVEFVMNKVGPHNVIFDKVPAGESAPALSNTKLRIAPGSFYSVTLGTPGTYSFYCTPHRGAGMVGTITV", + ) + self.assertEqual( + alignment.sequences[4].seq, + "VHMLNKGKDGAMVFEPASLKVAPGDTVTFIPTDKGHNVETIKGMIPDGAEAFKSKINENYKVTFTAPGVYGVKCTPHYGMGMVGVVEV", + ) + self.assertEqual( + alignment[0], + "D-VLLGANGGVLVFEPNDFSVKAGETITFKNNAGYPHNVVFDEDAVPSG-VD-VSKISQEEYLTAPGETFSVTLTV---PGTYGFYCEPHAGAGMVGKVTV", + ) + self.assertEqual( + alignment[1], + "--VKLGADSGALEFVPKTLTIKSGETVNFVNNAGFPHNIVFDEDAIPSG-VN-ADAISRDDYLNAPGETYSVKLTA---AGEYGYYCEPHQGAGMVGKIIV", + ) + self.assertEqual( + alignment[2], + "--VKLGSDKGLLVFEPAKLTIKPGDTVEFLNNKVPPHNVVFDAALNPAKSADLAKSLSHKQLLMSPGQSTSTTFPADAPAGEYTFYCEPHRGAGMVGKITV", + ) + self.assertEqual( + alignment[3], + "VQIKMGTDKYAPLYEPKALSISAGDTVEFVMNKVGPHNVIFDK--VPAG-ES-APALSNTKLRIAPGSFYSVTLGT---PGTYSFYCTPHRGAGMVGTITV", + ) + self.assertEqual( + alignment[4], + "VHMLNKGKDGAMVFEPASLKVAPGDTVTFIPTDK-GHNVETIKGMIPDG-AE-A-------FKSKINENYKVTFTA---PGVYGVKCTPHYGMGMVGVVEV", + ) + self.assertEqual( + alignment.column_annotations["state"], + "DIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDIDDIDDDDDDDDDDDDDDDDDDDDDDDIIIDDDDDDDDDDDDDDDDDDDDDD", + ) + self.check_reading_writing(path) + + def test_empty(self): + """Checking empty file.""" + stream = StringIO() + alignments = AlignmentIterator(stream) + with self.assertRaises(ValueError): + next(alignments) + + +if __name__ == "__main__": + runner = unittest.TextTestRunner(verbosity=2) + unittest.main(testRunner=runner)