Add test cases and sample files for the new parsers.

Do some basic checking for the Xdna and SnapGene parsers in the general SeqIO test. Also do more specific testing in new dedicated test modules for each format. Include ad-hoc generated sample Xdna and SnapGene files. For GCK, the test cases are dependent on the presence of files from the Drosophile Gateway Vectors Collection, which we cannot distribute with Biopython. The tests are automatically skipped unless the files have been manually added to the Tests/Gck directory.
2025-10-20 13:43:47 +08:00 · 2019-08-06 14:05:44 +01:00
parent 7ceb4ed133
commit 5026c51b22
9 changed files with 685 additions and 0 deletions
--- a/Tests/SnapGene/sample-d.dna
+++ b/Tests/SnapGene/sample-d.dna
--- a/Tests/SnapGene/sample-e.dna
+++ b/Tests/SnapGene/sample-e.dna
--- a/Tests/Xdna/sample-a.xdna
+++ b/Tests/Xdna/sample-a.xdna
--- a/Tests/Xdna/sample-b.xdna
+++ b/Tests/Xdna/sample-b.xdna
--- a/Tests/Xdna/sample-c.xprt
+++ b/Tests/Xdna/sample-c.xprt
--- a/Tests/test_SeqIO.py
+++ b/Tests/test_SeqIO.py
@ -4380,6 +4380,44 @@ class TestSeqIO(unittest.TestCase):
                    ]
        self.perform_test("cif-seqres", False, "PDB/2BEG.cif", 5, ids, names, sequences, lengths, alignment, messages)

+    def test_xdna1(self):
+        sequences = ["acttgctataccccgctaccttaaccctggccgtcgcaag...agcagat",
+                     ]
+        ids = ["Sample",
+               ]
+        names = ["Sample",
+                 ]
+        lengths = [1000]
+        alignment = None
+        messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "Sequence should contain A,C,G,T,N,a,c,g,t,n only",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "Missing SFF flow information",
+                    ]
+        self.perform_test("xdna", False, "Xdna/sample-a.xdna", 1, ids, names, sequences, lengths, alignment, messages)
+
+    def test_snapgene1(self):
+        sequences = ["gcacactaagccttccatctattcggcttcttgctctgca...atcgtag",
+                     ]
+        ids = ["Sample",
+               ]
+        names = ["Sample",
+                 ]
+        lengths = [1000]
+        alignment = None
+        messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "Sequence should contain A,C,G,T,N,a,c,g,t,n only",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
+                    "Missing SFF flow information",
+                    ]
+        self.perform_test("snapgene", False, "SnapGene/sample-d.dna", 1, ids, names, sequences, lengths, alignment, messages)
+
    def test_empty_file(self):
        """Check parsers can cope with an empty file."""
        for t_format in SeqIO._FormatToIterator:
--- a/Tests/test_SeqIO_Gck.py
+++ b/Tests/test_SeqIO_Gck.py
@ -0,0 +1,339 @@
+# Copyright 2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Tests for the SeqIO Gck module."""
+
+from io import BytesIO
+import os.path
+import unittest
+
+from Bio import SeqIO
+
+# The Gck parser has been developed using the files of the
+# Drosophila Gateway Vector Collection from Carnegie Science
+# (<https://emb.carnegiescience.edu/drosophila-gateway-vector-collection>)
+# as sample Gck files. We cannot redistribute those files along with
+# Biopython, therefore we skip the test case by default.
+#
+# To test the Gck parser, download the GCK.zip archive from the URL
+# above and place the files it contains under a 'Tests/Gck' directory.
+@unittest.skipUnless(os.path.exists('Gck'), 'Gck sample files not available')
+class TestGck(unittest.TestCase):
+
+    sample_data = {
+        'pACW': {
+            'file': 'Gck/pACW',
+            'name': 'Construct:',
+            'id': 'Construct:',
+            'description': 'Construct:  pACTIN-RW-SV',
+            'length': 7957,
+            'topology': 'circular',
+            'features': [
+                {
+                    'type': 'CDS',
+                    'start': 6155,
+                    'end': 7013,
+                    'strand': 1,
+                    'label': 'ampR'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5216,
+                    'end': 6071,
+                    'strand': 1,
+                    'label': 'SV40 sti/polyA'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 89,
+                    'end': 2662,
+                    'strand': 1,
+                    'label': 'actin5C promoter'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 3722,
+                    'end': 4400,
+                    'strand': 1,
+                    'label': 'chlR'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 4722,
+                    'end': 5025,
+                    'strand': 1,
+                    'label': 'ccdB'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 3489,
+                    'end': 3507,
+                    'strand': 1,
+                    'label': 'attR1'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5175,
+                    'end': 5193,
+                    'strand': -1,
+                    'label': 'attR2'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 3489,
+                    'end': 5193,
+                    'strand': 1,
+                    'label': 'Gateway cassette'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5192,
+                    'end': 5205,
+                    'strand': 1,
+                    'label': 'triple STOP'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 2763,
+                    'end': 3480,
+                    'strand': 1,
+                    'label': 'ECFP'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 2755,
+                    'end': 3482,
+                    'strand': 1,
+                    'label': 'pACTIN-SV'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 2755,
+                    'end': 3482,
+                    'strand': 1,
+                    'label': 'Construct:  pACTIN-RW-SV'
+                    },
+                ]
+            },
+        'pPWF': {
+            'file': 'Gck/pPWG',
+            'name': 'Construct:',
+            'id': 'Construct:',
+            'description': 'Construct:  pPWF',
+            'length': 12320,
+            'topology': 'circular',
+            'features': [
+                {
+                    'type': 'misc_feature',
+                    'start': 0,
+                    'end': 587,
+                    'strand': 1,
+                    'label': 'P 5\' end'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 9327,
+                    'end': 9560,
+                    'strand': -1,
+                    'label': 'P 3\' end'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 1363,
+                    'end': 4244,
+                    'strand': -1,
+                    'label': 'mini-white'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 10466,
+                    'end': 11324,
+                    'strand': 1,
+                    'label': 'ampR'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 7930,
+                    'end': 9314,
+                    'strand': 1,
+                    'label': 'K10 terminator'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 4762,
+                    'end': 4829,
+                    'strand': 1,
+                    'label': 'GAGA repeats'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 4855,
+                    'end': 5177,
+                    'strand': 1,
+                    'label': 'GAL4 sites'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5279,
+                    'end': 5415,
+                    'strand': 1,
+                    'label': 'P intron'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5184,
+                    'end': 5279,
+                    'strand': 1,
+                    'label': 'P promoter'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 4762,
+                    'end': 5416,
+                    'strand': 1,
+                    'label': 'UASp promoter'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 10060,
+                    'end': 12092,
+                    'strand': 1,
+                    'label': 'pUC8'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 7106,
+                    'end': 7124,
+                    'strand': -1,
+                    'label': 'attR2'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 6653,
+                    'end': 6956,
+                    'strand': 1,
+                    'label': 'ccdB'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 5653,
+                    'end': 6331,
+                    'strand': 1,
+                    'label': 'chlR'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5420,
+                    'end': 7124,
+                    'strand': 1,
+                    'label': 'Gateway Cassette'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5420,
+                    'end': 5438,
+                    'strand': 1,
+                    'label': 'attR1'
+                    },
+                {
+                    'type': 'CDS',
+                    'start': 7137,
+                    'end': 7854,
+                    'strand': 1,
+                    'label': 'EGFP'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 7129,
+                    'end': 7856,
+                    'strand': 1,
+                    'label': 'pACTIN-SV'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 7129,
+                    'end': 7856,
+                    'strand': 1,
+                    'label': 'Construct:  pACTIN-WC-SV'
+                    },
+                {
+                    'type': 'misc_feature',
+                    'start': 5416,
+                    'end': 7875,
+                    'strand': 1,
+                    'label': 'Construct:  pPWF'
+                    },
+                ]
+            }
+        }
+
+    def test_read(self):
+        """Read sample files."""
+        for sample in self.sample_data.values():
+            record = SeqIO.read(sample['file'], 'gck')
+            self.assertEqual(sample['name'], record.name)
+            self.assertEqual(sample['id'], record.id)
+            self.assertEqual(sample['description'], record.description)
+            self.assertEqual(sample['length'], len(record))
+            self.assertEqual(sample['topology'], record.annotations['topology'])
+
+            self.assertEqual(len(sample['features']), len(record.features))
+            for i in range(len(sample['features'])):
+                exp_feat = sample['features'][i]
+                read_feat = record.features[i]
+                self.assertEqual(exp_feat['type'], read_feat.type)
+                self.assertEqual(exp_feat['start'], read_feat.location.start)
+                self.assertEqual(exp_feat['end'], read_feat.location.end)
+                self.assertEqual(exp_feat['strand'], read_feat.location.strand)
+                self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
+
+
+@unittest.skipUnless(os.path.exists('Gck'), 'Gck sample files not available')
+class TestInvalidGck(unittest.TestCase):
+
+    def setUp(self):
+        f = open('Gck/pACW', 'rb')
+        self.buffer = f.read()
+        f.close()
+
+    def munge_buffer(self, position, value):
+        mod_buffer = bytearray(self.buffer)
+        if isinstance(value, list):
+            mod_buffer[position:position + len(value) - 1] = value
+        else:
+            mod_buffer[position] = value
+        return BytesIO(mod_buffer)
+
+    def test_conflicting_lengths(self):
+        """Read a file with incorrect length."""
+        # Change the sequence length as indicated in the sequence packet
+        h = self.munge_buffer(28, [0x00, 0x00, 0x20, 0x15])
+        with self.assertRaisesRegexp(ValueError, "Conflicting sequence length values"):
+            SeqIO.read(h, 'gck')
+        h.close()
+
+        # Change the sequence length as indicated in the features packet
+        h = self.munge_buffer(8135, [0x00, 0x00, 0x20, 0x15])
+        with self.assertRaisesRegexp(ValueError, "Conflicting sequence length values"):
+            SeqIO.read(h, 'gck')
+        h.close()
+
+        # Change the number of features
+        h = self.munge_buffer(8140, 0x30)
+        with self.assertRaisesRegexp(ValueError, "Features packet size inconsistent with number of features"):
+            SeqIO.read(h, 'gck')
+        h.close()
+
+        # Change the number of restriction sites
+        h = self.munge_buffer(10528, 0x30)
+        with self.assertRaisesRegexp(ValueError, "Sites packet size inconsistent with number of sites"):
+            SeqIO.read(h, 'gck')
+        h.close()
+
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner(verbosity=2)
+    unittest.main(testRunner=runner)
--- a/Tests/test_SeqIO_SnapGene.py
+++ b/Tests/test_SeqIO_SnapGene.py
@ -0,0 +1,134 @@
+# Copyright 2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Tests for the SeqIO SnapGene module."""
+
+import datetime
+from io import BytesIO
+import unittest
+
+from Bio import SeqIO
+
+
+class TestSnapGene(unittest.TestCase):
+
+    sample_data = {
+        'sample-d': {
+            'file': 'SnapGene/sample-d.dna',
+            'name': 'Sample',
+            'id': 'Sample',
+            'description': 'Sample Sequence D',
+            'length': 1000,
+            'topology': 'linear',
+            'date': datetime.datetime(2019, 8, 3, 0, 0),
+            'features': [
+                {
+                    'type': 'misc_binding',
+                    'start': 499,
+                    'end': 700,
+                    'strand': 1,
+                    'label': 'FeatureB'
+                    },
+                {
+                    'type': 'promoter',
+                    'start': 49,
+                    'end': 150,
+                    'strand': 1,
+                    'label': 'FeatureA'
+                    }
+                ]
+            },
+        'sample-b': {
+            'file': 'SnapGene/sample-e.dna',
+            'name': 'Sample',
+            'id': 'Sample',
+            'description': 'Sample Sequence E',
+            'length': 1000,
+            'date': datetime.datetime(2019, 8, 3, 0, 0),
+            'topology': 'circular',
+            'features': [
+                {
+                    'type': 'terminator',
+                    'start': 399,
+                    'end': 750,
+                    'strand': -1,
+                    'label': 'FeatureB'
+                    },
+                {
+                    'type': 'rep_origin',
+                    'start': 160,
+                    'end': 241,
+                    'strand': 1,
+                    'label': 'FeatureA'
+                    }
+                ]
+            }
+        }
+
+    def test_read(self):
+        """Read sample files."""
+        for sample in self.sample_data.values():
+            record = SeqIO.read(sample['file'], 'snapgene')
+            self.assertEqual(sample['name'], record.name)
+            self.assertEqual(sample['id'], record.id)
+            self.assertEqual(sample['description'], record.description)
+            self.assertEqual(sample['length'], len(record))
+            self.assertEqual(sample['date'], record.annotations['date'])
+            self.assertEqual(sample['topology'], record.annotations['topology'])
+
+            self.assertEqual(len(sample['features']), len(record.features))
+            for i in range(len(sample['features'])):
+                exp_feat = sample['features'][i]
+                read_feat = record.features[i]
+                self.assertEqual(exp_feat['type'], read_feat.type)
+                self.assertEqual(exp_feat['start'], read_feat.location.start)
+                self.assertEqual(exp_feat['end'], read_feat.location.end)
+                self.assertEqual(exp_feat['strand'], read_feat.location.strand)
+                self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
+
+
+class TestCorruptedSnapGene(unittest.TestCase):
+
+    def setUp(self):
+        f = open('SnapGene/sample-d.dna', 'rb')
+        self.buffer = f.read()
+        f.close()
+
+    def munge_buffer(self, position, value):
+        mod_buffer = bytearray(self.buffer)
+        if isinstance(value, list):
+            mod_buffer[position:position + len(value) - 1] = value
+        else:
+            mod_buffer[position] = value
+        return BytesIO(mod_buffer)
+
+    def test_invalid_cookie(self):
+        """Read a file with missing or invalid cookie packet."""
+        # Remove the first packet
+        h = BytesIO(self.buffer[19:])
+        with self.assertRaisesRegexp(ValueError, "The file does not start with a SnapGene cookie packet"):
+            SeqIO.read(h, 'snapgene')
+        h.close()
+
+        # Keep the first packet but destroy the magic cookie
+        h = self.munge_buffer(5, [0x4B, 0x41, 0x42, 0x4F, 0x4F, 0x4D])
+        with self.assertRaisesRegexp(ValueError, "The file is not a valid SnapGene file"):
+            SeqIO.read(h, 'snapgene')
+        h.close()
+
+    def test_missing_dna(self):
+        """Read a file without a DNA packet."""
+        # Simulate a missing DNA packet by changing the tag byte to an
+        # unknown packet type, so that the parser will skip the packet.
+        h = self.munge_buffer(19, 0x80)
+        with self.assertRaisesRegexp(ValueError, "No DNA packet in file"):
+            SeqIO.read(h, 'snapgene')
+        h.close()
+
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner(verbosity=2)
+    unittest.main(testRunner=runner)
--- a/Tests/test_SeqIO_Xdna.py
+++ b/Tests/test_SeqIO_Xdna.py
@ -0,0 +1,174 @@
+# Copyright 2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Tests for the SeqIO Xdna module."""
+
+from io import BytesIO
+import unittest
+
+from Bio import Alphabet, SeqIO
+
+
+class TestXdna(unittest.TestCase):
+
+    sample_data = {
+        'sample-a': {
+            'file': 'Xdna/sample-a.xdna',
+            'name': 'Sample',
+            'id': 'Sample',
+            'description': 'Sample sequence A',
+            'length': 1000,
+            'alphabet': Alphabet.generic_dna,
+            'topology': 'linear',
+            'features': [
+                {
+                    'type': 'promoter',
+                    'start': 49,
+                    'end': 150,
+                    'strand': 1,
+                    'label': 'FeatureA'
+                    },
+                {
+                    'type': 'misc_binding',
+                    'start': 499,
+                    'end': 700,
+                    'strand': -1,
+                    'label': 'FeatureB'
+                    }
+                ]
+            },
+        'sample-b': {
+            'file': 'Xdna/sample-b.xdna',
+            'name': 'Sample',
+            'id': 'Sample',
+            'description': 'Sample sequence B',
+            'length': 1000,
+            'alphabet': Alphabet.generic_dna,
+            'topology': 'circular',
+            'features': [
+                {
+                    'type': 'rep_origin',
+                    'start': 160,
+                    'end': 241,
+                    'strand': 1,
+                    'label': 'FeatureA'
+                    },
+                {
+                    'type': 'terminator',
+                    'start': 399,
+                    'end': 750,
+                    'strand': -1,
+                    'label': 'FeatureB'
+                    }
+                ]
+            },
+        'sample-c': {
+            'file': 'Xdna/sample-c.xprt',
+            'name': 'Sample',
+            'id': 'Sample',
+            'description': 'Sample Sequence C',
+            'length': 1000,
+            'alphabet': Alphabet.generic_protein,
+            'topology': 'linear',
+            'features': [
+                {
+                    'type': 'misc_feature',
+                    'start': 10,
+                    'end': 11,
+                    'strand': 1,
+                    'label': 'S11'
+                    },
+                {
+                    'type': 'misc_binding',
+                    'start': 164,
+                    'end': 195,
+                    'strand': 1,
+                    'label': 'RIP1'
+                    }
+                ]
+            }
+        }
+
+    def test_read(self):
+        """Read sample files."""
+        for sample in self.sample_data.values():
+            record = SeqIO.read(sample['file'], 'xdna')
+            self.assertEqual(sample['name'], record.name)
+            self.assertEqual(sample['id'], record.id)
+            self.assertEqual(sample['description'], record.description)
+            self.assertEqual(sample['length'], len(record))
+            self.assertEqual(sample['alphabet'], record.seq.alphabet)
+            self.assertEqual(sample['topology'], record.annotations['topology'])
+
+            self.assertEqual(len(sample['features']), len(record.features))
+            for i in range(len(sample['features'])):
+                exp_feat = sample['features'][i]
+                read_feat = record.features[i]
+                self.assertEqual(exp_feat['type'], read_feat.type)
+                self.assertEqual(exp_feat['start'], read_feat.location.start)
+                self.assertEqual(exp_feat['end'], read_feat.location.end)
+                self.assertEqual(exp_feat['strand'], read_feat.location.strand)
+                self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
+
+
+class TestInvalidXdna(unittest.TestCase):
+
+    def setUp(self):
+        f = open('Xdna/sample-a.xdna', 'rb')
+        self.buffer = f.read()
+        f.close()
+
+    def munge_buffer(self, position, value):
+        mod_buffer = bytearray(self.buffer)
+        if isinstance(value, list):
+            mod_buffer[position:position + len(value) - 1] = value
+        else:
+            mod_buffer[position] = value
+        return BytesIO(mod_buffer)
+
+    def test_unsupported_version(self):
+        """Read a file with unexpected version number."""
+        h = self.munge_buffer(0, 0x01)  # Change version byte
+        with self.assertRaisesRegexp(ValueError, "Unsupported XDNA version"):
+            SeqIO.read(h, 'xdna')
+        h.close()
+
+    def test_invalid_sequence_type(self):
+        """Read a file with an unknown sequence type."""
+        h = self.munge_buffer(1, 0x0A)  # Change type byte
+        with self.assertRaisesRegexp(ValueError, "Unknown sequence type"):
+            SeqIO.read(h, 'xdna')
+        h.close()
+
+    def test_corrupted_length(self):
+        """Read a file with incorrect length."""
+        # Set a length shorter than the actual length of the sequence
+        h = self.munge_buffer(29, [0x00, 0x00, 0x00, 0x80])
+        with self.assertRaisesRegexp(ValueError, "invalid literal"):
+            SeqIO.read(h, 'xdna')
+        h.close()
+
+        # Set a length larger than the actual length of the sequence
+        h = self.munge_buffer(29, [0x00, 0x08, 0x00, 0x00])
+        with self.assertRaisesRegexp(ValueError, "Cannot read 2048 bytes from handle"):
+            SeqIO.read(h, 'xdna')
+        h.close()
+
+    def test_missing_features(self):
+        """Read a file with an incorrect number of features."""
+        # Set a larger number of features than the file actually contains
+        # Offset of the features number byte:
+        # header + length of sequence + length of comment + overhangs
+        feature_byte = 112 + 1000 + len('Sample sequence A') + 5
+        h = self.munge_buffer(feature_byte, 3)
+        with self.assertRaisesRegexp(ValueError, "Cannot read 1 bytes from handle"):
+            SeqIO.read(h, 'xdna')
+        h.close()
+
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner(verbosity=2)
+    unittest.main(testRunner=runner)