Add test cases and sample files for the new parsers.

Do some basic checking for the Xdna and SnapGene parsers in the
general SeqIO test. Also do more specific testing in new dedicated
test modules for each format.

Include ad-hoc generated sample Xdna and SnapGene files. For GCK,
the test cases are dependent on the presence of files from the
Drosophile Gateway Vectors Collection, which we cannot distribute
with Biopython. The tests are automatically skipped unless the
files have been manually added to the Tests/Gck directory.
This commit is contained in:
Damien Goutte-Gattat
2019-08-06 14:05:44 +01:00
committed by Peter Cock
parent 7ceb4ed133
commit 5026c51b22
9 changed files with 685 additions and 0 deletions

BIN
Tests/SnapGene/sample-d.dna Normal file

Binary file not shown.

BIN
Tests/SnapGene/sample-e.dna Normal file

Binary file not shown.

BIN
Tests/Xdna/sample-a.xdna Normal file

Binary file not shown.

BIN
Tests/Xdna/sample-b.xdna Normal file

Binary file not shown.

BIN
Tests/Xdna/sample-c.xprt Normal file

Binary file not shown.

View File

@ -4380,6 +4380,44 @@ class TestSeqIO(unittest.TestCase):
]
self.perform_test("cif-seqres", False, "PDB/2BEG.cif", 5, ids, names, sequences, lengths, alignment, messages)
def test_xdna1(self):
sequences = ["acttgctataccccgctaccttaaccctggccgtcgcaag...agcagat",
]
ids = ["Sample",
]
names = ["Sample",
]
lengths = [1000]
alignment = None
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Missing SFF flow information",
]
self.perform_test("xdna", False, "Xdna/sample-a.xdna", 1, ids, names, sequences, lengths, alignment, messages)
def test_snapgene1(self):
sequences = ["gcacactaagccttccatctattcggcttcttgctctgca...atcgtag",
]
ids = ["Sample",
]
names = ["Sample",
]
lengths = [1000]
alignment = None
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Missing SFF flow information",
]
self.perform_test("snapgene", False, "SnapGene/sample-d.dna", 1, ids, names, sequences, lengths, alignment, messages)
def test_empty_file(self):
"""Check parsers can cope with an empty file."""
for t_format in SeqIO._FormatToIterator:

339
Tests/test_SeqIO_Gck.py Normal file
View File

@ -0,0 +1,339 @@
# Copyright 2019 Damien Goutte-Gattat. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Tests for the SeqIO Gck module."""
from io import BytesIO
import os.path
import unittest
from Bio import SeqIO
# The Gck parser has been developed using the files of the
# Drosophila Gateway Vector Collection from Carnegie Science
# (<https://emb.carnegiescience.edu/drosophila-gateway-vector-collection>)
# as sample Gck files. We cannot redistribute those files along with
# Biopython, therefore we skip the test case by default.
#
# To test the Gck parser, download the GCK.zip archive from the URL
# above and place the files it contains under a 'Tests/Gck' directory.
@unittest.skipUnless(os.path.exists('Gck'), 'Gck sample files not available')
class TestGck(unittest.TestCase):
sample_data = {
'pACW': {
'file': 'Gck/pACW',
'name': 'Construct:',
'id': 'Construct:',
'description': 'Construct: pACTIN-RW-SV',
'length': 7957,
'topology': 'circular',
'features': [
{
'type': 'CDS',
'start': 6155,
'end': 7013,
'strand': 1,
'label': 'ampR'
},
{
'type': 'misc_feature',
'start': 5216,
'end': 6071,
'strand': 1,
'label': 'SV40 sti/polyA'
},
{
'type': 'misc_feature',
'start': 89,
'end': 2662,
'strand': 1,
'label': 'actin5C promoter'
},
{
'type': 'CDS',
'start': 3722,
'end': 4400,
'strand': 1,
'label': 'chlR'
},
{
'type': 'CDS',
'start': 4722,
'end': 5025,
'strand': 1,
'label': 'ccdB'
},
{
'type': 'misc_feature',
'start': 3489,
'end': 3507,
'strand': 1,
'label': 'attR1'
},
{
'type': 'misc_feature',
'start': 5175,
'end': 5193,
'strand': -1,
'label': 'attR2'
},
{
'type': 'misc_feature',
'start': 3489,
'end': 5193,
'strand': 1,
'label': 'Gateway cassette'
},
{
'type': 'misc_feature',
'start': 5192,
'end': 5205,
'strand': 1,
'label': 'triple STOP'
},
{
'type': 'CDS',
'start': 2763,
'end': 3480,
'strand': 1,
'label': 'ECFP'
},
{
'type': 'misc_feature',
'start': 2755,
'end': 3482,
'strand': 1,
'label': 'pACTIN-SV'
},
{
'type': 'misc_feature',
'start': 2755,
'end': 3482,
'strand': 1,
'label': 'Construct: pACTIN-RW-SV'
},
]
},
'pPWF': {
'file': 'Gck/pPWG',
'name': 'Construct:',
'id': 'Construct:',
'description': 'Construct: pPWF',
'length': 12320,
'topology': 'circular',
'features': [
{
'type': 'misc_feature',
'start': 0,
'end': 587,
'strand': 1,
'label': 'P 5\' end'
},
{
'type': 'misc_feature',
'start': 9327,
'end': 9560,
'strand': -1,
'label': 'P 3\' end'
},
{
'type': 'misc_feature',
'start': 1363,
'end': 4244,
'strand': -1,
'label': 'mini-white'
},
{
'type': 'CDS',
'start': 10466,
'end': 11324,
'strand': 1,
'label': 'ampR'
},
{
'type': 'misc_feature',
'start': 7930,
'end': 9314,
'strand': 1,
'label': 'K10 terminator'
},
{
'type': 'misc_feature',
'start': 4762,
'end': 4829,
'strand': 1,
'label': 'GAGA repeats'
},
{
'type': 'misc_feature',
'start': 4855,
'end': 5177,
'strand': 1,
'label': 'GAL4 sites'
},
{
'type': 'misc_feature',
'start': 5279,
'end': 5415,
'strand': 1,
'label': 'P intron'
},
{
'type': 'misc_feature',
'start': 5184,
'end': 5279,
'strand': 1,
'label': 'P promoter'
},
{
'type': 'misc_feature',
'start': 4762,
'end': 5416,
'strand': 1,
'label': 'UASp promoter'
},
{
'type': 'misc_feature',
'start': 10060,
'end': 12092,
'strand': 1,
'label': 'pUC8'
},
{
'type': 'misc_feature',
'start': 7106,
'end': 7124,
'strand': -1,
'label': 'attR2'
},
{
'type': 'CDS',
'start': 6653,
'end': 6956,
'strand': 1,
'label': 'ccdB'
},
{
'type': 'CDS',
'start': 5653,
'end': 6331,
'strand': 1,
'label': 'chlR'
},
{
'type': 'misc_feature',
'start': 5420,
'end': 7124,
'strand': 1,
'label': 'Gateway Cassette'
},
{
'type': 'misc_feature',
'start': 5420,
'end': 5438,
'strand': 1,
'label': 'attR1'
},
{
'type': 'CDS',
'start': 7137,
'end': 7854,
'strand': 1,
'label': 'EGFP'
},
{
'type': 'misc_feature',
'start': 7129,
'end': 7856,
'strand': 1,
'label': 'pACTIN-SV'
},
{
'type': 'misc_feature',
'start': 7129,
'end': 7856,
'strand': 1,
'label': 'Construct: pACTIN-WC-SV'
},
{
'type': 'misc_feature',
'start': 5416,
'end': 7875,
'strand': 1,
'label': 'Construct: pPWF'
},
]
}
}
def test_read(self):
"""Read sample files."""
for sample in self.sample_data.values():
record = SeqIO.read(sample['file'], 'gck')
self.assertEqual(sample['name'], record.name)
self.assertEqual(sample['id'], record.id)
self.assertEqual(sample['description'], record.description)
self.assertEqual(sample['length'], len(record))
self.assertEqual(sample['topology'], record.annotations['topology'])
self.assertEqual(len(sample['features']), len(record.features))
for i in range(len(sample['features'])):
exp_feat = sample['features'][i]
read_feat = record.features[i]
self.assertEqual(exp_feat['type'], read_feat.type)
self.assertEqual(exp_feat['start'], read_feat.location.start)
self.assertEqual(exp_feat['end'], read_feat.location.end)
self.assertEqual(exp_feat['strand'], read_feat.location.strand)
self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
@unittest.skipUnless(os.path.exists('Gck'), 'Gck sample files not available')
class TestInvalidGck(unittest.TestCase):
def setUp(self):
f = open('Gck/pACW', 'rb')
self.buffer = f.read()
f.close()
def munge_buffer(self, position, value):
mod_buffer = bytearray(self.buffer)
if isinstance(value, list):
mod_buffer[position:position + len(value) - 1] = value
else:
mod_buffer[position] = value
return BytesIO(mod_buffer)
def test_conflicting_lengths(self):
"""Read a file with incorrect length."""
# Change the sequence length as indicated in the sequence packet
h = self.munge_buffer(28, [0x00, 0x00, 0x20, 0x15])
with self.assertRaisesRegexp(ValueError, "Conflicting sequence length values"):
SeqIO.read(h, 'gck')
h.close()
# Change the sequence length as indicated in the features packet
h = self.munge_buffer(8135, [0x00, 0x00, 0x20, 0x15])
with self.assertRaisesRegexp(ValueError, "Conflicting sequence length values"):
SeqIO.read(h, 'gck')
h.close()
# Change the number of features
h = self.munge_buffer(8140, 0x30)
with self.assertRaisesRegexp(ValueError, "Features packet size inconsistent with number of features"):
SeqIO.read(h, 'gck')
h.close()
# Change the number of restriction sites
h = self.munge_buffer(10528, 0x30)
with self.assertRaisesRegexp(ValueError, "Sites packet size inconsistent with number of sites"):
SeqIO.read(h, 'gck')
h.close()
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)

View File

@ -0,0 +1,134 @@
# Copyright 2019 Damien Goutte-Gattat. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Tests for the SeqIO SnapGene module."""
import datetime
from io import BytesIO
import unittest
from Bio import SeqIO
class TestSnapGene(unittest.TestCase):
sample_data = {
'sample-d': {
'file': 'SnapGene/sample-d.dna',
'name': 'Sample',
'id': 'Sample',
'description': 'Sample Sequence D',
'length': 1000,
'topology': 'linear',
'date': datetime.datetime(2019, 8, 3, 0, 0),
'features': [
{
'type': 'misc_binding',
'start': 499,
'end': 700,
'strand': 1,
'label': 'FeatureB'
},
{
'type': 'promoter',
'start': 49,
'end': 150,
'strand': 1,
'label': 'FeatureA'
}
]
},
'sample-b': {
'file': 'SnapGene/sample-e.dna',
'name': 'Sample',
'id': 'Sample',
'description': 'Sample Sequence E',
'length': 1000,
'date': datetime.datetime(2019, 8, 3, 0, 0),
'topology': 'circular',
'features': [
{
'type': 'terminator',
'start': 399,
'end': 750,
'strand': -1,
'label': 'FeatureB'
},
{
'type': 'rep_origin',
'start': 160,
'end': 241,
'strand': 1,
'label': 'FeatureA'
}
]
}
}
def test_read(self):
"""Read sample files."""
for sample in self.sample_data.values():
record = SeqIO.read(sample['file'], 'snapgene')
self.assertEqual(sample['name'], record.name)
self.assertEqual(sample['id'], record.id)
self.assertEqual(sample['description'], record.description)
self.assertEqual(sample['length'], len(record))
self.assertEqual(sample['date'], record.annotations['date'])
self.assertEqual(sample['topology'], record.annotations['topology'])
self.assertEqual(len(sample['features']), len(record.features))
for i in range(len(sample['features'])):
exp_feat = sample['features'][i]
read_feat = record.features[i]
self.assertEqual(exp_feat['type'], read_feat.type)
self.assertEqual(exp_feat['start'], read_feat.location.start)
self.assertEqual(exp_feat['end'], read_feat.location.end)
self.assertEqual(exp_feat['strand'], read_feat.location.strand)
self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
class TestCorruptedSnapGene(unittest.TestCase):
def setUp(self):
f = open('SnapGene/sample-d.dna', 'rb')
self.buffer = f.read()
f.close()
def munge_buffer(self, position, value):
mod_buffer = bytearray(self.buffer)
if isinstance(value, list):
mod_buffer[position:position + len(value) - 1] = value
else:
mod_buffer[position] = value
return BytesIO(mod_buffer)
def test_invalid_cookie(self):
"""Read a file with missing or invalid cookie packet."""
# Remove the first packet
h = BytesIO(self.buffer[19:])
with self.assertRaisesRegexp(ValueError, "The file does not start with a SnapGene cookie packet"):
SeqIO.read(h, 'snapgene')
h.close()
# Keep the first packet but destroy the magic cookie
h = self.munge_buffer(5, [0x4B, 0x41, 0x42, 0x4F, 0x4F, 0x4D])
with self.assertRaisesRegexp(ValueError, "The file is not a valid SnapGene file"):
SeqIO.read(h, 'snapgene')
h.close()
def test_missing_dna(self):
"""Read a file without a DNA packet."""
# Simulate a missing DNA packet by changing the tag byte to an
# unknown packet type, so that the parser will skip the packet.
h = self.munge_buffer(19, 0x80)
with self.assertRaisesRegexp(ValueError, "No DNA packet in file"):
SeqIO.read(h, 'snapgene')
h.close()
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)

174
Tests/test_SeqIO_Xdna.py Normal file
View File

@ -0,0 +1,174 @@
# Copyright 2019 Damien Goutte-Gattat. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Tests for the SeqIO Xdna module."""
from io import BytesIO
import unittest
from Bio import Alphabet, SeqIO
class TestXdna(unittest.TestCase):
sample_data = {
'sample-a': {
'file': 'Xdna/sample-a.xdna',
'name': 'Sample',
'id': 'Sample',
'description': 'Sample sequence A',
'length': 1000,
'alphabet': Alphabet.generic_dna,
'topology': 'linear',
'features': [
{
'type': 'promoter',
'start': 49,
'end': 150,
'strand': 1,
'label': 'FeatureA'
},
{
'type': 'misc_binding',
'start': 499,
'end': 700,
'strand': -1,
'label': 'FeatureB'
}
]
},
'sample-b': {
'file': 'Xdna/sample-b.xdna',
'name': 'Sample',
'id': 'Sample',
'description': 'Sample sequence B',
'length': 1000,
'alphabet': Alphabet.generic_dna,
'topology': 'circular',
'features': [
{
'type': 'rep_origin',
'start': 160,
'end': 241,
'strand': 1,
'label': 'FeatureA'
},
{
'type': 'terminator',
'start': 399,
'end': 750,
'strand': -1,
'label': 'FeatureB'
}
]
},
'sample-c': {
'file': 'Xdna/sample-c.xprt',
'name': 'Sample',
'id': 'Sample',
'description': 'Sample Sequence C',
'length': 1000,
'alphabet': Alphabet.generic_protein,
'topology': 'linear',
'features': [
{
'type': 'misc_feature',
'start': 10,
'end': 11,
'strand': 1,
'label': 'S11'
},
{
'type': 'misc_binding',
'start': 164,
'end': 195,
'strand': 1,
'label': 'RIP1'
}
]
}
}
def test_read(self):
"""Read sample files."""
for sample in self.sample_data.values():
record = SeqIO.read(sample['file'], 'xdna')
self.assertEqual(sample['name'], record.name)
self.assertEqual(sample['id'], record.id)
self.assertEqual(sample['description'], record.description)
self.assertEqual(sample['length'], len(record))
self.assertEqual(sample['alphabet'], record.seq.alphabet)
self.assertEqual(sample['topology'], record.annotations['topology'])
self.assertEqual(len(sample['features']), len(record.features))
for i in range(len(sample['features'])):
exp_feat = sample['features'][i]
read_feat = record.features[i]
self.assertEqual(exp_feat['type'], read_feat.type)
self.assertEqual(exp_feat['start'], read_feat.location.start)
self.assertEqual(exp_feat['end'], read_feat.location.end)
self.assertEqual(exp_feat['strand'], read_feat.location.strand)
self.assertEqual(exp_feat['label'], read_feat.qualifiers['label'][0])
class TestInvalidXdna(unittest.TestCase):
def setUp(self):
f = open('Xdna/sample-a.xdna', 'rb')
self.buffer = f.read()
f.close()
def munge_buffer(self, position, value):
mod_buffer = bytearray(self.buffer)
if isinstance(value, list):
mod_buffer[position:position + len(value) - 1] = value
else:
mod_buffer[position] = value
return BytesIO(mod_buffer)
def test_unsupported_version(self):
"""Read a file with unexpected version number."""
h = self.munge_buffer(0, 0x01) # Change version byte
with self.assertRaisesRegexp(ValueError, "Unsupported XDNA version"):
SeqIO.read(h, 'xdna')
h.close()
def test_invalid_sequence_type(self):
"""Read a file with an unknown sequence type."""
h = self.munge_buffer(1, 0x0A) # Change type byte
with self.assertRaisesRegexp(ValueError, "Unknown sequence type"):
SeqIO.read(h, 'xdna')
h.close()
def test_corrupted_length(self):
"""Read a file with incorrect length."""
# Set a length shorter than the actual length of the sequence
h = self.munge_buffer(29, [0x00, 0x00, 0x00, 0x80])
with self.assertRaisesRegexp(ValueError, "invalid literal"):
SeqIO.read(h, 'xdna')
h.close()
# Set a length larger than the actual length of the sequence
h = self.munge_buffer(29, [0x00, 0x08, 0x00, 0x00])
with self.assertRaisesRegexp(ValueError, "Cannot read 2048 bytes from handle"):
SeqIO.read(h, 'xdna')
h.close()
def test_missing_features(self):
"""Read a file with an incorrect number of features."""
# Set a larger number of features than the file actually contains
# Offset of the features number byte:
# header + length of sequence + length of comment + overhangs
feature_byte = 112 + 1000 + len('Sample sequence A') + 5
h = self.munge_buffer(feature_byte, 3)
with self.assertRaisesRegexp(ValueError, "Cannot read 1 bytes from handle"):
SeqIO.read(h, 'xdna')
h.close()
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)