support soft-masking in nib files (#2303)

* support soft-masking in nib files

* nib doesn't need to be handled as a special case any more

* style issues

* Update NibIO.py

prettified
This commit is contained in:
mdehoon
2019-10-17 06:23:34 +09:00
committed by GitHub
parent 22df5f84fd
commit 569a6e1ba0
11 changed files with 76 additions and 74 deletions

View File

@ -10,23 +10,29 @@ Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:
0 - T
1 - C
2 - A
3 - G
4 - N (unknown)
- ``0`` - T
- ``1`` - C
- ``2`` - A
- ``3`` - G
- ``4`` - N (unknown)
As the highest bit in a nibble (value 8) is set if the nucleotide is
soft-masked, we additionally have (in hexadecimal):
- ``8`` - t
- ``9`` - c
- ``a`` - a
- ``b`` - g
- ``c`` - n (unknown)
A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":
>>> from Bio import SeqIO
>>> record = SeqIO.read("Nib/test_bigendian.nib", "nib")
>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
>>> print("%i %s..." % (len(record), record.seq[:20]))
37 ACGTAAACCGTACCCGTANA...
Notice that the sequence is given in upper case; unknown nucleotides are
written as N.
50 nAGAAGagccgcNGgCActt...
For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
@ -101,17 +107,17 @@ def NibIterator(handle, alphabet=None):
This function is used internally via the Bio.SeqIO functions:
>>> from Bio import SeqIO
>>> record = SeqIO.read("Nib/test_bigendian.nib", "nib")
>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
>>> print("%s %i" % (record.seq, len(record)))
ACGTAAACCGTACCCGTANANCANNNNACNANNANCN 37
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
You can also call it directly:
>>> with open("Nib/test_bigendian.nib", "rb") as handle:
>>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
... for record in NibIterator(handle):
... print("%s %i" % (record.seq, len(record)))
...
ACGTAAACCGTACCCGTANANCANNNNACNANNANCN 37
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
"""
if alphabet is not None:
@ -140,9 +146,9 @@ def NibIterator(handle, alphabet=None):
if len(indices) != length + 1:
raise ValueError("Unexpected file size")
indices = indices[:length]
if set(indices) != set("01234"):
if not set(indices).issubset("0123489abc"):
raise ValueError("Unexpected sequence data found in file")
table = maketrans("01234", "TCAGN")
table = maketrans("0123489abc", "TCAGNtcagn")
nucleotides = indices.translate(table)
sequence = Seq(nucleotides)
record = SeqRecord(sequence)
@ -182,13 +188,13 @@ class NibWriter(SequenceWriter):
nucleotides = str(sequence)
length = len(sequence)
handle.write(struct.pack("i", length))
table = maketrans("TCAGNtcagn", "0123401234")
table = maketrans("TCAGNtcagn", "0123489abc")
padding = length % 2
suffix = padding * "T"
nucleotides += suffix
indices = nucleotides.translate(table)
if set(indices) != set("01234"):
if not set(nucleotides).issubset("ACGTNacgtn"):
raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
indices = nucleotides.translate(table)
handle.write(hex2bytes(indices))
return count

Binary file not shown.

2
Tests/Nib/test_even.fa Normal file
View File

@ -0,0 +1,2 @@
>even
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG

BIN
Tests/Nib/test_even_bigendian.nib generated Normal file

Binary file not shown.

BIN
Tests/Nib/test_even_littleendian.nib generated Normal file

Binary file not shown.

Binary file not shown.

2
Tests/Nib/test_odd.fa Normal file
View File

@ -0,0 +1,2 @@
>odd
ATCCgCNcnTCGNgACaatccCCaNtNnacgaTNGgANAAgNCac

BIN
Tests/Nib/test_odd_bigendian.nib generated Normal file

Binary file not shown.

BIN
Tests/Nib/test_odd_littleendian.nib generated Normal file

Binary file not shown.

View File

@ -281,8 +281,6 @@ class TestSeqIO(unittest.TestCase):
elif format == "qual":
self.assertIsInstance(r2.seq, UnknownSeq)
self.assertEqual(len(r2), len(r1))
elif format == "nib":
self.assertEqual(str(r1.seq).upper(), str(r2.seq))
else:
self.assertEqual(str(r1.seq), str(r2.seq))
# Beware of different quirks and limitations in the
@ -780,7 +778,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|4218935|gb|AF074388.1|AF074388).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|4218935|gb|AF074388.1|AF074388).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|4218935|gb|AF074388.1|AF074388).",
"Need a DNA, RNA or Protein alphabet",
@ -804,7 +801,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5052071|gb|AF067555.1|AF067555).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5052071|gb|AF067555.1|AF067555).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5052071|gb|AF067555.1|AF067555).",
"Need a DNA, RNA or Protein alphabet",
@ -851,7 +847,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5817701|gb|AF142731.1|AF142731).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5817701|gb|AF142731.1|AF142731).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5817701|gb|AF142731.1|AF142731).",
"Need a DNA, RNA or Protein alphabet",
@ -875,7 +870,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|3176602|gb|U78617.1|LOU78617).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|3176602|gb|U78617.1|LOU78617).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|3176602|gb|U78617.1|LOU78617).",
"Need a DNA, RNA or Protein alphabet",
@ -899,7 +893,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5690369|gb|AF158246.1|AF158246).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5690369|gb|AF158246.1|AF158246).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|5690369|gb|AF158246.1|AF158246).",
"Need a DNA, RNA or Protein alphabet",
@ -1168,7 +1161,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|45478711|ref|NC_005816.1|).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|45478711|ref|NC_005816.1|).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|45478711|ref|NC_005816.1|).",
"Need a DNA, RNA or Protein alphabet",
@ -1357,7 +1349,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|NC_001802.1|).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|NC_001802.1|).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|NC_001802.1|).",
"Need a DNA, RNA or Protein alphabet",
@ -1382,7 +1373,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|nc_001802.1|).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|nc_001802.1|).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=gi|9629357|ref|nc_001802.1|).",
"Need a DNA, RNA or Protein alphabet",
@ -2010,7 +2000,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=NM_006141.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NM_006141.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NM_006141.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NM_006141.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NM_006141.1).",
"Missing SFF flow information",
@ -2066,7 +2055,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AL109817.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL109817.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL109817.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL109817.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL109817.1).",
"Missing SFF flow information",
@ -2085,7 +2073,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=U05344.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U05344.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U05344.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U05344.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U05344.1).",
"Missing SFF flow information",
@ -2104,7 +2091,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AC007323.5).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AC007323.5).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AC007323.5).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AC007323.5).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AC007323.5).",
"Missing SFF flow information",
@ -2161,7 +2147,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AL138972.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL138972.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL138972.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL138972.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL138972.1).",
"Missing SFF flow information",
@ -2180,7 +2165,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=U18266.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U18266.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U18266.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U18266.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U18266.1).",
"Missing SFF flow information",
@ -2213,7 +2197,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=NC_002678.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_002678.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_002678.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_002678.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_002678.1).",
"Missing SFF flow information",
@ -2271,7 +2254,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=NC_005816.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_005816.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_005816.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_005816.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_005816.1).",
"Missing SFF flow information",
@ -2290,7 +2272,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=NC_000932.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_000932.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_000932.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_000932.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_000932.1).",
"Missing SFF flow information",
@ -2310,7 +2291,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=pBAD30).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=pBAD30).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=pBAD30).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=pBAD30).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=pBAD30).",
"Missing SFF flow information",
@ -2366,7 +2346,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=NC_001422.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_001422.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_001422.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_001422.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=NC_001422.1).",
"Missing SFF flow information",
@ -2528,7 +2507,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=X56734.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=X56734.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=X56734.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=X56734.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=X56734.1).",
"Missing SFF flow information",
@ -2547,7 +2525,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"Missing SFF flow information",
@ -2569,7 +2546,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=DD231055.1).",
"Need a DNA, RNA or Protein alphabet",
@ -2590,7 +2566,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AL031232).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL031232).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL031232).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL031232).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AL031232).",
"Missing SFF flow information",
@ -2609,7 +2584,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=U87107.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U87107.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U87107.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U87107.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=U87107.1).",
"Missing SFF flow information",
@ -2628,7 +2602,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AAA03323.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AAA03323.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AAA03323.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AAA03323.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AAA03323.1).",
"Missing SFF flow information",
@ -2647,7 +2620,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=AE017046.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AE017046.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AE017046.1).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AE017046.1).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=AE017046.1).",
"Missing SFF flow information",
@ -2727,7 +2699,6 @@ class TestSeqIO(unittest.TestCase):
"No suitable quality scores found in letter_annotations of SeqRecord (id=Test).",
"Need a Nucleotide or Protein alphabet",
"Need a DNA, RNA or Protein alphabet",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Test).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Test).",
"Need a DNA, RNA or Protein alphabet",
@ -2749,7 +2720,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"Missing SFF flow information",
@ -2769,7 +2739,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=A04195).",
"Missing SFF flow information",
@ -3364,8 +3333,7 @@ class TestSeqIO(unittest.TestCase):
]
lengths = [30]
alignment = None
messages = ["Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"Missing SFF flow information",
messages = ["Missing SFF flow information",
]
self.perform_test("phd", False, "Phd/phd_454", 1, ids, names, sequences, lengths, alignment, messages)
@ -4147,8 +4115,7 @@ class TestSeqIO(unittest.TestCase):
]
lengths = [795]
alignment = None
messages = ["Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"Missing SFF flow information",
messages = ["Missing SFF flow information",
]
self.perform_test("abi", False, "Abi/3100.ab1", 1, ids, names, sequences, lengths, alignment, messages)
@ -4392,7 +4359,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Missing SFF flow information",
@ -4411,7 +4377,6 @@ class TestSeqIO(unittest.TestCase):
messages = ["No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Sequence should contain A,C,G,T,N,a,c,g,t,n only",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"No suitable quality scores found in letter_annotations of SeqRecord (id=Sample).",
"Missing SFF flow information",

View File

@ -10,34 +10,61 @@ from Bio.SeqRecord import SeqRecord
class TestNibReaderWriter(unittest.TestCase):
nucleotides = "ACGTAAACCGTACCCGTANANCANNNNACNANNANCN"
def test_read_bigendian(self):
handle = open("Nib/test_bigendian.nib", "rb")
records = SeqIO.parse(handle, "nib")
record = next(records)
def test_read_even(self):
handle = open("Nib/test_even.fa")
record = SeqIO.read(handle, "fasta")
handle.close()
self.assertEqual(str(record.seq), self.nucleotides)
def test_read_littleendian(self):
handle = open("Nib/test_littleendian.nib", "rb")
records = SeqIO.parse(handle, "nib")
record = next(records)
sequence = str(record.seq)
handle = open("Nib/test_even_bigendian.nib", "rb")
record = SeqIO.read(handle, "nib")
handle.close()
self.assertEqual(str(record.seq), self.nucleotides)
self.assertEqual(sequence, str(record.seq))
handle = open("Nib/test_even_littleendian.nib", "rb")
record = SeqIO.read(handle, "nib")
handle.close()
self.assertEqual(sequence, str(record.seq))
def test_write_and_read(self):
def test_read_odd(self):
    """Check that both odd nib test files match the FASTA reference.

    The sequence in ``Nib/test_odd.fa`` is read as the expected value,
    and the big-endian and little-endian nib files must each yield
    exactly the same string (including letter case).
    """
    # Context managers guarantee the files are closed even if parsing
    # raises, which the manual open()/close() pairs did not.
    with open("Nib/test_odd.fa") as handle:
        record = SeqIO.read(handle, "fasta")
    sequence = str(record.seq)
    for path in ("Nib/test_odd_bigendian.nib", "Nib/test_odd_littleendian.nib"):
        with open(path, "rb") as handle:
            record = SeqIO.read(handle, "nib")
        self.assertEqual(sequence, str(record.seq))
def test_write_even(self):
handle = open("Nib/test_even.fa")
record = SeqIO.read(handle, "fasta")
handle.close()
sequence = str(record.seq)
handle = BytesIO()
sequence = Seq(self.nucleotides)
record = SeqRecord(sequence)
n = SeqIO.write(record, handle, "nib")
self.assertEqual(n, 1)
handle.flush()
handle.seek(0)
record = SeqIO.read(handle, "nib")
handle.close()
sequence = record.seq
self.assertEqual(str(sequence), self.nucleotides)
self.assertEqual(sequence, str(record.seq))
def test_write_odd(self):
    """Write the odd test sequence as nib and check it reads back unchanged.

    The record from ``Nib/test_odd.fa`` is written to an in-memory
    buffer in nib format, read back, and compared (as a string,
    including letter case) against the original sequence.
    """
    # Context managers guarantee the file and the buffer are released
    # even if reading or writing raises.
    with open("Nib/test_odd.fa") as handle:
        record = SeqIO.read(handle, "fasta")
    sequence = str(record.seq)
    with BytesIO() as handle:
        n = SeqIO.write(record, handle, "nib")
        self.assertEqual(n, 1)
        handle.flush()
        # Rewind so the data just written can be parsed from the start.
        handle.seek(0)
        record = SeqIO.read(handle, "nib")
    self.assertEqual(sequence, str(record.seq))
if __name__ == "__main__":