Better handling of SFF files with non-standard (i.e. unknown) index blocks.

2025-10-20 21:53:47 +08:00 · 2009-10-27 12:57:30 +00:00
parent d3699e3d87
commit 6896cd30fd
5 changed files with 153 additions and 4 deletions
--- a/Bio/SeqIO/SffIO.py
+++ b/Bio/SeqIO/SffIO.py
@ -251,14 +251,23 @@ def _sff_do_slow_index(handle) :
    assert read_header_size % 8 == 0 #Important for padding calc later!
    for read in range(number_of_reads) :
        record_offset = handle.tell()
        if record_offset == index_offset :
            #Found index block within reads, ignore it:
            offset = index_offset + index_length
            if offset % 8 :
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            record_offset = offset
        #assert record_offset%8 == 0 #Worth checking, but slow
        #First the fixed header
        data = handle.read(read_header_size)
        read_header_length, name_length, seq_len, clip_qual_left, \
        clip_qual_right, clip_adapter_left, clip_adapter_right \
-            = struct.unpack(read_header_fmt, handle.read(read_header_size))
+            = struct.unpack(read_header_fmt, data)
        if read_header_length < 10 or read_header_length%8 != 0 :
-            raise ValueError("Malformed read header, says length is %i" \
+            raise ValueError("Malformed read header, says length is %i:\n%s" \
-                             % read_header_length)
+                             % (read_header_length, repr(data)))
        #now the name and any padding (remainder of header)
        name = handle.read(name_length)
        padding = read_header_length - read_header_size - name_length
@ -288,7 +297,10 @@ def _sff_find_roche_index(handle) :
    files from Roche 454 machines.
    Returns a tuple of read count, SFF "index" offset and size, XML offset and
-    size, and the actual read index offset and size."""    
+    size, and the actual read index offset and size.
    Raises a ValueError for unsupported or non-Roche index blocks.
    """    
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
    number_of_flows_per_read, flow_chars, key_sequence \
@ -301,6 +313,12 @@ def _sff_find_roche_index(handle) :
    fmt = ">4s4B"
    fmt_size = struct.calcsize(fmt)
    data = handle.read(fmt_size)
    if not data :
        raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
                         % (index_length, index_offset))
    if len(data) < fmt_size :
        raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
                         % (index_length, index_offset, repr(data)))
    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
    if magic_number == ".mft" : # 778921588
        #Roche 454 manifest index
@ -580,6 +598,7 @@ def SffIterator(handle, alphabet=Alphabet.generic_dna, trim=False) :
            offset = index_offset + index_length
            if offset % 8 :
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            #Now that we've done this, we don't need to do it again. Clear
            #the index_offset so we can skip extra handle.tell() calls:
@ -596,6 +615,7 @@ def SffIterator(handle, alphabet=Alphabet.generic_dna, trim=False) :
        offset = index_offset + index_length
        if offset % 8 :
            offset += 8 - (offset % 8)
        assert offset % 8 == 0
        handle.seek(offset)
    #Should now be at the end of the file...
    if handle.read(1) :
@ -906,15 +926,35 @@ if __name__ == "__main__" :
    sff = list(SffIterator(open(filename, "rb")))
    sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
    assert len(sff) == len(sff2)
    for old,new in zip(sff,sff2) :
        assert old.id == new.id
        assert str(old.seq) == str(new.seq)
    sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
    assert len(sff) == len(sff2)
    for old,new in zip(sff,sff2) :
        assert old.id == new.id
        assert str(old.seq) == str(new.seq)
    sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
    assert len(sff) == len(sff2)
    for old,new in zip(sff,sff2) :
        assert old.id == new.id
        assert str(old.seq) == str(new.seq)
    sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
    assert len(sff) == len(sff2)
    for old,new in zip(sff,sff2) :
        assert old.id == new.id
        assert str(old.seq) == str(new.seq)
    sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
    assert len(sff) == len(sff2)
    for old,new in zip(sff,sff2) :
        assert old.id == new.id
        assert str(old.seq) == str(new.seq)
    sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
@ -988,4 +1028,107 @@ if __name__ == "__main__" :
        print "Checking what happens on re-reading a handle:"
        print err
    """
    #Ugly code to make test files...
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding :
        padding = 8 - padding
    index += chr(0)*padding
    assert len(index)%8 == 0
    #Ugly bit of code to make a fake index at start
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding :
        padding = 8 - padding
    index += chr(0)*padding
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    out_handle.seek(0)
    w.write_header() #this time with index info
    w.handle.write(index)
    for record in records :
        w.write_record(record)
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
    for old, new in zip(records, records2) :
        assert str(old.seq)==str(new.seq)
    i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
    #Ugly bit of code to make a fake index in middle
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding :
        padding = 8 - padding
    index += chr(0)*padding
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    for record in records[:5] :
        w.write_record(record)
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    w.handle.write(index)
    for record in records[5:] :
        w.write_record(record)
    out_handle.seek(0)
    w.write_header() #this time with index info
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
    for old, new in zip(records, records2) :
        assert str(old.seq)==str(new.seq)
    j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
    #Ugly bit of code to make a fake index at end
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    for record in records :
        w.write_record(record)
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    out_handle.write(index)
    out_handle.seek(0)
    w.write_header() #this time with index info
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
    for old, new in zip(records, records2) :
        assert str(old.seq)==str(new.seq)
    try :
        print _sff_read_roche_index_xml(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
        assert False, "Should fail!"
    except ValueError :
        pass
    k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
    """
    print "Done"
--- a/Tests/Roche/E3MFGYR02_alt_index_at_end.sff
+++ b/Tests/Roche/E3MFGYR02_alt_index_at_end.sff
--- a/Tests/Roche/E3MFGYR02_alt_index_at_start.sff
+++ b/Tests/Roche/E3MFGYR02_alt_index_at_start.sff
--- a/Tests/Roche/E3MFGYR02_alt_index_in_middle.sff
+++ b/Tests/Roche/E3MFGYR02_alt_index_in_middle.sff
--- a/Tests/test_SeqIO_index.py
+++ b/Tests/test_SeqIO_index.py
@ -89,10 +89,16 @@ tests = [
    ("SwissProt/sp010", "swiss", None),
    ("SwissProt/sp016", "swiss", None),
    ("Roche/E3MFGYR02_random_10_reads.sff", "sff", generic_dna),
    ("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim", generic_dna),
    ("Roche/E3MFGYR02_index_at_start.sff", "sff", generic_dna),
    ("Roche/E3MFGYR02_index_in_middle.sff", "sff", generic_dna),
    ("Roche/E3MFGYR02_alt_index_at_start.sff", "sff", generic_dna),
    ("Roche/E3MFGYR02_alt_index_in_middle.sff", "sff", generic_dna),
    ("Roche/E3MFGYR02_alt_index_at_end.sff", "sff", generic_dna),
    ("Roche/greek.sff", "sff", generic_nucleotide),
    ("Roche/greek.sff", "sff-trim", generic_nucleotide),
    ("Roche/paired.sff", "sff", None),
    ("Roche/paired.sff", "sff-trim", None),
    ]
 for filename, format, alphabet in tests :
    assert format in _FormatToIndexedDict