Avoid yield in Bio.SeqIO SffIO (#4813)

This commit is contained in:
mdehoon
2024-09-06 00:40:55 +09:00
committed by GitHub
parent 7154de931a
commit 69b466f038
4 changed files with 306 additions and 333 deletions

View File

@ -275,7 +275,7 @@ def _sff_file_header(handle):
""" """
# file header (part one) # file header (part one)
# use big endiean encdoing > # use big endian encoding >
# magic_number I # magic_number I
# version 4B # version 4B
# index_offset Q # index_offset Q
@ -643,155 +643,6 @@ def _sff_read_roche_index(handle):
_valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$") _valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
def _sff_read_seq_record(
handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
):
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
# Now on to the reads...
# the read header format (fixed part):
# read_header_length H
# name_length H
# seq_len I
# clip_qual_left H
# clip_qual_right H
# clip_adapter_left H
# clip_adapter_right H
# [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H"
read_header_size = struct.calcsize(read_header_fmt)
read_flow_fmt = ">%iH" % number_of_flows_per_read
read_flow_size = struct.calcsize(read_flow_fmt)
(
read_header_length,
name_length,
seq_len,
clip_qual_left,
clip_qual_right,
clip_adapter_left,
clip_adapter_right,
) = struct.unpack(read_header_fmt, handle.read(read_header_size))
if clip_qual_left:
clip_qual_left -= 1 # python counting
if clip_adapter_left:
clip_adapter_left -= 1 # python counting
if read_header_length < 10 or read_header_length % 8 != 0:
raise ValueError(
"Malformed read header, says length is %i" % read_header_length
)
# now the name and any padding (remainder of header)
name = handle.read(name_length).decode()
padding = read_header_length - read_header_size - name_length
if handle.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post name %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
# now the flowgram values, flowgram index, bases and qualities
# NOTE - assuming flowgram_format==1, which means struct type H
flow_values = handle.read(read_flow_size) # unpack later if needed
temp_fmt = ">%iB" % seq_len # used for flow index and quals
flow_index = handle.read(seq_len) # unpack later if needed
seq = handle.read(seq_len) # Leave as bytes for Seq object
quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
# now any padding...
padding = (read_flow_size + seq_len * 3) % 8
if padding:
padding = 8 - padding
if handle.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post quality %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
# Follow Roche and apply most aggressive of qual and adapter clipping.
# Note Roche seems to ignore adapter clip fields when writing SFF,
# and uses just the quality clipping values for any clipping.
clip_left = max(clip_qual_left, clip_adapter_left)
# Right clipping of zero means no clipping
if clip_qual_right:
if clip_adapter_right:
clip_right = min(clip_qual_right, clip_adapter_right)
else:
# Typical case with Roche SFF files
clip_right = clip_qual_right
elif clip_adapter_right:
clip_right = clip_adapter_right
else:
clip_right = seq_len
# Now build a SeqRecord
if trim:
if clip_left >= clip_right:
# Raise an error?
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record, trimmed to nothing",
BiopythonParserWarning,
)
seq = ""
quals = []
else:
seq = seq[clip_left:clip_right].upper()
quals = quals[clip_left:clip_right]
# Don't record the clipping values, flow etc, they make no sense now:
annotations = {}
else:
if clip_left >= clip_right:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record", BiopythonParserWarning
)
seq = seq.lower()
else:
# This use of mixed case mimics the Roche SFF tool's FASTA output
seq = (
seq[:clip_left].lower()
+ seq[clip_left:clip_right].upper()
+ seq[clip_right:].lower()
)
annotations = {
"flow_values": struct.unpack(read_flow_fmt, flow_values),
"flow_index": struct.unpack(temp_fmt, flow_index),
"flow_chars": flow_chars,
"flow_key": key_sequence,
"clip_qual_left": clip_qual_left,
"clip_qual_right": clip_qual_right,
"clip_adapter_left": clip_adapter_left,
"clip_adapter_right": clip_adapter_right,
}
if re.match(_valid_UAN_read_name, name):
annotations["time"] = _get_read_time(name)
annotations["region"] = _get_read_region(name)
annotations["coords"] = _get_read_xy(name)
annotations["molecule_type"] = "DNA"
record = SeqRecord(
Seq(seq), id=name, name=name, description="", annotations=annotations
)
# Dirty trick to speed up this line:
# record.letter_annotations["phred_quality"] = quals
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
# Return the record and then continue...
return record
_powers_of_36 = [36**i for i in range(6)]
def _string_as_base_36(string): def _string_as_base_36(string):
"""Interpret a string as a base-36 number as per 454 manual (PRIVATE).""" """Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
total = 0 total = 0
@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read):
return raw return raw
class _AddTellHandle:
"""Wrapper for handles which do not support the tell method (PRIVATE).
Intended for use with things like network handles where tell (and reverse
seek) are not supported. The SFF file needs to track the current offset in
order to deal with the index block.
"""
def __init__(self, handle):
self._handle = handle
self._offset = 0
def read(self, length):
data = self._handle.read(length)
self._offset += len(data)
return data
def tell(self):
return self._offset
def seek(self, offset):
if offset < self._offset:
raise RuntimeError("Can't seek backwards")
self._handle.read(offset - self._offset)
def close(self):
return self._handle.close()
class SffIterator(SequenceIterator): class SffIterator(SequenceIterator):
"""Parser for Standard Flowgram Format (SFF) files.""" """Parser for Standard Flowgram Format (SFF) files."""
# the read header format (fixed part):
# read_header_length H
# name_length H
# seq_len I
# clip_qual_left H
# clip_qual_right H
# clip_adapter_left H
# clip_adapter_right H
# [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H"
read_header_size = struct.calcsize(read_header_fmt)
assert read_header_size % 8 == 0 # Important for padding calc later!
def __init__(self, source, alphabet=None, trim=False): def __init__(self, source, alphabet=None, trim=False):
"""Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects). """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
@ -996,30 +831,51 @@ class SffIterator(SequenceIterator):
raise ValueError("The alphabet argument is no longer supported") raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="b", fmt="SFF") super().__init__(source, mode="b", fmt="SFF")
self.trim = trim self.trim = trim
stream = self.stream
(
self._offset,
self.index_offset,
self.index_length,
self.number_of_reads,
number_of_flows_per_read,
self.flow_chars,
self.key_sequence,
) = _sff_file_header(stream)
# Now on to the reads...
self.read_flow_fmt = ">%iH" % number_of_flows_per_read
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
self._read_counter = 0
def parse(self, handle): def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator.""" """To be removed."""
try: return
if 0 != handle.tell():
raise ValueError("Not at start of file, offset %i" % handle.tell())
except AttributeError:
# Probably a network handle or something like that
handle = _AddTellHandle(handle)
records = self.iterate(handle)
return records
def iterate(self, handle): def __next__(self):
"""Parse the file and generate SeqRecord objects.""" stream = self.stream
trim = self.trim if self._read_counter == self.number_of_reads:
( self._read_counter = None # check EOF only once
header_length, self._check_eof(stream)
index_offset, raise StopIteration
index_length, elif self._read_counter is None:
number_of_reads, raise StopIteration
number_of_flows_per_read, # The spec allows for the index block to be before or even in the middle
flow_chars, # of the reads. We can check that if we keep track of our position
key_sequence, # in the file...
) = _sff_file_header(handle) index_offset = self.index_offset
if self._offset == index_offset:
index_length = self.index_length
offset = index_offset + index_length
if offset % 8:
offset += 8 - (offset % 8)
assert offset % 8 == 0
stream.seek(offset)
self._offset = offset
record = self._sff_read_seq_record(self.stream)
self._read_counter += 1
return record
def _sff_read_seq_record(self, stream):
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
# Now on to the reads... # Now on to the reads...
# the read header format (fixed part): # the read header format (fixed part):
# read_header_length H # read_header_length H
@ -1030,44 +886,150 @@ class SffIterator(SequenceIterator):
# clip_adapter_left H # clip_adapter_left H
# clip_adapter_right H # clip_adapter_right H
# [rest of read header depends on the name length etc] # [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H" read_header_fmt = SffIterator.read_header_fmt
read_header_size = struct.calcsize(read_header_fmt) read_header_size = SffIterator.read_header_size
read_flow_fmt = ">%iH" % number_of_flows_per_read (
read_flow_size = struct.calcsize(read_flow_fmt) read_header_length,
assert 1 == struct.calcsize(">B") name_length,
assert 1 == struct.calcsize(">s") seq_len,
assert 1 == struct.calcsize(">c") clip_qual_left,
assert read_header_size % 8 == 0 # Important for padding calc later! clip_qual_right,
# The spec allows for the index block to be before or even in the middle clip_adapter_left,
# of the reads. We can check that if we keep track of our position clip_adapter_right,
# in the file... ) = struct.unpack(read_header_fmt, stream.read(read_header_size))
for read in range(number_of_reads): if clip_qual_left:
if index_offset and handle.tell() == index_offset: clip_qual_left -= 1 # python counting
offset = index_offset + index_length if clip_adapter_left:
if offset % 8: clip_adapter_left -= 1 # python counting
offset += 8 - (offset % 8) if read_header_length < 10 or read_header_length % 8 != 0:
assert offset % 8 == 0 raise ValueError(
handle.seek(offset) "Malformed read header, says length is %i" % read_header_length
# Now that we've done this, we don't need to do it again. Clear
# the index_offset so we can skip extra handle.tell() calls:
index_offset = 0
yield _sff_read_seq_record(
handle, number_of_flows_per_read, flow_chars, key_sequence, trim
) )
_check_eof(handle, index_offset, index_length) # now the name and any padding (remainder of header)
name = stream.read(name_length).decode()
padding = read_header_length - read_header_size - name_length
if stream.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
def _check_eof(handle, index_offset, index_length): warnings.warn(
"Your SFF file is invalid, post name %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
self._offset += read_header_length
# now the flowgram values, flowgram index, bases and qualities
# NOTE - assuming flowgram_format==1, which means struct type H
flow_values = stream.read(self.read_flow_size) # unpack later if needed
temp_fmt = ">%iB" % seq_len # used for flow index and quals
flow_index = stream.read(seq_len) # unpack later if needed
seq = stream.read(seq_len) # Leave as bytes for Seq object
quals = list(struct.unpack(temp_fmt, stream.read(seq_len)))
self._offset += self.read_flow_size + seq_len * 3
# now any padding...
padding = (self.read_flow_size + seq_len * 3) % 8
if padding:
padding = 8 - padding
if stream.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post quality %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
self._offset += padding
# Follow Roche and apply most aggressive of qual and adapter clipping.
# Note Roche seems to ignore adapter clip fields when writing SFF,
# and uses just the quality clipping values for any clipping.
clip_left = max(clip_qual_left, clip_adapter_left)
# Right clipping of zero means no clipping
if clip_qual_right:
if clip_adapter_right:
clip_right = min(clip_qual_right, clip_adapter_right)
else:
# Typical case with Roche SFF files
clip_right = clip_qual_right
elif clip_adapter_right:
clip_right = clip_adapter_right
else:
clip_right = seq_len
# Now build a SeqRecord
if self.trim:
if clip_left >= clip_right:
# Raise an error?
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record, trimmed to nothing",
BiopythonParserWarning,
)
seq = ""
quals = []
else:
seq = seq[clip_left:clip_right].upper()
quals = quals[clip_left:clip_right]
# Don't record the clipping values, flow etc, they make no sense now:
annotations = {}
else:
if clip_left >= clip_right:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record", BiopythonParserWarning
)
seq = seq.lower()
else:
# This use of mixed case mimics the Roche SFF tool's FASTA output
seq = (
seq[:clip_left].lower()
+ seq[clip_left:clip_right].upper()
+ seq[clip_right:].lower()
)
annotations = {
"flow_values": struct.unpack(self.read_flow_fmt, flow_values),
"flow_index": struct.unpack(temp_fmt, flow_index),
"flow_chars": self.flow_chars,
"flow_key": self.key_sequence,
"clip_qual_left": clip_qual_left,
"clip_qual_right": clip_qual_right,
"clip_adapter_left": clip_adapter_left,
"clip_adapter_right": clip_adapter_right,
}
if re.match(_valid_UAN_read_name, name):
annotations["time"] = _get_read_time(name)
annotations["region"] = _get_read_region(name)
annotations["coords"] = _get_read_xy(name)
annotations["molecule_type"] = "DNA"
record = SeqRecord(
Seq(seq), id=name, name=name, description="", annotations=annotations
)
# Dirty trick to speed up this line:
# record.letter_annotations["phred_quality"] = quals
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
# Return the record and then continue...
return record
def _check_eof(self, stream):
"""Check final padding is OK (8 byte alignment) and file ends (PRIVATE). """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
Will attempt to spot apparent SFF file concatenation and give an error. Will attempt to spot apparent SFF file concatenation and give an error.
Will not attempt to seek, only moves the handle forward. Will not attempt to seek, only moves the stream forward.
""" """
offset = handle.tell() offset = self._offset
extra = b"" extra = b""
padding = 0 padding = 0
index_offset = self.index_offset
index_length = self.index_length
if index_offset and offset <= index_offset: if index_offset and offset <= index_offset:
# Index block then end of file... # Index block then end of file...
if offset < index_offset: if offset < index_offset:
@ -1078,17 +1040,19 @@ def _check_eof(handle, index_offset, index_length):
) )
# Doing read to jump the index rather than a seek # Doing read to jump the index rather than a seek
# in case this is a network handle or similar # in case this is a network handle or similar
handle.read(index_offset + index_length - offset) stream.read(index_length)
self._offset += index_length
offset = index_offset + index_length offset = index_offset + index_length
if offset != handle.tell(): if offset != self._offset:
raise ValueError( raise ValueError(
"Wanted %i, got %i, index is %i to %i" "Wanted %i, got %i, index is %i to %i"
% (offset, handle.tell(), index_offset, index_offset + index_length) % (offset, stream.tell(), index_offset, index_offset + index_length)
) )
if offset % 8: if offset % 8:
padding = 8 - (offset % 8) padding = 8 - (offset % 8)
extra = handle.read(padding) extra = stream.read(padding)
self._offset += padding
if padding >= 4 and extra[-4:] == _sff: if padding >= 4 and extra[-4:] == _sff:
# Seen this in one user supplied file, should have been # Seen this in one user supplied file, should have been
@ -1123,11 +1087,14 @@ def _check_eof(handle, index_offset, index_length):
BiopythonParserWarning, BiopythonParserWarning,
) )
offset = handle.tell() offset = self._offset
if offset % 8 != 0: if offset % 8 != 0:
raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)) raise ValueError(
"Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
)
# Should now be at the end of the file... # Should now be at the end of the file...
extra = handle.read(4) extra = stream.read(4)
self._offset += 4
if extra == _sff: if extra == _sff:
raise ValueError( raise ValueError(
"Additional data at end of SFF file, " "Additional data at end of SFF file, "
@ -1135,7 +1102,12 @@ def _check_eof(handle, index_offset, index_length):
"See offset %i" % offset "See offset %i" % offset
) )
elif extra: elif extra:
raise ValueError("Additional data at end of SFF file, see offset %i" % offset) raise ValueError(
"Additional data at end of SFF file, see offset %i" % offset
)
_powers_of_36 = [36**i for i in range(6)]
class _SffTrimIterator(SffIterator): class _SffTrimIterator(SffIterator):

View File

@ -27,6 +27,7 @@ re-indexing the file for use another time.
""" """
import re import re
import struct
from io import BytesIO from io import BytesIO
from io import StringIO from io import StringIO
@ -67,13 +68,16 @@ class SffRandomAccess(SeqFileRandomAccess):
SeqFileRandomAccess.__init__(self, filename, format) SeqFileRandomAccess.__init__(self, filename, format)
( (
header_length, header_length,
index_offset, self.index_offset,
index_length, self.index_length,
number_of_reads, number_of_reads,
self._flows_per_read, self.number_of_flows_per_read,
self._flow_chars, self.flow_chars,
self._key_sequence, self.key_sequence,
) = SeqIO.SffIO._sff_file_header(self._handle) ) = SeqIO.SffIO._sff_file_header(self._handle)
# Now on to the reads...
self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
def __iter__(self): def __iter__(self):
"""Load any index block in the file, or build it the slow way (PRIVATE).""" """Load any index block in the file, or build it the slow way (PRIVATE)."""
@ -85,9 +89,9 @@ class SffRandomAccess(SeqFileRandomAccess):
index_offset, index_offset,
index_length, index_length,
number_of_reads, number_of_reads,
self._flows_per_read, self.number_of_flows_per_read,
self._flow_chars, self.flow_chars,
self._key_sequence, self.key_sequence,
) = SeqIO.SffIO._sff_file_header(handle) ) = SeqIO.SffIO._sff_file_header(handle)
if index_offset and index_length: if index_offset and index_length:
# There is an index provided, try this the fast way: # There is an index provided, try this the fast way:
@ -120,9 +124,12 @@ class SffRandomAccess(SeqFileRandomAccess):
# Can have an index at start (or mid-file) # Can have an index at start (or mid-file)
handle.seek(max_offset) handle.seek(max_offset)
# Parse the final read, # Parse the final read,
SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) SeqIO.SffIO._sff_read_raw_record(
handle, self.number_of_flows_per_read
)
# Should now be at the end of the file! # Should now be at the end of the file!
SeqIO.SffIO._check_eof(handle, index_offset, index_length) self._offset = handle.tell()
SeqIO.SffIO.SffIterator._check_eof(self, handle)
return return
# We used to give a warning in this case, but Ion Torrent's # We used to give a warning in this case, but Ion Torrent's
# SFF files don't have an index so that would be annoying. # SFF files don't have an index so that would be annoying.
@ -135,21 +142,23 @@ class SffRandomAccess(SeqFileRandomAccess):
raise ValueError( raise ValueError(
"Indexed %i records, expected %i" % (count, number_of_reads) "Indexed %i records, expected %i" % (count, number_of_reads)
) )
SeqIO.SffIO._check_eof(handle, index_offset, index_length) self._offset = handle.tell()
SeqIO.SffIO.SffIterator._check_eof(self, handle)
def get(self, offset): def get(self, offset):
"""Return the SeqRecord starting at the given offset.""" """Return the SeqRecord starting at the given offset."""
handle = self._handle handle = self._handle
handle.seek(offset) handle.seek(offset)
return SeqIO.SffIO._sff_read_seq_record( self._offset = offset
handle, self._flows_per_read, self._flow_chars, self._key_sequence self.trim = False
) return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
def get_raw(self, offset): def get_raw(self, offset):
"""Return the raw record from the file as a bytes string.""" """Return the raw record from the file as a bytes string."""
handle = self._handle handle = self._handle
handle.seek(offset) handle.seek(offset)
return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) self.stream = handle
return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read)
class SffTrimedRandomAccess(SffRandomAccess): class SffTrimedRandomAccess(SffRandomAccess):
@ -159,13 +168,9 @@ class SffTrimedRandomAccess(SffRandomAccess):
"""Return the SeqRecord starting at the given offset.""" """Return the SeqRecord starting at the given offset."""
handle = self._handle handle = self._handle
handle.seek(offset) handle.seek(offset)
return SeqIO.SffIO._sff_read_seq_record( self._offset = offset
handle, self.trim = True
self._flows_per_read, return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
self._flow_chars,
self._key_sequence,
trim=True,
)
################### ###################

View File

@ -635,20 +635,16 @@ class IndexDictTests(SeqRecordTestBaseClass, SeqIOTestBaseClass):
else: else:
raise RuntimeError(f"Unexpected mode {mode}") raise RuntimeError(f"Unexpected mode {mode}")
if fmt == "sff": if fmt == "sff":
rec2 = SeqIO.SffIO._sff_read_seq_record( rec_dict._proxy.trim = False
handle, rec_dict._proxy._offset = handle.tell()
rec_dict._proxy._flows_per_read, rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
rec_dict._proxy._flow_chars, rec_dict._proxy, handle
rec_dict._proxy._key_sequence,
trim=False,
) )
elif fmt == "sff-trim": elif fmt == "sff-trim":
rec2 = SeqIO.SffIO._sff_read_seq_record( rec_dict._proxy.trim = True
handle, rec_dict._proxy._offset = handle.tell()
rec_dict._proxy._flows_per_read, rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
rec_dict._proxy._flow_chars, rec_dict._proxy, handle
rec_dict._proxy._key_sequence,
trim=True,
) )
elif fmt == "uniprot-xml": elif fmt == "uniprot-xml":
self.assertTrue(raw.startswith(b"<entry "), msg=msg) self.assertTrue(raw.startswith(b"<entry "), msg=msg)