Avoid yield in Bio.SeqIO SffIO (#4813)

This commit is contained in:
mdehoon
2024-09-06 00:40:55 +09:00
committed by GitHub
parent 7154de931a
commit 69b466f038
4 changed files with 306 additions and 333 deletions

View File

@ -275,7 +275,7 @@ def _sff_file_header(handle):
"""
# file header (part one)
# use big endiean encdoing >
# use big endian encoding >
# magic_number I
# version 4B
# index_offset Q
@ -643,155 +643,6 @@ def _sff_read_roche_index(handle):
_valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
def _sff_read_seq_record(
handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
):
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
# Now on to the reads...
# the read header format (fixed part):
# read_header_length H
# name_length H
# seq_len I
# clip_qual_left H
# clip_qual_right H
# clip_adapter_left H
# clip_adapter_right H
# [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H"
read_header_size = struct.calcsize(read_header_fmt)
read_flow_fmt = ">%iH" % number_of_flows_per_read
read_flow_size = struct.calcsize(read_flow_fmt)
(
read_header_length,
name_length,
seq_len,
clip_qual_left,
clip_qual_right,
clip_adapter_left,
clip_adapter_right,
) = struct.unpack(read_header_fmt, handle.read(read_header_size))
if clip_qual_left:
clip_qual_left -= 1 # python counting
if clip_adapter_left:
clip_adapter_left -= 1 # python counting
if read_header_length < 10 or read_header_length % 8 != 0:
raise ValueError(
"Malformed read header, says length is %i" % read_header_length
)
# now the name and any padding (remainder of header)
name = handle.read(name_length).decode()
padding = read_header_length - read_header_size - name_length
if handle.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post name %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
# now the flowgram values, flowgram index, bases and qualities
# NOTE - assuming flowgram_format==1, which means struct type H
flow_values = handle.read(read_flow_size) # unpack later if needed
temp_fmt = ">%iB" % seq_len # used for flow index and quals
flow_index = handle.read(seq_len) # unpack later if needed
seq = handle.read(seq_len) # Leave as bytes for Seq object
quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
# now any padding...
padding = (read_flow_size + seq_len * 3) % 8
if padding:
padding = 8 - padding
if handle.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post quality %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
# Follow Roche and apply most aggressive of qual and adapter clipping.
# Note Roche seems to ignore adapter clip fields when writing SFF,
# and uses just the quality clipping values for any clipping.
clip_left = max(clip_qual_left, clip_adapter_left)
# Right clipping of zero means no clipping
if clip_qual_right:
if clip_adapter_right:
clip_right = min(clip_qual_right, clip_adapter_right)
else:
# Typical case with Roche SFF files
clip_right = clip_qual_right
elif clip_adapter_right:
clip_right = clip_adapter_right
else:
clip_right = seq_len
# Now build a SeqRecord
if trim:
if clip_left >= clip_right:
# Raise an error?
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record, trimmed to nothing",
BiopythonParserWarning,
)
seq = ""
quals = []
else:
seq = seq[clip_left:clip_right].upper()
quals = quals[clip_left:clip_right]
# Don't record the clipping values, flow etc, they make no sense now:
annotations = {}
else:
if clip_left >= clip_right:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record", BiopythonParserWarning
)
seq = seq.lower()
else:
# This use of mixed case mimics the Roche SFF tool's FASTA output
seq = (
seq[:clip_left].lower()
+ seq[clip_left:clip_right].upper()
+ seq[clip_right:].lower()
)
annotations = {
"flow_values": struct.unpack(read_flow_fmt, flow_values),
"flow_index": struct.unpack(temp_fmt, flow_index),
"flow_chars": flow_chars,
"flow_key": key_sequence,
"clip_qual_left": clip_qual_left,
"clip_qual_right": clip_qual_right,
"clip_adapter_left": clip_adapter_left,
"clip_adapter_right": clip_adapter_right,
}
if re.match(_valid_UAN_read_name, name):
annotations["time"] = _get_read_time(name)
annotations["region"] = _get_read_region(name)
annotations["coords"] = _get_read_xy(name)
annotations["molecule_type"] = "DNA"
record = SeqRecord(
Seq(seq), id=name, name=name, description="", annotations=annotations
)
# Dirty trick to speed up this line:
# record.letter_annotations["phred_quality"] = quals
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
# Return the record and then continue...
return record
_powers_of_36 = [36**i for i in range(6)]
def _string_as_base_36(string):
"""Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
total = 0
@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read):
return raw
class _AddTellHandle:
"""Wrapper for handles which do not support the tell method (PRIVATE).
Intended for use with things like network handles where tell (and reverse
seek) are not supported. The SFF file needs to track the current offset in
order to deal with the index block.
"""
def __init__(self, handle):
self._handle = handle
self._offset = 0
def read(self, length):
data = self._handle.read(length)
self._offset += len(data)
return data
def tell(self):
return self._offset
def seek(self, offset):
if offset < self._offset:
raise RuntimeError("Can't seek backwards")
self._handle.read(offset - self._offset)
def close(self):
return self._handle.close()
class SffIterator(SequenceIterator):
"""Parser for Standard Flowgram Format (SFF) files."""
# the read header format (fixed part):
# read_header_length H
# name_length H
# seq_len I
# clip_qual_left H
# clip_qual_right H
# clip_adapter_left H
# clip_adapter_right H
# [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H"
read_header_size = struct.calcsize(read_header_fmt)
assert read_header_size % 8 == 0 # Important for padding calc later!
def __init__(self, source, alphabet=None, trim=False):
"""Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
@ -996,30 +831,51 @@ class SffIterator(SequenceIterator):
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="b", fmt="SFF")
self.trim = trim
stream = self.stream
(
self._offset,
self.index_offset,
self.index_length,
self.number_of_reads,
number_of_flows_per_read,
self.flow_chars,
self.key_sequence,
) = _sff_file_header(stream)
# Now on to the reads...
self.read_flow_fmt = ">%iH" % number_of_flows_per_read
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
self._read_counter = 0
def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
try:
if 0 != handle.tell():
raise ValueError("Not at start of file, offset %i" % handle.tell())
except AttributeError:
# Probably a network handle or something like that
handle = _AddTellHandle(handle)
records = self.iterate(handle)
return records
"""To be removed."""
return
def iterate(self, handle):
"""Parse the file and generate SeqRecord objects."""
trim = self.trim
(
header_length,
index_offset,
index_length,
number_of_reads,
number_of_flows_per_read,
flow_chars,
key_sequence,
) = _sff_file_header(handle)
def __next__(self):
stream = self.stream
if self._read_counter == self.number_of_reads:
self._read_counter = None # check EOF only once
self._check_eof(stream)
raise StopIteration
elif self._read_counter is None:
raise StopIteration
# The spec allows for the index block to be before or even in the middle
# of the reads. We can check that if we keep track of our position
# in the file...
index_offset = self.index_offset
if self._offset == index_offset:
index_length = self.index_length
offset = index_offset + index_length
if offset % 8:
offset += 8 - (offset % 8)
assert offset % 8 == 0
stream.seek(offset)
self._offset = offset
record = self._sff_read_seq_record(self.stream)
self._read_counter += 1
return record
def _sff_read_seq_record(self, stream):
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
# Now on to the reads...
# the read header format (fixed part):
# read_header_length H
@ -1030,44 +886,150 @@ class SffIterator(SequenceIterator):
# clip_adapter_left H
# clip_adapter_right H
# [rest of read header depends on the name length etc]
read_header_fmt = ">2HI4H"
read_header_size = struct.calcsize(read_header_fmt)
read_flow_fmt = ">%iH" % number_of_flows_per_read
read_flow_size = struct.calcsize(read_flow_fmt)
assert 1 == struct.calcsize(">B")
assert 1 == struct.calcsize(">s")
assert 1 == struct.calcsize(">c")
assert read_header_size % 8 == 0 # Important for padding calc later!
# The spec allows for the index block to be before or even in the middle
# of the reads. We can check that if we keep track of our position
# in the file...
for read in range(number_of_reads):
if index_offset and handle.tell() == index_offset:
offset = index_offset + index_length
if offset % 8:
offset += 8 - (offset % 8)
assert offset % 8 == 0
handle.seek(offset)
# Now that we've done this, we don't need to do it again. Clear
# the index_offset so we can skip extra handle.tell() calls:
index_offset = 0
yield _sff_read_seq_record(
handle, number_of_flows_per_read, flow_chars, key_sequence, trim
read_header_fmt = SffIterator.read_header_fmt
read_header_size = SffIterator.read_header_size
(
read_header_length,
name_length,
seq_len,
clip_qual_left,
clip_qual_right,
clip_adapter_left,
clip_adapter_right,
) = struct.unpack(read_header_fmt, stream.read(read_header_size))
if clip_qual_left:
clip_qual_left -= 1 # python counting
if clip_adapter_left:
clip_adapter_left -= 1 # python counting
if read_header_length < 10 or read_header_length % 8 != 0:
raise ValueError(
"Malformed read header, says length is %i" % read_header_length
)
_check_eof(handle, index_offset, index_length)
# now the name and any padding (remainder of header)
name = stream.read(name_length).decode()
padding = read_header_length - read_header_size - name_length
if stream.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
def _check_eof(handle, index_offset, index_length):
warnings.warn(
"Your SFF file is invalid, post name %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
self._offset += read_header_length
# now the flowgram values, flowgram index, bases and qualities
# NOTE - assuming flowgram_format==1, which means struct type H
flow_values = stream.read(self.read_flow_size) # unpack later if needed
temp_fmt = ">%iB" % seq_len # used for flow index and quals
flow_index = stream.read(seq_len) # unpack later if needed
seq = stream.read(seq_len) # Leave as bytes for Seq object
quals = list(struct.unpack(temp_fmt, stream.read(seq_len)))
self._offset += self.read_flow_size + seq_len * 3
# now any padding...
padding = (self.read_flow_size + seq_len * 3) % 8
if padding:
padding = 8 - padding
if stream.read(padding).count(_null) != padding:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Your SFF file is invalid, post quality %i "
"byte padding region contained data" % padding,
BiopythonParserWarning,
)
self._offset += padding
# Follow Roche and apply most aggressive of qual and adapter clipping.
# Note Roche seems to ignore adapter clip fields when writing SFF,
# and uses just the quality clipping values for any clipping.
clip_left = max(clip_qual_left, clip_adapter_left)
# Right clipping of zero means no clipping
if clip_qual_right:
if clip_adapter_right:
clip_right = min(clip_qual_right, clip_adapter_right)
else:
# Typical case with Roche SFF files
clip_right = clip_qual_right
elif clip_adapter_right:
clip_right = clip_adapter_right
else:
clip_right = seq_len
# Now build a SeqRecord
if self.trim:
if clip_left >= clip_right:
# Raise an error?
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record, trimmed to nothing",
BiopythonParserWarning,
)
seq = ""
quals = []
else:
seq = seq[clip_left:clip_right].upper()
quals = quals[clip_left:clip_right]
# Don't record the clipping values, flow etc, they make no sense now:
annotations = {}
else:
if clip_left >= clip_right:
import warnings
from Bio import BiopythonParserWarning
warnings.warn(
"Overlapping clip values in SFF record", BiopythonParserWarning
)
seq = seq.lower()
else:
# This use of mixed case mimics the Roche SFF tool's FASTA output
seq = (
seq[:clip_left].lower()
+ seq[clip_left:clip_right].upper()
+ seq[clip_right:].lower()
)
annotations = {
"flow_values": struct.unpack(self.read_flow_fmt, flow_values),
"flow_index": struct.unpack(temp_fmt, flow_index),
"flow_chars": self.flow_chars,
"flow_key": self.key_sequence,
"clip_qual_left": clip_qual_left,
"clip_qual_right": clip_qual_right,
"clip_adapter_left": clip_adapter_left,
"clip_adapter_right": clip_adapter_right,
}
if re.match(_valid_UAN_read_name, name):
annotations["time"] = _get_read_time(name)
annotations["region"] = _get_read_region(name)
annotations["coords"] = _get_read_xy(name)
annotations["molecule_type"] = "DNA"
record = SeqRecord(
Seq(seq), id=name, name=name, description="", annotations=annotations
)
# Dirty trick to speed up this line:
# record.letter_annotations["phred_quality"] = quals
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
# Return the record and then continue...
return record
def _check_eof(self, stream):
"""Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
Will attempt to spot apparent SFF file concatenation and give an error.
Will not attempt to seek, only moves the handle forward.
Will not attempt to seek, only moves the stream forward.
"""
offset = handle.tell()
offset = self._offset
extra = b""
padding = 0
index_offset = self.index_offset
index_length = self.index_length
if index_offset and offset <= index_offset:
# Index block then end of file...
if offset < index_offset:
@ -1078,17 +1040,19 @@ def _check_eof(handle, index_offset, index_length):
)
# Doing read to jump the index rather than a seek
# in case this is a network handle or similar
handle.read(index_offset + index_length - offset)
stream.read(index_length)
self._offset += index_length
offset = index_offset + index_length
if offset != handle.tell():
if offset != self._offset:
raise ValueError(
"Wanted %i, got %i, index is %i to %i"
% (offset, handle.tell(), index_offset, index_offset + index_length)
% (offset, stream.tell(), index_offset, index_offset + index_length)
)
if offset % 8:
padding = 8 - (offset % 8)
extra = handle.read(padding)
extra = stream.read(padding)
self._offset += padding
if padding >= 4 and extra[-4:] == _sff:
# Seen this in one user supplied file, should have been
@ -1123,11 +1087,14 @@ def _check_eof(handle, index_offset, index_length):
BiopythonParserWarning,
)
offset = handle.tell()
offset = self._offset
if offset % 8 != 0:
raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
raise ValueError(
"Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
)
# Should now be at the end of the file...
extra = handle.read(4)
extra = stream.read(4)
self._offset += 4
if extra == _sff:
raise ValueError(
"Additional data at end of SFF file, "
@ -1135,7 +1102,12 @@ def _check_eof(handle, index_offset, index_length):
"See offset %i" % offset
)
elif extra:
raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
raise ValueError(
"Additional data at end of SFF file, see offset %i" % offset
)
_powers_of_36 = [36**i for i in range(6)]
class _SffTrimIterator(SffIterator):

View File

@ -27,6 +27,7 @@ re-indexing the file for use another time.
"""
import re
import struct
from io import BytesIO
from io import StringIO
@ -67,13 +68,16 @@ class SffRandomAccess(SeqFileRandomAccess):
SeqFileRandomAccess.__init__(self, filename, format)
(
header_length,
index_offset,
index_length,
self.index_offset,
self.index_length,
number_of_reads,
self._flows_per_read,
self._flow_chars,
self._key_sequence,
self.number_of_flows_per_read,
self.flow_chars,
self.key_sequence,
) = SeqIO.SffIO._sff_file_header(self._handle)
# Now on to the reads...
self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
def __iter__(self):
"""Load any index block in the file, or build it the slow way (PRIVATE)."""
@ -85,9 +89,9 @@ class SffRandomAccess(SeqFileRandomAccess):
index_offset,
index_length,
number_of_reads,
self._flows_per_read,
self._flow_chars,
self._key_sequence,
self.number_of_flows_per_read,
self.flow_chars,
self.key_sequence,
) = SeqIO.SffIO._sff_file_header(handle)
if index_offset and index_length:
# There is an index provided, try this the fast way:
@ -120,9 +124,12 @@ class SffRandomAccess(SeqFileRandomAccess):
# Can have an index at start (or mid-file)
handle.seek(max_offset)
# Parse the final read,
SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
SeqIO.SffIO._sff_read_raw_record(
handle, self.number_of_flows_per_read
)
# Should now be at the end of the file!
SeqIO.SffIO._check_eof(handle, index_offset, index_length)
self._offset = handle.tell()
SeqIO.SffIO.SffIterator._check_eof(self, handle)
return
# We used to give a warning in this case, but Ion Torrent's
# SFF files don't have an index so that would be annoying.
@ -135,21 +142,23 @@ class SffRandomAccess(SeqFileRandomAccess):
raise ValueError(
"Indexed %i records, expected %i" % (count, number_of_reads)
)
SeqIO.SffIO._check_eof(handle, index_offset, index_length)
self._offset = handle.tell()
SeqIO.SffIO.SffIterator._check_eof(self, handle)
def get(self, offset):
"""Return the SeqRecord starting at the given offset."""
handle = self._handle
handle.seek(offset)
return SeqIO.SffIO._sff_read_seq_record(
handle, self._flows_per_read, self._flow_chars, self._key_sequence
)
self._offset = offset
self.trim = False
return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
def get_raw(self, offset):
"""Return the raw record from the file as a bytes string."""
handle = self._handle
handle.seek(offset)
return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
self.stream = handle
return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read)
class SffTrimedRandomAccess(SffRandomAccess):
@ -159,13 +168,9 @@ class SffTrimedRandomAccess(SffRandomAccess):
"""Return the SeqRecord starting at the given offset."""
handle = self._handle
handle.seek(offset)
return SeqIO.SffIO._sff_read_seq_record(
handle,
self._flows_per_read,
self._flow_chars,
self._key_sequence,
trim=True,
)
self._offset = offset
self.trim = True
return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
###################

View File

@ -635,20 +635,16 @@ class IndexDictTests(SeqRecordTestBaseClass, SeqIOTestBaseClass):
else:
raise RuntimeError(f"Unexpected mode {mode}")
if fmt == "sff":
rec2 = SeqIO.SffIO._sff_read_seq_record(
handle,
rec_dict._proxy._flows_per_read,
rec_dict._proxy._flow_chars,
rec_dict._proxy._key_sequence,
trim=False,
rec_dict._proxy.trim = False
rec_dict._proxy._offset = handle.tell()
rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
rec_dict._proxy, handle
)
elif fmt == "sff-trim":
rec2 = SeqIO.SffIO._sff_read_seq_record(
handle,
rec_dict._proxy._flows_per_read,
rec_dict._proxy._flow_chars,
rec_dict._proxy._key_sequence,
trim=True,
rec_dict._proxy.trim = True
rec_dict._proxy._offset = handle.tell()
rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
rec_dict._proxy, handle
)
elif fmt == "uniprot-xml":
self.assertTrue(raw.startswith(b"<entry "), msg=msg)