Avoid yield in Bio.SeqIO SffIO (#4813)

2025-10-20 21:53:47 +08:00 · 2024-09-06 00:40:55 +09:00
parent 7154de931a
commit 69b466f038
4 changed files with 306 additions and 333 deletions
--- a/Bio/SeqIO/SffIO.py
+++ b/Bio/SeqIO/SffIO.py
@ -275,7 +275,7 @@ def _sff_file_header(handle):
    """
    # file header (part one)
-    # use big endiean encdoing   >
+    # use big endian encoding    >
    # magic_number               I
    # version                    4B
    # index_offset               Q
@ -643,155 +643,6 @@ def _sff_read_roche_index(handle):
 _valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
 def _sff_read_seq_record(
    handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
 ):
    """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
    # Now on to the reads...
    # the read header format (fixed part):
    # read_header_length     H
    # name_length            H
    # seq_len                I
    # clip_qual_left         H
    # clip_qual_right        H
    # clip_adapter_left      H
    # clip_adapter_right     H
    # [rest of read header depends on the name length etc]
    read_header_fmt = ">2HI4H"
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    (
        read_header_length,
        name_length,
        seq_len,
        clip_qual_left,
        clip_qual_right,
        clip_adapter_left,
        clip_adapter_right,
    ) = struct.unpack(read_header_fmt, handle.read(read_header_size))
    if clip_qual_left:
        clip_qual_left -= 1  # python counting
    if clip_adapter_left:
        clip_adapter_left -= 1  # python counting
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError(
            "Malformed read header, says length is %i" % read_header_length
        )
    # now the name and any padding (remainder of header)
    name = handle.read(name_length).decode()
    padding = read_header_length - read_header_size - name_length
    if handle.read(padding).count(_null) != padding:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn(
            "Your SFF file is invalid, post name %i "
            "byte padding region contained data" % padding,
            BiopythonParserWarning,
        )
    # now the flowgram values, flowgram index, bases and qualities
    # NOTE - assuming flowgram_format==1, which means struct type H
    flow_values = handle.read(read_flow_size)  # unpack later if needed
    temp_fmt = ">%iB" % seq_len  # used for flow index and quals
    flow_index = handle.read(seq_len)  # unpack later if needed
    seq = handle.read(seq_len)  # Leave as bytes for Seq object
    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
    # now any padding...
    padding = (read_flow_size + seq_len * 3) % 8
    if padding:
        padding = 8 - padding
        if handle.read(padding).count(_null) != padding:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn(
                "Your SFF file is invalid, post quality %i "
                "byte padding region contained data" % padding,
                BiopythonParserWarning,
            )
    # Follow Roche and apply most aggressive of qual and adapter clipping.
    # Note Roche seems to ignore adapter clip fields when writing SFF,
    # and uses just the quality clipping values for any clipping.
    clip_left = max(clip_qual_left, clip_adapter_left)
    # Right clipping of zero means no clipping
    if clip_qual_right:
        if clip_adapter_right:
            clip_right = min(clip_qual_right, clip_adapter_right)
        else:
            # Typical case with Roche SFF files
            clip_right = clip_qual_right
    elif clip_adapter_right:
        clip_right = clip_adapter_right
    else:
        clip_right = seq_len
    # Now build a SeqRecord
    if trim:
        if clip_left >= clip_right:
            # Raise an error?
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn(
                "Overlapping clip values in SFF record, trimmed to nothing",
                BiopythonParserWarning,
            )
            seq = ""
            quals = []
        else:
            seq = seq[clip_left:clip_right].upper()
            quals = quals[clip_left:clip_right]
        # Don't record the clipping values, flow etc, they make no sense now:
        annotations = {}
    else:
        if clip_left >= clip_right:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn(
                "Overlapping clip values in SFF record", BiopythonParserWarning
            )
            seq = seq.lower()
        else:
            # This use of mixed case mimics the Roche SFF tool's FASTA output
            seq = (
                seq[:clip_left].lower()
                + seq[clip_left:clip_right].upper()
                + seq[clip_right:].lower()
            )
        annotations = {
            "flow_values": struct.unpack(read_flow_fmt, flow_values),
            "flow_index": struct.unpack(temp_fmt, flow_index),
            "flow_chars": flow_chars,
            "flow_key": key_sequence,
            "clip_qual_left": clip_qual_left,
            "clip_qual_right": clip_qual_right,
            "clip_adapter_left": clip_adapter_left,
            "clip_adapter_right": clip_adapter_right,
        }
    if re.match(_valid_UAN_read_name, name):
        annotations["time"] = _get_read_time(name)
        annotations["region"] = _get_read_region(name)
        annotations["coords"] = _get_read_xy(name)
    annotations["molecule_type"] = "DNA"
    record = SeqRecord(
        Seq(seq), id=name, name=name, description="", annotations=annotations
    )
    # Dirty trick to speed up this line:
    # record.letter_annotations["phred_quality"] = quals
    dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
    # Return the record and then continue...
    return record
 _powers_of_36 = [36**i for i in range(6)]
 def _string_as_base_36(string):
    """Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
    total = 0
@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read):
    return raw
 class _AddTellHandle:
    """Wrapper for handles which do not support the tell method (PRIVATE).
    Intended for use with things like network handles where tell (and reverse
    seek) are not supported. The SFF file needs to track the current offset in
    order to deal with the index block.
    """
    def __init__(self, handle):
        self._handle = handle
        self._offset = 0
    def read(self, length):
        data = self._handle.read(length)
        self._offset += len(data)
        return data
    def tell(self):
        return self._offset
    def seek(self, offset):
        if offset < self._offset:
            raise RuntimeError("Can't seek backwards")
        self._handle.read(offset - self._offset)
    def close(self):
        return self._handle.close()
 class SffIterator(SequenceIterator):
    """Parser for Standard Flowgram Format (SFF) files."""
    # the read header format (fixed part):
    # read_header_length     H
    # name_length            H
    # seq_len                I
    # clip_qual_left         H
    # clip_qual_right        H
    # clip_adapter_left      H
    # clip_adapter_right     H
    # [rest of read header depends on the name length etc]
    read_header_fmt = ">2HI4H"
    read_header_size = struct.calcsize(read_header_fmt)
    assert read_header_size % 8 == 0  # Important for padding calc later!
    def __init__(self, source, alphabet=None, trim=False):
        """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
@ -996,30 +831,51 @@ class SffIterator(SequenceIterator):
            raise ValueError("The alphabet argument is no longer supported")
        super().__init__(source, mode="b", fmt="SFF")
        self.trim = trim
        stream = self.stream
        (
            self._offset,
            self.index_offset,
            self.index_length,
            self.number_of_reads,
            number_of_flows_per_read,
            self.flow_chars,
            self.key_sequence,
        ) = _sff_file_header(stream)
        # Now on to the reads...
        self.read_flow_fmt = ">%iH" % number_of_flows_per_read
        self.read_flow_size = struct.calcsize(self.read_flow_fmt)
        self._read_counter = 0
    def parse(self, handle):
-        """Start parsing the file, and return a SeqRecord generator."""
+        """To be removed."""
-        try:
+        return
            if 0 != handle.tell():
                raise ValueError("Not at start of file, offset %i" % handle.tell())
        except AttributeError:
            # Probably a network handle or something like that
            handle = _AddTellHandle(handle)
        records = self.iterate(handle)
        return records
-    def iterate(self, handle):
+    def __next__(self):
-        """Parse the file and generate SeqRecord objects."""
+        stream = self.stream
-        trim = self.trim
+        if self._read_counter == self.number_of_reads:
-        (
+            self._read_counter = None  # check EOF only once
-            header_length,
+            self._check_eof(stream)
-            index_offset,
+            raise StopIteration
-            index_length,
+        elif self._read_counter is None:
-            number_of_reads,
+            raise StopIteration
-            number_of_flows_per_read,
+        # The spec allows for the index block to be before or even in the middle
-            flow_chars,
+        # of the reads. We can check that if we keep track of our position
-            key_sequence,
+        # in the file...
-        ) = _sff_file_header(handle)
+        index_offset = self.index_offset
        if self._offset == index_offset:
            index_length = self.index_length
            offset = index_offset + index_length
            if offset % 8:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            stream.seek(offset)
            self._offset = offset
        record = self._sff_read_seq_record(self.stream)
        self._read_counter += 1
        return record
    def _sff_read_seq_record(self, stream):
        """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
        # Now on to the reads...
        # the read header format (fixed part):
        # read_header_length     H
@ -1030,44 +886,150 @@ class SffIterator(SequenceIterator):
        # clip_adapter_left      H
        # clip_adapter_right     H
        # [rest of read header depends on the name length etc]
-        read_header_fmt = ">2HI4H"
+        read_header_fmt = SffIterator.read_header_fmt
-        read_header_size = struct.calcsize(read_header_fmt)
+        read_header_size = SffIterator.read_header_size
-        read_flow_fmt = ">%iH" % number_of_flows_per_read
+        (
-        read_flow_size = struct.calcsize(read_flow_fmt)
+            read_header_length,
-        assert 1 == struct.calcsize(">B")
+            name_length,
-        assert 1 == struct.calcsize(">s")
+            seq_len,
-        assert 1 == struct.calcsize(">c")
+            clip_qual_left,
-        assert read_header_size % 8 == 0  # Important for padding calc later!
+            clip_qual_right,
-        # The spec allows for the index block to be before or even in the middle
+            clip_adapter_left,
-        # of the reads. We can check that if we keep track of our position
+            clip_adapter_right,
-        # in the file...
+        ) = struct.unpack(read_header_fmt, stream.read(read_header_size))
-        for read in range(number_of_reads):
+        if clip_qual_left:
-            if index_offset and handle.tell() == index_offset:
+            clip_qual_left -= 1  # python counting
-                offset = index_offset + index_length
+        if clip_adapter_left:
-                if offset % 8:
+            clip_adapter_left -= 1  # python counting
-                    offset += 8 - (offset % 8)
+        if read_header_length < 10 or read_header_length % 8 != 0:
-                assert offset % 8 == 0
+            raise ValueError(
-                handle.seek(offset)
+                "Malformed read header, says length is %i" % read_header_length
                # Now that we've done this, we don't need to do it again. Clear
                # the index_offset so we can skip extra handle.tell() calls:
                index_offset = 0
            yield _sff_read_seq_record(
                handle, number_of_flows_per_read, flow_chars, key_sequence, trim
            )
-        _check_eof(handle, index_offset, index_length)
+        # now the name and any padding (remainder of header)
        name = stream.read(name_length).decode()
        padding = read_header_length - read_header_size - name_length
        if stream.read(padding).count(_null) != padding:
            import warnings
            from Bio import BiopythonParserWarning
-def _check_eof(handle, index_offset, index_length):
+            warnings.warn(
                "Your SFF file is invalid, post name %i "
                "byte padding region contained data" % padding,
                BiopythonParserWarning,
            )
        self._offset += read_header_length
        # now the flowgram values, flowgram index, bases and qualities
        # NOTE - assuming flowgram_format==1, which means struct type H
        flow_values = stream.read(self.read_flow_size)  # unpack later if needed
        temp_fmt = ">%iB" % seq_len  # used for flow index and quals
        flow_index = stream.read(seq_len)  # unpack later if needed
        seq = stream.read(seq_len)  # Leave as bytes for Seq object
        quals = list(struct.unpack(temp_fmt, stream.read(seq_len)))
        self._offset += self.read_flow_size + seq_len * 3
        # now any padding...
        padding = (self.read_flow_size + seq_len * 3) % 8
        if padding:
            padding = 8 - padding
            if stream.read(padding).count(_null) != padding:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn(
                    "Your SFF file is invalid, post quality %i "
                    "byte padding region contained data" % padding,
                    BiopythonParserWarning,
                )
            self._offset += padding
        # Follow Roche and apply most aggressive of qual and adapter clipping.
        # Note Roche seems to ignore adapter clip fields when writing SFF,
        # and uses just the quality clipping values for any clipping.
        clip_left = max(clip_qual_left, clip_adapter_left)
        # Right clipping of zero means no clipping
        if clip_qual_right:
            if clip_adapter_right:
                clip_right = min(clip_qual_right, clip_adapter_right)
            else:
                # Typical case with Roche SFF files
                clip_right = clip_qual_right
        elif clip_adapter_right:
            clip_right = clip_adapter_right
        else:
            clip_right = seq_len
        # Now build a SeqRecord
        if self.trim:
            if clip_left >= clip_right:
                # Raise an error?
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn(
                    "Overlapping clip values in SFF record, trimmed to nothing",
                    BiopythonParserWarning,
                )
                seq = ""
                quals = []
            else:
                seq = seq[clip_left:clip_right].upper()
                quals = quals[clip_left:clip_right]
            # Don't record the clipping values, flow etc, they make no sense now:
            annotations = {}
        else:
            if clip_left >= clip_right:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn(
                    "Overlapping clip values in SFF record", BiopythonParserWarning
                )
                seq = seq.lower()
            else:
                # This use of mixed case mimics the Roche SFF tool's FASTA output
                seq = (
                    seq[:clip_left].lower()
                    + seq[clip_left:clip_right].upper()
                    + seq[clip_right:].lower()
                )
            annotations = {
                "flow_values": struct.unpack(self.read_flow_fmt, flow_values),
                "flow_index": struct.unpack(temp_fmt, flow_index),
                "flow_chars": self.flow_chars,
                "flow_key": self.key_sequence,
                "clip_qual_left": clip_qual_left,
                "clip_qual_right": clip_qual_right,
                "clip_adapter_left": clip_adapter_left,
                "clip_adapter_right": clip_adapter_right,
            }
        if re.match(_valid_UAN_read_name, name):
            annotations["time"] = _get_read_time(name)
            annotations["region"] = _get_read_region(name)
            annotations["coords"] = _get_read_xy(name)
        annotations["molecule_type"] = "DNA"
        record = SeqRecord(
            Seq(seq), id=name, name=name, description="", annotations=annotations
        )
        # Dirty trick to speed up this line:
        # record.letter_annotations["phred_quality"] = quals
        dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
        # Return the record and then continue...
        return record
    def _check_eof(self, stream):
        """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
        Will attempt to spot apparent SFF file concatenation and give an error.
-    Will not attempt to seek, only moves the handle forward.
+        Will not attempt to seek, only moves the stream forward.
        """
-    offset = handle.tell()
+        offset = self._offset
        extra = b""
        padding = 0
        index_offset = self.index_offset
        index_length = self.index_length
        if index_offset and offset <= index_offset:
            # Index block then end of file...
            if offset < index_offset:
@ -1078,17 +1040,19 @@ def _check_eof(handle, index_offset, index_length):
                )
            # Doing read to jump the index rather than a seek
            # in case this is a network handle or similar
-        handle.read(index_offset + index_length - offset)
+            stream.read(index_length)
            self._offset += index_length
            offset = index_offset + index_length
-        if offset != handle.tell():
+            if offset != self._offset:
                raise ValueError(
                    "Wanted %i, got %i, index is %i to %i"
-                % (offset, handle.tell(), index_offset, index_offset + index_length)
+                    % (offset, stream.tell(), index_offset, index_offset + index_length)
                )
        if offset % 8:
            padding = 8 - (offset % 8)
-        extra = handle.read(padding)
+            extra = stream.read(padding)
            self._offset += padding
        if padding >= 4 and extra[-4:] == _sff:
            # Seen this in one user supplied file, should have been
@ -1123,11 +1087,14 @@ def _check_eof(handle, index_offset, index_length):
                BiopythonParserWarning,
            )
-    offset = handle.tell()
+        offset = self._offset
        if offset % 8 != 0:
-        raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
+            raise ValueError(
                "Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
            )
        # Should now be at the end of the file...
-    extra = handle.read(4)
+        extra = stream.read(4)
        self._offset += 4
        if extra == _sff:
            raise ValueError(
                "Additional data at end of SFF file, "
@ -1135,7 +1102,12 @@ def _check_eof(handle, index_offset, index_length):
                "See offset %i" % offset
            )
        elif extra:
-        raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
+            raise ValueError(
                "Additional data at end of SFF file, see offset %i" % offset
            )
 _powers_of_36 = [36**i for i in range(6)]
 class _SffTrimIterator(SffIterator):
--- a/Bio/SeqIO/_index.py
+++ b/Bio/SeqIO/_index.py
@ -27,6 +27,7 @@ re-indexing the file for use another time.
 """
 import re
 import struct
 from io import BytesIO
 from io import StringIO
@ -67,13 +68,16 @@ class SffRandomAccess(SeqFileRandomAccess):
        SeqFileRandomAccess.__init__(self, filename, format)
        (
            header_length,
-            index_offset,
+            self.index_offset,
-            index_length,
+            self.index_length,
            number_of_reads,
-            self._flows_per_read,
+            self.number_of_flows_per_read,
-            self._flow_chars,
+            self.flow_chars,
-            self._key_sequence,
+            self.key_sequence,
        ) = SeqIO.SffIO._sff_file_header(self._handle)
        # Now on to the reads...
        self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read
        self.read_flow_size = struct.calcsize(self.read_flow_fmt)
    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
@ -85,9 +89,9 @@ class SffRandomAccess(SeqFileRandomAccess):
            index_offset,
            index_length,
            number_of_reads,
-            self._flows_per_read,
+            self.number_of_flows_per_read,
-            self._flow_chars,
+            self.flow_chars,
-            self._key_sequence,
+            self.key_sequence,
        ) = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # There is an index provided, try this the fast way:
@ -120,9 +124,12 @@ class SffRandomAccess(SeqFileRandomAccess):
                    # Can have an index at start (or mid-file)
                    handle.seek(max_offset)
                    # Parse the final read,
-                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+                    SeqIO.SffIO._sff_read_raw_record(
                        handle, self.number_of_flows_per_read
                    )
                    # Should now be at the end of the file!
-                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+                self._offset = handle.tell()
                SeqIO.SffIO.SffIterator._check_eof(self, handle)
                return
        # We used to give a warning in this case, but Ion Torrent's
        # SFF files don't have an index so that would be annoying.
@ -135,21 +142,23 @@ class SffRandomAccess(SeqFileRandomAccess):
            raise ValueError(
                "Indexed %i records, expected %i" % (count, number_of_reads)
            )
-        SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+        self._offset = handle.tell()
        SeqIO.SffIO.SffIterator._check_eof(self, handle)
    def get(self, offset):
        """Return the SeqRecord starting at the given offset."""
        handle = self._handle
        handle.seek(offset)
-        return SeqIO.SffIO._sff_read_seq_record(
+        self._offset = offset
-            handle, self._flows_per_read, self._flow_chars, self._key_sequence
+        self.trim = False
-        )
+        return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
-        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+        self.stream = handle
        return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read)
 class SffTrimedRandomAccess(SffRandomAccess):
@ -159,13 +168,9 @@ class SffTrimedRandomAccess(SffRandomAccess):
        """Return the SeqRecord starting at the given offset."""
        handle = self._handle
        handle.seek(offset)
-        return SeqIO.SffIO._sff_read_seq_record(
+        self._offset = offset
-            handle,
+        self.trim = True
-            self._flows_per_read,
+        return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
            self._flow_chars,
            self._key_sequence,
            trim=True,
        )
 ###################
--- a/Tests/test_SeqIO_SffIO.py
+++ b/Tests/test_SeqIO_SffIO.py
--- a/Tests/test_SeqIO_index.py
+++ b/Tests/test_SeqIO_index.py
@ -635,20 +635,16 @@ class IndexDictTests(SeqRecordTestBaseClass, SeqIOTestBaseClass):
            else:
                raise RuntimeError(f"Unexpected mode {mode}")
            if fmt == "sff":
-                rec2 = SeqIO.SffIO._sff_read_seq_record(
+                rec_dict._proxy.trim = False
-                    handle,
+                rec_dict._proxy._offset = handle.tell()
-                    rec_dict._proxy._flows_per_read,
+                rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
-                    rec_dict._proxy._flow_chars,
+                    rec_dict._proxy, handle
                    rec_dict._proxy._key_sequence,
                    trim=False,
                )
            elif fmt == "sff-trim":
-                rec2 = SeqIO.SffIO._sff_read_seq_record(
+                rec_dict._proxy.trim = True
-                    handle,
+                rec_dict._proxy._offset = handle.tell()
-                    rec_dict._proxy._flows_per_read,
+                rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
-                    rec_dict._proxy._flow_chars,
+                    rec_dict._proxy, handle
                    rec_dict._proxy._key_sequence,
                    trim=True,
                )
            elif fmt == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "), msg=msg)