mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Avoid yield in Bio.SeqIO SffIO (#4813)
This commit is contained in:
@ -275,7 +275,7 @@ def _sff_file_header(handle):
|
||||
|
||||
"""
|
||||
# file header (part one)
|
||||
# use big endiean encdoing >
|
||||
# use big endian encoding >
|
||||
# magic_number I
|
||||
# version 4B
|
||||
# index_offset Q
|
||||
@ -643,155 +643,6 @@ def _sff_read_roche_index(handle):
|
||||
_valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
|
||||
|
||||
|
||||
def _sff_read_seq_record(
|
||||
handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
|
||||
):
|
||||
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
|
||||
# Now on to the reads...
|
||||
# the read header format (fixed part):
|
||||
# read_header_length H
|
||||
# name_length H
|
||||
# seq_len I
|
||||
# clip_qual_left H
|
||||
# clip_qual_right H
|
||||
# clip_adapter_left H
|
||||
# clip_adapter_right H
|
||||
# [rest of read header depends on the name length etc]
|
||||
read_header_fmt = ">2HI4H"
|
||||
read_header_size = struct.calcsize(read_header_fmt)
|
||||
read_flow_fmt = ">%iH" % number_of_flows_per_read
|
||||
read_flow_size = struct.calcsize(read_flow_fmt)
|
||||
|
||||
(
|
||||
read_header_length,
|
||||
name_length,
|
||||
seq_len,
|
||||
clip_qual_left,
|
||||
clip_qual_right,
|
||||
clip_adapter_left,
|
||||
clip_adapter_right,
|
||||
) = struct.unpack(read_header_fmt, handle.read(read_header_size))
|
||||
if clip_qual_left:
|
||||
clip_qual_left -= 1 # python counting
|
||||
if clip_adapter_left:
|
||||
clip_adapter_left -= 1 # python counting
|
||||
if read_header_length < 10 or read_header_length % 8 != 0:
|
||||
raise ValueError(
|
||||
"Malformed read header, says length is %i" % read_header_length
|
||||
)
|
||||
# now the name and any padding (remainder of header)
|
||||
name = handle.read(name_length).decode()
|
||||
padding = read_header_length - read_header_size - name_length
|
||||
if handle.read(padding).count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post name %i "
|
||||
"byte padding region contained data" % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
# now the flowgram values, flowgram index, bases and qualities
|
||||
# NOTE - assuming flowgram_format==1, which means struct type H
|
||||
flow_values = handle.read(read_flow_size) # unpack later if needed
|
||||
temp_fmt = ">%iB" % seq_len # used for flow index and quals
|
||||
flow_index = handle.read(seq_len) # unpack later if needed
|
||||
seq = handle.read(seq_len) # Leave as bytes for Seq object
|
||||
quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
|
||||
# now any padding...
|
||||
padding = (read_flow_size + seq_len * 3) % 8
|
||||
if padding:
|
||||
padding = 8 - padding
|
||||
if handle.read(padding).count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post quality %i "
|
||||
"byte padding region contained data" % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
# Follow Roche and apply most aggressive of qual and adapter clipping.
|
||||
# Note Roche seems to ignore adapter clip fields when writing SFF,
|
||||
# and uses just the quality clipping values for any clipping.
|
||||
clip_left = max(clip_qual_left, clip_adapter_left)
|
||||
# Right clipping of zero means no clipping
|
||||
if clip_qual_right:
|
||||
if clip_adapter_right:
|
||||
clip_right = min(clip_qual_right, clip_adapter_right)
|
||||
else:
|
||||
# Typical case with Roche SFF files
|
||||
clip_right = clip_qual_right
|
||||
elif clip_adapter_right:
|
||||
clip_right = clip_adapter_right
|
||||
else:
|
||||
clip_right = seq_len
|
||||
# Now build a SeqRecord
|
||||
if trim:
|
||||
if clip_left >= clip_right:
|
||||
# Raise an error?
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Overlapping clip values in SFF record, trimmed to nothing",
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
seq = ""
|
||||
quals = []
|
||||
else:
|
||||
seq = seq[clip_left:clip_right].upper()
|
||||
quals = quals[clip_left:clip_right]
|
||||
# Don't record the clipping values, flow etc, they make no sense now:
|
||||
annotations = {}
|
||||
else:
|
||||
if clip_left >= clip_right:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Overlapping clip values in SFF record", BiopythonParserWarning
|
||||
)
|
||||
seq = seq.lower()
|
||||
else:
|
||||
# This use of mixed case mimics the Roche SFF tool's FASTA output
|
||||
seq = (
|
||||
seq[:clip_left].lower()
|
||||
+ seq[clip_left:clip_right].upper()
|
||||
+ seq[clip_right:].lower()
|
||||
)
|
||||
annotations = {
|
||||
"flow_values": struct.unpack(read_flow_fmt, flow_values),
|
||||
"flow_index": struct.unpack(temp_fmt, flow_index),
|
||||
"flow_chars": flow_chars,
|
||||
"flow_key": key_sequence,
|
||||
"clip_qual_left": clip_qual_left,
|
||||
"clip_qual_right": clip_qual_right,
|
||||
"clip_adapter_left": clip_adapter_left,
|
||||
"clip_adapter_right": clip_adapter_right,
|
||||
}
|
||||
if re.match(_valid_UAN_read_name, name):
|
||||
annotations["time"] = _get_read_time(name)
|
||||
annotations["region"] = _get_read_region(name)
|
||||
annotations["coords"] = _get_read_xy(name)
|
||||
annotations["molecule_type"] = "DNA"
|
||||
record = SeqRecord(
|
||||
Seq(seq), id=name, name=name, description="", annotations=annotations
|
||||
)
|
||||
# Dirty trick to speed up this line:
|
||||
# record.letter_annotations["phred_quality"] = quals
|
||||
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
|
||||
# Return the record and then continue...
|
||||
return record
|
||||
|
||||
|
||||
_powers_of_36 = [36**i for i in range(6)]
|
||||
|
||||
|
||||
def _string_as_base_36(string):
|
||||
"""Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
|
||||
total = 0
|
||||
@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read):
|
||||
return raw
|
||||
|
||||
|
||||
class _AddTellHandle:
|
||||
"""Wrapper for handles which do not support the tell method (PRIVATE).
|
||||
|
||||
Intended for use with things like network handles where tell (and reverse
|
||||
seek) are not supported. The SFF file needs to track the current offset in
|
||||
order to deal with the index block.
|
||||
"""
|
||||
|
||||
def __init__(self, handle):
|
||||
self._handle = handle
|
||||
self._offset = 0
|
||||
|
||||
def read(self, length):
|
||||
data = self._handle.read(length)
|
||||
self._offset += len(data)
|
||||
return data
|
||||
|
||||
def tell(self):
|
||||
return self._offset
|
||||
|
||||
def seek(self, offset):
|
||||
if offset < self._offset:
|
||||
raise RuntimeError("Can't seek backwards")
|
||||
self._handle.read(offset - self._offset)
|
||||
|
||||
def close(self):
|
||||
return self._handle.close()
|
||||
|
||||
|
||||
class SffIterator(SequenceIterator):
|
||||
"""Parser for Standard Flowgram Format (SFF) files."""
|
||||
|
||||
# the read header format (fixed part):
|
||||
# read_header_length H
|
||||
# name_length H
|
||||
# seq_len I
|
||||
# clip_qual_left H
|
||||
# clip_qual_right H
|
||||
# clip_adapter_left H
|
||||
# clip_adapter_right H
|
||||
# [rest of read header depends on the name length etc]
|
||||
read_header_fmt = ">2HI4H"
|
||||
read_header_size = struct.calcsize(read_header_fmt)
|
||||
assert read_header_size % 8 == 0 # Important for padding calc later!
|
||||
|
||||
def __init__(self, source, alphabet=None, trim=False):
|
||||
"""Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
|
||||
|
||||
@ -996,30 +831,51 @@ class SffIterator(SequenceIterator):
|
||||
raise ValueError("The alphabet argument is no longer supported")
|
||||
super().__init__(source, mode="b", fmt="SFF")
|
||||
self.trim = trim
|
||||
stream = self.stream
|
||||
(
|
||||
self._offset,
|
||||
self.index_offset,
|
||||
self.index_length,
|
||||
self.number_of_reads,
|
||||
number_of_flows_per_read,
|
||||
self.flow_chars,
|
||||
self.key_sequence,
|
||||
) = _sff_file_header(stream)
|
||||
# Now on to the reads...
|
||||
self.read_flow_fmt = ">%iH" % number_of_flows_per_read
|
||||
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
|
||||
self._read_counter = 0
|
||||
|
||||
def parse(self, handle):
|
||||
"""Start parsing the file, and return a SeqRecord generator."""
|
||||
try:
|
||||
if 0 != handle.tell():
|
||||
raise ValueError("Not at start of file, offset %i" % handle.tell())
|
||||
except AttributeError:
|
||||
# Probably a network handle or something like that
|
||||
handle = _AddTellHandle(handle)
|
||||
records = self.iterate(handle)
|
||||
return records
|
||||
"""To be removed."""
|
||||
return
|
||||
|
||||
def iterate(self, handle):
|
||||
"""Parse the file and generate SeqRecord objects."""
|
||||
trim = self.trim
|
||||
(
|
||||
header_length,
|
||||
index_offset,
|
||||
index_length,
|
||||
number_of_reads,
|
||||
number_of_flows_per_read,
|
||||
flow_chars,
|
||||
key_sequence,
|
||||
) = _sff_file_header(handle)
|
||||
def __next__(self):
|
||||
stream = self.stream
|
||||
if self._read_counter == self.number_of_reads:
|
||||
self._read_counter = None # check EOF only once
|
||||
self._check_eof(stream)
|
||||
raise StopIteration
|
||||
elif self._read_counter is None:
|
||||
raise StopIteration
|
||||
# The spec allows for the index block to be before or even in the middle
|
||||
# of the reads. We can check that if we keep track of our position
|
||||
# in the file...
|
||||
index_offset = self.index_offset
|
||||
if self._offset == index_offset:
|
||||
index_length = self.index_length
|
||||
offset = index_offset + index_length
|
||||
if offset % 8:
|
||||
offset += 8 - (offset % 8)
|
||||
assert offset % 8 == 0
|
||||
stream.seek(offset)
|
||||
self._offset = offset
|
||||
record = self._sff_read_seq_record(self.stream)
|
||||
self._read_counter += 1
|
||||
return record
|
||||
|
||||
def _sff_read_seq_record(self, stream):
|
||||
"""Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
|
||||
# Now on to the reads...
|
||||
# the read header format (fixed part):
|
||||
# read_header_length H
|
||||
@ -1030,112 +886,228 @@ class SffIterator(SequenceIterator):
|
||||
# clip_adapter_left H
|
||||
# clip_adapter_right H
|
||||
# [rest of read header depends on the name length etc]
|
||||
read_header_fmt = ">2HI4H"
|
||||
read_header_size = struct.calcsize(read_header_fmt)
|
||||
read_flow_fmt = ">%iH" % number_of_flows_per_read
|
||||
read_flow_size = struct.calcsize(read_flow_fmt)
|
||||
assert 1 == struct.calcsize(">B")
|
||||
assert 1 == struct.calcsize(">s")
|
||||
assert 1 == struct.calcsize(">c")
|
||||
assert read_header_size % 8 == 0 # Important for padding calc later!
|
||||
# The spec allows for the index block to be before or even in the middle
|
||||
# of the reads. We can check that if we keep track of our position
|
||||
# in the file...
|
||||
for read in range(number_of_reads):
|
||||
if index_offset and handle.tell() == index_offset:
|
||||
offset = index_offset + index_length
|
||||
if offset % 8:
|
||||
offset += 8 - (offset % 8)
|
||||
assert offset % 8 == 0
|
||||
handle.seek(offset)
|
||||
# Now that we've done this, we don't need to do it again. Clear
|
||||
# the index_offset so we can skip extra handle.tell() calls:
|
||||
index_offset = 0
|
||||
yield _sff_read_seq_record(
|
||||
handle, number_of_flows_per_read, flow_chars, key_sequence, trim
|
||||
)
|
||||
_check_eof(handle, index_offset, index_length)
|
||||
|
||||
|
||||
def _check_eof(handle, index_offset, index_length):
|
||||
"""Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
|
||||
|
||||
Will attempt to spot apparent SFF file concatenation and give an error.
|
||||
|
||||
Will not attempt to seek, only moves the handle forward.
|
||||
"""
|
||||
offset = handle.tell()
|
||||
extra = b""
|
||||
padding = 0
|
||||
|
||||
if index_offset and offset <= index_offset:
|
||||
# Index block then end of file...
|
||||
if offset < index_offset:
|
||||
read_header_fmt = SffIterator.read_header_fmt
|
||||
read_header_size = SffIterator.read_header_size
|
||||
(
|
||||
read_header_length,
|
||||
name_length,
|
||||
seq_len,
|
||||
clip_qual_left,
|
||||
clip_qual_right,
|
||||
clip_adapter_left,
|
||||
clip_adapter_right,
|
||||
) = struct.unpack(read_header_fmt, stream.read(read_header_size))
|
||||
if clip_qual_left:
|
||||
clip_qual_left -= 1 # python counting
|
||||
if clip_adapter_left:
|
||||
clip_adapter_left -= 1 # python counting
|
||||
if read_header_length < 10 or read_header_length % 8 != 0:
|
||||
raise ValueError(
|
||||
"Gap of %i bytes after final record end %i, "
|
||||
"before %i where index starts?"
|
||||
% (index_offset - offset, offset, index_offset)
|
||||
"Malformed read header, says length is %i" % read_header_length
|
||||
)
|
||||
# Doing read to jump the index rather than a seek
|
||||
# in case this is a network handle or similar
|
||||
handle.read(index_offset + index_length - offset)
|
||||
offset = index_offset + index_length
|
||||
if offset != handle.tell():
|
||||
# now the name and any padding (remainder of header)
|
||||
name = stream.read(name_length).decode()
|
||||
padding = read_header_length - read_header_size - name_length
|
||||
if stream.read(padding).count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post name %i "
|
||||
"byte padding region contained data" % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
self._offset += read_header_length
|
||||
# now the flowgram values, flowgram index, bases and qualities
|
||||
# NOTE - assuming flowgram_format==1, which means struct type H
|
||||
flow_values = stream.read(self.read_flow_size) # unpack later if needed
|
||||
temp_fmt = ">%iB" % seq_len # used for flow index and quals
|
||||
flow_index = stream.read(seq_len) # unpack later if needed
|
||||
seq = stream.read(seq_len) # Leave as bytes for Seq object
|
||||
quals = list(struct.unpack(temp_fmt, stream.read(seq_len)))
|
||||
self._offset += self.read_flow_size + seq_len * 3
|
||||
# now any padding...
|
||||
padding = (self.read_flow_size + seq_len * 3) % 8
|
||||
if padding:
|
||||
padding = 8 - padding
|
||||
if stream.read(padding).count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post quality %i "
|
||||
"byte padding region contained data" % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
self._offset += padding
|
||||
# Follow Roche and apply most aggressive of qual and adapter clipping.
|
||||
# Note Roche seems to ignore adapter clip fields when writing SFF,
|
||||
# and uses just the quality clipping values for any clipping.
|
||||
clip_left = max(clip_qual_left, clip_adapter_left)
|
||||
# Right clipping of zero means no clipping
|
||||
if clip_qual_right:
|
||||
if clip_adapter_right:
|
||||
clip_right = min(clip_qual_right, clip_adapter_right)
|
||||
else:
|
||||
# Typical case with Roche SFF files
|
||||
clip_right = clip_qual_right
|
||||
elif clip_adapter_right:
|
||||
clip_right = clip_adapter_right
|
||||
else:
|
||||
clip_right = seq_len
|
||||
# Now build a SeqRecord
|
||||
if self.trim:
|
||||
if clip_left >= clip_right:
|
||||
# Raise an error?
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Overlapping clip values in SFF record, trimmed to nothing",
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
seq = ""
|
||||
quals = []
|
||||
else:
|
||||
seq = seq[clip_left:clip_right].upper()
|
||||
quals = quals[clip_left:clip_right]
|
||||
# Don't record the clipping values, flow etc, they make no sense now:
|
||||
annotations = {}
|
||||
else:
|
||||
if clip_left >= clip_right:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Overlapping clip values in SFF record", BiopythonParserWarning
|
||||
)
|
||||
seq = seq.lower()
|
||||
else:
|
||||
# This use of mixed case mimics the Roche SFF tool's FASTA output
|
||||
seq = (
|
||||
seq[:clip_left].lower()
|
||||
+ seq[clip_left:clip_right].upper()
|
||||
+ seq[clip_right:].lower()
|
||||
)
|
||||
annotations = {
|
||||
"flow_values": struct.unpack(self.read_flow_fmt, flow_values),
|
||||
"flow_index": struct.unpack(temp_fmt, flow_index),
|
||||
"flow_chars": self.flow_chars,
|
||||
"flow_key": self.key_sequence,
|
||||
"clip_qual_left": clip_qual_left,
|
||||
"clip_qual_right": clip_qual_right,
|
||||
"clip_adapter_left": clip_adapter_left,
|
||||
"clip_adapter_right": clip_adapter_right,
|
||||
}
|
||||
if re.match(_valid_UAN_read_name, name):
|
||||
annotations["time"] = _get_read_time(name)
|
||||
annotations["region"] = _get_read_region(name)
|
||||
annotations["coords"] = _get_read_xy(name)
|
||||
annotations["molecule_type"] = "DNA"
|
||||
record = SeqRecord(
|
||||
Seq(seq), id=name, name=name, description="", annotations=annotations
|
||||
)
|
||||
# Dirty trick to speed up this line:
|
||||
# record.letter_annotations["phred_quality"] = quals
|
||||
dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
|
||||
# Return the record and then continue...
|
||||
return record
|
||||
|
||||
def _check_eof(self, stream):
|
||||
"""Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
|
||||
|
||||
Will attempt to spot apparent SFF file concatenation and give an error.
|
||||
|
||||
Will not attempt to seek, only moves the stream forward.
|
||||
"""
|
||||
offset = self._offset
|
||||
extra = b""
|
||||
padding = 0
|
||||
|
||||
index_offset = self.index_offset
|
||||
index_length = self.index_length
|
||||
if index_offset and offset <= index_offset:
|
||||
# Index block then end of file...
|
||||
if offset < index_offset:
|
||||
raise ValueError(
|
||||
"Gap of %i bytes after final record end %i, "
|
||||
"before %i where index starts?"
|
||||
% (index_offset - offset, offset, index_offset)
|
||||
)
|
||||
# Doing read to jump the index rather than a seek
|
||||
# in case this is a network handle or similar
|
||||
stream.read(index_length)
|
||||
self._offset += index_length
|
||||
offset = index_offset + index_length
|
||||
if offset != self._offset:
|
||||
raise ValueError(
|
||||
"Wanted %i, got %i, index is %i to %i"
|
||||
% (offset, stream.tell(), index_offset, index_offset + index_length)
|
||||
)
|
||||
|
||||
if offset % 8:
|
||||
padding = 8 - (offset % 8)
|
||||
extra = stream.read(padding)
|
||||
self._offset += padding
|
||||
|
||||
if padding >= 4 and extra[-4:] == _sff:
|
||||
# Seen this in one user supplied file, should have been
|
||||
# four bytes of null padding but was actually .sff and
|
||||
# the start of a new concatenated SFF file!
|
||||
raise ValueError(
|
||||
"Wanted %i, got %i, index is %i to %i"
|
||||
% (offset, handle.tell(), index_offset, index_offset + index_length)
|
||||
"Your SFF file is invalid, post index %i byte "
|
||||
"null padding region ended '.sff' which could "
|
||||
"be the start of a concatenated SFF file? "
|
||||
"See offset %i" % (padding, offset)
|
||||
)
|
||||
if padding and not extra:
|
||||
# TODO - Is this error harmless enough to just ignore?
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is technically invalid as it is missing "
|
||||
"a terminal %i byte null padding region." % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
return
|
||||
if extra.count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post index %i byte "
|
||||
"null padding region contained data: %r" % (padding, extra),
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
|
||||
if offset % 8:
|
||||
padding = 8 - (offset % 8)
|
||||
extra = handle.read(padding)
|
||||
offset = self._offset
|
||||
if offset % 8 != 0:
|
||||
raise ValueError(
|
||||
"Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
|
||||
)
|
||||
# Should now be at the end of the file...
|
||||
extra = stream.read(4)
|
||||
self._offset += 4
|
||||
if extra == _sff:
|
||||
raise ValueError(
|
||||
"Additional data at end of SFF file, "
|
||||
"perhaps multiple SFF files concatenated? "
|
||||
"See offset %i" % offset
|
||||
)
|
||||
elif extra:
|
||||
raise ValueError(
|
||||
"Additional data at end of SFF file, see offset %i" % offset
|
||||
)
|
||||
|
||||
if padding >= 4 and extra[-4:] == _sff:
|
||||
# Seen this in one user supplied file, should have been
|
||||
# four bytes of null padding but was actually .sff and
|
||||
# the start of a new concatenated SFF file!
|
||||
raise ValueError(
|
||||
"Your SFF file is invalid, post index %i byte "
|
||||
"null padding region ended '.sff' which could "
|
||||
"be the start of a concatenated SFF file? "
|
||||
"See offset %i" % (padding, offset)
|
||||
)
|
||||
if padding and not extra:
|
||||
# TODO - Is this error harmless enough to just ignore?
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is technically invalid as it is missing "
|
||||
"a terminal %i byte null padding region." % padding,
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
return
|
||||
if extra.count(_null) != padding:
|
||||
import warnings
|
||||
|
||||
from Bio import BiopythonParserWarning
|
||||
|
||||
warnings.warn(
|
||||
"Your SFF file is invalid, post index %i byte "
|
||||
"null padding region contained data: %r" % (padding, extra),
|
||||
BiopythonParserWarning,
|
||||
)
|
||||
|
||||
offset = handle.tell()
|
||||
if offset % 8 != 0:
|
||||
raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
|
||||
# Should now be at the end of the file...
|
||||
extra = handle.read(4)
|
||||
if extra == _sff:
|
||||
raise ValueError(
|
||||
"Additional data at end of SFF file, "
|
||||
"perhaps multiple SFF files concatenated? "
|
||||
"See offset %i" % offset
|
||||
)
|
||||
elif extra:
|
||||
raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
|
||||
_powers_of_36 = [36**i for i in range(6)]
|
||||
|
||||
|
||||
class _SffTrimIterator(SffIterator):
|
||||
|
@ -27,6 +27,7 @@ re-indexing the file for use another time.
|
||||
"""
|
||||
|
||||
import re
|
||||
import struct
|
||||
from io import BytesIO
|
||||
from io import StringIO
|
||||
|
||||
@ -67,13 +68,16 @@ class SffRandomAccess(SeqFileRandomAccess):
|
||||
SeqFileRandomAccess.__init__(self, filename, format)
|
||||
(
|
||||
header_length,
|
||||
index_offset,
|
||||
index_length,
|
||||
self.index_offset,
|
||||
self.index_length,
|
||||
number_of_reads,
|
||||
self._flows_per_read,
|
||||
self._flow_chars,
|
||||
self._key_sequence,
|
||||
self.number_of_flows_per_read,
|
||||
self.flow_chars,
|
||||
self.key_sequence,
|
||||
) = SeqIO.SffIO._sff_file_header(self._handle)
|
||||
# Now on to the reads...
|
||||
self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read
|
||||
self.read_flow_size = struct.calcsize(self.read_flow_fmt)
|
||||
|
||||
def __iter__(self):
|
||||
"""Load any index block in the file, or build it the slow way (PRIVATE)."""
|
||||
@ -85,9 +89,9 @@ class SffRandomAccess(SeqFileRandomAccess):
|
||||
index_offset,
|
||||
index_length,
|
||||
number_of_reads,
|
||||
self._flows_per_read,
|
||||
self._flow_chars,
|
||||
self._key_sequence,
|
||||
self.number_of_flows_per_read,
|
||||
self.flow_chars,
|
||||
self.key_sequence,
|
||||
) = SeqIO.SffIO._sff_file_header(handle)
|
||||
if index_offset and index_length:
|
||||
# There is an index provided, try this the fast way:
|
||||
@ -120,9 +124,12 @@ class SffRandomAccess(SeqFileRandomAccess):
|
||||
# Can have an index at start (or mid-file)
|
||||
handle.seek(max_offset)
|
||||
# Parse the final read,
|
||||
SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
|
||||
SeqIO.SffIO._sff_read_raw_record(
|
||||
handle, self.number_of_flows_per_read
|
||||
)
|
||||
# Should now be at the end of the file!
|
||||
SeqIO.SffIO._check_eof(handle, index_offset, index_length)
|
||||
self._offset = handle.tell()
|
||||
SeqIO.SffIO.SffIterator._check_eof(self, handle)
|
||||
return
|
||||
# We used to give a warning in this case, but Ion Torrent's
|
||||
# SFF files don't have an index so that would be annoying.
|
||||
@ -135,21 +142,23 @@ class SffRandomAccess(SeqFileRandomAccess):
|
||||
raise ValueError(
|
||||
"Indexed %i records, expected %i" % (count, number_of_reads)
|
||||
)
|
||||
SeqIO.SffIO._check_eof(handle, index_offset, index_length)
|
||||
self._offset = handle.tell()
|
||||
SeqIO.SffIO.SffIterator._check_eof(self, handle)
|
||||
|
||||
def get(self, offset):
|
||||
"""Return the SeqRecord starting at the given offset."""
|
||||
handle = self._handle
|
||||
handle.seek(offset)
|
||||
return SeqIO.SffIO._sff_read_seq_record(
|
||||
handle, self._flows_per_read, self._flow_chars, self._key_sequence
|
||||
)
|
||||
self._offset = offset
|
||||
self.trim = False
|
||||
return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
|
||||
|
||||
def get_raw(self, offset):
|
||||
"""Return the raw record from the file as a bytes string."""
|
||||
handle = self._handle
|
||||
handle.seek(offset)
|
||||
return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
|
||||
self.stream = handle
|
||||
return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read)
|
||||
|
||||
|
||||
class SffTrimedRandomAccess(SffRandomAccess):
|
||||
@ -159,13 +168,9 @@ class SffTrimedRandomAccess(SffRandomAccess):
|
||||
"""Return the SeqRecord starting at the given offset."""
|
||||
handle = self._handle
|
||||
handle.seek(offset)
|
||||
return SeqIO.SffIO._sff_read_seq_record(
|
||||
handle,
|
||||
self._flows_per_read,
|
||||
self._flow_chars,
|
||||
self._key_sequence,
|
||||
trim=True,
|
||||
)
|
||||
self._offset = offset
|
||||
self.trim = True
|
||||
return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)
|
||||
|
||||
|
||||
###################
|
||||
|
@ -635,20 +635,16 @@ class IndexDictTests(SeqRecordTestBaseClass, SeqIOTestBaseClass):
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected mode {mode}")
|
||||
if fmt == "sff":
|
||||
rec2 = SeqIO.SffIO._sff_read_seq_record(
|
||||
handle,
|
||||
rec_dict._proxy._flows_per_read,
|
||||
rec_dict._proxy._flow_chars,
|
||||
rec_dict._proxy._key_sequence,
|
||||
trim=False,
|
||||
rec_dict._proxy.trim = False
|
||||
rec_dict._proxy._offset = handle.tell()
|
||||
rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
|
||||
rec_dict._proxy, handle
|
||||
)
|
||||
elif fmt == "sff-trim":
|
||||
rec2 = SeqIO.SffIO._sff_read_seq_record(
|
||||
handle,
|
||||
rec_dict._proxy._flows_per_read,
|
||||
rec_dict._proxy._flow_chars,
|
||||
rec_dict._proxy._key_sequence,
|
||||
trim=True,
|
||||
rec_dict._proxy.trim = True
|
||||
rec_dict._proxy._offset = handle.tell()
|
||||
rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
|
||||
rec_dict._proxy, handle
|
||||
)
|
||||
elif fmt == "uniprot-xml":
|
||||
self.assertTrue(raw.startswith(b"<entry "), msg=msg)
|
||||
|
Reference in New Issue
Block a user