Mirror of https://github.com/biopython/biopython.git
Avoid yield in Bio.SeqIO SffIO (#4813)
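In outline: the generator-based `SffIterator.parse`/`iterate` pair (built around `yield`) is replaced by an explicit `__next__` method; the module-level `_sff_read_seq_record` and `_check_eof` helpers move onto `SffIterator` as methods; and the `_AddTellHandle` wrapper class is deleted, since the iterator now tracks its own byte position in `self._offset` instead of calling `handle.tell()`. The SFF random-access index code and its tests are updated to match.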
@@ -275,7 +275,7 @@ def _sff_file_header(handle):
     """
     # file header (part one)
-    # use big endiean encdoing >
+    # use big endian encoding  >
     # magic_number             I
     # version                  4B
     # index_offset             Q
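For orientation, these comments describe the start of the fixed SFF file header. A minimal sketch of unpacking just the three fields named above (the ">I4BQ" format string and the 0x2E736666 magic value, i.e. the bytes b".sff" read big-endian, come from the SFF spec rather than from this diff; the helper name is made up):

    import struct

    def _peek_sff_header_start(handle):
        # > = big endian; I = magic_number, 4B = version, Q = index_offset
        fmt = ">I4BQ"
        data = handle.read(struct.calcsize(fmt))  # 16 bytes
        magic_number, *version, index_offset = struct.unpack(fmt, data)
        if magic_number != 0x2E736666:  # b".sff" as a big-endian uint32
            raise ValueError("Not an SFF file")
        return version, index_offset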
@@ -643,155 +643,6 @@ def _sff_read_roche_index(handle):
 _valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")


-def _sff_read_seq_record(
-    handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
-):
-    """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
-    # Now on to the reads...
-    # the read header format (fixed part):
-    # read_header_length     H
-    # name_length            H
-    # seq_len                I
-    # clip_qual_left         H
-    # clip_qual_right        H
-    # clip_adapter_left      H
-    # clip_adapter_right     H
-    # [rest of read header depends on the name length etc]
-    read_header_fmt = ">2HI4H"
-    read_header_size = struct.calcsize(read_header_fmt)
-    read_flow_fmt = ">%iH" % number_of_flows_per_read
-    read_flow_size = struct.calcsize(read_flow_fmt)
-
-    (
-        read_header_length,
-        name_length,
-        seq_len,
-        clip_qual_left,
-        clip_qual_right,
-        clip_adapter_left,
-        clip_adapter_right,
-    ) = struct.unpack(read_header_fmt, handle.read(read_header_size))
-    if clip_qual_left:
-        clip_qual_left -= 1  # python counting
-    if clip_adapter_left:
-        clip_adapter_left -= 1  # python counting
-    if read_header_length < 10 or read_header_length % 8 != 0:
-        raise ValueError(
-            "Malformed read header, says length is %i" % read_header_length
-        )
-    # now the name and any padding (remainder of header)
-    name = handle.read(name_length).decode()
-    padding = read_header_length - read_header_size - name_length
-    if handle.read(padding).count(_null) != padding:
-        import warnings
-
-        from Bio import BiopythonParserWarning
-
-        warnings.warn(
-            "Your SFF file is invalid, post name %i "
-            "byte padding region contained data" % padding,
-            BiopythonParserWarning,
-        )
-    # now the flowgram values, flowgram index, bases and qualities
-    # NOTE - assuming flowgram_format==1, which means struct type H
-    flow_values = handle.read(read_flow_size)  # unpack later if needed
-    temp_fmt = ">%iB" % seq_len  # used for flow index and quals
-    flow_index = handle.read(seq_len)  # unpack later if needed
-    seq = handle.read(seq_len)  # Leave as bytes for Seq object
-    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
-    # now any padding...
-    padding = (read_flow_size + seq_len * 3) % 8
-    if padding:
-        padding = 8 - padding
-        if handle.read(padding).count(_null) != padding:
-            import warnings
-
-            from Bio import BiopythonParserWarning
-
-            warnings.warn(
-                "Your SFF file is invalid, post quality %i "
-                "byte padding region contained data" % padding,
-                BiopythonParserWarning,
-            )
-    # Follow Roche and apply most aggressive of qual and adapter clipping.
-    # Note Roche seems to ignore adapter clip fields when writing SFF,
-    # and uses just the quality clipping values for any clipping.
-    clip_left = max(clip_qual_left, clip_adapter_left)
-    # Right clipping of zero means no clipping
-    if clip_qual_right:
-        if clip_adapter_right:
-            clip_right = min(clip_qual_right, clip_adapter_right)
-        else:
-            # Typical case with Roche SFF files
-            clip_right = clip_qual_right
-    elif clip_adapter_right:
-        clip_right = clip_adapter_right
-    else:
-        clip_right = seq_len
-    # Now build a SeqRecord
-    if trim:
-        if clip_left >= clip_right:
-            # Raise an error?
-            import warnings
-
-            from Bio import BiopythonParserWarning
-
-            warnings.warn(
-                "Overlapping clip values in SFF record, trimmed to nothing",
-                BiopythonParserWarning,
-            )
-            seq = ""
-            quals = []
-        else:
-            seq = seq[clip_left:clip_right].upper()
-            quals = quals[clip_left:clip_right]
-        # Don't record the clipping values, flow etc, they make no sense now:
-        annotations = {}
-    else:
-        if clip_left >= clip_right:
-            import warnings
-
-            from Bio import BiopythonParserWarning
-
-            warnings.warn(
-                "Overlapping clip values in SFF record", BiopythonParserWarning
-            )
-            seq = seq.lower()
-        else:
-            # This use of mixed case mimics the Roche SFF tool's FASTA output
-            seq = (
-                seq[:clip_left].lower()
-                + seq[clip_left:clip_right].upper()
-                + seq[clip_right:].lower()
-            )
-        annotations = {
-            "flow_values": struct.unpack(read_flow_fmt, flow_values),
-            "flow_index": struct.unpack(temp_fmt, flow_index),
-            "flow_chars": flow_chars,
-            "flow_key": key_sequence,
-            "clip_qual_left": clip_qual_left,
-            "clip_qual_right": clip_qual_right,
-            "clip_adapter_left": clip_adapter_left,
-            "clip_adapter_right": clip_adapter_right,
-        }
-    if re.match(_valid_UAN_read_name, name):
-        annotations["time"] = _get_read_time(name)
-        annotations["region"] = _get_read_region(name)
-        annotations["coords"] = _get_read_xy(name)
-    annotations["molecule_type"] = "DNA"
-    record = SeqRecord(
-        Seq(seq), id=name, name=name, description="", annotations=annotations
-    )
-    # Dirty trick to speed up this line:
-    # record.letter_annotations["phred_quality"] = quals
-    dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
-    # Return the record and then continue...
-    return record
-
-
-_powers_of_36 = [36**i for i in range(6)]
-
-
 def _string_as_base_36(string):
     """Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
     total = 0
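Nothing here is lost: the same body reappears below as the `SffIterator._sff_read_seq_record` method, with `handle` renamed to `stream`, the per-file parameters read from `self`, and `_powers_of_36` re-homed after `_check_eof`. As a standalone sanity check (not code from the commit), the fixed read header really is 16 bytes, which is what makes the 8-byte padding arithmetic work:

    import struct

    # 2H (read_header_length, name_length) + I (seq_len) + 4H (clip values)
    # = 2*2 + 4 + 4*2 = 16 bytes, a multiple of 8
    assert struct.calcsize(">2HI4H") == 16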
@@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read):
     return raw


-class _AddTellHandle:
-    """Wrapper for handles which do not support the tell method (PRIVATE).
-
-    Intended for use with things like network handles where tell (and reverse
-    seek) are not supported. The SFF file needs to track the current offset in
-    order to deal with the index block.
-    """
-
-    def __init__(self, handle):
-        self._handle = handle
-        self._offset = 0
-
-    def read(self, length):
-        data = self._handle.read(length)
-        self._offset += len(data)
-        return data
-
-    def tell(self):
-        return self._offset
-
-    def seek(self, offset):
-        if offset < self._offset:
-            raise RuntimeError("Can't seek backwards")
-        self._handle.read(offset - self._offset)
-
-    def close(self):
-        return self._handle.close()
-
-
 class SffIterator(SequenceIterator):
     """Parser for Standard Flowgram Format (SFF) files."""

+    # the read header format (fixed part):
+    # read_header_length     H
+    # name_length            H
+    # seq_len                I
+    # clip_qual_left         H
+    # clip_qual_right        H
+    # clip_adapter_left      H
+    # clip_adapter_right     H
+    # [rest of read header depends on the name length etc]
+    read_header_fmt = ">2HI4H"
+    read_header_size = struct.calcsize(read_header_fmt)
+    assert read_header_size % 8 == 0  # Important for padding calc later!
+
     def __init__(self, source, alphabet=None, trim=False):
         """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

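With parsing driven by `__next__`, the iterator itself can count how many bytes it has consumed, so the `_AddTellHandle` shim for non-seekable (e.g. network) streams becomes redundant: the hunks below thread a `self._offset` counter through `_sff_read_seq_record` and `_check_eof` in place of `handle.tell()`. Hoisting `read_header_fmt` and `read_header_size` to class attributes also means the struct format is computed once rather than once per read, and the old per-call `assert 1 == struct.calcsize(...)` checks go away.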
@@ -996,30 +831,51 @@ class SffIterator(SequenceIterator):
             raise ValueError("The alphabet argument is no longer supported")
         super().__init__(source, mode="b", fmt="SFF")
         self.trim = trim
+        stream = self.stream
+        (
+            self._offset,
+            self.index_offset,
+            self.index_length,
+            self.number_of_reads,
+            number_of_flows_per_read,
+            self.flow_chars,
+            self.key_sequence,
+        ) = _sff_file_header(stream)
+        # Now on to the reads...
+        self.read_flow_fmt = ">%iH" % number_of_flows_per_read
+        self.read_flow_size = struct.calcsize(self.read_flow_fmt)
+        self._read_counter = 0

     def parse(self, handle):
-        """Start parsing the file, and return a SeqRecord generator."""
-        try:
-            if 0 != handle.tell():
-                raise ValueError("Not at start of file, offset %i" % handle.tell())
-        except AttributeError:
-            # Probably a network handle or something like that
-            handle = _AddTellHandle(handle)
-        records = self.iterate(handle)
-        return records
+        """To be removed."""
+        return

-    def iterate(self, handle):
-        """Parse the file and generate SeqRecord objects."""
-        trim = self.trim
-        (
-            header_length,
-            index_offset,
-            index_length,
-            number_of_reads,
-            number_of_flows_per_read,
-            flow_chars,
-            key_sequence,
-        ) = _sff_file_header(handle)
+    def __next__(self):
+        stream = self.stream
+        if self._read_counter == self.number_of_reads:
+            self._read_counter = None  # check EOF only once
+            self._check_eof(stream)
+            raise StopIteration
+        elif self._read_counter is None:
+            raise StopIteration
+        # The spec allows for the index block to be before or even in the middle
+        # of the reads. We can check that if we keep track of our position
+        # in the file...
+        index_offset = self.index_offset
+        if self._offset == index_offset:
+            index_length = self.index_length
+            offset = index_offset + index_length
+            if offset % 8:
+                offset += 8 - (offset % 8)
+            assert offset % 8 == 0
+            stream.seek(offset)
+            self._offset = offset
+        record = self._sff_read_seq_record(self.stream)
+        self._read_counter += 1
+        return record
+
+    def _sff_read_seq_record(self, stream):
+        """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
         # Now on to the reads...
         # the read header format (fixed part):
         # read_header_length     H
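The public API is unchanged; iteration still goes through `Bio.SeqIO`, and `__next__` now drives it: once `_read_counter` reaches `number_of_reads` the EOF check runs exactly once (the `None` sentinel) and every later call just raises `StopIteration`. A minimal usage sketch (the file name is hypothetical; `sff` and `sff-trim` are the standard format names):

    from Bio import SeqIO

    # Full-length reads; flow values and clip coordinates in record.annotations:
    for record in SeqIO.parse("reads.sff", "sff"):
        print(record.id, len(record), record.annotations["clip_qual_right"])

    # Quality/adapter-trimmed reads (clip/flow annotations are dropped):
    for record in SeqIO.parse("reads.sff", "sff-trim"):
        print(record.id, len(record))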
@@ -1030,44 +886,150 @@ class SffIterator(SequenceIterator):
         # clip_adapter_left      H
         # clip_adapter_right     H
         # [rest of read header depends on the name length etc]
-        read_header_fmt = ">2HI4H"
-        read_header_size = struct.calcsize(read_header_fmt)
-        read_flow_fmt = ">%iH" % number_of_flows_per_read
-        read_flow_size = struct.calcsize(read_flow_fmt)
-        assert 1 == struct.calcsize(">B")
-        assert 1 == struct.calcsize(">s")
-        assert 1 == struct.calcsize(">c")
-        assert read_header_size % 8 == 0  # Important for padding calc later!
-        # The spec allows for the index block to be before or even in the middle
-        # of the reads. We can check that if we keep track of our position
-        # in the file...
-        for read in range(number_of_reads):
-            if index_offset and handle.tell() == index_offset:
-                offset = index_offset + index_length
-                if offset % 8:
-                    offset += 8 - (offset % 8)
-                assert offset % 8 == 0
-                handle.seek(offset)
-                # Now that we've done this, we don't need to do it again. Clear
-                # the index_offset so we can skip extra handle.tell() calls:
-                index_offset = 0
-            yield _sff_read_seq_record(
-                handle, number_of_flows_per_read, flow_chars, key_sequence, trim
+        read_header_fmt = SffIterator.read_header_fmt
+        read_header_size = SffIterator.read_header_size
+        (
+            read_header_length,
+            name_length,
+            seq_len,
+            clip_qual_left,
+            clip_qual_right,
+            clip_adapter_left,
+            clip_adapter_right,
+        ) = struct.unpack(read_header_fmt, stream.read(read_header_size))
+        if clip_qual_left:
+            clip_qual_left -= 1  # python counting
+        if clip_adapter_left:
+            clip_adapter_left -= 1  # python counting
+        if read_header_length < 10 or read_header_length % 8 != 0:
+            raise ValueError(
+                "Malformed read header, says length is %i" % read_header_length
             )
-        _check_eof(handle, index_offset, index_length)
-
-
-def _check_eof(handle, index_offset, index_length):
+        # now the name and any padding (remainder of header)
+        name = stream.read(name_length).decode()
+        padding = read_header_length - read_header_size - name_length
+        if stream.read(padding).count(_null) != padding:
+            import warnings
+
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Your SFF file is invalid, post name %i "
+                "byte padding region contained data" % padding,
+                BiopythonParserWarning,
+            )
+        self._offset += read_header_length
+        # now the flowgram values, flowgram index, bases and qualities
+        # NOTE - assuming flowgram_format==1, which means struct type H
+        flow_values = stream.read(self.read_flow_size)  # unpack later if needed
+        temp_fmt = ">%iB" % seq_len  # used for flow index and quals
+        flow_index = stream.read(seq_len)  # unpack later if needed
+        seq = stream.read(seq_len)  # Leave as bytes for Seq object
+        quals = list(struct.unpack(temp_fmt, stream.read(seq_len)))
+        self._offset += self.read_flow_size + seq_len * 3
+        # now any padding...
+        padding = (self.read_flow_size + seq_len * 3) % 8
+        if padding:
+            padding = 8 - padding
+            if stream.read(padding).count(_null) != padding:
+                import warnings
+
+                from Bio import BiopythonParserWarning
+
+                warnings.warn(
+                    "Your SFF file is invalid, post quality %i "
+                    "byte padding region contained data" % padding,
+                    BiopythonParserWarning,
+                )
+            self._offset += padding
+        # Follow Roche and apply most aggressive of qual and adapter clipping.
+        # Note Roche seems to ignore adapter clip fields when writing SFF,
+        # and uses just the quality clipping values for any clipping.
+        clip_left = max(clip_qual_left, clip_adapter_left)
+        # Right clipping of zero means no clipping
+        if clip_qual_right:
+            if clip_adapter_right:
+                clip_right = min(clip_qual_right, clip_adapter_right)
+            else:
+                # Typical case with Roche SFF files
+                clip_right = clip_qual_right
+        elif clip_adapter_right:
+            clip_right = clip_adapter_right
+        else:
+            clip_right = seq_len
+        # Now build a SeqRecord
+        if self.trim:
+            if clip_left >= clip_right:
+                # Raise an error?
+                import warnings
+
+                from Bio import BiopythonParserWarning
+
+                warnings.warn(
+                    "Overlapping clip values in SFF record, trimmed to nothing",
+                    BiopythonParserWarning,
+                )
+                seq = ""
+                quals = []
+            else:
+                seq = seq[clip_left:clip_right].upper()
+                quals = quals[clip_left:clip_right]
+            # Don't record the clipping values, flow etc, they make no sense now:
+            annotations = {}
+        else:
+            if clip_left >= clip_right:
+                import warnings
+
+                from Bio import BiopythonParserWarning
+
+                warnings.warn(
+                    "Overlapping clip values in SFF record", BiopythonParserWarning
+                )
+                seq = seq.lower()
+            else:
+                # This use of mixed case mimics the Roche SFF tool's FASTA output
+                seq = (
+                    seq[:clip_left].lower()
+                    + seq[clip_left:clip_right].upper()
+                    + seq[clip_right:].lower()
+                )
+            annotations = {
+                "flow_values": struct.unpack(self.read_flow_fmt, flow_values),
+                "flow_index": struct.unpack(temp_fmt, flow_index),
+                "flow_chars": self.flow_chars,
+                "flow_key": self.key_sequence,
+                "clip_qual_left": clip_qual_left,
+                "clip_qual_right": clip_qual_right,
+                "clip_adapter_left": clip_adapter_left,
+                "clip_adapter_right": clip_adapter_right,
+            }
+        if re.match(_valid_UAN_read_name, name):
+            annotations["time"] = _get_read_time(name)
+            annotations["region"] = _get_read_region(name)
+            annotations["coords"] = _get_read_xy(name)
+        annotations["molecule_type"] = "DNA"
+        record = SeqRecord(
+            Seq(seq), id=name, name=name, description="", annotations=annotations
+        )
+        # Dirty trick to speed up this line:
+        # record.letter_annotations["phred_quality"] = quals
+        dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
+        # Return the record and then continue...
+        return record
+
+    def _check_eof(self, stream):
         """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).

         Will attempt to spot apparent SFF file concatenation and give an error.

-        Will not attempt to seek, only moves the handle forward.
+        Will not attempt to seek, only moves the stream forward.
         """
-        offset = handle.tell()
+        offset = self._offset
         extra = b""
         padding = 0

+        index_offset = self.index_offset
+        index_length = self.index_length
         if index_offset and offset <= index_offset:
             # Index block then end of file...
             if offset < index_offset:
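Each read's variable-length section (flow values, then flow index, bases and qualities at one byte per base) is padded to an 8-byte boundary, which is why `_offset` advances by `read_flow_size + seq_len * 3` plus padding. A worked example of the arithmetic above, with made-up numbers:

    read_flow_size = 800  # e.g. 400 flows * 2 bytes per H value
    seq_len = 260

    unpadded = read_flow_size + seq_len * 3  # 1580 bytes
    padding = unpadded % 8                   # 1580 % 8 == 4
    if padding:
        padding = 8 - padding                # 4 null bytes expected
    assert (unpadded + padding) % 8 == 0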
@@ -1078,17 +1040,19 @@ def _check_eof(handle, index_offset, index_length):
                 )
             # Doing read to jump the index rather than a seek
             # in case this is a network handle or similar
-            handle.read(index_offset + index_length - offset)
+            stream.read(index_length)
+            self._offset += index_length
             offset = index_offset + index_length
-            if offset != handle.tell():
+            if offset != self._offset:
                 raise ValueError(
                     "Wanted %i, got %i, index is %i to %i"
-                    % (offset, handle.tell(), index_offset, index_offset + index_length)
+                    % (offset, stream.tell(), index_offset, index_offset + index_length)
                 )

         if offset % 8:
             padding = 8 - (offset % 8)
-            extra = handle.read(padding)
+            extra = stream.read(padding)
+            self._offset += padding

         if padding >= 4 and extra[-4:] == _sff:
             # Seen this in one user supplied file, should have been
@@ -1123,11 +1087,14 @@ def _check_eof(handle, index_offset, index_length):
                 BiopythonParserWarning,
             )

-        offset = handle.tell()
+        offset = self._offset
         if offset % 8 != 0:
-            raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
+            raise ValueError(
+                "Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)
+            )
         # Should now be at the end of the file...
-        extra = handle.read(4)
+        extra = stream.read(4)
+        self._offset += 4
         if extra == _sff:
             raise ValueError(
                 "Additional data at end of SFF file, "
@@ -1135,7 +1102,12 @@ def _check_eof(handle, index_offset, index_length):
                 "See offset %i" % offset
             )
         elif extra:
-            raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
+            raise ValueError(
+                "Additional data at end of SFF file, see offset %i" % offset
+            )


+_powers_of_36 = [36**i for i in range(6)]
+
+
 class _SffTrimIterator(SffIterator):
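(The remaining hunks are from the SeqIO random-access index module and its test suite, which call into the methods above.) The relocated `_powers_of_36` table backs `_string_as_base_36`, used to decode the 14-character 454 read names into run time, region and well coordinates. A rough sketch of that decoding convention (letters counting as 0-25 and digits as 26-35 is my reading of the library's helper; illustrative, not code from the commit):

    _powers_of_36 = [36**i for i in range(6)]

    def string_as_base_36(string):
        # Read names match ^[a-zA-Z0-9]{14}$, so only letters/digits occur.
        def value(c):
            if c.isalpha():
                return ord(c.upper()) - ord("A")  # A-Z / a-z -> 0-25
            return ord(c) - ord("0") + 26         # 0-9 -> 26-35

        return sum(value(c) * p for c, p in zip(reversed(string), _powers_of_36))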
@@ -27,6 +27,7 @@ re-indexing the file for use another time.
 """

 import re
+import struct
 from io import BytesIO
 from io import StringIO

@@ -67,13 +68,16 @@ class SffRandomAccess(SeqFileRandomAccess):
         SeqFileRandomAccess.__init__(self, filename, format)
         (
             header_length,
-            index_offset,
-            index_length,
+            self.index_offset,
+            self.index_length,
             number_of_reads,
-            self._flows_per_read,
-            self._flow_chars,
-            self._key_sequence,
+            self.number_of_flows_per_read,
+            self.flow_chars,
+            self.key_sequence,
         ) = SeqIO.SffIO._sff_file_header(self._handle)
+        # Now on to the reads...
+        self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read
+        self.read_flow_size = struct.calcsize(self.read_flow_fmt)

     def __iter__(self):
         """Load any index block in the file, or build it the slow way (PRIVATE)."""
@@ -85,9 +89,9 @@ class SffRandomAccess(SeqFileRandomAccess):
             index_offset,
             index_length,
             number_of_reads,
-            self._flows_per_read,
-            self._flow_chars,
-            self._key_sequence,
+            self.number_of_flows_per_read,
+            self.flow_chars,
+            self.key_sequence,
         ) = SeqIO.SffIO._sff_file_header(handle)
         if index_offset and index_length:
             # There is an index provided, try this the fast way:
@@ -120,9 +124,12 @@ class SffRandomAccess(SeqFileRandomAccess):
                 # Can have an index at start (or mid-file)
                 handle.seek(max_offset)
                 # Parse the final read,
-                SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+                SeqIO.SffIO._sff_read_raw_record(
+                    handle, self.number_of_flows_per_read
+                )
                 # Should now be at the end of the file!
-                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+                self._offset = handle.tell()
+                SeqIO.SffIO.SffIterator._check_eof(self, handle)
                 return
             # We used to give a warning in this case, but Ion Torrent's
             # SFF files don't have an index so that would be annoying.
@@ -135,21 +142,23 @@ class SffRandomAccess(SeqFileRandomAccess):
             raise ValueError(
                 "Indexed %i records, expected %i" % (count, number_of_reads)
             )
-        SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+        self._offset = handle.tell()
+        SeqIO.SffIO.SffIterator._check_eof(self, handle)

     def get(self, offset):
         """Return the SeqRecord starting at the given offset."""
         handle = self._handle
         handle.seek(offset)
-        return SeqIO.SffIO._sff_read_seq_record(
-            handle, self._flows_per_read, self._flow_chars, self._key_sequence
-        )
+        self._offset = offset
+        self.trim = False
+        return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)

     def get_raw(self, offset):
         """Return the raw record from the file as a bytes string."""
         handle = self._handle
         handle.seek(offset)
-        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+        self.stream = handle
+        return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read)


 class SffTrimedRandomAccess(SffRandomAccess):
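On the random-access side, `get()` and `get_raw()` now borrow the iterator's logic by passing the proxy object itself as `self`, after setting `trim` and `_offset` explicitly. Usage is unchanged; a sketch with hypothetical file and read names:

    from Bio import SeqIO

    reads = SeqIO.index("reads.sff", "sff")  # uses the Roche index block if present
    record = reads["E3MFGYR02JWQ7T"]         # random access by read name
    print(record.annotations["flow_key"])
    reads.close()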
@@ -159,13 +168,9 @@ class SffTrimedRandomAccess(SffRandomAccess):
         """Return the SeqRecord starting at the given offset."""
         handle = self._handle
         handle.seek(offset)
-        return SeqIO.SffIO._sff_read_seq_record(
-            handle,
-            self._flows_per_read,
-            self._flow_chars,
-            self._key_sequence,
-            trim=True,
-        )
+        self._offset = offset
+        self.trim = True
+        return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle)


 ###################
@@ -635,20 +635,16 @@ class IndexDictTests(SeqRecordTestBaseClass, SeqIOTestBaseClass):
             else:
                 raise RuntimeError(f"Unexpected mode {mode}")
             if fmt == "sff":
-                rec2 = SeqIO.SffIO._sff_read_seq_record(
-                    handle,
-                    rec_dict._proxy._flows_per_read,
-                    rec_dict._proxy._flow_chars,
-                    rec_dict._proxy._key_sequence,
-                    trim=False,
-                )
+                rec_dict._proxy.trim = False
+                rec_dict._proxy._offset = handle.tell()
+                rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
+                    rec_dict._proxy, handle
+                )
             elif fmt == "sff-trim":
-                rec2 = SeqIO.SffIO._sff_read_seq_record(
-                    handle,
-                    rec_dict._proxy._flows_per_read,
-                    rec_dict._proxy._flow_chars,
-                    rec_dict._proxy._key_sequence,
-                    trim=True,
-                )
+                rec_dict._proxy.trim = True
+                rec_dict._proxy._offset = handle.tell()
+                rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record(
+                    rec_dict._proxy, handle
+                )
             elif fmt == "uniprot-xml":
                 self.assertTrue(raw.startswith(b"<entry "), msg=msg)