Add a parser for 2bit files to Bio.SeqIO (#3388)

* add test script

* add test files

* done

* travis

* travis being obnoxious

* remove unnecessary str support

* avoid using PyUnicode_New as it's not in pypy

* add some documentation

* add some documentation

* move to a comment to travis can't bitch about it

* move to a comment to travis can't bitch about it

* documentation update

* fix warning

* Let's see why 32bit windows is not happy

* Let's see why 32bit windows is not happy

* check the file descriptor in advance; on Windows, lseek with a closed file descriptor will by default close the application

* lseek is fubar on windows

* lseek and read apparently are fubar on Windows. Use Python to seek/read, and numpy to keep the code reasonably fast

* deal with closed files

* travis

* more inane stuff from Travis

* weird

* reverting previous change to the tutorial text

* reverting changes to test_SeqIO.py; these are not needed anymore now that twoBit file reading is done via Python

* improving speed

* using C code for 2bit to sequence conversion

* adding missing C code

* expand exception

* accept formats I and L

* release buffer

* travis

* windows

* private

* NEWS
This commit is contained in:
mdehoon
2020-12-04 00:28:39 +09:00
committed by GitHub
parent cd1d500460
commit 41b6f22574
12 changed files with 871 additions and 10 deletions

View File

@ -95,26 +95,45 @@ class Seq:
self._data = bytes(data)
elif isinstance(data, str):
self._data = bytes(data, encoding="ASCII")
elif self._check_bytes_like(data):
# e.g. the TwoBitSequence objects, which return bytes when
# __getitem__ is called on them.
self._data = data
else:
raise TypeError(
"data should be a string, bytes, bytearray, Seq, or MutableSeq object"
)
def _check_bytes_like(self, data):
# Check if data is bytes-like. This currently requires two things:
# - calling len(data) must return the length of the data
# - calling __getitem__ must return a bytes object for the requested region
try:
len(data)
except TypeError:
return False
try:
c = data[:0]
except TypeError:
return False
if c != b"":
return False
return True
def __bytes__(self):
return self._data
return bytes(self._data[:])
def __repr__(self):
"""Return (truncated) representation of the sequence for debugging."""
data = bytes(self)
if len(data) > 60:
if len(self) > 60:
# Shows the last three letters as it is often useful to see if
# there is a stop codon at the end of a sequence.
# Note total length is 54+3+3=60
start = data[:54].decode("ASCII")
end = data[-3:].decode("ASCII")
start = bytes(self[:54]).decode("ASCII")
end = bytes(self[-3:]).decode("ASCII")
return f"{self.__class__.__name__}('{start}...{end}')"
else:
data = data.decode("ASCII")
data = bytes(self).decode("ASCII")
return f"{self.__class__.__name__}('{data}')"
def __str__(self):
@ -134,7 +153,10 @@ class Seq:
as_string = str(seq_obj)
"""
return self._data.decode("ASCII")
data = self._data
if not isinstance(data, (bytes, bytearray)):
data = bytes(self)
return data.decode("ASCII")
def __hash__(self):
"""Hash of the sequence as a string for comparison.

229
Bio/SeqIO/TwoBitIO.py Normal file
View File

@ -0,0 +1,229 @@
# Copyright 2020 by Michiel de Hoon
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.
This parser reads the index stored in the twoBit file, as well as the masked
regions and the N's for ean sequence. It also creates sequence data objects
(_TwoBitSequenceData objects), which support only two methods: __len__ and
__getitem__. The former will return the length of the sequence, while the
latter returns the sequence (as a bytes object) for the requested region.
Using the information in the index, the __getitem__ method calculates the file
position at which the requested region starts, and only reads the requested
sequence region. Note that the full sequence of a record is loaded only if
specifically requested, making the parser memory-efficient.
The TwoBitIterator object implements the __getitem__, keys, and __len__
methods that allow it to be used as a dictionary.
"""
# The .2bit file format is defined by UCSC as follows
# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
#
#
# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
# randomly-accessible format. The file contains masking information as well
# as the DNA itself.
#
# The file begins with a 16-byte header containing the following fields:
#
# signature - the number 0x1A412743 in the architecture of the machine that
# created the file
# version - zero for now. Readers should abort if they see a version number
# higher than 0
# sequenceCount - the number of sequences in the file
# reserved - always zero for now
#
# All fields are 32 bits unless noted. If the signature value is not as
# given, the reader program should byte-swap the signature and check if the
# swapped version matches. If so, all multiple-byte entities in the file
# will have to be byte-swapped. This enables these binary files to be used
# unchanged on different architectures.
#
# The header is followed by a file index, which contains one entry for each
# sequence. Each index entry contains three fields:
#
# nameSize - a byte containing the length of the name field
# name - the sequence name itself (in ASCII-compatible byte string), of
# variable length depending on nameSize
# offset - the 32-bit offset of the sequence data relative to the start of
# the file, not aligned to any 4-byte padding boundary
#
# The index is followed by the sequence records, which contain nine fields:
#
# dnaSize - number of bases of DNA in the sequence
# nBlockCount - the number of blocks of Ns in the file (representing unknown
# sequence)
# nBlockStarts - an array of length nBlockCount of 32 bit integers
# indicating the (0-based) starting position of a block of Ns
# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
# the length of a block of Ns
# maskBlockCount - the number of masked (lower-case) blocks
# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
# indicating the (0-based) starting position of a masked block
# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
# indicating the length of a masked block
# reserved - always zero for now
# packedDna - the DNA packed to two bits per base, represented as so:
# T - 00, C - 01, A - 10, G - 11. The first base is in the most
# significant 2-bit byte; the last base is in the least significan
# 2 bits. For example, the sequence TCAG is represented as 00011011.
import numpy
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
from . import _twoBitIO
class _TwoBitSequenceData:
"""Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).
Objects of this class store the file position at which the sequence data
start, the sequence length, and the start and end position of unknown (N)
and masked (lowercase) letters in the sequence.
Only two methods are provided: __len__ and __getitem__. The former will
return the length of the sequence, while the latter returns the sequence
(as a bytes object) for the requested region. The full sequence of a record
is loaded only if explicitly requested.
"""
def __init__(self, stream, offset):
"""Initialize the file stream and file position of the sequence data."""
self.stream = stream
self.offset = offset
def __getitem__(self, key):
length = self.length
if isinstance(key, slice):
start, end, step = key.indices(length)
size = len(range(start, end, step))
if size == 0:
return b""
else:
start = key
end = key + 1
step = 1
size = 1
byteStart = start // 4
byteEnd = (end + 3) // 4
byteSize = byteEnd - byteStart
stream = self.stream
try:
stream.seek(self.offset + byteStart)
except ValueError as exception:
if str(exception) == "seek of closed file":
raise ValueError("cannot retrieve sequence: file is closed") from None
raise
data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
sequence = _twoBitIO.convert(
data, start, end, step, self.nBlocks, self.maskBlocks
)
return sequence
def __len__(self):
return self.length
class TwoBitIterator(SequenceIterator):
"""Parser for UCSC twoBit (.2bit) files."""
def __init__(self, source):
"""Read the file index."""
super().__init__(source, mode="b", fmt="twoBit")
# wait to close the file until the TwoBitIterator goes out of scope:
self.should_close_stream = False
stream = self.stream
data = stream.read(4)
if not data:
raise ValueError("Empty file.")
byteorders = ("little", "big")
dtypes = ("<u4", ">u4")
for byteorder, dtype in zip(byteorders, dtypes):
signature = int.from_bytes(data, byteorder)
if signature == 0x1A412743:
break
else:
raise ValueError("Unknown signature")
self.byteorder = byteorder
data = stream.read(4)
version = int.from_bytes(data, byteorder, signed=False)
if version == 1:
raise ValueError(
"version-1 twoBit files with 64-bit offsets for index are currently not supported"
)
if version != 0:
raise ValueError("Found unexpected file version %u; aborting" % version)
data = stream.read(4)
sequenceCount = int.from_bytes(data, byteorder, signed=False)
data = stream.read(4)
reserved = int.from_bytes(data, byteorder, signed=False)
if reserved != 0:
raise ValueError("Found non-zero reserved field; aborting")
sequences = {}
for i in range(sequenceCount):
data = stream.read(1)
nameSize = int.from_bytes(data, byteorder, signed=False)
data = stream.read(nameSize)
name = data.decode("ASCII")
data = stream.read(4)
offset = int.from_bytes(data, byteorder, signed=False)
sequence = _TwoBitSequenceData(stream, offset)
sequences[name] = sequence
self.sequences = sequences
for name, sequence in sequences.items():
offset = sequence.offset
stream.seek(offset)
data = stream.read(4)
dnaSize = int.from_bytes(data, byteorder, signed=False)
sequence.length = dnaSize
data = stream.read(4)
nBlockCount = int.from_bytes(data, byteorder, signed=False)
nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32")
sequence.nBlocks[:, 0] = nBlockStarts
sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes
data = stream.read(4)
maskBlockCount = int.from_bytes(data, byteorder, signed=False)
maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32")
sequence.maskBlocks[:, 0] = maskBlockStarts
sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes
data = stream.read(4)
reserved = int.from_bytes(data, byteorder, signed=False)
if reserved != 0:
raise ValueError("Found non-zero reserved field %u" % reserved)
sequence.offset = stream.tell()
def parse(self, stream):
"""Iterate over the sequences in the file."""
for name, sequence in self.sequences.items():
sequence = Seq(sequence)
record = SeqRecord(sequence, id=name)
yield record
def __getitem__(self, name):
try:
sequence = self.sequences[name]
except ValueError:
raise KeyError(name) from None
sequence = Seq(sequence)
return SeqRecord(sequence, id=name)
def keys(self):
"""Return a list with the names of the sequences in the file."""
return self.sequences.keys()
def __len__(self):
return len(self.sequences)

View File

@ -396,6 +396,7 @@ from Bio.SeqIO import SffIO
from Bio.SeqIO import SnapGeneIO
from Bio.SeqIO import SwissIO
from Bio.SeqIO import TabIO
from Bio.SeqIO import TwoBitIO
from Bio.SeqIO import QualityIO # FastQ and qual files
from Bio.SeqIO import UniprotIO
from Bio.SeqIO import XdnaIO
@ -438,10 +439,10 @@ _FormatToIterator = {
"seqxml": SeqXmlIO.SeqXmlIterator,
"sff": SffIO.SffIterator,
"snapgene": SnapGeneIO.SnapGeneIterator,
# Not sure about this in the long run:
"sff-trim": SffIO._SffTrimIterator,
"sff-trim": SffIO._SffTrimIterator, # Not sure about this in the long run
"swiss": SwissIO.SwissIterator,
"tab": TabIO.TabIterator,
"twobit": TwoBitIO.TwoBitIterator,
"uniprot-xml": UniprotIO.UniprotIterator,
"xdna": XdnaIO.XdnaIterator,
}
@ -573,6 +574,10 @@ def parse(handle, format, alphabet=None):
ID gi|3176602|gb|U78617.1|LOU78617
Sequence length 309
For lazy-loading file formats such as twobit, for which the file contents
is read on demand only, ensure that the file remains open while extracting
sequence data.
If you have a string 'data' containing the file contents, you must
first turn this into a handle in order to parse it:

480
Bio/SeqIO/_twoBitIO.c Normal file
View File

@ -0,0 +1,480 @@
#define PY_SSIZE_T_CLEAN
#include "Python.h"
static const char bases[][4] = {"TTTT", /* 00 00 00 00 */
"TTTC", /* 00 00 00 01 */
"TTTA", /* 00 00 00 10 */
"TTTG", /* 00 00 00 11 */
"TTCT", /* 00 00 01 00 */
"TTCC", /* 00 00 01 01 */
"TTCA", /* 00 00 01 10 */
"TTCG", /* 00 00 01 11 */
"TTAT", /* 00 00 10 00 */
"TTAC", /* 00 00 10 01 */
"TTAA", /* 00 00 10 10 */
"TTAG", /* 00 00 10 11 */
"TTGT", /* 00 00 11 00 */
"TTGC", /* 00 00 11 01 */
"TTGA", /* 00 00 11 10 */
"TTGG", /* 00 00 11 11 */
"TCTT", /* 00 01 00 00 */
"TCTC", /* 00 01 00 01 */
"TCTA", /* 00 01 00 10 */
"TCTG", /* 00 01 00 11 */
"TCCT", /* 00 01 01 00 */
"TCCC", /* 00 01 01 01 */
"TCCA", /* 00 01 01 10 */
"TCCG", /* 00 01 01 11 */
"TCAT", /* 00 01 10 00 */
"TCAC", /* 00 01 10 01 */
"TCAA", /* 00 01 10 10 */
"TCAG", /* 00 01 10 11 */
"TCGT", /* 00 01 11 00 */
"TCGC", /* 00 01 11 01 */
"TCGA", /* 00 01 11 10 */
"TCGG", /* 00 01 11 11 */
"TATT", /* 00 10 00 00 */
"TATC", /* 00 10 00 01 */
"TATA", /* 00 10 00 10 */
"TATG", /* 00 10 00 11 */
"TACT", /* 00 10 01 00 */
"TACC", /* 00 10 01 01 */
"TACA", /* 00 10 01 10 */
"TACG", /* 00 10 01 11 */
"TAAT", /* 00 10 10 00 */
"TAAC", /* 00 10 10 01 */
"TAAA", /* 00 10 10 10 */
"TAAG", /* 00 10 10 11 */
"TAGT", /* 00 10 11 00 */
"TAGC", /* 00 10 11 01 */
"TAGA", /* 00 10 11 10 */
"TAGG", /* 00 10 11 11 */
"TGTT", /* 00 11 00 00 */
"TGTC", /* 00 11 00 01 */
"TGTA", /* 00 11 00 10 */
"TGTG", /* 00 11 00 11 */
"TGCT", /* 00 11 01 00 */
"TGCC", /* 00 11 01 01 */
"TGCA", /* 00 11 01 10 */
"TGCG", /* 00 11 01 11 */
"TGAT", /* 00 11 10 00 */
"TGAC", /* 00 11 10 01 */
"TGAA", /* 00 11 10 10 */
"TGAG", /* 00 11 10 11 */
"TGGT", /* 00 11 11 00 */
"TGGC", /* 00 11 11 01 */
"TGGA", /* 00 11 11 10 */
"TGGG", /* 00 11 11 11 */
"CTTT", /* 01 00 00 00 */
"CTTC", /* 01 00 00 01 */
"CTTA", /* 01 00 00 10 */
"CTTG", /* 01 00 00 11 */
"CTCT", /* 01 00 01 00 */
"CTCC", /* 01 00 01 01 */
"CTCA", /* 01 00 01 10 */
"CTCG", /* 01 00 01 11 */
"CTAT", /* 01 00 10 00 */
"CTAC", /* 01 00 10 01 */
"CTAA", /* 01 00 10 10 */
"CTAG", /* 01 00 10 11 */
"CTGT", /* 01 00 11 00 */
"CTGC", /* 01 00 11 01 */
"CTGA", /* 01 00 11 10 */
"CTGG", /* 01 00 11 11 */
"CCTT", /* 01 01 00 00 */
"CCTC", /* 01 01 00 01 */
"CCTA", /* 01 01 00 10 */
"CCTG", /* 01 01 00 11 */
"CCCT", /* 01 01 01 00 */
"CCCC", /* 01 01 01 01 */
"CCCA", /* 01 01 01 10 */
"CCCG", /* 01 01 01 11 */
"CCAT", /* 01 01 10 00 */
"CCAC", /* 01 01 10 01 */
"CCAA", /* 01 01 10 10 */
"CCAG", /* 01 01 10 11 */
"CCGT", /* 01 01 11 00 */
"CCGC", /* 01 01 11 01 */
"CCGA", /* 01 01 11 10 */
"CCGG", /* 01 01 11 11 */
"CATT", /* 01 10 00 00 */
"CATC", /* 01 10 00 01 */
"CATA", /* 01 10 00 10 */
"CATG", /* 01 10 00 11 */
"CACT", /* 01 10 01 00 */
"CACC", /* 01 10 01 01 */
"CACA", /* 01 10 01 10 */
"CACG", /* 01 10 01 11 */
"CAAT", /* 01 10 10 00 */
"CAAC", /* 01 10 10 01 */
"CAAA", /* 01 10 10 10 */
"CAAG", /* 01 10 10 11 */
"CAGT", /* 01 10 11 00 */
"CAGC", /* 01 10 11 01 */
"CAGA", /* 01 10 11 10 */
"CAGG", /* 01 10 11 11 */
"CGTT", /* 01 11 00 00 */
"CGTC", /* 01 11 00 01 */
"CGTA", /* 01 11 00 10 */
"CGTG", /* 01 11 00 11 */
"CGCT", /* 01 11 01 00 */
"CGCC", /* 01 11 01 01 */
"CGCA", /* 01 11 01 10 */
"CGCG", /* 01 11 01 11 */
"CGAT", /* 01 11 10 00 */
"CGAC", /* 01 11 10 01 */
"CGAA", /* 01 11 10 10 */
"CGAG", /* 01 11 10 11 */
"CGGT", /* 01 11 11 00 */
"CGGC", /* 01 11 11 01 */
"CGGA", /* 01 11 11 10 */
"CGGG", /* 01 11 11 11 */
"ATTT", /* 10 00 00 00 */
"ATTC", /* 10 00 00 01 */
"ATTA", /* 10 00 00 10 */
"ATTG", /* 10 00 00 11 */
"ATCT", /* 10 00 01 00 */
"ATCC", /* 10 00 01 01 */
"ATCA", /* 10 00 01 10 */
"ATCG", /* 10 00 01 11 */
"ATAT", /* 10 00 10 00 */
"ATAC", /* 10 00 10 01 */
"ATAA", /* 10 00 10 10 */
"ATAG", /* 10 00 10 11 */
"ATGT", /* 10 00 11 00 */
"ATGC", /* 10 00 11 01 */
"ATGA", /* 10 00 11 10 */
"ATGG", /* 10 00 11 11 */
"ACTT", /* 10 01 00 00 */
"ACTC", /* 10 01 00 01 */
"ACTA", /* 10 01 00 10 */
"ACTG", /* 10 01 00 11 */
"ACCT", /* 10 01 01 00 */
"ACCC", /* 10 01 01 01 */
"ACCA", /* 10 01 01 10 */
"ACCG", /* 10 01 01 11 */
"ACAT", /* 10 01 10 00 */
"ACAC", /* 10 01 10 01 */
"ACAA", /* 10 01 10 10 */
"ACAG", /* 10 01 10 11 */
"ACGT", /* 10 01 11 00 */
"ACGC", /* 10 01 11 01 */
"ACGA", /* 10 01 11 10 */
"ACGG", /* 10 01 11 11 */
"AATT", /* 10 10 00 00 */
"AATC", /* 10 10 00 01 */
"AATA", /* 10 10 00 10 */
"AATG", /* 10 10 00 11 */
"AACT", /* 10 10 01 00 */
"AACC", /* 10 10 01 01 */
"AACA", /* 10 10 01 10 */
"AACG", /* 10 10 01 11 */
"AAAT", /* 10 10 10 00 */
"AAAC", /* 10 10 10 01 */
"AAAA", /* 10 10 10 10 */
"AAAG", /* 10 10 10 11 */
"AAGT", /* 10 10 11 00 */
"AAGC", /* 10 10 11 01 */
"AAGA", /* 10 10 11 10 */
"AAGG", /* 10 10 11 11 */
"AGTT", /* 10 11 00 00 */
"AGTC", /* 10 11 00 01 */
"AGTA", /* 10 11 00 10 */
"AGTG", /* 10 11 00 11 */
"AGCT", /* 10 11 01 00 */
"AGCC", /* 10 11 01 01 */
"AGCA", /* 10 11 01 10 */
"AGCG", /* 10 11 01 11 */
"AGAT", /* 10 11 10 00 */
"AGAC", /* 10 11 10 01 */
"AGAA", /* 10 11 10 10 */
"AGAG", /* 10 11 10 11 */
"AGGT", /* 10 11 11 00 */
"AGGC", /* 10 11 11 01 */
"AGGA", /* 10 11 11 10 */
"AGGG", /* 10 11 11 11 */
"GTTT", /* 11 00 00 00 */
"GTTC", /* 11 00 00 01 */
"GTTA", /* 11 00 00 10 */
"GTTG", /* 11 00 00 11 */
"GTCT", /* 11 00 01 00 */
"GTCC", /* 11 00 01 01 */
"GTCA", /* 11 00 01 10 */
"GTCG", /* 11 00 01 11 */
"GTAT", /* 11 00 10 00 */
"GTAC", /* 11 00 10 01 */
"GTAA", /* 11 00 10 10 */
"GTAG", /* 11 00 10 11 */
"GTGT", /* 11 00 11 00 */
"GTGC", /* 11 00 11 01 */
"GTGA", /* 11 00 11 10 */
"GTGG", /* 11 00 11 11 */
"GCTT", /* 11 01 00 00 */
"GCTC", /* 11 01 00 01 */
"GCTA", /* 11 01 00 10 */
"GCTG", /* 11 01 00 11 */
"GCCT", /* 11 01 01 00 */
"GCCC", /* 11 01 01 01 */
"GCCA", /* 11 01 01 10 */
"GCCG", /* 11 01 01 11 */
"GCAT", /* 11 01 10 00 */
"GCAC", /* 11 01 10 01 */
"GCAA", /* 11 01 10 10 */
"GCAG", /* 11 01 10 11 */
"GCGT", /* 11 01 11 00 */
"GCGC", /* 11 01 11 01 */
"GCGA", /* 11 01 11 10 */
"GCGG", /* 11 01 11 11 */
"GATT", /* 11 10 00 00 */
"GATC", /* 11 10 00 01 */
"GATA", /* 11 10 00 10 */
"GATG", /* 11 10 00 11 */
"GACT", /* 11 10 01 00 */
"GACC", /* 11 10 01 01 */
"GACA", /* 11 10 01 10 */
"GACG", /* 11 10 01 11 */
"GAAT", /* 11 10 10 00 */
"GAAC", /* 11 10 10 01 */
"GAAA", /* 11 10 10 10 */
"GAAG", /* 11 10 10 11 */
"GAGT", /* 11 10 11 00 */
"GAGC", /* 11 10 11 01 */
"GAGA", /* 11 10 11 10 */
"GAGG", /* 11 10 11 11 */
"GGTT", /* 11 11 00 00 */
"GGTC", /* 11 11 00 01 */
"GGTA", /* 11 11 00 10 */
"GGTG", /* 11 11 00 11 */
"GGCT", /* 11 11 01 00 */
"GGCC", /* 11 11 01 01 */
"GGCA", /* 11 11 01 10 */
"GGCG", /* 11 11 01 11 */
"GGAT", /* 11 11 10 00 */
"GGAC", /* 11 11 10 01 */
"GGAA", /* 11 11 10 10 */
"GGAG", /* 11 11 10 11 */
"GGGT", /* 11 11 11 00 */
"GGGC", /* 11 11 11 01 */
"GGGA", /* 11 11 11 10 */
"GGGG", /* 11 11 11 11 */
};
static int
extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) {
uint32_t i;
const uint32_t size = end - start;
const uint32_t byteStart = start / 4;
const uint32_t byteEnd = (end + 3) / 4;
if (byteSize != byteEnd - byteStart) {
PyErr_Format(PyExc_RuntimeError,
"unexpected number of bytes %u (expected %u)",
byteSize, byteEnd - byteStart);
return -1;
}
start -= byteStart * 4;
if (byteStart + 1 == byteEnd) {
/* one byte only */
memcpy(sequence, &(bases[*bytes][start]), size);
}
else {
end -= byteEnd * 4;
/* end is now a negative number equal to the distance to the byte end */
memcpy(sequence, &(bases[*bytes][start]), 4 - start);
bytes++;
sequence += (4 - start);
for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4)
memcpy(sequence, bases[*bytes], 4);
memcpy(sequence, bases[*bytes], end + 4);
bytes++;
bytes -= byteSize;
}
return 0;
}
static void
applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks)
{
const Py_ssize_t nBlockCount = nBlocks->shape[0];
const uint32_t* const nBlockPositions = nBlocks->buf;
Py_ssize_t i;
for (i = 0; i < nBlockCount; i++) {
uint32_t nBlockStart = nBlockPositions[2*i];
uint32_t nBlockEnd = nBlockPositions[2*i+1];
if (nBlockEnd < start) continue;
if (end < nBlockStart) break;
if (nBlockStart < start) nBlockStart = start;
if (end < nBlockEnd) nBlockEnd = end;
memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart);
}
}
static void
applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks)
{
const Py_ssize_t maskBlockCount = maskBlocks->shape[0];
const uint32_t* const maskBlockPositions = maskBlocks->buf;
const char diff = 'a' - 'A';
Py_ssize_t i;
for (i = 0; i < maskBlockCount; i++) {
uint32_t j;
uint32_t maskBlockStart = maskBlockPositions[2*i];
uint32_t maskBlockEnd = maskBlockPositions[2*i+1];
if (maskBlockEnd < start) continue;
if (end < maskBlockStart) break;
if (maskBlockStart < start) maskBlockStart = start;
if (end < maskBlockEnd) maskBlockEnd = end;
for (j = maskBlockStart - start; j < maskBlockEnd - start; j++)
sequence[j] += diff;
}
}
static int
blocks_converter(PyObject* object, void* pointer)
{
const int flag = PyBUF_ND | PyBUF_FORMAT;
Py_buffer *view = pointer;
if (object == NULL) goto exit;
if (PyObject_GetBuffer(object, view, flag) == -1) {
PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format.");
return 0;
}
if (view->itemsize != sizeof(uint32_t)
|| (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) {
PyErr_Format(PyExc_RuntimeError,
"blocks have incorrect data type (itemsize %zd, format %s)",
view->itemsize, view->format);
goto exit;
}
if (view->ndim != 2) {
PyErr_Format(PyExc_RuntimeError,
"blocks have incorrect rank %d (expected 2)", view->ndim);
goto exit;
}
if (view->shape[1] != 2) {
PyErr_Format(PyExc_RuntimeError,
"blocks should have two colums (found %zd)",
view->shape[1]);
goto exit;
}
return Py_CLEANUP_SUPPORTED;
exit:
PyBuffer_Release(view);
return 0;
}
static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object";
static PyObject*
TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords)
{
const unsigned char *data;
Py_ssize_t start;
Py_ssize_t end;
Py_ssize_t step;
Py_ssize_t size;
Py_ssize_t length;
Py_buffer nBlocks;
Py_buffer maskBlocks;
PyObject *object;
char *sequence;
static char* kwlist[] = {"data", "start", "end", "step",
"nBlocks", "maskBlocks", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist,
&data, &length, &start, &end, &step,
&blocks_converter, &nBlocks,
&blocks_converter, &maskBlocks))
return NULL;
size = (end - start) / step;
object = PyBytes_FromStringAndSize(NULL, size);
if (!object) goto exit;
sequence = PyBytes_AS_STRING(object);
if (step == 1) {
if (extract(data, length, start, end, sequence) < 0) {
Py_DECREF(object);
object = NULL;
goto exit;
}
applyNs(sequence, start, end, &nBlocks);
applyMask(sequence, start, end, &maskBlocks);
}
else {
Py_ssize_t current, i;
Py_ssize_t full_start, full_end;
char* full_sequence;
if (start <= end) {
full_start = start;
full_end = end;
current = 0; /* first position in sequence */
}
else {
full_start = end + 1;
full_end = start + 1;
current = start - end - 1; /* last position in sequence */
}
full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char));
full_sequence[full_end-full_start] = '\0';
if (!full_sequence) {
Py_DECREF(object);
object = NULL;
goto exit;
}
if (extract(data, length, full_start, full_end, full_sequence) < 0) {
PyMem_Free(full_sequence);
Py_DECREF(object);
object = NULL;
goto exit;
}
applyNs(full_sequence, full_start, full_end, &nBlocks);
applyMask(full_sequence, full_start, full_end, &maskBlocks);
for (i = 0; i < size; current += step, i++)
sequence[i] = full_sequence[current];
PyMem_Free(full_sequence);
}
exit:
blocks_converter(NULL, &nBlocks);
blocks_converter(NULL, &maskBlocks);
return object;
}
static struct PyMethodDef _twoBitIO_methods[] = {
{"convert",
(PyCFunction)TwoBit_convert,
METH_VARARGS | METH_KEYWORDS,
TwoBit_convert__doc__
},
{NULL, NULL, 0, NULL} /* sentinel */
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_twoBitIO",
"Parser for DNA sequence data in 2bit format",
-1,
_twoBitIO_methods,
NULL,
NULL,
NULL,
NULL
};
PyObject *
PyInit__twoBitIO(void)
{
return PyModule_Create(&moduledef);
}

View File

@ -540,7 +540,27 @@ Length 394
\section{Sequence files as Dictionaries}
We're now going to introduce three related functions in the \verb|Bio.SeqIO|
Looping over the iterator returned by \verb|SeqIO.parse| once will exhaust the file. For self-indexed files, such as files in the twoBit format, the return value of \verb|SeqIO.parse| can also be used as a dictionary, allowing random access to the sequence contents. As in this case parsing is done on demand, the file must remain open as long as the sequence data is being accessed:
%doctest ../Tests/TwoBit
\begin{minted}{pycon}
>>> from Bio import SeqIO
>>> handle = open("sequence.bigendian.2bit", "rb")
>>> records = SeqIO.parse(handle, "twobit")
>>> records.keys()
dict_keys(['seq11111', 'seq222', 'seq3333', 'seq4', 'seq555', 'seq6'])
>>> records['seq222']
SeqRecord(seq=Seq('TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTC...ACA'), id='seq222', name='<unknown name>', description='<unknown description>', dbxrefs=[])
>>> records['seq222'].seq
Seq('TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTC...ACA')
>>> handle.close()
>>> records['seq222'].seq
Traceback (most recent call last):
...
ValueError: cannot retrieve sequence: file is closed
\end{minted}
For other file formats, \verb|Bio.SeqIO| provides three related functions
module which allow dictionary like random access to a multi-sequence file.
There is a trade off here between flexibility and memory usage. In summary:
\begin{itemize}

View File

@ -29,6 +29,9 @@ atomic solvent accessible areas without third-party tools.
Expected ``TypeError`` behaviour has been restored to the ``Seq`` object's
string like methods (fixing a regression in Biopython 1.78).
A parser for twoBit (.2bit) sequence data was added to ``Bio.SeqIO``. This is
a lazy parser that allows fast access to genome-size DNA sequence files.
Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors:

Binary file not shown.

32
Tests/TwoBit/sequence.fa Normal file
View File

@ -0,0 +1,32 @@
>seq11111
GTATACCCCTTGGGCAGATTTACCCCTCTCGTCCCTGTCCCGTGACGGAATCGGGTAATCCATCGACTCT
CGACCTGNNNNNNNNNNNNNNNNNNNCGCGTTTGAGTGTCGGGGGTGCTGTCGTTGTGTCCCCTGTGAGC
CACAGAAATGCATCTTTCCTGAGGACGAGGACGGTAGTcgtgaagcaccgtcggTTTGATTTTATTCAGC
CTGCAGATTGGATGCATAATGGTACACATTATTCTTTATGCTGGTTCCCGACATCATTGTGGTTCGCTGC
TCAACAATCCTGGGTCGTTTGTTGANNNNNNNNNNNNNNNNNNNNNNNCTGAATGCTCTAGTAAATAGCT
AGGGAGaacattagatgtggCAGCAGTAGCTTCTTCATCTGTCGTTAACCTTTGCCGGTCTGTTGCCTAT
ATCCCTAAGTCTGGTTAATGGCTCATCCTCATGTGAAGATGGGCCGGCATGCGAGCACCG
>seq222
TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTCCCGCGTTGGTCGGTCT
GGGACCAAGACCCCGCTAGAtcccctcttcaaNNNNNNNNNNNNNNNNNNNNNNNNNGTGATGAGAATCT
GGGCCATCTACNNNNNNNNNNnnnnnnnngccgcggtacgtggTAGGAACAGTGTATCATCACAGGCGTC
GGCGCATGCGGTAGAGCAAACGGAATACTCTAGTGAGATTCAGCTTTGTACCATCTACA
>seq3333
cgcgtaacgagaTTCCCACGCACTGATTGAGGATACGAACAACGGGAAGGTAATCACTTTGGCATCCATG
CAGGCTTTCTTTTGGTTTCGCTGTATCGAATTcagatcgggtagatgACCTAGGTTAGTACATTCACTAG
GGAGAGTAAAGTGGAGCCCGGGTACCTCGGCTTGTGCTGTTGCTTAAATCGGCCCTCGCGGTTGTCGATA
AACTCGCTTAGTCGAACGACTCACACGGAACGTAATATTTGCCCGCGCATAGGACCTACCCAACTTCGGG
AATGATATTTAGatcgaacggcAATAGACAGAGGGGCTGAGCAGGTCCAAAAACACAGAGGATACTTACG
CACTCTTGCGGGGGTATTCCCCTGATCCCCCTCAAAACGTACGACCGTACCAGCCAGGTCCGCCGTTTAG
CTGTGCCCCCGTCATCCAAAGTttcatcctcagctctacatggttgcgggttcaaattaacgaggatctg
>seq4
NNNNNNNNNNNNNNNNNGACAAGGTTCCTGGGGTGATGCACACGCTCGGAGGGTGGGTAAATCAAAACGG
CGGAAATAATGTCATACATGTAACAGATTATCAACCCGtagtcttccgctagtCAAGAATTTAGTAATGG
TGTGAGGGAGGGGAAATTCTGATCGCACGAATATGAAGTGTAATGACGGCCGAGAAGCCGAGGAGAAAGC
TCACAGTGATCCGCATTCGTCTGCTCCATGGTTTATCCATCGGCGCCGGGAATTTGTATACAAGACCCTG
CGGTAATTATGCACCTAACGCCAGTATTGGCATTAGATCTAGTCGCNNNNNNNNNNNNNNNTG
>seq555
CATGTATCGCATGCGGGACGTTTtgtcctgacataggttgTAGTTCCGTCATACCGTGAGATGGGTCCCT
TCACCCGATGGAGCTAGAAGGAAGATTTGCACCATGGATGGNNNNNNNNNNNNNNNN
>seq6
ACGTacgtNNNNnn

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,69 @@
"""Tests for SeqIO TwoBitIO module."""
import os
import random
import unittest
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.SeqIO import TwoBitIO
class Parsing(unittest.TestCase):
"""Test parsing 2bit files."""
def setUp(self):
path = "TwoBit/sequence.fa"
records = SeqIO.parse(path, "fasta")
self.records = list(records)
def test_littleendian(self):
path = "TwoBit/sequence.littleendian.2bit"
with open(path, "rb") as handle:
records = TwoBitIO.TwoBitIterator(handle)
self.assertEqual(records.byteorder, "little")
self.assertEqual(len(self.records), len(records))
for record1, record2 in zip(self.records, records):
self.assertEqual(record1.id, record2.id)
seq1 = record1.seq
seq2 = record2.seq
self.assertEqual(seq1, seq2)
n = len(seq1)
for i in range(n):
for j in range(i, n):
self.assertEqual(seq1[i:j], seq2[i:j])
self.assertEqual(repr(seq1[i:j]), repr(seq2[i:j]))
def test_bigendian(self):
path = "TwoBit/sequence.bigendian.2bit"
with open(path, "rb") as handle:
records = TwoBitIO.TwoBitIterator(handle)
self.assertEqual(len(records), 6)
self.assertEqual(records.byteorder, "big")
for record1, record2 in zip(self.records, records):
self.assertEqual(record1.id, record2.id)
seq1 = record1.seq
seq2 = record2.seq
self.assertEqual(seq1, seq2)
n = len(seq1)
for i in range(n):
for j in range(i, n):
self.assertEqual(seq1[i:j], seq2[i:j])
self.assertEqual(repr(seq1[i:j]), repr(seq2[i:j]))
def test_sequence_long(self):
path = "TwoBit/sequence.long.2bit"
with open(path, "rb") as handle:
with self.assertRaises(ValueError) as cm:
TwoBitIO.TwoBitIterator(handle)
self.assertEqual(
str(cm.exception),
"version-1 twoBit files with 64-bit offsets for index are currently not supported",
)
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)

View File

@ -198,6 +198,7 @@ EXTENSIONS = [
"Bio.Cluster._cluster", ["Bio/Cluster/cluster.c", "Bio/Cluster/clustermodule.c"]
),
Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
Extension("Bio.SeqIO._twoBitIO", ["Bio/SeqIO/_twoBitIO.c"]),
]
# We now define the Biopython version number in Bio/__init__.py