mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Add a parser for 2bit files to Bio.SeqIO (#3388)
* add test script * add test files * done * travis * travis being obnoxious * remove unnecessary str support * avoid using PyUnicode_New as it's not in pypy * add some documentation * add some documentation * move to a comment to travis can't bitch about it * move to a comment to travis can't bitch about it * documentation update * fix warning * Let's see why 32bit windows is not happy * Let's see why 32bit windows is not happy * check the file descriptor in advance; on Windows, lseek with a closed file descriptor will by default close the application * lseek is fubar on windows * lseek and read apparently are fubar on Windows. Use Python to seek/read, and numpy to keep the code reasonably fast * deal with closed files * travis * more inane stuff from Travis * weird * reverting previous change to the tutorial text * reverting changes to test_SeqIO.py; these are not needed anymore now that twoBit file reading is done via Python * improving speed * using C code for 2bit to sequence conversion * adding missing C code * expand exception * accept formats I and L * release buffer * travis * windows * private * NEWS
This commit is contained in:
36
Bio/Seq.py
36
Bio/Seq.py
@ -95,26 +95,45 @@ class Seq:
|
||||
self._data = bytes(data)
|
||||
elif isinstance(data, str):
|
||||
self._data = bytes(data, encoding="ASCII")
|
||||
elif self._check_bytes_like(data):
|
||||
# e.g. the TwoBitSequence objects, which return bytes when
|
||||
# __getitem__ is called on them.
|
||||
self._data = data
|
||||
else:
|
||||
raise TypeError(
|
||||
"data should be a string, bytes, bytearray, Seq, or MutableSeq object"
|
||||
)
|
||||
|
||||
def _check_bytes_like(self, data):
|
||||
# Check if data is bytes-like. This currently requires two things:
|
||||
# - calling len(data) must return the length of the data
|
||||
# - calling __getitem__ must return a bytes object for the requested region
|
||||
try:
|
||||
len(data)
|
||||
except TypeError:
|
||||
return False
|
||||
try:
|
||||
c = data[:0]
|
||||
except TypeError:
|
||||
return False
|
||||
if c != b"":
|
||||
return False
|
||||
return True
|
||||
|
||||
def __bytes__(self):
|
||||
return self._data
|
||||
return bytes(self._data[:])
|
||||
|
||||
def __repr__(self):
|
||||
"""Return (truncated) representation of the sequence for debugging."""
|
||||
data = bytes(self)
|
||||
if len(data) > 60:
|
||||
if len(self) > 60:
|
||||
# Shows the last three letters as it is often useful to see if
|
||||
# there is a stop codon at the end of a sequence.
|
||||
# Note total length is 54+3+3=60
|
||||
start = data[:54].decode("ASCII")
|
||||
end = data[-3:].decode("ASCII")
|
||||
start = bytes(self[:54]).decode("ASCII")
|
||||
end = bytes(self[-3:]).decode("ASCII")
|
||||
return f"{self.__class__.__name__}('{start}...{end}')"
|
||||
else:
|
||||
data = data.decode("ASCII")
|
||||
data = bytes(self).decode("ASCII")
|
||||
return f"{self.__class__.__name__}('{data}')"
|
||||
|
||||
def __str__(self):
|
||||
@ -134,7 +153,10 @@ class Seq:
|
||||
as_string = str(seq_obj)
|
||||
|
||||
"""
|
||||
return self._data.decode("ASCII")
|
||||
data = self._data
|
||||
if not isinstance(data, (bytes, bytearray)):
|
||||
data = bytes(self)
|
||||
return data.decode("ASCII")
|
||||
|
||||
def __hash__(self):
|
||||
"""Hash of the sequence as a string for comparison.
|
||||
|
229
Bio/SeqIO/TwoBitIO.py
Normal file
229
Bio/SeqIO/TwoBitIO.py
Normal file
@ -0,0 +1,229 @@
|
||||
# Copyright 2020 by Michiel de Hoon
|
||||
#
|
||||
# This file is part of the Biopython distribution and governed by your
|
||||
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
||||
# Please see the LICENSE file that should have been included as part of this
|
||||
# package.
|
||||
|
||||
"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.
|
||||
|
||||
This parser reads the index stored in the twoBit file, as well as the masked
|
||||
regions and the N's for ean sequence. It also creates sequence data objects
|
||||
(_TwoBitSequenceData objects), which support only two methods: __len__ and
|
||||
__getitem__. The former will return the length of the sequence, while the
|
||||
latter returns the sequence (as a bytes object) for the requested region.
|
||||
|
||||
Using the information in the index, the __getitem__ method calculates the file
|
||||
position at which the requested region starts, and only reads the requested
|
||||
sequence region. Note that the full sequence of a record is loaded only if
|
||||
specifically requested, making the parser memory-efficient.
|
||||
|
||||
The TwoBitIterator object implements the __getitem__, keys, and __len__
|
||||
methods that allow it to be used as a dictionary.
|
||||
"""
|
||||
|
||||
# The .2bit file format is defined by UCSC as follows
|
||||
# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
|
||||
#
|
||||
#
|
||||
# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
|
||||
# randomly-accessible format. The file contains masking information as well
|
||||
# as the DNA itself.
|
||||
#
|
||||
# The file begins with a 16-byte header containing the following fields:
|
||||
#
|
||||
# signature - the number 0x1A412743 in the architecture of the machine that
|
||||
# created the file
|
||||
# version - zero for now. Readers should abort if they see a version number
|
||||
# higher than 0
|
||||
# sequenceCount - the number of sequences in the file
|
||||
# reserved - always zero for now
|
||||
#
|
||||
# All fields are 32 bits unless noted. If the signature value is not as
|
||||
# given, the reader program should byte-swap the signature and check if the
|
||||
# swapped version matches. If so, all multiple-byte entities in the file
|
||||
# will have to be byte-swapped. This enables these binary files to be used
|
||||
# unchanged on different architectures.
|
||||
#
|
||||
# The header is followed by a file index, which contains one entry for each
|
||||
# sequence. Each index entry contains three fields:
|
||||
#
|
||||
# nameSize - a byte containing the length of the name field
|
||||
# name - the sequence name itself (in ASCII-compatible byte string), of
|
||||
# variable length depending on nameSize
|
||||
# offset - the 32-bit offset of the sequence data relative to the start of
|
||||
# the file, not aligned to any 4-byte padding boundary
|
||||
#
|
||||
# The index is followed by the sequence records, which contain nine fields:
|
||||
#
|
||||
# dnaSize - number of bases of DNA in the sequence
|
||||
# nBlockCount - the number of blocks of Ns in the file (representing unknown
|
||||
# sequence)
|
||||
# nBlockStarts - an array of length nBlockCount of 32 bit integers
|
||||
# indicating the (0-based) starting position of a block of Ns
|
||||
# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
|
||||
# the length of a block of Ns
|
||||
# maskBlockCount - the number of masked (lower-case) blocks
|
||||
# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
|
||||
# indicating the (0-based) starting position of a masked block
|
||||
# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
|
||||
# indicating the length of a masked block
|
||||
# reserved - always zero for now
|
||||
# packedDna - the DNA packed to two bits per base, represented as so:
|
||||
# T - 00, C - 01, A - 10, G - 11. The first base is in the most
|
||||
# significant 2-bit byte; the last base is in the least significan
|
||||
# 2 bits. For example, the sequence TCAG is represented as 00011011.
|
||||
|
||||
|
||||
import numpy
|
||||
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
from .Interfaces import SequenceIterator
|
||||
from . import _twoBitIO
|
||||
|
||||
|
||||
class _TwoBitSequenceData:
|
||||
"""Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).
|
||||
|
||||
Objects of this class store the file position at which the sequence data
|
||||
start, the sequence length, and the start and end position of unknown (N)
|
||||
and masked (lowercase) letters in the sequence.
|
||||
|
||||
Only two methods are provided: __len__ and __getitem__. The former will
|
||||
return the length of the sequence, while the latter returns the sequence
|
||||
(as a bytes object) for the requested region. The full sequence of a record
|
||||
is loaded only if explicitly requested.
|
||||
"""
|
||||
|
||||
def __init__(self, stream, offset):
|
||||
"""Initialize the file stream and file position of the sequence data."""
|
||||
self.stream = stream
|
||||
self.offset = offset
|
||||
|
||||
def __getitem__(self, key):
|
||||
length = self.length
|
||||
if isinstance(key, slice):
|
||||
start, end, step = key.indices(length)
|
||||
size = len(range(start, end, step))
|
||||
if size == 0:
|
||||
return b""
|
||||
else:
|
||||
start = key
|
||||
end = key + 1
|
||||
step = 1
|
||||
size = 1
|
||||
byteStart = start // 4
|
||||
byteEnd = (end + 3) // 4
|
||||
byteSize = byteEnd - byteStart
|
||||
stream = self.stream
|
||||
try:
|
||||
stream.seek(self.offset + byteStart)
|
||||
except ValueError as exception:
|
||||
if str(exception) == "seek of closed file":
|
||||
raise ValueError("cannot retrieve sequence: file is closed") from None
|
||||
raise
|
||||
data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
|
||||
sequence = _twoBitIO.convert(
|
||||
data, start, end, step, self.nBlocks, self.maskBlocks
|
||||
)
|
||||
return sequence
|
||||
|
||||
def __len__(self):
|
||||
return self.length
|
||||
|
||||
|
||||
class TwoBitIterator(SequenceIterator):
|
||||
"""Parser for UCSC twoBit (.2bit) files."""
|
||||
|
||||
def __init__(self, source):
|
||||
"""Read the file index."""
|
||||
super().__init__(source, mode="b", fmt="twoBit")
|
||||
# wait to close the file until the TwoBitIterator goes out of scope:
|
||||
self.should_close_stream = False
|
||||
stream = self.stream
|
||||
data = stream.read(4)
|
||||
if not data:
|
||||
raise ValueError("Empty file.")
|
||||
byteorders = ("little", "big")
|
||||
dtypes = ("<u4", ">u4")
|
||||
for byteorder, dtype in zip(byteorders, dtypes):
|
||||
signature = int.from_bytes(data, byteorder)
|
||||
if signature == 0x1A412743:
|
||||
break
|
||||
else:
|
||||
raise ValueError("Unknown signature")
|
||||
self.byteorder = byteorder
|
||||
data = stream.read(4)
|
||||
version = int.from_bytes(data, byteorder, signed=False)
|
||||
if version == 1:
|
||||
raise ValueError(
|
||||
"version-1 twoBit files with 64-bit offsets for index are currently not supported"
|
||||
)
|
||||
if version != 0:
|
||||
raise ValueError("Found unexpected file version %u; aborting" % version)
|
||||
data = stream.read(4)
|
||||
sequenceCount = int.from_bytes(data, byteorder, signed=False)
|
||||
data = stream.read(4)
|
||||
reserved = int.from_bytes(data, byteorder, signed=False)
|
||||
if reserved != 0:
|
||||
raise ValueError("Found non-zero reserved field; aborting")
|
||||
sequences = {}
|
||||
for i in range(sequenceCount):
|
||||
data = stream.read(1)
|
||||
nameSize = int.from_bytes(data, byteorder, signed=False)
|
||||
data = stream.read(nameSize)
|
||||
name = data.decode("ASCII")
|
||||
data = stream.read(4)
|
||||
offset = int.from_bytes(data, byteorder, signed=False)
|
||||
sequence = _TwoBitSequenceData(stream, offset)
|
||||
sequences[name] = sequence
|
||||
self.sequences = sequences
|
||||
for name, sequence in sequences.items():
|
||||
offset = sequence.offset
|
||||
stream.seek(offset)
|
||||
data = stream.read(4)
|
||||
dnaSize = int.from_bytes(data, byteorder, signed=False)
|
||||
sequence.length = dnaSize
|
||||
data = stream.read(4)
|
||||
nBlockCount = int.from_bytes(data, byteorder, signed=False)
|
||||
nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
|
||||
nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
|
||||
sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32")
|
||||
sequence.nBlocks[:, 0] = nBlockStarts
|
||||
sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes
|
||||
data = stream.read(4)
|
||||
maskBlockCount = int.from_bytes(data, byteorder, signed=False)
|
||||
maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
|
||||
maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
|
||||
sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32")
|
||||
sequence.maskBlocks[:, 0] = maskBlockStarts
|
||||
sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes
|
||||
data = stream.read(4)
|
||||
reserved = int.from_bytes(data, byteorder, signed=False)
|
||||
if reserved != 0:
|
||||
raise ValueError("Found non-zero reserved field %u" % reserved)
|
||||
sequence.offset = stream.tell()
|
||||
|
||||
def parse(self, stream):
|
||||
"""Iterate over the sequences in the file."""
|
||||
for name, sequence in self.sequences.items():
|
||||
sequence = Seq(sequence)
|
||||
record = SeqRecord(sequence, id=name)
|
||||
yield record
|
||||
|
||||
def __getitem__(self, name):
|
||||
try:
|
||||
sequence = self.sequences[name]
|
||||
except ValueError:
|
||||
raise KeyError(name) from None
|
||||
sequence = Seq(sequence)
|
||||
return SeqRecord(sequence, id=name)
|
||||
|
||||
def keys(self):
|
||||
"""Return a list with the names of the sequences in the file."""
|
||||
return self.sequences.keys()
|
||||
|
||||
def __len__(self):
|
||||
return len(self.sequences)
|
@ -396,6 +396,7 @@ from Bio.SeqIO import SffIO
|
||||
from Bio.SeqIO import SnapGeneIO
|
||||
from Bio.SeqIO import SwissIO
|
||||
from Bio.SeqIO import TabIO
|
||||
from Bio.SeqIO import TwoBitIO
|
||||
from Bio.SeqIO import QualityIO # FastQ and qual files
|
||||
from Bio.SeqIO import UniprotIO
|
||||
from Bio.SeqIO import XdnaIO
|
||||
@ -438,10 +439,10 @@ _FormatToIterator = {
|
||||
"seqxml": SeqXmlIO.SeqXmlIterator,
|
||||
"sff": SffIO.SffIterator,
|
||||
"snapgene": SnapGeneIO.SnapGeneIterator,
|
||||
# Not sure about this in the long run:
|
||||
"sff-trim": SffIO._SffTrimIterator,
|
||||
"sff-trim": SffIO._SffTrimIterator, # Not sure about this in the long run
|
||||
"swiss": SwissIO.SwissIterator,
|
||||
"tab": TabIO.TabIterator,
|
||||
"twobit": TwoBitIO.TwoBitIterator,
|
||||
"uniprot-xml": UniprotIO.UniprotIterator,
|
||||
"xdna": XdnaIO.XdnaIterator,
|
||||
}
|
||||
@ -573,6 +574,10 @@ def parse(handle, format, alphabet=None):
|
||||
ID gi|3176602|gb|U78617.1|LOU78617
|
||||
Sequence length 309
|
||||
|
||||
For lazy-loading file formats such as twobit, for which the file contents
|
||||
is read on demand only, ensure that the file remains open while extracting
|
||||
sequence data.
|
||||
|
||||
If you have a string 'data' containing the file contents, you must
|
||||
first turn this into a handle in order to parse it:
|
||||
|
||||
|
480
Bio/SeqIO/_twoBitIO.c
Normal file
480
Bio/SeqIO/_twoBitIO.c
Normal file
@ -0,0 +1,480 @@
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include "Python.h"
|
||||
|
||||
|
||||
static const char bases[][4] = {"TTTT", /* 00 00 00 00 */
|
||||
"TTTC", /* 00 00 00 01 */
|
||||
"TTTA", /* 00 00 00 10 */
|
||||
"TTTG", /* 00 00 00 11 */
|
||||
"TTCT", /* 00 00 01 00 */
|
||||
"TTCC", /* 00 00 01 01 */
|
||||
"TTCA", /* 00 00 01 10 */
|
||||
"TTCG", /* 00 00 01 11 */
|
||||
"TTAT", /* 00 00 10 00 */
|
||||
"TTAC", /* 00 00 10 01 */
|
||||
"TTAA", /* 00 00 10 10 */
|
||||
"TTAG", /* 00 00 10 11 */
|
||||
"TTGT", /* 00 00 11 00 */
|
||||
"TTGC", /* 00 00 11 01 */
|
||||
"TTGA", /* 00 00 11 10 */
|
||||
"TTGG", /* 00 00 11 11 */
|
||||
"TCTT", /* 00 01 00 00 */
|
||||
"TCTC", /* 00 01 00 01 */
|
||||
"TCTA", /* 00 01 00 10 */
|
||||
"TCTG", /* 00 01 00 11 */
|
||||
"TCCT", /* 00 01 01 00 */
|
||||
"TCCC", /* 00 01 01 01 */
|
||||
"TCCA", /* 00 01 01 10 */
|
||||
"TCCG", /* 00 01 01 11 */
|
||||
"TCAT", /* 00 01 10 00 */
|
||||
"TCAC", /* 00 01 10 01 */
|
||||
"TCAA", /* 00 01 10 10 */
|
||||
"TCAG", /* 00 01 10 11 */
|
||||
"TCGT", /* 00 01 11 00 */
|
||||
"TCGC", /* 00 01 11 01 */
|
||||
"TCGA", /* 00 01 11 10 */
|
||||
"TCGG", /* 00 01 11 11 */
|
||||
"TATT", /* 00 10 00 00 */
|
||||
"TATC", /* 00 10 00 01 */
|
||||
"TATA", /* 00 10 00 10 */
|
||||
"TATG", /* 00 10 00 11 */
|
||||
"TACT", /* 00 10 01 00 */
|
||||
"TACC", /* 00 10 01 01 */
|
||||
"TACA", /* 00 10 01 10 */
|
||||
"TACG", /* 00 10 01 11 */
|
||||
"TAAT", /* 00 10 10 00 */
|
||||
"TAAC", /* 00 10 10 01 */
|
||||
"TAAA", /* 00 10 10 10 */
|
||||
"TAAG", /* 00 10 10 11 */
|
||||
"TAGT", /* 00 10 11 00 */
|
||||
"TAGC", /* 00 10 11 01 */
|
||||
"TAGA", /* 00 10 11 10 */
|
||||
"TAGG", /* 00 10 11 11 */
|
||||
"TGTT", /* 00 11 00 00 */
|
||||
"TGTC", /* 00 11 00 01 */
|
||||
"TGTA", /* 00 11 00 10 */
|
||||
"TGTG", /* 00 11 00 11 */
|
||||
"TGCT", /* 00 11 01 00 */
|
||||
"TGCC", /* 00 11 01 01 */
|
||||
"TGCA", /* 00 11 01 10 */
|
||||
"TGCG", /* 00 11 01 11 */
|
||||
"TGAT", /* 00 11 10 00 */
|
||||
"TGAC", /* 00 11 10 01 */
|
||||
"TGAA", /* 00 11 10 10 */
|
||||
"TGAG", /* 00 11 10 11 */
|
||||
"TGGT", /* 00 11 11 00 */
|
||||
"TGGC", /* 00 11 11 01 */
|
||||
"TGGA", /* 00 11 11 10 */
|
||||
"TGGG", /* 00 11 11 11 */
|
||||
"CTTT", /* 01 00 00 00 */
|
||||
"CTTC", /* 01 00 00 01 */
|
||||
"CTTA", /* 01 00 00 10 */
|
||||
"CTTG", /* 01 00 00 11 */
|
||||
"CTCT", /* 01 00 01 00 */
|
||||
"CTCC", /* 01 00 01 01 */
|
||||
"CTCA", /* 01 00 01 10 */
|
||||
"CTCG", /* 01 00 01 11 */
|
||||
"CTAT", /* 01 00 10 00 */
|
||||
"CTAC", /* 01 00 10 01 */
|
||||
"CTAA", /* 01 00 10 10 */
|
||||
"CTAG", /* 01 00 10 11 */
|
||||
"CTGT", /* 01 00 11 00 */
|
||||
"CTGC", /* 01 00 11 01 */
|
||||
"CTGA", /* 01 00 11 10 */
|
||||
"CTGG", /* 01 00 11 11 */
|
||||
"CCTT", /* 01 01 00 00 */
|
||||
"CCTC", /* 01 01 00 01 */
|
||||
"CCTA", /* 01 01 00 10 */
|
||||
"CCTG", /* 01 01 00 11 */
|
||||
"CCCT", /* 01 01 01 00 */
|
||||
"CCCC", /* 01 01 01 01 */
|
||||
"CCCA", /* 01 01 01 10 */
|
||||
"CCCG", /* 01 01 01 11 */
|
||||
"CCAT", /* 01 01 10 00 */
|
||||
"CCAC", /* 01 01 10 01 */
|
||||
"CCAA", /* 01 01 10 10 */
|
||||
"CCAG", /* 01 01 10 11 */
|
||||
"CCGT", /* 01 01 11 00 */
|
||||
"CCGC", /* 01 01 11 01 */
|
||||
"CCGA", /* 01 01 11 10 */
|
||||
"CCGG", /* 01 01 11 11 */
|
||||
"CATT", /* 01 10 00 00 */
|
||||
"CATC", /* 01 10 00 01 */
|
||||
"CATA", /* 01 10 00 10 */
|
||||
"CATG", /* 01 10 00 11 */
|
||||
"CACT", /* 01 10 01 00 */
|
||||
"CACC", /* 01 10 01 01 */
|
||||
"CACA", /* 01 10 01 10 */
|
||||
"CACG", /* 01 10 01 11 */
|
||||
"CAAT", /* 01 10 10 00 */
|
||||
"CAAC", /* 01 10 10 01 */
|
||||
"CAAA", /* 01 10 10 10 */
|
||||
"CAAG", /* 01 10 10 11 */
|
||||
"CAGT", /* 01 10 11 00 */
|
||||
"CAGC", /* 01 10 11 01 */
|
||||
"CAGA", /* 01 10 11 10 */
|
||||
"CAGG", /* 01 10 11 11 */
|
||||
"CGTT", /* 01 11 00 00 */
|
||||
"CGTC", /* 01 11 00 01 */
|
||||
"CGTA", /* 01 11 00 10 */
|
||||
"CGTG", /* 01 11 00 11 */
|
||||
"CGCT", /* 01 11 01 00 */
|
||||
"CGCC", /* 01 11 01 01 */
|
||||
"CGCA", /* 01 11 01 10 */
|
||||
"CGCG", /* 01 11 01 11 */
|
||||
"CGAT", /* 01 11 10 00 */
|
||||
"CGAC", /* 01 11 10 01 */
|
||||
"CGAA", /* 01 11 10 10 */
|
||||
"CGAG", /* 01 11 10 11 */
|
||||
"CGGT", /* 01 11 11 00 */
|
||||
"CGGC", /* 01 11 11 01 */
|
||||
"CGGA", /* 01 11 11 10 */
|
||||
"CGGG", /* 01 11 11 11 */
|
||||
"ATTT", /* 10 00 00 00 */
|
||||
"ATTC", /* 10 00 00 01 */
|
||||
"ATTA", /* 10 00 00 10 */
|
||||
"ATTG", /* 10 00 00 11 */
|
||||
"ATCT", /* 10 00 01 00 */
|
||||
"ATCC", /* 10 00 01 01 */
|
||||
"ATCA", /* 10 00 01 10 */
|
||||
"ATCG", /* 10 00 01 11 */
|
||||
"ATAT", /* 10 00 10 00 */
|
||||
"ATAC", /* 10 00 10 01 */
|
||||
"ATAA", /* 10 00 10 10 */
|
||||
"ATAG", /* 10 00 10 11 */
|
||||
"ATGT", /* 10 00 11 00 */
|
||||
"ATGC", /* 10 00 11 01 */
|
||||
"ATGA", /* 10 00 11 10 */
|
||||
"ATGG", /* 10 00 11 11 */
|
||||
"ACTT", /* 10 01 00 00 */
|
||||
"ACTC", /* 10 01 00 01 */
|
||||
"ACTA", /* 10 01 00 10 */
|
||||
"ACTG", /* 10 01 00 11 */
|
||||
"ACCT", /* 10 01 01 00 */
|
||||
"ACCC", /* 10 01 01 01 */
|
||||
"ACCA", /* 10 01 01 10 */
|
||||
"ACCG", /* 10 01 01 11 */
|
||||
"ACAT", /* 10 01 10 00 */
|
||||
"ACAC", /* 10 01 10 01 */
|
||||
"ACAA", /* 10 01 10 10 */
|
||||
"ACAG", /* 10 01 10 11 */
|
||||
"ACGT", /* 10 01 11 00 */
|
||||
"ACGC", /* 10 01 11 01 */
|
||||
"ACGA", /* 10 01 11 10 */
|
||||
"ACGG", /* 10 01 11 11 */
|
||||
"AATT", /* 10 10 00 00 */
|
||||
"AATC", /* 10 10 00 01 */
|
||||
"AATA", /* 10 10 00 10 */
|
||||
"AATG", /* 10 10 00 11 */
|
||||
"AACT", /* 10 10 01 00 */
|
||||
"AACC", /* 10 10 01 01 */
|
||||
"AACA", /* 10 10 01 10 */
|
||||
"AACG", /* 10 10 01 11 */
|
||||
"AAAT", /* 10 10 10 00 */
|
||||
"AAAC", /* 10 10 10 01 */
|
||||
"AAAA", /* 10 10 10 10 */
|
||||
"AAAG", /* 10 10 10 11 */
|
||||
"AAGT", /* 10 10 11 00 */
|
||||
"AAGC", /* 10 10 11 01 */
|
||||
"AAGA", /* 10 10 11 10 */
|
||||
"AAGG", /* 10 10 11 11 */
|
||||
"AGTT", /* 10 11 00 00 */
|
||||
"AGTC", /* 10 11 00 01 */
|
||||
"AGTA", /* 10 11 00 10 */
|
||||
"AGTG", /* 10 11 00 11 */
|
||||
"AGCT", /* 10 11 01 00 */
|
||||
"AGCC", /* 10 11 01 01 */
|
||||
"AGCA", /* 10 11 01 10 */
|
||||
"AGCG", /* 10 11 01 11 */
|
||||
"AGAT", /* 10 11 10 00 */
|
||||
"AGAC", /* 10 11 10 01 */
|
||||
"AGAA", /* 10 11 10 10 */
|
||||
"AGAG", /* 10 11 10 11 */
|
||||
"AGGT", /* 10 11 11 00 */
|
||||
"AGGC", /* 10 11 11 01 */
|
||||
"AGGA", /* 10 11 11 10 */
|
||||
"AGGG", /* 10 11 11 11 */
|
||||
"GTTT", /* 11 00 00 00 */
|
||||
"GTTC", /* 11 00 00 01 */
|
||||
"GTTA", /* 11 00 00 10 */
|
||||
"GTTG", /* 11 00 00 11 */
|
||||
"GTCT", /* 11 00 01 00 */
|
||||
"GTCC", /* 11 00 01 01 */
|
||||
"GTCA", /* 11 00 01 10 */
|
||||
"GTCG", /* 11 00 01 11 */
|
||||
"GTAT", /* 11 00 10 00 */
|
||||
"GTAC", /* 11 00 10 01 */
|
||||
"GTAA", /* 11 00 10 10 */
|
||||
"GTAG", /* 11 00 10 11 */
|
||||
"GTGT", /* 11 00 11 00 */
|
||||
"GTGC", /* 11 00 11 01 */
|
||||
"GTGA", /* 11 00 11 10 */
|
||||
"GTGG", /* 11 00 11 11 */
|
||||
"GCTT", /* 11 01 00 00 */
|
||||
"GCTC", /* 11 01 00 01 */
|
||||
"GCTA", /* 11 01 00 10 */
|
||||
"GCTG", /* 11 01 00 11 */
|
||||
"GCCT", /* 11 01 01 00 */
|
||||
"GCCC", /* 11 01 01 01 */
|
||||
"GCCA", /* 11 01 01 10 */
|
||||
"GCCG", /* 11 01 01 11 */
|
||||
"GCAT", /* 11 01 10 00 */
|
||||
"GCAC", /* 11 01 10 01 */
|
||||
"GCAA", /* 11 01 10 10 */
|
||||
"GCAG", /* 11 01 10 11 */
|
||||
"GCGT", /* 11 01 11 00 */
|
||||
"GCGC", /* 11 01 11 01 */
|
||||
"GCGA", /* 11 01 11 10 */
|
||||
"GCGG", /* 11 01 11 11 */
|
||||
"GATT", /* 11 10 00 00 */
|
||||
"GATC", /* 11 10 00 01 */
|
||||
"GATA", /* 11 10 00 10 */
|
||||
"GATG", /* 11 10 00 11 */
|
||||
"GACT", /* 11 10 01 00 */
|
||||
"GACC", /* 11 10 01 01 */
|
||||
"GACA", /* 11 10 01 10 */
|
||||
"GACG", /* 11 10 01 11 */
|
||||
"GAAT", /* 11 10 10 00 */
|
||||
"GAAC", /* 11 10 10 01 */
|
||||
"GAAA", /* 11 10 10 10 */
|
||||
"GAAG", /* 11 10 10 11 */
|
||||
"GAGT", /* 11 10 11 00 */
|
||||
"GAGC", /* 11 10 11 01 */
|
||||
"GAGA", /* 11 10 11 10 */
|
||||
"GAGG", /* 11 10 11 11 */
|
||||
"GGTT", /* 11 11 00 00 */
|
||||
"GGTC", /* 11 11 00 01 */
|
||||
"GGTA", /* 11 11 00 10 */
|
||||
"GGTG", /* 11 11 00 11 */
|
||||
"GGCT", /* 11 11 01 00 */
|
||||
"GGCC", /* 11 11 01 01 */
|
||||
"GGCA", /* 11 11 01 10 */
|
||||
"GGCG", /* 11 11 01 11 */
|
||||
"GGAT", /* 11 11 10 00 */
|
||||
"GGAC", /* 11 11 10 01 */
|
||||
"GGAA", /* 11 11 10 10 */
|
||||
"GGAG", /* 11 11 10 11 */
|
||||
"GGGT", /* 11 11 11 00 */
|
||||
"GGGC", /* 11 11 11 01 */
|
||||
"GGGA", /* 11 11 11 10 */
|
||||
"GGGG", /* 11 11 11 11 */
|
||||
};
|
||||
|
||||
static int
|
||||
extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) {
|
||||
uint32_t i;
|
||||
const uint32_t size = end - start;
|
||||
const uint32_t byteStart = start / 4;
|
||||
const uint32_t byteEnd = (end + 3) / 4;
|
||||
|
||||
if (byteSize != byteEnd - byteStart) {
|
||||
PyErr_Format(PyExc_RuntimeError,
|
||||
"unexpected number of bytes %u (expected %u)",
|
||||
byteSize, byteEnd - byteStart);
|
||||
return -1;
|
||||
}
|
||||
|
||||
start -= byteStart * 4;
|
||||
if (byteStart + 1 == byteEnd) {
|
||||
/* one byte only */
|
||||
memcpy(sequence, &(bases[*bytes][start]), size);
|
||||
}
|
||||
else {
|
||||
end -= byteEnd * 4;
|
||||
/* end is now a negative number equal to the distance to the byte end */
|
||||
memcpy(sequence, &(bases[*bytes][start]), 4 - start);
|
||||
bytes++;
|
||||
sequence += (4 - start);
|
||||
for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4)
|
||||
memcpy(sequence, bases[*bytes], 4);
|
||||
memcpy(sequence, bases[*bytes], end + 4);
|
||||
bytes++;
|
||||
bytes -= byteSize;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks)
|
||||
{
|
||||
const Py_ssize_t nBlockCount = nBlocks->shape[0];
|
||||
const uint32_t* const nBlockPositions = nBlocks->buf;
|
||||
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < nBlockCount; i++) {
|
||||
uint32_t nBlockStart = nBlockPositions[2*i];
|
||||
uint32_t nBlockEnd = nBlockPositions[2*i+1];
|
||||
if (nBlockEnd < start) continue;
|
||||
if (end < nBlockStart) break;
|
||||
if (nBlockStart < start) nBlockStart = start;
|
||||
if (end < nBlockEnd) nBlockEnd = end;
|
||||
memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks)
|
||||
{
|
||||
const Py_ssize_t maskBlockCount = maskBlocks->shape[0];
|
||||
const uint32_t* const maskBlockPositions = maskBlocks->buf;
|
||||
const char diff = 'a' - 'A';
|
||||
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < maskBlockCount; i++) {
|
||||
uint32_t j;
|
||||
uint32_t maskBlockStart = maskBlockPositions[2*i];
|
||||
uint32_t maskBlockEnd = maskBlockPositions[2*i+1];
|
||||
if (maskBlockEnd < start) continue;
|
||||
if (end < maskBlockStart) break;
|
||||
if (maskBlockStart < start) maskBlockStart = start;
|
||||
if (end < maskBlockEnd) maskBlockEnd = end;
|
||||
for (j = maskBlockStart - start; j < maskBlockEnd - start; j++)
|
||||
sequence[j] += diff;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
blocks_converter(PyObject* object, void* pointer)
|
||||
{
|
||||
const int flag = PyBUF_ND | PyBUF_FORMAT;
|
||||
Py_buffer *view = pointer;
|
||||
|
||||
if (object == NULL) goto exit;
|
||||
|
||||
if (PyObject_GetBuffer(object, view, flag) == -1) {
|
||||
PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (view->itemsize != sizeof(uint32_t)
|
||||
|| (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) {
|
||||
PyErr_Format(PyExc_RuntimeError,
|
||||
"blocks have incorrect data type (itemsize %zd, format %s)",
|
||||
view->itemsize, view->format);
|
||||
goto exit;
|
||||
}
|
||||
if (view->ndim != 2) {
|
||||
PyErr_Format(PyExc_RuntimeError,
|
||||
"blocks have incorrect rank %d (expected 2)", view->ndim);
|
||||
goto exit;
|
||||
}
|
||||
if (view->shape[1] != 2) {
|
||||
PyErr_Format(PyExc_RuntimeError,
|
||||
"blocks should have two colums (found %zd)",
|
||||
view->shape[1]);
|
||||
goto exit;
|
||||
}
|
||||
return Py_CLEANUP_SUPPORTED;
|
||||
|
||||
exit:
|
||||
PyBuffer_Release(view);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object";
|
||||
|
||||
static PyObject*
|
||||
TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords)
|
||||
{
|
||||
const unsigned char *data;
|
||||
Py_ssize_t start;
|
||||
Py_ssize_t end;
|
||||
Py_ssize_t step;
|
||||
Py_ssize_t size;
|
||||
Py_ssize_t length;
|
||||
Py_buffer nBlocks;
|
||||
Py_buffer maskBlocks;
|
||||
PyObject *object;
|
||||
char *sequence;
|
||||
|
||||
static char* kwlist[] = {"data", "start", "end", "step",
|
||||
"nBlocks", "maskBlocks", NULL};
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist,
|
||||
&data, &length, &start, &end, &step,
|
||||
&blocks_converter, &nBlocks,
|
||||
&blocks_converter, &maskBlocks))
|
||||
return NULL;
|
||||
|
||||
size = (end - start) / step;
|
||||
object = PyBytes_FromStringAndSize(NULL, size);
|
||||
if (!object) goto exit;
|
||||
|
||||
sequence = PyBytes_AS_STRING(object);
|
||||
|
||||
if (step == 1) {
|
||||
if (extract(data, length, start, end, sequence) < 0) {
|
||||
Py_DECREF(object);
|
||||
object = NULL;
|
||||
goto exit;
|
||||
}
|
||||
applyNs(sequence, start, end, &nBlocks);
|
||||
applyMask(sequence, start, end, &maskBlocks);
|
||||
}
|
||||
else {
|
||||
Py_ssize_t current, i;
|
||||
Py_ssize_t full_start, full_end;
|
||||
char* full_sequence;
|
||||
if (start <= end) {
|
||||
full_start = start;
|
||||
full_end = end;
|
||||
current = 0; /* first position in sequence */
|
||||
}
|
||||
else {
|
||||
full_start = end + 1;
|
||||
full_end = start + 1;
|
||||
current = start - end - 1; /* last position in sequence */
|
||||
}
|
||||
full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char));
|
||||
full_sequence[full_end-full_start] = '\0';
|
||||
if (!full_sequence) {
|
||||
Py_DECREF(object);
|
||||
object = NULL;
|
||||
goto exit;
|
||||
}
|
||||
if (extract(data, length, full_start, full_end, full_sequence) < 0) {
|
||||
PyMem_Free(full_sequence);
|
||||
Py_DECREF(object);
|
||||
object = NULL;
|
||||
goto exit;
|
||||
}
|
||||
applyNs(full_sequence, full_start, full_end, &nBlocks);
|
||||
applyMask(full_sequence, full_start, full_end, &maskBlocks);
|
||||
for (i = 0; i < size; current += step, i++)
|
||||
sequence[i] = full_sequence[current];
|
||||
PyMem_Free(full_sequence);
|
||||
}
|
||||
|
||||
exit:
|
||||
blocks_converter(NULL, &nBlocks);
|
||||
blocks_converter(NULL, &maskBlocks);
|
||||
return object;
|
||||
}
|
||||
|
||||
static struct PyMethodDef _twoBitIO_methods[] = {
|
||||
{"convert",
|
||||
(PyCFunction)TwoBit_convert,
|
||||
METH_VARARGS | METH_KEYWORDS,
|
||||
TwoBit_convert__doc__
|
||||
},
|
||||
{NULL, NULL, 0, NULL} /* sentinel */
|
||||
};
|
||||
|
||||
|
||||
static struct PyModuleDef moduledef = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_twoBitIO",
|
||||
"Parser for DNA sequence data in 2bit format",
|
||||
-1,
|
||||
_twoBitIO_methods,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
PyObject *
|
||||
PyInit__twoBitIO(void)
|
||||
{
|
||||
return PyModule_Create(&moduledef);
|
||||
}
|
@ -540,7 +540,27 @@ Length 394
|
||||
|
||||
\section{Sequence files as Dictionaries}
|
||||
|
||||
We're now going to introduce three related functions in the \verb|Bio.SeqIO|
|
||||
Looping over the iterator returned by \verb|SeqIO.parse| once will exhaust the file. For self-indexed files, such as files in the twoBit format, the return value of \verb|SeqIO.parse| can also be used as a dictionary, allowing random access to the sequence contents. As in this case parsing is done on demand, the file must remain open as long as the sequence data is being accessed:
|
||||
|
||||
%doctest ../Tests/TwoBit
|
||||
\begin{minted}{pycon}
|
||||
>>> from Bio import SeqIO
|
||||
>>> handle = open("sequence.bigendian.2bit", "rb")
|
||||
>>> records = SeqIO.parse(handle, "twobit")
|
||||
>>> records.keys()
|
||||
dict_keys(['seq11111', 'seq222', 'seq3333', 'seq4', 'seq555', 'seq6'])
|
||||
>>> records['seq222']
|
||||
SeqRecord(seq=Seq('TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTC...ACA'), id='seq222', name='<unknown name>', description='<unknown description>', dbxrefs=[])
|
||||
>>> records['seq222'].seq
|
||||
Seq('TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTC...ACA')
|
||||
>>> handle.close()
|
||||
>>> records['seq222'].seq
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: cannot retrieve sequence: file is closed
|
||||
\end{minted}
|
||||
|
||||
For other file formats, \verb|Bio.SeqIO| provides three related functions
|
||||
module which allow dictionary like random access to a multi-sequence file.
|
||||
There is a trade off here between flexibility and memory usage. In summary:
|
||||
\begin{itemize}
|
||||
|
3
NEWS.rst
3
NEWS.rst
@ -29,6 +29,9 @@ atomic solvent accessible areas without third-party tools.
|
||||
Expected ``TypeError`` behaviour has been restored to the ``Seq`` object's
|
||||
string like methods (fixing a regression in Biopython 1.78).
|
||||
|
||||
A parser for twoBit (.2bit) sequence data was added to ``Bio.SeqIO``. This is
|
||||
a lazy parser that allows fast access to genome-size DNA sequence files.
|
||||
|
||||
Many thanks to the Biopython developers and community for making this release
|
||||
possible, especially the following contributors:
|
||||
|
||||
|
BIN
Tests/TwoBit/sequence.bigendian.2bit
Normal file
BIN
Tests/TwoBit/sequence.bigendian.2bit
Normal file
Binary file not shown.
32
Tests/TwoBit/sequence.fa
Normal file
32
Tests/TwoBit/sequence.fa
Normal file
@ -0,0 +1,32 @@
|
||||
>seq11111
|
||||
GTATACCCCTTGGGCAGATTTACCCCTCTCGTCCCTGTCCCGTGACGGAATCGGGTAATCCATCGACTCT
|
||||
CGACCTGNNNNNNNNNNNNNNNNNNNCGCGTTTGAGTGTCGGGGGTGCTGTCGTTGTGTCCCCTGTGAGC
|
||||
CACAGAAATGCATCTTTCCTGAGGACGAGGACGGTAGTcgtgaagcaccgtcggTTTGATTTTATTCAGC
|
||||
CTGCAGATTGGATGCATAATGGTACACATTATTCTTTATGCTGGTTCCCGACATCATTGTGGTTCGCTGC
|
||||
TCAACAATCCTGGGTCGTTTGTTGANNNNNNNNNNNNNNNNNNNNNNNCTGAATGCTCTAGTAAATAGCT
|
||||
AGGGAGaacattagatgtggCAGCAGTAGCTTCTTCATCTGTCGTTAACCTTTGCCGGTCTGTTGCCTAT
|
||||
ATCCCTAAGTCTGGTTAATGGCTCATCCTCATGTGAAGATGGGCCGGCATGCGAGCACCG
|
||||
>seq222
|
||||
TTGATCGGTGACAAATTTTTTACAAAGAACTGTAGGACTTGCTACTTCTCCCTCCCGCGTTGGTCGGTCT
|
||||
GGGACCAAGACCCCGCTAGAtcccctcttcaaNNNNNNNNNNNNNNNNNNNNNNNNNGTGATGAGAATCT
|
||||
GGGCCATCTACNNNNNNNNNNnnnnnnnngccgcggtacgtggTAGGAACAGTGTATCATCACAGGCGTC
|
||||
GGCGCATGCGGTAGAGCAAACGGAATACTCTAGTGAGATTCAGCTTTGTACCATCTACA
|
||||
>seq3333
|
||||
cgcgtaacgagaTTCCCACGCACTGATTGAGGATACGAACAACGGGAAGGTAATCACTTTGGCATCCATG
|
||||
CAGGCTTTCTTTTGGTTTCGCTGTATCGAATTcagatcgggtagatgACCTAGGTTAGTACATTCACTAG
|
||||
GGAGAGTAAAGTGGAGCCCGGGTACCTCGGCTTGTGCTGTTGCTTAAATCGGCCCTCGCGGTTGTCGATA
|
||||
AACTCGCTTAGTCGAACGACTCACACGGAACGTAATATTTGCCCGCGCATAGGACCTACCCAACTTCGGG
|
||||
AATGATATTTAGatcgaacggcAATAGACAGAGGGGCTGAGCAGGTCCAAAAACACAGAGGATACTTACG
|
||||
CACTCTTGCGGGGGTATTCCCCTGATCCCCCTCAAAACGTACGACCGTACCAGCCAGGTCCGCCGTTTAG
|
||||
CTGTGCCCCCGTCATCCAAAGTttcatcctcagctctacatggttgcgggttcaaattaacgaggatctg
|
||||
>seq4
|
||||
NNNNNNNNNNNNNNNNNGACAAGGTTCCTGGGGTGATGCACACGCTCGGAGGGTGGGTAAATCAAAACGG
|
||||
CGGAAATAATGTCATACATGTAACAGATTATCAACCCGtagtcttccgctagtCAAGAATTTAGTAATGG
|
||||
TGTGAGGGAGGGGAAATTCTGATCGCACGAATATGAAGTGTAATGACGGCCGAGAAGCCGAGGAGAAAGC
|
||||
TCACAGTGATCCGCATTCGTCTGCTCCATGGTTTATCCATCGGCGCCGGGAATTTGTATACAAGACCCTG
|
||||
CGGTAATTATGCACCTAACGCCAGTATTGGCATTAGATCTAGTCGCNNNNNNNNNNNNNNNTG
|
||||
>seq555
|
||||
CATGTATCGCATGCGGGACGTTTtgtcctgacataggttgTAGTTCCGTCATACCGTGAGATGGGTCCCT
|
||||
TCACCCGATGGAGCTAGAAGGAAGATTTGCACCATGGATGGNNNNNNNNNNNNNNNN
|
||||
>seq6
|
||||
ACGTacgtNNNNnn
|
BIN
Tests/TwoBit/sequence.littleendian.2bit
Normal file
BIN
Tests/TwoBit/sequence.littleendian.2bit
Normal file
Binary file not shown.
BIN
Tests/TwoBit/sequence.long.2bit
Normal file
BIN
Tests/TwoBit/sequence.long.2bit
Normal file
Binary file not shown.
69
Tests/test_SeqIO_TwoBitIO.py
Normal file
69
Tests/test_SeqIO_TwoBitIO.py
Normal file
@ -0,0 +1,69 @@
|
||||
"""Tests for SeqIO TwoBitIO module."""
|
||||
|
||||
|
||||
import os
|
||||
import random
|
||||
import unittest
|
||||
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqIO import TwoBitIO
|
||||
|
||||
|
||||
class Parsing(unittest.TestCase):
|
||||
"""Test parsing 2bit files."""
|
||||
|
||||
def setUp(self):
|
||||
path = "TwoBit/sequence.fa"
|
||||
records = SeqIO.parse(path, "fasta")
|
||||
self.records = list(records)
|
||||
|
||||
def test_littleendian(self):
|
||||
path = "TwoBit/sequence.littleendian.2bit"
|
||||
with open(path, "rb") as handle:
|
||||
records = TwoBitIO.TwoBitIterator(handle)
|
||||
self.assertEqual(records.byteorder, "little")
|
||||
self.assertEqual(len(self.records), len(records))
|
||||
for record1, record2 in zip(self.records, records):
|
||||
self.assertEqual(record1.id, record2.id)
|
||||
seq1 = record1.seq
|
||||
seq2 = record2.seq
|
||||
self.assertEqual(seq1, seq2)
|
||||
n = len(seq1)
|
||||
for i in range(n):
|
||||
for j in range(i, n):
|
||||
self.assertEqual(seq1[i:j], seq2[i:j])
|
||||
self.assertEqual(repr(seq1[i:j]), repr(seq2[i:j]))
|
||||
|
||||
def test_bigendian(self):
|
||||
path = "TwoBit/sequence.bigendian.2bit"
|
||||
with open(path, "rb") as handle:
|
||||
records = TwoBitIO.TwoBitIterator(handle)
|
||||
self.assertEqual(len(records), 6)
|
||||
self.assertEqual(records.byteorder, "big")
|
||||
for record1, record2 in zip(self.records, records):
|
||||
self.assertEqual(record1.id, record2.id)
|
||||
seq1 = record1.seq
|
||||
seq2 = record2.seq
|
||||
self.assertEqual(seq1, seq2)
|
||||
n = len(seq1)
|
||||
for i in range(n):
|
||||
for j in range(i, n):
|
||||
self.assertEqual(seq1[i:j], seq2[i:j])
|
||||
self.assertEqual(repr(seq1[i:j]), repr(seq2[i:j]))
|
||||
|
||||
def test_sequence_long(self):
|
||||
path = "TwoBit/sequence.long.2bit"
|
||||
with open(path, "rb") as handle:
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
TwoBitIO.TwoBitIterator(handle)
|
||||
self.assertEqual(
|
||||
str(cm.exception),
|
||||
"version-1 twoBit files with 64-bit offsets for index are currently not supported",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = unittest.TextTestRunner(verbosity=2)
|
||||
unittest.main(testRunner=runner)
|
1
setup.py
1
setup.py
@ -198,6 +198,7 @@ EXTENSIONS = [
|
||||
"Bio.Cluster._cluster", ["Bio/Cluster/cluster.c", "Bio/Cluster/clustermodule.c"]
|
||||
),
|
||||
Extension("Bio.PDB.kdtrees", ["Bio/PDB/kdtrees.c"]),
|
||||
Extension("Bio.SeqIO._twoBitIO", ["Bio/SeqIO/_twoBitIO.c"]),
|
||||
]
|
||||
|
||||
# We now define the Biopython version number in Bio/__init__.py
|
||||
|
Reference in New Issue
Block a user