mirror of
https://github.com/biopython/biopython.git
synced 2025-10-27 03:54:32 +08:00
483 lines
22 KiB
C
483 lines
22 KiB
C
#define PY_SSIZE_T_CLEAN
|
|
#include "Python.h"
|
|
|
|
|
|
static const char bases[][4] = {"TTTT", /* 00 00 00 00 */
|
|
"TTTC", /* 00 00 00 01 */
|
|
"TTTA", /* 00 00 00 10 */
|
|
"TTTG", /* 00 00 00 11 */
|
|
"TTCT", /* 00 00 01 00 */
|
|
"TTCC", /* 00 00 01 01 */
|
|
"TTCA", /* 00 00 01 10 */
|
|
"TTCG", /* 00 00 01 11 */
|
|
"TTAT", /* 00 00 10 00 */
|
|
"TTAC", /* 00 00 10 01 */
|
|
"TTAA", /* 00 00 10 10 */
|
|
"TTAG", /* 00 00 10 11 */
|
|
"TTGT", /* 00 00 11 00 */
|
|
"TTGC", /* 00 00 11 01 */
|
|
"TTGA", /* 00 00 11 10 */
|
|
"TTGG", /* 00 00 11 11 */
|
|
"TCTT", /* 00 01 00 00 */
|
|
"TCTC", /* 00 01 00 01 */
|
|
"TCTA", /* 00 01 00 10 */
|
|
"TCTG", /* 00 01 00 11 */
|
|
"TCCT", /* 00 01 01 00 */
|
|
"TCCC", /* 00 01 01 01 */
|
|
"TCCA", /* 00 01 01 10 */
|
|
"TCCG", /* 00 01 01 11 */
|
|
"TCAT", /* 00 01 10 00 */
|
|
"TCAC", /* 00 01 10 01 */
|
|
"TCAA", /* 00 01 10 10 */
|
|
"TCAG", /* 00 01 10 11 */
|
|
"TCGT", /* 00 01 11 00 */
|
|
"TCGC", /* 00 01 11 01 */
|
|
"TCGA", /* 00 01 11 10 */
|
|
"TCGG", /* 00 01 11 11 */
|
|
"TATT", /* 00 10 00 00 */
|
|
"TATC", /* 00 10 00 01 */
|
|
"TATA", /* 00 10 00 10 */
|
|
"TATG", /* 00 10 00 11 */
|
|
"TACT", /* 00 10 01 00 */
|
|
"TACC", /* 00 10 01 01 */
|
|
"TACA", /* 00 10 01 10 */
|
|
"TACG", /* 00 10 01 11 */
|
|
"TAAT", /* 00 10 10 00 */
|
|
"TAAC", /* 00 10 10 01 */
|
|
"TAAA", /* 00 10 10 10 */
|
|
"TAAG", /* 00 10 10 11 */
|
|
"TAGT", /* 00 10 11 00 */
|
|
"TAGC", /* 00 10 11 01 */
|
|
"TAGA", /* 00 10 11 10 */
|
|
"TAGG", /* 00 10 11 11 */
|
|
"TGTT", /* 00 11 00 00 */
|
|
"TGTC", /* 00 11 00 01 */
|
|
"TGTA", /* 00 11 00 10 */
|
|
"TGTG", /* 00 11 00 11 */
|
|
"TGCT", /* 00 11 01 00 */
|
|
"TGCC", /* 00 11 01 01 */
|
|
"TGCA", /* 00 11 01 10 */
|
|
"TGCG", /* 00 11 01 11 */
|
|
"TGAT", /* 00 11 10 00 */
|
|
"TGAC", /* 00 11 10 01 */
|
|
"TGAA", /* 00 11 10 10 */
|
|
"TGAG", /* 00 11 10 11 */
|
|
"TGGT", /* 00 11 11 00 */
|
|
"TGGC", /* 00 11 11 01 */
|
|
"TGGA", /* 00 11 11 10 */
|
|
"TGGG", /* 00 11 11 11 */
|
|
"CTTT", /* 01 00 00 00 */
|
|
"CTTC", /* 01 00 00 01 */
|
|
"CTTA", /* 01 00 00 10 */
|
|
"CTTG", /* 01 00 00 11 */
|
|
"CTCT", /* 01 00 01 00 */
|
|
"CTCC", /* 01 00 01 01 */
|
|
"CTCA", /* 01 00 01 10 */
|
|
"CTCG", /* 01 00 01 11 */
|
|
"CTAT", /* 01 00 10 00 */
|
|
"CTAC", /* 01 00 10 01 */
|
|
"CTAA", /* 01 00 10 10 */
|
|
"CTAG", /* 01 00 10 11 */
|
|
"CTGT", /* 01 00 11 00 */
|
|
"CTGC", /* 01 00 11 01 */
|
|
"CTGA", /* 01 00 11 10 */
|
|
"CTGG", /* 01 00 11 11 */
|
|
"CCTT", /* 01 01 00 00 */
|
|
"CCTC", /* 01 01 00 01 */
|
|
"CCTA", /* 01 01 00 10 */
|
|
"CCTG", /* 01 01 00 11 */
|
|
"CCCT", /* 01 01 01 00 */
|
|
"CCCC", /* 01 01 01 01 */
|
|
"CCCA", /* 01 01 01 10 */
|
|
"CCCG", /* 01 01 01 11 */
|
|
"CCAT", /* 01 01 10 00 */
|
|
"CCAC", /* 01 01 10 01 */
|
|
"CCAA", /* 01 01 10 10 */
|
|
"CCAG", /* 01 01 10 11 */
|
|
"CCGT", /* 01 01 11 00 */
|
|
"CCGC", /* 01 01 11 01 */
|
|
"CCGA", /* 01 01 11 10 */
|
|
"CCGG", /* 01 01 11 11 */
|
|
"CATT", /* 01 10 00 00 */
|
|
"CATC", /* 01 10 00 01 */
|
|
"CATA", /* 01 10 00 10 */
|
|
"CATG", /* 01 10 00 11 */
|
|
"CACT", /* 01 10 01 00 */
|
|
"CACC", /* 01 10 01 01 */
|
|
"CACA", /* 01 10 01 10 */
|
|
"CACG", /* 01 10 01 11 */
|
|
"CAAT", /* 01 10 10 00 */
|
|
"CAAC", /* 01 10 10 01 */
|
|
"CAAA", /* 01 10 10 10 */
|
|
"CAAG", /* 01 10 10 11 */
|
|
"CAGT", /* 01 10 11 00 */
|
|
"CAGC", /* 01 10 11 01 */
|
|
"CAGA", /* 01 10 11 10 */
|
|
"CAGG", /* 01 10 11 11 */
|
|
"CGTT", /* 01 11 00 00 */
|
|
"CGTC", /* 01 11 00 01 */
|
|
"CGTA", /* 01 11 00 10 */
|
|
"CGTG", /* 01 11 00 11 */
|
|
"CGCT", /* 01 11 01 00 */
|
|
"CGCC", /* 01 11 01 01 */
|
|
"CGCA", /* 01 11 01 10 */
|
|
"CGCG", /* 01 11 01 11 */
|
|
"CGAT", /* 01 11 10 00 */
|
|
"CGAC", /* 01 11 10 01 */
|
|
"CGAA", /* 01 11 10 10 */
|
|
"CGAG", /* 01 11 10 11 */
|
|
"CGGT", /* 01 11 11 00 */
|
|
"CGGC", /* 01 11 11 01 */
|
|
"CGGA", /* 01 11 11 10 */
|
|
"CGGG", /* 01 11 11 11 */
|
|
"ATTT", /* 10 00 00 00 */
|
|
"ATTC", /* 10 00 00 01 */
|
|
"ATTA", /* 10 00 00 10 */
|
|
"ATTG", /* 10 00 00 11 */
|
|
"ATCT", /* 10 00 01 00 */
|
|
"ATCC", /* 10 00 01 01 */
|
|
"ATCA", /* 10 00 01 10 */
|
|
"ATCG", /* 10 00 01 11 */
|
|
"ATAT", /* 10 00 10 00 */
|
|
"ATAC", /* 10 00 10 01 */
|
|
"ATAA", /* 10 00 10 10 */
|
|
"ATAG", /* 10 00 10 11 */
|
|
"ATGT", /* 10 00 11 00 */
|
|
"ATGC", /* 10 00 11 01 */
|
|
"ATGA", /* 10 00 11 10 */
|
|
"ATGG", /* 10 00 11 11 */
|
|
"ACTT", /* 10 01 00 00 */
|
|
"ACTC", /* 10 01 00 01 */
|
|
"ACTA", /* 10 01 00 10 */
|
|
"ACTG", /* 10 01 00 11 */
|
|
"ACCT", /* 10 01 01 00 */
|
|
"ACCC", /* 10 01 01 01 */
|
|
"ACCA", /* 10 01 01 10 */
|
|
"ACCG", /* 10 01 01 11 */
|
|
"ACAT", /* 10 01 10 00 */
|
|
"ACAC", /* 10 01 10 01 */
|
|
"ACAA", /* 10 01 10 10 */
|
|
"ACAG", /* 10 01 10 11 */
|
|
"ACGT", /* 10 01 11 00 */
|
|
"ACGC", /* 10 01 11 01 */
|
|
"ACGA", /* 10 01 11 10 */
|
|
"ACGG", /* 10 01 11 11 */
|
|
"AATT", /* 10 10 00 00 */
|
|
"AATC", /* 10 10 00 01 */
|
|
"AATA", /* 10 10 00 10 */
|
|
"AATG", /* 10 10 00 11 */
|
|
"AACT", /* 10 10 01 00 */
|
|
"AACC", /* 10 10 01 01 */
|
|
"AACA", /* 10 10 01 10 */
|
|
"AACG", /* 10 10 01 11 */
|
|
"AAAT", /* 10 10 10 00 */
|
|
"AAAC", /* 10 10 10 01 */
|
|
"AAAA", /* 10 10 10 10 */
|
|
"AAAG", /* 10 10 10 11 */
|
|
"AAGT", /* 10 10 11 00 */
|
|
"AAGC", /* 10 10 11 01 */
|
|
"AAGA", /* 10 10 11 10 */
|
|
"AAGG", /* 10 10 11 11 */
|
|
"AGTT", /* 10 11 00 00 */
|
|
"AGTC", /* 10 11 00 01 */
|
|
"AGTA", /* 10 11 00 10 */
|
|
"AGTG", /* 10 11 00 11 */
|
|
"AGCT", /* 10 11 01 00 */
|
|
"AGCC", /* 10 11 01 01 */
|
|
"AGCA", /* 10 11 01 10 */
|
|
"AGCG", /* 10 11 01 11 */
|
|
"AGAT", /* 10 11 10 00 */
|
|
"AGAC", /* 10 11 10 01 */
|
|
"AGAA", /* 10 11 10 10 */
|
|
"AGAG", /* 10 11 10 11 */
|
|
"AGGT", /* 10 11 11 00 */
|
|
"AGGC", /* 10 11 11 01 */
|
|
"AGGA", /* 10 11 11 10 */
|
|
"AGGG", /* 10 11 11 11 */
|
|
"GTTT", /* 11 00 00 00 */
|
|
"GTTC", /* 11 00 00 01 */
|
|
"GTTA", /* 11 00 00 10 */
|
|
"GTTG", /* 11 00 00 11 */
|
|
"GTCT", /* 11 00 01 00 */
|
|
"GTCC", /* 11 00 01 01 */
|
|
"GTCA", /* 11 00 01 10 */
|
|
"GTCG", /* 11 00 01 11 */
|
|
"GTAT", /* 11 00 10 00 */
|
|
"GTAC", /* 11 00 10 01 */
|
|
"GTAA", /* 11 00 10 10 */
|
|
"GTAG", /* 11 00 10 11 */
|
|
"GTGT", /* 11 00 11 00 */
|
|
"GTGC", /* 11 00 11 01 */
|
|
"GTGA", /* 11 00 11 10 */
|
|
"GTGG", /* 11 00 11 11 */
|
|
"GCTT", /* 11 01 00 00 */
|
|
"GCTC", /* 11 01 00 01 */
|
|
"GCTA", /* 11 01 00 10 */
|
|
"GCTG", /* 11 01 00 11 */
|
|
"GCCT", /* 11 01 01 00 */
|
|
"GCCC", /* 11 01 01 01 */
|
|
"GCCA", /* 11 01 01 10 */
|
|
"GCCG", /* 11 01 01 11 */
|
|
"GCAT", /* 11 01 10 00 */
|
|
"GCAC", /* 11 01 10 01 */
|
|
"GCAA", /* 11 01 10 10 */
|
|
"GCAG", /* 11 01 10 11 */
|
|
"GCGT", /* 11 01 11 00 */
|
|
"GCGC", /* 11 01 11 01 */
|
|
"GCGA", /* 11 01 11 10 */
|
|
"GCGG", /* 11 01 11 11 */
|
|
"GATT", /* 11 10 00 00 */
|
|
"GATC", /* 11 10 00 01 */
|
|
"GATA", /* 11 10 00 10 */
|
|
"GATG", /* 11 10 00 11 */
|
|
"GACT", /* 11 10 01 00 */
|
|
"GACC", /* 11 10 01 01 */
|
|
"GACA", /* 11 10 01 10 */
|
|
"GACG", /* 11 10 01 11 */
|
|
"GAAT", /* 11 10 10 00 */
|
|
"GAAC", /* 11 10 10 01 */
|
|
"GAAA", /* 11 10 10 10 */
|
|
"GAAG", /* 11 10 10 11 */
|
|
"GAGT", /* 11 10 11 00 */
|
|
"GAGC", /* 11 10 11 01 */
|
|
"GAGA", /* 11 10 11 10 */
|
|
"GAGG", /* 11 10 11 11 */
|
|
"GGTT", /* 11 11 00 00 */
|
|
"GGTC", /* 11 11 00 01 */
|
|
"GGTA", /* 11 11 00 10 */
|
|
"GGTG", /* 11 11 00 11 */
|
|
"GGCT", /* 11 11 01 00 */
|
|
"GGCC", /* 11 11 01 01 */
|
|
"GGCA", /* 11 11 01 10 */
|
|
"GGCG", /* 11 11 01 11 */
|
|
"GGAT", /* 11 11 10 00 */
|
|
"GGAC", /* 11 11 10 01 */
|
|
"GGAA", /* 11 11 10 10 */
|
|
"GGAG", /* 11 11 10 11 */
|
|
"GGGT", /* 11 11 11 00 */
|
|
"GGGC", /* 11 11 11 01 */
|
|
"GGGA", /* 11 11 11 10 */
|
|
"GGGG", /* 11 11 11 11 */
|
|
};
|
|
|
|
static int
|
|
extract(const unsigned char* bytes, Py_ssize_t byteSize,
|
|
Py_ssize_t start, Py_ssize_t end, char sequence[]) {
|
|
Py_ssize_t i;
|
|
const Py_ssize_t size = end - start;
|
|
const Py_ssize_t byteStart = start / 4;
|
|
const Py_ssize_t byteEnd = (end + 3) / 4;
|
|
|
|
if (byteSize != byteEnd - byteStart) {
|
|
PyErr_Format(PyExc_RuntimeError,
|
|
"unexpected number of bytes %u (expected %u)",
|
|
byteSize, byteEnd - byteStart);
|
|
return -1;
|
|
}
|
|
|
|
start -= byteStart * 4;
|
|
if (byteStart + 1 == byteEnd) {
|
|
/* one byte only */
|
|
memcpy(sequence, &(bases[*bytes][start]), size);
|
|
}
|
|
else {
|
|
end -= byteEnd * 4;
|
|
/* end is now a negative number equal to the distance to the byte end */
|
|
memcpy(sequence, &(bases[*bytes][start]), 4 - start);
|
|
bytes++;
|
|
sequence += (4 - start);
|
|
for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4)
|
|
memcpy(sequence, bases[*bytes], 4);
|
|
memcpy(sequence, bases[*bytes], end + 4);
|
|
bytes++;
|
|
bytes -= byteSize;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
applyNs(char sequence[], Py_ssize_t start, Py_ssize_t end, Py_buffer *nBlocks)
|
|
{
|
|
const Py_ssize_t nBlockCount = nBlocks->shape[0];
|
|
const uint32_t* const nBlockPositions = nBlocks->buf;
|
|
|
|
Py_ssize_t i;
|
|
for (i = 0; i < nBlockCount; i++) {
|
|
Py_ssize_t nBlockStart = nBlockPositions[2*i];
|
|
Py_ssize_t nBlockEnd = nBlockPositions[2*i+1];
|
|
if (nBlockEnd < start) continue;
|
|
if (end < nBlockStart) break;
|
|
if (nBlockStart < start) nBlockStart = start;
|
|
if (end < nBlockEnd) nBlockEnd = end;
|
|
memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart);
|
|
}
|
|
}
|
|
|
|
static void
|
|
applyMask(char sequence[], Py_ssize_t start, Py_ssize_t end,
|
|
Py_buffer* maskBlocks)
|
|
{
|
|
const Py_ssize_t maskBlockCount = maskBlocks->shape[0];
|
|
const uint32_t* const maskBlockPositions = maskBlocks->buf;
|
|
const char diff = 'a' - 'A';
|
|
|
|
Py_ssize_t i;
|
|
for (i = 0; i < maskBlockCount; i++) {
|
|
Py_ssize_t j;
|
|
Py_ssize_t maskBlockStart = maskBlockPositions[2*i];
|
|
Py_ssize_t maskBlockEnd = maskBlockPositions[2*i+1];
|
|
if (maskBlockEnd < start) continue;
|
|
if (end < maskBlockStart) break;
|
|
if (maskBlockStart < start) maskBlockStart = start;
|
|
if (end < maskBlockEnd) maskBlockEnd = end;
|
|
for (j = maskBlockStart - start; j < maskBlockEnd - start; j++)
|
|
sequence[j] += diff;
|
|
}
|
|
}
|
|
|
|
static int
|
|
blocks_converter(PyObject* object, void* pointer)
|
|
{
|
|
const int flag = PyBUF_ND | PyBUF_FORMAT;
|
|
Py_buffer *view = pointer;
|
|
|
|
if (object == NULL) goto exit;
|
|
|
|
if (PyObject_GetBuffer(object, view, flag) == -1) {
|
|
PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format.");
|
|
return 0;
|
|
}
|
|
|
|
if (view->itemsize != sizeof(uint32_t)
|
|
|| (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) {
|
|
PyErr_Format(PyExc_RuntimeError,
|
|
"blocks have incorrect data type (itemsize %zd, format %s)",
|
|
view->itemsize, view->format);
|
|
goto exit;
|
|
}
|
|
if (view->ndim != 2) {
|
|
PyErr_Format(PyExc_RuntimeError,
|
|
"blocks have incorrect rank %d (expected 2)", view->ndim);
|
|
goto exit;
|
|
}
|
|
if (view->shape[1] != 2) {
|
|
PyErr_Format(PyExc_RuntimeError,
|
|
"blocks should have two columns (found %zd)",
|
|
view->shape[1]);
|
|
goto exit;
|
|
}
|
|
return Py_CLEANUP_SUPPORTED;
|
|
|
|
exit:
|
|
PyBuffer_Release(view);
|
|
return 0;
|
|
}
|
|
|
|
static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object";
|
|
|
|
static PyObject*
|
|
TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords)
|
|
{
|
|
const unsigned char *data;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
Py_ssize_t step;
|
|
Py_ssize_t size;
|
|
Py_ssize_t length;
|
|
Py_buffer nBlocks;
|
|
Py_buffer maskBlocks;
|
|
PyObject *object;
|
|
char *sequence;
|
|
|
|
static char* kwlist[] = {"data", "start", "end", "step",
|
|
"nBlocks", "maskBlocks", NULL};
|
|
|
|
if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist,
|
|
&data, &length, &start, &end, &step,
|
|
&blocks_converter, &nBlocks,
|
|
&blocks_converter, &maskBlocks))
|
|
return NULL;
|
|
|
|
size = (end - start) / step;
|
|
object = PyBytes_FromStringAndSize(NULL, size);
|
|
if (!object) goto exit;
|
|
|
|
sequence = PyBytes_AS_STRING(object);
|
|
|
|
if (step == 1) {
|
|
if (extract(data, length, start, end, sequence) < 0) {
|
|
Py_DECREF(object);
|
|
object = NULL;
|
|
goto exit;
|
|
}
|
|
applyNs(sequence, start, end, &nBlocks);
|
|
applyMask(sequence, start, end, &maskBlocks);
|
|
}
|
|
else {
|
|
Py_ssize_t current, i;
|
|
Py_ssize_t full_start, full_end;
|
|
char* full_sequence;
|
|
if (start <= end) {
|
|
full_start = start;
|
|
full_end = end;
|
|
current = 0; /* first position in sequence */
|
|
}
|
|
else {
|
|
full_start = end + 1;
|
|
full_end = start + 1;
|
|
current = start - end - 1; /* last position in sequence */
|
|
}
|
|
full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char));
|
|
full_sequence[full_end-full_start] = '\0';
|
|
if (!full_sequence) {
|
|
Py_DECREF(object);
|
|
object = NULL;
|
|
goto exit;
|
|
}
|
|
if (extract(data, length, full_start, full_end, full_sequence) < 0) {
|
|
PyMem_Free(full_sequence);
|
|
Py_DECREF(object);
|
|
object = NULL;
|
|
goto exit;
|
|
}
|
|
applyNs(full_sequence, full_start, full_end, &nBlocks);
|
|
applyMask(full_sequence, full_start, full_end, &maskBlocks);
|
|
for (i = 0; i < size; current += step, i++)
|
|
sequence[i] = full_sequence[current];
|
|
PyMem_Free(full_sequence);
|
|
}
|
|
|
|
exit:
|
|
blocks_converter(NULL, &nBlocks);
|
|
blocks_converter(NULL, &maskBlocks);
|
|
return object;
|
|
}
|
|
|
|
static struct PyMethodDef _twoBitIO_methods[] = {
|
|
{"convert",
|
|
(PyCFunction)TwoBit_convert,
|
|
METH_VARARGS | METH_KEYWORDS,
|
|
TwoBit_convert__doc__
|
|
},
|
|
{NULL, NULL, 0, NULL} /* sentinel */
|
|
};
|
|
|
|
|
|
static struct PyModuleDef moduledef = {
|
|
PyModuleDef_HEAD_INIT,
|
|
"_twoBitIO",
|
|
"Parser for DNA sequence data in 2bit format",
|
|
-1,
|
|
_twoBitIO_methods,
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
PyObject *
|
|
PyInit__twoBitIO(void)
|
|
{
|
|
return PyModule_Create(&moduledef);
|
|
}
|