mirror of
				https://github.com/biopython/biopython.git
				synced 2025-10-27 03:54:32 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			483 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			483 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #define PY_SSIZE_T_CLEAN
 | |
| #include "Python.h"
 | |
| 
 | |
| 
 | |
| static const char bases[][4] = {"TTTT",  /* 00 00 00 00 */
 | |
|                                 "TTTC",  /* 00 00 00 01 */
 | |
|                                 "TTTA",  /* 00 00 00 10 */
 | |
|                                 "TTTG",  /* 00 00 00 11 */
 | |
|                                 "TTCT",  /* 00 00 01 00 */
 | |
|                                 "TTCC",  /* 00 00 01 01 */
 | |
|                                 "TTCA",  /* 00 00 01 10 */
 | |
|                                 "TTCG",  /* 00 00 01 11 */
 | |
|                                 "TTAT",  /* 00 00 10 00 */
 | |
|                                 "TTAC",  /* 00 00 10 01 */
 | |
|                                 "TTAA",  /* 00 00 10 10 */
 | |
|                                 "TTAG",  /* 00 00 10 11 */
 | |
|                                 "TTGT",  /* 00 00 11 00 */
 | |
|                                 "TTGC",  /* 00 00 11 01 */
 | |
|                                 "TTGA",  /* 00 00 11 10 */
 | |
|                                 "TTGG",  /* 00 00 11 11 */
 | |
|                                 "TCTT",  /* 00 01 00 00 */
 | |
|                                 "TCTC",  /* 00 01 00 01 */
 | |
|                                 "TCTA",  /* 00 01 00 10 */
 | |
|                                 "TCTG",  /* 00 01 00 11 */
 | |
|                                 "TCCT",  /* 00 01 01 00 */
 | |
|                                 "TCCC",  /* 00 01 01 01 */
 | |
|                                 "TCCA",  /* 00 01 01 10 */
 | |
|                                 "TCCG",  /* 00 01 01 11 */
 | |
|                                 "TCAT",  /* 00 01 10 00 */
 | |
|                                 "TCAC",  /* 00 01 10 01 */
 | |
|                                 "TCAA",  /* 00 01 10 10 */
 | |
|                                 "TCAG",  /* 00 01 10 11 */
 | |
|                                 "TCGT",  /* 00 01 11 00 */
 | |
|                                 "TCGC",  /* 00 01 11 01 */
 | |
|                                 "TCGA",  /* 00 01 11 10 */
 | |
|                                 "TCGG",  /* 00 01 11 11 */
 | |
|                                 "TATT",  /* 00 10 00 00 */
 | |
|                                 "TATC",  /* 00 10 00 01 */
 | |
|                                 "TATA",  /* 00 10 00 10 */
 | |
|                                 "TATG",  /* 00 10 00 11 */
 | |
|                                 "TACT",  /* 00 10 01 00 */
 | |
|                                 "TACC",  /* 00 10 01 01 */
 | |
|                                 "TACA",  /* 00 10 01 10 */
 | |
|                                 "TACG",  /* 00 10 01 11 */
 | |
|                                 "TAAT",  /* 00 10 10 00 */
 | |
|                                 "TAAC",  /* 00 10 10 01 */
 | |
|                                 "TAAA",  /* 00 10 10 10 */
 | |
|                                 "TAAG",  /* 00 10 10 11 */
 | |
|                                 "TAGT",  /* 00 10 11 00 */
 | |
|                                 "TAGC",  /* 00 10 11 01 */
 | |
|                                 "TAGA",  /* 00 10 11 10 */
 | |
|                                 "TAGG",  /* 00 10 11 11 */
 | |
|                                 "TGTT",  /* 00 11 00 00 */
 | |
|                                 "TGTC",  /* 00 11 00 01 */
 | |
|                                 "TGTA",  /* 00 11 00 10 */
 | |
|                                 "TGTG",  /* 00 11 00 11 */
 | |
|                                 "TGCT",  /* 00 11 01 00 */
 | |
|                                 "TGCC",  /* 00 11 01 01 */
 | |
|                                 "TGCA",  /* 00 11 01 10 */
 | |
|                                 "TGCG",  /* 00 11 01 11 */
 | |
|                                 "TGAT",  /* 00 11 10 00 */
 | |
|                                 "TGAC",  /* 00 11 10 01 */
 | |
|                                 "TGAA",  /* 00 11 10 10 */
 | |
|                                 "TGAG",  /* 00 11 10 11 */
 | |
|                                 "TGGT",  /* 00 11 11 00 */
 | |
|                                 "TGGC",  /* 00 11 11 01 */
 | |
|                                 "TGGA",  /* 00 11 11 10 */
 | |
|                                 "TGGG",  /* 00 11 11 11 */
 | |
|                                 "CTTT",  /* 01 00 00 00 */
 | |
|                                 "CTTC",  /* 01 00 00 01 */
 | |
|                                 "CTTA",  /* 01 00 00 10 */
 | |
|                                 "CTTG",  /* 01 00 00 11 */
 | |
|                                 "CTCT",  /* 01 00 01 00 */
 | |
|                                 "CTCC",  /* 01 00 01 01 */
 | |
|                                 "CTCA",  /* 01 00 01 10 */
 | |
|                                 "CTCG",  /* 01 00 01 11 */
 | |
|                                 "CTAT",  /* 01 00 10 00 */
 | |
|                                 "CTAC",  /* 01 00 10 01 */
 | |
|                                 "CTAA",  /* 01 00 10 10 */
 | |
|                                 "CTAG",  /* 01 00 10 11 */
 | |
|                                 "CTGT",  /* 01 00 11 00 */
 | |
|                                 "CTGC",  /* 01 00 11 01 */
 | |
|                                 "CTGA",  /* 01 00 11 10 */
 | |
|                                 "CTGG",  /* 01 00 11 11 */
 | |
|                                 "CCTT",  /* 01 01 00 00 */
 | |
|                                 "CCTC",  /* 01 01 00 01 */
 | |
|                                 "CCTA",  /* 01 01 00 10 */
 | |
|                                 "CCTG",  /* 01 01 00 11 */
 | |
|                                 "CCCT",  /* 01 01 01 00 */
 | |
|                                 "CCCC",  /* 01 01 01 01 */
 | |
|                                 "CCCA",  /* 01 01 01 10 */
 | |
|                                 "CCCG",  /* 01 01 01 11 */
 | |
|                                 "CCAT",  /* 01 01 10 00 */
 | |
|                                 "CCAC",  /* 01 01 10 01 */
 | |
|                                 "CCAA",  /* 01 01 10 10 */
 | |
|                                 "CCAG",  /* 01 01 10 11 */
 | |
|                                 "CCGT",  /* 01 01 11 00 */
 | |
|                                 "CCGC",  /* 01 01 11 01 */
 | |
|                                 "CCGA",  /* 01 01 11 10 */
 | |
|                                 "CCGG",  /* 01 01 11 11 */
 | |
|                                 "CATT",  /* 01 10 00 00 */
 | |
|                                 "CATC",  /* 01 10 00 01 */
 | |
|                                 "CATA",  /* 01 10 00 10 */
 | |
|                                 "CATG",  /* 01 10 00 11 */
 | |
|                                 "CACT",  /* 01 10 01 00 */
 | |
|                                 "CACC",  /* 01 10 01 01 */
 | |
|                                 "CACA",  /* 01 10 01 10 */
 | |
|                                 "CACG",  /* 01 10 01 11 */
 | |
|                                 "CAAT",  /* 01 10 10 00 */
 | |
|                                 "CAAC",  /* 01 10 10 01 */
 | |
|                                 "CAAA",  /* 01 10 10 10 */
 | |
|                                 "CAAG",  /* 01 10 10 11 */
 | |
|                                 "CAGT",  /* 01 10 11 00 */
 | |
|                                 "CAGC",  /* 01 10 11 01 */
 | |
|                                 "CAGA",  /* 01 10 11 10 */
 | |
|                                 "CAGG",  /* 01 10 11 11 */
 | |
|                                 "CGTT",  /* 01 11 00 00 */
 | |
|                                 "CGTC",  /* 01 11 00 01 */
 | |
|                                 "CGTA",  /* 01 11 00 10 */
 | |
|                                 "CGTG",  /* 01 11 00 11 */
 | |
|                                 "CGCT",  /* 01 11 01 00 */
 | |
|                                 "CGCC",  /* 01 11 01 01 */
 | |
|                                 "CGCA",  /* 01 11 01 10 */
 | |
|                                 "CGCG",  /* 01 11 01 11 */
 | |
|                                 "CGAT",  /* 01 11 10 00 */
 | |
|                                 "CGAC",  /* 01 11 10 01 */
 | |
|                                 "CGAA",  /* 01 11 10 10 */
 | |
|                                 "CGAG",  /* 01 11 10 11 */
 | |
|                                 "CGGT",  /* 01 11 11 00 */
 | |
|                                 "CGGC",  /* 01 11 11 01 */
 | |
|                                 "CGGA",  /* 01 11 11 10 */
 | |
|                                 "CGGG",  /* 01 11 11 11 */
 | |
|                                 "ATTT",  /* 10 00 00 00 */
 | |
|                                 "ATTC",  /* 10 00 00 01 */
 | |
|                                 "ATTA",  /* 10 00 00 10 */
 | |
|                                 "ATTG",  /* 10 00 00 11 */
 | |
|                                 "ATCT",  /* 10 00 01 00 */
 | |
|                                 "ATCC",  /* 10 00 01 01 */
 | |
|                                 "ATCA",  /* 10 00 01 10 */
 | |
|                                 "ATCG",  /* 10 00 01 11 */
 | |
|                                 "ATAT",  /* 10 00 10 00 */
 | |
|                                 "ATAC",  /* 10 00 10 01 */
 | |
|                                 "ATAA",  /* 10 00 10 10 */
 | |
|                                 "ATAG",  /* 10 00 10 11 */
 | |
|                                 "ATGT",  /* 10 00 11 00 */
 | |
|                                 "ATGC",  /* 10 00 11 01 */
 | |
|                                 "ATGA",  /* 10 00 11 10 */
 | |
|                                 "ATGG",  /* 10 00 11 11 */
 | |
|                                 "ACTT",  /* 10 01 00 00 */
 | |
|                                 "ACTC",  /* 10 01 00 01 */
 | |
|                                 "ACTA",  /* 10 01 00 10 */
 | |
|                                 "ACTG",  /* 10 01 00 11 */
 | |
|                                 "ACCT",  /* 10 01 01 00 */
 | |
|                                 "ACCC",  /* 10 01 01 01 */
 | |
|                                 "ACCA",  /* 10 01 01 10 */
 | |
|                                 "ACCG",  /* 10 01 01 11 */
 | |
|                                 "ACAT",  /* 10 01 10 00 */
 | |
|                                 "ACAC",  /* 10 01 10 01 */
 | |
|                                 "ACAA",  /* 10 01 10 10 */
 | |
|                                 "ACAG",  /* 10 01 10 11 */
 | |
|                                 "ACGT",  /* 10 01 11 00 */
 | |
|                                 "ACGC",  /* 10 01 11 01 */
 | |
|                                 "ACGA",  /* 10 01 11 10 */
 | |
|                                 "ACGG",  /* 10 01 11 11 */
 | |
|                                 "AATT",  /* 10 10 00 00 */
 | |
|                                 "AATC",  /* 10 10 00 01 */
 | |
|                                 "AATA",  /* 10 10 00 10 */
 | |
|                                 "AATG",  /* 10 10 00 11 */
 | |
|                                 "AACT",  /* 10 10 01 00 */
 | |
|                                 "AACC",  /* 10 10 01 01 */
 | |
|                                 "AACA",  /* 10 10 01 10 */
 | |
|                                 "AACG",  /* 10 10 01 11 */
 | |
|                                 "AAAT",  /* 10 10 10 00 */
 | |
|                                 "AAAC",  /* 10 10 10 01 */
 | |
|                                 "AAAA",  /* 10 10 10 10 */
 | |
|                                 "AAAG",  /* 10 10 10 11 */
 | |
|                                 "AAGT",  /* 10 10 11 00 */
 | |
|                                 "AAGC",  /* 10 10 11 01 */
 | |
|                                 "AAGA",  /* 10 10 11 10 */
 | |
|                                 "AAGG",  /* 10 10 11 11 */
 | |
|                                 "AGTT",  /* 10 11 00 00 */
 | |
|                                 "AGTC",  /* 10 11 00 01 */
 | |
|                                 "AGTA",  /* 10 11 00 10 */
 | |
|                                 "AGTG",  /* 10 11 00 11 */
 | |
|                                 "AGCT",  /* 10 11 01 00 */
 | |
|                                 "AGCC",  /* 10 11 01 01 */
 | |
|                                 "AGCA",  /* 10 11 01 10 */
 | |
|                                 "AGCG",  /* 10 11 01 11 */
 | |
|                                 "AGAT",  /* 10 11 10 00 */
 | |
|                                 "AGAC",  /* 10 11 10 01 */
 | |
|                                 "AGAA",  /* 10 11 10 10 */
 | |
|                                 "AGAG",  /* 10 11 10 11 */
 | |
|                                 "AGGT",  /* 10 11 11 00 */
 | |
|                                 "AGGC",  /* 10 11 11 01 */
 | |
|                                 "AGGA",  /* 10 11 11 10 */
 | |
|                                 "AGGG",  /* 10 11 11 11 */
 | |
|                                 "GTTT",  /* 11 00 00 00 */
 | |
|                                 "GTTC",  /* 11 00 00 01 */
 | |
|                                 "GTTA",  /* 11 00 00 10 */
 | |
|                                 "GTTG",  /* 11 00 00 11 */
 | |
|                                 "GTCT",  /* 11 00 01 00 */
 | |
|                                 "GTCC",  /* 11 00 01 01 */
 | |
|                                 "GTCA",  /* 11 00 01 10 */
 | |
|                                 "GTCG",  /* 11 00 01 11 */
 | |
|                                 "GTAT",  /* 11 00 10 00 */
 | |
|                                 "GTAC",  /* 11 00 10 01 */
 | |
|                                 "GTAA",  /* 11 00 10 10 */
 | |
|                                 "GTAG",  /* 11 00 10 11 */
 | |
|                                 "GTGT",  /* 11 00 11 00 */
 | |
|                                 "GTGC",  /* 11 00 11 01 */
 | |
|                                 "GTGA",  /* 11 00 11 10 */
 | |
|                                 "GTGG",  /* 11 00 11 11 */
 | |
|                                 "GCTT",  /* 11 01 00 00 */
 | |
|                                 "GCTC",  /* 11 01 00 01 */
 | |
|                                 "GCTA",  /* 11 01 00 10 */
 | |
|                                 "GCTG",  /* 11 01 00 11 */
 | |
|                                 "GCCT",  /* 11 01 01 00 */
 | |
|                                 "GCCC",  /* 11 01 01 01 */
 | |
|                                 "GCCA",  /* 11 01 01 10 */
 | |
|                                 "GCCG",  /* 11 01 01 11 */
 | |
|                                 "GCAT",  /* 11 01 10 00 */
 | |
|                                 "GCAC",  /* 11 01 10 01 */
 | |
|                                 "GCAA",  /* 11 01 10 10 */
 | |
|                                 "GCAG",  /* 11 01 10 11 */
 | |
|                                 "GCGT",  /* 11 01 11 00 */
 | |
|                                 "GCGC",  /* 11 01 11 01 */
 | |
|                                 "GCGA",  /* 11 01 11 10 */
 | |
|                                 "GCGG",  /* 11 01 11 11 */
 | |
|                                 "GATT",  /* 11 10 00 00 */
 | |
|                                 "GATC",  /* 11 10 00 01 */
 | |
|                                 "GATA",  /* 11 10 00 10 */
 | |
|                                 "GATG",  /* 11 10 00 11 */
 | |
|                                 "GACT",  /* 11 10 01 00 */
 | |
|                                 "GACC",  /* 11 10 01 01 */
 | |
|                                 "GACA",  /* 11 10 01 10 */
 | |
|                                 "GACG",  /* 11 10 01 11 */
 | |
|                                 "GAAT",  /* 11 10 10 00 */
 | |
|                                 "GAAC",  /* 11 10 10 01 */
 | |
|                                 "GAAA",  /* 11 10 10 10 */
 | |
|                                 "GAAG",  /* 11 10 10 11 */
 | |
|                                 "GAGT",  /* 11 10 11 00 */
 | |
|                                 "GAGC",  /* 11 10 11 01 */
 | |
|                                 "GAGA",  /* 11 10 11 10 */
 | |
|                                 "GAGG",  /* 11 10 11 11 */
 | |
|                                 "GGTT",  /* 11 11 00 00 */
 | |
|                                 "GGTC",  /* 11 11 00 01 */
 | |
|                                 "GGTA",  /* 11 11 00 10 */
 | |
|                                 "GGTG",  /* 11 11 00 11 */
 | |
|                                 "GGCT",  /* 11 11 01 00 */
 | |
|                                 "GGCC",  /* 11 11 01 01 */
 | |
|                                 "GGCA",  /* 11 11 01 10 */
 | |
|                                 "GGCG",  /* 11 11 01 11 */
 | |
|                                 "GGAT",  /* 11 11 10 00 */
 | |
|                                 "GGAC",  /* 11 11 10 01 */
 | |
|                                 "GGAA",  /* 11 11 10 10 */
 | |
|                                 "GGAG",  /* 11 11 10 11 */
 | |
|                                 "GGGT",  /* 11 11 11 00 */
 | |
|                                 "GGGC",  /* 11 11 11 01 */
 | |
|                                 "GGGA",  /* 11 11 11 10 */
 | |
|                                 "GGGG",  /* 11 11 11 11 */
 | |
|                                };
 | |
| 
 | |
| static int
 | |
| extract(const unsigned char* bytes, Py_ssize_t byteSize,
 | |
|         Py_ssize_t start, Py_ssize_t end, char sequence[]) {
 | |
|     Py_ssize_t i;
 | |
|     const Py_ssize_t size = end - start;
 | |
|     const Py_ssize_t byteStart = start / 4;
 | |
|     const Py_ssize_t byteEnd = (end + 3) / 4;
 | |
| 
 | |
|     if (byteSize != byteEnd - byteStart) {
 | |
|         PyErr_Format(PyExc_RuntimeError,
 | |
|                      "unexpected number of bytes %u (expected %u)",
 | |
|                      byteSize, byteEnd - byteStart);
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     start -= byteStart * 4;
 | |
|     if (byteStart + 1 == byteEnd) {
 | |
|         /* one byte only */
 | |
|         memcpy(sequence, &(bases[*bytes][start]), size);
 | |
|     }
 | |
|     else {
 | |
|         end -= byteEnd * 4;
 | |
|         /* end is now a negative number equal to the distance to the byte end */
 | |
|         memcpy(sequence, &(bases[*bytes][start]), 4 - start);
 | |
|         bytes++;
 | |
|         sequence += (4 - start);
 | |
|         for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4)
 | |
|             memcpy(sequence, bases[*bytes], 4);
 | |
|         memcpy(sequence, bases[*bytes], end + 4);
 | |
|         bytes++;
 | |
|         bytes -= byteSize;
 | |
|     }
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static void
 | |
| applyNs(char sequence[], Py_ssize_t start, Py_ssize_t end, Py_buffer *nBlocks)
 | |
| {
 | |
|     const Py_ssize_t nBlockCount = nBlocks->shape[0];
 | |
|     const uint32_t* const nBlockPositions = nBlocks->buf;
 | |
| 
 | |
|     Py_ssize_t i;
 | |
|     for (i = 0; i < nBlockCount; i++) {
 | |
|         Py_ssize_t nBlockStart = nBlockPositions[2*i];
 | |
|         Py_ssize_t nBlockEnd = nBlockPositions[2*i+1];
 | |
|         if (nBlockEnd < start) continue;
 | |
|         if (end < nBlockStart) break;
 | |
|         if (nBlockStart < start) nBlockStart = start;
 | |
|         if (end < nBlockEnd) nBlockEnd = end;
 | |
|         memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void
 | |
| applyMask(char sequence[], Py_ssize_t start, Py_ssize_t end,
 | |
|           Py_buffer* maskBlocks)
 | |
| {
 | |
|     const Py_ssize_t maskBlockCount = maskBlocks->shape[0];
 | |
|     const uint32_t* const maskBlockPositions = maskBlocks->buf;
 | |
|     const char diff = 'a' - 'A';
 | |
| 
 | |
|     Py_ssize_t i;
 | |
|     for (i = 0; i < maskBlockCount; i++) {
 | |
|         Py_ssize_t j;
 | |
|         Py_ssize_t maskBlockStart = maskBlockPositions[2*i];
 | |
|         Py_ssize_t maskBlockEnd = maskBlockPositions[2*i+1];
 | |
|         if (maskBlockEnd < start) continue;
 | |
|         if (end < maskBlockStart) break;
 | |
|         if (maskBlockStart < start) maskBlockStart = start;
 | |
|         if (end < maskBlockEnd) maskBlockEnd = end;
 | |
|         for (j = maskBlockStart - start; j < maskBlockEnd - start; j++)
 | |
|             sequence[j] += diff;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static int
 | |
| blocks_converter(PyObject* object, void* pointer)
 | |
| {
 | |
|     const int flag = PyBUF_ND | PyBUF_FORMAT;
 | |
|     Py_buffer *view = pointer;
 | |
| 
 | |
|     if (object == NULL) goto exit;
 | |
| 
 | |
|     if (PyObject_GetBuffer(object, view, flag) == -1) {
 | |
|         PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format.");
 | |
|         return 0;
 | |
|     }
 | |
| 
 | |
|     if (view->itemsize != sizeof(uint32_t)
 | |
|      || (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) {
 | |
|         PyErr_Format(PyExc_RuntimeError,
 | |
|                      "blocks have incorrect data type (itemsize %zd, format %s)",
 | |
|                      view->itemsize, view->format);
 | |
|         goto exit;
 | |
|     }
 | |
|     if (view->ndim != 2) {
 | |
|         PyErr_Format(PyExc_RuntimeError,
 | |
|                      "blocks have incorrect rank %d (expected 2)", view->ndim);
 | |
|         goto exit;
 | |
|     }
 | |
|     if (view->shape[1] != 2) {
 | |
|         PyErr_Format(PyExc_RuntimeError,
 | |
|                      "blocks should have two columns (found %zd)",
 | |
|                      view->shape[1]);
 | |
|         goto exit;
 | |
|     }
 | |
|     return Py_CLEANUP_SUPPORTED;
 | |
| 
 | |
| exit:
 | |
|     PyBuffer_Release(view);
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object";
 | |
| 
 | |
| static PyObject*
 | |
| TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords)
 | |
| {
 | |
|     const unsigned char *data;
 | |
|     Py_ssize_t start;
 | |
|     Py_ssize_t end;
 | |
|     Py_ssize_t step;
 | |
|     Py_ssize_t size;
 | |
|     Py_ssize_t length;
 | |
|     Py_buffer nBlocks;
 | |
|     Py_buffer maskBlocks;
 | |
|     PyObject *object;
 | |
|     char *sequence;
 | |
| 
 | |
|     static char* kwlist[] = {"data", "start", "end", "step",
 | |
|                              "nBlocks", "maskBlocks", NULL};
 | |
| 
 | |
|     if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist,
 | |
|                                      &data, &length, &start, &end, &step,
 | |
|                                      &blocks_converter, &nBlocks,
 | |
|                                      &blocks_converter, &maskBlocks))
 | |
|         return NULL;
 | |
| 
 | |
|     size = (end - start) / step;
 | |
|     object = PyBytes_FromStringAndSize(NULL, size);
 | |
|     if (!object) goto exit;
 | |
| 
 | |
|     sequence = PyBytes_AS_STRING(object);
 | |
| 
 | |
|     if (step == 1) {
 | |
|         if (extract(data, length, start, end, sequence) < 0) {
 | |
|             Py_DECREF(object);
 | |
|             object = NULL;
 | |
|             goto exit;
 | |
|         }
 | |
|         applyNs(sequence, start, end, &nBlocks);
 | |
|         applyMask(sequence, start, end, &maskBlocks);
 | |
|     }
 | |
|     else {
 | |
|         Py_ssize_t current, i;
 | |
|         Py_ssize_t full_start, full_end;
 | |
|         char* full_sequence;
 | |
|         if (start <= end) {
 | |
|             full_start = start;
 | |
|             full_end = end;
 | |
|             current = 0; /* first position in sequence */
 | |
|         }
 | |
|         else {
 | |
|             full_start = end + 1;
 | |
|             full_end = start + 1;
 | |
|             current = start - end - 1; /* last position in sequence */
 | |
|         }
 | |
|         full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char));
 | |
|         full_sequence[full_end-full_start] = '\0';
 | |
|         if (!full_sequence) {
 | |
|             Py_DECREF(object);
 | |
|             object = NULL;
 | |
|             goto exit;
 | |
|         }
 | |
|         if (extract(data, length, full_start, full_end, full_sequence) < 0) {
 | |
|             PyMem_Free(full_sequence);
 | |
|             Py_DECREF(object);
 | |
|             object = NULL;
 | |
|             goto exit;
 | |
|         }
 | |
|         applyNs(full_sequence, full_start, full_end, &nBlocks);
 | |
|         applyMask(full_sequence, full_start, full_end, &maskBlocks);
 | |
|         for (i = 0; i < size; current += step, i++)
 | |
|             sequence[i] = full_sequence[current];
 | |
|         PyMem_Free(full_sequence);
 | |
|     }
 | |
| 
 | |
| exit:
 | |
|     blocks_converter(NULL, &nBlocks);
 | |
|     blocks_converter(NULL, &maskBlocks);
 | |
|     return object;
 | |
| }
 | |
| 
 | |
| static struct PyMethodDef _twoBitIO_methods[] = {
 | |
|     {"convert",
 | |
|      (PyCFunction)TwoBit_convert,
 | |
|      METH_VARARGS | METH_KEYWORDS,
 | |
|      TwoBit_convert__doc__
 | |
|     },
 | |
|     {NULL, NULL, 0, NULL} /* sentinel */
 | |
| };
 | |
| 
 | |
| 
 | |
| static struct PyModuleDef moduledef = {
 | |
|     PyModuleDef_HEAD_INIT,
 | |
|     "_twoBitIO",
 | |
|     "Parser for DNA sequence data in 2bit format",
 | |
|     -1,
 | |
|     _twoBitIO_methods,
 | |
|     NULL,
 | |
|     NULL,
 | |
|     NULL,
 | |
|     NULL
 | |
| };
 | |
| 
 | |
| PyObject *
 | |
| PyInit__twoBitIO(void)
 | |
| {
 | |
|     return PyModule_Create(&moduledef);
 | |
| }
 |