Added support for HMM-only searches

This commit is contained in:
Samuel Prince
2024-09-16 07:14:38 -04:00
committed by Wibowo Arindrarto
parent 213fd253b0
commit f0f6694f96
2 changed files with 35 additions and 23 deletions

View File

@ -42,7 +42,8 @@ _DIV_HEADER_OPT = "# - - -"
_DIV_QUERY_END = "//"
_DIV_QUERY_START = "Query:"
# hits
_DIV_HITS_END = "Internal CM pipeline statistics summary:"
_DIV_HITS_END_CM = "Internal CM pipeline statistics summary:"
_DIV_HITS_END_HMM = "Internal HMM-only pipeline statistics summary:"
_DIV_HIT_SCORE = "Hit scores:"
_DIV_HIT_ALIGNMENT = "Hit alignments:"
_DIV_NO_HIT = " [No hits detected that satisfy reporting thresholds]"
@ -154,7 +155,7 @@ class InfernalTextParser:
while self.line and not self.line.startswith(_DIV_QUERY_END):
hit_list = self._parse_hit(qid, qdesc)
# read through the statistics summary
if self.line.startswith(_DIV_HITS_END):
if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
while self.line and not self.line.startswith(_DIV_QUERY_END):
self.line = read_forward(self.handle)
# create qresult, set its attributes and yield
@ -192,7 +193,7 @@ class InfernalTextParser:
elif self.line.startswith(_DIV_NO_HIT):
while True:
self.line = read_forward(self.handle)
if self.line.startswith(_DIV_HITS_END):
if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
return []
elif self.line.startswith(_DIV_HIT_SCORE):
# read through the header
@ -202,7 +203,7 @@ class InfernalTextParser:
# parse the hit score table
while True:
# we've reached the end of the hit score table
if self.line.startswith(_DIV_HITS_END):
if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
# create the last hit
hit = Hit(hsp_list)
for attr, value in hit_attrs.items():
@ -275,10 +276,10 @@ class InfernalTextParser:
elif self.line.startswith(_DIV_NO_HIT):
while True:
self.line = read_forward(self.handle)
if self.line.startswith(_DIV_HITS_END):
if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
return []
# we've reached the end of the hit section
elif self.line.startswith(_DIV_HITS_END):
elif self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
# create the last hit
hit = Hit(hsp_list)
for attr, value in hit_attrs.items():
@ -356,10 +357,17 @@ class InfernalTextParser:
frag_list = []
model_seq = ""
hit_seq = ""
annot = {"NC": "", "CS": "", "similarity": "", "PP": ""}
if model == "cm":
annot = {"NC": "", "CS": "", "similarity": "", "PP": ""}
else:
annot = {"CS": "", "similarity": "", "PP": ""}
while True:
# we've reached the end of the alignment section
if self.line.startswith(_DIV_ALIGNMENT_START) or self.line.startswith(_DIV_HITS_END):
if (
self.line.startswith(_DIV_ALIGNMENT_START) or
self.line.startswith(_DIV_HITS_END_CM) or
self.line.startswith(_DIV_HITS_END_HMM)
):
# Process local end in infernal hit alignment. Local end are
# large insertion or deletion indicated by *[NN]* where N is
# the number of model positions are deleted or the number of
@ -419,21 +427,24 @@ class InfernalTextParser:
# parse the alignment blocks in the hsp
# each block have 4 (hmmonly) or 5 (cm) lines followed by an empty line
block_size = 6 if model == "cm" else 5
offset = 1 if model == "cm" else 0 # offset for the annotation line indexes
lines = [None] * block_size
for i in range(block_size):
lines[i] = self.line
self.line = read_forward(self.handle)
# get the position of the alignment in the string using the pp line
blklen = len(lines[5].strip().split()[0])
blkstart = len(lines[5]) - blklen - 4
blkend = len(lines[5]) - 4
model_seq += lines[2][blkstart:blkend]
hit_seq += lines[4][blkstart:blkend]
annot["NC"] += lines[0][blkstart:blkend]
annot["CS"] += lines[1][blkstart:blkend]
annot["similarity"] += lines[3][blkstart:blkend]
annot["PP"] += lines[5][blkstart:blkend]
# get the position of the alignment in the string using the PP line
blklen = len(lines[4+offset].strip().split()[0])
blkstart = len(lines[4+offset]) - blklen - 4
blkend = len(lines[4+offset]) - 4
model_seq += lines[1+offset][blkstart:blkend]
hit_seq += lines[3+offset][blkstart:blkend]
# NC line is specific to cm model searches
if model == "cm":
annot["NC"] += lines[0][blkstart:blkend]
annot["CS"] += lines[0+offset][blkstart:blkend]
annot["similarity"] += lines[2+offset][blkstart:blkend]
annot["PP"] += lines[4+offset][blkstart:blkend]
return frag_list

View File

@ -443,15 +443,16 @@ class CmsearchCases(unittest.TestCase):
self.assertEqual(hsp.query_start, 7)
self.assertEqual(hsp.query_end, 100)
self.assertEqual(hsp.query_endtype, "..")
self.assertEqual(hsp.hit_start, 681855)
self.assertEqual(hsp.hit_end, 681762)
self.assertEqual(hsp.hit_start, 681762)
self.assertEqual(hsp.hit_end, 681855)
self.assertEqual(hsp.hit_endtype, "..")
self.assertEqual(hsp.avg_acc, 0.74)
# first fragment
frag = hsp[0]
self.assertEqual(frag.query_start, 7)
self.assertEqual(frag.query_end, 100)
self.assertEqual(frag.hit_start, 681855)
self.assertEqual(frag.hit_end, 681762)
self.assertEqual(frag.hit_start, 681762)
self.assertEqual(frag.hit_end, 681855)
self.assertEqual(frag.hit_strand, -1)
self.assertEqual(frag.query.seq, 'UCucgGCcUUUUGGCUaaGAUCAAGUGUAGUAUCUGUUCUUuuCAGUuUAAuAuCUGauAuugucucuAuugggggccaau-uauaUUAaauuaA')
self.assertEqual(frag.hit.seq,'UCUUUGCCUUUUGGCUUAGAUCAAGUGUAGUAUCUGUUCUUUUCAGUGUAACAACUGAAAUGACCUCAAU-GAGGCUCAUUaCCUUUUAAUUUGU')