Added support for HMM-only searches

2025-10-20 13:43:47 +08:00 · 2024-09-16 07:14:38 -04:00
parent 213fd253b0
commit f0f6694f96
2 changed files with 35 additions and 23 deletions
--- a/Bio/SearchIO/InfernalIO/infernal_text.py
+++ b/Bio/SearchIO/InfernalIO/infernal_text.py
@ -42,7 +42,8 @@ _DIV_HEADER_OPT = "# - - -"
 _DIV_QUERY_END = "//"
 _DIV_QUERY_START = "Query:"
 # hits 
-_DIV_HITS_END = "Internal CM pipeline statistics summary:"
+_DIV_HITS_END_CM = "Internal CM pipeline statistics summary:"
+_DIV_HITS_END_HMM = "Internal HMM-only pipeline statistics summary:"
 _DIV_HIT_SCORE = "Hit scores:"
 _DIV_HIT_ALIGNMENT = "Hit alignments:"
 _DIV_NO_HIT = "   [No hits detected that satisfy reporting thresholds]"
@ -154,7 +155,7 @@ class InfernalTextParser:
            while self.line and not self.line.startswith(_DIV_QUERY_END):
                hit_list = self._parse_hit(qid, qdesc)
                # read through the statistics summary
-                if self.line.startswith(_DIV_HITS_END):
+                if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
                    while self.line and not self.line.startswith(_DIV_QUERY_END):
                        self.line = read_forward(self.handle)
            # create qresult, set its attributes and yield   
@ -192,7 +193,7 @@ class InfernalTextParser:
                elif self.line.startswith(_DIV_NO_HIT):
                    while True:
                        self.line = read_forward(self.handle)
-                        if self.line.startswith(_DIV_HITS_END):
+                        if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
                            return []
                elif self.line.startswith(_DIV_HIT_SCORE):
                    # read through the header 
@ -202,7 +203,7 @@ class InfernalTextParser:
                    # parse the hit score table
                    while True:
                        # we've reached the end of the hit score table
-                        if self.line.startswith(_DIV_HITS_END):
+                        if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
                            # create the last hit 
                            hit = Hit(hsp_list)
                            for attr, value in hit_attrs.items():
@ -275,10 +276,10 @@ class InfernalTextParser:
                elif self.line.startswith(_DIV_NO_HIT):
                    while True:
                        self.line = read_forward(self.handle)
-                        if self.line.startswith(_DIV_HITS_END):
+                        if self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
                            return []
                # we've reached the end of the hit section
-                elif self.line.startswith(_DIV_HITS_END):
+                elif self.line.startswith(_DIV_HITS_END_CM) or self.line.startswith(_DIV_HITS_END_HMM):
                    # create the last hit 
                    hit = Hit(hsp_list)
                    for attr, value in hit_attrs.items():
@ -356,10 +357,17 @@ class InfernalTextParser:
        frag_list = []
        model_seq = ""
        hit_seq = ""
-        annot = {"NC": "", "CS": "", "similarity": "", "PP": ""}
+        if model == "cm":
+            annot = {"NC": "", "CS": "", "similarity": "", "PP": ""}
+        else:
+            annot = {"CS": "", "similarity": "", "PP": ""}
        while True:
            # we've reached the end of the alignment section
-            if self.line.startswith(_DIV_ALIGNMENT_START) or self.line.startswith(_DIV_HITS_END):
+            if (
+                self.line.startswith(_DIV_ALIGNMENT_START) or 
+                self.line.startswith(_DIV_HITS_END_CM) or
+                self.line.startswith(_DIV_HITS_END_HMM)
+            ):
                # Process local end in infernal hit alignment. Local end are
                # large insertion or deletion indicated by *[NN]* where N is 
                # the number of model positions are deleted or the number of
@ -419,21 +427,24 @@ class InfernalTextParser:
            # parse the alignment blocks in the hsp
            # each block have 4 (hmmonly) or 5 (cm) lines followed by an empty line
            block_size = 6 if model == "cm" else 5
+            offset = 1 if model == "cm" else 0 # offset for the annotation line indexes
            lines = [None] * block_size
            for i in range(block_size):
                lines[i] = self.line
                self.line = read_forward(self.handle)
-                                    
-            # get the position of the alignment in the string using the pp line
-            blklen = len(lines[5].strip().split()[0])
-            blkstart = len(lines[5]) - blklen - 4
-            blkend = len(lines[5]) - 4
-            model_seq += lines[2][blkstart:blkend]
-            hit_seq += lines[4][blkstart:blkend]
-            annot["NC"] += lines[0][blkstart:blkend]
-            annot["CS"] += lines[1][blkstart:blkend]
-            annot["similarity"] += lines[3][blkstart:blkend]
-            annot["PP"] += lines[5][blkstart:blkend]
+                                                
+            # get the position of the alignment in the string using the PP line
+            blklen = len(lines[4+offset].strip().split()[0])
+            blkstart = len(lines[4+offset]) - blklen - 4
+            blkend = len(lines[4+offset]) - 4
+            model_seq += lines[1+offset][blkstart:blkend]
+            hit_seq += lines[3+offset][blkstart:blkend]
+            # NC line is specific to cm model searches
+            if model == "cm":
+                annot["NC"] += lines[0][blkstart:blkend]
+            annot["CS"] += lines[0+offset][blkstart:blkend]
+            annot["similarity"] += lines[2+offset][blkstart:blkend]
+            annot["PP"] += lines[4+offset][blkstart:blkend]

        return frag_list

--- a/Tests/test_SearchIO_infernal_text.py
+++ b/Tests/test_SearchIO_infernal_text.py
@ -443,15 +443,16 @@ class CmsearchCases(unittest.TestCase):
        self.assertEqual(hsp.query_start, 7)
        self.assertEqual(hsp.query_end, 100)
        self.assertEqual(hsp.query_endtype, "..")
-        self.assertEqual(hsp.hit_start, 681855)
-        self.assertEqual(hsp.hit_end, 681762)
+        self.assertEqual(hsp.hit_start, 681762)
+        self.assertEqual(hsp.hit_end, 681855)
        self.assertEqual(hsp.hit_endtype, "..")
        self.assertEqual(hsp.avg_acc, 0.74)
        # first fragment
+        frag = hsp[0]
        self.assertEqual(frag.query_start, 7)
        self.assertEqual(frag.query_end, 100)
-        self.assertEqual(frag.hit_start, 681855)
-        self.assertEqual(frag.hit_end, 681762)
+        self.assertEqual(frag.hit_start, 681762)
+        self.assertEqual(frag.hit_end, 681855)
        self.assertEqual(frag.hit_strand, -1)
        self.assertEqual(frag.query.seq, 'UCucgGCcUUUUGGCUaaGAUCAAGUGUAGUAUCUGUUCUUuuCAGUuUAAuAuCUGauAuugucucuAuugggggccaau-uauaUUAaauuaA')
        self.assertEqual(frag.hit.seq,'UCUUUGCCUUUUGGCUUAGAUCAAGUGUAGUAUCUGUUCUUUUCAGUGUAACAACUGAAAUGACCUCAAU-GAGGCUCAUUaCCUUUUAAUUUGU')