class InfernalTabIndexer(SearchIndexer):
    """Indexer class for Infernal tabular output.

    Two column layouts are recognised from the column-name header line:

    - header starting with ``#target name``: query name is column 2 (0-based)
    - header starting with ``#idx`` (format 2): query name is column 3

    Rows are space-delimited; consecutive rows sharing the same query name
    are indexed as one record.
    """

    _parser = InfernalTabParser

    # prefix of every header/footer comment line
    _COMMENT_MARK = b"#"
    # column separator of the data rows
    _SPLIT_MARK = b" "
    # (column-name header prefix, 0-based position of the query name column)
    _QUERY_ID_COLUMNS = (
        (b"#target name", 2),
        (b"#idx", 3),
    )
    # query name column used by get_raw() when no header can be recognised
    _DEFAULT_QUERY_ID_IDX = 2
    # lazily detected query name column for this file (None = not yet known)
    _query_id_idx = None

    def _scan_header(self):
        """Rewind the handle and read past the leading comment lines.

        Returns a ``(query_id_idx, start_offset, line)`` tuple where
        ``query_id_idx`` is the detected query name column (``None`` when no
        known column-name header was seen), ``line`` is the first line that
        is not a comment (``b""`` at end of file) and ``start_offset`` is the
        offset at which that line starts.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = None
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line.startswith(self._COMMENT_MARK):
                return query_id_idx, start_offset, line
            for mark, idx in self._QUERY_ID_COLUMNS:
                if line.startswith(mark):
                    query_id_idx = idx
                    break

    @classmethod
    def _query_key(cls, line, query_id_idx):
        """Return the query name (bytes) found in a single data row."""
        # runs of spaces pad the columns, so drop the empty fragments
        cols = [col for col in line.strip().split(cls._SPLIT_MARK) if col]
        return cols[query_id_idx]

    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Indexing stops at the first comment line that follows the data rows
        (or at end of file). A file without data rows yields nothing.

        Raises ValueError if data rows are present but no column-name header
        was found, since the query name column cannot be determined.
        """
        query_id_idx, start_offset, line = self._scan_header()

        # empty file, or header/footer comments only: nothing to index
        if not line:
            return

        if query_id_idx is None:
            raise ValueError(
                "Infernal tabular file has no recognizable column-name header. "
                "Expected a line starting with '#target name' or '#idx'."
            )
        # remember the layout so get_raw() reads the same column
        self._query_id_idx = query_id_idx

        handle = self._handle
        comment_mark = self._COMMENT_MARK
        qresult_key = self._query_key(line, query_id_idx)

        while True:
            # offset just past the last row read so far; a qresult is only
            # known to end once the following line has been inspected
            end_offset = handle.tell()
            line = handle.readline()

            if not line or line.startswith(comment_mark):
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                return

            curr_key = self._query_key(line, query_id_idx)
            if curr_key != qresult_key:
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                qresult_key = curr_key
                start_offset = end_offset

    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset.

        Rows are collected from ``offset`` until the query name changes, a
        comment line is met, or the file ends. The query name column is taken
        from the file's column-name header, so both layouts are read with the
        same column that ``__iter__`` uses.
        """
        handle = self._handle

        query_id_idx = self._query_id_idx
        if query_id_idx is None:
            # header not inspected yet (get_raw called before iteration)
            query_id_idx = self._scan_header()[0]
            if query_id_idx is None:
                query_id_idx = self._DEFAULT_QUERY_ID_IDX
            self._query_id_idx = query_id_idx

        handle.seek(offset)
        comment_mark = self._COMMENT_MARK
        qresult_key = None
        rows = []

        while True:
            line = handle.readline()
            if not line or line.startswith(comment_mark):
                break
            curr_key = self._query_key(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                break
            rows.append(line)

        return b"".join(rows)
doctest if __name__ == "__main__": from Bio._utils import run_doctest diff --git a/Bio/SearchIO/__init__.py b/Bio/SearchIO/__init__.py index 291e20e94..cf3b69834 100644 --- a/Bio/SearchIO/__init__.py +++ b/Bio/SearchIO/__init__.py @@ -245,7 +245,6 @@ _INDEXER_MAP = { "hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"), "hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"), "infernal-tab": ("InfernalIO", "InfernalTabIndexer"), - "infernal-text": ("InfernalIO", "InfernalTextIndexer"), "phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"), } diff --git a/Tests/test_SearchIO_infernal_tab_index.py b/Tests/test_SearchIO_infernal_tab_index.py index cdbb2eb70..ab91d0da4 100644 --- a/Tests/test_SearchIO_infernal_tab_index.py +++ b/Tests/test_SearchIO_infernal_tab_index.py @@ -18,59 +18,58 @@ from search_tests_common import CheckRaw class InfernalTabRawCases(CheckRaw): fmt = "infernal-tab" + def test_infernal_tab_single(self): - """Test infernal-tab raw string retrieval, cmsearch, single query (U2_Yeast).""" - filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl") - raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target -#---------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- -ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence. + """Test infernal-tab raw string retrieval, cmsearch, single query.""" + filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl") + raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence. 
""" self.check_raw(filename, "U2", raw) - def test_infernal_tab_multiple_first(self): + + def test_infernal_tab_multiple(self): + """Test infernal-tab raw string retrieval, cmsearch, single query, multiple non-consecutive hits.""" + filename = os.path.join("Infernal", "U2_Yeast-shuf.tbl") + raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.3e-20 ! - +ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 737498 737324 - no 1 0.39 0.0 19.8 0.11 ? - +ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1370418 1370563 + no 1 0.34 0.1 15.6 1.1 ? - +ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1079243 1079392 + no 1 0.39 0.0 15.3 1.3 ? - +ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 425490 425693 + no 1 0.34 0.9 13.7 3.1 ? - +ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 1073786 1073950 + no 1 0.33 0.5 11.9 8.3 ? - +""" + self.check_raw(filename, "U2", raw) + + + def test_infernal_tab_single_first(self): """Test infernal-tab raw string retrieval, cmsearch, multiple queries, first.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl") - raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target -#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- -U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA + filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl") + raw = """U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 196571 196389 - no 1 0.33 5.3 12.8 1.3 ? 
U2 spliceosomal RNA """ self.check_raw(filename, "ENA|BK006935|BK006935.2", raw) + def test_infernal_tab_multiple_middle(self): """Test infernal-tab raw string retrieval, cmsearch, multiple queries, middle.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl") - raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target -#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- -U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA + filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl") + raw = """U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA """ self.check_raw(filename, "ENA|BK006936|BK006936.2", raw) + def test_infernal_tab_multiple_last(self): """Test infernal-tab raw string retrieval, cmsearch, multiple queries, last.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl") - raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target -#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- -5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA + filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl") + raw = """5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 193 229986 229885 - no 1 0.32 0.1 11.1 4.7 ? 
U2 spliceosomal RNA """ self.check_raw(filename, "ENA|BK006937|BK006937.2", raw) + def test_infernal_tab_multiple_fmt_2(self): """Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl") - raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target -#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- --------------------- -1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA -""" - self.check_raw(filename, "ENA|BK006936|BK006936.2", raw, fmt=2) - - def test_infernal_tab_multiple_fmt_2_infer(self): - """Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2, inferred.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl") - raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target -#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- --------------------- -1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! 
* - - - - - - 193 813184 U2 spliceosomal RNA + filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan-fmt_2.tbl") + raw = """1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA """ self.check_raw(filename, "ENA|BK006936|BK006936.2", raw) @@ -78,24 +77,28 @@ U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 class InfernalTabIndexCases(CheckIndex): fmt = "infernal-tab" - def test_infernal_tab_1q(self): - """Test infernal-tab indexing, cmsearch, one query, one hit.""" - filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl") + + def test_infernal_tab_single(self): + """Test infernal-tab indexing, cmsearch, single query.""" + filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl") self.check_index(filename, self.fmt) - def test_infernal_tab_1q_0m(self): - """Test infernal-tab indexing, cmsearch, single query, no hits.""" - filename = os.path.join("Infernal", "cmsearch_114_IRES_Yeast.tbl") + + def test_infernal_tab_single_no_hit(self): + """Test infernal-tab indexing, cmsearch, single query.""" + filename = os.path.join("Infernal", "IRES_Yeast.tbl") self.check_index(filename, self.fmt) - def test_infernal_tab_1q_mm(self): - """Test infernal-tab indexing, cmsearch, single query, multiple hits.""" - filename = os.path.join("Infernal", "cmsearch_114_5S_Yeast.tbl") - self.check_index(filename, self.fmt) - def test_infernal_tab_mq_mm(self): - """Test infernal-tab indexing, cmscan, multiple query, multiple matches.""" - filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl") + def test_infernal_tab_single_multiple_hit(self): + """Test infernal-tab indexing, cmsearch, single query.""" + filename = os.path.join("Infernal", "5S_Yeast.tbl") + self.check_index(filename, self.fmt) + + + def test_infernal_tab_multiple_multiple_hit(self): + """Test infernal-tab indexing, cmscan, multiple query.""" + filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl") 
self.check_index(filename, self.fmt)