Added tabular format indexer and related tests

This commit is contained in:
Samuel Prince
2024-09-18 10:32:05 -04:00
committed by Wibowo Arindrarto
parent 10e37cecc3
commit 361870aa9d
6 changed files with 142 additions and 47 deletions

View File

@ -7,6 +7,7 @@
from .infernal_tab import InfernalTabParser
from .infernal_tab import InfernalTabIndexer
from .infernal_text import InfernalTextParser

View File

@ -43,3 +43,4 @@ class _BaseInfernalParser:
setattr(hit, attr, value)
hit_list.append(hit)
return hit_list

View File

@ -17,7 +17,7 @@ from Bio.SearchIO.HmmerIO import Hmmer3TabParser
from ._base import _BaseInfernalParser
__all__ = ("InfernalTabParser")
__all__ = ("InfernalTabParser", "InfernalTabIndexer")
# tabular format column names
@ -251,6 +251,97 @@ class InfernalTabParser(_BaseInfernalParser):
self.line = self.handle.readline()
class InfernalTabIndexer(SearchIndexer):
    """Indexer class for Infernal tabular output.

    Rows are grouped into query results by the value of their query name
    column. The position of that column depends on which column header the
    file carries: index 2 when the header row starts with ``#target name``
    and index 3 when it starts with ``#idx`` (the "fmt 2" layout).
    """

    _parser = InfernalTabParser

    # mark for the comments
    _comment_mark = b"#"
    # marks for the row names line of the two supported layouts
    _row_names_mark = b"#target name"
    _row_names_mark_fmt_2 = b"#idx"
    # column separator; runs of separators are collapsed when splitting
    _split_mark = b" "
    # query column used by get_raw when the file has no recognisable header
    _default_query_id_idx = 2
    # query column detected from the header; None until a header was read
    _query_id_idx = None

    def _read_header(self):
        """Read the leading comment lines and detect the query column.

        Rewinds the handle to the start of the file. On return the handle is
        positioned just after the first non-comment line.

        Returns a ``(query_id_idx, line, start_offset)`` tuple where
        ``query_id_idx`` is the detected column index (``None`` when no
        column header was seen), ``line`` is the first non-comment line
        (``b""`` at end of file) and ``start_offset`` is the offset at which
        that line starts.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = None
        start_offset = 0
        # set line with initial mock value, to emulate header
        line = self._comment_mark
        while line.startswith(self._comment_mark):
            start_offset = handle.tell()
            line = handle.readline()
            if line.startswith(self._row_names_mark):
                query_id_idx = 2
            elif line.startswith(self._row_names_mark_fmt_2):
                query_id_idx = 3
        return query_id_idx, line, start_offset

    def _query_key(self, line, query_id_idx):
        """Return the raw (bytes) query name found in a data row."""
        cols = [x for x in line.strip().split(self._split_mark) if x]
        return cols[query_id_idx]

    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Yields nothing when the file holds no data rows. Raises ValueError
        when data rows are present but no column header precedes them, since
        the query column cannot be located in that case.
        """
        handle = self._handle
        query_id_idx, line, start_offset = self._read_header()
        if query_id_idx is not None:
            # remember the layout so get_raw groups rows by the same column
            self._query_id_idx = query_id_idx

        # handle empty file (or a file holding only comment lines)
        if not line:
            return
        if query_id_idx is None:
            raise ValueError(
                "Infernal tabular file has no column header line starting "
                "with '#target name' or '#idx'; cannot locate the query "
                "name column."
            )

        qresult_key = None
        # index the qresults
        while True:
            # get end offset here since we only know a qresult ends after
            # encountering the next one
            end_offset = handle.tell()
            # process the previous line
            curr_key = self._query_key(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # the line just read belongs to the next qresult, so the
                # previous qresult ends where this line starts
                adj_end = end_offset - len(line)
                yield (qresult_key.decode(), start_offset, adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end

            line = handle.readline()
            # end of file or first trailing comment closes the last qresult
            if not line or line.startswith(self._comment_mark):
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                break

    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset.

        Reading starts at ``offset`` and stops at end of file, at the first
        comment line, or at the first row whose query name differs from that
        of the first row read. The query column is taken from the file
        header, so both supported layouts are grouped by the same column
        that ``__iter__`` uses.
        """
        handle = self._handle
        query_id_idx = self._query_id_idx
        if query_id_idx is None:
            # header not inspected yet on this instance: detect it now
            query_id_idx = self._read_header()[0]
            if query_id_idx is None:
                query_id_idx = self._default_query_id_idx
            self._query_id_idx = query_id_idx

        handle.seek(offset)
        qresult_key = None
        raw_lines = []
        while True:
            line = handle.readline()
            if not line or line.startswith(self._comment_mark):
                break
            curr_key = self._query_key(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                break
            raw_lines.append(line)

        return b"".join(raw_lines)
# if not used as a module, run the doctest
if __name__ == "__main__":
from Bio._utils import run_doctest

View File

@ -245,7 +245,6 @@ _INDEXER_MAP = {
"hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"),
"hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
"infernal-tab": ("InfernalIO", "InfernalTabIndexer"),
"infernal-text": ("InfernalIO", "InfernalTextIndexer"),
"phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
}

View File

@ -18,59 +18,58 @@ from search_tests_common import CheckRaw
class InfernalTabRawCases(CheckRaw):
fmt = "infernal-tab"
def test_infernal_tab_single(self):
"""Test infernal-tab raw string retrieval, cmsearch, single query (U2_Yeast)."""
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
#---------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
"""Test infernal-tab raw string retrieval, cmsearch, single query."""
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
"""
self.check_raw(filename, "U2", raw)
def test_infernal_tab_multiple_first(self):
def test_infernal_tab_multiple(self):
"""Test infernal-tab raw string retrieval, cmsearch, single query, multiple non-consecutive hits."""
filename = os.path.join("Infernal", "U2_Yeast-shuf.tbl")
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.3e-20 ! -
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 737498 737324 - no 1 0.39 0.0 19.8 0.11 ? -
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1370418 1370563 + no 1 0.34 0.1 15.6 1.1 ? -
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1079243 1079392 + no 1 0.39 0.0 15.3 1.3 ? -
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 425490 425693 + no 1 0.34 0.9 13.7 3.1 ? -
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 1073786 1073950 + no 1 0.33 0.5 11.9 8.3 ? -
"""
self.check_raw(filename, "U2", raw)
def test_infernal_tab_single_first(self):
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, first."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
raw = """U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 196571 196389 - no 1 0.33 5.3 12.8 1.3 ? U2 spliceosomal RNA
"""
self.check_raw(filename, "ENA|BK006935|BK006935.2", raw)
def test_infernal_tab_multiple_middle(self):
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, middle."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
raw = """U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
"""
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
def test_infernal_tab_multiple_last(self):
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, last."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
raw = """5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 193 229986 229885 - no 1 0.32 0.1 11.1 4.7 ? U2 spliceosomal RNA
"""
self.check_raw(filename, "ENA|BK006937|BK006937.2", raw)
def test_infernal_tab_multiple_fmt_2(self):
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
"""
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw, fmt=2)
def test_infernal_tab_multiple_fmt_2_infer(self):
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2, inferred."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan-fmt_2.tbl")
raw = """1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
"""
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
@ -78,24 +77,28 @@ U2 RF00004 ENA|BK006937|BK006937.2 - cm 1
class InfernalTabIndexCases(CheckIndex):
fmt = "infernal-tab"
def test_infernal_tab_1q(self):
"""Test infernal-tab indexing, cmsearch, one query, one hit."""
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
def test_infernal_tab_single(self):
"""Test infernal-tab indexing, cmsearch, single query, single hit."""
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
self.check_index(filename, self.fmt)
def test_infernal_tab_1q_0m(self):
"""Test infernal-tab indexing, cmsearch, single query, no hits."""
filename = os.path.join("Infernal", "cmsearch_114_IRES_Yeast.tbl")
def test_infernal_tab_single_no_hit(self):
"""Test infernal-tab indexing, cmsearch, single query, no hits."""
filename = os.path.join("Infernal", "IRES_Yeast.tbl")
self.check_index(filename, self.fmt)
def test_infernal_tab_1q_mm(self):
"""Test infernal-tab indexing, cmsearch, single query, multiple hits."""
filename = os.path.join("Infernal", "cmsearch_114_5S_Yeast.tbl")
def test_infernal_tab_single_multiple_hit(self):
"""Test infernal-tab indexing, cmsearch, single query, multiple hits."""
filename = os.path.join("Infernal", "5S_Yeast.tbl")
self.check_index(filename, self.fmt)
def test_infernal_tab_mq_mm(self):
"""Test infernal-tab indexing, cmscan, multiple query, multiple matches."""
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
def test_infernal_tab_multiple_multiple_hit(self):
"""Test infernal-tab indexing, cmscan, multiple queries, multiple hits."""
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
self.check_index(filename, self.fmt)