mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Added tabular format indexer and related tests
This commit is contained in:
committed by
Wibowo Arindrarto
parent
10e37cecc3
commit
361870aa9d
@ -7,6 +7,7 @@
|
||||
|
||||
|
||||
from .infernal_tab import InfernalTabParser
|
||||
from .infernal_tab import InfernalTabIndexer
|
||||
from .infernal_text import InfernalTextParser
|
||||
|
||||
|
||||
|
@ -43,3 +43,4 @@ class _BaseInfernalParser:
|
||||
setattr(hit, attr, value)
|
||||
hit_list.append(hit)
|
||||
return hit_list
|
||||
|
||||
|
@ -17,7 +17,7 @@ from Bio.SearchIO.HmmerIO import Hmmer3TabParser
|
||||
from ._base import _BaseInfernalParser
|
||||
|
||||
|
||||
__all__ = ("InfernalTabParser")
|
||||
__all__ = ("InfernalTabParser", "InfernalTabIndexer")
|
||||
|
||||
|
||||
# tabular format column names
|
||||
@ -251,6 +251,97 @@ class InfernalTabParser(_BaseInfernalParser):
|
||||
self.line = self.handle.readline()
|
||||
|
||||
|
||||
class InfernalTabIndexer(SearchIndexer):
    """Indexer class for Infernal tabular output.

    Two column layouts are recognised from the column names header line:
    the layout whose header starts with ``#target name`` (query name in the
    third column) and the layout whose header starts with ``#idx`` (query
    name in the fourth column).
    """

    _parser = InfernalTabParser

    @staticmethod
    def _header_query_id_idx(line):
        """Return the query name column index announced by a header line.

        Returns ``None`` when the line is not a column names header line.
        """
        if line.startswith(b"#target name"):
            return 2
        if line.startswith(b"#idx"):
            return 3
        return None

    @staticmethod
    def _query_id(line, query_id_idx):
        """Return the query ID (as bytes) found in a single result row."""
        # columns are separated by runs of spaces, so drop the empty strings
        cols = [col for col in line.strip().split(b" ") if col]
        return cols[query_id_idx]

    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Nothing is yielded when the file is empty or contains only comment
        lines (e.g. a search without hits).

        Raises ValueError if result rows are present but no column names
        header line was found, since the query column is then unknown.
        """
        handle = self._handle
        handle.seek(0)
        comment_mark = b"#"
        query_id_idx = None

        # set line with initial mock value, to emulate header
        line = comment_mark

        # read through the header and get the query index position based on
        # the tabular output format; on exit, ``line`` is the first result
        # row (or empty at end of file) and ``start_offset`` is its offset
        while line.startswith(comment_mark):
            start_offset = handle.tell()
            line = handle.readline()
            header_idx = self._header_query_id_idx(line)
            if header_idx is not None:
                query_id_idx = header_idx

        # handle empty file (or file without any result row); this must come
        # before validating the header so that empty files do not fail
        if not line:
            return

        if query_id_idx is None:
            raise ValueError(
                "Unable to determine the Infernal tabular format: "
                "column names header line not found."
            )

        # index the qresults
        qresult_key = self._query_id(line, query_id_idx)
        while True:
            # offset of the end of the previous row, which is also the
            # offset of the start of the row we are about to read
            end_offset = handle.tell()
            line = handle.readline()

            # end of file or start of the trailing comments: flush the last
            # qresult and stop
            if not line or line.startswith(comment_mark):
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                break

            # we only know a qresult ends after encountering the next one
            curr_key = self._query_id(line, query_id_idx)
            if curr_key != qresult_key:
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                qresult_key = curr_key
                start_offset = end_offset

    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset.

        The query column is looked up from the file header so that both
        supported column layouts are handled; if no header line is found,
        the third column is used.
        """
        handle = self._handle
        comment_mark = b"#"

        # determine which column holds the query name for this file
        query_id_idx = 2
        handle.seek(0)
        for line in iter(handle.readline, b""):
            if not line.startswith(comment_mark):
                break
            header_idx = self._header_query_id_idx(line)
            if header_idx is not None:
                query_id_idx = header_idx
                break

        # collect consecutive rows sharing the query ID of the first row
        handle.seek(offset)
        qresult_key = None
        raw_lines = []
        for line in iter(handle.readline, b""):
            if line.startswith(comment_mark):
                break
            curr_key = self._query_id(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                break
            raw_lines.append(line)

        return b"".join(raw_lines)
|
||||
|
||||
|
||||
# if not used as a module, run the doctest
|
||||
if __name__ == "__main__":
|
||||
from Bio._utils import run_doctest
|
||||
|
@ -245,7 +245,6 @@ _INDEXER_MAP = {
|
||||
"hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"),
|
||||
"hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
||||
"infernal-tab": ("InfernalIO", "InfernalTabIndexer"),
|
||||
"infernal-text": ("InfernalIO", "InfernalTextIndexer"),
|
||||
"phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
||||
}
|
||||
|
||||
|
@ -18,59 +18,58 @@ from search_tests_common import CheckRaw
|
||||
class InfernalTabRawCases(CheckRaw):
|
||||
fmt = "infernal-tab"
|
||||
|
||||
|
||||
def test_infernal_tab_single(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, single query (U2_Yeast)."""
|
||||
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
|
||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
||||
#---------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
||||
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, single query."""
|
||||
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
|
||||
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
|
||||
"""
|
||||
self.check_raw(filename, "U2", raw)
|
||||
|
||||
def test_infernal_tab_multiple_first(self):
|
||||
|
||||
def test_infernal_tab_multiple(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, single query, multiple non-consecutive hits."""
|
||||
filename = os.path.join("Infernal", "U2_Yeast-shuf.tbl")
|
||||
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.3e-20 ! -
|
||||
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 737498 737324 - no 1 0.39 0.0 19.8 0.11 ? -
|
||||
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1370418 1370563 + no 1 0.34 0.1 15.6 1.1 ? -
|
||||
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1079243 1079392 + no 1 0.39 0.0 15.3 1.3 ? -
|
||||
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 425490 425693 + no 1 0.34 0.9 13.7 3.1 ? -
|
||||
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 1073786 1073950 + no 1 0.33 0.5 11.9 8.3 ? -
|
||||
"""
|
||||
self.check_raw(filename, "U2", raw)
|
||||
|
||||
|
||||
def test_infernal_tab_single_first(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, first."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
||||
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
|
||||
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||
raw = """U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
|
||||
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 196571 196389 - no 1 0.33 5.3 12.8 1.3 ? U2 spliceosomal RNA
|
||||
"""
|
||||
self.check_raw(filename, "ENA|BK006935|BK006935.2", raw)
|
||||
|
||||
|
||||
def test_infernal_tab_multiple_middle(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, middle."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
||||
U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
|
||||
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||
raw = """U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
|
||||
"""
|
||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
||||
|
||||
|
||||
def test_infernal_tab_multiple_last(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, last."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
||||
5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
|
||||
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||
raw = """5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
|
||||
U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 193 229986 229885 - no 1 0.32 0.1 11.1 4.7 ? U2 spliceosomal RNA
|
||||
"""
|
||||
self.check_raw(filename, "ENA|BK006937|BK006937.2", raw)
|
||||
|
||||
|
||||
def test_infernal_tab_multiple_fmt_2(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
|
||||
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
|
||||
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
|
||||
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
||||
"""
|
||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw, fmt=2)
|
||||
|
||||
def test_infernal_tab_multiple_fmt_2_infer(self):
|
||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2, inferred."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
|
||||
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
|
||||
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
|
||||
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
||||
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan-fmt_2.tbl")
|
||||
raw = """1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
||||
"""
|
||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
||||
|
||||
@ -78,24 +77,28 @@ U2 RF00004 ENA|BK006937|BK006937.2 - cm 1
|
||||
class InfernalTabIndexCases(CheckIndex):
|
||||
fmt = "infernal-tab"
|
||||
|
||||
def test_infernal_tab_1q(self):
|
||||
"""Test infernal-tab indexing, cmsearch, one query, one hit."""
|
||||
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
|
||||
|
||||
def test_infernal_tab_single(self):
|
||||
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
|
||||
self.check_index(filename, self.fmt)
|
||||
|
||||
def test_infernal_tab_1q_0m(self):
|
||||
"""Test infernal-tab indexing, cmsearch, single query, no hits."""
|
||||
filename = os.path.join("Infernal", "cmsearch_114_IRES_Yeast.tbl")
|
||||
|
||||
def test_infernal_tab_single_no_hit(self):
|
||||
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||
filename = os.path.join("Infernal", "IRES_Yeast.tbl")
|
||||
self.check_index(filename, self.fmt)
|
||||
|
||||
def test_infernal_tab_1q_mm(self):
|
||||
"""Test infernal-tab indexing, cmsearch, single query, multiple hits."""
|
||||
filename = os.path.join("Infernal", "cmsearch_114_5S_Yeast.tbl")
|
||||
|
||||
def test_infernal_tab_single_multiple_hit(self):
|
||||
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||
filename = os.path.join("Infernal", "5S_Yeast.tbl")
|
||||
self.check_index(filename, self.fmt)
|
||||
|
||||
def test_infernal_tab_mq_mm(self):
|
||||
"""Test infernal-tab indexing, cmscan, multiple query, multiple matches."""
|
||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
||||
|
||||
def test_infernal_tab_multiple_multiple_hit(self):
|
||||
"""Test infernal-tab indexing, cmscan, multiple query."""
|
||||
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||
self.check_index(filename, self.fmt)
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user