mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Added tabular format indexer and related tests
This commit is contained in:
committed by
Wibowo Arindrarto
parent
10e37cecc3
commit
361870aa9d
@ -545,7 +545,7 @@ class BlastTabIndexer(SearchIndexer):
|
|||||||
"Custom fields is missing an ID column. One of these must be "
|
"Custom fields is missing an ID column. One of these must be "
|
||||||
"present: 'qseqid', 'qacc', or 'qaccver'."
|
"present: 'qseqid', 'qacc', or 'qaccver'."
|
||||||
)
|
)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the file handle; yields key, start offset, and length."""
|
"""Iterate over the file handle; yields key, start offset, and length."""
|
||||||
handle = self._handle
|
handle = self._handle
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
|
|
||||||
|
|
||||||
from .infernal_tab import InfernalTabParser
|
from .infernal_tab import InfernalTabParser
|
||||||
|
from .infernal_tab import InfernalTabIndexer
|
||||||
from .infernal_text import InfernalTextParser
|
from .infernal_text import InfernalTextParser
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,3 +43,4 @@ class _BaseInfernalParser:
|
|||||||
setattr(hit, attr, value)
|
setattr(hit, attr, value)
|
||||||
hit_list.append(hit)
|
hit_list.append(hit)
|
||||||
return hit_list
|
return hit_list
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ from Bio.SearchIO.HmmerIO import Hmmer3TabParser
|
|||||||
from ._base import _BaseInfernalParser
|
from ._base import _BaseInfernalParser
|
||||||
|
|
||||||
|
|
||||||
__all__ = ("InfernalTabParser")
|
__all__ = ("InfernalTabParser", "InfernalTabIndexer")
|
||||||
|
|
||||||
|
|
||||||
# tabular format column names
|
# tabular format column names
|
||||||
@ -251,6 +251,97 @@ class InfernalTabParser(_BaseInfernalParser):
|
|||||||
self.line = self.handle.readline()
|
self.line = self.handle.readline()
|
||||||
|
|
||||||
|
|
||||||
|
class InfernalTabIndexer(SearchIndexer):
    """Indexer class for Infernal tabular output.

    Consecutive hit rows are grouped into one query result by the value of
    their query name column. The position of that column depends on the
    tabular layout, which is recognised from the column-name line in the
    file header (``#target name ...`` or ``#idx ...``).
    """

    _parser = InfernalTabParser

    # The handle yields bytes, so every marker below is bytes as well.
    _COMMENT_MARK = b"#"
    _SPLIT_MARK = b" "
    # (prefix of the column-name line, index of the query name column)
    _ROW_NAMES_MARKS = (
        (b"#target name", 2),
        # fmt 2 layout: a leading 'idx' column shifts the query name by one
        (b"#idx", 3),
    )
    # Column used when the file carries no recognisable column-name line.
    _DEFAULT_QUERY_ID_IDX = 2

    def _read_header(self):
        """Consume the leading comment lines of the file.

        Rewinds the handle to the start of the file and reads up to and
        including the first line that is not a comment.

        Returns a ``(query_id_idx, line, start_offset)`` tuple where
        ``query_id_idx`` is the query name column index (``None`` if no
        column-name line was seen), ``line`` is the first non-comment line
        (empty bytes at end of file) and ``start_offset`` is the offset at
        which that line starts.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = None
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line.startswith(self._COMMENT_MARK):
                return query_id_idx, line, start_offset
            for mark, idx in self._ROW_NAMES_MARKS:
                if line.startswith(mark):
                    query_id_idx = idx
                    break

    def _get_query_id_idx(self):
        """Return the query name column index, scanning the header if needed.

        The result is cached on the instance so the header is read at most
        once. Falls back to the default layout's column when the header has
        no column-name line.
        """
        query_id_idx = getattr(self, "_query_id_idx", None)
        if query_id_idx is None:
            query_id_idx = self._read_header()[0]
            if query_id_idx is None:
                query_id_idx = self._DEFAULT_QUERY_ID_IDX
            self._query_id_idx = query_id_idx
        return query_id_idx

    def _get_query_id(self, line, query_id_idx):
        """Return the query name (as bytes) found in a hit row."""
        cols = [x for x in line.strip().split(self._SPLIT_MARK) if x]
        return cols[query_id_idx]

    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Raises ValueError if the file contains hit rows but no column-name
        line from which the tabular layout can be determined.
        """
        query_id_idx, line, start_offset = self._read_header()

        # handle files without any hit row (empty, or comments only); this
        # must come before the header validation so an empty file is not
        # reported as malformed
        if not line:
            return

        if query_id_idx is None:
            raise ValueError(
                "Unable to determine the Infernal tabular format: "
                "the column names line is missing from the header."
            )
        # remember the layout so get_raw() groups rows by the same column
        self._query_id_idx = query_id_idx

        handle = self._handle
        comment_mark = self._COMMENT_MARK
        qresult_key = None

        # index the qresults
        while True:
            # get end offset here since we only know a qresult ends after
            # encountering the next one
            end_offset = handle.tell()

            # process the previous line
            curr_key = self._get_query_id(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # the current line already belongs to the next qresult, so
                # the previous one ends where the current line starts
                adj_end = end_offset - len(line)
                yield (qresult_key.decode(), start_offset, adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end

            line = handle.readline()
            # end of file or start of the trailing comments: flush the last
            # qresult and stop
            if not line or line.startswith(comment_mark):
                yield (qresult_key.decode(), start_offset, end_offset - start_offset)
                break

    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset."""
        # Resolve the layout first: this may rewind the handle to read the
        # header, so the seek to the requested offset has to come afterwards.
        query_id_idx = self._get_query_id_idx()

        handle = self._handle
        handle.seek(offset)
        comment_mark = self._COMMENT_MARK
        qresult_key = None
        raw_lines = []

        while True:
            line = handle.readline()
            if not line or line.startswith(comment_mark):
                break
            curr_key = self._get_query_id(line, query_id_idx)
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # first row of the next qresult
                break
            raw_lines.append(line)

        return b"".join(raw_lines)
|
||||||
|
|
||||||
|
|
||||||
# if not used as a module, run the doctest
|
# if not used as a module, run the doctest
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
from Bio._utils import run_doctest
|
from Bio._utils import run_doctest
|
||||||
|
@ -245,7 +245,6 @@ _INDEXER_MAP = {
|
|||||||
"hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"),
|
"hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"),
|
||||||
"hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
"hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
||||||
"infernal-tab": ("InfernalIO", "InfernalTabIndexer"),
|
"infernal-tab": ("InfernalIO", "InfernalTabIndexer"),
|
||||||
"infernal-text": ("InfernalIO", "InfernalTextIndexer"),
|
|
||||||
"phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
"phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,59 +18,58 @@ from search_tests_common import CheckRaw
|
|||||||
class InfernalTabRawCases(CheckRaw):
|
class InfernalTabRawCases(CheckRaw):
|
||||||
fmt = "infernal-tab"
|
fmt = "infernal-tab"
|
||||||
|
|
||||||
|
|
||||||
def test_infernal_tab_single(self):
|
def test_infernal_tab_single(self):
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, single query (U2_Yeast)."""
|
"""Test infernal-tab raw string retrieval, cmsearch, single query."""
|
||||||
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
|
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
|
||||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
|
||||||
#---------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
|
||||||
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 5.9e-20 ! TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.
|
|
||||||
"""
|
"""
|
||||||
self.check_raw(filename, "U2", raw)
|
self.check_raw(filename, "U2", raw)
|
||||||
|
|
||||||
def test_infernal_tab_multiple_first(self):
|
|
||||||
|
def test_infernal_tab_multiple(self):
|
||||||
|
"""Test infernal-tab raw string retrieval, cmsearch, single query, multiple non-consecutive hits."""
|
||||||
|
filename = os.path.join("Infernal", "U2_Yeast-shuf.tbl")
|
||||||
|
raw = """ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.3e-20 ! -
|
||||||
|
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 737498 737324 - no 1 0.39 0.0 19.8 0.11 ? -
|
||||||
|
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1370418 1370563 + no 1 0.34 0.1 15.6 1.1 ? -
|
||||||
|
ENA|BK006936|BK006936.2 - U2 RF00004 cm 1 193 1079243 1079392 + no 1 0.39 0.0 15.3 1.3 ? -
|
||||||
|
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 425490 425693 + no 1 0.34 0.9 13.7 3.1 ? -
|
||||||
|
ENA|BK006948|BK006948.2 - U2 RF00004 cm 1 193 1073786 1073950 + no 1 0.33 0.5 11.9 8.3 ? -
|
||||||
|
"""
|
||||||
|
self.check_raw(filename, "U2", raw)
|
||||||
|
|
||||||
|
|
||||||
|
def test_infernal_tab_single_first(self):
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, first."""
|
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, first."""
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
raw = """U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
|
||||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
|
||||||
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 52929 53083 + no 1 0.44 0.0 13.5 0.91 ? U2 spliceosomal RNA
|
|
||||||
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 196571 196389 - no 1 0.33 5.3 12.8 1.3 ? U2 spliceosomal RNA
|
U2 RF00004 ENA|BK006935|BK006935.2 - cm 1 193 196571 196389 - no 1 0.33 5.3 12.8 1.3 ? U2 spliceosomal RNA
|
||||||
"""
|
"""
|
||||||
self.check_raw(filename, "ENA|BK006935|BK006935.2", raw)
|
self.check_raw(filename, "ENA|BK006935|BK006935.2", raw)
|
||||||
|
|
||||||
|
|
||||||
def test_infernal_tab_multiple_middle(self):
|
def test_infernal_tab_multiple_middle(self):
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, middle."""
|
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, middle."""
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
raw = """U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
|
||||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
|
||||||
U2 RF00004 ENA|BK006936|BK006936.2 - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! U2 spliceosomal RNA
|
|
||||||
"""
|
"""
|
||||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
||||||
|
|
||||||
|
|
||||||
def test_infernal_tab_multiple_last(self):
|
def test_infernal_tab_multiple_last(self):
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, last."""
|
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, last."""
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||||
raw = """#target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target
|
raw = """5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
|
||||||
#------------------- --------- ----------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- ---------------------
|
|
||||||
5S_rRNA RF00001 ENA|BK006937|BK006937.2 - cm 1 119 761 644 - no 1 0.41 0.3 14.1 2.4 ? 5S ribosomal RNA
|
|
||||||
U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 193 229986 229885 - no 1 0.32 0.1 11.1 4.7 ? U2 spliceosomal RNA
|
U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 193 229986 229885 - no 1 0.32 0.1 11.1 4.7 ? U2 spliceosomal RNA
|
||||||
"""
|
"""
|
||||||
self.check_raw(filename, "ENA|BK006937|BK006937.2", raw)
|
self.check_raw(filename, "ENA|BK006937|BK006937.2", raw)
|
||||||
|
|
||||||
|
|
||||||
def test_infernal_tab_multiple_fmt_2(self):
|
def test_infernal_tab_multiple_fmt_2(self):
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2."""
|
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2."""
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
|
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan-fmt_2.tbl")
|
||||||
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
|
raw = """1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
||||||
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
|
|
||||||
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
|
||||||
"""
|
|
||||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw, fmt=2)
|
|
||||||
|
|
||||||
def test_infernal_tab_multiple_fmt_2_infer(self):
|
|
||||||
"""Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2, inferred."""
|
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
|
|
||||||
raw = """#idx target name accession query name accession clan name mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc olp anyidx afrct1 afrct2 winidx wfrct1 wfrct2 mdl len seq len description of target
|
|
||||||
#--- -------------------- --------- ----------------------- --------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --- ------ ------ ------ ------ ------ ------ ------- ------- ---------------------
|
|
||||||
1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA
|
|
||||||
"""
|
"""
|
||||||
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
self.check_raw(filename, "ENA|BK006936|BK006936.2", raw)
|
||||||
|
|
||||||
@ -78,24 +77,28 @@ U2 RF00004 ENA|BK006937|BK006937.2 - cm 1
|
|||||||
class InfernalTabIndexCases(CheckIndex):
|
class InfernalTabIndexCases(CheckIndex):
|
||||||
fmt = "infernal-tab"
|
fmt = "infernal-tab"
|
||||||
|
|
||||||
def test_infernal_tab_1q(self):
|
|
||||||
"""Test infernal-tab indexing, cmsearch, one query, one hit."""
|
def test_infernal_tab_single(self):
|
||||||
filename = os.path.join("Infernal", "cmsearch_114_U2_Yeast.tbl")
|
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||||
|
filename = os.path.join("Infernal", "U2_Yeast-threshold.tbl")
|
||||||
self.check_index(filename, self.fmt)
|
self.check_index(filename, self.fmt)
|
||||||
|
|
||||||
def test_infernal_tab_1q_0m(self):
|
|
||||||
"""Test infernal-tab indexing, cmsearch, single query, no hits."""
|
def test_infernal_tab_single_no_hit(self):
|
||||||
filename = os.path.join("Infernal", "cmsearch_114_IRES_Yeast.tbl")
|
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||||
|
filename = os.path.join("Infernal", "IRES_Yeast.tbl")
|
||||||
self.check_index(filename, self.fmt)
|
self.check_index(filename, self.fmt)
|
||||||
|
|
||||||
def test_infernal_tab_1q_mm(self):
|
|
||||||
"""Test infernal-tab indexing, cmsearch, single query, multiple hits."""
|
|
||||||
filename = os.path.join("Infernal", "cmsearch_114_5S_Yeast.tbl")
|
|
||||||
self.check_index(filename, self.fmt)
|
|
||||||
|
|
||||||
def test_infernal_tab_mq_mm(self):
|
def test_infernal_tab_single_multiple_hit(self):
|
||||||
"""Test infernal-tab indexing, cmscan, multiple query, multiple matches."""
|
"""Test infernal-tab indexing, cmsearch, single query."""
|
||||||
filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast.tbl")
|
filename = os.path.join("Infernal", "5S_Yeast.tbl")
|
||||||
|
self.check_index(filename, self.fmt)
|
||||||
|
|
||||||
|
|
||||||
|
def test_infernal_tab_multiple_multiple_hit(self):
|
||||||
|
"""Test infernal-tab indexing, cmscan, multiple query."""
|
||||||
|
filename = os.path.join("Infernal", "IRES_5S_U2_Yeast-cmscan.tbl")
|
||||||
self.check_index(filename, self.fmt)
|
self.check_index(filename, self.fmt)
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user