From 38f0fed34a8a87d0e3f85c2c79d39dd800cc3859 Mon Sep 17 00:00:00 2001 From: Samuel Prince Date: Sun, 19 Jan 2025 09:35:33 -0500 Subject: [PATCH] Infer the tabular format from the header --- Bio/SearchIO/InfernalIO/infernal_tab.py | 169 ++++++++++++++++++++-- Tests/test_SearchIO_infernal_tab.py | 88 +++++++++++ Tests/test_SearchIO_infernal_tab_index.py | 9 +- 3 files changed, 252 insertions(+), 14 deletions(-) diff --git a/Bio/SearchIO/InfernalIO/infernal_tab.py b/Bio/SearchIO/InfernalIO/infernal_tab.py index 5f0e6fec2..b911405ce 100644 --- a/Bio/SearchIO/InfernalIO/infernal_tab.py +++ b/Bio/SearchIO/InfernalIO/infernal_tab.py @@ -13,6 +13,7 @@ from Bio.SearchIO._model import HSPFragment from Bio.SearchIO._model import QueryResult from ._base import _BaseInfernalParser +from Bio.File import _open_for_random_access __all__ = ("InfernalTabParser", "InfernalTabIndexer") @@ -92,7 +93,80 @@ _TAB_FORMAT = { "description", ), } -_DEFAULT_TAB_FORMAT = 1 + +_TABULAR_FMT_1_HEADER_FIELDS = [ + "target name", + "accession", + "query name", + "accession", + "mdl", + "mdl from", + "mdl to", + "seq from", + "seq to", + "strand", + "trunc", + "pass", + "gc", + "bias", + "score", + "E-value", + "inc", + "description of target", +] +_TABULAR_FMT_2_HEADER_FIELDS = [ + "idx", + "target name", + "accession", + "query name", + "accession", + "clan name", + "mdl", + "mdl from", + "mdl to", + "seq from", + "seq to", + "strand", + "trunc", + "pass", + "gc", + "bias", + "score", + "E-value", + "inc", + "olp", + "anyidx", + "afrct1", + "afrct2", + "winidx", + "wfrct1", + "wfrct2", + "mdl len", + "seq len", + "description of target", +] +_TABULAR_FMT_3_HEADER_FIELDS = [ + "target name", + "accession", + "query name", + "accession", + "mdl", + "mdl from", + "mdl to", + "seq from", + "seq to", + "strand", + "trunc", + "pass", + "gc", + "bias", + "score", + "E-value", + "inc", + "mdl len", + "seq len", + "description of target", +] # column to class attribute map _COLUMN_QRESULT = { @@ -134,25 +208,81 @@ _COLUMN_FRAG = { } +def tabular_format_from_header(handle, is_byte=False): + """Determine the tabular format based on the header (PRIVATE).""" + # infernal tabular output columns are space separated files with spaces in + # column names, ex: + # #target name accession + # #------------------- --------- + # so we use the second line of the header where the is not spaces to + # determine the position of the headers labels. Then we can get the label names + # and compare the the different tabular format names. This label comparison is done + # (instead of counting fields) to avoid issues if future tabular format have the + # same number of fields, but with different signification + + def process_line(handle, is_byte): + """Process a line from the handle, decoding byte stream.""" + line = handle.readline().strip() + return line.decode() if is_byte else line + + header_label = process_line(handle, is_byte) + header_delim = process_line(handle, is_byte) + + if header_label == "" or header_delim == "": + raise ValueError( + "Unexpected empty line in the header of Infernal tabular output." + ) + + # get the indices of the spaces fields delimiter from the second header line + # the end of the file is added to the indices to process the last field + indices = [idx for idx, char in enumerate(header_delim) if char == " "] + [ + len(header_delim) + ] + prev_idx = 1 + fields = [] + for idx in indices: + fields.append(header_label[prev_idx:idx].strip()) + prev_idx = idx + + # compare the headers fields to the possible fields to determine the format + if fields == _TABULAR_FMT_1_HEADER_FIELDS: + return 1 + elif fields == _TABULAR_FMT_2_HEADER_FIELDS: + return 2 + elif fields == _TABULAR_FMT_3_HEADER_FIELDS: + return 3 + else: + raise ValueError( + "Cannot determine the tabular format from the header. \ + The tabular file is likely incorrect or its format is unsupported. \ + Tabular format 1, 2 and 3 are currently supported)." + ) + + class InfernalTabParser(_BaseInfernalParser): """Parser for the Infernal tabular format.""" - def __init__(self, handle, fmt=_DEFAULT_TAB_FORMAT): + def __init__(self, handle, fmt=None): """Initialize the class.""" self.handle = handle - self.line = self.handle.readline().strip() - if not isinstance(fmt, int): - raise TypeError - if not 1 <= fmt <= 3: - raise ValueError("Invalid tabular format number, must be 1, 2 or 3.") - self.fmt = fmt + # if a tabular fmt is specified, set its value + if isinstance(fmt, int): + if not 1 <= fmt <= 3: + raise ValueError( + "Unsupported tabular format. Format 1, 2 and 3 are currently supported." + ) + self.fmt = fmt + # if the format is not provided by the user, guess the format + else: + self.fmt = tabular_format_from_header(handle) def __iter__(self): """Iterate over InfernalTabParser, yields query results.""" - # read through the header and footer + # read through the header (and footer if there is no results) + self.line = self.handle.readline() while self.line.startswith("#"): self.line = self.handle.readline() - # if we have result rows, parse it + # if we have result rows, parse them if self.line: yield from self._parse_qresult() @@ -286,10 +416,23 @@ class InfernalTabIndexer(SearchIndexer): _parser = InfernalTabParser - def __init__(self, filename, fmt=_DEFAULT_TAB_FORMAT): + def __init__(self, filename, fmt=None): """Initialize the class.""" + # the index of the query id column in the tabular file varies depending on the fmt + # we need to use the format set by the user, or infer the format to parse the records + if isinstance(fmt, int): + if not 1 <= fmt <= 3: + raise ValueError( + "Unsupported tabular format. Format 1, 2 and 3 are currently supported." + ) + self._query_id_idx = 3 if fmt == 2 else 2 + # if the format is not provided by the user, infer the format + else: + handle = _open_for_random_access(filename) + fmt = tabular_format_from_header(handle, is_byte=True) + self._query_id_idx = 3 if fmt == 2 else 2 + SearchIndexer.__init__(self, filename, fmt=fmt) - self._query_id_idx = 3 if fmt == 2 else 2 def __iter__(self): """Iterate over the file handle; yields key, start offset, and length.""" @@ -298,7 +441,7 @@ class InfernalTabIndexer(SearchIndexer): qresult_key = None start_offset = handle.tell() - # set line with initial mock value + # set line with mock value line = b"#" # read through header diff --git a/Tests/test_SearchIO_infernal_tab.py b/Tests/test_SearchIO_infernal_tab.py index 15d76fd25..4cfe3c35e 100644 --- a/Tests/test_SearchIO_infernal_tab.py +++ b/Tests/test_SearchIO_infernal_tab.py @@ -210,6 +210,54 @@ class CmscanCases(unittest.TestCase): self.assertRaises(StopIteration, next, qresults) self.assertEqual(count, 1) + def test_cmscan_mq_mm_fmt2_infer(self): + """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 2, inferred (IRES_5S_U2_Yeast_fmt_2)""" + tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl") + qresults = parse(tab_file, FMT) + counter = itertools.count(start=1) + + # first qresult + qresult, count = next_result(qresults, counter) + self.assertEqual(len(qresult), 1) + self.assertEqual(qresult.id, "ENA|BK006936|BK006936.2") + self.assertEqual(qresult.accession, "-") + self.assertEqual(qresult.clan, "-") + self.assertEqual(qresult.seq_len, 813184) + hit = qresult[0] + self.assertEqual(len(hit), 1) + self.assertEqual(hit.id, "U2") + self.assertEqual(hit.accession, "RF00004") + self.assertEqual(hit.description, "U2 spliceosomal RNA") + self.assertEqual(hit.seq_len, 193) + # first hsp + hsp = hit[0] + self.assertEqual(len(hsp), 1) + self.assertEqual(hsp.evalue, 1.2e-20) + self.assertEqual(hsp.bitscore, 98.7) + self.assertEqual(hsp.bias, 0.1) + self.assertEqual(hsp.gc, 0.33) + self.assertEqual(hsp.truncated, "no") + self.assertEqual(hsp.model, "cm") + self.assertEqual(hsp.pipeline_pass, 1) + self.assertTrue(hsp.is_included) + self.assertEqual("*", hsp.olp) + self.assertEqual("-", hsp.anyidx) + self.assertEqual("-", hsp.afrct1) + self.assertEqual("-", hsp.afrct2) + self.assertEqual("-", hsp.winidx) + self.assertEqual("-", hsp.wfrct1) + self.assertEqual("-", hsp.wfrct2) + frag = hsp[0] + self.assertEqual(frag.query_start, 1) + self.assertEqual(frag.query_end, 193) + self.assertEqual(frag.hit_start, 681747) + self.assertEqual(frag.hit_end, 681858) + self.assertEqual(frag.hit_strand, -1) + + # test if we've properly finished iteration + self.assertRaises(StopIteration, next, qresults) + self.assertEqual(count, 1) + def test_cmscan_mq_mm_fmt3(self): """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 3 (IRES_5S_U2_Yeast_fmt_3)""" tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast_fmt_3.tbl") @@ -250,6 +298,46 @@ class CmscanCases(unittest.TestCase): self.assertRaises(StopIteration, next, qresults) self.assertEqual(count, 1) + def test_cmscan_mq_mm_fmt3_infer(self): + """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 3, inferred (IRES_5S_U2_Yeast_fmt_3)""" + tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast_fmt_3.tbl") + qresults = parse(tab_file, FMT) + counter = itertools.count(start=1) + + # first qresult + qresult, count = next_result(qresults, counter) + self.assertEqual(len(qresult), 1) + self.assertEqual("ENA|BK006936|BK006936.2", qresult.id) + self.assertEqual(qresult.accession, "-") + self.assertEqual(813184, qresult.seq_len) + hit = qresult[0] + self.assertEqual(len(hit), 1) + self.assertEqual(hit.id, "U2") + self.assertEqual(hit.accession, "RF00004") + self.assertEqual(hit.description, "U2 spliceosomal RNA") + self.assertEqual(193, hit.seq_len) + # first hsp + hsp = hit[0] + self.assertEqual(len(hsp), 1) + self.assertEqual(hsp.evalue, 1.2e-20) + self.assertEqual(hsp.bitscore, 98.7) + self.assertEqual(hsp.bias, 0.1) + self.assertEqual(hsp.gc, 0.33) + self.assertEqual(hsp.truncated, "no") + self.assertEqual(hsp.model, "cm") + self.assertEqual(hsp.pipeline_pass, 1) + self.assertTrue(hsp.is_included) + frag = hsp[0] + self.assertEqual(frag.query_start, 1) + self.assertEqual(frag.query_end, 193) + self.assertEqual(frag.hit_start, 681747) + self.assertEqual(frag.hit_end, 681858) + self.assertEqual(frag.hit_strand, -1) + + # test if we've properly finished iteration + self.assertRaises(StopIteration, next, qresults) + self.assertEqual(count, 1) + class CmsearchCases(unittest.TestCase): """Test parsing cmsearch output.""" diff --git a/Tests/test_SearchIO_infernal_tab_index.py b/Tests/test_SearchIO_infernal_tab_index.py index 186a4b46d..28d412568 100644 --- a/Tests/test_SearchIO_infernal_tab_index.py +++ b/Tests/test_SearchIO_infernal_tab_index.py @@ -55,6 +55,13 @@ U2 RF00004 ENA|BK006937|BK006937.2 - cm 1 """ self.check_raw(filename, "ENA|BK006936|BK006936.2", raw, fmt=2) + def test_infernal_tab_multiple_fmt_2_infer(self): + """Test infernal-tab raw string retrieval, cmsearch, multiple queries, fmt 2, inferred.""" + filename = os.path.join("Infernal", "cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl") + raw = """1 U2 RF00004 ENA|BK006936|BK006936.2 - - cm 1 193 681858 681747 - no 1 0.33 0.1 98.7 1.2e-20 ! * - - - - - - 193 813184 U2 spliceosomal RNA +""" + self.check_raw(filename, "ENA|BK006936|BK006936.2", raw) + class InfernalTabIndexCases(CheckIndex): fmt = "infernal-tab" @@ -65,7 +72,7 @@ class InfernalTabIndexCases(CheckIndex): self.check_index(filename, self.fmt) def test_infernal_tab_1q_0m(self): - """Test infernal-tab indexing, cmsearch, single query, not hits.""" + """Test infernal-tab indexing, cmsearch, single query, no hits.""" filename = os.path.join("Infernal", "cmsearch_114_IRES_Yeast.tbl") self.check_index(filename, self.fmt)