Files
biopython/Tests/test_SearchIO_infernal_tab.py
2025-03-22 06:59:36 +01:00

507 lines
20 KiB
Python

# Copyright 2024 by Samuel Prince. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Tests for SearchIO InfernalIO infernal-tab parser."""
import os
import unittest
import itertools
from Bio.SearchIO import parse
# Test case files are in the Infernal directory. The path is relative, so it
# is resolved against the working directory the tests are started from.
TEST_DIR = "Infernal"
# Format name passed to Bio.SearchIO.parse() by the tests below.
FMT = "infernal-tab"
def get_file(filename):
    """Return the path of a test file.

    ``filename`` is the bare name of a fixture; it is joined onto
    ``TEST_DIR`` with the platform's path separator.
    """
    return os.path.join(TEST_DIR, filename)
def next_result(qresults, counter):
    """Advance the results iterator and the counter together.

    Returns a ``(qresult, count)`` tuple holding the next item of each
    iterator. ``qresults`` is advanced first, so if it is exhausted the
    ``StopIteration`` propagates and ``counter`` is left untouched.
    """
    qresult = next(qresults)
    count = next(counter)
    return qresult, count
class CmscanCases(unittest.TestCase):
    """Test parsing cmscan output.

    Each test parses one tabular fixture and walks the resulting
    QueryResult -> Hit -> HSP -> HSPFragment hierarchy, checking the parsed
    attributes and that the parser yields exactly the expected number of
    query results.
    """

    def test_cmscan_mq_mm(self):
        """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, default format (IRES_5S_U2_Yeast)"""
        tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        # first qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "ENA|BK006935|BK006935.2")
        self.assertEqual(qresult.accession, "-")
        hit = qresult[0]
        self.assertEqual(len(hit), 2)
        self.assertEqual(hit.id, "U2")
        self.assertEqual(hit.accession, "RF00004")
        self.assertEqual(hit.description, "U2 spliceosomal RNA")
        # first hsp
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 0.91)
        self.assertEqual(hsp.bitscore, 13.5)
        self.assertEqual(hsp.bias, 0.0)
        self.assertEqual(hsp.gc, 0.44)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertFalse(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 52929)
        self.assertEqual(frag.hit_end, 53083)
        self.assertEqual(frag.hit_strand, 0)
        # second hsp
        hsp = hit[1]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 1.3)
        self.assertEqual(hsp.bitscore, 12.8)
        self.assertEqual(hsp.bias, 5.3)
        self.assertEqual(hsp.gc, 0.33)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertFalse(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 196389)
        self.assertEqual(frag.hit_end, 196571)
        self.assertEqual(frag.hit_strand, -1)

        # second qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "ENA|BK006936|BK006936.2")
        self.assertEqual(qresult.accession, "-")
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "U2")
        self.assertEqual(hit.accession, "RF00004")
        self.assertEqual(hit.description, "U2 spliceosomal RNA")
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 1.2e-20)
        self.assertEqual(hsp.bitscore, 98.7)
        self.assertEqual(hsp.bias, 0.1)
        self.assertEqual(hsp.gc, 0.33)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 681747)
        self.assertEqual(frag.hit_end, 681858)
        self.assertEqual(frag.hit_strand, -1)

        # third qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 2)
        self.assertEqual(qresult.id, "ENA|BK006937|BK006937.2")
        self.assertEqual(qresult.accession, "-")
        # first hit
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "5S_rRNA")
        self.assertEqual(hit.accession, "RF00001")
        self.assertEqual(hit.description, "5S ribosomal RNA")
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 2.4)
        self.assertEqual(hsp.bitscore, 14.1)
        self.assertEqual(hsp.bias, 0.3)
        self.assertEqual(hsp.gc, 0.41)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertFalse(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 119)
        self.assertEqual(frag.hit_start, 644)
        self.assertEqual(frag.hit_end, 761)
        self.assertEqual(frag.hit_strand, -1)
        # second hit
        hit = qresult[1]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "U2")
        self.assertEqual(hit.accession, "RF00004")
        self.assertEqual(hit.description, "U2 spliceosomal RNA")
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 4.7)
        self.assertEqual(hsp.bitscore, 11.1)
        self.assertEqual(hsp.bias, 0.1)
        self.assertEqual(hsp.gc, 0.32)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertFalse(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 229885)
        self.assertEqual(frag.hit_end, 229986)
        self.assertEqual(frag.hit_strand, -1)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 3)

    def test_cmscan_mq_mm_fmt2(self):
        """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 2 (IRES_5S_U2_Yeast_fmt_2)"""
        tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast_fmt_2.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        # first qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "ENA|BK006936|BK006936.2")
        self.assertEqual(qresult.accession, "-")
        self.assertEqual(qresult.clan, "-")
        self.assertEqual(qresult.seq_len, 813184)
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "U2")
        self.assertEqual(hit.accession, "RF00004")
        self.assertEqual(hit.description, "U2 spliceosomal RNA")
        self.assertEqual(hit.seq_len, 193)
        # first hsp
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 1.2e-20)
        self.assertEqual(hsp.bitscore, 98.7)
        self.assertEqual(hsp.bias, 0.1)
        self.assertEqual(hsp.gc, 0.33)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        self.assertEqual(hsp.olp, "*")
        # the overlap fields are expected to be unset for this hsp
        self.assertIsNone(hsp.anyidx)
        self.assertIsNone(hsp.afrct1)
        self.assertIsNone(hsp.afrct2)
        self.assertIsNone(hsp.winidx)
        self.assertIsNone(hsp.wfrct1)
        self.assertIsNone(hsp.wfrct2)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 681747)
        self.assertEqual(frag.hit_end, 681858)
        self.assertEqual(frag.hit_strand, -1)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)

    def test_cmscan_mq_mm_fmt2_clan(self):
        """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 2, multiple clan (SSU_clan_fmt_2)"""
        tab_file = get_file("cmscan_115_SSU_clan-fmt_2.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        # first qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 2)
        self.assertEqual(qresult.id, "ENA|BK006945|BK006945.2")
        self.assertEqual(qresult.accession, "-")
        self.assertEqual(qresult.clan, "CL00111")
        self.assertEqual(qresult.seq_len, 1078177)
        # first hit
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "SSU_rRNA_eukarya")
        self.assertEqual(hit.accession, "RF01960")
        self.assertEqual(hit.description, "Eukaryotic small subunit ribosomal RNA")
        self.assertEqual(hit.seq_len, 1831)
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 0)
        self.assertEqual(hsp.bitscore, 1817.8)
        self.assertEqual(hsp.bias, 9.5)
        self.assertEqual(hsp.gc, 0.45)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        self.assertEqual(hsp.olp, "^")
        self.assertIsNone(hsp.anyidx)
        self.assertIsNone(hsp.afrct1)
        self.assertIsNone(hsp.afrct2)
        self.assertIsNone(hsp.winidx)
        self.assertIsNone(hsp.wfrct1)
        self.assertIsNone(hsp.wfrct2)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 1831)
        self.assertEqual(frag.hit_start, 455933)
        self.assertEqual(frag.hit_end, 457732)
        self.assertEqual(frag.hit_strand, -1)
        # second hit (overlapping with the first hit)
        hit = qresult[1]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "SSU_rRNA_archaea")
        self.assertEqual(hit.accession, "RF01959")
        self.assertEqual(hit.description, "Archaeal small subunit ribosomal RNA")
        self.assertEqual(hit.seq_len, 1478)
        # first hsp
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 5.8e-187)
        self.assertEqual(hsp.bitscore, 633.9)
        self.assertEqual(hsp.bias, 9.7)
        self.assertEqual(hsp.gc, 0.45)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        self.assertEqual(hsp.olp, "=")
        self.assertEqual(hsp.anyidx, 1)
        self.assertEqual(hsp.afrct1, 0.998)
        self.assertEqual(hsp.afrct2, 1.000)
        self.assertIsNone(hsp.winidx)
        self.assertIsNone(hsp.wfrct1)
        self.assertIsNone(hsp.wfrct2)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 1478)
        self.assertEqual(frag.hit_start, 455930)
        self.assertEqual(frag.hit_end, 457732)
        self.assertEqual(frag.hit_strand, -1)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)

    def test_cmscan_mq_mm_fmt3(self):
        """Test parsing infernal-tab, cmscan, multiple queries, multiple hit, one hsp, fmt 3 (IRES_5S_U2_Yeast_fmt_3)"""
        tab_file = get_file("cmscan_115_IRES_5S_U2_Yeast_fmt_3.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        # first qresult
        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "ENA|BK006936|BK006936.2")
        self.assertEqual(qresult.accession, "-")
        self.assertEqual(qresult.seq_len, 813184)
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "U2")
        self.assertEqual(hit.accession, "RF00004")
        self.assertEqual(hit.description, "U2 spliceosomal RNA")
        self.assertEqual(hit.seq_len, 193)
        # first hsp
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 1.2e-20)
        self.assertEqual(hsp.bitscore, 98.7)
        self.assertEqual(hsp.bias, 0.1)
        self.assertEqual(hsp.gc, 0.33)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 681747)
        self.assertEqual(frag.hit_end, 681858)
        self.assertEqual(frag.hit_strand, -1)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)
class CmsearchCases(unittest.TestCase):
    """Test parsing cmsearch output.

    Each test parses one tabular fixture and checks the attributes of the
    resulting QueryResult, Hit, HSP and HSPFragment objects, then verifies
    that the parser is exhausted after the expected number of query results.
    """

    def test_1q_0m(self):
        """Test parsing infernal-tab, cmsearch, one query, no hits (IRES_Yeast)"""
        tab_file = get_file("cmsearch_114_IRES_Yeast.tbl")
        qresults = parse(tab_file, FMT)

        # a table without hits must not yield any query result at all
        self.assertRaises(StopIteration, next, qresults)

    def test_cmsearch_1q_1m(self):
        """Test parsing infernal-tab, cmsearch, one query, one hit, one hsp (U2_Yeast)"""
        tab_file = get_file("cmsearch_114_U2_Yeast.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "U2")
        self.assertEqual(qresult.accession, "RF00004")
        hit = qresult[0]
        self.assertEqual(len(hit), 1)
        self.assertEqual(hit.id, "ENA|BK006936|BK006936.2")
        self.assertEqual(hit.accession, "-")
        self.assertEqual(
            hit.description,
            "TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence.",
        )
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 5.9e-20)
        self.assertEqual(hsp.bitscore, 98.7)
        self.assertEqual(hsp.bias, 0.1)
        self.assertTrue(hsp.is_included)
        self.assertEqual(hsp.gc, 0.33)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 193)
        self.assertEqual(frag.hit_start, 681747)
        self.assertEqual(frag.hit_end, 681858)
        self.assertEqual(frag.hit_strand, -1)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)

    def test_cmsearch_1q_mm(self):
        """Test parsing infernal-tab, cmsearch, one query, multiple hit, one hsp (5S_Yeast)"""
        tab_file = get_file("cmsearch_114_5S_Yeast.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 1)
        self.assertEqual(qresult.id, "5S_rRNA")
        self.assertEqual(qresult.accession, "RF00001")
        # first hit
        hit = qresult[0]
        self.assertEqual(len(hit), 6)
        self.assertEqual(hit.id, "ENA|BK006945|BK006945.2")
        self.assertEqual(hit.accession, "-")
        self.assertEqual(
            hit.description,
            "TPA_inf: Saccharomyces cerevisiae S288C chromosome XII, complete sequence.",
        )
        # first hsp
        hsp = hit[0]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 1.6e-18)
        self.assertEqual(hsp.bitscore, 88.8)
        self.assertEqual(hsp.bias, 0.0)
        self.assertEqual(hsp.gc, 0.52)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.model, "cm")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 119)
        self.assertEqual(frag.hit_start, 459676)
        self.assertEqual(frag.hit_end, 459796)
        self.assertEqual(frag.hit_strand, 0)
        # last hsp
        hsp = hit[-1]
        self.assertEqual(len(hsp), 1)
        self.assertEqual(hsp.evalue, 4.4e-17)
        self.assertEqual(hsp.bitscore, 83.2)
        self.assertEqual(hsp.bias, 0.0)
        self.assertEqual(hsp.gc, 0.53)
        self.assertEqual(hsp.truncated, "no")
        self.assertEqual(hsp.pipeline_pass, 1)
        self.assertTrue(hsp.is_included)
        frag = hsp[0]
        self.assertEqual(frag.query_start, 1)
        self.assertEqual(frag.query_end, 119)
        self.assertEqual(frag.hit_start, 485697)
        self.assertEqual(frag.hit_end, 485817)
        self.assertEqual(frag.hit_strand, 0)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)

    def test_cmsearch_1q_mm_shuf(self):
        """Test parsing infernal-tab, cmsearch, one query, multiple non-consecutive hits, one hsp (U2_Yeast_full_shuffled)"""
        tab_file = get_file("cmsearch_114_U2_Yeast_full_shuffled.tbl")
        qresults = parse(tab_file, FMT)
        counter = itertools.count(start=1)

        qresult, count = next_result(qresults, counter)
        self.assertEqual(len(qresult), 2)
        self.assertEqual(qresult.id, "U2")
        self.assertEqual(qresult.accession, "RF00004")
        # first hit (3 hsps at rank 1,3 and 4)
        hit = qresult[0]
        self.assertEqual(len(hit), 3)
        self.assertEqual(hit.id, "ENA|BK006936|BK006936.2")
        self.assertEqual(hit.description, "-")
        self.assertEqual(hit.query_id, "U2")
        # first hsp (rank 1)
        hsp = hit[0]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 681747)
        self.assertEqual(hsp.hit_end, 681858)
        # second hsp (rank 3)
        hsp = hit[1]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 1370418)
        self.assertEqual(hsp.hit_end, 1370563)
        # last hsp (rank 4)
        hsp = hit[2]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 1079243)
        self.assertEqual(hsp.hit_end, 1079392)
        # second hit
        hit = qresult[1]
        self.assertEqual(len(hit), 3)
        self.assertEqual(hit.id, "ENA|BK006948|BK006948.2")
        self.assertEqual(hit.description, "-")
        self.assertEqual(hit.query_id, "U2")
        # first hsp (rank 2)
        hsp = hit[0]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 737324)
        self.assertEqual(hsp.hit_end, 737498)
        # second hsp (rank 5)
        hsp = hit[1]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 425490)
        self.assertEqual(hsp.hit_end, 425693)
        # last hsp (rank 6)
        hsp = hit[2]
        self.assertEqual(hsp.query_start, 1)
        self.assertEqual(hsp.query_end, 193)
        self.assertEqual(hsp.hit_start, 1073786)
        self.assertEqual(hsp.hit_end, 1073950)

        # test if we've properly finished iteration
        self.assertRaises(StopIteration, next, qresults)
        self.assertEqual(count, 1)
if __name__ == "__main__":
    # Run this module's tests directly; verbosity=2 reports each test
    # individually instead of printing a single progress dot per test.
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)