Add support and test cases for hmmscan 3.1b1 tab output

This commit is contained in:
bow
2014-05-23 01:25:26 +02:00
parent 684d36cddc
commit b5be0f125e
4 changed files with 88 additions and 1 deletions

View File

@ -97,7 +97,10 @@ class Hmmer3TabParser(object):
prev = cur
prev_qid = cur_qid
# only parse the result row if it's not EOF
if self.line:
# NOTE: we are not parsing the extra '#' lines appended to the end
# of hmmer31b1 tabular results since storing them in qresult
# objects means we cannot do single-pass parsing
if self.line and not self.line.startswith('#'):
cur = self._parse_row()
cur_qid = cur['qresult']['id']
else:

View File

@ -24,6 +24,7 @@ text_30_hmmsearch_003.out single query, multiple matches, multiple hsps per m
text_30_hmmsearch_004.out single query, multiple matches, multiple hsps per match, no alignment width
text_30_hmmsearch_005.out multiple queries
tab_31b1_hmmscan_001.out multiple queries
tab_30_hmmscan_001.out multiple queries
tab_30_hmmscan_002.out single query, no match
tab_30_hmmscan_003.out single query, one match, one hsp per match

View File

@ -0,0 +1,24 @@
# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- ---------------------
Globin PF00042.17 gi|4885477|ref|NP_005359.1| - 1e-22 80.5 0.3 1.6e-22 79.8 0.3 1.3 1 0 0 1 1 1 1 Globin
Ig_3 PF13927.1 gi|126362951|ref|NP_001075106.1| - 9.3e-10 38.8 0.4 1.4e-09 38.3 0.4 1.3 1 0 0 1 1 1 1 Immunoglobulin domain
Ig_2 PF13895.1 gi|126362951|ref|NP_001075106.1| - 1.6e-06 28.1 0.1 2e-06 27.8 0.1 1.1 1 0 0 1 1 1 1 Immunoglobulin domain
Xpo1 PF08389.7 gi|22748937|ref|NP_065801.1| - 8.5e-34 116.6 7.8 1.2e-33 116.1 4.9 2.8 2 0 0 2 2 2 1 Exportin 1-like protein
IBN_N PF03810.14 gi|22748937|ref|NP_065801.1| - 0.0044 16.9 0.0 0.036 13.9 0.0 2.7 2 0 0 2 2 2 1 Importin-beta N-terminal domain
Rac1 PF09632.5 gi|22748937|ref|NP_065801.1| - 0.095 11.7 0.3 1.3 8.0 0.3 2.3 2 0 0 2 2 2 0 Rac1-binding domain
Pou PF00157.12 gi|125490392|ref|NP_038661.2| - 7.6e-37 124.8 0.5 1.5e-36 123.9 0.5 1.5 1 0 0 1 1 1 1 Pou domain - N-terminal to homeobox domain
Homeobox PF00046.24 gi|125490392|ref|NP_038661.2| - 1.8e-18 65.8 1.1 3.4e-18 64.9 1.1 1.5 1 0 0 1 1 1 1 Homeobox domain
HTH_31 PF13560.1 gi|125490392|ref|NP_038661.2| - 0.013 15.6 0.0 0.18 12.0 0.0 2.2 2 0 0 2 2 2 0 Helix-turn-helix domain
Homeobox_KN PF05920.6 gi|125490392|ref|NP_038661.2| - 0.043 13.5 0.0 0.1 12.2 0.0 1.6 1 0 0 1 1 1 0 Homeobox KN domain
DUF521 PF04412.8 gi|125490392|ref|NP_038661.2| - 0.15 10.5 0.1 0.28 9.6 0.1 1.4 1 0 0 1 1 1 0 Protein of unknown function (DUF521)
#
# Program: hmmscan
# Version: 3.1b1 (May 2013)
# Pipeline mode: SCAN
# Query file: prot_multi.fa
# Target file: /home/bow/db/hmmer/protdb/Pfam-A.hmm
# Option settings: hmmscan -o hmmscan/text_31b1_hmmscan_001.out --tblout hmmscan/tab_31b1_hmmscan_001.out --domtblout hmmscan/domtab_31b1_hmmscan_001.out --pfamtblout hmmscan/pfamtab_31b1_hmmscan_001.out --cpu 2 /home/bow/db/hmmer/protdb/Pfam-A.hmm prot_multi.fa
# Current dir: /home/bow/devel/sandbox/biopy_cases
# Date: Sun May 11 21:26:57 2014
# [ok]

View File

@ -28,6 +28,65 @@ def get_file(filename):
class HmmscanCases(unittest.TestCase):
def test_31b1_hmmscan_001(self):
    """Test parsing hmmer3-tab, hmmscan 3.1b1, multiple queries (tab_31b1_hmmscan_001)"""
    # The 3.1b1 fixture ends with a block of '#'-prefixed lines after the
    # last result row; only the data rows should become query results.
    parsed = list(parse(get_file('tab_31b1_hmmscan_001.out'), FMT))
    self.assertEqual(4, len(parsed))

    # first qresult, first hit, first hsp
    first_query = parsed[0]
    self.assertEqual(1, len(first_query))
    self.assertEqual('gi|4885477|ref|NP_005359.1|', first_query.id)
    self.assertEqual('-', first_query.acc)
    first_hit = first_query[0]
    self.assertEqual(1, len(first_hit))
    # (attribute name, expected value) pairs, checked in column order
    for attr, expected in (
        ('id', 'Globin'),
        ('acc', 'PF00042.17'),
        ('evalue', 1e-22),
        ('bitscore', 80.5),
        ('bias', 0.3),
        ('domain_exp_num', 1.3),
        ('region_num', 1),
        ('cluster_num', 0),
        ('overlap_num', 0),
        ('env_num', 1),
        ('domain_obs_num', 1),
        ('domain_reported_num', 1),
        ('domain_included_num', 1),
        ('description', 'Globin'),
    ):
        self.assertEqual(expected, getattr(first_hit, attr))
    first_hsp = first_hit.hsps[0]
    for attr, expected in (
        ('evalue', 1.6e-22),
        ('bitscore', 79.8),
        ('bias', 0.3),
    ):
        self.assertEqual(expected, getattr(first_hsp, attr))

    # last qresult, last hit, last hsp
    last_query = parsed[-1]
    self.assertEqual(5, len(last_query))
    self.assertEqual('gi|125490392|ref|NP_038661.2|', last_query.id)
    self.assertEqual('-', last_query.acc)
    last_hit = last_query[-1]
    self.assertEqual(1, len(last_hit))
    for attr, expected in (
        ('id', 'DUF521'),
        ('acc', 'PF04412.8'),
        ('evalue', 0.15),
        ('bitscore', 10.5),
        ('bias', 0.1),
        ('domain_exp_num', 1.4),
        ('region_num', 1),
        ('cluster_num', 0),
        ('overlap_num', 0),
        ('env_num', 1),
        ('domain_obs_num', 1),
        ('domain_reported_num', 1),
        ('domain_included_num', 0),
        ('description', 'Protein of unknown function (DUF521)'),
    ):
        self.assertEqual(expected, getattr(last_hit, attr))
    last_hsp = last_hit.hsps[0]
    for attr, expected in (
        ('evalue', 0.28),
        ('bitscore', 9.6),
        ('bias', 0.1),
    ):
        self.assertEqual(expected, getattr(last_hsp, attr))
def test_30_hmmscan_001(self):
"Test parsing hmmer3-tab, hmmscan 3.0, multiple queries (tab_30_hmmscan_001)"