adding test, handling duplicate hits, handling dbxrefs, and calculating aln_span

This commit is contained in:
azneto
2018-02-24 11:34:50 -03:00
committed by Wibowo Arindrarto
parent dce9f54939
commit d94c47c4c4
4 changed files with 105 additions and 100 deletions

View File

@ -1,13 +1,27 @@
"""Test."""
"""Bio.SearchIO parser for InterProScan XML output formats."""
# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats
import sys
import re
from Bio.Alphabet import generic_protein
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
from xml.etree import cElementTree as ElementTree
# For speed try to use cElementTree rather than ElementTree
try:
if (3, 0) <= sys.version_info[:2] <= (3, 1):
# Workaround for bug in python 3.0 and 3.1,
# see http://bugs.python.org/issue9257
from xml.etree import ElementTree as ElementTree
else:
from xml.etree import cElementTree as ElementTree
except ImportError:
from xml.etree import ElementTree as ElementTree
# element - hit attribute name mapping
_ELEM_HIT = {
'name': ('id', str),
'ac': ('accession', str),
'name': ('accession', str),
'ac': ('id', str),
'desc': ('description', str),
'library': ('target', str),
'version': ('target_version', str),
@ -44,7 +58,7 @@ class InterproXmlParser(object):
"""Parse the header for the InterProScan version."""
event, elem = next(self.xml_iter)
meta = dict()
meta['target'] = 'Interpro'
meta['target'] = 'InterPro'
meta['program'] = 'InterProScan'
meta['version'] = elem.attrib['interproscan-version']
# store the namespace value
@ -65,8 +79,20 @@ class InterproXmlParser(object):
query_desc = xref.attrib['name']
# parse each hit
hit_list = [hit for hit in self._parse_hit(
elem.find(self.NS + 'matches'), query_id, query_seq)]
hit_list = []
for hit_new in self._parse_hit(
elem.find(self.NS + 'matches'), query_id, query_seq):
# interproscan results contain duplicate hits rather than
# a single hit with multiple hsps. In this case the hsps
# of a duplicate hit will be appended to the already
# existing hit
for hit in hit_list:
if hit.id == hit_new.id:
for hsp in hit_new.hsps:
hit.hsps.append(hsp)
break
else:
hit_list.append(hit_new)
# create qresult and assing attributes
qresult = QueryResult(hit_list, query_id)
@ -86,22 +112,24 @@ class InterproXmlParser(object):
signature = hit_elem.find(self.NS + 'signature')
hit_id = signature.attrib['ac']
# store alt_ids and alt_descs
alt_ids, alt_descs = self._parse_alt_ids(
# store xrefs and alt_descs
xrefs = self._parse_xrefs(
signature.find(self.NS + 'entry'))
# parse each hsp
hsps = [hsp for hsp in self._parse_hsp(
hit_elem.find(self.NS + 'locations'), query_id, hit_id, query_seq)]
hit_elem.find(self.NS + 'locations'),
query_id, hit_id, query_seq)]
# create hit and assign attributes
hit = Hit(hsps, hit_id)
# setattr(hit, '_id_alt', alt_ids)
setattr(hit, 'dbxrefs', xrefs)
for key, (attr, caster) in _ELEM_HIT.items():
value = signature.attrib.get(key)
if value is not None:
setattr(hit, attr, caster(value))
signature_lib = signature.find(self.NS + 'signature-library-release')
signature_lib = signature.find(
self.NS + 'signature-library-release')
for key, (attr, caster) in _ELEM_HIT.items():
value = signature_lib.attrib.get(key)
if value is not None:
@ -127,7 +155,14 @@ class InterproXmlParser(object):
# start should be 0-based
if attr.endswith('start'):
value = caster(value) - 1
# store query start and end to calculate aln_span
if attr == 'query_start':
start = int(value)
if attr == 'query_end':
end = int(value)
setattr(frag, attr, caster(value))
# calculate aln_span and store
setattr(frag, 'aln_span', end - start)
# create hsp and assign attributes
hsp = HSP([frag])
@ -139,34 +174,30 @@ class InterproXmlParser(object):
setattr(hsp, attr, caster(value))
yield hsp
def _parse_alt_ids(self, root_entry_elem):
def _parse_xrefs(self, root_entry_elem):
"""Parse xrefs."""
alt_ids, alt_descs = [], []
xrefs = []
# store entry id and description
if root_entry_elem is not None:
alt_ids.append(root_entry_elem.attrib['ac'])
if (root_entry_elem.attrib['name'] is not None or
root_entry_elem.attrib['desc'] is not None):
alt_descs.append('%s %s %s' % (
root_entry_elem.attrib['ac'],
root_entry_elem.attrib.get('name'),
root_entry_elem.attrib.get('desc')))
xrefs.append('IPR:' + root_entry_elem.attrib['ac'])
# store go-xrefs and pathway-refs id and description
if root_entry_elem is not None:
alt_elem = []
alt_elem = alt_elem + root_entry_elem.findall(
xref_elems = []
xref_elems = xref_elems + root_entry_elem.findall(
self.NS + 'go-xref')
alt_elem = alt_elem + root_entry_elem.findall(
xref_elems = xref_elems + root_entry_elem.findall(
self.NS + 'pathway-xref')
for entry in alt_elem:
if entry.attrib['id'] is not None:
alt_ids.append(entry.attrib['id'])
if (entry.attrib['name'] is not None or
entry.attrib['category'] is not None):
alt_descs.append('%s %s [%s]' % (
entry.attrib['id'],
entry.attrib.get('name'),
entry.attrib.get('db')))
return alt_ids, alt_descs
for entry in xref_elems:
xref = entry.attrib['id']
if ':' not in xref:
xref = entry.attrib['db'] + ':' + xref
xrefs.append(xref)
return xrefs
# if not used as a module, run the doctest
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()

View File

@ -124,8 +124,9 @@ class Hit(_BaseSearchObject):
self._description = None
self._description_alt = []
self._query_description = None
self.target = '<unknown target>'
self.target_version = '<unknown target version>'
self.target = None
self.target_version = None
self.dbxrefs = []
# TODO - Move this into the for look below in case
# hsps is a single use iterator?
@ -186,8 +187,14 @@ class Hit(_BaseSearchObject):
lines.append(hid_line)
# set target line
if self.target is not None:
lines.append(' Target: %s %s' % (self.target, self.target_version))
# set dbxrefs line
if self.dbxrefs:
lines.append("Database cross-references: " +
", ".join(self.dbxrefs))
# set hsp line and table
if not self.hsps:
lines.append(' HSPs: ?')

View File

@ -92,6 +92,14 @@
<mobidblite-location start="136" end="158"/>
</locations>
</mobidblite-match>
<mobidblite-match>
<signature ac="mobidb-lite" desc="consensus disorder prediction" name="disorder_prediction">
<signature-library-release library="MOBIDB_LITE" version="1.0"/>
</signature>
<locations>
<mobidblite-location start="36" end="58"/>
</locations>
</mobidblite-match>
<superfamilyhmmer3-match evalue="2.48E-35">
<signature ac="SSF47113" name="Histone-fold">
<entry ac="IPR009072" desc="Histone-fold" name="Histone-fold" type="HOMOLOGOUS_SUPERFAMILY">

View File

@ -34,82 +34,41 @@ class BlastnXmlCases(unittest.TestCase):
qresult = next(qresults)
counter += 1
self.assertEqual('2.2.12', qresult.version)
self.assertEqual(10.0, qresult.param_evalue_threshold)
self.assertEqual(1, qresult.param_score_match)
self.assertEqual(-3, qresult.param_score_mismatch)
self.assertEqual(5, qresult.param_gap_open)
self.assertEqual(2, qresult.param_gap_extend)
self.assertEqual('5.26-65.0', qresult.version)
# test parsed values of qresult
self.assertEqual('gi|1348916|gb|G26684.1|G26684', qresult.id)
self.assertEqual('human STS STS_D11570, sequence tagged site',
self.assertEqual('AT5G23090.4', qresult.id)
self.assertEqual('pacid=19665592 transcript=AT5G23090.4 locus=AT5G23090 ID=AT5G23090.4.TAIR10 annot-version=TAIR10',
qresult.description)
self.assertEqual(285, qresult.seq_len)
self.assertEqual(371021, qresult.stat_db_num)
self.assertEqual(1233631384, qresult.stat_db_len)
self.assertEqual(0, qresult.stat_eff_space)
self.assertEqual(0.710603, qresult.stat_kappa)
self.assertEqual(1.37406, qresult.stat_lambda)
self.assertEqual(1.30725, qresult.stat_entropy)
self.assertEqual(2, len(qresult))
self.assertEqual(4, len(qresult))
hit = qresult[0]
self.assertEqual('gi|9950606|gb|AE004854.1|', hit.id)
self.assertEqual('Pseudomonas aeruginosa PAO1, section 415 of 529 of the complete genome', hit.description)
self.assertEqual(11884, hit.seq_len)
self.assertEqual(1, len(hit))
self.assertRaises(IndexError, hit.__getitem__, 1)
self.assertEqual('PF00808', hit.id)
self.assertEqual('Histone-like transcription factor (CBF/NF-Y) and archaeal histone', hit.description)
self.assertEqual('PFAM', hit.target)
self.assertEqual('31.0', hit.target_version)
self.assertEqual(2, len(hit))
hsp = hit.hsps[0]
self.assertEqual(38.1576, hsp.bitscore)
self.assertEqual(19, hsp.bitscore_raw)
self.assertEqual(1.0598, hsp.evalue)
self.assertEqual(67, hsp.query_start)
self.assertEqual(86, hsp.query_end)
self.assertEqual(6011, hsp.hit_start)
self.assertEqual(6030, hsp.hit_end)
self.assertEqual(1, hsp.query_frame)
self.assertEqual(1, hsp.hit_frame)
self.assertEqual(19, hsp.ident_num)
self.assertEqual(19, hsp.pos_num)
self.assertEqual(0, hsp.gap_num)
self.assertEqual(19, hsp.aln_span)
self.assertEqual('CAGGCCAGCGACTTCTGGG', str(hsp.query.seq))
self.assertEqual('CAGGCCAGCGACTTCTGGG', str(hsp.hit.seq))
self.assertEqual('|||||||||||||||||||', hsp.aln_annotation['similarity'])
self.assertRaises(IndexError, hit.__getitem__, 1)
self.assertEqual(76.7, hsp.bitscore)
self.assertEqual(1.1e-21, hsp.evalue)
self.assertEqual(13, hsp.query_start)
self.assertEqual(79, hsp.query_end)
self.assertEqual(0, hsp.hit_start)
self.assertEqual(65, hsp.hit_end)
self.assertEqual(66, hsp.aln_span)
self.assertEqual('MDPMDIVGKSKEDASLPKATMTKIIKEMLPPDVRVARDAQDLLIECCVEFINLVSSESNDVCNKEDKRTIAPEHVLKALQVLGFGEYIEEVYAAYEQHKYETMDTQRSVKWNPGAQMTEEEAAAEQQRMFAEARARMNGGVSVPQPEHPETDQRSPQS', str(hsp.query.seq))
# parse last hit
hit = qresult[-1]
self.assertEqual('gi|15073988|emb|AL591786.1|SME591786', hit.id)
self.assertEqual('Sinorhizobium meliloti 1021 complete chromosome; segment 5/12', hit.description)
self.assertEqual(299350, hit.seq_len)
self.assertEqual('SSF47113', hit.id)
self.assertEqual(1, len(hit))
self.assertRaises(IndexError, hit.__getitem__, 1)
self.assertEqual('IPR:IPR009072', hit.dbxrefs[0])
self.assertEqual('GO:0046982', hit.dbxrefs[1])
hsp = hit.hsps[0]
self.assertEqual(36.1753, hsp.bitscore)
self.assertEqual(18, hsp.bitscore_raw)
self.assertEqual(4.18768, hsp.evalue)
self.assertEqual(203, hsp.query_start)
self.assertEqual(224, hsp.query_end)
self.assertEqual(83627, hsp.hit_start)
self.assertEqual(83648, hsp.hit_end)
self.assertEqual(1, hsp.query_frame)
self.assertEqual(-1, hsp.hit_frame)
self.assertEqual(20, hsp.ident_num)
self.assertEqual(20, hsp.pos_num)
self.assertEqual(0, hsp.gap_num)
self.assertEqual(21, hsp.aln_span)
self.assertEqual('TGAAAGGAAATNAAAATGGAA', str(hsp.query.seq))
self.assertEqual('TGAAAGGAAATCAAAATGGAA', str(hsp.hit.seq))
self.assertEqual('||||||||||| |||||||||', hsp.aln_annotation['similarity'])
self.assertRaises(IndexError, hit.__getitem__, 1)
# check if we've finished iteration over qresults
self.assertRaises(StopIteration, next, qresults)
self.assertEqual(1, counter)
self.assertEqual(11, hsp.query_start)
self.assertEqual(141, hsp.query_end)
if __name__ == "__main__":