mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
adding test, handling duplicate hits, handling dbxrefs, and calculating aln_span
This commit is contained in:
committed by
Wibowo Arindrarto
parent
dce9f54939
commit
d94c47c4c4
@ -1,13 +1,27 @@
|
||||
"""Test."""
|
||||
"""Bio.SearchIO parser for InterProScan XML output formats."""
|
||||
# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats
|
||||
|
||||
import sys
|
||||
import re
|
||||
from Bio.Alphabet import generic_protein
|
||||
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
|
||||
from xml.etree import cElementTree as ElementTree
|
||||
|
||||
# For speed try to use cElementTree rather than ElementTree
|
||||
try:
|
||||
if (3, 0) <= sys.version_info[:2] <= (3, 1):
|
||||
# Workaround for bug in python 3.0 and 3.1,
|
||||
# see http://bugs.python.org/issue9257
|
||||
from xml.etree import ElementTree as ElementTree
|
||||
else:
|
||||
from xml.etree import cElementTree as ElementTree
|
||||
except ImportError:
|
||||
from xml.etree import ElementTree as ElementTree
|
||||
|
||||
|
||||
# element - hit attribute name mapping
|
||||
_ELEM_HIT = {
|
||||
'name': ('id', str),
|
||||
'ac': ('accession', str),
|
||||
'name': ('accession', str),
|
||||
'ac': ('id', str),
|
||||
'desc': ('description', str),
|
||||
'library': ('target', str),
|
||||
'version': ('target_version', str),
|
||||
@ -44,7 +58,7 @@ class InterproXmlParser(object):
|
||||
"""Parse the header for the InterProScan version."""
|
||||
event, elem = next(self.xml_iter)
|
||||
meta = dict()
|
||||
meta['target'] = 'Interpro'
|
||||
meta['target'] = 'InterPro'
|
||||
meta['program'] = 'InterProScan'
|
||||
meta['version'] = elem.attrib['interproscan-version']
|
||||
# store the namespace value
|
||||
@ -65,8 +79,20 @@ class InterproXmlParser(object):
|
||||
query_desc = xref.attrib['name']
|
||||
|
||||
# parse each hit
|
||||
hit_list = [hit for hit in self._parse_hit(
|
||||
elem.find(self.NS + 'matches'), query_id, query_seq)]
|
||||
hit_list = []
|
||||
for hit_new in self._parse_hit(
|
||||
elem.find(self.NS + 'matches'), query_id, query_seq):
|
||||
# interproscan results contain duplicate hits rather than
|
||||
# a single hit with multiple hsps. In this case the hsps
|
||||
# of a duplicate hit will be appended to the already
|
||||
# existing hit
|
||||
for hit in hit_list:
|
||||
if hit.id == hit_new.id:
|
||||
for hsp in hit_new.hsps:
|
||||
hit.hsps.append(hsp)
|
||||
break
|
||||
else:
|
||||
hit_list.append(hit_new)
|
||||
|
||||
# create qresult and assing attributes
|
||||
qresult = QueryResult(hit_list, query_id)
|
||||
@ -86,22 +112,24 @@ class InterproXmlParser(object):
|
||||
signature = hit_elem.find(self.NS + 'signature')
|
||||
hit_id = signature.attrib['ac']
|
||||
|
||||
# store alt_ids and alt_descs
|
||||
alt_ids, alt_descs = self._parse_alt_ids(
|
||||
# store xrefs and alt_descs
|
||||
xrefs = self._parse_xrefs(
|
||||
signature.find(self.NS + 'entry'))
|
||||
|
||||
# parse each hsp
|
||||
hsps = [hsp for hsp in self._parse_hsp(
|
||||
hit_elem.find(self.NS + 'locations'), query_id, hit_id, query_seq)]
|
||||
hit_elem.find(self.NS + 'locations'),
|
||||
query_id, hit_id, query_seq)]
|
||||
|
||||
# create hit and assign attributes
|
||||
hit = Hit(hsps, hit_id)
|
||||
# setattr(hit, '_id_alt', alt_ids)
|
||||
setattr(hit, 'dbxrefs', xrefs)
|
||||
for key, (attr, caster) in _ELEM_HIT.items():
|
||||
value = signature.attrib.get(key)
|
||||
if value is not None:
|
||||
setattr(hit, attr, caster(value))
|
||||
signature_lib = signature.find(self.NS + 'signature-library-release')
|
||||
signature_lib = signature.find(
|
||||
self.NS + 'signature-library-release')
|
||||
for key, (attr, caster) in _ELEM_HIT.items():
|
||||
value = signature_lib.attrib.get(key)
|
||||
if value is not None:
|
||||
@ -127,7 +155,14 @@ class InterproXmlParser(object):
|
||||
# start should be 0-based
|
||||
if attr.endswith('start'):
|
||||
value = caster(value) - 1
|
||||
# store query start and end to calculate aln_span
|
||||
if attr == 'query_start':
|
||||
start = int(value)
|
||||
if attr == 'query_end':
|
||||
end = int(value)
|
||||
setattr(frag, attr, caster(value))
|
||||
# calculate aln_span and store
|
||||
setattr(frag, 'aln_span', end - start)
|
||||
|
||||
# create hsp and assign attributes
|
||||
hsp = HSP([frag])
|
||||
@ -139,34 +174,30 @@ class InterproXmlParser(object):
|
||||
setattr(hsp, attr, caster(value))
|
||||
yield hsp
|
||||
|
||||
def _parse_alt_ids(self, root_entry_elem):
|
||||
def _parse_xrefs(self, root_entry_elem):
|
||||
"""Parse xrefs."""
|
||||
alt_ids, alt_descs = [], []
|
||||
xrefs = []
|
||||
# store entry id and description
|
||||
if root_entry_elem is not None:
|
||||
alt_ids.append(root_entry_elem.attrib['ac'])
|
||||
if (root_entry_elem.attrib['name'] is not None or
|
||||
root_entry_elem.attrib['desc'] is not None):
|
||||
alt_descs.append('%s %s %s' % (
|
||||
root_entry_elem.attrib['ac'],
|
||||
root_entry_elem.attrib.get('name'),
|
||||
root_entry_elem.attrib.get('desc')))
|
||||
xrefs.append('IPR:' + root_entry_elem.attrib['ac'])
|
||||
|
||||
# store go-xrefs and pathway-refs id and description
|
||||
if root_entry_elem is not None:
|
||||
alt_elem = []
|
||||
alt_elem = alt_elem + root_entry_elem.findall(
|
||||
xref_elems = []
|
||||
xref_elems = xref_elems + root_entry_elem.findall(
|
||||
self.NS + 'go-xref')
|
||||
alt_elem = alt_elem + root_entry_elem.findall(
|
||||
xref_elems = xref_elems + root_entry_elem.findall(
|
||||
self.NS + 'pathway-xref')
|
||||
|
||||
for entry in alt_elem:
|
||||
if entry.attrib['id'] is not None:
|
||||
alt_ids.append(entry.attrib['id'])
|
||||
if (entry.attrib['name'] is not None or
|
||||
entry.attrib['category'] is not None):
|
||||
alt_descs.append('%s %s [%s]' % (
|
||||
entry.attrib['id'],
|
||||
entry.attrib.get('name'),
|
||||
entry.attrib.get('db')))
|
||||
return alt_ids, alt_descs
|
||||
for entry in xref_elems:
|
||||
xref = entry.attrib['id']
|
||||
if ':' not in xref:
|
||||
xref = entry.attrib['db'] + ':' + xref
|
||||
xrefs.append(xref)
|
||||
return xrefs
|
||||
|
||||
|
||||
# if not used as a module, run the doctest
|
||||
if __name__ == "__main__":
|
||||
from Bio._utils import run_doctest
|
||||
run_doctest()
|
||||
|
@ -124,8 +124,9 @@ class Hit(_BaseSearchObject):
|
||||
self._description = None
|
||||
self._description_alt = []
|
||||
self._query_description = None
|
||||
self.target = '<unknown target>'
|
||||
self.target_version = '<unknown target version>'
|
||||
self.target = None
|
||||
self.target_version = None
|
||||
self.dbxrefs = []
|
||||
|
||||
# TODO - Move this into the for look below in case
|
||||
# hsps is a single use iterator?
|
||||
@ -186,7 +187,13 @@ class Hit(_BaseSearchObject):
|
||||
lines.append(hid_line)
|
||||
|
||||
# set target line
|
||||
lines.append(' Target: %s %s' % (self.target, self.target_version))
|
||||
if self.target is not None:
|
||||
lines.append(' Target: %s %s' % (self.target, self.target_version))
|
||||
|
||||
# set dbxrefs line
|
||||
if self.dbxrefs:
|
||||
lines.append("Database cross-references: " +
|
||||
", ".join(self.dbxrefs))
|
||||
|
||||
# set hsp line and table
|
||||
if not self.hsps:
|
||||
|
@ -92,6 +92,14 @@
|
||||
<mobidblite-location start="136" end="158"/>
|
||||
</locations>
|
||||
</mobidblite-match>
|
||||
<mobidblite-match>
|
||||
<signature ac="mobidb-lite" desc="consensus disorder prediction" name="disorder_prediction">
|
||||
<signature-library-release library="MOBIDB_LITE" version="1.0"/>
|
||||
</signature>
|
||||
<locations>
|
||||
<mobidblite-location start="36" end="58"/>
|
||||
</locations>
|
||||
</mobidblite-match>
|
||||
<superfamilyhmmer3-match evalue="2.48E-35">
|
||||
<signature ac="SSF47113" name="Histone-fold">
|
||||
<entry ac="IPR009072" desc="Histone-fold" name="Histone-fold" type="HOMOLOGOUS_SUPERFAMILY">
|
||||
|
@ -34,82 +34,41 @@ class BlastnXmlCases(unittest.TestCase):
|
||||
qresult = next(qresults)
|
||||
counter += 1
|
||||
|
||||
self.assertEqual('2.2.12', qresult.version)
|
||||
self.assertEqual(10.0, qresult.param_evalue_threshold)
|
||||
self.assertEqual(1, qresult.param_score_match)
|
||||
self.assertEqual(-3, qresult.param_score_mismatch)
|
||||
self.assertEqual(5, qresult.param_gap_open)
|
||||
self.assertEqual(2, qresult.param_gap_extend)
|
||||
self.assertEqual('5.26-65.0', qresult.version)
|
||||
|
||||
# test parsed values of qresult
|
||||
self.assertEqual('gi|1348916|gb|G26684.1|G26684', qresult.id)
|
||||
self.assertEqual('human STS STS_D11570, sequence tagged site',
|
||||
self.assertEqual('AT5G23090.4', qresult.id)
|
||||
self.assertEqual('pacid=19665592 transcript=AT5G23090.4 locus=AT5G23090 ID=AT5G23090.4.TAIR10 annot-version=TAIR10',
|
||||
qresult.description)
|
||||
self.assertEqual(285, qresult.seq_len)
|
||||
self.assertEqual(371021, qresult.stat_db_num)
|
||||
self.assertEqual(1233631384, qresult.stat_db_len)
|
||||
self.assertEqual(0, qresult.stat_eff_space)
|
||||
self.assertEqual(0.710603, qresult.stat_kappa)
|
||||
self.assertEqual(1.37406, qresult.stat_lambda)
|
||||
self.assertEqual(1.30725, qresult.stat_entropy)
|
||||
self.assertEqual(2, len(qresult))
|
||||
self.assertEqual(4, len(qresult))
|
||||
|
||||
hit = qresult[0]
|
||||
self.assertEqual('gi|9950606|gb|AE004854.1|', hit.id)
|
||||
self.assertEqual('Pseudomonas aeruginosa PAO1, section 415 of 529 of the complete genome', hit.description)
|
||||
self.assertEqual(11884, hit.seq_len)
|
||||
self.assertEqual(1, len(hit))
|
||||
self.assertRaises(IndexError, hit.__getitem__, 1)
|
||||
self.assertEqual('PF00808', hit.id)
|
||||
self.assertEqual('Histone-like transcription factor (CBF/NF-Y) and archaeal histone', hit.description)
|
||||
self.assertEqual('PFAM', hit.target)
|
||||
self.assertEqual('31.0', hit.target_version)
|
||||
self.assertEqual(2, len(hit))
|
||||
|
||||
hsp = hit.hsps[0]
|
||||
self.assertEqual(38.1576, hsp.bitscore)
|
||||
self.assertEqual(19, hsp.bitscore_raw)
|
||||
self.assertEqual(1.0598, hsp.evalue)
|
||||
self.assertEqual(67, hsp.query_start)
|
||||
self.assertEqual(86, hsp.query_end)
|
||||
self.assertEqual(6011, hsp.hit_start)
|
||||
self.assertEqual(6030, hsp.hit_end)
|
||||
self.assertEqual(1, hsp.query_frame)
|
||||
self.assertEqual(1, hsp.hit_frame)
|
||||
self.assertEqual(19, hsp.ident_num)
|
||||
self.assertEqual(19, hsp.pos_num)
|
||||
self.assertEqual(0, hsp.gap_num)
|
||||
self.assertEqual(19, hsp.aln_span)
|
||||
self.assertEqual('CAGGCCAGCGACTTCTGGG', str(hsp.query.seq))
|
||||
self.assertEqual('CAGGCCAGCGACTTCTGGG', str(hsp.hit.seq))
|
||||
self.assertEqual('|||||||||||||||||||', hsp.aln_annotation['similarity'])
|
||||
self.assertRaises(IndexError, hit.__getitem__, 1)
|
||||
self.assertEqual(76.7, hsp.bitscore)
|
||||
self.assertEqual(1.1e-21, hsp.evalue)
|
||||
self.assertEqual(13, hsp.query_start)
|
||||
self.assertEqual(79, hsp.query_end)
|
||||
self.assertEqual(0, hsp.hit_start)
|
||||
self.assertEqual(65, hsp.hit_end)
|
||||
self.assertEqual(66, hsp.aln_span)
|
||||
self.assertEqual('MDPMDIVGKSKEDASLPKATMTKIIKEMLPPDVRVARDAQDLLIECCVEFINLVSSESNDVCNKEDKRTIAPEHVLKALQVLGFGEYIEEVYAAYEQHKYETMDTQRSVKWNPGAQMTEEEAAAEQQRMFAEARARMNGGVSVPQPEHPETDQRSPQS', str(hsp.query.seq))
|
||||
|
||||
# parse last hit
|
||||
hit = qresult[-1]
|
||||
self.assertEqual('gi|15073988|emb|AL591786.1|SME591786', hit.id)
|
||||
self.assertEqual('Sinorhizobium meliloti 1021 complete chromosome; segment 5/12', hit.description)
|
||||
self.assertEqual(299350, hit.seq_len)
|
||||
self.assertEqual('SSF47113', hit.id)
|
||||
self.assertEqual(1, len(hit))
|
||||
self.assertRaises(IndexError, hit.__getitem__, 1)
|
||||
self.assertEqual('IPR:IPR009072', hit.dbxrefs[0])
|
||||
self.assertEqual('GO:0046982', hit.dbxrefs[1])
|
||||
|
||||
hsp = hit.hsps[0]
|
||||
self.assertEqual(36.1753, hsp.bitscore)
|
||||
self.assertEqual(18, hsp.bitscore_raw)
|
||||
self.assertEqual(4.18768, hsp.evalue)
|
||||
self.assertEqual(203, hsp.query_start)
|
||||
self.assertEqual(224, hsp.query_end)
|
||||
self.assertEqual(83627, hsp.hit_start)
|
||||
self.assertEqual(83648, hsp.hit_end)
|
||||
self.assertEqual(1, hsp.query_frame)
|
||||
self.assertEqual(-1, hsp.hit_frame)
|
||||
self.assertEqual(20, hsp.ident_num)
|
||||
self.assertEqual(20, hsp.pos_num)
|
||||
self.assertEqual(0, hsp.gap_num)
|
||||
self.assertEqual(21, hsp.aln_span)
|
||||
self.assertEqual('TGAAAGGAAATNAAAATGGAA', str(hsp.query.seq))
|
||||
self.assertEqual('TGAAAGGAAATCAAAATGGAA', str(hsp.hit.seq))
|
||||
self.assertEqual('||||||||||| |||||||||', hsp.aln_annotation['similarity'])
|
||||
self.assertRaises(IndexError, hit.__getitem__, 1)
|
||||
|
||||
# check if we've finished iteration over qresults
|
||||
self.assertRaises(StopIteration, next, qresults)
|
||||
self.assertEqual(1, counter)
|
||||
self.assertEqual(11, hsp.query_start)
|
||||
self.assertEqual(141, hsp.query_end)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Reference in New Issue
Block a user