Added Compass parsing code contributed by James Casbon.

This commit is contained in:
chapmanb
2004-05-05 14:45:29 +00:00
parent 836d8e5c54
commit 76ce5ca7b3
7 changed files with 472 additions and 1 deletions

260
Bio/Compass/__init__.py Normal file
View File

@ -0,0 +1,260 @@
# Copyright 2004 by James Casbon. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""
Code to deal with COMPASS output, a program for profile/profile comparison.
Compass is described in:
Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
7;326(1):317-36.
Tested with COMPASS 1.24.
Classes:
Record One result of a compass file
_Scanner Scan compass results
_Consumer Consume scanner events
RecordParser Parse one compass record
Iterator Iterate through a number of compass records
"""
from Bio import File
from Bio.ParserSupport import *
import re,string
class Record:
"""
Hold information from one compass hit.
Ali1 one is the query, Ali2 the hit.
"""
def __init__(self):
self.query=''
self.hit=''
self.gap_threshold=0
self.query_length=0
self.query_filtered_length=0
self.query_nseqs=0
self.query_neffseqs=0
self.hit_length=0
self.hit_filtered_length=0
self.hit_nseqs=0
self.hit_neffseqs=0
self.sw_score=0
self.evalue=-1
self.query_start=-1
self.hit_start=-1
self.query_aln=''
self.hit_aln=''
self.positives=''
def query_coverage(self):
"""Return the length of the query covered in alignment"""
s = string.replace(self.query_aln, "=", "")
return len(s)
def hit_coverage(self):
"""Return the length of the hit covered in the alignment"""
s = string.replace(self.hit_aln, "=", "")
return len(s)
class _Scanner:
"""Reads compass output and generate events"""
def feed(self, handle, consumer):
"""Feed in COMPASS ouput"""
if isinstance(handle, File.UndoHandle):
pass
else:
handle = File.UndoHandle(handle)
assert isinstance(handle, File.UndoHandle), \
"handle must be an UndoHandle"
if handle.peekline():
self._scan_record(handle, consumer)
def _scan_record(self,handle,consumer):
self._scan_names(handle, consumer)
self._scan_threshold(handle, consumer)
self._scan_lengths(handle,consumer)
self._scan_profilewidth(handle, consumer)
self._scan_scores(handle,consumer)
self._scan_alignment(handle,consumer)
def _scan_names(self,handle,consumer):
"""
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
"""
read_and_call(handle, consumer.names, contains="Ali1:")
def _scan_threshold(self,handle, consumer):
"""
Threshold of effective gap content in columns: 0.5
"""
read_and_call(handle, consumer.threshold, start="Threshold")
def _scan_lengths(self,handle, consumer):
"""
length1=388 filtered_length1=386 length2=145 filtered_length2=137
"""
read_and_call(handle, consumer.lengths, start="length1=")
def _scan_profilewidth(self,handle,consumer):
"""
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
"""
read_and_call(handle, consumer.profilewidth, contains="Nseqs1")
def _scan_scores(self,handle, consumer):
"""
Smith-Waterman score = 37 Evalue = 5.75e+02
"""
read_and_call(handle, consumer.scores, start="Smith-Waterman")
def _scan_alignment(self,handle, consumer):
"""
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
"""
while 1:
line = handle.readline()
if not line:
break
if is_blank_line(line):
continue
else:
consumer.query_alignment(line)
read_and_call(handle, consumer.positive_alignment)
read_and_call(handle, consumer.hit_alignment)
class _Consumer:
# all regular expressions used -- compile only once
_re_names = re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+")
_re_threshold = \
re.compile("Threshold of effective gap content in columns: (\S+)")
_re_lengths = \
re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)"
+ "\s+filtered_length2=(\S+)")
_re_profilewidth = \
re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)")
_re_scores = re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)")
_re_start = re.compile("(\d+)")
_re_align = re.compile("^.{15}(\S+)")
_re_positive_alignment = re.compile("^.{15}(.+)")
def __init__(self):
self.data = None
def names(self, line):
"""
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
------query----- -------hit-------------
"""
self.data = Record()
m = self.__class__._re_names.search(line)
self.data.query = m.group(1)
self.data.hit = m.group(2)
def threshold(self,line):
m = self.__class__._re_threshold.search(line)
self.data.gap_threshold = float(m.group(1))
def lengths(self,line):
m = self.__class__._re_lengths.search(line)
self.data.query_length = int(m.group(1))
self.data.query_filtered_length = float(m.group(2))
self.data.hit_length = int(m.group(3))
self.data.hit_filtered_length = float(m.group(4))
def profilewidth(self,line):
m = self.__class__._re_profilewidth.search(line)
self.data.query_nseqs = int(m.group(1))
self.data.query_neffseqs = float(m.group(2))
self.data.hit_nseqs = int(m.group(3))
self.data.hit_neffseqs = float(m.group(4))
def scores(self, line):
m = self.__class__._re_scores.search(line)
if m:
self.data.sw_score = int(m.group(1))
self.data.evalue = float(m.group(2))
else:
self.data.sw_score = 0
self.data.evalue = -1.0
def query_alignment(self, line):
m = self.__class__._re_start.search(line)
if m:
self.data.query_start = int(m.group(1))
m = self.__class__._re_align.match(line)
assert m!=None, "invalid match"
self.data.query_aln = self.data.query_aln + m.group(1)
def positive_alignment(self,line):
m = self.__class__._re_positive_alignment.match(line)
assert m!=None, "invalid match"
self.data.positives = self.data.positives + m.group(1)
def hit_alignment(self,line):
m = self.__class__._re_start.search(line)
if m:
self.data.hit_start = int(m.group(1))
m = self.__class__._re_align.match(line)
assert m!=None, "invalid match"
self.data.hit_aln = self.data.hit_aln + m.group(1)
class RecordParser(AbstractParser):
"""Parses compass results into a Record object.
"""
def __init__(self):
self._scanner = _Scanner()
self._consumer = _Consumer()
def parse(self, handle):
if isinstance(handle, File.UndoHandle):
uhandle = handle
else:
uhandle = File.UndoHandle(handle)
self._scanner.feed(uhandle, self._consumer)
return self._consumer.data
class Iterator:
"""Iterate through a file of compass results"""
def __init__(self, handle):
self._uhandle = File.UndoHandle(handle)
self._parser = RecordParser()
def next(self):
lines = []
while 1:
line = self._uhandle.readline()
if not line:
break
if line[0:4] == "Ali1" and lines:
self._uhandle.saveline(line)
break
lines.append(line)
if not lines:
return None
data = string.join(lines, '')
return self._parser.parse(File.StringHandle(data))

View File

@ -5,7 +5,6 @@ This is a list of people who have made contributions to Biopython.
This is certainly not comprehensive, and if you've been overlooked
(sorry!), please write Jeff at jchang@smi.stanford.edu.
Cecilia Alsmark <Cecilia.Alsmark@ebc.uu.se>
Sebastian Bassi <sbassi@asalup.org>
Yves Bastide <ybastide@irisa.fr>

40
Tests/Compass/comtest1 Normal file
View File

@ -0,0 +1,40 @@
....Ali1: 60456.blo.gz.aln Ali2: 60456.blo.gz.aln
Threshold of effective gap content in columns: 0.5
length1=388 filtered_length1=386 length2=388 filtered_length2=386
Nseqs1=399 Neff1=12.972 Nseqs2=399 Neff2=12.972
Smith-Waterman score = 2759 Evalue = 0.00e+00
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
QUERY ISYATAYEKLEEAMDRMERVLKERKL
++++++++++++++++++++++++++
QUERY ISYATAYEKLEEAMDRMERVLKERKL

41
Tests/Compass/comtest2 Normal file
View File

@ -0,0 +1,41 @@
Ali1: 60456.blo.gz.aln Ali2: allscop//14982.blo.gz.aln
Threshold of effective gap content in columns: 0.5
length1=388 filtered_length1=386 length2=116 filtered_length2=115
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.313
Smith-Waterman score = 35 Evalue = 1.01e+03
QUERY 178 KKDLEEIAD
++ ++++++
QUERY 9 QAAVQAVTA
Ali1: 60456.blo.gz.aln Ali2: allscop//14983.blo.gz.aln
Threshold of effective gap content in columns: 0.5
length1=388 filtered_length1=386 length2=121 filtered_length2=119
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.168
Smith-Waterman score = 35 Evalue = 1.01e+03
QUERY 178 KKDLEEIAD
++ ++++++
QUERY 9 REAVEAAVD
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
Threshold of effective gap content in columns: 0.5
length1=388 filtered_length1=386 length2=145 filtered_length2=137
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=5.869
Smith-Waterman score = 37 Evalue = 5.75e+02
QUERY 371 LEEAMDRMER~~~V
+ ++++ + + +
QUERY 76 LQNFIDQLDNpddL
Ali1: 60456.blo.gz.aln Ali2: allscop//15010.blo.gz.aln
Threshold of effective gap content in columns: 0.5
length1=388 filtered_length1=386 length2=141 filtered_length2=141
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
Smith-Waterman score = 37 Evalue = 5.75e+02
QUERY 163 LIINSP
++++++
QUERY 32 LFDAHD

12
Tests/output/test_Compass Normal file
View File

@ -0,0 +1,12 @@
test_Compass
testAlignmentParsingOne (test_Compass.CompassTest) ... ok
testAlignmentParsingTwo (test_Compass.CompassTest) ... ok
testCompassIteratorEasy (test_Compass.CompassTest) ... ok
testCompassIteratorHard (test_Compass.CompassTest) ... ok
testCompassParser (test_Compass.CompassTest) ... ok
testCompassScanAndConsume (test_Compass.CompassTest) ... ok
----------------------------------------------------------------------
Ran 6 tests in 0.007s
OK

118
Tests/test_Compass.py Normal file
View File

@ -0,0 +1,118 @@
"""Tests for parsing Compass output.
"""
import os
import sys
import unittest
from Bio import Compass
def run_tests(argv):
test_suite = testing_suite()
runner = unittest.TextTestRunner(sys.stdout, verbosity = 2)
runner.run(test_suite)
def testing_suite():
"""Generate the suite of tests.
"""
test_suite = unittest.TestSuite()
test_loader = unittest.TestLoader()
test_loader.testMethodPrefix = 'test'
tests = [CompassTest]
for test in tests:
cur_suite = test_loader.loadTestsFromTestCase(test)
test_suite.addTest(cur_suite)
return test_suite
class CompassTest(unittest.TestCase):
def setUp(self):
file_dir = os.path.join("Compass")
self.test_files = [
os.path.join(file_dir, "comtest1"),
os.path.join(file_dir, "comtest2")]
def testCompassScanAndConsume(self):
cons = Compass._Consumer()
scan = Compass._Scanner()
scan.feed(open(self.test_files[0]), cons)
com_record = cons.data
self.assertEquals("60456.blo.gz.aln", com_record.query)
self.assertEquals("60456.blo.gz.aln", com_record.hit)
self.assertEquals(0.5, com_record.gap_threshold)
self.assertEquals(388, com_record.query_length)
self.assertEquals(386, com_record.query_filtered_length)
self.assertEquals(388, com_record.hit_length)
self.assertEquals(386, com_record.hit_filtered_length)
self.assertEquals(399, com_record.query_nseqs)
self.assertEquals(12.972, com_record.query_neffseqs)
self.assertEquals(399, com_record.hit_nseqs)
self.assertEquals(12.972, com_record.hit_neffseqs)
self.assertEquals(2759, com_record.sw_score)
self.assertEquals(float("0.00e+00"), com_record.evalue)
def testCompassParser(self):
parser = Compass.RecordParser()
com_record = parser.parse(open(self.test_files[0]))
self.assertEquals("60456.blo.gz.aln", com_record.query)
def testCompassIteratorEasy(self):
it = Compass.Iterator(open(self.test_files[0]))
com_record = it.next()
self.assertEquals("60456.blo.gz.aln", com_record.query)
com_record = it.next()
self.assertEquals(None, com_record)
pass
def testCompassIteratorHard(self):
it = Compass.Iterator(open(self.test_files[1]))
com_record = it.next()
self.assertEquals("allscop//14982.blo.gz.aln", com_record.hit)
self.assertEquals(float('1.01e+03'), com_record.evalue)
com_record = it.next()
self.assertEquals("allscop//14983.blo.gz.aln", com_record.hit)
self.assertEquals(float('1.01e+03'), com_record.evalue)
com_record = it.next()
self.assertEquals("allscop//14984.blo.gz.aln", com_record.hit)
self.assertEquals(float('5.75e+02'), com_record.evalue)
def testAlignmentParsingOne(self):
it = Compass.Iterator(open(self.test_files[1]))
com_record = it.next()
self.assertEquals(178, com_record.query_start)
self.assertEquals("KKDLEEIAD", com_record.query_aln)
self.assertEquals(9, com_record.hit_start)
self.assertEquals("QAAVQAVTA", com_record.hit_aln)
self.assertEquals("++ ++++++", com_record.positives)
com_record = it.next()
com_record = it.next()
self.assertEquals(371, com_record.query_start)
self.assertEquals("LEEAMDRMER~~~V", com_record.query_aln)
self.assertEquals(76, com_record.hit_start)
self.assertEquals("LQNFIDQLDNpddL", com_record.hit_aln)
self.assertEquals("+ ++++ + + +", com_record.positives)
def testAlignmentParsingTwo(self):
it = Compass.Iterator(open(self.test_files[0]))
com_record = it.next()
self.assertEquals(2, com_record.query_start)
self.assertEquals(2, com_record.hit_start)
self.assertEquals("LKERKL", com_record.hit_aln[-6:])
if __name__ == "__main__":
sys.exit(run_tests(sys.argv))

View File

@ -293,6 +293,7 @@ PACKAGES = [
'Bio.builders.Search',
'Bio.builders.SeqRecord',
'Bio.CDD',
'Bio.Compass',
'Bio.Clustalw',
'Bio.config',
'Bio.Crystal',