diff --git a/Bio/Compass/__init__.py b/Bio/Compass/__init__.py new file mode 100644 index 000000000..16b434914 --- /dev/null +++ b/Bio/Compass/__init__.py @@ -0,0 +1,260 @@ +# Copyright 2004 by James Casbon. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +""" +Code to deal with COMPASS output, a program for profile/profile comparison. + +Compass is described in: + +Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein +alignments with assessment of statistical significance. J Mol Biol. 2003 Feb +7;326(1):317-36. + +Tested with COMPASS 1.24. + +Classes: +Record One result of a compass file +_Scanner Scan compass results +_Consumer Consume scanner events +RecordParser Parse one compass record +Iterator Iterate through a number of compass records +""" +from Bio import File +from Bio.ParserSupport import * +import re,string + +class Record: + """ + Hold information from one compass hit. + Ali1 one is the query, Ali2 the hit. + """ + + def __init__(self): + self.query='' + self.hit='' + self.gap_threshold=0 + self.query_length=0 + self.query_filtered_length=0 + self.query_nseqs=0 + self.query_neffseqs=0 + self.hit_length=0 + self.hit_filtered_length=0 + self.hit_nseqs=0 + self.hit_neffseqs=0 + self.sw_score=0 + self.evalue=-1 + self.query_start=-1 + self.hit_start=-1 + self.query_aln='' + self.hit_aln='' + self.positives='' + + + def query_coverage(self): + """Return the length of the query covered in alignment""" + s = string.replace(self.query_aln, "=", "") + return len(s) + + def hit_coverage(self): + """Return the length of the hit covered in the alignment""" + s = string.replace(self.hit_aln, "=", "") + return len(s) + +class _Scanner: + """Reads compass output and generate events""" + + def feed(self, handle, consumer): + """Feed in COMPASS ouput""" + + if isinstance(handle, File.UndoHandle): + pass + else: + handle = File.UndoHandle(handle) + + + assert isinstance(handle, File.UndoHandle), \ + "handle must be an UndoHandle" + if handle.peekline(): + self._scan_record(handle, consumer) + + + def _scan_record(self,handle,consumer): + self._scan_names(handle, consumer) + self._scan_threshold(handle, consumer) + self._scan_lengths(handle,consumer) + self._scan_profilewidth(handle, consumer) + self._scan_scores(handle,consumer) + self._scan_alignment(handle,consumer) + + def _scan_names(self,handle,consumer): + """ + Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln + """ + read_and_call(handle, consumer.names, contains="Ali1:") + + def _scan_threshold(self,handle, consumer): + """ + Threshold of effective gap content in columns: 0.5 + """ + read_and_call(handle, consumer.threshold, start="Threshold") + + def _scan_lengths(self,handle, consumer): + """ + length1=388 filtered_length1=386 length2=145 filtered_length2=137 + """ + read_and_call(handle, consumer.lengths, start="length1=") + + def _scan_profilewidth(self,handle,consumer): + """ + Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099 + """ + read_and_call(handle, consumer.profilewidth, contains="Nseqs1") + + def _scan_scores(self,handle, consumer): + """ + Smith-Waterman score = 37 Evalue = 5.75e+02 + """ + read_and_call(handle, consumer.scores, start="Smith-Waterman") + + def _scan_alignment(self,handle, consumer): + """ + QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN + + + QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV + + """ + while 1: + line = handle.readline() + if not line: + break + if is_blank_line(line): + continue + else: + consumer.query_alignment(line) + read_and_call(handle, consumer.positive_alignment) + read_and_call(handle, consumer.hit_alignment) + +class _Consumer: + # all regular expressions used -- compile only once + _re_names = re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+") + _re_threshold = \ + re.compile("Threshold of effective gap content in columns: (\S+)") + _re_lengths = \ + re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)" + + "\s+filtered_length2=(\S+)") + _re_profilewidth = \ + re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)") + _re_scores = re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)") + _re_start = re.compile("(\d+)") + _re_align = re.compile("^.{15}(\S+)") + _re_positive_alignment = re.compile("^.{15}(.+)") + + def __init__(self): + self.data = None + + def names(self, line): + """ + Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln + ------query----- -------hit------------- + """ + self.data = Record() + m = self.__class__._re_names.search(line) + self.data.query = m.group(1) + self.data.hit = m.group(2) + + def threshold(self,line): + m = self.__class__._re_threshold.search(line) + self.data.gap_threshold = float(m.group(1)) + + def lengths(self,line): + m = self.__class__._re_lengths.search(line) + self.data.query_length = int(m.group(1)) + self.data.query_filtered_length = float(m.group(2)) + self.data.hit_length = int(m.group(3)) + self.data.hit_filtered_length = float(m.group(4)) + + def profilewidth(self,line): + m = self.__class__._re_profilewidth.search(line) + self.data.query_nseqs = int(m.group(1)) + self.data.query_neffseqs = float(m.group(2)) + self.data.hit_nseqs = int(m.group(3)) + self.data.hit_neffseqs = float(m.group(4)) + + def scores(self, line): + m = self.__class__._re_scores.search(line) + if m: + self.data.sw_score = int(m.group(1)) + self.data.evalue = float(m.group(2)) + else: + self.data.sw_score = 0 + self.data.evalue = -1.0 + + def query_alignment(self, line): + m = self.__class__._re_start.search(line) + if m: + self.data.query_start = int(m.group(1)) + m = self.__class__._re_align.match(line) + assert m!=None, "invalid match" + self.data.query_aln = self.data.query_aln + m.group(1) + + def positive_alignment(self,line): + m = self.__class__._re_positive_alignment.match(line) + assert m!=None, "invalid match" + self.data.positives = self.data.positives + m.group(1) + + def hit_alignment(self,line): + m = self.__class__._re_start.search(line) + if m: + self.data.hit_start = int(m.group(1)) + m = self.__class__._re_align.match(line) + assert m!=None, "invalid match" + self.data.hit_aln = self.data.hit_aln + m.group(1) + +class RecordParser(AbstractParser): + """Parses compass results into a Record object. + """ + def __init__(self): + self._scanner = _Scanner() + self._consumer = _Consumer() + + + def parse(self, handle): + if isinstance(handle, File.UndoHandle): + uhandle = handle + else: + uhandle = File.UndoHandle(handle) + self._scanner.feed(uhandle, self._consumer) + return self._consumer.data + +class Iterator: + """Iterate through a file of compass results""" + def __init__(self, handle): + self._uhandle = File.UndoHandle(handle) + self._parser = RecordParser() + + def next(self): + lines = [] + while 1: + line = self._uhandle.readline() + if not line: + break + if line[0:4] == "Ali1" and lines: + self._uhandle.saveline(line) + break + + lines.append(line) + + + if not lines: + return None + + data = string.join(lines, '') + return self._parser.parse(File.StringHandle(data)) + diff --git a/CONTRIB b/CONTRIB index 43a05fa1c..32d206ccf 100644 --- a/CONTRIB +++ b/CONTRIB @@ -5,7 +5,6 @@ This is a list of people who have made contributions to Biopython. This is certainly not comprehensive, and if you've been overlooked (sorry!), please write Jeff at jchang@smi.stanford.edu. - Cecilia Alsmark Sebastian Bassi Yves Bastide diff --git a/Tests/Compass/comtest1 b/Tests/Compass/comtest1 new file mode 100644 index 000000000..551fcb976 --- /dev/null +++ b/Tests/Compass/comtest1 @@ -0,0 +1,40 @@ +....Ali1: 60456.blo.gz.aln Ali2: 60456.blo.gz.aln +Threshold of effective gap content in columns: 0.5 +length1=388 filtered_length1=386 length2=388 filtered_length2=386 +Nseqs1=399 Neff1=12.972 Nseqs2=399 Neff2=12.972 +Smith-Waterman score = 2759 Evalue = 0.00e+00 + +QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN + + +QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV + + +QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL + + +QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL + + +QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW + + +QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR + + +QUERY ISYATAYEKLEEAMDRMERVLKERKL + ++++++++++++++++++++++++++ +QUERY ISYATAYEKLEEAMDRMERVLKERKL + diff --git a/Tests/Compass/comtest2 b/Tests/Compass/comtest2 new file mode 100644 index 000000000..166534a09 --- /dev/null +++ b/Tests/Compass/comtest2 @@ -0,0 +1,41 @@ +Ali1: 60456.blo.gz.aln Ali2: allscop//14982.blo.gz.aln +Threshold of effective gap content in columns: 0.5 +length1=388 filtered_length1=386 length2=116 filtered_length2=115 +Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.313 +Smith-Waterman score = 35 Evalue = 1.01e+03 + +QUERY 178 KKDLEEIAD + ++ ++++++ +QUERY 9 QAAVQAVTA + +Ali1: 60456.blo.gz.aln Ali2: allscop//14983.blo.gz.aln +Threshold of effective gap content in columns: 0.5 +length1=388 filtered_length1=386 length2=121 filtered_length2=119 +Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.168 +Smith-Waterman score = 35 Evalue = 1.01e+03 + +QUERY 178 KKDLEEIAD + ++ ++++++ +QUERY 9 REAVEAAVD + +Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln +Threshold of effective gap content in columns: 0.5 +length1=388 filtered_length1=386 length2=145 filtered_length2=137 +Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=5.869 +Smith-Waterman score = 37 Evalue = 5.75e+02 + +QUERY 371 LEEAMDRMER~~~V + + ++++ + + + +QUERY 76 LQNFIDQLDNpddL + +Ali1: 60456.blo.gz.aln Ali2: allscop//15010.blo.gz.aln +Threshold of effective gap content in columns: 0.5 +length1=388 filtered_length1=386 length2=141 filtered_length2=141 +Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099 +Smith-Waterman score = 37 Evalue = 5.75e+02 + +QUERY 163 LIINSP + ++++++ +QUERY 32 LFDAHD + + diff --git a/Tests/output/test_Compass b/Tests/output/test_Compass new file mode 100644 index 000000000..ed35c8658 --- /dev/null +++ b/Tests/output/test_Compass @@ -0,0 +1,12 @@ +test_Compass +testAlignmentParsingOne (test_Compass.CompassTest) ... ok +testAlignmentParsingTwo (test_Compass.CompassTest) ... ok +testCompassIteratorEasy (test_Compass.CompassTest) ... ok +testCompassIteratorHard (test_Compass.CompassTest) ... ok +testCompassParser (test_Compass.CompassTest) ... ok +testCompassScanAndConsume (test_Compass.CompassTest) ... ok + +---------------------------------------------------------------------- +Ran 6 tests in 0.007s + +OK diff --git a/Tests/test_Compass.py b/Tests/test_Compass.py new file mode 100644 index 000000000..82a235a94 --- /dev/null +++ b/Tests/test_Compass.py @@ -0,0 +1,118 @@ +"""Tests for parsing Compass output. +""" +import os +import sys +import unittest + +from Bio import Compass + +def run_tests(argv): + test_suite = testing_suite() + runner = unittest.TextTestRunner(sys.stdout, verbosity = 2) + runner.run(test_suite) + +def testing_suite(): + """Generate the suite of tests. + """ + test_suite = unittest.TestSuite() + + test_loader = unittest.TestLoader() + test_loader.testMethodPrefix = 'test' + tests = [CompassTest] + + for test in tests: + cur_suite = test_loader.loadTestsFromTestCase(test) + test_suite.addTest(cur_suite) + + return test_suite + +class CompassTest(unittest.TestCase): + def setUp(self): + file_dir = os.path.join("Compass") + self.test_files = [ + os.path.join(file_dir, "comtest1"), + os.path.join(file_dir, "comtest2")] + + def testCompassScanAndConsume(self): + cons = Compass._Consumer() + scan = Compass._Scanner() + scan.feed(open(self.test_files[0]), cons) + + com_record = cons.data + + self.assertEquals("60456.blo.gz.aln", com_record.query) + self.assertEquals("60456.blo.gz.aln", com_record.hit) + self.assertEquals(0.5, com_record.gap_threshold) + + self.assertEquals(388, com_record.query_length) + self.assertEquals(386, com_record.query_filtered_length) + self.assertEquals(388, com_record.hit_length) + self.assertEquals(386, com_record.hit_filtered_length) + + self.assertEquals(399, com_record.query_nseqs) + self.assertEquals(12.972, com_record.query_neffseqs) + self.assertEquals(399, com_record.hit_nseqs) + self.assertEquals(12.972, com_record.hit_neffseqs) + + self.assertEquals(2759, com_record.sw_score) + self.assertEquals(float("0.00e+00"), com_record.evalue) + + def testCompassParser(self): + parser = Compass.RecordParser() + com_record = parser.parse(open(self.test_files[0])) + + self.assertEquals("60456.blo.gz.aln", com_record.query) + + def testCompassIteratorEasy(self): + it = Compass.Iterator(open(self.test_files[0])) + + com_record = it.next() + self.assertEquals("60456.blo.gz.aln", com_record.query) + + com_record = it.next() + self.assertEquals(None, com_record) + pass + + def testCompassIteratorHard(self): + it = Compass.Iterator(open(self.test_files[1])) + + com_record = it.next() + self.assertEquals("allscop//14982.blo.gz.aln", com_record.hit) + self.assertEquals(float('1.01e+03'), com_record.evalue) + + com_record = it.next() + self.assertEquals("allscop//14983.blo.gz.aln", com_record.hit) + self.assertEquals(float('1.01e+03'), com_record.evalue) + + com_record = it.next() + self.assertEquals("allscop//14984.blo.gz.aln", com_record.hit) + self.assertEquals(float('5.75e+02'), com_record.evalue) + + def testAlignmentParsingOne(self): + it = Compass.Iterator(open(self.test_files[1])) + + com_record = it.next() + self.assertEquals(178, com_record.query_start) + self.assertEquals("KKDLEEIAD", com_record.query_aln) + self.assertEquals(9, com_record.hit_start) + self.assertEquals("QAAVQAVTA", com_record.hit_aln) + self.assertEquals("++ ++++++", com_record.positives) + + com_record = it.next() + com_record = it.next() + self.assertEquals(371, com_record.query_start) + self.assertEquals("LEEAMDRMER~~~V", com_record.query_aln) + self.assertEquals(76, com_record.hit_start) + self.assertEquals("LQNFIDQLDNpddL", com_record.hit_aln) + self.assertEquals("+ ++++ + + +", com_record.positives) + + def testAlignmentParsingTwo(self): + it = Compass.Iterator(open(self.test_files[0])) + + com_record = it.next() + self.assertEquals(2, com_record.query_start) + self.assertEquals(2, com_record.hit_start) + self.assertEquals("LKERKL", com_record.hit_aln[-6:]) + +if __name__ == "__main__": + sys.exit(run_tests(sys.argv)) diff --git a/setup.py b/setup.py index 9342c116d..659c43478 100644 --- a/setup.py +++ b/setup.py @@ -293,6 +293,7 @@ PACKAGES = [ 'Bio.builders.Search', 'Bio.builders.SeqRecord', 'Bio.CDD', + 'Bio.Compass', 'Bio.Clustalw', 'Bio.config', 'Bio.Crystal',