mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Added Compass parsing code contributed by James Casbon.
This commit is contained in:
260
Bio/Compass/__init__.py
Normal file
260
Bio/Compass/__init__.py
Normal file
@ -0,0 +1,260 @@
|
||||
# Copyright 2004 by James Casbon. All rights reserved.
|
||||
# This code is part of the Biopython distribution and governed by its
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
|
||||
"""
|
||||
Code to deal with COMPASS output, a program for profile/profile comparison.
|
||||
|
||||
Compass is described in:
|
||||
|
||||
Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
|
||||
alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
|
||||
7;326(1):317-36.
|
||||
|
||||
Tested with COMPASS 1.24.
|
||||
|
||||
Classes:
|
||||
Record One result of a compass file
|
||||
_Scanner Scan compass results
|
||||
_Consumer Consume scanner events
|
||||
RecordParser Parse one compass record
|
||||
Iterator Iterate through a number of compass records
|
||||
"""
|
||||
from Bio import File
|
||||
from Bio.ParserSupport import *
|
||||
import re,string
|
||||
|
||||
class Record:
|
||||
"""
|
||||
Hold information from one compass hit.
|
||||
Ali1 one is the query, Ali2 the hit.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.query=''
|
||||
self.hit=''
|
||||
self.gap_threshold=0
|
||||
self.query_length=0
|
||||
self.query_filtered_length=0
|
||||
self.query_nseqs=0
|
||||
self.query_neffseqs=0
|
||||
self.hit_length=0
|
||||
self.hit_filtered_length=0
|
||||
self.hit_nseqs=0
|
||||
self.hit_neffseqs=0
|
||||
self.sw_score=0
|
||||
self.evalue=-1
|
||||
self.query_start=-1
|
||||
self.hit_start=-1
|
||||
self.query_aln=''
|
||||
self.hit_aln=''
|
||||
self.positives=''
|
||||
|
||||
|
||||
def query_coverage(self):
|
||||
"""Return the length of the query covered in alignment"""
|
||||
s = string.replace(self.query_aln, "=", "")
|
||||
return len(s)
|
||||
|
||||
def hit_coverage(self):
|
||||
"""Return the length of the hit covered in the alignment"""
|
||||
s = string.replace(self.hit_aln, "=", "")
|
||||
return len(s)
|
||||
|
||||
class _Scanner:
|
||||
"""Reads compass output and generate events"""
|
||||
|
||||
def feed(self, handle, consumer):
|
||||
"""Feed in COMPASS ouput"""
|
||||
|
||||
if isinstance(handle, File.UndoHandle):
|
||||
pass
|
||||
else:
|
||||
handle = File.UndoHandle(handle)
|
||||
|
||||
|
||||
assert isinstance(handle, File.UndoHandle), \
|
||||
"handle must be an UndoHandle"
|
||||
if handle.peekline():
|
||||
self._scan_record(handle, consumer)
|
||||
|
||||
|
||||
def _scan_record(self,handle,consumer):
|
||||
self._scan_names(handle, consumer)
|
||||
self._scan_threshold(handle, consumer)
|
||||
self._scan_lengths(handle,consumer)
|
||||
self._scan_profilewidth(handle, consumer)
|
||||
self._scan_scores(handle,consumer)
|
||||
self._scan_alignment(handle,consumer)
|
||||
|
||||
def _scan_names(self,handle,consumer):
|
||||
"""
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
|
||||
"""
|
||||
read_and_call(handle, consumer.names, contains="Ali1:")
|
||||
|
||||
def _scan_threshold(self,handle, consumer):
|
||||
"""
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
"""
|
||||
read_and_call(handle, consumer.threshold, start="Threshold")
|
||||
|
||||
def _scan_lengths(self,handle, consumer):
|
||||
"""
|
||||
length1=388 filtered_length1=386 length2=145 filtered_length2=137
|
||||
"""
|
||||
read_and_call(handle, consumer.lengths, start="length1=")
|
||||
|
||||
def _scan_profilewidth(self,handle,consumer):
|
||||
"""
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
|
||||
"""
|
||||
read_and_call(handle, consumer.profilewidth, contains="Nseqs1")
|
||||
|
||||
def _scan_scores(self,handle, consumer):
|
||||
"""
|
||||
Smith-Waterman score = 37 Evalue = 5.75e+02
|
||||
"""
|
||||
read_and_call(handle, consumer.scores, start="Smith-Waterman")
|
||||
|
||||
def _scan_alignment(self,handle, consumer):
|
||||
"""
|
||||
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
|
||||
|
||||
|
||||
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
|
||||
|
||||
"""
|
||||
while 1:
|
||||
line = handle.readline()
|
||||
if not line:
|
||||
break
|
||||
if is_blank_line(line):
|
||||
continue
|
||||
else:
|
||||
consumer.query_alignment(line)
|
||||
read_and_call(handle, consumer.positive_alignment)
|
||||
read_and_call(handle, consumer.hit_alignment)
|
||||
|
||||
class _Consumer:
|
||||
# all regular expressions used -- compile only once
|
||||
_re_names = re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+")
|
||||
_re_threshold = \
|
||||
re.compile("Threshold of effective gap content in columns: (\S+)")
|
||||
_re_lengths = \
|
||||
re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)"
|
||||
+ "\s+filtered_length2=(\S+)")
|
||||
_re_profilewidth = \
|
||||
re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)")
|
||||
_re_scores = re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)")
|
||||
_re_start = re.compile("(\d+)")
|
||||
_re_align = re.compile("^.{15}(\S+)")
|
||||
_re_positive_alignment = re.compile("^.{15}(.+)")
|
||||
|
||||
def __init__(self):
|
||||
self.data = None
|
||||
|
||||
def names(self, line):
|
||||
"""
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
|
||||
------query----- -------hit-------------
|
||||
"""
|
||||
self.data = Record()
|
||||
m = self.__class__._re_names.search(line)
|
||||
self.data.query = m.group(1)
|
||||
self.data.hit = m.group(2)
|
||||
|
||||
def threshold(self,line):
|
||||
m = self.__class__._re_threshold.search(line)
|
||||
self.data.gap_threshold = float(m.group(1))
|
||||
|
||||
def lengths(self,line):
|
||||
m = self.__class__._re_lengths.search(line)
|
||||
self.data.query_length = int(m.group(1))
|
||||
self.data.query_filtered_length = float(m.group(2))
|
||||
self.data.hit_length = int(m.group(3))
|
||||
self.data.hit_filtered_length = float(m.group(4))
|
||||
|
||||
def profilewidth(self,line):
|
||||
m = self.__class__._re_profilewidth.search(line)
|
||||
self.data.query_nseqs = int(m.group(1))
|
||||
self.data.query_neffseqs = float(m.group(2))
|
||||
self.data.hit_nseqs = int(m.group(3))
|
||||
self.data.hit_neffseqs = float(m.group(4))
|
||||
|
||||
def scores(self, line):
|
||||
m = self.__class__._re_scores.search(line)
|
||||
if m:
|
||||
self.data.sw_score = int(m.group(1))
|
||||
self.data.evalue = float(m.group(2))
|
||||
else:
|
||||
self.data.sw_score = 0
|
||||
self.data.evalue = -1.0
|
||||
|
||||
def query_alignment(self, line):
|
||||
m = self.__class__._re_start.search(line)
|
||||
if m:
|
||||
self.data.query_start = int(m.group(1))
|
||||
m = self.__class__._re_align.match(line)
|
||||
assert m!=None, "invalid match"
|
||||
self.data.query_aln = self.data.query_aln + m.group(1)
|
||||
|
||||
def positive_alignment(self,line):
|
||||
m = self.__class__._re_positive_alignment.match(line)
|
||||
assert m!=None, "invalid match"
|
||||
self.data.positives = self.data.positives + m.group(1)
|
||||
|
||||
def hit_alignment(self,line):
|
||||
m = self.__class__._re_start.search(line)
|
||||
if m:
|
||||
self.data.hit_start = int(m.group(1))
|
||||
m = self.__class__._re_align.match(line)
|
||||
assert m!=None, "invalid match"
|
||||
self.data.hit_aln = self.data.hit_aln + m.group(1)
|
||||
|
||||
class RecordParser(AbstractParser):
|
||||
"""Parses compass results into a Record object.
|
||||
"""
|
||||
def __init__(self):
|
||||
self._scanner = _Scanner()
|
||||
self._consumer = _Consumer()
|
||||
|
||||
|
||||
def parse(self, handle):
|
||||
if isinstance(handle, File.UndoHandle):
|
||||
uhandle = handle
|
||||
else:
|
||||
uhandle = File.UndoHandle(handle)
|
||||
self._scanner.feed(uhandle, self._consumer)
|
||||
return self._consumer.data
|
||||
|
||||
class Iterator:
|
||||
"""Iterate through a file of compass results"""
|
||||
def __init__(self, handle):
|
||||
self._uhandle = File.UndoHandle(handle)
|
||||
self._parser = RecordParser()
|
||||
|
||||
def next(self):
|
||||
lines = []
|
||||
while 1:
|
||||
line = self._uhandle.readline()
|
||||
if not line:
|
||||
break
|
||||
if line[0:4] == "Ali1" and lines:
|
||||
self._uhandle.saveline(line)
|
||||
break
|
||||
|
||||
lines.append(line)
|
||||
|
||||
|
||||
if not lines:
|
||||
return None
|
||||
|
||||
data = string.join(lines, '')
|
||||
return self._parser.parse(File.StringHandle(data))
|
||||
|
1
CONTRIB
1
CONTRIB
@ -5,7 +5,6 @@ This is a list of people who have made contributions to Biopython.
|
||||
This is certainly not comprehensive, and if you've been overlooked
|
||||
(sorry!), please write Jeff at jchang@smi.stanford.edu.
|
||||
|
||||
|
||||
Cecilia Alsmark <Cecilia.Alsmark@ebc.uu.se>
|
||||
Sebastian Bassi <sbassi@asalup.org>
|
||||
Yves Bastide <ybastide@irisa.fr>
|
||||
|
40
Tests/Compass/comtest1
Normal file
40
Tests/Compass/comtest1
Normal file
@ -0,0 +1,40 @@
|
||||
....Ali1: 60456.blo.gz.aln Ali2: 60456.blo.gz.aln
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
length1=388 filtered_length1=386 length2=388 filtered_length2=386
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=399 Neff2=12.972
|
||||
Smith-Waterman score = 2759 Evalue = 0.00e+00
|
||||
|
||||
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
|
||||
|
||||
|
||||
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
|
||||
|
||||
|
||||
QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
|
||||
|
||||
|
||||
QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
|
||||
|
||||
|
||||
QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
|
||||
|
||||
|
||||
QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
|
||||
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
|
||||
|
||||
|
||||
QUERY ISYATAYEKLEEAMDRMERVLKERKL
|
||||
++++++++++++++++++++++++++
|
||||
QUERY ISYATAYEKLEEAMDRMERVLKERKL
|
||||
|
41
Tests/Compass/comtest2
Normal file
41
Tests/Compass/comtest2
Normal file
@ -0,0 +1,41 @@
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//14982.blo.gz.aln
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
length1=388 filtered_length1=386 length2=116 filtered_length2=115
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.313
|
||||
Smith-Waterman score = 35 Evalue = 1.01e+03
|
||||
|
||||
QUERY 178 KKDLEEIAD
|
||||
++ ++++++
|
||||
QUERY 9 QAAVQAVTA
|
||||
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//14983.blo.gz.aln
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
length1=388 filtered_length1=386 length2=121 filtered_length2=119
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.168
|
||||
Smith-Waterman score = 35 Evalue = 1.01e+03
|
||||
|
||||
QUERY 178 KKDLEEIAD
|
||||
++ ++++++
|
||||
QUERY 9 REAVEAAVD
|
||||
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
length1=388 filtered_length1=386 length2=145 filtered_length2=137
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=5.869
|
||||
Smith-Waterman score = 37 Evalue = 5.75e+02
|
||||
|
||||
QUERY 371 LEEAMDRMER~~~V
|
||||
+ ++++ + + +
|
||||
QUERY 76 LQNFIDQLDNpddL
|
||||
|
||||
Ali1: 60456.blo.gz.aln Ali2: allscop//15010.blo.gz.aln
|
||||
Threshold of effective gap content in columns: 0.5
|
||||
length1=388 filtered_length1=386 length2=141 filtered_length2=141
|
||||
Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
|
||||
Smith-Waterman score = 37 Evalue = 5.75e+02
|
||||
|
||||
QUERY 163 LIINSP
|
||||
++++++
|
||||
QUERY 32 LFDAHD
|
||||
|
||||
|
12
Tests/output/test_Compass
Normal file
12
Tests/output/test_Compass
Normal file
@ -0,0 +1,12 @@
|
||||
test_Compass
|
||||
testAlignmentParsingOne (test_Compass.CompassTest) ... ok
|
||||
testAlignmentParsingTwo (test_Compass.CompassTest) ... ok
|
||||
testCompassIteratorEasy (test_Compass.CompassTest) ... ok
|
||||
testCompassIteratorHard (test_Compass.CompassTest) ... ok
|
||||
testCompassParser (test_Compass.CompassTest) ... ok
|
||||
testCompassScanAndConsume (test_Compass.CompassTest) ... ok
|
||||
|
||||
----------------------------------------------------------------------
|
||||
Ran 6 tests in 0.007s
|
||||
|
||||
OK
|
118
Tests/test_Compass.py
Normal file
118
Tests/test_Compass.py
Normal file
@ -0,0 +1,118 @@
|
||||
"""Tests for parsing Compass output.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from Bio import Compass
|
||||
|
||||
def run_tests(argv):
|
||||
test_suite = testing_suite()
|
||||
runner = unittest.TextTestRunner(sys.stdout, verbosity = 2)
|
||||
runner.run(test_suite)
|
||||
|
||||
def testing_suite():
|
||||
"""Generate the suite of tests.
|
||||
"""
|
||||
test_suite = unittest.TestSuite()
|
||||
|
||||
test_loader = unittest.TestLoader()
|
||||
test_loader.testMethodPrefix = 'test'
|
||||
tests = [CompassTest]
|
||||
|
||||
for test in tests:
|
||||
cur_suite = test_loader.loadTestsFromTestCase(test)
|
||||
test_suite.addTest(cur_suite)
|
||||
|
||||
return test_suite
|
||||
|
||||
class CompassTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
file_dir = os.path.join("Compass")
|
||||
self.test_files = [
|
||||
os.path.join(file_dir, "comtest1"),
|
||||
os.path.join(file_dir, "comtest2")]
|
||||
|
||||
def testCompassScanAndConsume(self):
|
||||
cons = Compass._Consumer()
|
||||
scan = Compass._Scanner()
|
||||
scan.feed(open(self.test_files[0]), cons)
|
||||
|
||||
com_record = cons.data
|
||||
|
||||
self.assertEquals("60456.blo.gz.aln", com_record.query)
|
||||
self.assertEquals("60456.blo.gz.aln", com_record.hit)
|
||||
self.assertEquals(0.5, com_record.gap_threshold)
|
||||
|
||||
self.assertEquals(388, com_record.query_length)
|
||||
self.assertEquals(386, com_record.query_filtered_length)
|
||||
self.assertEquals(388, com_record.hit_length)
|
||||
self.assertEquals(386, com_record.hit_filtered_length)
|
||||
|
||||
self.assertEquals(399, com_record.query_nseqs)
|
||||
self.assertEquals(12.972, com_record.query_neffseqs)
|
||||
self.assertEquals(399, com_record.hit_nseqs)
|
||||
self.assertEquals(12.972, com_record.hit_neffseqs)
|
||||
|
||||
self.assertEquals(2759, com_record.sw_score)
|
||||
self.assertEquals(float("0.00e+00"), com_record.evalue)
|
||||
|
||||
def testCompassParser(self):
|
||||
parser = Compass.RecordParser()
|
||||
com_record = parser.parse(open(self.test_files[0]))
|
||||
|
||||
self.assertEquals("60456.blo.gz.aln", com_record.query)
|
||||
|
||||
def testCompassIteratorEasy(self):
|
||||
it = Compass.Iterator(open(self.test_files[0]))
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals("60456.blo.gz.aln", com_record.query)
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals(None, com_record)
|
||||
pass
|
||||
|
||||
def testCompassIteratorHard(self):
|
||||
it = Compass.Iterator(open(self.test_files[1]))
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals("allscop//14982.blo.gz.aln", com_record.hit)
|
||||
self.assertEquals(float('1.01e+03'), com_record.evalue)
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals("allscop//14983.blo.gz.aln", com_record.hit)
|
||||
self.assertEquals(float('1.01e+03'), com_record.evalue)
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals("allscop//14984.blo.gz.aln", com_record.hit)
|
||||
self.assertEquals(float('5.75e+02'), com_record.evalue)
|
||||
|
||||
def testAlignmentParsingOne(self):
|
||||
it = Compass.Iterator(open(self.test_files[1]))
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals(178, com_record.query_start)
|
||||
self.assertEquals("KKDLEEIAD", com_record.query_aln)
|
||||
self.assertEquals(9, com_record.hit_start)
|
||||
self.assertEquals("QAAVQAVTA", com_record.hit_aln)
|
||||
self.assertEquals("++ ++++++", com_record.positives)
|
||||
|
||||
com_record = it.next()
|
||||
com_record = it.next()
|
||||
self.assertEquals(371, com_record.query_start)
|
||||
self.assertEquals("LEEAMDRMER~~~V", com_record.query_aln)
|
||||
self.assertEquals(76, com_record.hit_start)
|
||||
self.assertEquals("LQNFIDQLDNpddL", com_record.hit_aln)
|
||||
self.assertEquals("+ ++++ + + +", com_record.positives)
|
||||
|
||||
def testAlignmentParsingTwo(self):
|
||||
it = Compass.Iterator(open(self.test_files[0]))
|
||||
|
||||
com_record = it.next()
|
||||
self.assertEquals(2, com_record.query_start)
|
||||
self.assertEquals(2, com_record.hit_start)
|
||||
self.assertEquals("LKERKL", com_record.hit_aln[-6:])
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(run_tests(sys.argv))
|
Reference in New Issue
Block a user