Fix NBRF/PIR parsing so that it accepts files produced by Clustalw -- thanks to Ashleigh Smythe for the bug report.

Fix bugs in NBRF parsing where not all of the comments were passed through.
Try to speed up sequence building by using joins instead of repeatedly appending to the sequence string.
This commit is contained in:
chapmanb
2004-03-02 23:17:13 +00:00
parent f1693ec3ed
commit eb65c38f54
5 changed files with 7835 additions and 7676 deletions

View File

@ -91,7 +91,7 @@ class _Scanner:
"sequence_name", \
"comment", \
"sequence_line", \
"sequence_final_line" ]
"sequence_final_text" ]
# make a parser that returns only the tags we are interested in
expression = Martel.select_names( nbrf_format.nbrf_record, self.interest_tags)
@ -115,29 +115,28 @@ class _RecordConsumer:
"""
def __init__(self):
self.data = Record.Record()
self._sequences = []
def sequence_type(self, sequence_type ):
self.data.sequence_type = sequence_type[ 0 ]
self.data.sequence_type = "".join(sequence_type)
def sequence_name(self, sequence_name ):
self.data.sequence_name = sequence_name[ 0 ]
self.data.sequence_name = "".join(sequence_name)
def comment(self, comment ):
self.data.comment = comment[ 0 ]
self.data.comment = "".join(comment)
def sequence_line( self, sequences ):
for sequence in sequences:
sequence = sequence.strip()
sequence = sequence.replace( ' ', '' )
self.data.sequence.data = self.data.sequence.data + sequence[:]
new_seq = "".join(sequences)
parts = new_seq.split()
self._sequences.append("".join(parts))
def sequence_final_line( self, sequences ):
for sequence in sequences:
sequence = sequence.strip()
sequence = sequence.replace( ' ', '' )
sequence = sequence[ :-1 ]
self.data.sequence.data = self.data.sequence.data + sequence[:]
def sequence_final_text( self, sequences ):
new_seq = "".join(sequences)
parts = new_seq.split()
self._sequences.append("".join(parts))
self.data.sequence.data = "".join(self._sequences)
class RecordParser:

View File

@ -9,15 +9,6 @@ This is a huge regular expression for NBRF, built using
the 'regular expressions on steroids' capabilities of Martel.
http://www-nbrf.georgetown.edu/pirwww/pirhome.shtml
Notes:
Just so I remember -- the new end of line syntax is:
New regexp syntax - \R
\R means "\n|\r\n?"
[\R] means "[\n\r]"
This helps us have endlines be consistent across platforms.
"""
# standard library
#http://www-nbrf.georgetown.edu/pirwww/pirhome.shtml
@ -27,59 +18,41 @@ import string
import Martel
from Martel import RecordReader
from Martel import Str
from Martel import AnyEol
from Martel import AnyEol, UntilEol
from Martel import ToEol
from Martel import Group
from Martel import Alt
from Martel import Alt, Opt
from Martel import Rep
from Martel import Rep1
from Martel import Any
from Martel import AnyBut
from Martel import UntilSep
from Bio.NBRF.ValSeq import valid_sequence_dict
# --- first set up some helper constants and functions
# Copyright 2001 by Katharine Lindner. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
sequence_types = map( Str, valid_sequence_dict.keys() )
sequence_type = Group( "sequence_type", Alt( *sequence_types ) )
sequence_name = Group( "sequence_name", Rep1( Martel.Expression.Dot() ) )
name_line = Martel.Group( "name_line", \
Str( ">" ) +
sequence_type +
Str( ";" ) +
sequence_name +
UntilEol("sequence_name") +
AnyEol() )
comment_line = Group( "comment_line",
Rep1(AnyBut(' ')) +
ToEol( "comment" ) )
excluded_chars = chr( 0x2a ) + chr( 10 ) + chr( 13 )
comment_line = UntilEol("comment") + AnyEol()
# 0x2a -- '*'
# 10 -- '\n', 13 -- '\r' newline endings
excluded_chars = chr(0x2a) + chr(10) + chr(13)
# sequence lines with only sequence
sequence_text = Group( "sequence_text", \
Martel.Rep1( AnyBut( excluded_chars ) ) )
sequence_final_text = Group( "sequence_final_text", \
Martel.Rep1( AnyBut( excluded_chars ) ) )
sequence_final_line = Group( "sequence_final_line",
sequence_final_text +
Str( chr( 0x2a ) ) +
AnyEol() )
sequence_line = Group( "sequence_line", sequence_text +
AnyEol() )
sequence_block = Group( "sequence_block", Rep( sequence_line ) )
AnyEol())
# the final line has a '*' and potentially some sequence
sequence_final_line = Group( "sequence_final_line",
UntilSep("sequence_final_text", chr(0x2a)) + Str(chr(0x2a)) +
Rep1(AnyEol()))
sequence_block = Group("sequence_block", Rep( sequence_line ))
nbrf_record = name_line + comment_line + sequence_block + sequence_final_line

108
Tests/NBRF/clustalw.pir Normal file
View File

@ -0,0 +1,108 @@
>DL;804Angiostrongylus_cantonensis
--------------------------------------------------
----------------------------------ATT-AAGCCATG-CA-
T-GAG-GA--GTTC-A--GC------TT--TA-A----G-T-GA--AA-C
-TGCGAACGGCTCATTAG-AGCAGATG-T-GATT---TATT-CG--G--A
A-A--A-T--CC-T----ATT-GGA--TAACTGCG--GTAAT-TCTGGAG
CTAATACATGCGTAT-A-A-AC-CCTG-AC---T--T-T--C---GAAA-
-GGGTGCAAT-TA-TTAGAG---C---AA-A-TCAAT-CAT---------
----T-T---TC----------G-GA------TG----TAGTT-------
---T---GCT---G-A-C-TC-TGAATA-A---CG--CAG--CATA-TCG
G-CGGC-T-T-GT---TCGCCGATAAT-CCGAAAA----AG---TGT-C-
TGCCC-TATCA--AC---CT---GA-TGGTAGTCTATTAGTCTA-CCATG
GTTATTACGGGTAACGGAGAATAAGGGTT-CGACTCCGGAGAGGGAGCCT
TAGAAACGGCTACCACATCCAAGGAAGGCAGCAG-GCGCGAAACTTATCC
AA-T-CTTG-----A-ATAGATGA-GATAGTGACT---------------
--------AAAAATAAAAA--GACCA---TTCC-T-AT-G--GAACG-GT
TATTTCAATGAGT--TGATCATAAACCTTTTTT--C-G-AGTA--TCCAG
TGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTC--CACTA
GTGTA-AATCGTCATTGCTGCGGTTAAAAAGC-TCGTAGTTGGAT-C-TG
AGTTGC---AT--GCA-AT-G-ATTCG--C-CT----T--TG--G--CGT
----TAAT------C---AT-TG-TTGTG---ACTA---T------T-T-
--G--CTG--G-T-T--TTCT-AT--TG-A--AA-----TTTC-----G-
A-TT-----TCTTTA-GTG-GC-TA--GCGA-GTT-TA-CTTTGA-AT-A
AATTAAAGTGCT-CAGAACAAG---CGTT-----T--GC-TT-G--AAT-
G-GTCGAT-CATGGAATAA-----TAAAAGAGGAC--TTCG---GT-T--
----CTATT-T----ATTGGTTC-AG---G-AA------CTG------AA
GT-AATGATTAAGAGGGACA--ATTC-GGGGGCATTCGTATCCCTGCGCG
AGAGGTGAAATTCGTG-GACCG-CAGGGGGACGCCCTAAAGCGAAAG-CA
TTTGCC-AAGAAT--GTCTTCATTAATCA-AGAACGAAAGTCAGAGGTTC
GAAGGCGATTAGATA--CCGCCC-TAGTTCTGACCGTAAACTATGCCATC
TAGC-GA--TCC-GAT--GG-GG--TA--T--TG--T-T----GCCTT--
GTCGAGG-AGCTT-CCCGGAAACGA--AA-GTCTTTCGGT-TCCTGGGGT
AGTATGGTTGC-AAAGCT-G-AAACTTAAAGA-AATTGACGGAATGGCAC
CACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGA--AAACT
-CACCC-GGCCCGGACACCGTAA-GGATTGAC-----AGATTGA--A---
AGCTCTTTCTC-GATTTGGTGGTTGGTGGTGCATGGCCGTTCTTAGTTG-
GTGGAG-CGATTTGTCTGGTTTATTCC-GAT-AACGAGCGAGACTCT-AG
-C-C--TG-CTAAA-TA-G--TGA--CTA---------------GA----
TT-----------AT------T----GAGTC-------TA-G----T--C
-------TA-------------C-TT-----CTT-AG---AGGGATAAG-
CGG---TGTT-T-----A-G-C--CGCA--CG-AGATTGAGCGATAACAG
GTCTGTGATGCCCTTAGATGTCCGGGG-CTG-CACGCGCGCTACAATGGA
AG-AAT-CAGC--TGGC---CTA--T----CCAT-TGC-CG-A-AAGGT-
AT----T----GGTAAACCG-TTGAAACT--CTTCC-GTG-ACCGGGATA
GGGAATTGT--A-ATT---------ATT---TCCC-TTGAACG-AGGAAT
TCCTAGTAAGTGTG-AGTCATCAGCTCACGCTGATTACGTCCC-TGCCAT
TTGTACACACCGCCCGTCGCTGTC-CGGG-ACTG--AGC-TGTC--TCGA
GAGGACT-GCGG-A-CTA----CT--GTA----TTGA-GG---CCT----
---T---CGGG------TCG-----CGATA----TGGCG---GG-AAA-C
AG-TTC-AATC-G-CAATG-G--CTTGAACCGGGTAAAAGTCGT-AACAA
GGTATCTG------------------------------------------
---------------------------
*
>DL;815Parelaphostrongylus_odocoil
--------------------------------------------------
----------------------------------ATT-AAGCCATG-CA-
T-GTG-GA--GTTC-A--AC------TT--CA-A---AG-T-GA--AA-C
-TGCGAACGGCTCATTAG-AGCAGATG-T-CATT---TATT-CG--G--A
A-A--A-T--CC-T--T-AAT-GGA--TAACTGCG--GTAAT-TCTGGAG
CTAATACATATGCAT-A-A-AC-CCTG-AC---T--C-TG-T---GAAA-
-GGGTGCAAT-TA-TTAGAG---C---AA-A-TCAAT-CAT---------
----T-T---TC----------G-GA------TG----TAGTT-------
---T---GCT---G-A-C-TC-TGAATA-A---CG--CAG--CATA-TCG
G-CGGC-T-T-GT---TCGCCGATATT-CCGAAAA----AG---TGT-C-
TGCCC-TATCA--AC---CT---GA-TGGTAGTCTATTAGTCTA-CCATG
GTTATTACGGGTAACGGAGAATAAGGGTT-CGACTCCGGAGAGGGAGCCT
TAGAAACGGCTACCACATCCAAGGAAGGCAGCAG-GCGCGAAACTTATCC
AA-T-CTTG-----A-ATAGATGA-GATAGTGACT---------------
--------AAAAATAAAAA--GACCA---TTCC-T-AT-G--GAACG-GT
CATTTCAATGAGT--TGATCATAAACCTTTTTT--C-G-AGTA--TCAAG
TGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTC--CACTA
GTGTA-AATCGTCATTGCTGCGGTTAAAAAGC-TCGTAGTTGGAT-C-TG
AGTCGC---AT--GCA-AT-G-ATTCG--C-CT----T--TG--G--CGT
----TAAT------C---AT-TG-TTGTG---ACTA---T------T-T-
--G--CTG--G-T-T--TTCT-AT--TG-A--AA-----TTTC-----G-
A-TT-----TCTATA-GTG-GC-TA--GCGA-GTT-TA-CTTTGA-AT-A
AATTAAAGTGCT-CAGAACAAG---CGTT-----T--GC-TT-G--AAT-
G-GTCGAT-CATGGAATAA-----TAAAAGAGGAC--TTCG---GT-T--
----CTATT-T----ATTGGTTC-AG---G-AA------CTG------AA
AT-AATGGTTAAGAGGGACA--ATTC-GGGGGCATTCGTATCCCTGCGCG
AGAGGTGAAATTCGTG-GACCG-CAGGGGGACGCCCTAAAGCGAAAG-CA
TTTGCC-AAGAAT--GTCTTCATTAATCA-AGAACGAAAGTCAGAGGTTC
GAAGGCGATTAGATA--CCGCCC-TAGTTCTGACCGTAAACTATGCCATC
TAGC-GA--TCC-GAT--GG-GG--TA--T--TG--T-T----GCCTT--
GTCGAGG-AGCTT-CCCGGAAACGA--AA-GTCTTTCGGT-TCCTGGGGT
AGTATGGTTGC-AAAGCT-G-AAACTTAAAGA-AATTGACGGAATGGCAC
CACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGA--AAACT
-CACCC-GGCCCGGACACCGTAA-GGATTGAC-----AGATTGA--A---
AGCTCTTTCTC-GATTTGGTGGTTGGTGGTGCATGGCCGTTCTTAGTTG-
GTGGAG-CGATTTGTCTGGTTTATTCC-GAT-AACGAGCGAGACTCT-AG
-C-C--TG-CTAAA-TA-G--TGA--CTA---------------GA----
T------------ACG-----T----ATGTC-------TA-G----T--C
-------TA-------------C-TT-----CTT-AG---AGGGATAAG-
CGG---TGTT-T-----A-G-C--CGCA--CG-AGATTGAGCGATAACAG
GTCTGTGATGCCCTTAGATGTTCGGGG-CTG-CACGCGCGCTACAATGGA
AG-AAT-CAGC--TGGC---CTA--T----CCAT-TAC-CG-A-AAGGT-
AT----T----GGTAAACCG-TTGAAACT--CTTCC-GTG-ACCGGGATA
GGGAATTGT--A-ATT---------ATT---TCCC-TTGAACG-AGGAAT
TCCTAGTAAGTGTG-AGTCATCAGCTCACGCTGATTACGTCCC-TGCCAT
TTGTACACACCGCCCGTCGCTGTC-CGGG-ACTG--AGC-TGTC--TCGA
GAGGACT-GCGG-A-CTA----CT--GTA----TTGA-GG---CCT----
---T---CGGG------TCG-----CGATA----TGGCG---GG-AAA-C
AG-TTC-AATC-G-CAATG-G--CTTGAACCGGGTAAAAGTCGT-A----
--------------------------------------------------
---------------------------
*

File diff suppressed because it is too large Load Diff

View File

@ -7,8 +7,8 @@ import Bio.NBRF
from Bio.RecordFile import RecordFile
from Bio.File import SGMLHandle
testfiles = [ 'B_nuc.pir', 'Cw_prot.pir', 'DMA_nuc.pir', 'DMB_prot.pir'
]
testfiles = [ 'B_nuc.pir', 'Cw_prot.pir', 'DMA_nuc.pir', 'DMB_prot.pir',
'clustalw.pir']
for file in testfiles:
fh = open(os.path.join("NBRF", file))