mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
Fix NBRF/PIR parsing so that it accepts files produced by Clustalw -- thanks to Ashleigh Smythe for the bug report.
Fix bugs in NBRF parsing where not all of the comments were passed through. Try to speed up sequence building using joins instead of sequential addition of sequence.
This commit is contained in:
@ -91,7 +91,7 @@ class _Scanner:
|
||||
"sequence_name", \
|
||||
"comment", \
|
||||
"sequence_line", \
|
||||
"sequence_final_line" ]
|
||||
"sequence_final_text" ]
|
||||
|
||||
# make a parser that returns only the tags we are interested in
|
||||
expression = Martel.select_names( nbrf_format.nbrf_record, self.interest_tags)
|
||||
@ -115,29 +115,28 @@ class _RecordConsumer:
|
||||
"""
|
||||
def __init__(self):
|
||||
self.data = Record.Record()
|
||||
|
||||
self._sequences = []
|
||||
|
||||
def sequence_type(self, sequence_type ):
|
||||
self.data.sequence_type = sequence_type[ 0 ]
|
||||
self.data.sequence_type = "".join(sequence_type)
|
||||
|
||||
def sequence_name(self, sequence_name ):
|
||||
self.data.sequence_name = sequence_name[ 0 ]
|
||||
self.data.sequence_name = "".join(sequence_name)
|
||||
|
||||
def comment(self, comment ):
|
||||
self.data.comment = comment[ 0 ]
|
||||
self.data.comment = "".join(comment)
|
||||
|
||||
def sequence_line( self, sequences ):
|
||||
for sequence in sequences:
|
||||
sequence = sequence.strip()
|
||||
sequence = sequence.replace( ' ', '' )
|
||||
self.data.sequence.data = self.data.sequence.data + sequence[:]
|
||||
new_seq = "".join(sequences)
|
||||
parts = new_seq.split()
|
||||
self._sequences.append("".join(parts))
|
||||
|
||||
def sequence_final_line( self, sequences ):
|
||||
for sequence in sequences:
|
||||
sequence = sequence.strip()
|
||||
sequence = sequence.replace( ' ', '' )
|
||||
sequence = sequence[ :-1 ]
|
||||
self.data.sequence.data = self.data.sequence.data + sequence[:]
|
||||
def sequence_final_text( self, sequences ):
|
||||
new_seq = "".join(sequences)
|
||||
parts = new_seq.split()
|
||||
self._sequences.append("".join(parts))
|
||||
|
||||
self.data.sequence.data = "".join(self._sequences)
|
||||
|
||||
|
||||
class RecordParser:
|
||||
|
@ -9,15 +9,6 @@ This is a huge regular expression for NBRF, built using
|
||||
the 'regular expressions on steroids' capabilities of Martel.
|
||||
|
||||
http://www-nbrf.georgetown.edu/pirwww/pirhome.shtml
|
||||
|
||||
Notes:
|
||||
Just so I remember -- the new end of line syntax is:
|
||||
New regexp syntax - \R
|
||||
\R means "\n|\r\n?"
|
||||
[\R] means "[\n\r]"
|
||||
|
||||
This helps us have endlines be consistent across platforms.
|
||||
|
||||
"""
|
||||
# standard library
|
||||
#http://www-nbrf.georgetown.edu/pirwww/pirhome.shtml
|
||||
@ -27,59 +18,41 @@ import string
|
||||
import Martel
|
||||
from Martel import RecordReader
|
||||
from Martel import Str
|
||||
from Martel import AnyEol
|
||||
from Martel import AnyEol, UntilEol
|
||||
from Martel import ToEol
|
||||
from Martel import Group
|
||||
from Martel import Alt
|
||||
from Martel import Alt, Opt
|
||||
from Martel import Rep
|
||||
from Martel import Rep1
|
||||
from Martel import Any
|
||||
from Martel import AnyBut
|
||||
|
||||
from Martel import UntilSep
|
||||
|
||||
from Bio.NBRF.ValSeq import valid_sequence_dict
|
||||
|
||||
|
||||
|
||||
# --- first set up some helper constants and functions
|
||||
# Copyright 2001 by Katharine Lindner. All rights reserved.
|
||||
# This code is part of the Biopython distribution and governed by its
|
||||
# license. Please see the LICENSE file that should have been included
|
||||
# as part of this package.
|
||||
|
||||
|
||||
sequence_types = map( Str, valid_sequence_dict.keys() )
|
||||
sequence_type = Group( "sequence_type", Alt( *sequence_types ) )
|
||||
sequence_name = Group( "sequence_name", Rep1( Martel.Expression.Dot() ) )
|
||||
name_line = Martel.Group( "name_line", \
|
||||
Str( ">" ) +
|
||||
sequence_type +
|
||||
Str( ";" ) +
|
||||
sequence_name +
|
||||
UntilEol("sequence_name") +
|
||||
AnyEol() )
|
||||
comment_line = Group( "comment_line",
|
||||
Rep1(AnyBut(' ')) +
|
||||
ToEol( "comment" ) )
|
||||
|
||||
excluded_chars = chr( 0x2a ) + chr( 10 ) + chr( 13 )
|
||||
comment_line = UntilEol("comment") + AnyEol()
|
||||
|
||||
# 0x2a -- '*'
|
||||
# 10 -- '\n', 13 -- '\r' newline endings
|
||||
excluded_chars = chr(0x2a) + chr(10) + chr(13)
|
||||
# sequence lines with only sequence
|
||||
sequence_text = Group( "sequence_text", \
|
||||
Martel.Rep1( AnyBut( excluded_chars ) ) )
|
||||
sequence_final_text = Group( "sequence_final_text", \
|
||||
Martel.Rep1( AnyBut( excluded_chars ) ) )
|
||||
sequence_final_line = Group( "sequence_final_line",
|
||||
sequence_final_text +
|
||||
Str( chr( 0x2a ) ) +
|
||||
AnyEol() )
|
||||
sequence_line = Group( "sequence_line", sequence_text +
|
||||
AnyEol() )
|
||||
sequence_block = Group( "sequence_block", Rep( sequence_line ) )
|
||||
AnyEol())
|
||||
# the final line has a '*' and potentially some sequence before it
|
||||
sequence_final_line = Group( "sequence_final_line",
|
||||
UntilSep("sequence_final_text", chr(0x2a)) + Str(chr(0x2a)) +
|
||||
Rep1(AnyEol()))
|
||||
|
||||
sequence_block = Group("sequence_block", Rep( sequence_line ))
|
||||
nbrf_record = name_line + comment_line + sequence_block + sequence_final_line
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
108
Tests/NBRF/clustalw.pir
Normal file
108
Tests/NBRF/clustalw.pir
Normal file
@ -0,0 +1,108 @@
|
||||
>DL;804Angiostrongylus_cantonensis
|
||||
|
||||
--------------------------------------------------
|
||||
----------------------------------ATT-AAGCCATG-CA-
|
||||
T-GAG-GA--GTTC-A--GC------TT--TA-A----G-T-GA--AA-C
|
||||
-TGCGAACGGCTCATTAG-AGCAGATG-T-GATT---TATT-CG--G--A
|
||||
A-A--A-T--CC-T----ATT-GGA--TAACTGCG--GTAAT-TCTGGAG
|
||||
CTAATACATGCGTAT-A-A-AC-CCTG-AC---T--T-T--C---GAAA-
|
||||
-GGGTGCAAT-TA-TTAGAG---C---AA-A-TCAAT-CAT---------
|
||||
----T-T---TC----------G-GA------TG----TAGTT-------
|
||||
---T---GCT---G-A-C-TC-TGAATA-A---CG--CAG--CATA-TCG
|
||||
G-CGGC-T-T-GT---TCGCCGATAAT-CCGAAAA----AG---TGT-C-
|
||||
TGCCC-TATCA--AC---CT---GA-TGGTAGTCTATTAGTCTA-CCATG
|
||||
GTTATTACGGGTAACGGAGAATAAGGGTT-CGACTCCGGAGAGGGAGCCT
|
||||
TAGAAACGGCTACCACATCCAAGGAAGGCAGCAG-GCGCGAAACTTATCC
|
||||
AA-T-CTTG-----A-ATAGATGA-GATAGTGACT---------------
|
||||
--------AAAAATAAAAA--GACCA---TTCC-T-AT-G--GAACG-GT
|
||||
TATTTCAATGAGT--TGATCATAAACCTTTTTT--C-G-AGTA--TCCAG
|
||||
TGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTC--CACTA
|
||||
GTGTA-AATCGTCATTGCTGCGGTTAAAAAGC-TCGTAGTTGGAT-C-TG
|
||||
AGTTGC---AT--GCA-AT-G-ATTCG--C-CT----T--TG--G--CGT
|
||||
----TAAT------C---AT-TG-TTGTG---ACTA---T------T-T-
|
||||
--G--CTG--G-T-T--TTCT-AT--TG-A--AA-----TTTC-----G-
|
||||
A-TT-----TCTTTA-GTG-GC-TA--GCGA-GTT-TA-CTTTGA-AT-A
|
||||
AATTAAAGTGCT-CAGAACAAG---CGTT-----T--GC-TT-G--AAT-
|
||||
G-GTCGAT-CATGGAATAA-----TAAAAGAGGAC--TTCG---GT-T--
|
||||
----CTATT-T----ATTGGTTC-AG---G-AA------CTG------AA
|
||||
GT-AATGATTAAGAGGGACA--ATTC-GGGGGCATTCGTATCCCTGCGCG
|
||||
AGAGGTGAAATTCGTG-GACCG-CAGGGGGACGCCCTAAAGCGAAAG-CA
|
||||
TTTGCC-AAGAAT--GTCTTCATTAATCA-AGAACGAAAGTCAGAGGTTC
|
||||
GAAGGCGATTAGATA--CCGCCC-TAGTTCTGACCGTAAACTATGCCATC
|
||||
TAGC-GA--TCC-GAT--GG-GG--TA--T--TG--T-T----GCCTT--
|
||||
GTCGAGG-AGCTT-CCCGGAAACGA--AA-GTCTTTCGGT-TCCTGGGGT
|
||||
AGTATGGTTGC-AAAGCT-G-AAACTTAAAGA-AATTGACGGAATGGCAC
|
||||
CACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGA--AAACT
|
||||
-CACCC-GGCCCGGACACCGTAA-GGATTGAC-----AGATTGA--A---
|
||||
AGCTCTTTCTC-GATTTGGTGGTTGGTGGTGCATGGCCGTTCTTAGTTG-
|
||||
GTGGAG-CGATTTGTCTGGTTTATTCC-GAT-AACGAGCGAGACTCT-AG
|
||||
-C-C--TG-CTAAA-TA-G--TGA--CTA---------------GA----
|
||||
TT-----------AT------T----GAGTC-------TA-G----T--C
|
||||
-------TA-------------C-TT-----CTT-AG---AGGGATAAG-
|
||||
CGG---TGTT-T-----A-G-C--CGCA--CG-AGATTGAGCGATAACAG
|
||||
GTCTGTGATGCCCTTAGATGTCCGGGG-CTG-CACGCGCGCTACAATGGA
|
||||
AG-AAT-CAGC--TGGC---CTA--T----CCAT-TGC-CG-A-AAGGT-
|
||||
AT----T----GGTAAACCG-TTGAAACT--CTTCC-GTG-ACCGGGATA
|
||||
GGGAATTGT--A-ATT---------ATT---TCCC-TTGAACG-AGGAAT
|
||||
TCCTAGTAAGTGTG-AGTCATCAGCTCACGCTGATTACGTCCC-TGCCAT
|
||||
TTGTACACACCGCCCGTCGCTGTC-CGGG-ACTG--AGC-TGTC--TCGA
|
||||
GAGGACT-GCGG-A-CTA----CT--GTA----TTGA-GG---CCT----
|
||||
---T---CGGG------TCG-----CGATA----TGGCG---GG-AAA-C
|
||||
AG-TTC-AATC-G-CAATG-G--CTTGAACCGGGTAAAAGTCGT-AACAA
|
||||
GGTATCTG------------------------------------------
|
||||
---------------------------
|
||||
*
|
||||
>DL;815Parelaphostrongylus_odocoil
|
||||
|
||||
--------------------------------------------------
|
||||
----------------------------------ATT-AAGCCATG-CA-
|
||||
T-GTG-GA--GTTC-A--AC------TT--CA-A---AG-T-GA--AA-C
|
||||
-TGCGAACGGCTCATTAG-AGCAGATG-T-CATT---TATT-CG--G--A
|
||||
A-A--A-T--CC-T--T-AAT-GGA--TAACTGCG--GTAAT-TCTGGAG
|
||||
CTAATACATATGCAT-A-A-AC-CCTG-AC---T--C-TG-T---GAAA-
|
||||
-GGGTGCAAT-TA-TTAGAG---C---AA-A-TCAAT-CAT---------
|
||||
----T-T---TC----------G-GA------TG----TAGTT-------
|
||||
---T---GCT---G-A-C-TC-TGAATA-A---CG--CAG--CATA-TCG
|
||||
G-CGGC-T-T-GT---TCGCCGATATT-CCGAAAA----AG---TGT-C-
|
||||
TGCCC-TATCA--AC---CT---GA-TGGTAGTCTATTAGTCTA-CCATG
|
||||
GTTATTACGGGTAACGGAGAATAAGGGTT-CGACTCCGGAGAGGGAGCCT
|
||||
TAGAAACGGCTACCACATCCAAGGAAGGCAGCAG-GCGCGAAACTTATCC
|
||||
AA-T-CTTG-----A-ATAGATGA-GATAGTGACT---------------
|
||||
--------AAAAATAAAAA--GACCA---TTCC-T-AT-G--GAACG-GT
|
||||
CATTTCAATGAGT--TGATCATAAACCTTTTTT--C-G-AGTA--TCAAG
|
||||
TGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTC--CACTA
|
||||
GTGTA-AATCGTCATTGCTGCGGTTAAAAAGC-TCGTAGTTGGAT-C-TG
|
||||
AGTCGC---AT--GCA-AT-G-ATTCG--C-CT----T--TG--G--CGT
|
||||
----TAAT------C---AT-TG-TTGTG---ACTA---T------T-T-
|
||||
--G--CTG--G-T-T--TTCT-AT--TG-A--AA-----TTTC-----G-
|
||||
A-TT-----TCTATA-GTG-GC-TA--GCGA-GTT-TA-CTTTGA-AT-A
|
||||
AATTAAAGTGCT-CAGAACAAG---CGTT-----T--GC-TT-G--AAT-
|
||||
G-GTCGAT-CATGGAATAA-----TAAAAGAGGAC--TTCG---GT-T--
|
||||
----CTATT-T----ATTGGTTC-AG---G-AA------CTG------AA
|
||||
AT-AATGGTTAAGAGGGACA--ATTC-GGGGGCATTCGTATCCCTGCGCG
|
||||
AGAGGTGAAATTCGTG-GACCG-CAGGGGGACGCCCTAAAGCGAAAG-CA
|
||||
TTTGCC-AAGAAT--GTCTTCATTAATCA-AGAACGAAAGTCAGAGGTTC
|
||||
GAAGGCGATTAGATA--CCGCCC-TAGTTCTGACCGTAAACTATGCCATC
|
||||
TAGC-GA--TCC-GAT--GG-GG--TA--T--TG--T-T----GCCTT--
|
||||
GTCGAGG-AGCTT-CCCGGAAACGA--AA-GTCTTTCGGT-TCCTGGGGT
|
||||
AGTATGGTTGC-AAAGCT-G-AAACTTAAAGA-AATTGACGGAATGGCAC
|
||||
CACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGA--AAACT
|
||||
-CACCC-GGCCCGGACACCGTAA-GGATTGAC-----AGATTGA--A---
|
||||
AGCTCTTTCTC-GATTTGGTGGTTGGTGGTGCATGGCCGTTCTTAGTTG-
|
||||
GTGGAG-CGATTTGTCTGGTTTATTCC-GAT-AACGAGCGAGACTCT-AG
|
||||
-C-C--TG-CTAAA-TA-G--TGA--CTA---------------GA----
|
||||
T------------ACG-----T----ATGTC-------TA-G----T--C
|
||||
-------TA-------------C-TT-----CTT-AG---AGGGATAAG-
|
||||
CGG---TGTT-T-----A-G-C--CGCA--CG-AGATTGAGCGATAACAG
|
||||
GTCTGTGATGCCCTTAGATGTTCGGGG-CTG-CACGCGCGCTACAATGGA
|
||||
AG-AAT-CAGC--TGGC---CTA--T----CCAT-TAC-CG-A-AAGGT-
|
||||
AT----T----GGTAAACCG-TTGAAACT--CTTCC-GTG-ACCGGGATA
|
||||
GGGAATTGT--A-ATT---------ATT---TCCC-TTGAACG-AGGAAT
|
||||
TCCTAGTAAGTGTG-AGTCATCAGCTCACGCTGATTACGTCCC-TGCCAT
|
||||
TTGTACACACCGCCCGTCGCTGTC-CGGG-ACTG--AGC-TGTC--TCGA
|
||||
GAGGACT-GCGG-A-CTA----CT--GTA----TTGA-GG---CCT----
|
||||
---T---CGGG------TCG-----CGATA----TGGCG---GG-AAA-C
|
||||
AG-TTC-AATC-G-CAATG-G--CTTGAACCGGGTAAAAGTCGT-A----
|
||||
--------------------------------------------------
|
||||
---------------------------
|
||||
*
|
15309
Tests/output/test_nbrf
15309
Tests/output/test_nbrf
File diff suppressed because it is too large
Load Diff
@ -7,8 +7,8 @@ import Bio.NBRF
|
||||
from Bio.RecordFile import RecordFile
|
||||
from Bio.File import SGMLHandle
|
||||
|
||||
testfiles = [ 'B_nuc.pir', 'Cw_prot.pir', 'DMA_nuc.pir', 'DMB_prot.pir'
|
||||
]
|
||||
testfiles = [ 'B_nuc.pir', 'Cw_prot.pir', 'DMA_nuc.pir', 'DMB_prot.pir',
|
||||
'clustalw.pir']
|
||||
|
||||
for file in testfiles:
|
||||
fh = open(os.path.join("NBRF", file))
|
||||
|
Reference in New Issue
Block a user