Include EMBL example with max length line wrapping, and fix previous commit

This commit is contained in:
peterjc
2010-09-23 15:39:46 +01:00
parent a80edef885
commit 26dad4278d
5 changed files with 63 additions and 20 deletions

View File

@ -303,22 +303,22 @@ class _InsdcWriter(SequentialSequenceWriter):
"Returns a list of strings."""
#TODO - Do the line splitting while preserving white space?
text = text.strip()
if len(text) < max_len:
if len(text) <= max_len:
return [text]
words = text.split()
assert max([len(w) for w in words]) <= max_len, \
"Your description cannot be broken into nice lines!:\n%s" \
% repr(text)
if max([len(w) for w in words]) > max_len:
raise ValueError("Text cannot be broken into len %i lines!:\n%s"
% (max_len, repr(text)))
text = ""
while words and len(text) + 1 + len(words[0]) < max_len:
while words and len(text) + 1 + len(words[0]) <= max_len:
text += " " + words.pop(0)
text = text.strip()
assert len(text) <= max_len
answer = [text]
while words:
text = ""
while words and len(text) + 1 + len(words[0]) < max_len:
text = words.pop(0)
while words and len(text) + 1 + len(words[0]) <= max_len:
text += " " + words.pop(0)
text = text.strip()
assert len(text) <= max_len
@ -356,7 +356,7 @@ class GenBankWriter(_InsdcWriter):
def _write_single_line(self, tag, text):
"Used in the 'header' of each GenBank record."""
assert len(tag) < self.HEADER_WIDTH
assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \
assert len(text) <= self.MAX_WIDTH - self.HEADER_WIDTH, \
"Annotation %s too long for %s line" % (repr(text), tag)
self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
text.replace("\n", " ")))
@ -366,9 +366,8 @@ class GenBankWriter(_InsdcWriter):
#TODO - Do the line splitting while preserving white space?
max_len = self.MAX_WIDTH - self.HEADER_WIDTH
lines = self._split_multi_line(text, max_len)
assert len(tag) < self.HEADER_WIDTH
self._write_single_line(tag, lines[0])
for line in lines[1:] :
for line in lines[1:]:
self._write_single_line("", line)
def _write_multi_entries(self, tag, text_list):

View File

@ -259,3 +259,45 @@ SQ Sequence 153 AA; 17492 MW; 2159E552 CRC32;
KNTTEKETFC RAATVLRQFY SHHEKDTRCL GATAQQFHRH KQLIRFLKRL DRNLWGLAGL 120
NSCPVKEANQ STLENFLERL KTIMREKYSK CSS 153
//
ID CQ797900; SV 1; linear; protein; PRT; PRO; 496 AA.
XX
AC CQ797900;
XX
DT 20-APR-2004 (Rel. 79, Created)
DT 31-MAY-2006 (Rel. 88, Last updated, Version 3)
XX
DE Sequence 2 from Patent WO2004029250.
XX
KW .
XX
OS Sinorhizobium meliloti
OC Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales; Rhizobiaceae;
OC Sinorhizobium/Ensifer group; Sinorhizobium.
XX
RN [1]
RA Hoshino T., Ichikawa K., Tazoe M.;
RT "Dna encoding
RT flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena
RT se , pdxr , and microbial production of vitamin b6";
RL Patent number WO2004029250-A1/2, 08-APR-2004.
RL DSM IP Assets B.V. (NL).
XX
FH Key Location/Qualifiers
FH
FT source 1..496
FT /organism="Sinorhizobium meliloti"
FT /mol_type="protein"
FT /db_xref="taxon:382"
XX
SQ Sequence 496 AA; 53189 MW; 54C0D9AD CRC32;
MAIGTLEATT LIRGRAMTTV LPSPELIASF VDIVGPGNAL TAPADTAPYL VESRGLYRGT 60
TPLVLRPGSV EEVSLVMRLA SQTRTAVVPQ GGNTGHVAGQ IPREGKADVV LSLERLNRIR 120
DIDPVGNVIV ADAGCILADI QKAADDVDRL FPLSLGSEGS ARIGGNLSTN AGGTAVLAYG 180
NMRQLCLGLE VVLPTGEIWD GLRRLRKDNT GYDLRDLFIG AEGTLGVITG AVLKLFPKPR 240
GHQVAFAGLR SVEDALTLFD RATSVCGPAL TGFELMPRLG IEFTTRHIAG VRDPMETTHP 300
WYALIDISTS DTAESAERMV QDLLEAVIAD GLVENAVIAQ NEAQRRALWH MRESMSPAQK 360
PEGGSIKHDV SVPVSSIPAF MTEADALVSK AIPGARICAF GHMGDGNIHY NISQPVGADK 420
QSFLDRWREI NAIVHAVVLK HDGSISAEHG IGQLKRDELA AIRSPIEIEL MRRIKHAFDP 480
AGIMNPDKVL REDRGE 496
//

View File

@ -1789,7 +1789,7 @@ Testing reading genbank format file GFF/NC_001422.gbk
Failed: Missing SFF flow information
Checking can write/read as 'tab' format
Checking can write/read as 'nexus' format
Testing reading embl format file EMBL/epo_prt_start.embl
Testing reading embl format file EMBL/epo_prt_selection.embl
ID = 'A00022.1', Name='A00022',
Seq='CLARIIRYFYNAKA', length=14
ID = 'A00028.1', Name='A00028',
@ -1797,8 +1797,8 @@ Testing reading embl format file EMBL/epo_prt_start.embl
ID = 'A00031.1', Name='A00031',
Seq='RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRA...ERTCGGA', length=58
...
ID = 'A00078.1', Name='A00078',
Seq='MGLTSQLLPPLFFLLACAGNFVHGHKCDITLQEIIKTLNS...KYSKCSS', length=153
ID = 'CQ797900.1', Name='CQ797900',
Seq='MAIGTLEATTLIRGRAMTTVLPSPELIASFVDIVGPGNAL...LREDRGE', length=496
Checking can write/read as 'fasta' format
Checking can write/read as 'clustal' format
Failed: Sequences must all be the same length
@ -1808,17 +1808,19 @@ Testing reading embl format file EMBL/epo_prt_start.embl
Failed: Sequences must all be the same length
Checking can write/read as 'embl' format
Checking can write/read as 'fastq' format
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
Checking can write/read as 'fastq-illumina' format
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
Checking can write/read as 'fastq-solexa' format
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
Checking can write/read as 'genbank' format
Failed: Text cannot be broken into len 68 lines!:
'Dna encoding flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena se , pdxr , and microbial production of vitamin b6'
Checking can write/read as 'imgt' format
Checking can write/read as 'phd' format
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
Checking can write/read as 'qual' format
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
Checking can write/read as 'sff' format
Failed: Missing SFF flow information
Checking can write/read as 'tab' format

View File

@ -139,7 +139,7 @@ test_files = [ \
#Following files are also used in test_GFF.py
("genbank",False, 'GFF/NC_001422.gbk', 1),
#Following files are currently only used here or in test_SeqIO_index.py:
("embl", False, 'EMBL/epo_prt_start.embl', 8), #proteins
("embl", False, 'EMBL/epo_prt_selection.embl', 9), #proteins
("embl", False, 'EMBL/TRBG361.embl', 1),
("embl", False, 'EMBL/DD231055_edited.embl', 1),
("embl", False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line

View File

@ -128,7 +128,7 @@ tests = [
("Quality/sanger_faked.fastq", "fastq-sanger", generic_dna),
("Quality/solexa_faked.fastq", "fastq-solexa", generic_dna),
("Quality/illumina_faked.fastq", "fastq-illumina", generic_dna),
("EMBL/epo_prt_start.embl", "embl", None),
("EMBL/epo_prt_selection.embl", "embl", None),
("EMBL/U87107.embl", "embl", None),
("EMBL/TRBG361.embl", "embl", None),
("EMBL/A04195.imgt", "embl", None), #Not a proper EMBL file, an IMGT file