mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
Include EMBL example with max length line wrapping, and fix previous commit
This commit is contained in:
@ -303,22 +303,22 @@ class _InsdcWriter(SequentialSequenceWriter):
|
||||
"""Returns a list of strings."""
|
||||
#TODO - Do the line splitting while preserving white space?
|
||||
text = text.strip()
|
||||
if len(text) < max_len:
|
||||
if len(text) <= max_len:
|
||||
return [text]
|
||||
|
||||
words = text.split()
|
||||
assert max([len(w) for w in words]) <= max_len, \
|
||||
"Your description cannot be broken into nice lines!:\n%s" \
|
||||
% repr(text)
|
||||
if max([len(w) for w in words]) > max_len:
|
||||
raise ValueError("Text cannot be broken into len %i lines!:\n%s"
|
||||
% (max_len, repr(text)))
|
||||
text = ""
|
||||
while words and len(text) + 1 + len(words[0]) < max_len:
|
||||
while words and len(text) + 1 + len(words[0]) <= max_len:
|
||||
text += " " + words.pop(0)
|
||||
text = text.strip()
|
||||
assert len(text) <= max_len
|
||||
answer = [text]
|
||||
while words:
|
||||
text = ""
|
||||
while words and len(text) + 1 + len(words[0]) < max_len:
|
||||
text = words.pop(0)
|
||||
while words and len(text) + 1 + len(words[0]) <= max_len:
|
||||
text += " " + words.pop(0)
|
||||
text = text.strip()
|
||||
assert len(text) <= max_len
|
||||
@ -356,7 +356,7 @@ class GenBankWriter(_InsdcWriter):
|
||||
def _write_single_line(self, tag, text):
|
||||
"""Used in the 'header' of each GenBank record."""
|
||||
assert len(tag) < self.HEADER_WIDTH
|
||||
assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \
|
||||
assert len(text) <= self.MAX_WIDTH - self.HEADER_WIDTH, \
|
||||
"Annotation %s too long for %s line" % (repr(text), tag)
|
||||
self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
|
||||
text.replace("\n", " ")))
|
||||
@ -366,9 +366,8 @@ class GenBankWriter(_InsdcWriter):
|
||||
#TODO - Do the line splitting while preserving white space?
|
||||
max_len = self.MAX_WIDTH - self.HEADER_WIDTH
|
||||
lines = self._split_multi_line(text, max_len)
|
||||
assert len(tag) < self.HEADER_WIDTH
|
||||
self._write_single_line(tag, lines[0])
|
||||
for line in lines[1:] :
|
||||
for line in lines[1:]:
|
||||
self._write_single_line("", line)
|
||||
|
||||
def _write_multi_entries(self, tag, text_list):
|
||||
|
@ -259,3 +259,45 @@ SQ Sequence 153 AA; 17492 MW; 2159E552 CRC32;
|
||||
KNTTEKETFC RAATVLRQFY SHHEKDTRCL GATAQQFHRH KQLIRFLKRL DRNLWGLAGL 120
|
||||
NSCPVKEANQ STLENFLERL KTIMREKYSK CSS 153
|
||||
//
|
||||
ID CQ797900; SV 1; linear; protein; PRT; PRO; 496 AA.
|
||||
XX
|
||||
AC CQ797900;
|
||||
XX
|
||||
DT 20-APR-2004 (Rel. 79, Created)
|
||||
DT 31-MAY-2006 (Rel. 88, Last updated, Version 3)
|
||||
XX
|
||||
DE Sequence 2 from Patent WO2004029250.
|
||||
XX
|
||||
KW .
|
||||
XX
|
||||
OS Sinorhizobium meliloti
|
||||
OC Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales; Rhizobiaceae;
|
||||
OC Sinorhizobium/Ensifer group; Sinorhizobium.
|
||||
XX
|
||||
RN [1]
|
||||
RA Hoshino T., Ichikawa K., Tazoe M.;
|
||||
RT "Dna encoding
|
||||
RT flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena
|
||||
RT se , pdxr , and microbial production of vitamin b6";
|
||||
RL Patent number WO2004029250-A1/2, 08-APR-2004.
|
||||
RL DSM IP Assets B.V. (NL).
|
||||
XX
|
||||
FH Key Location/Qualifiers
|
||||
FH
|
||||
FT source 1..496
|
||||
FT /organism="Sinorhizobium meliloti"
|
||||
FT /mol_type="protein"
|
||||
FT /db_xref="taxon:382"
|
||||
XX
|
||||
SQ Sequence 496 AA; 53189 MW; 54C0D9AD CRC32;
|
||||
MAIGTLEATT LIRGRAMTTV LPSPELIASF VDIVGPGNAL TAPADTAPYL VESRGLYRGT 60
|
||||
TPLVLRPGSV EEVSLVMRLA SQTRTAVVPQ GGNTGHVAGQ IPREGKADVV LSLERLNRIR 120
|
||||
DIDPVGNVIV ADAGCILADI QKAADDVDRL FPLSLGSEGS ARIGGNLSTN AGGTAVLAYG 180
|
||||
NMRQLCLGLE VVLPTGEIWD GLRRLRKDNT GYDLRDLFIG AEGTLGVITG AVLKLFPKPR 240
|
||||
GHQVAFAGLR SVEDALTLFD RATSVCGPAL TGFELMPRLG IEFTTRHIAG VRDPMETTHP 300
|
||||
WYALIDISTS DTAESAERMV QDLLEAVIAD GLVENAVIAQ NEAQRRALWH MRESMSPAQK 360
|
||||
PEGGSIKHDV SVPVSSIPAF MTEADALVSK AIPGARICAF GHMGDGNIHY NISQPVGADK 420
|
||||
QSFLDRWREI NAIVHAVVLK HDGSISAEHG IGQLKRDELA AIRSPIEIEL MRRIKHAFDP 480
|
||||
AGIMNPDKVL REDRGE 496
|
||||
//
|
||||
|
@ -1789,7 +1789,7 @@ Testing reading genbank format file GFF/NC_001422.gbk
|
||||
Failed: Missing SFF flow information
|
||||
Checking can write/read as 'tab' format
|
||||
Checking can write/read as 'nexus' format
|
||||
Testing reading embl format file EMBL/epo_prt_start.embl
|
||||
Testing reading embl format file EMBL/epo_prt_selection.embl
|
||||
ID = 'A00022.1', Name='A00022',
|
||||
Seq='CLARIIRYFYNAKA', length=14
|
||||
ID = 'A00028.1', Name='A00028',
|
||||
@ -1797,8 +1797,8 @@ Testing reading embl format file EMBL/epo_prt_start.embl
|
||||
ID = 'A00031.1', Name='A00031',
|
||||
Seq='RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRA...ERTCGGA', length=58
|
||||
...
|
||||
ID = 'A00078.1', Name='A00078',
|
||||
Seq='MGLTSQLLPPLFFLLACAGNFVHGHKCDITLQEIIKTLNS...KYSKCSS', length=153
|
||||
ID = 'CQ797900.1', Name='CQ797900',
|
||||
Seq='MAIGTLEATTLIRGRAMTTVLPSPELIASFVDIVGPGNAL...LREDRGE', length=496
|
||||
Checking can write/read as 'fasta' format
|
||||
Checking can write/read as 'clustal' format
|
||||
Failed: Sequences must all be the same length
|
||||
@ -1808,17 +1808,19 @@ Testing reading embl format file EMBL/epo_prt_start.embl
|
||||
Failed: Sequences must all be the same length
|
||||
Checking can write/read as 'embl' format
|
||||
Checking can write/read as 'fastq' format
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
|
||||
Checking can write/read as 'fastq-illumina' format
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
|
||||
Checking can write/read as 'fastq-solexa' format
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
|
||||
Checking can write/read as 'genbank' format
|
||||
Failed: Text cannot be broken into len 68 lines!:
|
||||
'Dna encoding flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena se , pdxr , and microbial production of vitamin b6'
|
||||
Checking can write/read as 'imgt' format
|
||||
Checking can write/read as 'phd' format
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
|
||||
Checking can write/read as 'qual' format
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
|
||||
Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
|
||||
Checking can write/read as 'sff' format
|
||||
Failed: Missing SFF flow information
|
||||
Checking can write/read as 'tab' format
|
||||
|
@ -139,7 +139,7 @@ test_files = [ \
|
||||
#Following files are also used in test_GFF.py
|
||||
("genbank",False, 'GFF/NC_001422.gbk', 1),
|
||||
#Following files are currently only used here or in test_SeqIO_index.py:
|
||||
("embl", False, 'EMBL/epo_prt_start.embl', 8), #proteins
|
||||
("embl", False, 'EMBL/epo_prt_selection.embl', 9), #proteins
|
||||
("embl", False, 'EMBL/TRBG361.embl', 1),
|
||||
("embl", False, 'EMBL/DD231055_edited.embl', 1),
|
||||
("embl", False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
|
||||
|
@ -128,7 +128,7 @@ tests = [
|
||||
("Quality/sanger_faked.fastq", "fastq-sanger", generic_dna),
|
||||
("Quality/solexa_faked.fastq", "fastq-solexa", generic_dna),
|
||||
("Quality/illumina_faked.fastq", "fastq-illumina", generic_dna),
|
||||
("EMBL/epo_prt_start.embl", "embl", None),
|
||||
("EMBL/epo_prt_selection.embl", "embl", None),
|
||||
("EMBL/U87107.embl", "embl", None),
|
||||
("EMBL/TRBG361.embl", "embl", None),
|
||||
("EMBL/A04195.imgt", "embl", None), #Not a proper EMBL file, an IMGT file
|
||||
|
Reference in New Issue
Block a user