Include EMBL example with max length line wrapping, and fix previous commit

2025-10-20 13:43:47 +08:00 · 2010-09-23 15:39:46 +01:00
parent a80edef885
commit 26dad4278d
5 changed files with 63 additions and 20 deletions
--- a/Bio/SeqIO/InsdcIO.py
+++ b/Bio/SeqIO/InsdcIO.py
@ -303,22 +303,22 @@ class _InsdcWriter(SequentialSequenceWriter):
        "Returns a list of strings."""
        #TODO - Do the line splitting while preserving white space?
        text = text.strip()
-        if len(text) < max_len:
+        if len(text) <= max_len:
            return [text]

        words = text.split()
-        assert max([len(w) for w in words]) <= max_len, \
-               "Your description cannot be broken into nice lines!:\n%s" \
-               % repr(text)
+        if max([len(w) for w in words]) > max_len:
+            raise ValueError("Text cannot be broken into len %i lines!:\n%s"
+                             % (max_len, repr(text)))
        text = ""
-        while words and len(text) + 1 + len(words[0]) < max_len:
+        while words and len(text) + 1 + len(words[0]) <= max_len:
            text += " " + words.pop(0)
            text = text.strip()
        assert len(text) <= max_len
        answer = [text]
        while words:
-            text = ""
-            while words and len(text) + 1 + len(words[0]) < max_len:
+            text = words.pop(0)
+            while words and len(text) + 1 + len(words[0]) <= max_len:
                text += " " + words.pop(0)
                text = text.strip()
            assert len(text) <= max_len
@ -356,7 +356,7 @@ class GenBankWriter(_InsdcWriter):
    def _write_single_line(self, tag, text):
        "Used in the 'header' of each GenBank record."""
        assert len(tag) < self.HEADER_WIDTH
-        assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \
+        assert len(text) <= self.MAX_WIDTH - self.HEADER_WIDTH, \
               "Annotation %s too long for %s line" % (repr(text), tag)
        self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
                                      text.replace("\n", " ")))
@ -366,9 +366,8 @@ class GenBankWriter(_InsdcWriter):
        #TODO - Do the line splitting while preserving white space?
        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
        lines = self._split_multi_line(text, max_len)
-        assert len(tag) < self.HEADER_WIDTH
        self._write_single_line(tag, lines[0])
-        for line in lines[1:] :
+        for line in lines[1:]:
            self._write_single_line("", line)

    def _write_multi_entries(self, tag, text_list):
--- a/Tests/EMBL/epo_prt_selection.embl
+++ b/Tests/EMBL/epo_prt_selection.embl
@ -259,3 +259,45 @@ SQ   Sequence 153 AA; 17492 MW; 2159E552 CRC32;
     KNTTEKETFC RAATVLRQFY SHHEKDTRCL GATAQQFHRH KQLIRFLKRL DRNLWGLAGL       120
     NSCPVKEANQ STLENFLERL KTIMREKYSK CSS                                    153
 //
+ID   CQ797900; SV 1; linear; protein; PRT; PRO; 496 AA.
+XX
+AC   CQ797900;
+XX
+DT   20-APR-2004 (Rel. 79, Created)
+DT   31-MAY-2006 (Rel. 88, Last updated, Version 3)
+XX
+DE   Sequence 2 from Patent WO2004029250.
+XX
+KW   .
+XX
+OS   Sinorhizobium meliloti
+OC   Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales; Rhizobiaceae;
+OC   Sinorhizobium/Ensifer group; Sinorhizobium.
+XX
+RN   [1]
+RA   Hoshino T., Ichikawa K., Tazoe M.;
+RT   "Dna encoding
+RT   flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena
+RT   se , pdxr , and microbial production of vitamin b6";
+RL   Patent number WO2004029250-A1/2, 08-APR-2004.
+RL   DSM IP Assets B.V. (NL).
+XX
+FH   Key             Location/Qualifiers
+FH
+FT   source          1..496
+FT                   /organism="Sinorhizobium meliloti"
+FT                   /mol_type="protein"
+FT                   /db_xref="taxon:382"
+XX
+SQ   Sequence 496 AA; 53189 MW; 54C0D9AD CRC32;
+     MAIGTLEATT LIRGRAMTTV LPSPELIASF VDIVGPGNAL TAPADTAPYL VESRGLYRGT        60
+     TPLVLRPGSV EEVSLVMRLA SQTRTAVVPQ GGNTGHVAGQ IPREGKADVV LSLERLNRIR       120
+     DIDPVGNVIV ADAGCILADI QKAADDVDRL FPLSLGSEGS ARIGGNLSTN AGGTAVLAYG       180
+     NMRQLCLGLE VVLPTGEIWD GLRRLRKDNT GYDLRDLFIG AEGTLGVITG AVLKLFPKPR       240
+     GHQVAFAGLR SVEDALTLFD RATSVCGPAL TGFELMPRLG IEFTTRHIAG VRDPMETTHP       300
+     WYALIDISTS DTAESAERMV QDLLEAVIAD GLVENAVIAQ NEAQRRALWH MRESMSPAQK       360
+     PEGGSIKHDV SVPVSSIPAF MTEADALVSK AIPGARICAF GHMGDGNIHY NISQPVGADK       420
+     QSFLDRWREI NAIVHAVVLK HDGSISAEHG IGQLKRDELA AIRSPIEIEL MRRIKHAFDP       480
+     AGIMNPDKVL REDRGE                                                       496
+//
+
--- a/Tests/output/test_SeqIO
+++ b/Tests/output/test_SeqIO
@ -1789,7 +1789,7 @@ Testing reading genbank format file GFF/NC_001422.gbk
 Failed: Missing SFF flow information
 Checking can write/read as 'tab' format
 Checking can write/read as 'nexus' format
-Testing reading embl format file EMBL/epo_prt_start.embl
+Testing reading embl format file EMBL/epo_prt_selection.embl
 ID = 'A00022.1', Name='A00022',
 Seq='CLARIIRYFYNAKA', length=14
 ID = 'A00028.1', Name='A00028',
@ -1797,8 +1797,8 @@ Testing reading embl format file EMBL/epo_prt_start.embl
 ID = 'A00031.1', Name='A00031',
 Seq='RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRA...ERTCGGA', length=58
 ...
- ID = 'A00078.1', Name='A00078',
- Seq='MGLTSQLLPPLFFLLACAGNFVHGHKCDITLQEIIKTLNS...KYSKCSS', length=153
+ ID = 'CQ797900.1', Name='CQ797900',
+ Seq='MAIGTLEATTLIRGRAMTTVLPSPELIASFVDIVGPGNAL...LREDRGE', length=496
 Checking can write/read as 'fasta' format
 Checking can write/read as 'clustal' format
 Failed: Sequences must all be the same length
@ -1808,17 +1808,19 @@ Testing reading embl format file EMBL/epo_prt_start.embl
 Failed: Sequences must all be the same length
 Checking can write/read as 'embl' format
 Checking can write/read as 'fastq' format
- Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
+ Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
 Checking can write/read as 'fastq-illumina' format
- Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
+ Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
 Checking can write/read as 'fastq-solexa' format
- Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
+ Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
 Checking can write/read as 'genbank' format
+ Failed: Text cannot be broken into len 68 lines!:
+'Dna encoding flavin-adenine-dinucleotide-dependent-d-erythronate-4-phosphate-dehydrogena se , pdxr , and microbial production of vitamin b6'
 Checking can write/read as 'imgt' format
 Checking can write/read as 'phd' format
- Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
+ Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
 Checking can write/read as 'qual' format
- Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=A00078.1).
+ Failed: No suitable quality scores found in letter_annotations of SeqRecord (id=CQ797900.1).
 Checking can write/read as 'sff' format
 Failed: Missing SFF flow information
 Checking can write/read as 'tab' format
--- a/Tests/test_SeqIO.py
+++ b/Tests/test_SeqIO.py
@ -139,7 +139,7 @@ test_files = [ \
 #Following files are also used in test_GFF.py
    ("genbank",False, 'GFF/NC_001422.gbk', 1),
 #Following files are currently only used here or in test_SeqIO_index.py:
-    ("embl",   False, 'EMBL/epo_prt_start.embl', 8), #proteins
+    ("embl",   False, 'EMBL/epo_prt_selection.embl', 9), #proteins
    ("embl",   False, 'EMBL/TRBG361.embl', 1),
    ("embl",   False, 'EMBL/DD231055_edited.embl', 1),
    ("embl",   False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
--- a/Tests/test_SeqIO_index.py
+++ b/Tests/test_SeqIO_index.py
@ -128,7 +128,7 @@ tests = [
    ("Quality/sanger_faked.fastq", "fastq-sanger", generic_dna),
    ("Quality/solexa_faked.fastq", "fastq-solexa", generic_dna),
    ("Quality/illumina_faked.fastq", "fastq-illumina", generic_dna),
-    ("EMBL/epo_prt_start.embl", "embl", None),
+    ("EMBL/epo_prt_selection.embl", "embl", None),
    ("EMBL/U87107.embl", "embl", None),
    ("EMBL/TRBG361.embl", "embl", None),
    ("EMBL/A04195.imgt", "embl", None), #Not a proper EMBL file, an IMGT file