diff --git a/Bio/GenBank/Scanner.py b/Bio/GenBank/Scanner.py index 0ea4af00b..201818111 100644 --- a/Bio/GenBank/Scanner.py +++ b/Bio/GenBank/Scanner.py @@ -1306,17 +1306,17 @@ class GenBankScanner(InsdcScanner): ) # if line[55:62] != ' ': # raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line) + parse_date = False if line[62:73].strip(): + parse_date = True if line[64:65] != "-": - raise ValueError( - "LOCUS line does not contain - at " - "position 65 in date:\n" + line - ) + parse_date = False + warnings.warn("LOCUS line does not contain - at position 65 in date:\n" + line, + BiopythonParserWarning) if line[68:69] != "-": - raise ValueError( - "LOCUS line does not contain - at " - "position 69 in date:\n" + line - ) + parse_date = False + warnings.warn("LOCUS line does not contain - at position 69 in date:\n" + line, + BiopythonParserWarning) name_and_length_str = line[self.GENBANK_INDENT : 29] while " " in name_and_length_str: @@ -1353,7 +1353,7 @@ class GenBankScanner(InsdcScanner): consumer.molecule_type(line[33:41].strip()) consumer.topology(line[42:51].strip()) consumer.data_file_division(line[52:55]) - if line[62:73].strip(): + if parse_date: consumer.date(line[62:73]) elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [ "", @@ -1429,17 +1429,17 @@ class GenBankScanner(InsdcScanner): raise ValueError( "LOCUS line does not contain space at position 68:\n" + line ) + parse_date = False if line[68:79].strip(): + parse_date = True if line[70:71] != "-": - raise ValueError( - "LOCUS line does not contain - at " - "position 71 in date:\n" + line - ) + parse_date = False + warnings.warn("LOCUS line does not contain - at position 71 in date:\n" + line, + BiopythonParserWarning) if line[74:75] != "-": - raise ValueError( - "LOCUS line does not contain - at " - "position 75 in date:\n" + line - ) + parse_date = False + warnings.warn("LOCUS line does not contain - at position 75 in date:\n" + line, + BiopythonParserWarning) name_and_length_str = line[self.GENBANK_INDENT : 40] while " " in name_and_length_str: @@ -1467,7 +1467,7 @@ class GenBankScanner(InsdcScanner): consumer.topology(line[55:63].strip()) if line[64:76].strip(): consumer.data_file_division(line[64:67]) - if line[68:79].strip(): + if parse_date: consumer.date(line[68:79]) elif line[self.GENBANK_INDENT :].strip().count(" ") == 0: # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 diff --git a/Tests/GenBank/noref_date_warning.gb b/Tests/GenBank/noref_date_warning.gb new file mode 100644 index 000000000..b75f67d5c --- /dev/null +++ b/Tests/GenBank/noref_date_warning.gb @@ -0,0 +1,72 @@ +LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd +DEFINITION Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2 + (DNCLI2), mRNA. +ACCESSION NM_006141 +VERSION NM_006141.1 GI:5453633 +KEYWORDS . +SOURCE human. + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo. +COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final + NCBI review. The reference sequence was derived from AF035812.1. +FEATURES Location/Qualifiers + source 1..1622 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /map="16" + gene 1..1622 + /gene="DNCLI2" + /note="LIC2" + /db_xref="LocusID:1783" + CDS 7..1485 + /gene="DNCLI2" + /note="similar to R. norvegicus and G. gallus dynein light + intermediate chain 2, Swiss-Prot Accession Numbers Q62698 + and Q90828, respectively" + /codon_start=1 + /db_xref="LocusID:1783" + /product="dynein, cytoplasmic, light intermediate + polypeptide 2" + /protein_id="NP_006132.1" + /db_xref="GI:5453634" + /translation="MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVS + TRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHT + RCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHI + DKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNV + LTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKE + EKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPED + AYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSP + RTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAG + GVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA" +BASE COUNT 474 a 356 c 428 g 364 t +ORIGIN + 1 ggcaagatgg cgccggtggg ggtggagaag aagctgctgc taggtcccaa cgggcccgcg + 61 gtggcggccg ccggcgacct gaccagtgag gaggaggaag gccagagcct atggtcctcc + 121 attctgagcg aagtgtccac ccgcgccagg tccaagctgc cgtccggcaa gaacatcctg + 181 gtcttcggtg aagatggttc tggtaaaaca accctcatga ctaaactaca aggagctgag + 241 catggcaaaa aaggaagagg cctagaatat ctctacctca gtgtccatga tgaggaccga + 301 gatgatcaca cgcgctgcaa cgtgtggatt ctggatggag acttgtacca caaaggcctg + 361 ctgaaatttg cagtttctgc tgaatccttg ccagagaccc tcgtcatttt tgttgcagac + 421 atgtctagac cttggactgt gatggaatct ctgcagaaat gggctagtgt tttacgtgag + 481 cacattgata aaatgaaaat tccaccagaa aaaatgaggg agctggaacg gaagtttgtg + 541 aaagattttc aagactatat ggaacctgaa gaaggttgtc aaggttcccc acagagaaga + 601 ggccctctga cctcaggctc cgatgaagaa aatgttgccc tgcctctggg tgacaatgtg + 661 ctgactcata acctggggat cccggtgttg gtggtgtgca caaagtgtga tgcggtgagt + 721 gtcctggaga aggagcacga ttacagggat gagcatttgg actttatcca gtcacacctg + 781 cggaggttct gccttcagta tggagctgcc ttgatttaca catcagtgaa agaagagaaa + 841 aacctcgact tgttgtataa gtatattgtt cataaaacat acggtttcca cttcaccaca + 901 cctgccttag ttgtggaaaa ggatgccgtt tttatacctg caggctggga caatgaaaag + 961 aaaatagcta ttttacatga aaattttaca accgtgaagc cggaagatgc atatgaagac + 1021 tttattgtga aacctcccgt gagaaagctg gtccacgaca aagagttggc agcagaagat + 1081 gagcaggtgt tcctaatgaa gcaacagtca ctccttgcca agcaaccagc cactcccacg + 1141 agagcttctg aatctcctgc aagaggaccc tctggctctc caaggaccca gggtcgggga + 1201 gggccagcca gtgtgcctag ctcctcccca ggcacgtcag taaaaaagcc ggacccaaac + 1261 atcaaaaata atgcagcaag tgaaggggtg ttggccagct tcttcaacag tctgttgagt + 1321 aaaaagacag gctctcctgg aagtcctggt gctggtgggg tgcagagcac agccaagaag + 1381 tcaggacaaa agactgtgtt gtcaaatgtt caggaagaac tggatagaat gactcgaaag + 1441 ccagactcta tggtaacaaa ctcttcaaca gaaaatgaag cctgaacctc cttaaaaagt + 1501 gcatatgtcg aatgaccaaa taactatgta tattgatctg ctaagaccag gatttttctg + 1561 atatggcaca tgctatcagt tttttggggc aggggagatg aactttaaaa aaaaaaaaaa + 1621 aa +// diff --git a/Tests/test_GenBank.py b/Tests/test_GenBank.py index 5830acd0a..3f39e496d 100644 --- a/Tests/test_GenBank.py +++ b/Tests/test_GenBank.py @@ -3444,6 +3444,114 @@ qualifiers: """\ type: CDS location: [6:1485](+) +qualifiers: + Key: codon_start, Value: ['1'] + Key: db_xref, Value: ['LocusID:1783', 'GI:5453634'] + Key: gene, Value: ['DNCLI2'] + Key: note, Value: ['similar to R. norvegicus and G. gallus dynein light intermediate chain 2, Swiss-Prot Accession Numbers Q62698 and Q90828, respectively'] + Key: product, Value: ['dynein, cytoplasmic, light intermediate polypeptide 2'] + Key: protein_id, Value: ['NP_006132.1'] + Key: translation, Value: ['MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVSTRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHTRCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHIDKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNVLTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKEEKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPEDAYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSPRTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAGGVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA'] +""", + 1, + ), + ) + dbxrefs = [] + self.perform_feature_parser_test( + record, + seq, + id, + name, + description, + annotations, + references, + features, + dbxrefs, + ) + + def test_feature_parser_date_warning(self): + path = "GenBank/noref_date_warning.gb" + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + with open(path) as handle: + records = GenBank.Iterator(handle, self.feat_parser) + record = next(records) + self.assertEqual(len(caught), 2) + self.assertEqual(caught[0].category, BiopythonParserWarning) + self.assertEqual(caught[1].category, BiopythonParserWarning) + self.assertEqual( + str(caught[0].message), + "LOCUS line does not contain - at position 65 in date:\n" + "LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n", + ) + self.assertEqual( + str(caught[1].message), + "LOCUS line does not contain - at position 69 in date:\n" + "LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n", + ) + seq = "GGCAAGATGGCGCCGGTGGGGGTGGAGAAGAAGCTGCTGCTAGGTCCCAACGGG...AAA" + id = "NM_006141.1" + name = "NM_006141" + description = ( + "Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2" + " (DNCLI2), mRNA" + ) + annotations = { + "accessions": ["NM_006141"], + "comment": """\ +PROVISIONAL REFSEQ: This record has not yet been subject to final +NCBI review. The reference sequence was derived from AF035812.1.""", + "data_file_division": "PRI", + # "date": "01-NOV-2000", NB: no date + "gi": "5453633", + "keywords": [""], + "molecule_type": "mRNA", + "organism": "Homo sapiens", + "sequence_version": 1, + "source": "human", + "taxonomy": [ + "Eukaryota", + "Metazoa", + "Chordata", + "Craniata", + "Vertebrata", + "Euteleostomi", + "Mammalia", + "Eutheria", + "Primates", + "Catarrhini", + "Hominidae", + "Homo", + ], + } + references = [] + features = ( + ( + """\ +type: source +location: [0:1622](+) +qualifiers: + Key: db_xref, Value: ['taxon:9606'] + Key: map, Value: ['16'] + Key: organism, Value: ['Homo sapiens'] +""", + 1, + ), + ( + """\ +type: gene +location: [0:1622](+) +qualifiers: + Key: db_xref, Value: ['LocusID:1783'] + Key: gene, Value: ['DNCLI2'] + Key: note, Value: ['LIC2'] +""", + 1, + ), + ( + """\ +type: CDS +location: [6:1485](+) qualifiers: Key: codon_start, Value: ['1'] Key: db_xref, Value: ['LocusID:1783', 'GI:5453634'] @@ -8124,6 +8232,14 @@ class LineOneTests(unittest.TestCase): "BCT", None, ), + ( + "LOCUS AB070938 6497 bp DNA linear BCT" + " 1-Oct-2001\n", + "linear", + "DNA", + "BCT", + [BiopythonParserWarning, BiopythonParserWarning], + ), ( "LOCUS NC_005816 9609 bp DNA circular BCT" " 21-JUL-2008", @@ -8509,5 +8625,6 @@ class GenBankScannerTests(unittest.TestCase): if __name__ == "__main__": + os.chdir(os.path.dirname(__file__)) runner = unittest.TextTestRunner(verbosity=2) unittest.main(testRunner=runner)