mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
4775: raise warnings instead of errors if dashes are not in the expected places for dates
This commit is contained in:
committed by
Peter Cock
parent
e0c081cd4d
commit
b8b90e1152
@ -1306,17 +1306,17 @@ class GenBankScanner(InsdcScanner):
|
||||
)
|
||||
# if line[55:62] != ' ':
|
||||
# raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line)
|
||||
parse_date = False
|
||||
if line[62:73].strip():
|
||||
parse_date = True
|
||||
if line[64:65] != "-":
|
||||
raise ValueError(
|
||||
"LOCUS line does not contain - at "
|
||||
"position 65 in date:\n" + line
|
||||
)
|
||||
parse_date = False
|
||||
warnings.warn("LOCUS line does not contain - at position 65 in date:\n" + line,
|
||||
BiopythonParserWarning)
|
||||
if line[68:69] != "-":
|
||||
raise ValueError(
|
||||
"LOCUS line does not contain - at "
|
||||
"position 69 in date:\n" + line
|
||||
)
|
||||
parse_date = False
|
||||
warnings.warn("LOCUS line does not contain - at position 69 in date:\n" + line,
|
||||
BiopythonParserWarning)
|
||||
|
||||
name_and_length_str = line[self.GENBANK_INDENT : 29]
|
||||
while " " in name_and_length_str:
|
||||
@ -1353,7 +1353,7 @@ class GenBankScanner(InsdcScanner):
|
||||
consumer.molecule_type(line[33:41].strip())
|
||||
consumer.topology(line[42:51].strip())
|
||||
consumer.data_file_division(line[52:55])
|
||||
if line[62:73].strip():
|
||||
if parse_date:
|
||||
consumer.date(line[62:73])
|
||||
elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [
|
||||
"",
|
||||
@ -1429,17 +1429,17 @@ class GenBankScanner(InsdcScanner):
|
||||
raise ValueError(
|
||||
"LOCUS line does not contain space at position 68:\n" + line
|
||||
)
|
||||
parse_date = False
|
||||
if line[68:79].strip():
|
||||
parse_date = True
|
||||
if line[70:71] != "-":
|
||||
raise ValueError(
|
||||
"LOCUS line does not contain - at "
|
||||
"position 71 in date:\n" + line
|
||||
)
|
||||
parse_date = False
|
||||
warnings.warn("LOCUS line does not contain - at position 71 in date:\n" + line,
|
||||
BiopythonParserWarning)
|
||||
if line[74:75] != "-":
|
||||
raise ValueError(
|
||||
"LOCUS line does not contain - at "
|
||||
"position 75 in date:\n" + line
|
||||
)
|
||||
parse_date = False
|
||||
warnings.warn("LOCUS line does not contain - at position 75 in date:\n" + line,
|
||||
BiopythonParserWarning)
|
||||
|
||||
name_and_length_str = line[self.GENBANK_INDENT : 40]
|
||||
while " " in name_and_length_str:
|
||||
@ -1467,7 +1467,7 @@ class GenBankScanner(InsdcScanner):
|
||||
consumer.topology(line[55:63].strip())
|
||||
if line[64:76].strip():
|
||||
consumer.data_file_division(line[64:67])
|
||||
if line[68:79].strip():
|
||||
if parse_date:
|
||||
consumer.date(line[68:79])
|
||||
elif line[self.GENBANK_INDENT :].strip().count(" ") == 0:
|
||||
# Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
|
||||
|
72
Tests/GenBank/noref_date_warning.gb
Normal file
72
Tests/GenBank/noref_date_warning.gb
Normal file
@ -0,0 +1,72 @@
|
||||
LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd
|
||||
DEFINITION Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2
|
||||
(DNCLI2), mRNA.
|
||||
ACCESSION NM_006141
|
||||
VERSION NM_006141.1 GI:5453633
|
||||
KEYWORDS .
|
||||
SOURCE human.
|
||||
ORGANISM Homo sapiens
|
||||
Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
|
||||
Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo.
|
||||
COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final
|
||||
NCBI review. The reference sequence was derived from AF035812.1.
|
||||
FEATURES Location/Qualifiers
|
||||
source 1..1622
|
||||
/organism="Homo sapiens"
|
||||
/db_xref="taxon:9606"
|
||||
/map="16"
|
||||
gene 1..1622
|
||||
/gene="DNCLI2"
|
||||
/note="LIC2"
|
||||
/db_xref="LocusID:1783"
|
||||
CDS 7..1485
|
||||
/gene="DNCLI2"
|
||||
/note="similar to R. norvegicus and G. gallus dynein light
|
||||
intermediate chain 2, Swiss-Prot Accession Numbers Q62698
|
||||
and Q90828, respectively"
|
||||
/codon_start=1
|
||||
/db_xref="LocusID:1783"
|
||||
/product="dynein, cytoplasmic, light intermediate
|
||||
polypeptide 2"
|
||||
/protein_id="NP_006132.1"
|
||||
/db_xref="GI:5453634"
|
||||
/translation="MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVS
|
||||
TRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHT
|
||||
RCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHI
|
||||
DKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNV
|
||||
LTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKE
|
||||
EKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPED
|
||||
AYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSP
|
||||
RTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAG
|
||||
GVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA"
|
||||
BASE COUNT 474 a 356 c 428 g 364 t
|
||||
ORIGIN
|
||||
1 ggcaagatgg cgccggtggg ggtggagaag aagctgctgc taggtcccaa cgggcccgcg
|
||||
61 gtggcggccg ccggcgacct gaccagtgag gaggaggaag gccagagcct atggtcctcc
|
||||
121 attctgagcg aagtgtccac ccgcgccagg tccaagctgc cgtccggcaa gaacatcctg
|
||||
181 gtcttcggtg aagatggttc tggtaaaaca accctcatga ctaaactaca aggagctgag
|
||||
241 catggcaaaa aaggaagagg cctagaatat ctctacctca gtgtccatga tgaggaccga
|
||||
301 gatgatcaca cgcgctgcaa cgtgtggatt ctggatggag acttgtacca caaaggcctg
|
||||
361 ctgaaatttg cagtttctgc tgaatccttg ccagagaccc tcgtcatttt tgttgcagac
|
||||
421 atgtctagac cttggactgt gatggaatct ctgcagaaat gggctagtgt tttacgtgag
|
||||
481 cacattgata aaatgaaaat tccaccagaa aaaatgaggg agctggaacg gaagtttgtg
|
||||
541 aaagattttc aagactatat ggaacctgaa gaaggttgtc aaggttcccc acagagaaga
|
||||
601 ggccctctga cctcaggctc cgatgaagaa aatgttgccc tgcctctggg tgacaatgtg
|
||||
661 ctgactcata acctggggat cccggtgttg gtggtgtgca caaagtgtga tgcggtgagt
|
||||
721 gtcctggaga aggagcacga ttacagggat gagcatttgg actttatcca gtcacacctg
|
||||
781 cggaggttct gccttcagta tggagctgcc ttgatttaca catcagtgaa agaagagaaa
|
||||
841 aacctcgact tgttgtataa gtatattgtt cataaaacat acggtttcca cttcaccaca
|
||||
901 cctgccttag ttgtggaaaa ggatgccgtt tttatacctg caggctggga caatgaaaag
|
||||
961 aaaatagcta ttttacatga aaattttaca accgtgaagc cggaagatgc atatgaagac
|
||||
1021 tttattgtga aacctcccgt gagaaagctg gtccacgaca aagagttggc agcagaagat
|
||||
1081 gagcaggtgt tcctaatgaa gcaacagtca ctccttgcca agcaaccagc cactcccacg
|
||||
1141 agagcttctg aatctcctgc aagaggaccc tctggctctc caaggaccca gggtcgggga
|
||||
1201 gggccagcca gtgtgcctag ctcctcccca ggcacgtcag taaaaaagcc ggacccaaac
|
||||
1261 atcaaaaata atgcagcaag tgaaggggtg ttggccagct tcttcaacag tctgttgagt
|
||||
1321 aaaaagacag gctctcctgg aagtcctggt gctggtgggg tgcagagcac agccaagaag
|
||||
1381 tcaggacaaa agactgtgtt gtcaaatgtt caggaagaac tggatagaat gactcgaaag
|
||||
1441 ccagactcta tggtaacaaa ctcttcaaca gaaaatgaag cctgaacctc cttaaaaagt
|
||||
1501 gcatatgtcg aatgaccaaa taactatgta tattgatctg ctaagaccag gatttttctg
|
||||
1561 atatggcaca tgctatcagt tttttggggc aggggagatg aactttaaaa aaaaaaaaaa
|
||||
1621 aa
|
||||
//
|
@ -3444,6 +3444,114 @@ qualifiers:
|
||||
"""\
|
||||
type: CDS
|
||||
location: [6:1485](+)
|
||||
qualifiers:
|
||||
Key: codon_start, Value: ['1']
|
||||
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
|
||||
Key: gene, Value: ['DNCLI2']
|
||||
Key: note, Value: ['similar to R. norvegicus and G. gallus dynein light intermediate chain 2, Swiss-Prot Accession Numbers Q62698 and Q90828, respectively']
|
||||
Key: product, Value: ['dynein, cytoplasmic, light intermediate polypeptide 2']
|
||||
Key: protein_id, Value: ['NP_006132.1']
|
||||
Key: translation, Value: ['MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVSTRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHTRCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHIDKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNVLTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKEEKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPEDAYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSPRTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAGGVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA']
|
||||
""",
|
||||
1,
|
||||
),
|
||||
)
|
||||
dbxrefs = []
|
||||
self.perform_feature_parser_test(
|
||||
record,
|
||||
seq,
|
||||
id,
|
||||
name,
|
||||
description,
|
||||
annotations,
|
||||
references,
|
||||
features,
|
||||
dbxrefs,
|
||||
)
|
||||
|
||||
def test_feature_parser_date_warning(self):
|
||||
path = "GenBank/noref_date_warning.gb"
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
warnings.simplefilter("always")
|
||||
with open(path) as handle:
|
||||
records = GenBank.Iterator(handle, self.feat_parser)
|
||||
record = next(records)
|
||||
self.assertEqual(len(caught), 2)
|
||||
self.assertEqual(caught[0].category, BiopythonParserWarning)
|
||||
self.assertEqual(caught[1].category, BiopythonParserWarning)
|
||||
self.assertEqual(
|
||||
str(caught[0].message),
|
||||
"LOCUS line does not contain - at position 65 in date:\n"
|
||||
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
|
||||
)
|
||||
self.assertEqual(
|
||||
str(caught[1].message),
|
||||
"LOCUS line does not contain - at position 69 in date:\n"
|
||||
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
|
||||
)
|
||||
seq = "GGCAAGATGGCGCCGGTGGGGGTGGAGAAGAAGCTGCTGCTAGGTCCCAACGGG...AAA"
|
||||
id = "NM_006141.1"
|
||||
name = "NM_006141"
|
||||
description = (
|
||||
"Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2"
|
||||
" (DNCLI2), mRNA"
|
||||
)
|
||||
annotations = {
|
||||
"accessions": ["NM_006141"],
|
||||
"comment": """\
|
||||
PROVISIONAL REFSEQ: This record has not yet been subject to final
|
||||
NCBI review. The reference sequence was derived from AF035812.1.""",
|
||||
"data_file_division": "PRI",
|
||||
# "date": "01-NOV-2000", NB: no date
|
||||
"gi": "5453633",
|
||||
"keywords": [""],
|
||||
"molecule_type": "mRNA",
|
||||
"organism": "Homo sapiens",
|
||||
"sequence_version": 1,
|
||||
"source": "human",
|
||||
"taxonomy": [
|
||||
"Eukaryota",
|
||||
"Metazoa",
|
||||
"Chordata",
|
||||
"Craniata",
|
||||
"Vertebrata",
|
||||
"Euteleostomi",
|
||||
"Mammalia",
|
||||
"Eutheria",
|
||||
"Primates",
|
||||
"Catarrhini",
|
||||
"Hominidae",
|
||||
"Homo",
|
||||
],
|
||||
}
|
||||
references = []
|
||||
features = (
|
||||
(
|
||||
"""\
|
||||
type: source
|
||||
location: [0:1622](+)
|
||||
qualifiers:
|
||||
Key: db_xref, Value: ['taxon:9606']
|
||||
Key: map, Value: ['16']
|
||||
Key: organism, Value: ['Homo sapiens']
|
||||
""",
|
||||
1,
|
||||
),
|
||||
(
|
||||
"""\
|
||||
type: gene
|
||||
location: [0:1622](+)
|
||||
qualifiers:
|
||||
Key: db_xref, Value: ['LocusID:1783']
|
||||
Key: gene, Value: ['DNCLI2']
|
||||
Key: note, Value: ['LIC2']
|
||||
""",
|
||||
1,
|
||||
),
|
||||
(
|
||||
"""\
|
||||
type: CDS
|
||||
location: [6:1485](+)
|
||||
qualifiers:
|
||||
Key: codon_start, Value: ['1']
|
||||
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
|
||||
@ -8124,6 +8232,14 @@ class LineOneTests(unittest.TestCase):
|
||||
"BCT",
|
||||
None,
|
||||
),
|
||||
(
|
||||
"LOCUS AB070938 6497 bp DNA linear BCT"
|
||||
" 1-Oct-2001\n",
|
||||
"linear",
|
||||
"DNA",
|
||||
"BCT",
|
||||
[BiopythonParserWarning, BiopythonParserWarning],
|
||||
),
|
||||
(
|
||||
"LOCUS NC_005816 9609 bp DNA circular BCT"
|
||||
" 21-JUL-2008",
|
||||
@ -8509,5 +8625,6 @@ class GenBankScannerTests(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
runner = unittest.TextTestRunner(verbosity=2)
|
||||
unittest.main(testRunner=runner)
|
||||
|
Reference in New Issue
Block a user