mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
4775: raise warnings instead of errors if dashes are not in the expected places for dates
This commit is contained in:
committed by
Peter Cock
parent
e0c081cd4d
commit
b8b90e1152
@ -1306,17 +1306,17 @@ class GenBankScanner(InsdcScanner):
|
|||||||
)
|
)
|
||||||
# if line[55:62] != ' ':
|
# if line[55:62] != ' ':
|
||||||
# raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line)
|
# raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line)
|
||||||
|
parse_date = False
|
||||||
if line[62:73].strip():
|
if line[62:73].strip():
|
||||||
|
parse_date = True
|
||||||
if line[64:65] != "-":
|
if line[64:65] != "-":
|
||||||
raise ValueError(
|
parse_date = False
|
||||||
"LOCUS line does not contain - at "
|
warnings.warn("LOCUS line does not contain - at position 65 in date:\n" + line,
|
||||||
"position 65 in date:\n" + line
|
BiopythonParserWarning)
|
||||||
)
|
|
||||||
if line[68:69] != "-":
|
if line[68:69] != "-":
|
||||||
raise ValueError(
|
parse_date = False
|
||||||
"LOCUS line does not contain - at "
|
warnings.warn("LOCUS line does not contain - at position 69 in date:\n" + line,
|
||||||
"position 69 in date:\n" + line
|
BiopythonParserWarning)
|
||||||
)
|
|
||||||
|
|
||||||
name_and_length_str = line[self.GENBANK_INDENT : 29]
|
name_and_length_str = line[self.GENBANK_INDENT : 29]
|
||||||
while " " in name_and_length_str:
|
while " " in name_and_length_str:
|
||||||
@ -1353,7 +1353,7 @@ class GenBankScanner(InsdcScanner):
|
|||||||
consumer.molecule_type(line[33:41].strip())
|
consumer.molecule_type(line[33:41].strip())
|
||||||
consumer.topology(line[42:51].strip())
|
consumer.topology(line[42:51].strip())
|
||||||
consumer.data_file_division(line[52:55])
|
consumer.data_file_division(line[52:55])
|
||||||
if line[62:73].strip():
|
if parse_date:
|
||||||
consumer.date(line[62:73])
|
consumer.date(line[62:73])
|
||||||
elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [
|
elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [
|
||||||
"",
|
"",
|
||||||
@ -1429,17 +1429,17 @@ class GenBankScanner(InsdcScanner):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"LOCUS line does not contain space at position 68:\n" + line
|
"LOCUS line does not contain space at position 68:\n" + line
|
||||||
)
|
)
|
||||||
|
parse_date = False
|
||||||
if line[68:79].strip():
|
if line[68:79].strip():
|
||||||
|
parse_date = True
|
||||||
if line[70:71] != "-":
|
if line[70:71] != "-":
|
||||||
raise ValueError(
|
parse_date = False
|
||||||
"LOCUS line does not contain - at "
|
warnings.warn("LOCUS line does not contain - at position 71 in date:\n" + line,
|
||||||
"position 71 in date:\n" + line
|
BiopythonParserWarning)
|
||||||
)
|
|
||||||
if line[74:75] != "-":
|
if line[74:75] != "-":
|
||||||
raise ValueError(
|
parse_date = False
|
||||||
"LOCUS line does not contain - at "
|
warnings.warn("LOCUS line does not contain - at position 75 in date:\n" + line,
|
||||||
"position 75 in date:\n" + line
|
BiopythonParserWarning)
|
||||||
)
|
|
||||||
|
|
||||||
name_and_length_str = line[self.GENBANK_INDENT : 40]
|
name_and_length_str = line[self.GENBANK_INDENT : 40]
|
||||||
while " " in name_and_length_str:
|
while " " in name_and_length_str:
|
||||||
@ -1467,7 +1467,7 @@ class GenBankScanner(InsdcScanner):
|
|||||||
consumer.topology(line[55:63].strip())
|
consumer.topology(line[55:63].strip())
|
||||||
if line[64:76].strip():
|
if line[64:76].strip():
|
||||||
consumer.data_file_division(line[64:67])
|
consumer.data_file_division(line[64:67])
|
||||||
if line[68:79].strip():
|
if parse_date:
|
||||||
consumer.date(line[68:79])
|
consumer.date(line[68:79])
|
||||||
elif line[self.GENBANK_INDENT :].strip().count(" ") == 0:
|
elif line[self.GENBANK_INDENT :].strip().count(" ") == 0:
|
||||||
# Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
|
# Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
|
||||||
|
72
Tests/GenBank/noref_date_warning.gb
Normal file
72
Tests/GenBank/noref_date_warning.gb
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd
|
||||||
|
DEFINITION Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2
|
||||||
|
(DNCLI2), mRNA.
|
||||||
|
ACCESSION NM_006141
|
||||||
|
VERSION NM_006141.1 GI:5453633
|
||||||
|
KEYWORDS .
|
||||||
|
SOURCE human.
|
||||||
|
ORGANISM Homo sapiens
|
||||||
|
Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
|
||||||
|
Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo.
|
||||||
|
COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final
|
||||||
|
NCBI review. The reference sequence was derived from AF035812.1.
|
||||||
|
FEATURES Location/Qualifiers
|
||||||
|
source 1..1622
|
||||||
|
/organism="Homo sapiens"
|
||||||
|
/db_xref="taxon:9606"
|
||||||
|
/map="16"
|
||||||
|
gene 1..1622
|
||||||
|
/gene="DNCLI2"
|
||||||
|
/note="LIC2"
|
||||||
|
/db_xref="LocusID:1783"
|
||||||
|
CDS 7..1485
|
||||||
|
/gene="DNCLI2"
|
||||||
|
/note="similar to R. norvegicus and G. gallus dynein light
|
||||||
|
intermediate chain 2, Swiss-Prot Accession Numbers Q62698
|
||||||
|
and Q90828, respectively"
|
||||||
|
/codon_start=1
|
||||||
|
/db_xref="LocusID:1783"
|
||||||
|
/product="dynein, cytoplasmic, light intermediate
|
||||||
|
polypeptide 2"
|
||||||
|
/protein_id="NP_006132.1"
|
||||||
|
/db_xref="GI:5453634"
|
||||||
|
/translation="MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVS
|
||||||
|
TRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHT
|
||||||
|
RCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHI
|
||||||
|
DKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNV
|
||||||
|
LTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKE
|
||||||
|
EKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPED
|
||||||
|
AYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSP
|
||||||
|
RTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAG
|
||||||
|
GVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA"
|
||||||
|
BASE COUNT 474 a 356 c 428 g 364 t
|
||||||
|
ORIGIN
|
||||||
|
1 ggcaagatgg cgccggtggg ggtggagaag aagctgctgc taggtcccaa cgggcccgcg
|
||||||
|
61 gtggcggccg ccggcgacct gaccagtgag gaggaggaag gccagagcct atggtcctcc
|
||||||
|
121 attctgagcg aagtgtccac ccgcgccagg tccaagctgc cgtccggcaa gaacatcctg
|
||||||
|
181 gtcttcggtg aagatggttc tggtaaaaca accctcatga ctaaactaca aggagctgag
|
||||||
|
241 catggcaaaa aaggaagagg cctagaatat ctctacctca gtgtccatga tgaggaccga
|
||||||
|
301 gatgatcaca cgcgctgcaa cgtgtggatt ctggatggag acttgtacca caaaggcctg
|
||||||
|
361 ctgaaatttg cagtttctgc tgaatccttg ccagagaccc tcgtcatttt tgttgcagac
|
||||||
|
421 atgtctagac cttggactgt gatggaatct ctgcagaaat gggctagtgt tttacgtgag
|
||||||
|
481 cacattgata aaatgaaaat tccaccagaa aaaatgaggg agctggaacg gaagtttgtg
|
||||||
|
541 aaagattttc aagactatat ggaacctgaa gaaggttgtc aaggttcccc acagagaaga
|
||||||
|
601 ggccctctga cctcaggctc cgatgaagaa aatgttgccc tgcctctggg tgacaatgtg
|
||||||
|
661 ctgactcata acctggggat cccggtgttg gtggtgtgca caaagtgtga tgcggtgagt
|
||||||
|
721 gtcctggaga aggagcacga ttacagggat gagcatttgg actttatcca gtcacacctg
|
||||||
|
781 cggaggttct gccttcagta tggagctgcc ttgatttaca catcagtgaa agaagagaaa
|
||||||
|
841 aacctcgact tgttgtataa gtatattgtt cataaaacat acggtttcca cttcaccaca
|
||||||
|
901 cctgccttag ttgtggaaaa ggatgccgtt tttatacctg caggctggga caatgaaaag
|
||||||
|
961 aaaatagcta ttttacatga aaattttaca accgtgaagc cggaagatgc atatgaagac
|
||||||
|
1021 tttattgtga aacctcccgt gagaaagctg gtccacgaca aagagttggc agcagaagat
|
||||||
|
1081 gagcaggtgt tcctaatgaa gcaacagtca ctccttgcca agcaaccagc cactcccacg
|
||||||
|
1141 agagcttctg aatctcctgc aagaggaccc tctggctctc caaggaccca gggtcgggga
|
||||||
|
1201 gggccagcca gtgtgcctag ctcctcccca ggcacgtcag taaaaaagcc ggacccaaac
|
||||||
|
1261 atcaaaaata atgcagcaag tgaaggggtg ttggccagct tcttcaacag tctgttgagt
|
||||||
|
1321 aaaaagacag gctctcctgg aagtcctggt gctggtgggg tgcagagcac agccaagaag
|
||||||
|
1381 tcaggacaaa agactgtgtt gtcaaatgtt caggaagaac tggatagaat gactcgaaag
|
||||||
|
1441 ccagactcta tggtaacaaa ctcttcaaca gaaaatgaag cctgaacctc cttaaaaagt
|
||||||
|
1501 gcatatgtcg aatgaccaaa taactatgta tattgatctg ctaagaccag gatttttctg
|
||||||
|
1561 atatggcaca tgctatcagt tttttggggc aggggagatg aactttaaaa aaaaaaaaaa
|
||||||
|
1621 aa
|
||||||
|
//
|
@ -3444,6 +3444,114 @@ qualifiers:
|
|||||||
"""\
|
"""\
|
||||||
type: CDS
|
type: CDS
|
||||||
location: [6:1485](+)
|
location: [6:1485](+)
|
||||||
|
qualifiers:
|
||||||
|
Key: codon_start, Value: ['1']
|
||||||
|
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
|
||||||
|
Key: gene, Value: ['DNCLI2']
|
||||||
|
Key: note, Value: ['similar to R. norvegicus and G. gallus dynein light intermediate chain 2, Swiss-Prot Accession Numbers Q62698 and Q90828, respectively']
|
||||||
|
Key: product, Value: ['dynein, cytoplasmic, light intermediate polypeptide 2']
|
||||||
|
Key: protein_id, Value: ['NP_006132.1']
|
||||||
|
Key: translation, Value: ['MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVSTRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHTRCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHIDKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNVLTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKEEKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPEDAYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSPRTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAGGVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA']
|
||||||
|
""",
|
||||||
|
1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
dbxrefs = []
|
||||||
|
self.perform_feature_parser_test(
|
||||||
|
record,
|
||||||
|
seq,
|
||||||
|
id,
|
||||||
|
name,
|
||||||
|
description,
|
||||||
|
annotations,
|
||||||
|
references,
|
||||||
|
features,
|
||||||
|
dbxrefs,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_feature_parser_date_warning(self):
|
||||||
|
path = "GenBank/noref_date_warning.gb"
|
||||||
|
with warnings.catch_warnings(record=True) as caught:
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
with open(path) as handle:
|
||||||
|
records = GenBank.Iterator(handle, self.feat_parser)
|
||||||
|
record = next(records)
|
||||||
|
self.assertEqual(len(caught), 2)
|
||||||
|
self.assertEqual(caught[0].category, BiopythonParserWarning)
|
||||||
|
self.assertEqual(caught[1].category, BiopythonParserWarning)
|
||||||
|
self.assertEqual(
|
||||||
|
str(caught[0].message),
|
||||||
|
"LOCUS line does not contain - at position 65 in date:\n"
|
||||||
|
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
str(caught[1].message),
|
||||||
|
"LOCUS line does not contain - at position 69 in date:\n"
|
||||||
|
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
|
||||||
|
)
|
||||||
|
seq = "GGCAAGATGGCGCCGGTGGGGGTGGAGAAGAAGCTGCTGCTAGGTCCCAACGGG...AAA"
|
||||||
|
id = "NM_006141.1"
|
||||||
|
name = "NM_006141"
|
||||||
|
description = (
|
||||||
|
"Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2"
|
||||||
|
" (DNCLI2), mRNA"
|
||||||
|
)
|
||||||
|
annotations = {
|
||||||
|
"accessions": ["NM_006141"],
|
||||||
|
"comment": """\
|
||||||
|
PROVISIONAL REFSEQ: This record has not yet been subject to final
|
||||||
|
NCBI review. The reference sequence was derived from AF035812.1.""",
|
||||||
|
"data_file_division": "PRI",
|
||||||
|
# NB: no "date" entry is expected - the malformed LOCUS date (missing dashes) triggers warnings and is not parsed
|
||||||
|
"gi": "5453633",
|
||||||
|
"keywords": [""],
|
||||||
|
"molecule_type": "mRNA",
|
||||||
|
"organism": "Homo sapiens",
|
||||||
|
"sequence_version": 1,
|
||||||
|
"source": "human",
|
||||||
|
"taxonomy": [
|
||||||
|
"Eukaryota",
|
||||||
|
"Metazoa",
|
||||||
|
"Chordata",
|
||||||
|
"Craniata",
|
||||||
|
"Vertebrata",
|
||||||
|
"Euteleostomi",
|
||||||
|
"Mammalia",
|
||||||
|
"Eutheria",
|
||||||
|
"Primates",
|
||||||
|
"Catarrhini",
|
||||||
|
"Hominidae",
|
||||||
|
"Homo",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
references = []
|
||||||
|
features = (
|
||||||
|
(
|
||||||
|
"""\
|
||||||
|
type: source
|
||||||
|
location: [0:1622](+)
|
||||||
|
qualifiers:
|
||||||
|
Key: db_xref, Value: ['taxon:9606']
|
||||||
|
Key: map, Value: ['16']
|
||||||
|
Key: organism, Value: ['Homo sapiens']
|
||||||
|
""",
|
||||||
|
1,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"""\
|
||||||
|
type: gene
|
||||||
|
location: [0:1622](+)
|
||||||
|
qualifiers:
|
||||||
|
Key: db_xref, Value: ['LocusID:1783']
|
||||||
|
Key: gene, Value: ['DNCLI2']
|
||||||
|
Key: note, Value: ['LIC2']
|
||||||
|
""",
|
||||||
|
1,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"""\
|
||||||
|
type: CDS
|
||||||
|
location: [6:1485](+)
|
||||||
qualifiers:
|
qualifiers:
|
||||||
Key: codon_start, Value: ['1']
|
Key: codon_start, Value: ['1']
|
||||||
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
|
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
|
||||||
@ -8124,6 +8232,14 @@ class LineOneTests(unittest.TestCase):
|
|||||||
"BCT",
|
"BCT",
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"LOCUS AB070938 6497 bp DNA linear BCT"
|
||||||
|
" 1-Oct-2001\n",
|
||||||
|
"linear",
|
||||||
|
"DNA",
|
||||||
|
"BCT",
|
||||||
|
[BiopythonParserWarning, BiopythonParserWarning],
|
||||||
|
),
|
||||||
(
|
(
|
||||||
"LOCUS NC_005816 9609 bp DNA circular BCT"
|
"LOCUS NC_005816 9609 bp DNA circular BCT"
|
||||||
" 21-JUL-2008",
|
" 21-JUL-2008",
|
||||||
@ -8509,5 +8625,6 @@ class GenBankScannerTests(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
os.chdir(os.path.dirname(__file__))
|
||||||
runner = unittest.TextTestRunner(verbosity=2)
|
runner = unittest.TextTestRunner(verbosity=2)
|
||||||
unittest.main(testRunner=runner)
|
unittest.main(testRunner=runner)
|
||||||
|
Reference in New Issue
Block a user