4775: raise warnings instead of errors if dashes are not in the expected places for dates

This commit is contained in:
Evgeny Tankhilevich
2024-07-27 02:21:46 +01:00
committed by Peter Cock
parent e0c081cd4d
commit b8b90e1152
3 changed files with 207 additions and 18 deletions

View File

@ -1306,17 +1306,17 @@ class GenBankScanner(InsdcScanner):
)
# if line[55:62] != ' ':
# raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line)
parse_date = False
if line[62:73].strip():
parse_date = True
if line[64:65] != "-":
raise ValueError(
"LOCUS line does not contain - at "
"position 65 in date:\n" + line
)
parse_date = False
warnings.warn("LOCUS line does not contain - at position 65 in date:\n" + line,
BiopythonParserWarning)
if line[68:69] != "-":
raise ValueError(
"LOCUS line does not contain - at "
"position 69 in date:\n" + line
)
parse_date = False
warnings.warn("LOCUS line does not contain - at position 69 in date:\n" + line,
BiopythonParserWarning)
name_and_length_str = line[self.GENBANK_INDENT : 29]
while " " in name_and_length_str:
@ -1353,7 +1353,7 @@ class GenBankScanner(InsdcScanner):
consumer.molecule_type(line[33:41].strip())
consumer.topology(line[42:51].strip())
consumer.data_file_division(line[52:55])
if line[62:73].strip():
if parse_date:
consumer.date(line[62:73])
elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [
"",
@ -1429,17 +1429,17 @@ class GenBankScanner(InsdcScanner):
raise ValueError(
"LOCUS line does not contain space at position 68:\n" + line
)
parse_date = False
if line[68:79].strip():
parse_date = True
if line[70:71] != "-":
raise ValueError(
"LOCUS line does not contain - at "
"position 71 in date:\n" + line
)
parse_date = False
warnings.warn("LOCUS line does not contain - at position 71 in date:\n" + line,
BiopythonParserWarning)
if line[74:75] != "-":
raise ValueError(
"LOCUS line does not contain - at "
"position 75 in date:\n" + line
)
parse_date = False
warnings.warn("LOCUS line does not contain - at position 75 in date:\n" + line,
BiopythonParserWarning)
name_and_length_str = line[self.GENBANK_INDENT : 40]
while " " in name_and_length_str:
@ -1467,7 +1467,7 @@ class GenBankScanner(InsdcScanner):
consumer.topology(line[55:63].strip())
if line[64:76].strip():
consumer.data_file_division(line[64:67])
if line[68:79].strip():
if parse_date:
consumer.date(line[68:79])
elif line[self.GENBANK_INDENT :].strip().count(" ") == 0:
# Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762

View File

@ -0,0 +1,72 @@
LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd
DEFINITION Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2
(DNCLI2), mRNA.
ACCESSION NM_006141
VERSION NM_006141.1 GI:5453633
KEYWORDS .
SOURCE human.
ORGANISM Homo sapiens
Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo.
COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence was derived from AF035812.1.
FEATURES Location/Qualifiers
source 1..1622
/organism="Homo sapiens"
/db_xref="taxon:9606"
/map="16"
gene 1..1622
/gene="DNCLI2"
/note="LIC2"
/db_xref="LocusID:1783"
CDS 7..1485
/gene="DNCLI2"
/note="similar to R. norvegicus and G. gallus dynein light
intermediate chain 2, Swiss-Prot Accession Numbers Q62698
and Q90828, respectively"
/codon_start=1
/db_xref="LocusID:1783"
/product="dynein, cytoplasmic, light intermediate
polypeptide 2"
/protein_id="NP_006132.1"
/db_xref="GI:5453634"
/translation="MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVS
TRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHT
RCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHI
DKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNV
LTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKE
EKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPED
AYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSP
RTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAG
GVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA"
BASE COUNT 474 a 356 c 428 g 364 t
ORIGIN
1 ggcaagatgg cgccggtggg ggtggagaag aagctgctgc taggtcccaa cgggcccgcg
61 gtggcggccg ccggcgacct gaccagtgag gaggaggaag gccagagcct atggtcctcc
121 attctgagcg aagtgtccac ccgcgccagg tccaagctgc cgtccggcaa gaacatcctg
181 gtcttcggtg aagatggttc tggtaaaaca accctcatga ctaaactaca aggagctgag
241 catggcaaaa aaggaagagg cctagaatat ctctacctca gtgtccatga tgaggaccga
301 gatgatcaca cgcgctgcaa cgtgtggatt ctggatggag acttgtacca caaaggcctg
361 ctgaaatttg cagtttctgc tgaatccttg ccagagaccc tcgtcatttt tgttgcagac
421 atgtctagac cttggactgt gatggaatct ctgcagaaat gggctagtgt tttacgtgag
481 cacattgata aaatgaaaat tccaccagaa aaaatgaggg agctggaacg gaagtttgtg
541 aaagattttc aagactatat ggaacctgaa gaaggttgtc aaggttcccc acagagaaga
601 ggccctctga cctcaggctc cgatgaagaa aatgttgccc tgcctctggg tgacaatgtg
661 ctgactcata acctggggat cccggtgttg gtggtgtgca caaagtgtga tgcggtgagt
721 gtcctggaga aggagcacga ttacagggat gagcatttgg actttatcca gtcacacctg
781 cggaggttct gccttcagta tggagctgcc ttgatttaca catcagtgaa agaagagaaa
841 aacctcgact tgttgtataa gtatattgtt cataaaacat acggtttcca cttcaccaca
901 cctgccttag ttgtggaaaa ggatgccgtt tttatacctg caggctggga caatgaaaag
961 aaaatagcta ttttacatga aaattttaca accgtgaagc cggaagatgc atatgaagac
1021 tttattgtga aacctcccgt gagaaagctg gtccacgaca aagagttggc agcagaagat
1081 gagcaggtgt tcctaatgaa gcaacagtca ctccttgcca agcaaccagc cactcccacg
1141 agagcttctg aatctcctgc aagaggaccc tctggctctc caaggaccca gggtcgggga
1201 gggccagcca gtgtgcctag ctcctcccca ggcacgtcag taaaaaagcc ggacccaaac
1261 atcaaaaata atgcagcaag tgaaggggtg ttggccagct tcttcaacag tctgttgagt
1321 aaaaagacag gctctcctgg aagtcctggt gctggtgggg tgcagagcac agccaagaag
1381 tcaggacaaa agactgtgtt gtcaaatgtt caggaagaac tggatagaat gactcgaaag
1441 ccagactcta tggtaacaaa ctcttcaaca gaaaatgaag cctgaacctc cttaaaaagt
1501 gcatatgtcg aatgaccaaa taactatgta tattgatctg ctaagaccag gatttttctg
1561 atatggcaca tgctatcagt tttttggggc aggggagatg aactttaaaa aaaaaaaaaa
1621 aa
//

View File

@ -3444,6 +3444,114 @@ qualifiers:
"""\
type: CDS
location: [6:1485](+)
qualifiers:
Key: codon_start, Value: ['1']
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
Key: gene, Value: ['DNCLI2']
Key: note, Value: ['similar to R. norvegicus and G. gallus dynein light intermediate chain 2, Swiss-Prot Accession Numbers Q62698 and Q90828, respectively']
Key: product, Value: ['dynein, cytoplasmic, light intermediate polypeptide 2']
Key: protein_id, Value: ['NP_006132.1']
Key: translation, Value: ['MAPVGVEKKLLLGPNGPAVAAAGDLTSEEEEGQSLWSSILSEVSTRARSKLPSGKNILVFGEDGSGKTTLMTKLQGAEHGKKGRGLEYLYLSVHDEDRDDHTRCNVWILDGDLYHKGLLKFAVSAESLPETLVIFVADMSRPWTVMESLQKWASVLREHIDKMKIPPEKMRELERKFVKDFQDYMEPEEGCQGSPQRRGPLTSGSDEENVALPLGDNVLTHNLGIPVLVVCTKCDAVSVLEKEHDYRDEHLDFIQSHLRRFCLQYGAALIYTSVKEEKNLDLLYKYIVHKTYGFHFTTPALVVEKDAVFIPAGWDNEKKIAILHENFTTVKPEDAYEDFIVKPPVRKLVHDKELAAEDEQVFLMKQQSLLAKQPATPTRASESPARGPSGSPRTQGRGGPASVPSSSPGTSVKKPDPNIKNNAASEGVLASFFNSLLSKKTGSPGSPGAGGVQSTAKKSGQKTVLSNVQEELDRMTRKPDSMVTNSSTENEA']
""",
1,
),
)
dbxrefs = []
self.perform_feature_parser_test(
record,
seq,
id,
name,
description,
annotations,
references,
features,
dbxrefs,
)
def test_feature_parser_date_warning(self):
path = "GenBank/noref_date_warning.gb"
with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
with open(path) as handle:
records = GenBank.Iterator(handle, self.feat_parser)
record = next(records)
self.assertEqual(len(caught), 2)
self.assertEqual(caught[0].category, BiopythonParserWarning)
self.assertEqual(caught[1].category, BiopythonParserWarning)
self.assertEqual(
str(caught[0].message),
"LOCUS line does not contain - at position 65 in date:\n"
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
)
self.assertEqual(
str(caught[1].message),
"LOCUS line does not contain - at position 69 in date:\n"
"LOCUS NM_006141 1622 bp mRNA PRI yyyy/mon/dd\n",
)
seq = "GGCAAGATGGCGCCGGTGGGGGTGGAGAAGAAGCTGCTGCTAGGTCCCAACGGG...AAA"
id = "NM_006141.1"
name = "NM_006141"
description = (
"Homo sapiens dynein, cytoplasmic, light intermediate polypeptide 2"
" (DNCLI2), mRNA"
)
annotations = {
"accessions": ["NM_006141"],
"comment": """\
PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence was derived from AF035812.1.""",
"data_file_division": "PRI",
# "date": "01-NOV-2000", NB: no date
"gi": "5453633",
"keywords": [""],
"molecule_type": "mRNA",
"organism": "Homo sapiens",
"sequence_version": 1,
"source": "human",
"taxonomy": [
"Eukaryota",
"Metazoa",
"Chordata",
"Craniata",
"Vertebrata",
"Euteleostomi",
"Mammalia",
"Eutheria",
"Primates",
"Catarrhini",
"Hominidae",
"Homo",
],
}
references = []
features = (
(
"""\
type: source
location: [0:1622](+)
qualifiers:
Key: db_xref, Value: ['taxon:9606']
Key: map, Value: ['16']
Key: organism, Value: ['Homo sapiens']
""",
1,
),
(
"""\
type: gene
location: [0:1622](+)
qualifiers:
Key: db_xref, Value: ['LocusID:1783']
Key: gene, Value: ['DNCLI2']
Key: note, Value: ['LIC2']
""",
1,
),
(
"""\
type: CDS
location: [6:1485](+)
qualifiers:
Key: codon_start, Value: ['1']
Key: db_xref, Value: ['LocusID:1783', 'GI:5453634']
@ -8124,6 +8232,14 @@ class LineOneTests(unittest.TestCase):
"BCT",
None,
),
(
"LOCUS AB070938 6497 bp DNA linear BCT"
" 1-Oct-2001\n",
"linear",
"DNA",
"BCT",
[BiopythonParserWarning, BiopythonParserWarning],
),
(
"LOCUS NC_005816 9609 bp DNA circular BCT"
" 21-JUL-2008",
@ -8509,5 +8625,6 @@ class GenBankScannerTests(unittest.TestCase):
if __name__ == "__main__":
    # The tests open fixtures through relative paths (e.g. "GenBank/..."), so
    # run from the directory that holds this file.  Resolve to an absolute
    # path first: when the script is launched by bare filename on interpreters
    # where __file__ is relative, dirname() would be "" and chdir("") raises.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)