Files
biopython/Tests/test_SeqIO_UniprotIO.py

641 lines
26 KiB
Python

# Copyright 2010 by Andrea Pierleoni
# Revisions copyright 2010-2015 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test for the Uniprot parser on Uniprot XML files."""
import os
import unittest
from seq_tests_common import SeqRecordTestBaseClass
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import BiopythonDeprecationWarning
class ParserTests(SeqRecordTestBaseClass):
"""Tests Uniprot XML parser."""
def check_uni001(self, mode):
"""Parsing Uniprot file uni001."""
filename = "uni001"
# test the record parser
datafile = os.path.join("SwissProt", filename)
with open(datafile, mode) as handle:
seq_record = SeqIO.read(handle, "uniprot-xml")
self.assertIsInstance(seq_record, SeqRecord)
# test a couple of things on the record -- this is not exhaustive
self.assertEqual(seq_record.id, "Q91G55")
self.assertEqual(seq_record.name, "043L_IIV6")
self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
self.assertEqual(
repr(seq_record.seq),
"Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI')",
)
# self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
# self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
# self.assertEqual(record.seqinfo, (348, 39676, '75818910'))
self.assertEqual(len(seq_record.features), 1)
self.assertEqual(
repr(seq_record.features[0]),
"SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969', qualifiers=...)",
)
self.assertEqual(
str(seq_record.features[0]),
"\n".join(
[
"type: chain",
"location: [0:116]",
"id: PRO_0000377969",
"qualifiers:",
" Key: description, Value: Uncharacterized protein 043L",
" Key: id, Value: PRO_0000377969",
" Key: type, Value: chain",
"",
]
),
)
self.assertEqual(len(seq_record.annotations["references"]), 2)
self.assertEqual(
seq_record.annotations["references"][0].authors,
"Jakob N.J., Mueller K., Bahr U., Darai G.",
)
self.assertEqual(
seq_record.annotations["references"][0].title,
"Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.",
)
self.assertEqual(
seq_record.annotations["references"][0].journal,
"Virology 286:182-196(2001)",
)
self.assertEqual(
seq_record.annotations["references"][0].comment,
"journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | ",
)
self.assertEqual(len(seq_record.dbxrefs), 11)
self.assertEqual(seq_record.dbxrefs[0], "DOI:10.1006/viro.2001.0963")
self.assertEqual(seq_record.annotations["sequence_length"], 116)
self.assertEqual(
seq_record.annotations["sequence_checksum"], "4A29B35FB716523C"
)
self.assertEqual(seq_record.annotations["modified"], "2009-07-07")
self.assertEqual(seq_record.annotations["accessions"], ["Q91G55"])
self.assertEqual(
seq_record.annotations["taxonomy"],
["Viruses", "dsDNA viruses, no RNA stage", "Iridoviridae", "Iridovirus"],
)
self.assertEqual(seq_record.annotations["sequence_mass"], 13673)
self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
self.assertEqual(seq_record.annotations["gene_name_ORF"], ["IIV6-043L"])
self.assertEqual(seq_record.annotations["version"], 21)
self.assertEqual(seq_record.annotations["sequence_modified"], "2001-12-01")
self.assertEqual(
seq_record.annotations["keywords"],
["Complete proteome", "Virus reference strain"],
)
self.assertEqual(
seq_record.annotations["organism_host"],
[
"Acheta domesticus",
"House cricket",
"Chilo suppressalis",
"striped riceborer",
"Gryllus bimaculatus",
"Two-spotted cricket",
"Gryllus campestris",
"Spodoptera frugiperda",
"Fall armyworm",
],
)
self.assertEqual(seq_record.annotations["created"], "2009-06-16")
self.assertEqual(
seq_record.annotations["organism_name"], ["Chilo iridescent virus"]
)
self.assertEqual(
seq_record.annotations["organism"],
"Invertebrate iridescent virus 6 (IIV-6)",
)
self.assertEqual(
seq_record.annotations["recommendedName_fullName"],
["Uncharacterized protein 043L"],
)
self.assertEqual(seq_record.annotations["sequence_version"], 1)
self.assertEqual(seq_record.annotations["proteinExistence"], ["Predicted"])
def test_uni001(self):
"""Parsing Uniprot file uni001 in text mode and in binary mode."""
self.check_uni001("rb")
with self.assertWarns(BiopythonDeprecationWarning):
self.check_uni001("rt")
def test_uni003(self):
"""Parsing Uniprot file uni003."""
filename = "uni003"
# test the record parser
datafile = os.path.join("SwissProt", filename)
with open(datafile, "rb") as handle:
seq_record = SeqIO.read(handle, "uniprot-xml")
self.assertIsInstance(seq_record, SeqRecord)
# test general record entries
self.assertEqual(seq_record.id, "O44185")
self.assertEqual(seq_record.name, "FLP13_CAEEL")
self.assertEqual(seq_record.description, "FMRFamide-like neuropeptides 13")
self.assertEqual(
repr(seq_record.seq),
"Seq('MMTSLLTISMFVVAIQAFDSSEIRMLDEQYDTKNPFFQFLENSKRSDRPTRAMD...GRK')",
)
self.assertEqual(len(seq_record.annotations["references"]), 7)
self.assertEqual(
seq_record.annotations["references"][5].authors, "Kim K., Li C."
)
self.assertEqual(
seq_record.annotations["references"][5].title,
"Expression and regulation of an FMRFamide-related "
"neuropeptide gene family in Caenorhabditis elegans.",
)
self.assertEqual(
seq_record.annotations["references"][5].journal,
"J. Comp. Neurol. 475:540-550(2004)",
)
self.assertEqual(
seq_record.annotations["references"][5].comment,
"journal article | 2004 | Scope: TISSUE SPECIFICITY, "
"DEVELOPMENTAL STAGE | ",
)
self.assertEqual(seq_record.annotations["accessions"], ["O44185"])
self.assertEqual(seq_record.annotations["created"], "2004-05-10")
self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
self.assertEqual(seq_record.annotations["gene_name_ORF"], ["F33D4.3"])
self.assertEqual(seq_record.annotations["gene_name_primary"], "flp-13")
self.assertEqual(
seq_record.annotations["keywords"],
[
"Amidation",
"Cleavage on pair of basic residues",
"Complete proteome",
"Direct protein sequencing",
"Neuropeptide",
"Reference proteome",
"Repeat",
"Secreted",
"Signal",
],
)
self.assertEqual(seq_record.annotations["modified"], "2012-11-28")
self.assertEqual(seq_record.annotations["organism"], "Caenorhabditis elegans")
self.assertEqual(
seq_record.annotations["proteinExistence"], ["evidence at protein level"]
)
self.assertEqual(
seq_record.annotations["recommendedName_fullName"],
["FMRFamide-like neuropeptides 13"],
)
self.assertEqual(seq_record.annotations["sequence_length"], 160)
self.assertEqual(
seq_record.annotations["sequence_checksum"], "BE4C24E9B85FCD11"
)
self.assertEqual(seq_record.annotations["sequence_mass"], 17736)
self.assertEqual(seq_record.annotations["sequence_modified"], "1998-06-01")
self.assertEqual(seq_record.annotations["sequence_precursor"], "true")
self.assertEqual(seq_record.annotations["sequence_version"], 1)
self.assertEqual(
seq_record.annotations["taxonomy"],
[
"Eukaryota",
"Metazoa",
"Ecdysozoa",
"Nematoda",
"Chromadorea",
"Rhabditida",
"Rhabditoidea",
"Rhabditidae",
"Peloderinae",
"Caenorhabditis",
],
)
self.assertEqual(seq_record.annotations["type"], ["ECO:0000006", "ECO:0000001"])
self.assertEqual(seq_record.annotations["version"], 74)
# test comment entries
self.assertEqual(
seq_record.annotations["comment_allergen"],
["Causes an allergic reaction in human."],
)
self.assertEqual(
seq_record.annotations["comment_alternativeproducts_isoform"],
["Q8W1X2-1", "Q8W1X2-2"],
)
self.assertEqual(
seq_record.annotations["comment_biotechnology"],
[
"Green fluorescent protein has been engineered to "
"produce a vast number of variously colored "
"mutants, fusion proteins, and biosensors. "
"Fluorescent proteins and its mutated allelic "
"forms, blue, cyan and yellow have become a useful "
"and ubiquitous tool for making chimeric proteins, "
"where they function as a fluorescent protein tag. "
"Typically they tolerate N- and C-terminal fusion "
"to a broad variety of proteins. They have been "
"expressed in most known cell types and are used "
"as a noninvasive fluorescent marker in living "
"cells and organisms. They enable a wide range of "
"applications where they have functioned as a cell "
"lineage tracer, reporter of gene expression, or as "
"a measure of protein-protein interactions.",
"Can also be used as a molecular thermometer, "
"allowing accurate temperature measurements in "
"fluids. The measurement process relies on the "
"detection of the blinking of GFP using "
"fluorescence correlation spectroscopy.",
],
)
self.assertEqual(
seq_record.annotations["comment_catalyticactivity"],
[
"ATP + acetyl-CoA + HCO(3)(-) = ADP + phosphate + malonyl-CoA.",
"ATP + biotin-[carboxyl-carrier-protein] "
"+ CO(2) = ADP + phosphate + "
"carboxy-biotin-[carboxyl-carrier-protein].",
],
)
self.assertEqual(
seq_record.annotations["comment_caution"],
[
"Could be the product of a pseudogene. The "
"existence of a transcript at this locus is "
"supported by only one sequence submission "
"(PubMed:2174397)."
],
)
self.assertEqual(
seq_record.annotations["comment_cofactor"],
[
"Biotin (By similarity).",
"Binds 2 manganese ions per subunit (By similarity).",
],
)
self.assertEqual(
seq_record.annotations["comment_developmentalstage"],
[
"Expressed from the comma stage of embryogenesis, "
"during all larval stages, and in low levels in "
"adults."
],
)
self.assertEqual(
seq_record.annotations["comment_disease"],
[
"Defects in MC2R are the cause of glucocorticoid "
"deficiency type 1 (GCCD1) [MIM:202200]; also known "
"as familial glucocorticoid deficiency type 1 "
"(FGD1). GCCD1 is an autosomal recessive disorder "
"due to congenital insensitivity or resistance to "
"adrenocorticotropin (ACTH). It is characterized by "
"progressive primary adrenal insufficiency, without "
"mineralocorticoid deficiency."
],
)
self.assertEqual(
seq_record.annotations["comment_disruptionphenotype"],
[
"Mice display impaired B-cell development which "
"does not progress pass the progenitor stage."
],
)
self.assertEqual(
seq_record.annotations["comment_domain"],
[
"Two regions, an N-terminal (aa 96-107) and a "
"C-terminal (aa 274-311) are required for binding "
"FGF2."
],
)
self.assertEqual(
seq_record.annotations["comment_enzymeregulation"],
[
"By phosphorylation. The catalytic activity is "
"inhibited by soraphen A, a polyketide isolated "
"from the myxobacterium Sorangium cellulosum and "
"a potent inhibitor of fungal growth."
],
)
self.assertEqual(
seq_record.annotations["comment_function"],
[
"FMRFamides and FMRFamide-like peptides are "
"neuropeptides. AADGAPLIRF-amide and "
"APEASPFIRF-amide inhibit muscle tension in somatic "
"muscle. APEASPFIRF-amide is a potent inhibitor of "
"the activity of dissected pharyngeal myogenic "
"muscle system."
],
)
self.assertEqual(
seq_record.annotations["comment_induction"],
[
"Repressed in presence of fatty acids. Repressed "
"3-fold by lipid precursors, inositol and "
"choline, and also controlled by regulatory "
"factors INO2, INO4 and OPI1."
],
)
self.assertEqual(
seq_record.annotations["comment_interaction_intactId"],
["EBI-356720", "EBI-746969", "EBI-720116"],
)
self.assertEqual(
seq_record.annotations["comment_massspectrometry"],
["88..98:1032|MALDI", "100..110:1133.7|MALDI"],
)
self.assertEqual(
seq_record.annotations["comment_miscellaneous"],
["Present with 20200 molecules/cell in log phase SD medium."],
)
self.assertEqual(
seq_record.annotations["comment_onlineinformation"],
["NIEHS-SNPs@http://egp.gs.washington.edu/data/api5/"],
)
self.assertEqual(
seq_record.annotations["comment_pathway"],
[
"Lipid metabolism; malonyl-CoA biosynthesis; "
"malonyl-CoA from acetyl-CoA: step 1/1."
],
)
self.assertEqual(
seq_record.annotations["comment_RNAediting"],
[
"Partially edited. RNA editing generates receptor "
"isoforms that differ in their ability to interact "
"with the phospholipase C signaling cascade in a "
"transfected cell line, suggesting that this RNA "
"processing event may contribute to the modulation "
"of serotonergic neurotransmission in the central "
"nervous system."
],
)
self.assertEqual(
seq_record.annotations["comment_PTM"],
["Acetylation at Lys-251 impairs antiapoptotic function."],
)
self.assertEqual(
seq_record.annotations["comment_pharmaceutical"],
[
"Could be used as a possible therapeutic agent for "
"treating rheumatoid arthritis."
],
)
self.assertEqual(
seq_record.annotations["comment_polymorphism"],
[
"Position 23 is polymorphic; the frequencies in "
"unrelated Caucasians are 0.87 for Cys and 0.13 "
"for Ser."
],
)
self.assertEqual(
seq_record.annotations["comment_similarity"],
["Belongs to the FARP (FMRFamide related peptide) family."],
)
self.assertEqual(
seq_record.annotations["comment_subcellularlocation_location"], ["Secreted"]
)
self.assertEqual(seq_record.annotations["comment_subunit"], ["Homodimer."])
self.assertEqual(
seq_record.annotations["comment_tissuespecificity"],
[
"Each flp gene is expressed in a distinct set of "
"neurons. Flp-13 is expressed in the ASE sensory "
"neurons, the DD motor neurons, the 15, M3 and M5 "
"cholinergic pharyngeal motoneurons, and the ASG, "
"ASK and BAG neurons."
],
)
self.assertEqual(
seq_record.annotations["comment_toxicdose"],
[
"LD(50) is 50 ug/kg in mouse by "
"intracerebroventricular injection "
"and 600 ng/g in Blatella germanica."
],
)
self.assertEqual(
repr(seq_record.features[1]),
"SeqFeature(SimpleLocation(ExactPosition(17), ExactPosition(43)), type='propeptide', id='PRO_0000009556', qualifiers=...)",
)
self.assertEqual(
str(seq_record.features[1]),
"\n".join(
[
"type: propeptide",
"location: [17:43]",
"id: PRO_0000009556",
"qualifiers:",
" Key: evidence, Value: 7",
" Key: id, Value: PRO_0000009556",
" Key: status, Value: potential",
" Key: type, Value: propeptide",
"",
]
),
)
def test_sp016(self):
"""Parsing SwissProt file sp016."""
filename = "sp016"
# test the record parser
datafile = os.path.join("SwissProt", filename)
with open(datafile) as handle:
seq_record = SeqIO.read(handle, "swiss")
self.assertIsInstance(seq_record, SeqRecord)
# test ProteinExistence (the numerical value describing the evidence for the existence of the protein)
self.assertEqual(seq_record.annotations["protein_existence"], 1)
# test Sequence version
self.assertEqual(seq_record.annotations["sequence_version"], 1)
# test Entry version
self.assertEqual(seq_record.annotations["entry_version"], 93)
def test_P60904(self):
"""Parsing SwissProt file P60904.txt."""
filename = "P60904.txt"
# test the record parser
datafile = os.path.join("SwissProt", filename)
with open(datafile) as handle:
seq_record = SeqIO.read(handle, "swiss")
self.assertIsInstance(seq_record, SeqRecord)
# test Sequence version
self.assertEqual(seq_record.annotations["sequence_version"], 1)
# test Entry version
self.assertEqual(seq_record.annotations["entry_version"], 158)
def compare_txt_xml(self, old, new):
"""Compare text and XML based parser output."""
self.assertEqual(old.id, new.id)
self.assertEqual(old.name, new.name)
self.assertEqual(len(old), len(new))
self.assertEqual(old.seq, new.seq)
for key in set(old.annotations).intersection(new.annotations):
if key in ["date"]:
# TODO - Why is this a list vs str?
continue
self.assertIsInstance(
old.annotations[key], type(new.annotations[key]), msg=f"key={key}"
)
if key == "references":
self.assertEqual(len(old.annotations[key]), len(new.annotations[key]))
for r1, r2 in zip(old.annotations[key], new.annotations[key]):
# Tweak for line breaks in plain text SwissProt
r1.title = r1.title.replace("- ", "-")
r2.title = r2.title.replace("- ", "-")
r1.journal = r1.journal.rstrip(".") # Should parser do this?
r1.medline_id = "" # Missing in UniPort XML? TODO - check
# Lots of extra comments in UniProt XML
r1.comment = ""
r2.comment = ""
if not r2.journal:
r1.journal = ""
self.compare_reference(r1, r2)
elif key in ["organism"]:
self.assertTrue(
old.annotations[key] == new.annotations[key]
or old.annotations[key].startswith(new.annotations[key] + " ")
)
elif isinstance(old.annotations[key], list):
self.assertEqual(
sorted(old.annotations[key]), sorted(new.annotations[key])
)
else:
self.assertEqual(
old.annotations[key], new.annotations[key], msg=f"key={key}"
)
self.assertEqual(
len(old.features),
len(new.features),
"Features in %s, %i vs %i" % (old.id, len(old.features), len(new.features)),
)
for f1, f2 in zip(old.features, new.features):
self.assertEqual(
repr(f1.location),
repr(f2.location),
f"{f1.location} {f1.type} vs {f2.location} {f2.type}",
)
def test_Q13639(self):
"""Compare SwissProt text and uniprot XML versions of Q13639."""
old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
self.compare_txt_xml(old, new)
def test_H2CNN8(self):
"""Compare SwissProt text and uniprot XML versions of H2CNN8."""
old = SeqIO.read("SwissProt/H2CNN8.txt", "swiss")
new = SeqIO.read("SwissProt/H2CNN8.xml", "uniprot-xml")
self.compare_txt_xml(old, new)
def test_F2CXE6(self):
"""Compare SwissProt text and uniprot XML versions of F2CXE6."""
# This evil record has a semi-colon in the gene name,
# GN Name=HvPIP2;8 {ECO:0000313|EMBL:BAN04711.1};
# <gene><name type="primary" evidence="3">HvPIP2;8</name></gene>
old = SeqIO.read("SwissProt/F2CXE6.txt", "swiss")
new = SeqIO.read("SwissProt/F2CXE6.xml", "uniprot-xml")
self.compare_txt_xml(old, new)
# TODO - Why the mismatch gene_name vs gene_name_primary?
# TODO - Handle evidence codes on GN line (see GitHub isse #416)
self.assertEqual(
old.annotations["gene_name"],
[{"Name": "HvPIP2;8 {ECO:0000313|EMBL:BAN04711.1}"}],
)
self.assertEqual(new.annotations["gene_name_primary"], "HvPIP2;8")
self.assertEqual(old.name, "F2CXE6_HORVD")
self.assertEqual(new.name, "F2CXE6_HORVD")
def test_P84001(self):
"""Parse mass spec structured comment with unknown loc."""
xml = list(SeqIO.parse("SwissProt/P84001.xml", "uniprot-xml"))[0]
self.assertEqual(xml.id, "P84001")
self.assertEqual(len(xml.annotations["comment_massspectrometry"]), 1)
self.assertEqual(
xml.annotations["comment_massspectrometry"][0],
"undefined:9571|Electrospray",
)
def test_multi_ex(self):
"""Compare SwissProt text and uniprot XML versions of several examples."""
txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
fas_list = list(SeqIO.parse("SwissProt/multi_ex.fasta", "fasta"))
with open("SwissProt/multi_ex.list") as handle:
ids = [x.strip() for x in handle]
self.assertEqual(len(txt_list), len(ids))
self.assertEqual(len(txt_list), len(fas_list))
self.assertEqual(len(txt_list), len(xml_list))
for txt, xml, fas, id in zip(txt_list, xml_list, fas_list, ids):
self.assertEqual(txt.id, id)
self.assertIn(txt.id, fas.id.split("|"))
self.assertEqual(txt.seq, fas.seq)
self.compare_txt_xml(txt, xml)
def test_multi_ex_index(self):
"""Index SwissProt text and uniprot XML versions of several examples."""
txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
with open("SwissProt/multi_ex.list") as handle:
ids = [x.strip() for x in handle]
txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
self.assertEqual(sorted(txt_index), sorted(ids))
self.assertEqual(sorted(xml_index), sorted(ids))
# Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
for old in txt_list:
new = txt_index[old.id]
self.compare_record(old, new)
# Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
for old in xml_list:
new = xml_index[old.id]
self.compare_record(old, new)
txt_index.close()
xml_index.close()
def test_submittedName_allowed(self):
"""Checks if parser supports new XML Element (submittedName)."""
with open("SwissProt/R5HY77.xml", "rb") as handle:
for entry in SeqIO.parse(handle, "uniprot-xml"):
self.assertEqual(entry.id, "R5HY77")
self.assertEqual(entry.description, "Elongation factor Ts")
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)