Support all versions of SeqXML

This commit is contained in:
Michael M
2024-01-19 22:52:22 -06:00
committed by Peter Cock
parent f7fbecb6bd
commit ec9026ce4d
7 changed files with 116 additions and 10 deletions

View File

@ -79,12 +79,24 @@ class ContentHandler(handler.ContentHandler):
)
if self.seqXMLversion is None:
raise ValueError("Failed to find seqXMLversion")
elif self.seqXMLversion not in ["0.1", "0.2", "0.3", "0.4"]:
raise ValueError("Unsupported seqXMLversion")
url = f"http://www.seqxml.org/{self.seqXMLversion}/seqxml.xsd"
if schema != url:
if schema is not None and schema != url:
raise ValueError(
"XML Schema '%s' found not consistent with reported seqXML version %s"
% (schema, self.seqXMLversion)
)
# speciesName and ncbiTaxID attributes on the root are only supported
# in 0.4
if self.speciesName and float(self.seqXMLversion) < 0.4:
raise ValueError(
"Attribute 'speciesName' on root is only supported in version 0.4"
)
# ncbiTaxID on the root element is a 0.4-only attribute; name the offending
# attribute itself in the error so the message points at the real problem.
if self.ncbiTaxID and float(self.seqXMLversion) < 0.4:
    raise ValueError(
        "Attribute 'ncbiTaxID' on root is only supported in version 0.4"
    )
self.endElementNS = self.endSeqXMLElement
self.startElementNS = self.startEntryElement
@ -117,7 +129,7 @@ class ContentHandler(handler.ContentHandler):
if namespace is None:
if localname == "id":
record.id = value
elif localname == "source":
elif localname == "source" and float(self.seqXMLversion) >= 0.3:
record.annotations["source"] = value
else:
raise ValueError(
@ -139,6 +151,8 @@ class ContentHandler(handler.ContentHandler):
raise ValueError("Expected to find the end of an entry element")
if qname is not None:
raise RuntimeError("Unexpected qname for entry element")
if self.records[-1].seq is None:
raise ValueError("Failed to find a sequence for entry element")
self.startElementNS = self.startEntryElement
self.endElementNS = self.endSeqXMLElement
@ -155,9 +169,16 @@ class ContentHandler(handler.ContentHandler):
return self.startSpeciesElement(attrs)
if localname == "description":
return self.startDescriptionElement(attrs)
if localname in ("DNAseq", "RNAseq", "AAseq"):
if (
localname in ("DNAseq", "RNAseq", "AAseq")
and float(self.seqXMLversion) >= 0.2
) or (
localname in ["dnaSeq", "rnaSeq", "aaSeq"] and self.seqXMLversion == "0.1"
):
return self.startSequenceElement(attrs)
if localname == "DBRef":
if (localname == "DBRef" and float(self.seqXMLversion) >= 0.2) or (
localname == "alternativeID" and self.seqXMLversion == "0.1"
):
return self.startDBRefElement(attrs)
if localname == "property":
return self.startPropertyElement(attrs)
@ -251,11 +272,17 @@ class ContentHandler(handler.ContentHandler):
if qname is not None:
raise RuntimeError(f"Unexpected qname '{qname}' for sequence end")
record = self.records[-1]
if localname == "DNAseq":
if (localname == "DNAseq" and float(self.seqXMLversion) >= 0.2) or (
localname == "dnaSeq" and self.seqXMLversion == "0.1"
):
record.annotations["molecule_type"] = "DNA"
elif localname == "RNAseq":
elif (localname == "RNAseq" and float(self.seqXMLversion) >= 0.2) or (
localname == "rnaSeq" and self.seqXMLversion == "0.1"
):
record.annotations["molecule_type"] = "RNA"
elif localname == "AAseq":
elif (localname == "AAseq" and float(self.seqXMLversion) >= 0.2) or (
localname == "aaSeq" and self.seqXMLversion == "0.1"
):
record.annotations["molecule_type"] = "protein"
else:
raise RuntimeError(
@ -267,12 +294,15 @@ class ContentHandler(handler.ContentHandler):
def startDBRefElement(self, attrs):
"""Parse a database cross reference."""
TYPE = None
source = None
ID = None
for key, value in attrs.items():
namespace, localname = key
if namespace is None:
if localname == "source":
if localname == "type":
TYPE = value
elif localname == "source":
source = value
elif localname == "id":
ID = value
@ -284,11 +314,14 @@ class ContentHandler(handler.ContentHandler):
raise ValueError(
f"Unexpected namespace '{namespace}' for DBRef attribute"
)
# The attributes "source" and "id" are required:
# The attributes "source" and "id" are required, and "type" in versions
# 0.2-0.3:
if source is None:
raise ValueError("Failed to find source for DBRef element")
if ID is None:
raise ValueError("Failed to find id for DBRef element")
if TYPE is None and 0.2 <= float(self.seqXMLversion) <= 0.3:
raise ValueError("Failed to find type for DBRef element")
if self.data is not None:
raise RuntimeError(f"Unexpected data found: '{self.data}'")
self.data = ""
@ -305,7 +338,9 @@ class ContentHandler(handler.ContentHandler):
raise RuntimeError(f"Unexpected namespace '{namespace}' for DBRef element")
if qname is not None:
raise RuntimeError(f"Unexpected qname '{qname}' for DBRef element")
if localname != "DBRef":
if (localname != "DBRef" and float(self.seqXMLversion) >= 0.2) or (
localname != "alternativeID" and self.seqXMLversion == "0.1"
):
raise RuntimeError(f"Unexpected localname '{localname}' for DBRef element")
if self.data:
raise RuntimeError(

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML seqXMLversion="0.4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.4/seqxml.xsd" speciesName="fake">
<entry id="fake">
<description>entry has no sequence data</description>
</entry>
</seqXML>

View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML seqXMLversion="0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.1/seqxml.xsd">
<entry id="fakeDNA">
<dnaSeq>AAAAAACGTAAA</dnaSeq>
<alternativeID source="db" id="1"/>
</entry>
<entry id="fakeRNA">
<rnaSeq>AAAUUUUCTGAA</rnaSeq>
</entry>
<entry id="fakeAA">
<aaSeq>GAKKVFIEDVSKEFVEEFIWPAVQSSALYE</aaSeq>
</entry>
</seqXML>

View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML seqXMLversion="0.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.2/seqxml.xsd">
<entry id="fakeDNA">
<DNAseq>AAAAAACGTAAA</DNAseq>
<!-- Should fail because v0.2 DBRef tags must have a type attribute -->
<DBRef source="test" id="test"/>
</entry>
<entry id="fakeRNA">
<RNAseq>AAAUUUUCTGAA</RNAseq>
</entry>
<entry id="fakeAA">
<AAseq>GAKKVFIEDVSKEFVEEFIWPAVQSSALYE</AAseq>
</entry>
</seqXML>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Should fail because speciesName in root is unsupported in v0.3 -->
<seqXML seqXMLversion="0.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.3/seqxml.xsd" speciesName="fake">
<entry id="fakeDNA">
<DNAseq>AAAAAACGTAAA</DNAseq>
<DBRef source="test" id="test"/>
</entry>
</seqXML>

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML seqXMLversion="0.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.2/seqxml.xsd">
<!-- Should fail because "source" attribute on entries is only version >=0.3 -->
<entry id="fakeDNA" source="Fake">
<DNAseq>AAAAAACGTAAA</DNAseq>
</entry>
</seqXML>

View File

@ -226,6 +226,29 @@ class TestReadCorruptFiles(unittest.TestCase):
self.assertRaises(ValueError, f, "SeqXML/corrupt_example1.xml")
self.assertRaises(ValueError, f, "SeqXML/corrupt_example2.xml")
self.assertRaises(ValueError, f, "SeqXML/corrupt_example3.xml")
class TestOldVersions(unittest.TestCase):
    """Parsing checks for SeqXML files that declare a pre-0.4 schema version."""

    def test_version01(self):
        """Test for version 0.1 specific features.

        Reads the version 0.1 example file and checks the three parsed
        sequences plus the first database cross reference of the first entry.
        """
        parsed = list(SeqIO.parse("SeqXML/version_01_example.xml", "seqxml"))
        self.assertEqual(parsed[0].seq, "AAAAAACGTAAA")
        self.assertEqual(parsed[0].dbxrefs[0], "db:1")
        self.assertEqual(parsed[1].seq, "AAAUUUUCTGAA")
        self.assertEqual(parsed[2].seq, "GAKKVFIEDVSKEFVEEFIWPAVQSSALYE")

    def test_wrong_version(self):
        """Handling of wrong versions.

        Each listed file must raise ValueError while its records are consumed.
        """

        def consume(path):
            # The parser is lazy, so iterate to force the whole file to be read.
            for _ in SeqIO.parse(path, "seqxml"):
                pass

        bad_files = (
            "SeqXML/wrong_version1.xml",
            "SeqXML/wrong_version2.xml",
            "SeqXML/wrong_version3.xml",
        )
        for path in bad_files:
            with self.assertRaises(ValueError):
                consume(path)
if __name__ == "__main__":