diff --git a/Bio/SeqIO/SeqXmlIO.py b/Bio/SeqIO/SeqXmlIO.py
index a5c1b29c8..0cd54a607 100644
--- a/Bio/SeqIO/SeqXmlIO.py
+++ b/Bio/SeqIO/SeqXmlIO.py
@@ -79,12 +79,24 @@ class ContentHandler(handler.ContentHandler):
)
if self.seqXMLversion is None:
raise ValueError("Failed to find seqXMLversion")
+ elif self.seqXMLversion not in ["0.1", "0.2", "0.3", "0.4"]:
+ raise ValueError("Unsupported seqXMLversion")
url = f"http://www.seqxml.org/{self.seqXMLversion}/seqxml.xsd"
- if schema != url:
+ if schema is not None and schema != url:
raise ValueError(
"XML Schema '%s' found not consistent with reported seqXML version %s"
% (schema, self.seqXMLversion)
)
+ # The speciesName and ncbiTaxID attributes on the root element are only
+ # supported in version 0.4; reject them when an older version is declared.
+ if self.speciesName and float(self.seqXMLversion) < 0.4:
+ raise ValueError(
+ "Attribute 'speciesName' on root is only supported in version 0.4"
+ )
+ if self.ncbiTaxID and float(self.seqXMLversion) < 0.4:
+ raise ValueError(
+ "Attribute 'ncbiTaxID' on root is only supported in version 0.4"
+ )
self.endElementNS = self.endSeqXMLElement
self.startElementNS = self.startEntryElement
@@ -117,7 +129,7 @@ class ContentHandler(handler.ContentHandler):
if namespace is None:
if localname == "id":
record.id = value
- elif localname == "source":
+ elif localname == "source" and float(self.seqXMLversion) >= 0.3:
record.annotations["source"] = value
else:
raise ValueError(
@@ -139,6 +151,8 @@ class ContentHandler(handler.ContentHandler):
raise ValueError("Expected to find the end of an entry element")
if qname is not None:
raise RuntimeError("Unexpected qname for entry element")
+ if self.records[-1].seq is None:
+ raise ValueError("Failed to find a sequence for entry element")
self.startElementNS = self.startEntryElement
self.endElementNS = self.endSeqXMLElement
@@ -155,9 +169,16 @@ class ContentHandler(handler.ContentHandler):
return self.startSpeciesElement(attrs)
if localname == "description":
return self.startDescriptionElement(attrs)
- if localname in ("DNAseq", "RNAseq", "AAseq"):
+ if (
+ localname in ("DNAseq", "RNAseq", "AAseq")
+ and float(self.seqXMLversion) >= 0.2
+ ) or (
+ localname in ["dnaSeq", "rnaSeq", "aaSeq"] and self.seqXMLversion == "0.1"
+ ):
return self.startSequenceElement(attrs)
- if localname == "DBRef":
+ if (localname == "DBRef" and float(self.seqXMLversion) >= 0.2) or (
+ localname == "alternativeID" and self.seqXMLversion == "0.1"
+ ):
return self.startDBRefElement(attrs)
if localname == "property":
return self.startPropertyElement(attrs)
@@ -251,11 +272,17 @@ class ContentHandler(handler.ContentHandler):
if qname is not None:
raise RuntimeError(f"Unexpected qname '{qname}' for sequence end")
record = self.records[-1]
- if localname == "DNAseq":
+ if (localname == "DNAseq" and float(self.seqXMLversion) >= 0.2) or (
+ localname == "dnaSeq" and self.seqXMLversion == "0.1"
+ ):
record.annotations["molecule_type"] = "DNA"
- elif localname == "RNAseq":
+ elif (localname == "RNAseq" and float(self.seqXMLversion) >= 0.2) or (
+ localname == "rnaSeq" and self.seqXMLversion == "0.1"
+ ):
record.annotations["molecule_type"] = "RNA"
- elif localname == "AAseq":
+ elif (localname == "AAseq" and float(self.seqXMLversion) >= 0.2) or (
+ localname == "aaSeq" and self.seqXMLversion == "0.1"
+ ):
record.annotations["molecule_type"] = "protein"
else:
raise RuntimeError(
@@ -267,12 +294,15 @@ class ContentHandler(handler.ContentHandler):
def startDBRefElement(self, attrs):
"""Parse a database cross reference."""
+ TYPE = None
source = None
ID = None
for key, value in attrs.items():
namespace, localname = key
if namespace is None:
- if localname == "source":
+ if localname == "type":
+ TYPE = value
+ elif localname == "source":
source = value
elif localname == "id":
ID = value
@@ -284,11 +314,14 @@ class ContentHandler(handler.ContentHandler):
raise ValueError(
f"Unexpected namespace '{namespace}' for DBRef attribute"
)
- # The attributes "source" and "id" are required:
+ # The attributes "source" and "id" are required, and "type" in versions
+ # 0.2-0.3:
if source is None:
raise ValueError("Failed to find source for DBRef element")
if ID is None:
raise ValueError("Failed to find id for DBRef element")
+ if TYPE is None and 0.2 <= float(self.seqXMLversion) <= 0.3:
+ raise ValueError("Failed to find type for DBRef element")
if self.data is not None:
raise RuntimeError(f"Unexpected data found: '{self.data}'")
self.data = ""
@@ -305,7 +338,9 @@ class ContentHandler(handler.ContentHandler):
raise RuntimeError(f"Unexpected namespace '{namespace}' for DBRef element")
if qname is not None:
raise RuntimeError(f"Unexpected qname '{qname}' for DBRef element")
- if localname != "DBRef":
+ if (localname != "DBRef" and float(self.seqXMLversion) >= 0.2) or (
+ localname != "alternativeID" and self.seqXMLversion == "0.1"
+ ):
raise RuntimeError(f"Unexpected localname '{localname}' for DBRef element")
if self.data:
raise RuntimeError(
diff --git a/Tests/SeqXML/corrupt_example3.xml b/Tests/SeqXML/corrupt_example3.xml
new file mode 100644
index 000000000..a946079b2
--- /dev/null
+++ b/Tests/SeqXML/corrupt_example3.xml
@@ -0,0 +1,6 @@
+
+
+
+ entry has no sequence data
+
+
diff --git a/Tests/SeqXML/version_01_example.xml b/Tests/SeqXML/version_01_example.xml
new file mode 100644
index 000000000..c81dce41f
--- /dev/null
+++ b/Tests/SeqXML/version_01_example.xml
@@ -0,0 +1,13 @@
+
+
+
+ AAAAAACGTAAA
+
+
+
+ AAAUUUUCTGAA
+
+
+ GAKKVFIEDVSKEFVEEFIWPAVQSSALYE
+
+
diff --git a/Tests/SeqXML/wrong_version1.xml b/Tests/SeqXML/wrong_version1.xml
new file mode 100644
index 000000000..8486a25f1
--- /dev/null
+++ b/Tests/SeqXML/wrong_version1.xml
@@ -0,0 +1,14 @@
+
+
+
+ AAAAAACGTAAA
+
+
+
+
+ AAAUUUUCTGAA
+
+
+ GAKKVFIEDVSKEFVEEFIWPAVQSSALYE
+
+
diff --git a/Tests/SeqXML/wrong_version2.xml b/Tests/SeqXML/wrong_version2.xml
new file mode 100644
index 000000000..2f66a1239
--- /dev/null
+++ b/Tests/SeqXML/wrong_version2.xml
@@ -0,0 +1,8 @@
+
+
+
+
+ AAAAAACGTAAA
+
+
+
diff --git a/Tests/SeqXML/wrong_version3.xml b/Tests/SeqXML/wrong_version3.xml
new file mode 100644
index 000000000..9ef5f7c8c
--- /dev/null
+++ b/Tests/SeqXML/wrong_version3.xml
@@ -0,0 +1,7 @@
+
+
+
+
+ AAAAAACGTAAA
+
+
diff --git a/Tests/test_SeqIO_SeqXML.py b/Tests/test_SeqIO_SeqXML.py
index 5bceb8c70..545de790f 100644
--- a/Tests/test_SeqIO_SeqXML.py
+++ b/Tests/test_SeqIO_SeqXML.py
@@ -226,6 +226,29 @@ class TestReadCorruptFiles(unittest.TestCase):
self.assertRaises(ValueError, f, "SeqXML/corrupt_example1.xml")
self.assertRaises(ValueError, f, "SeqXML/corrupt_example2.xml")
+ self.assertRaises(ValueError, f, "SeqXML/corrupt_example3.xml")
+
+
+class TestOldVersions(unittest.TestCase):
+ def test_version01(self):
+ """Test for version 0.1 specific features."""
+ records = list(SeqIO.parse("SeqXML/version_01_example.xml", "seqxml"))
+ self.assertEqual(records[0].seq, "AAAAAACGTAAA")
+ self.assertEqual(records[0].dbxrefs[0], "db:1")
+ self.assertEqual(records[1].seq, "AAAUUUUCTGAA")
+ self.assertEqual(records[2].seq, "GAKKVFIEDVSKEFVEEFIWPAVQSSALYE")
+
+ def test_wrong_version(self):
+ """Handling of wrong versions."""
+
+ def f(path):
+ records = SeqIO.parse(path, "seqxml")
+ for record in records:
+ pass
+
+ self.assertRaises(ValueError, f, "SeqXML/wrong_version1.xml")
+ self.assertRaises(ValueError, f, "SeqXML/wrong_version2.xml")
+ self.assertRaises(ValueError, f, "SeqXML/wrong_version3.xml")
if __name__ == "__main__":