diff --git a/Bio/SeqIO/SeqXmlIO.py b/Bio/SeqIO/SeqXmlIO.py index a5c1b29c8..0cd54a607 100644 --- a/Bio/SeqIO/SeqXmlIO.py +++ b/Bio/SeqIO/SeqXmlIO.py @@ -79,12 +79,24 @@ class ContentHandler(handler.ContentHandler): ) if self.seqXMLversion is None: raise ValueError("Failed to find seqXMLversion") + elif self.seqXMLversion not in ["0.1", "0.2", "0.3", "0.4"]: + raise ValueError("Unsupported seqXMLversion") url = f"http://www.seqxml.org/{self.seqXMLversion}/seqxml.xsd" - if schema != url: + if schema is not None and schema != url: raise ValueError( "XML Schema '%s' found not consistent with reported seqXML version %s" % (schema, self.seqXMLversion) ) + # speciesName and ncbiTaxID attributes on the root are only supported + # in 0.4 + if self.speciesName and float(self.seqXMLversion) < 0.4: + raise ValueError( + "Attribute 'speciesName' on root is only supported in version 0.4" + ) + if self.ncbiTaxID and float(self.seqXMLversion) < 0.4: + raise ValueError( + "Attribute 'speciesName' on root is only supported in version 0.4" + ) self.endElementNS = self.endSeqXMLElement self.startElementNS = self.startEntryElement @@ -117,7 +129,7 @@ class ContentHandler(handler.ContentHandler): if namespace is None: if localname == "id": record.id = value - elif localname == "source": + elif localname == "source" and float(self.seqXMLversion) >= 0.3: record.annotations["source"] = value else: raise ValueError( @@ -139,6 +151,8 @@ class ContentHandler(handler.ContentHandler): raise ValueError("Expected to find the end of an entry element") if qname is not None: raise RuntimeError("Unexpected qname for entry element") + if self.records[-1].seq is None: + raise ValueError("Failed to find a sequence for entry element") self.startElementNS = self.startEntryElement self.endElementNS = self.endSeqXMLElement @@ -155,9 +169,16 @@ class ContentHandler(handler.ContentHandler): return self.startSpeciesElement(attrs) if localname == "description": return self.startDescriptionElement(attrs) - if localname in ("DNAseq", "RNAseq", "AAseq"): + if ( + localname in ("DNAseq", "RNAseq", "AAseq") + and float(self.seqXMLversion) >= 0.2 + ) or ( + localname in ["dnaSeq", "rnaSeq", "aaSeq"] and self.seqXMLversion == "0.1" + ): return self.startSequenceElement(attrs) - if localname == "DBRef": + if (localname == "DBRef" and float(self.seqXMLversion) >= 0.2) or ( + localname == "alternativeID" and self.seqXMLversion == "0.1" + ): return self.startDBRefElement(attrs) if localname == "property": return self.startPropertyElement(attrs) @@ -251,11 +272,17 @@ class ContentHandler(handler.ContentHandler): if qname is not None: raise RuntimeError(f"Unexpected qname '{qname}' for sequence end") record = self.records[-1] - if localname == "DNAseq": + if (localname == "DNAseq" and float(self.seqXMLversion) >= 0.2) or ( + localname == "dnaSeq" and self.seqXMLversion == "0.1" + ): record.annotations["molecule_type"] = "DNA" - elif localname == "RNAseq": + elif (localname == "RNAseq" and float(self.seqXMLversion) >= 0.2) or ( + localname == "rnaSeq" and self.seqXMLversion == "0.1" + ): record.annotations["molecule_type"] = "RNA" - elif localname == "AAseq": + elif (localname == "AAseq" and float(self.seqXMLversion) >= 0.2) or ( + localname == "aaSeq" and self.seqXMLversion == "0.1" + ): record.annotations["molecule_type"] = "protein" else: raise RuntimeError( @@ -267,12 +294,15 @@ class ContentHandler(handler.ContentHandler): def startDBRefElement(self, attrs): """Parse a database cross reference.""" + TYPE = None source = None ID = None for key, value in attrs.items(): namespace, localname = key if namespace is None: - if localname == "source": + if localname == "type": + TYPE = value + elif localname == "source": source = value elif localname == "id": ID = value @@ -284,11 +314,14 @@ class ContentHandler(handler.ContentHandler): raise ValueError( f"Unexpected namespace '{namespace}' for DBRef attribute" ) - # The attributes "source" and "id" are required: + # The attributes "source" and "id" are required, and "type" in versions + # 0.2-0.3: if source is None: raise ValueError("Failed to find source for DBRef element") if ID is None: raise ValueError("Failed to find id for DBRef element") + if TYPE is None and 0.2 <= float(self.seqXMLversion) <= 0.3: + raise ValueError("Failed to find type for DBRef element") if self.data is not None: raise RuntimeError(f"Unexpected data found: '{self.data}'") self.data = "" @@ -305,7 +338,9 @@ class ContentHandler(handler.ContentHandler): raise RuntimeError(f"Unexpected namespace '{namespace}' for DBRef element") if qname is not None: raise RuntimeError(f"Unexpected qname '{qname}' for DBRef element") - if localname != "DBRef": + if (localname != "DBRef" and float(self.seqXMLversion) >= 0.2) or ( + localname != "alternativeID" and self.seqXMLversion == "0.1" + ): raise RuntimeError(f"Unexpected localname '{localname}' for DBRef element") if self.data: raise RuntimeError( diff --git a/Tests/SeqXML/corrupt_example3.xml b/Tests/SeqXML/corrupt_example3.xml new file mode 100644 index 000000000..a946079b2 --- /dev/null +++ b/Tests/SeqXML/corrupt_example3.xml @@ -0,0 +1,6 @@ + + + + entry has no sequence data + + diff --git a/Tests/SeqXML/version_01_example.xml b/Tests/SeqXML/version_01_example.xml new file mode 100644 index 000000000..c81dce41f --- /dev/null +++ b/Tests/SeqXML/version_01_example.xml @@ -0,0 +1,13 @@ + + + + AAAAAACGTAAA + + + + AAAUUUUCTGAA + + + GAKKVFIEDVSKEFVEEFIWPAVQSSALYE + + diff --git a/Tests/SeqXML/wrong_version1.xml b/Tests/SeqXML/wrong_version1.xml new file mode 100644 index 000000000..8486a25f1 --- /dev/null +++ b/Tests/SeqXML/wrong_version1.xml @@ -0,0 +1,14 @@ + + + + AAAAAACGTAAA + + + + + AAAUUUUCTGAA + + + GAKKVFIEDVSKEFVEEFIWPAVQSSALYE + + diff --git a/Tests/SeqXML/wrong_version2.xml b/Tests/SeqXML/wrong_version2.xml new file mode 100644 index 000000000..2f66a1239 --- /dev/null +++ b/Tests/SeqXML/wrong_version2.xml @@ -0,0 +1,8 @@ + + + + + AAAAAACGTAAA + + + diff --git a/Tests/SeqXML/wrong_version3.xml b/Tests/SeqXML/wrong_version3.xml new file mode 100644 index 000000000..9ef5f7c8c --- /dev/null +++ b/Tests/SeqXML/wrong_version3.xml @@ -0,0 +1,7 @@ + + + + + AAAAAACGTAAA + + diff --git a/Tests/test_SeqIO_SeqXML.py b/Tests/test_SeqIO_SeqXML.py index 5bceb8c70..545de790f 100644 --- a/Tests/test_SeqIO_SeqXML.py +++ b/Tests/test_SeqIO_SeqXML.py @@ -226,6 +226,29 @@ class TestReadCorruptFiles(unittest.TestCase): self.assertRaises(ValueError, f, "SeqXML/corrupt_example1.xml") self.assertRaises(ValueError, f, "SeqXML/corrupt_example2.xml") + self.assertRaises(ValueError, f, "SeqXML/corrupt_example3.xml") + + +class TestOldVersions(unittest.TestCase): + def test_version01(self): + """Test for version 0.1 specific features.""" + records = list(SeqIO.parse("SeqXML/version_01_example.xml", "seqxml")) + self.assertEqual(records[0].seq, "AAAAAACGTAAA") + self.assertEqual(records[0].dbxrefs[0], "db:1") + self.assertEqual(records[1].seq, "AAAUUUUCTGAA") + self.assertEqual(records[2].seq, "GAKKVFIEDVSKEFVEEFIWPAVQSSALYE") + + def test_wrong_version(self): + """Handling of wrong versions.""" + + def f(path): + records = SeqIO.parse(path, "seqxml") + for record in records: + pass + + self.assertRaises(ValueError, f, "SeqXML/wrong_version1.xml") + self.assertRaises(ValueError, f, "SeqXML/wrong_version2.xml") + self.assertRaises(ValueError, f, "SeqXML/wrong_version3.xml") if __name__ == "__main__":