Add a Blast XML2 writer (#4669)

* added BLAST XML2 writer

* reorder

* abstract

* finished
This commit is contained in:
mdehoon
2024-03-17 14:17:06 +09:00
committed by GitHub
parent 78ad0112c0
commit bfc7b2ca5d
4 changed files with 1221 additions and 286 deletions

View File

@ -952,12 +952,17 @@ def write(records, destination, fmt="XML"):
written.
- fmt - string describing the file format to write
(case-insensitive).
Currently, only "XML" is accepted.
Currently, only "XML" and "XML2" are accepted.
Returns the number of records written (as an integer).
"""
if fmt.upper() == "XML":
fmt = fmt.upper()
if fmt == "XML":
Writer = _writers.XMLWriter
elif fmt == "XML2":
Writer = _writers.XML2Writer
else:
raise ValueError(f"Unknown format {fmt}; expected 'XML' or 'XML2'")
try:
stream = open(destination, "wb")
except TypeError: # not a path, assume we received a stream

File diff suppressed because it is too large Load Diff

View File

@ -435,9 +435,11 @@ These parsers have now been removed from Biopython, as the BLAST output in
these formats kept changing, each time breaking the Biopython parsers.
Nowadays, Biopython can parse BLAST output in the XML format, the XML2 format,
and tabular format. This chapter describes the parser for BLAST output in the
XML format; parsing XML2 output is done in exactly the same way as parsing XML.
BLAST output in tabular format can be parsed as alignments (see the section
:ref:`subsec:align_tabular`).
XML and XML2 formats using the ``Bio.Blast.parse`` function. This function
automatically detects if the XML file is in the XML format or in the XML2
format.
BLAST output in tabular format can be parsed as alignments using the
``Bio.Align.parse`` function (see the section :ref:`subsec:align_tabular`).
You can get BLAST output in XML format in various ways. For the parser,
it doesn't matter how the output was generated, as long as it is in the
@ -1389,7 +1391,9 @@ Writing BLAST records
---------------------
Use the ``write`` function in ``Bio.Blast`` to save BLAST records as an XML
file:
file. By default, the (DTD-based) XML format is used; you can also save the
BLAST records in the (schema-based) XML2 format by using the ``fmt="XML2"``
argument to the ``write`` function.
.. code:: pycon
@ -1398,6 +1402,13 @@ file:
>>> records = Blast.parse(stream)
>>> Blast.write(records, "my_qblast_output.xml")
or
.. code:: pycon
>>> Blast.write(records, "my_qblast_output.xml", fmt="XML2")
In this example, we could have saved the data returned by ``Blast.qblast``
directly to an XML file (see section :ref:`subsec:saving-blast-results`).
However, by parsing the data returned by qblast into records, we can sort or

View File

@ -3317,6 +3317,18 @@ G26684.1 228
written_records = Blast.parse(stream)
self.check_xml_2900_blastn_001_records(written_records)
def test_xml_2900_blastn_001_v2_writer(self):
    """Writing BLASTN 2.9.0+ XML2 (xml_2900_blastn_001_v2.xml)."""
    # Round trip: parse the XML2 fixture, write the records back out in the
    # XML2 format to an in-memory stream, then parse that stream again and
    # run the shared record checks on the result.
    source = os.path.join("Blast", "xml_2900_blastn_001_v2.xml")
    buffer = io.BytesIO()
    with Blast.parse(source) as parsed:
        count = Blast.write(parsed, buffer, fmt="XML2")
        # Exactly one record is expected to have been written.
        self.assertEqual(count, 1)
        # Rewind so the parser reads the freshly written bytes from the start.
        buffer.seek(0)
        reparsed = Blast.parse(buffer)
        self.check_xml_2900_blastn_001_records(reparsed, xml2=True)
def test_megablast_legacy(self):
"""Parsing megablast 2.2.26 [Sep-21-2011] (megablast_legacy.xml)."""
filename = "megablast_legacy.xml"
@ -9480,6 +9492,18 @@ AI021773. 60 SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTE 108
written_records = Blast.parse(stream)
self.check_xml_2900_blastx_001_records(written_records, xml2=False)
def test_xml_2900_blastx_001_v2_writer(self):
    """Writing BLASTX 2.9.0+ XML2 (xml_2900_blastx_001_v2.xml)."""
    # Round trip: parse the XML2 fixture, write the records back out in the
    # XML2 format to an in-memory stream, then parse that stream again and
    # run the shared record checks on the result.
    source = os.path.join("Blast", "xml_2900_blastx_001_v2.xml")
    buffer = io.BytesIO()
    with Blast.parse(source) as parsed:
        count = Blast.write(parsed, buffer, fmt="XML2")
        # Exactly one record is expected to have been written.
        self.assertEqual(count, 1)
        # Rewind so the parser reads the freshly written bytes from the start.
        buffer.seek(0)
        reparsed = Blast.parse(buffer)
        self.check_xml_2900_blastx_001_records(reparsed, xml2=True)
class TestTBlastn(unittest.TestCase):
"""Test the Blast XML parser for tblastn output."""
@ -10524,6 +10548,18 @@ CAJ99216. 180 FLKQHLNQKMPLLYGGSVNTQNAKEILGIDSVDGLLIGSTSLELENFKTIISFL 234
written_records = Blast.parse(stream)
self.check_xml_2900_tblastn_001_records(written_records)
def test_xml_2900_tblastn_001_v2_writer(self):
    """Writing TBLASTN 2.9.0+ XML2 (xml_2900_tblastn_001_v2.xml)."""
    # Round trip: parse the XML2 fixture, write the records back out in the
    # XML2 format to an in-memory stream, then parse that stream again and
    # run the shared record checks on the result.
    source = os.path.join("Blast", "xml_2900_tblastn_001_v2.xml")
    buffer = io.BytesIO()
    with Blast.parse(source) as parsed:
        count = Blast.write(parsed, buffer, fmt="XML2")
        # Exactly one record is expected to have been written.
        self.assertEqual(count, 1)
        # Rewind so the parser reads the freshly written bytes from the start.
        buffer.seek(0)
        reparsed = Blast.parse(buffer)
        self.check_xml_2900_tblastn_001_records(reparsed, xml2=True)
class TestTBlastx(unittest.TestCase):
"""Test the Blast XML parser for tblastx output."""
@ -11266,9 +11302,9 @@ class TestRPSBlast(unittest.TestCase):
path = os.path.join("Blast", filename)
with open(path, "rb") as stream:
records = Blast.parse(stream)
self.check_xml_2900_rpsblast_001(records)
self.check_xml_2900_rpsblast_001_records(records)
with Blast.parse(path) as records:
self.check_xml_2900_rpsblast_001(records)
self.check_xml_2900_rpsblast_001_records(records)
with open(path, "rb") as stream:
record = Blast.read(stream)
self.check_xml_2900_rpsblast_001_record(record)
@ -11281,9 +11317,9 @@ class TestRPSBlast(unittest.TestCase):
path = os.path.join("Blast", filename)
with open(path, "rb") as stream:
records = Blast.parse(stream)
self.check_xml_2900_rpsblast_001(records, xml2=True)
self.check_xml_2900_rpsblast_001_records(records, xml2=True)
with Blast.parse(path) as records:
self.check_xml_2900_rpsblast_001(records, xml2=True)
self.check_xml_2900_rpsblast_001_records(records, xml2=True)
with open(path, "rb") as stream:
record = Blast.read(stream)
self.check_xml_2900_rpsblast_001_record(record, xml2=True)
@ -11300,9 +11336,21 @@ class TestRPSBlast(unittest.TestCase):
self.assertEqual(n, 1)
stream.seek(0)
written_records = Blast.parse(stream)
self.check_xml_2900_rpsblast_001(written_records)
self.check_xml_2900_rpsblast_001_records(written_records)
def check_xml_2900_rpsblast_001(self, records, xml2=False):
def test_xml_2900_rpsblast_001_v2_writer(self):
    """Writing rpsblast 2.9.0+ XML2 (xml_2900_rpsblast_001_v2.xml)."""
    # Round trip: parse the XML2 fixture, write the records back out in the
    # XML2 format to an in-memory stream, then parse that stream again and
    # run the shared record checks on the result.
    source = os.path.join("Blast", "xml_2900_rpsblast_001_v2.xml")
    buffer = io.BytesIO()
    with Blast.parse(source) as parsed:
        count = Blast.write(parsed, buffer, fmt="XML2")
        # Exactly one record is expected to have been written.
        self.assertEqual(count, 1)
        # Rewind so the parser reads the freshly written bytes from the start.
        buffer.seek(0)
        reparsed = Blast.parse(buffer)
        self.check_xml_2900_rpsblast_001_records(reparsed, xml2=True)
def check_xml_2900_rpsblast_001_records(self, records, xml2=False):
self.assertEqual(records.program, "rpsblast")
self.assertEqual(records.version, "RPSBLAST 2.9.0+")
self.assertEqual(