mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
196 lines
7.2 KiB
Python
196 lines
7.2 KiB
Python
# Copyright 2006-2016 by Peter Cock. All rights reserved.
|
|
# Revisions copyright 2021 by Michiel de Hoon.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Bio.Align support for the "nexus" file format.
|
|
|
|
You are expected to use this module via the Bio.Align functions.
|
|
|
|
See also the Bio.Nexus module (which this code calls internally),
|
|
as this offers more than just accessing the alignment or its
|
|
sequences as SeqRecord objects.
|
|
"""
|
|
|
|
from io import StringIO
|
|
|
|
import numpy as np
|
|
|
|
from Bio.Align import Alignment
|
|
from Bio.Align import interfaces
|
|
from Bio.Nexus import Nexus
|
|
from Bio.Seq import Seq
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
|
|
class AlignmentWriter(interfaces.AlignmentWriter):
|
|
"""Nexus alignment writer.
|
|
|
|
Note that Nexus files are only expected to hold ONE alignment
|
|
matrix.
|
|
|
|
You are expected to call this class via Bio.Align.write().
|
|
"""
|
|
|
|
fmt = "Nexus"
|
|
|
|
def __init__(self, target, interleave=None):
|
|
"""Create an AlignmentWriter object.
|
|
|
|
Arguments:
|
|
- target - output stream or file name
|
|
- interleave - if None (default): interleave if columns > 1000
|
|
if True: use interleaved format
|
|
if False: do not use interleaved format
|
|
|
|
"""
|
|
super().__init__(target)
|
|
self.interleave = interleave
|
|
|
|
def write_file(self, stream, alignments):
|
|
"""Write a file with the alignments, and return the number of alignments.
|
|
|
|
alignments - A list or iterator returning Alignment objects
|
|
"""
|
|
count = super().write_file(stream, alignments)
|
|
if count != 1:
|
|
raise ValueError("Expected to write 1 alignment; wrote %d" % count)
|
|
return count
|
|
|
|
def format_alignment(self, alignment, interleave=None):
|
|
"""Return a string with a single alignment in the Nexus format.
|
|
|
|
Creates an empty Nexus object, adds the sequences
|
|
and then gets Nexus to prepare the output.
|
|
|
|
- alignment - An Alignment object
|
|
- interleave - if None (default): interleave if columns > 1000
|
|
if True: use interleaved format
|
|
if False: do not use interleaved format
|
|
"""
|
|
stream = StringIO()
|
|
self.write_alignment(alignment, stream, interleave)
|
|
stream.seek(0)
|
|
return stream.read()
|
|
|
|
def write_alignment(self, alignment, stream, interleave=None):
|
|
"""Write a single alignment to the output file.
|
|
|
|
- alignment - An Alignment object
|
|
- stream - output stream
|
|
- interleave - if None (default): interleave if columns > 1000
|
|
if True: use interleaved format
|
|
if False: do not use interleaved format
|
|
"""
|
|
nseqs, length = alignment.shape
|
|
if nseqs == 0:
|
|
raise ValueError("Must have at least one sequence")
|
|
if length == 0:
|
|
raise ValueError("Non-empty sequences are required")
|
|
|
|
rows, columns = alignment.shape
|
|
if rows == 0:
|
|
raise ValueError("Must have at least one sequence")
|
|
if columns == 0:
|
|
raise ValueError("Non-empty sequences are required")
|
|
datatype = self._classify_mol_type_for_nexus(alignment)
|
|
minimal_record = (
|
|
"begin data; dimensions ntax=0 nchar=0; format datatype=%s; end;" % datatype
|
|
)
|
|
n = Nexus.Nexus(minimal_record)
|
|
for record, aligned_sequence in zip(alignment.sequences, alignment):
|
|
# Sanity test sequences (should this be even stricter?)
|
|
if datatype == "dna" and "U" in record.seq:
|
|
raise ValueError(f"{record.id} contains U, but DNA alignment")
|
|
elif datatype == "rna" and "T" in record.seq:
|
|
raise ValueError(f"{record.id} contains T, but RNA alignment")
|
|
n.add_sequence(record.id, aligned_sequence)
|
|
|
|
# Note: MrBayes may choke on large alignments if not interleaved
|
|
if interleave is None:
|
|
interleave = columns > 1000
|
|
n.write_nexus_data(stream, interleave=interleave)
|
|
|
|
def write_alignments(self, stream, alignments):
|
|
"""Write alignments to the output file, and return the number of alignments.
|
|
|
|
alignments - A list or iterator returning Alignment objects
|
|
"""
|
|
count = 0
|
|
interleave = self.interleave
|
|
for alignment in alignments:
|
|
self.write_alignment(alignment, stream, interleave=interleave)
|
|
count += 1
|
|
return count
|
|
|
|
def _classify_mol_type_for_nexus(self, alignment):
|
|
"""Return 'protein', 'dna', or 'rna' based on records' molecule type (PRIVATE).
|
|
|
|
All the records must have a molecule_type annotation, and they must
|
|
agree.
|
|
|
|
Raises an exception if this is not possible.
|
|
"""
|
|
values = {
|
|
sequence.annotations.get("molecule_type", None)
|
|
for sequence in alignment.sequences
|
|
}
|
|
if all(_ and "DNA" in _ for _ in values):
|
|
return "dna" # could have been a mix of "DNA" and "gDNA"
|
|
elif all(_ and "RNA" in _ for _ in values):
|
|
return "rna" # could have been a mix of "RNA" and "mRNA"
|
|
elif all(_ and "protein" in _ for _ in values):
|
|
return "protein"
|
|
else:
|
|
raise ValueError("Need the molecule type to be defined")
|
|
|
|
|
|
class AlignmentIterator(interfaces.AlignmentIterator):
|
|
"""Nexus alignment iterator."""
|
|
|
|
fmt = "Nexus"
|
|
|
|
def _read_header(self, stream):
|
|
try:
|
|
line = next(stream)
|
|
except StopIteration:
|
|
raise ValueError("Empty file.") from None
|
|
|
|
if line.strip() != "#NEXUS":
|
|
raise ValueError("File does not start with NEXUS header.")
|
|
|
|
def _read_next_alignment(self, stream):
|
|
# NOTE - We only expect ONE alignment matrix per Nexus file.
|
|
n = Nexus.Nexus(stream)
|
|
if not n.matrix:
|
|
# No alignment found
|
|
return
|
|
|
|
# Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
|
|
# The original names and the modified names are kept in these two lists:
|
|
assert len(n.unaltered_taxlabels) == len(n.taxlabels)
|
|
|
|
# TODO - Can we extract any annotation too?
|
|
if n.datatype in ("dna", "nucleotide"):
|
|
annotations = {"molecule_type": "DNA"}
|
|
elif n.datatype == "rna":
|
|
annotations = {"molecule_type": "RNA"}
|
|
elif n.datatype == "protein":
|
|
annotations = {"molecule_type": "protein"}
|
|
else:
|
|
annotations = None
|
|
aligned_seqs = [bytes(n.matrix[name]) for name in n.taxlabels]
|
|
seqs, coordinates = Alignment.parse_printed_alignment(aligned_seqs)
|
|
records = [
|
|
SeqRecord(Seq(seq), id=name, annotations=annotations)
|
|
for seq, name in zip(seqs, n.unaltered_taxlabels)
|
|
]
|
|
# Remove columns consisting of gaps only
|
|
steps = np.diff(coordinates, 1).max(0)
|
|
indices = np.nonzero(steps == 0)[0] + 1
|
|
coordinates = np.delete(coordinates, indices, 1)
|
|
alignment = Alignment(records, coordinates)
|
|
return alignment
|