Add Mauve section to Bio.Align documentation (#4394)

* update

* update
This commit is contained in:
mdehoon
2023-08-01 11:39:17 +09:00
committed by GitHub
parent 3ab7e6be54
commit 2d78a78964
7 changed files with 415 additions and 191 deletions

View File

@ -3350,8 +3350,8 @@ class AlignmentsAbstractBaseClass(ABC):
class Alignments(AlignmentsAbstractBaseClass, list): # noqa: D101
def __init__(self): # noqa: D107
super().__init__()
def __init__(self, alignments=()): # noqa: D107
super().__init__(alignments)
self._index = -1
def __next__(self):

View File

@ -161,6 +161,9 @@ Douglas R. Cavener: ``Comparison of the consensus sequence flanking translationa
Timothy L. Bailey and Charles Elkan: ``Fitting a mixture model by expectation maximization to discover motifs in biopolymers'', \textit{Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology} 28--36. AAAI Press, Menlo Park, California (1994).
\bibitem{chapman2000}
Brad Chapman and Jeff Chang: ``Biopython: Python tools for computational biology''. \textit{ACM SIGBIO Newsletter} {\bf 20} (2): 15--19 (August 2000).
\bibitem{darling2004}
Aaron E. Darling, Bob Mau, Frederick R. Blattner, and Nicole T. Perna: ``Mauve: Multiple alignment of conserved genomic sequence with rearrangements.'' \textit{Genome Research} {\bf 14} (7): 1394--1403 (2004).
\url{https://doi.org/10.1101/gr.2289704}
\bibitem{dayhoff1978}
M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt: ``A Model of Evolutionary Change in Proteins.'' \textit{Atlas of Protein Sequence and Structure}, Volume 5, Supplement 3, 1978: 345--352. The National Biomedical Research Foundation, 1979.
\bibitem{dehoon2004}

View File

@ -770,24 +770,6 @@ The consensus sequence and secondary structure are associated with the sequence
'consensus sequence': 'KVKFKYKGEEKEVDISKIKKVWRVGKMVSFTYDD.NGKTGRGAVSEKDAPKELLsMLuK'}
\end{minted}
\subsection{EMBOSS}
\label{subsec:align_emboss}
\subsection{GCG Multiple Sequence Format (MSF)}
\label{subsec:align_msf}
\subsection{Exonerate}
\label{subsec:align_exonerate}
\subsection{Nexus}
\label{subsec:align_nexus}
\subsection{Mauve eXtended Multi-FastA (xmfa) format}
\label{subsec:align_mauve}
\subsection{Tabular output from BLAST or FASTA}
\label{subsec:align_tabular}
\subsection{PHYLIP output files}
\label{subsec:align_phylip}
@ -923,12 +905,320 @@ True
['CYS1_DICDI', 'ALEU_HORVU', 'CATH_HUMAN']
\end{minted}
\subsection{EMBOSS}
\label{subsec:align_emboss}
\subsection{GCG Multiple Sequence Format (MSF)}
\label{subsec:align_msf}
\subsection{Exonerate}
\label{subsec:align_exonerate}
\subsection{Nexus}
\label{subsec:align_nexus}
\subsection{Tabular output from BLAST or FASTA}
\label{subsec:align_tabular}
\subsection{HH-suite output files}
\label{subsec:align_hhr}
\subsection{A2M}
\label{subsec:align_a2m}
\subsection{Mauve eXtended Multi-FastA (xmfa) format}
\label{subsec:align_mauve}
Mauve~\cite{darling2004} is a software package for constructing multiple genome alignments. These alignments are stored in the eXtended Multi-FastA (xmfa) format.
Depending on how exactly \verb|progressiveMauve| (the aligner program in Mauve) was called, the xmfa format is slightly different.
If \verb|progressiveMauve| is called with a single sequence input file, as in
\begin{minted}{text}
progressiveMauve combined.fa --output=combined.xmfa ...
\end{minted}
where \verb|combined.fa| contains the genome sequences:
\begin{minted}{text}
>equCab1
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
>mm9
GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT
>canFam2
CAAGCCCTGCGCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTTC
\end{minted}
then the output file \verb|combined.xmfa| is as follows:
\begin{minted}{text}
#FormatVersion Mauve1
#Sequence1File combined.fa
#Sequence1Entry 1
#Sequence1Format FastA
#Sequence2File combined.fa
#Sequence2Entry 2
#Sequence2Format FastA
#Sequence3File combined.fa
#Sequence3Entry 3
#Sequence3Format FastA
#BackboneFile combined.xmfa.bbcols
> 1:2-49 - combined.fa
AAGCCCTCCTAGCACACACCCGGAGTGG-CCGGGCCGTACTTTCCTTTT
> 2:0-0 + combined.fa
-------------------------------------------------
> 3:2-48 + combined.fa
AAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTT
=
> 1:1-1 + combined.fa
G
=
> 1:50-50 + combined.fa
A
=
> 2:1-41 + combined.fa
GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT
=
> 3:1-1 + combined.fa
C
=
> 3:49-49 + combined.fa
C
=
\end{minted}
with numbers (1, 2, 3) referring to the input genome sequences for horse (\verb+equCab1+), mouse (\verb+mm9+), and dog (\verb+canFam2+), respectively.
This xmfa file consists of six alignment blocks, separated by \verb|=| characters. Use \verb|Align.parse| to extract these alignments:
%doctest ../Tests/Mauve lib:numpy
\begin{minted}{pycon}
>>> from Bio import Align
>>> alignments = Align.parse("combined.xmfa", "mauve")
\end{minted}
The file header data are stored in the \verb|metadata| attribute:
%cont-doctest
\begin{minted}{pycon}
>>> alignments.metadata # doctest: +NORMALIZE_WHITESPACE
{'FormatVersion': 'Mauve1',
'BackboneFile': 'combined.xmfa.bbcols',
'File': 'combined.fa'}
\end{minted}
The \verb|identifiers| attribute stores the sequence identifiers for the three sequences, which in this case are three numbers (stored as strings, counting from zero):
%cont-doctest
\begin{minted}{pycon}
>>> alignments.identifiers
['0', '1', '2']
\end{minted}
These identifiers are used in the individual alignments:
%cont-doctest
\begin{minted}{pycon}
>>> for alignment in alignments:
... print([record.id for record in alignment.sequences])
... print(alignment)
... print("******")
...
['0', '1', '2']
0 49 AAGCCCTCCTAGCACACACCCGGAGTGG-CCGGGCCGTACTTTCCTTTT 1
1 0 ------------------------------------------------- 0
2 1 AAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTT 48
<BLANKLINE>
******
['0']
0 0 G 1
<BLANKLINE>
******
['0']
0 49 A 50
<BLANKLINE>
******
['1']
1 0 GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT 41
<BLANKLINE>
******
['2']
2 0 C 1
<BLANKLINE>
******
['2']
2 48 C 49
<BLANKLINE>
******
\end{minted}
Note that only the first block is a real alignment; the other blocks contain only a single sequence. By including these blocks, the xmfa file contains the full sequences that were provided in the \verb|combined.fa| input file.
If \verb|progressiveMauve| is called with a separate input file for each genome, as in
\begin{minted}{text}
progressiveMauve equCab1.fa canFam2.fa mm9.fa --output=separate.xmfa ...
\end{minted}
where each Fasta file contains the genome sequence for one species only, then the output file \verb|separate.xmfa| is as follows:
\begin{minted}{text}
#FormatVersion Mauve1
#Sequence1File equCab1.fa
#Sequence1Format FastA
#Sequence2File canFam2.fa
#Sequence2Format FastA
#Sequence3File mm9.fa
#Sequence3Format FastA
#BackboneFile separate.xmfa.bbcols
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=
\end{minted}
The identifiers \verb+equCab1+ for horse, \verb+mm9+ for mouse, and \verb+canFam2+ for dog are now shown explicitly in the output file.
This xmfa file consists of two alignment blocks, separated by \verb|=| characters. Use \verb|Align.parse| to extract these alignments:
%doctest ../Tests/Mauve lib:numpy
\begin{minted}{pycon}
>>> from Bio import Align
>>> alignments = Align.parse("separate.xmfa", "mauve")
\end{minted}
The file header data now does not include the input file name:
%cont-doctest
\begin{minted}{pycon}
>>> alignments.metadata # doctest: +NORMALIZE_WHITESPACE
{'FormatVersion': 'Mauve1',
'BackboneFile': 'separate.xmfa.bbcols'}
\end{minted}
The \verb|identifiers| attribute stores the sequence identifiers for the three sequences:
%cont-doctest
\begin{minted}{pycon}
>>> alignments.identifiers
['equCab1.fa', 'canFam2.fa', 'mm9.fa']
\end{minted}
These identifiers are used in the individual alignments:
%cont-doctest
\begin{minted}{pycon}
>>> for alignment in alignments:
... print([record.id for record in alignment.sequences])
... print(alignment)
... print("******")
...
['equCab1.fa', 'canFam2.fa', 'mm9.fa']
equCab1.f 50 TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC 0
canFam2.f 0 CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC 49
mm9.fa 19 ---------------------------------GGATCTACTTTTCCTCTTC 0
<BLANKLINE>
******
['mm9.fa']
mm9.fa 19 CTGGCGTCCGGAGCTGGGACGT 41
<BLANKLINE>
******
\end{minted}
To print the alignments in Mauve format, use \verb|Align.write|:
%cont-doctest
\begin{minted}{pycon}
>>> from io import StringIO
>>> stream = StringIO()
>>> alignments = Align.parse("separate.xmfa", "mauve")
>>> Align.write(alignments, stream, "mauve")
2
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
#FormatVersion Mauve1
#Sequence1File equCab1.fa
#Sequence1Format FastA
#Sequence2File canFam2.fa
#Sequence2Format FastA
#Sequence3File mm9.fa
#Sequence3Format FastA
#BackboneFile separate.xmfa.bbcols
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=
<BLANKLINE>
\end{minted}
Here, the writer makes use of the information stored in \verb+alignments.metadata+ and \verb+alignments.identifiers+ to create this format.
If your \verb|alignments| object does not have these attributes, you can provide them as keyword arguments to \verb+Align.write+:
%cont-doctest
\begin{minted}{pycon}
>>> stream = StringIO()
>>> alignments = Align.parse("separate.xmfa", "mauve")
>>> metadata = alignments.metadata
>>> identifiers = alignments.identifiers
>>> alignments = list(alignments) # this drops the attributes
>>> alignments.metadata # doctest: +ELLIPSIS
Traceback (most recent call last):
...
AttributeError: 'list' object has no attribute 'metadata'
>>> alignments.identifiers # doctest: +ELLIPSIS
Traceback (most recent call last):
...
AttributeError: 'list' object has no attribute 'identifiers'
>>> Align.write(alignments, stream, "mauve", metadata=metadata, identifiers=identifiers)
2
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
#FormatVersion Mauve1
#Sequence1File equCab1.fa
#Sequence1Format FastA
#Sequence2File canFam2.fa
#Sequence2Format FastA
#Sequence3File mm9.fa
#Sequence3Format FastA
#BackboneFile separate.xmfa.bbcols
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=
<BLANKLINE>
\end{minted}
Python does not allow you to add these attributes to the \verb+alignments+ object directly, because in this example it was converted to a plain list.
However, you can construct an \verb|Alignments| object (which inherits from \verb+list+) and add the attributes to it:
%cont-doctest
\begin{minted}{pycon}
>>> alignments = Align.Alignments(alignments)
>>> alignments.metadata = metadata
>>> alignments.identifiers = identifiers
>>> stream = StringIO()
>>> Align.write(alignments, stream, "mauve", metadata=metadata, identifiers=identifiers)
2
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
#FormatVersion Mauve1
#Sequence1File equCab1.fa
#Sequence1Format FastA
#Sequence2File canFam2.fa
#Sequence2Format FastA
#Sequence3File mm9.fa
#Sequence3Format FastA
#BackboneFile separate.xmfa.bbcols
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=
<BLANKLINE>
\end{minted}
When printing a single alignment in \verb+Mauve+ format, use keyword arguments to provide the metadata and identifiers:
%cont-doctest
\begin{minted}{pycon}
>>> alignment = alignments[0]
>>> print(alignment.format("mauve", metadata=metadata, identifiers=identifiers))
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
<BLANKLINE>
\end{minted}
\subsection{Sequence Alignment/Map (SAM)}
\label{subseq:align_sam}

View File

@ -6,19 +6,13 @@
#Sequence3File mm9.fa
#Sequence3Format FastA
#BackboneFile separate.xmfa.bbcols
> 1:0-0 + equCab1.fa
------------------------
> 2:26-49 + canFam2.fa
GTCCCGGGCCCTGCTTTCCTTTTC
> 3:1-24 - mm9.fa
GCCAGGGATCTACTTTTCCTCTTC
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
> 1:1-50 + equCab1.fa
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
=
> 2:1-25 + canFam2.fa
CAAGCCCTGCGCGCTCAGCCGGAGT
=
> 3:25-41 + mm9.fa
GTCCGGAGCTGGGACGT
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=

View File

@ -1,2 +1,3 @@
seq0_leftend seq0_rightend seq1_leftend seq1_rightend seq2_leftend seq2_rightend
0 0 26 49 -1 -24
-19 -50 1 31 0 0
-1 -18 32 49 -1 -19

View File

@ -1 +1,2 @@
0 1 24 1 2
0 34 19 0 1 2
0 1 33 0 1

View File

@ -25,13 +25,16 @@ except ImportError:
class TestCombinedFile(unittest.TestCase):
def setUp(self):
filename = "combined.fa"
path = os.path.join("Mauve", filename)
records = SeqIO.parse(path, "fasta")
self.sequences = {
str(index): record.seq for index, record in enumerate(records)
}
# Generate the output file combined.xmfa by running
# progressiveMauve combined.fa --output=combined.xmfa
filename = "combined.fa"
path = os.path.join("Mauve", filename)
records = SeqIO.parse(path, "fasta")
sequences = {str(index): record.seq for index, record in enumerate(records)}
del filename
del path
del records
def test_parse(self):
path = os.path.join("Mauve", "combined.xmfa")
@ -432,14 +435,19 @@ numpy.array([['C']], dtype='U')
self.assertEqual(output.read(), data)
class TestDSeparateFiles(unittest.TestCase):
def setUp(self):
self.sequences = {}
for species in ("equCab1", "canFam2", "mm9"):
filename = f"{species}.fa"
path = os.path.join("Mauve", filename)
record = SeqIO.read(path, "fasta")
self.sequences[filename] = record.seq
class TestSeparateFiles(unittest.TestCase):
# Generate the output file separate.xmfa by running
# progressiveMauve --solid-seeds equCab1.fa canFam2.fa mm9.fa --output=separate.xmfa
sequences = {}
for species in ("equCab1", "canFam2", "mm9"):
filename = f"{species}.fa"
path = os.path.join("Mauve", filename)
record = SeqIO.read(path, "fasta")
sequences[filename] = record.seq
del filename
del path
del record
def test_parse(self):
path = os.path.join("Mauve", "separate.xmfa")
@ -456,20 +464,23 @@ class TestDSeparateFiles(unittest.TestCase):
self.assertEqual(len(alignment), 3)
self.assertEqual(len(alignment.sequences), 3)
self.assertEqual(alignment.sequences[0].id, "equCab1.fa")
self.assertEqual(alignment.sequences[0].seq, "")
self.assertEqual(
alignment.sequences[0].seq,
Seq("GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"),
)
start = alignment.coordinates[0, 0]
end = alignment.coordinates[0, -1]
self.assertEqual(start, 0)
self.assertEqual(start, 50)
self.assertEqual(end, 0)
self.assertEqual(alignment.sequences[1].id, "canFam2.fa")
self.assertEqual(
repr(alignment.sequences[1].seq),
"Seq({25: 'GTCCCGGGCCCTGCTTTCCTTTTC'}, length=49)",
alignment.sequences[1].seq,
Seq("CAAGCCCTGCGCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTTC"),
)
start = alignment.coordinates[1, 0]
end = alignment.coordinates[1, -1]
sequence = self.sequences[alignment.sequences[1].id]
self.assertEqual(start, 25)
self.assertEqual(start, 0)
self.assertEqual(end, 49)
self.assertEqual(alignment.sequences[1].seq[start:end], sequence[start:end])
self.assertEqual(alignment.sequences[2].id, "mm9.fa")
@ -478,83 +489,46 @@ class TestDSeparateFiles(unittest.TestCase):
start = len(sequence) - alignment.coordinates[2, 0]
end = len(sequence) - alignment.coordinates[2, -1]
self.assertEqual(start, 0)
self.assertEqual(end, 24)
self.assertEqual(end, 19)
sequence = self.sequences[alignment.sequences[2].id][start:end]
self.assertEqual(alignment.sequences[2].seq[start:end], sequence)
self.assertEqual(alignment[0], "------------------------")
self.assertEqual(alignment[1], "GTCCCGGGCCCTGCTTTCCTTTTC")
self.assertEqual(alignment[2], "GCCAGGGATCTACTTTTCCTCTTC")
self.assertEqual(
alignment[0], "TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC"
)
self.assertEqual(
alignment[1], "CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC"
)
self.assertEqual(
alignment[2], "---------------------------------GGATCTACTTTTCCTCTTC"
)
self.assertEqual(
str(alignment),
"""\
equCab1.f 0 ------------------------ 0
canFam2.f 25 GTCCCGGGCCCTGCTTTCCTTTTC 49
mm9.fa 24 GCCAGGGATCTACTTTTCCTCTTC 0
equCab1.f 50 TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC 0
canFam2.f 0 CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC 49
mm9.fa 19 ---------------------------------GGATCTACTTTTCCTCTTC 0
""",
)
self.assertTrue(
numpy.array_equal(
alignment.coordinates,
numpy.array([[0, 0], [25, 49], [24, 0]]),
)
)
self.assertEqual(
alignment.format("mauve", metadata, identifiers),
"""\
> 1:0-0 + equCab1.fa
------------------------
> 2:26-49 + canFam2.fa
GTCCCGGGCCCTGCTTTCCTTTTC
> 3:1-24 - mm9.fa
GCCAGGGATCTACTTTTCCTCTTC
=
""",
)
self.assertTrue(
numpy.array_equal(
numpy.array(alignment, "U"),
# fmt: off
# flake8: noqa
numpy.array([['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
['G', 'T', 'C', 'C', 'C', 'G', 'G', 'G', 'C', 'C', 'C', 'T', 'G',
'C', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
['G', 'C', 'C', 'A', 'G', 'G', 'G', 'A', 'T', 'C', 'T', 'A', 'C',
'T', 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'T', 'T', 'C']],
dtype='U')
numpy.array([[50, 40, 38, 19, 19, 18, 10, 10, 0],
[ 0, 10, 10, 29, 30, 31, 39, 39, 49],
[19, 19, 19, 19, 19, 19, 11, 10, 0]]),
# fmt: on
)
)
alignment = next(alignments)
saved_alignments.append(alignment)
self.assertEqual(len(alignment), 1)
self.assertEqual(len(alignment.sequences), 1)
self.assertEqual(alignment.sequences[0].id, "equCab1.fa")
self.assertEqual(
alignment.sequences[0].seq,
"GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA",
)
sequence = self.sequences[alignment.sequences[0].id]
start = alignment.coordinates[0, 0]
end = alignment.coordinates[0, -1]
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
self.assertEqual(
alignment[0], "GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"
)
self.assertTrue(
numpy.array_equal(alignment.coordinates, numpy.array([[0, 50]]))
)
self.assertEqual(
str(alignment),
"""\
equCab1.f 0 GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA 50
""",
)
self.assertEqual(
alignment.format("mauve", metadata, identifiers),
"""\
> 1:1-50 + equCab1.fa
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
> 1:1-50 - equCab1.fa
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
> 2:1-49 + canFam2.fa
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
> 3:1-19 - mm9.fa
---------------------------------GGATCTACTTTTCCTCTTC
=
""",
)
@ -563,49 +537,18 @@ GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
numpy.array(alignment, "U"),
# fmt: off
# flake8: noqa
numpy.array([['G', 'A', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'A', 'G', 'T', 'A',
'C', 'G', 'G', 'C', 'C', 'C', 'G', 'G', 'C', 'C', 'A', 'C', 'T',
'C', 'C', 'G', 'G', 'G', 'T', 'G', 'T', 'G', 'T', 'G', 'C', 'T',
'A', 'G', 'G', 'A', 'G', 'G', 'G', 'C', 'T', 'T', 'A']],
dtype='U')
# fmt: on
)
)
alignment = next(alignments)
saved_alignments.append(alignment)
self.assertEqual(len(alignment), 1)
self.assertEqual(len(alignment.sequences), 1)
self.assertEqual(alignment.sequences[0].id, "canFam2.fa")
self.assertEqual(alignment.sequences[0].seq, "CAAGCCCTGCGCGCTCAGCCGGAGT")
sequence = self.sequences[alignment.sequences[0].id]
start = alignment.coordinates[0, 0]
end = alignment.coordinates[0, -1]
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
self.assertEqual(alignment[0], "CAAGCCCTGCGCGCTCAGCCGGAGT")
self.assertTrue(
numpy.array_equal(alignment.coordinates, numpy.array([[0, 25]]))
)
self.assertEqual(
str(alignment),
"""\
canFam2.f 0 CAAGCCCTGCGCGCTCAGCCGGAGT 25
""",
)
self.assertEqual(
alignment.format("mauve", metadata, identifiers),
"""\
> 2:1-25 + canFam2.fa
CAAGCCCTGCGCGCTCAGCCGGAGT
=
""",
)
self.assertTrue(
numpy.array_equal(
numpy.array(alignment, "U"),
# fmt: off
# flake8: noqa
numpy.array([['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', 'G', 'C', 'G',
'C', 'T', 'C', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'G', 'T']],
numpy.array([['T', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'C', 'C', 'T', 'A', 'G',
'C', 'A', 'C', 'A', 'C', 'A', 'C', 'C', 'C', 'G', 'G', 'A', 'G',
'T', 'G', 'G', 'C', 'C', '-', 'G', 'G', 'G', 'C', 'C', 'G', 'T',
'A', 'C', '-', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', '-', '-', 'G',
'C', 'G', 'C', 'T', 'C', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'G',
'T', 'G', 'T', 'C', 'C', 'C', 'G', 'G', 'G', 'C', 'C', 'C', 'T',
'G', 'C', '-', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
'-', '-', '-', '-', '-', '-', '-', 'G', 'G', 'A', 'T', 'C', 'T',
'A', 'C', 'T', 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'T', 'T', 'C']],
dtype='U')
# fmt: on
)
@ -615,31 +558,29 @@ numpy.array([['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', 'G', 'C', 'G',
self.assertEqual(len(alignment), 1)
self.assertEqual(len(alignment.sequences), 1)
self.assertEqual(alignment.sequences[0].id, "mm9.fa")
sequence = self.sequences[alignment.sequences[0].id]
start = alignment.coordinates[0, 0]
end = alignment.coordinates[0, -1]
self.assertEqual(start, 24)
self.assertEqual(end, 41)
self.assertEqual(alignment.sequences[0].seq[start:end], "GTCCGGAGCTGGGACGT")
self.assertEqual(
repr(alignment.sequences[0].seq),
"Seq({19: 'CTGGCGTCCGGAGCTGGGACGT'}, length=41)",
)
sequence = self.sequences[alignment.sequences[0].id]
start = alignment.coordinates[0, 0]
end = alignment.coordinates[0, -1]
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
self.assertEqual(alignment[0], "GTCCGGAGCTGGGACGT")
self.assertEqual(alignment[0], "CTGGCGTCCGGAGCTGGGACGT")
self.assertTrue(
numpy.array_equal(alignment.coordinates, numpy.array([[19, 41]]))
)
self.assertEqual(
str(alignment),
"""\
mm9.fa 24 GTCCGGAGCTGGGACGT 41
mm9.fa 19 CTGGCGTCCGGAGCTGGGACGT 41
""",
)
self.assertTrue(
numpy.array_equal(alignment.coordinates, numpy.array([[24, 41]]))
)
self.assertEqual(
alignment.format("mauve", metadata, identifiers),
"""\
> 3:25-41 + mm9.fa
GTCCGGAGCTGGGACGT
> 3:20-41 + mm9.fa
CTGGCGTCCGGAGCTGGGACGT
=
""",
)
@ -648,15 +589,15 @@ GTCCGGAGCTGGGACGT
numpy.array(alignment, "U"),
# fmt: off
# flake8: noqa
numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
'A', 'C', 'G', 'T']], dtype='U')
numpy.array([['C', 'T', 'G', 'G', 'C', 'G', 'T', 'C', 'C', 'G', 'G',
'A', 'G', 'C', 'T', 'G', 'G', 'G', 'A', 'C', 'G', 'T']], dtype='U')
# fmt: on
)
)
self.assertRaises(StopIteration, next, alignments)
# As each nucleotide in each sequence is stored exactly once in an XMFA
# file, we can reconstitute the full sequences:
self.assertEqual(len(saved_alignments), 4)
self.assertEqual(len(saved_alignments), 2)
filenames = []
for alignment in saved_alignments:
for record in alignment.sequences:
@ -697,26 +638,20 @@ numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
for record in alignment.sequences:
filename = record.id
record.seq = sequences[filename]
self.assertEqual(alignment[0], "------------------------")
self.assertEqual(alignment[1], "GTCCCGGGCCCTGCTTTCCTTTTC")
self.assertEqual(alignment[2], "GCCAGGGATCTACTTTTCCTCTTC")
self.assertEqual(
alignment[0], "TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC"
)
self.assertEqual(
alignment[1], "CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC"
)
self.assertEqual(
alignment[2], "---------------------------------GGATCTACTTTTCCTCTTC"
)
alignment = saved_alignments[1]
for record in alignment.sequences:
filename = record.id
record.seq = sequences[filename]
self.assertEqual(
alignment[0], "GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"
)
alignment = saved_alignments[2]
for record in alignment.sequences:
filename = record.id
record.seq = sequences[filename]
self.assertEqual(alignment[0], "CAAGCCCTGCGCGCTCAGCCGGAGT")
alignment = saved_alignments[3]
for record in alignment.sequences:
filename = record.id
record.seq = sequences[filename]
self.assertEqual(alignment[0], "GTCCGGAGCTGGGACGT")
self.assertEqual(alignment[0], "CTGGCGTCCGGAGCTGGGACGT")
def test_write_read(self):
path = os.path.join("Mauve", "separate.xmfa")
@ -729,7 +664,7 @@ numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
alignments = Align.parse(stream, "mauve")
output = StringIO()
n = Align.write(alignments, output, "mauve")
self.assertEqual(n, 4)
self.assertEqual(n, 2)
output.seek(0)
self.assertEqual(output.read(), data)