mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 13:43:47 +08:00
@ -3350,8 +3350,8 @@ class AlignmentsAbstractBaseClass(ABC):
|
||||
|
||||
|
||||
class Alignments(AlignmentsAbstractBaseClass, list): # noqa: D101
|
||||
def __init__(self): # noqa: D107
|
||||
super().__init__()
|
||||
def __init__(self, alignments=()): # noqa: D107
|
||||
super().__init__(alignments)
|
||||
self._index = -1
|
||||
|
||||
def __next__(self):
|
||||
|
@ -161,6 +161,9 @@ Douglas R. Cavener: ``Comparison of the consensus sequence flanking translationa
|
||||
Timothy L. Bailey and Charles Elkan: ``Fitting a mixture model by expectation maximization to discover motifs in biopolymers'', \textit{Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology} 28--36. AAAI Press, Menlo Park, California (1994).
|
||||
\bibitem{chapman2000}
|
||||
Brad Chapman and Jeff Chang: ``Biopython: Python tools for computational biology''. \textit{ACM SIGBIO Newsletter} {\bf 20} (2): 15--19 (August 2000).
|
||||
\bibitem{darling2004}
|
||||
Aaron E. Darling, Bob Mau, Frederick R. Blattner, Nicole T. Perna: ``Mauve: Multiple alignment of conserved genomic sequence with rearrangements.'' \textit{Genome Research} {\bf 14} (7): 1394--1403 (2004).
|
||||
\url{https://doi.org/10.1101/gr.2289704}
|
||||
\bibitem{dayhoff1978}
|
||||
M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt: ``A Model of Evolutionary Change in Proteins.'' \textit{Atlas of Protein Sequence and Structure}, Volume 5, Supplement 3, 1978: 345--352. The National Biomedical Research Foundation, 1979.
|
||||
\bibitem{dehoon2004}
|
||||
|
@ -770,24 +770,6 @@ The consensus sequence and secondary structure are associated with the sequence
|
||||
'consensus sequence': 'KVKFKYKGEEKEVDISKIKKVWRVGKMVSFTYDD.NGKTGRGAVSEKDAPKELLsMLuK'}
|
||||
\end{minted}
|
||||
|
||||
\subsection{EMBOSS}
|
||||
\label{subsec:align_emboss}
|
||||
|
||||
\subsection{GCG Multiple Sequence Format (MSF)}
|
||||
\label{subsec:align_msf}
|
||||
|
||||
\subsection{Exonerate}
|
||||
\label{subsec:align_exonerate}
|
||||
|
||||
\subsection{Nexus}
|
||||
\label{subsec:align_nexus}
|
||||
|
||||
\subsection{Mauve eXtended Multi-FastA (xmfa) format}
|
||||
\label{subsec:align_mauve}
|
||||
|
||||
\subsection{Tabular output from BLAST or FASTA}
|
||||
\label{subsec:align_tabular}
|
||||
|
||||
\subsection{PHYLIP output files}
|
||||
\label{subsec:align_phylip}
|
||||
|
||||
@ -923,12 +905,320 @@ True
|
||||
['CYS1_DICDI', 'ALEU_HORVU', 'CATH_HUMAN']
|
||||
\end{minted}
|
||||
|
||||
\subsection{EMBOSS}
|
||||
\label{subsec:align_emboss}
|
||||
|
||||
\subsection{GCG Multiple Sequence Format (MSF)}
|
||||
\label{subsec:align_msf}
|
||||
|
||||
\subsection{Exonerate}
|
||||
\label{subsec:align_exonerate}
|
||||
|
||||
\subsection{Nexus}
|
||||
\label{subsec:align_nexus}
|
||||
|
||||
\subsection{Tabular output from BLAST or FASTA}
|
||||
\label{subsec:align_tabular}
|
||||
|
||||
\subsection{HH-suite output files}
|
||||
\label{subsec:align_hhr}
|
||||
|
||||
\subsection{A2M}
|
||||
\label{subsec:align_a2m}
|
||||
|
||||
\subsection{Mauve eXtended Multi-FastA (xmfa) format}
|
||||
\label{subsec:align_mauve}
|
||||
|
||||
Mauve~\cite{darling2004} is a software package for constructing multiple genome alignments. These alignments are stored in the eXtended Multi-FastA (xmfa) format.
|
||||
Depending on how exactly \verb|progressiveMauve| (the aligner program in Mauve) was called, the xmfa format is slightly different.
|
||||
|
||||
If \verb|progressiveMauve| is called with a single sequence input file, as in
|
||||
\begin{minted}{text}
|
||||
progressiveMauve combined.fa --output=combined.xmfa ...
|
||||
\end{minted}
|
||||
where \verb|combined.fa| contains the genome sequences:
|
||||
\begin{minted}{text}
|
||||
>equCab1
|
||||
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
|
||||
>mm9
|
||||
GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT
|
||||
>canFam2
|
||||
CAAGCCCTGCGCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTTC
|
||||
\end{minted}
|
||||
then the output file \verb|combined.xmfa| is as follows:
|
||||
\begin{minted}{text}
|
||||
#FormatVersion Mauve1
|
||||
#Sequence1File combined.fa
|
||||
#Sequence1Entry 1
|
||||
#Sequence1Format FastA
|
||||
#Sequence2File combined.fa
|
||||
#Sequence2Entry 2
|
||||
#Sequence2Format FastA
|
||||
#Sequence3File combined.fa
|
||||
#Sequence3Entry 3
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile combined.xmfa.bbcols
|
||||
> 1:2-49 - combined.fa
|
||||
AAGCCCTCCTAGCACACACCCGGAGTGG-CCGGGCCGTACTTTCCTTTT
|
||||
> 2:0-0 + combined.fa
|
||||
-------------------------------------------------
|
||||
> 3:2-48 + combined.fa
|
||||
AAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTT
|
||||
=
|
||||
> 1:1-1 + combined.fa
|
||||
G
|
||||
=
|
||||
> 1:50-50 + combined.fa
|
||||
A
|
||||
=
|
||||
> 2:1-41 + combined.fa
|
||||
GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
> 3:1-1 + combined.fa
|
||||
C
|
||||
=
|
||||
> 3:49-49 + combined.fa
|
||||
C
|
||||
=
|
||||
\end{minted}
|
||||
with numbers (1, 2, 3) referring to the input genome sequences for horse (\verb+equCab1+), mouse (\verb+mm9+), and dog (\verb+canFam2+), respectively.
|
||||
This xmfa file consists of six alignment blocks, separated by \verb|=| characters. Use \verb|Align.parse| to extract these alignments:
|
||||
%doctest ../Tests/Mauve lib:numpy
|
||||
\begin{minted}{pycon}
|
||||
>>> from Bio import Align
|
||||
>>> alignments = Align.parse("combined.xmfa", "mauve")
|
||||
\end{minted}
|
||||
The file header data are stored in the \verb|metadata| attribute:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignments.metadata # doctest: +NORMALIZE_WHITESPACE
|
||||
{'FormatVersion': 'Mauve1',
|
||||
'BackboneFile': 'combined.xmfa.bbcols',
|
||||
'File': 'combined.fa'}
|
||||
\end{minted}
|
||||
The \verb|identifiers| attribute stores the sequence identifiers for the three sequences, which in this case are the three sequence numbers, counted from zero:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignments.identifiers
|
||||
['0', '1', '2']
|
||||
\end{minted}
|
||||
These identifiers are used in the individual alignments:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> for alignment in alignments:
|
||||
... print([record.id for record in alignment.sequences])
|
||||
... print(alignment)
|
||||
... print("******")
|
||||
...
|
||||
['0', '1', '2']
|
||||
0 49 AAGCCCTCCTAGCACACACCCGGAGTGG-CCGGGCCGTACTTTCCTTTT 1
|
||||
1 0 ------------------------------------------------- 0
|
||||
2 1 AAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTT 48
|
||||
<BLANKLINE>
|
||||
******
|
||||
['0']
|
||||
0 0 G 1
|
||||
<BLANKLINE>
|
||||
******
|
||||
['0']
|
||||
0 49 A 50
|
||||
<BLANKLINE>
|
||||
******
|
||||
['1']
|
||||
1 0 GAAGAGGAAAAGTAGATCCCTGGCGTCCGGAGCTGGGACGT 41
|
||||
<BLANKLINE>
|
||||
******
|
||||
['2']
|
||||
2 0 C 1
|
||||
<BLANKLINE>
|
||||
******
|
||||
['2']
|
||||
2 48 C 49
|
||||
<BLANKLINE>
|
||||
******
|
||||
\end{minted}
|
||||
Note that only the first block is a real alignment; the other blocks contain only a single sequence. By including these blocks, the xmfa file contains the full sequences that were provided in the \verb|combined.fa| input file.
|
||||
|
||||
If \verb|progressiveMauve| is called with a separate input file for each genome, as in
|
||||
\begin{minted}{text}
|
||||
progressiveMauve equCab1.fa canFam2.fa mm9.fa --output=separate.xmfa ...
|
||||
\end{minted}
|
||||
where each Fasta file contains the genome sequence for one species only, then the output file \verb|separate.xmfa| is as follows:
|
||||
\begin{minted}{text}
|
||||
#FormatVersion Mauve1
|
||||
#Sequence1File equCab1.fa
|
||||
#Sequence1Format FastA
|
||||
#Sequence2File canFam2.fa
|
||||
#Sequence2Format FastA
|
||||
#Sequence3File mm9.fa
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile separate.xmfa.bbcols
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
\end{minted}
|
||||
The identifiers \verb+equCab1+ for horse, \verb+mm9+ for mouse, and \verb+canFam2+ for dog are now shown explicitly in the output file.
|
||||
This xmfa file consists of two alignment blocks, separated by \verb|=| characters. Use \verb|Align.parse| to extract these alignments:
|
||||
%doctest ../Tests/Mauve lib:numpy
|
||||
\begin{minted}{pycon}
|
||||
>>> from Bio import Align
|
||||
>>> alignments = Align.parse("separate.xmfa", "mauve")
|
||||
\end{minted}
|
||||
The file header data now do not include the input file name:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignments.metadata # doctest: +NORMALIZE_WHITESPACE
|
||||
{'FormatVersion': 'Mauve1',
|
||||
'BackboneFile': 'separate.xmfa.bbcols'}
|
||||
\end{minted}
|
||||
The \verb|identifiers| attribute stores the sequence identifiers for the three sequences:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignments.identifiers
|
||||
['equCab1.fa', 'canFam2.fa', 'mm9.fa']
|
||||
\end{minted}
|
||||
These identifiers are used in the individual alignments:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> for alignment in alignments:
|
||||
... print([record.id for record in alignment.sequences])
|
||||
... print(alignment)
|
||||
... print("******")
|
||||
...
|
||||
['equCab1.fa', 'canFam2.fa', 'mm9.fa']
|
||||
equCab1.f 50 TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC 0
|
||||
canFam2.f 0 CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC 49
|
||||
mm9.fa 19 ---------------------------------GGATCTACTTTTCCTCTTC 0
|
||||
<BLANKLINE>
|
||||
******
|
||||
['mm9.fa']
|
||||
mm9.fa 19 CTGGCGTCCGGAGCTGGGACGT 41
|
||||
<BLANKLINE>
|
||||
******
|
||||
\end{minted}
|
||||
|
||||
To print the alignments in Mauve format, use \verb|Align.write|:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> from io import StringIO
|
||||
>>> stream = StringIO()
|
||||
>>> alignments = Align.parse("separate.xmfa", "mauve")
|
||||
>>> Align.write(alignments, stream, "mauve")
|
||||
2
|
||||
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
|
||||
#FormatVersion Mauve1
|
||||
#Sequence1File equCab1.fa
|
||||
#Sequence1Format FastA
|
||||
#Sequence2File canFam2.fa
|
||||
#Sequence2Format FastA
|
||||
#Sequence3File mm9.fa
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile separate.xmfa.bbcols
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
<BLANKLINE>
|
||||
\end{minted}
|
||||
Here, the writer makes use of the information stored in \verb+alignments.metadata+ and \verb+alignments.identifiers+ to create this format.
|
||||
If your \verb|alignments| object does not have these attributes, you can provide them as keyword arguments to \verb+Align.write+:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> stream = StringIO()
|
||||
>>> alignments = Align.parse("separate.xmfa", "mauve")
|
||||
>>> metadata = alignments.metadata
|
||||
>>> identifiers = alignments.identifiers
|
||||
>>> alignments = list(alignments) # this drops the attributes
|
||||
>>> alignments.metadata # doctest: +ELLIPSIS
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AttributeError: 'list' object has no attribute 'metadata'
|
||||
>>> alignments.identifiers # doctest: +ELLIPSIS
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AttributeError: 'list' object has no attribute 'identifiers'
|
||||
>>> Align.write(alignments, stream, "mauve", metadata=metadata, identifiers=identifiers)
|
||||
2
|
||||
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
|
||||
#FormatVersion Mauve1
|
||||
#Sequence1File equCab1.fa
|
||||
#Sequence1Format FastA
|
||||
#Sequence2File canFam2.fa
|
||||
#Sequence2Format FastA
|
||||
#Sequence3File mm9.fa
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile separate.xmfa.bbcols
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
<BLANKLINE>
|
||||
\end{minted}
|
||||
Python does not allow you to add these attributes to the \verb+alignments+ object directly, as in this example it was converted to a plain list.
|
||||
However, you can construct an \verb|Alignments| object (which inherits from \verb+list+) and add the attributes to it:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignments = Align.Alignments(alignments)
|
||||
>>> alignments.metadata = metadata
|
||||
>>> alignments.identifiers = identifiers
|
||||
>>> stream = StringIO()
|
||||
>>> Align.write(alignments, stream, "mauve", metadata=metadata, identifiers=identifiers)
|
||||
2
|
||||
>>> print(stream.getvalue()) # doctest: +NORMALIZE_WHITESPACE
|
||||
#FormatVersion Mauve1
|
||||
#Sequence1File equCab1.fa
|
||||
#Sequence1Format FastA
|
||||
#Sequence2File canFam2.fa
|
||||
#Sequence2Format FastA
|
||||
#Sequence3File mm9.fa
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile separate.xmfa.bbcols
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
<BLANKLINE>
|
||||
\end{minted}
|
||||
When printing a single alignment in \verb+Mauve+ format, use keyword arguments to provide the metadata and identifiers:
|
||||
%cont-doctest
|
||||
\begin{minted}{pycon}
|
||||
>>> alignment = alignments[0]
|
||||
>>> print(alignment.format("mauve", metadata=metadata, identifiers=identifiers))
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
<BLANKLINE>
|
||||
\end{minted}
|
||||
|
||||
\subsection{Sequence Alignment/Map (SAM)}
|
||||
\label{subseq:align_sam}
|
||||
|
||||
|
@ -6,19 +6,13 @@
|
||||
#Sequence3File mm9.fa
|
||||
#Sequence3Format FastA
|
||||
#BackboneFile separate.xmfa.bbcols
|
||||
> 1:0-0 + equCab1.fa
|
||||
------------------------
|
||||
> 2:26-49 + canFam2.fa
|
||||
GTCCCGGGCCCTGCTTTCCTTTTC
|
||||
> 3:1-24 - mm9.fa
|
||||
GCCAGGGATCTACTTTTCCTCTTC
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
> 1:1-50 + equCab1.fa
|
||||
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
|
||||
=
|
||||
> 2:1-25 + canFam2.fa
|
||||
CAAGCCCTGCGCGCTCAGCCGGAGT
|
||||
=
|
||||
> 3:25-41 + mm9.fa
|
||||
GTCCGGAGCTGGGACGT
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
|
@ -1,2 +1,3 @@
|
||||
seq0_leftend seq0_rightend seq1_leftend seq1_rightend seq2_leftend seq2_rightend
|
||||
0 0 26 49 -1 -24
|
||||
-19 -50 1 31 0 0
|
||||
-1 -18 32 49 -1 -19
|
||||
|
@ -1 +1,2 @@
|
||||
0 1 24 1 2
|
||||
0 34 19 0 1 2
|
||||
0 1 33 0 1
|
||||
|
@ -25,13 +25,16 @@ except ImportError:
|
||||
|
||||
|
||||
class TestCombinedFile(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# Generate the output file combined.xmfa by running
|
||||
# progressiveMauve combined.fa --output=combined.xmfa
|
||||
|
||||
filename = "combined.fa"
|
||||
path = os.path.join("Mauve", filename)
|
||||
records = SeqIO.parse(path, "fasta")
|
||||
self.sequences = {
|
||||
str(index): record.seq for index, record in enumerate(records)
|
||||
}
|
||||
sequences = {str(index): record.seq for index, record in enumerate(records)}
|
||||
del filename
|
||||
del path
|
||||
del records
|
||||
|
||||
def test_parse(self):
|
||||
path = os.path.join("Mauve", "combined.xmfa")
|
||||
@ -432,14 +435,19 @@ numpy.array([['C']], dtype='U')
|
||||
self.assertEqual(output.read(), data)
|
||||
|
||||
|
||||
class TestDSeparateFiles(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.sequences = {}
|
||||
class TestSeparateFiles(unittest.TestCase):
|
||||
# Generate the output file separate.xmfa by running
|
||||
# progressiveMauve --solid-seeds equCab1.fa canFam2.fa mm9.fa --output=separate.xmfa
|
||||
|
||||
sequences = {}
|
||||
for species in ("equCab1", "canFam2", "mm9"):
|
||||
filename = f"{species}.fa"
|
||||
path = os.path.join("Mauve", filename)
|
||||
record = SeqIO.read(path, "fasta")
|
||||
self.sequences[filename] = record.seq
|
||||
sequences[filename] = record.seq
|
||||
del filename
|
||||
del path
|
||||
del record
|
||||
|
||||
def test_parse(self):
|
||||
path = os.path.join("Mauve", "separate.xmfa")
|
||||
@ -456,20 +464,23 @@ class TestDSeparateFiles(unittest.TestCase):
|
||||
self.assertEqual(len(alignment), 3)
|
||||
self.assertEqual(len(alignment.sequences), 3)
|
||||
self.assertEqual(alignment.sequences[0].id, "equCab1.fa")
|
||||
self.assertEqual(alignment.sequences[0].seq, "")
|
||||
self.assertEqual(
|
||||
alignment.sequences[0].seq,
|
||||
Seq("GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"),
|
||||
)
|
||||
start = alignment.coordinates[0, 0]
|
||||
end = alignment.coordinates[0, -1]
|
||||
self.assertEqual(start, 0)
|
||||
self.assertEqual(start, 50)
|
||||
self.assertEqual(end, 0)
|
||||
self.assertEqual(alignment.sequences[1].id, "canFam2.fa")
|
||||
self.assertEqual(
|
||||
repr(alignment.sequences[1].seq),
|
||||
"Seq({25: 'GTCCCGGGCCCTGCTTTCCTTTTC'}, length=49)",
|
||||
alignment.sequences[1].seq,
|
||||
Seq("CAAGCCCTGCGCGCTCAGCCGGAGTGTCCCGGGCCCTGCTTTCCTTTTC"),
|
||||
)
|
||||
start = alignment.coordinates[1, 0]
|
||||
end = alignment.coordinates[1, -1]
|
||||
sequence = self.sequences[alignment.sequences[1].id]
|
||||
self.assertEqual(start, 25)
|
||||
self.assertEqual(start, 0)
|
||||
self.assertEqual(end, 49)
|
||||
self.assertEqual(alignment.sequences[1].seq[start:end], sequence[start:end])
|
||||
self.assertEqual(alignment.sequences[2].id, "mm9.fa")
|
||||
@ -478,83 +489,46 @@ class TestDSeparateFiles(unittest.TestCase):
|
||||
start = len(sequence) - alignment.coordinates[2, 0]
|
||||
end = len(sequence) - alignment.coordinates[2, -1]
|
||||
self.assertEqual(start, 0)
|
||||
self.assertEqual(end, 24)
|
||||
self.assertEqual(end, 19)
|
||||
sequence = self.sequences[alignment.sequences[2].id][start:end]
|
||||
self.assertEqual(alignment.sequences[2].seq[start:end], sequence)
|
||||
self.assertEqual(alignment[0], "------------------------")
|
||||
self.assertEqual(alignment[1], "GTCCCGGGCCCTGCTTTCCTTTTC")
|
||||
self.assertEqual(alignment[2], "GCCAGGGATCTACTTTTCCTCTTC")
|
||||
self.assertEqual(
|
||||
alignment[0], "TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC"
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment[1], "CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC"
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment[2], "---------------------------------GGATCTACTTTTCCTCTTC"
|
||||
)
|
||||
self.assertEqual(
|
||||
str(alignment),
|
||||
"""\
|
||||
equCab1.f 0 ------------------------ 0
|
||||
canFam2.f 25 GTCCCGGGCCCTGCTTTCCTTTTC 49
|
||||
mm9.fa 24 GCCAGGGATCTACTTTTCCTCTTC 0
|
||||
equCab1.f 50 TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC 0
|
||||
canFam2.f 0 CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC 49
|
||||
mm9.fa 19 ---------------------------------GGATCTACTTTTCCTCTTC 0
|
||||
""",
|
||||
)
|
||||
self.assertTrue(
|
||||
numpy.array_equal(
|
||||
alignment.coordinates,
|
||||
numpy.array([[0, 0], [25, 49], [24, 0]]),
|
||||
)
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment.format("mauve", metadata, identifiers),
|
||||
"""\
|
||||
> 1:0-0 + equCab1.fa
|
||||
------------------------
|
||||
> 2:26-49 + canFam2.fa
|
||||
GTCCCGGGCCCTGCTTTCCTTTTC
|
||||
> 3:1-24 - mm9.fa
|
||||
GCCAGGGATCTACTTTTCCTCTTC
|
||||
=
|
||||
""",
|
||||
)
|
||||
self.assertTrue(
|
||||
numpy.array_equal(
|
||||
numpy.array(alignment, "U"),
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
numpy.array([['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
|
||||
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
|
||||
['G', 'T', 'C', 'C', 'C', 'G', 'G', 'G', 'C', 'C', 'C', 'T', 'G',
|
||||
'C', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
|
||||
['G', 'C', 'C', 'A', 'G', 'G', 'G', 'A', 'T', 'C', 'T', 'A', 'C',
|
||||
'T', 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'T', 'T', 'C']],
|
||||
dtype='U')
|
||||
numpy.array([[50, 40, 38, 19, 19, 18, 10, 10, 0],
|
||||
[ 0, 10, 10, 29, 30, 31, 39, 39, 49],
|
||||
[19, 19, 19, 19, 19, 19, 11, 10, 0]]),
|
||||
# fmt: on
|
||||
)
|
||||
)
|
||||
alignment = next(alignments)
|
||||
saved_alignments.append(alignment)
|
||||
self.assertEqual(len(alignment), 1)
|
||||
self.assertEqual(len(alignment.sequences), 1)
|
||||
self.assertEqual(alignment.sequences[0].id, "equCab1.fa")
|
||||
self.assertEqual(
|
||||
alignment.sequences[0].seq,
|
||||
"GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA",
|
||||
)
|
||||
sequence = self.sequences[alignment.sequences[0].id]
|
||||
start = alignment.coordinates[0, 0]
|
||||
end = alignment.coordinates[0, -1]
|
||||
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
|
||||
self.assertEqual(
|
||||
alignment[0], "GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"
|
||||
)
|
||||
self.assertTrue(
|
||||
numpy.array_equal(alignment.coordinates, numpy.array([[0, 50]]))
|
||||
)
|
||||
self.assertEqual(
|
||||
str(alignment),
|
||||
"""\
|
||||
equCab1.f 0 GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA 50
|
||||
""",
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment.format("mauve", metadata, identifiers),
|
||||
"""\
|
||||
> 1:1-50 + equCab1.fa
|
||||
GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
|
||||
> 1:1-50 - equCab1.fa
|
||||
TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC
|
||||
> 2:1-49 + canFam2.fa
|
||||
CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC
|
||||
> 3:1-19 - mm9.fa
|
||||
---------------------------------GGATCTACTTTTCCTCTTC
|
||||
=
|
||||
""",
|
||||
)
|
||||
@ -563,49 +537,18 @@ GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA
|
||||
numpy.array(alignment, "U"),
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
numpy.array([['G', 'A', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'A', 'G', 'T', 'A',
|
||||
'C', 'G', 'G', 'C', 'C', 'C', 'G', 'G', 'C', 'C', 'A', 'C', 'T',
|
||||
'C', 'C', 'G', 'G', 'G', 'T', 'G', 'T', 'G', 'T', 'G', 'C', 'T',
|
||||
'A', 'G', 'G', 'A', 'G', 'G', 'G', 'C', 'T', 'T', 'A']],
|
||||
dtype='U')
|
||||
# fmt: on
|
||||
)
|
||||
)
|
||||
alignment = next(alignments)
|
||||
saved_alignments.append(alignment)
|
||||
self.assertEqual(len(alignment), 1)
|
||||
self.assertEqual(len(alignment.sequences), 1)
|
||||
self.assertEqual(alignment.sequences[0].id, "canFam2.fa")
|
||||
self.assertEqual(alignment.sequences[0].seq, "CAAGCCCTGCGCGCTCAGCCGGAGT")
|
||||
sequence = self.sequences[alignment.sequences[0].id]
|
||||
start = alignment.coordinates[0, 0]
|
||||
end = alignment.coordinates[0, -1]
|
||||
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
|
||||
self.assertEqual(alignment[0], "CAAGCCCTGCGCGCTCAGCCGGAGT")
|
||||
self.assertTrue(
|
||||
numpy.array_equal(alignment.coordinates, numpy.array([[0, 25]]))
|
||||
)
|
||||
self.assertEqual(
|
||||
str(alignment),
|
||||
"""\
|
||||
canFam2.f 0 CAAGCCCTGCGCGCTCAGCCGGAGT 25
|
||||
""",
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment.format("mauve", metadata, identifiers),
|
||||
"""\
|
||||
> 2:1-25 + canFam2.fa
|
||||
CAAGCCCTGCGCGCTCAGCCGGAGT
|
||||
=
|
||||
""",
|
||||
)
|
||||
self.assertTrue(
|
||||
numpy.array_equal(
|
||||
numpy.array(alignment, "U"),
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
numpy.array([['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', 'G', 'C', 'G',
|
||||
'C', 'T', 'C', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'G', 'T']],
|
||||
numpy.array([['T', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'C', 'C', 'T', 'A', 'G',
|
||||
'C', 'A', 'C', 'A', 'C', 'A', 'C', 'C', 'C', 'G', 'G', 'A', 'G',
|
||||
'T', 'G', 'G', 'C', 'C', '-', 'G', 'G', 'G', 'C', 'C', 'G', 'T',
|
||||
'A', 'C', '-', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
|
||||
['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', '-', '-', 'G',
|
||||
'C', 'G', 'C', 'T', 'C', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'G',
|
||||
'T', 'G', 'T', 'C', 'C', 'C', 'G', 'G', 'G', 'C', 'C', 'C', 'T',
|
||||
'G', 'C', '-', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'C'],
|
||||
['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
|
||||
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
|
||||
'-', '-', '-', '-', '-', '-', '-', 'G', 'G', 'A', 'T', 'C', 'T',
|
||||
'A', 'C', 'T', 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'T', 'T', 'C']],
|
||||
dtype='U')
|
||||
# fmt: on
|
||||
)
|
||||
@ -615,31 +558,29 @@ numpy.array([['C', 'A', 'A', 'G', 'C', 'C', 'C', 'T', 'G', 'C', 'G', 'C', 'G',
|
||||
self.assertEqual(len(alignment), 1)
|
||||
self.assertEqual(len(alignment.sequences), 1)
|
||||
self.assertEqual(alignment.sequences[0].id, "mm9.fa")
|
||||
sequence = self.sequences[alignment.sequences[0].id]
|
||||
start = alignment.coordinates[0, 0]
|
||||
end = alignment.coordinates[0, -1]
|
||||
self.assertEqual(start, 24)
|
||||
self.assertEqual(end, 41)
|
||||
self.assertEqual(alignment.sequences[0].seq[start:end], "GTCCGGAGCTGGGACGT")
|
||||
self.assertEqual(
|
||||
repr(alignment.sequences[0].seq),
|
||||
"Seq({19: 'CTGGCGTCCGGAGCTGGGACGT'}, length=41)",
|
||||
)
|
||||
sequence = self.sequences[alignment.sequences[0].id]
|
||||
start = alignment.coordinates[0, 0]
|
||||
end = alignment.coordinates[0, -1]
|
||||
self.assertEqual(alignment.sequences[0].seq[start:end], sequence[start:end])
|
||||
self.assertEqual(alignment[0], "GTCCGGAGCTGGGACGT")
|
||||
self.assertEqual(alignment[0], "CTGGCGTCCGGAGCTGGGACGT")
|
||||
self.assertTrue(
|
||||
numpy.array_equal(alignment.coordinates, numpy.array([[19, 41]]))
|
||||
)
|
||||
self.assertEqual(
|
||||
str(alignment),
|
||||
"""\
|
||||
mm9.fa 24 GTCCGGAGCTGGGACGT 41
|
||||
mm9.fa 19 CTGGCGTCCGGAGCTGGGACGT 41
|
||||
""",
|
||||
)
|
||||
self.assertTrue(
|
||||
numpy.array_equal(alignment.coordinates, numpy.array([[24, 41]]))
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment.format("mauve", metadata, identifiers),
|
||||
"""\
|
||||
> 3:25-41 + mm9.fa
|
||||
GTCCGGAGCTGGGACGT
|
||||
> 3:20-41 + mm9.fa
|
||||
CTGGCGTCCGGAGCTGGGACGT
|
||||
=
|
||||
""",
|
||||
)
|
||||
@ -648,15 +589,15 @@ GTCCGGAGCTGGGACGT
|
||||
numpy.array(alignment, "U"),
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
|
||||
'A', 'C', 'G', 'T']], dtype='U')
|
||||
numpy.array([['C', 'T', 'G', 'G', 'C', 'G', 'T', 'C', 'C', 'G', 'G',
|
||||
'A', 'G', 'C', 'T', 'G', 'G', 'G', 'A', 'C', 'G', 'T']], dtype='U')
|
||||
# fmt: on
|
||||
)
|
||||
)
|
||||
self.assertRaises(StopIteration, next, alignments)
|
||||
# As each nucleotide in each sequence is stored exactly once in an XMFA
|
||||
# file, we can reconstitute the full sequences:
|
||||
self.assertEqual(len(saved_alignments), 4)
|
||||
self.assertEqual(len(saved_alignments), 2)
|
||||
filenames = []
|
||||
for alignment in saved_alignments:
|
||||
for record in alignment.sequences:
|
||||
@ -697,26 +638,20 @@ numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
|
||||
for record in alignment.sequences:
|
||||
filename = record.id
|
||||
record.seq = sequences[filename]
|
||||
self.assertEqual(alignment[0], "------------------------")
|
||||
self.assertEqual(alignment[1], "GTCCCGGGCCCTGCTTTCCTTTTC")
|
||||
self.assertEqual(alignment[2], "GCCAGGGATCTACTTTTCCTCTTC")
|
||||
self.assertEqual(
|
||||
alignment[0], "TAAGCCCTCCTAGCACACACCCGGAGTGGCC-GGGCCGTAC-TTTCCTTTTC"
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment[1], "CAAGCCCTGC--GCGCTCAGCCGGAGTGTCCCGGGCCCTGC-TTTCCTTTTC"
|
||||
)
|
||||
self.assertEqual(
|
||||
alignment[2], "---------------------------------GGATCTACTTTTCCTCTTC"
|
||||
)
|
||||
alignment = saved_alignments[1]
|
||||
for record in alignment.sequences:
|
||||
filename = record.id
|
||||
record.seq = sequences[filename]
|
||||
self.assertEqual(
|
||||
alignment[0], "GAAAAGGAAAGTACGGCCCGGCCACTCCGGGTGTGTGCTAGGAGGGCTTA"
|
||||
)
|
||||
alignment = saved_alignments[2]
|
||||
for record in alignment.sequences:
|
||||
filename = record.id
|
||||
record.seq = sequences[filename]
|
||||
self.assertEqual(alignment[0], "CAAGCCCTGCGCGCTCAGCCGGAGT")
|
||||
alignment = saved_alignments[3]
|
||||
for record in alignment.sequences:
|
||||
filename = record.id
|
||||
record.seq = sequences[filename]
|
||||
self.assertEqual(alignment[0], "GTCCGGAGCTGGGACGT")
|
||||
self.assertEqual(alignment[0], "CTGGCGTCCGGAGCTGGGACGT")
|
||||
|
||||
def test_write_read(self):
|
||||
path = os.path.join("Mauve", "separate.xmfa")
|
||||
@ -729,7 +664,7 @@ numpy.array([['G', 'T', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'G',
|
||||
alignments = Align.parse(stream, "mauve")
|
||||
output = StringIO()
|
||||
n = Align.write(alignments, output, "mauve")
|
||||
self.assertEqual(n, 4)
|
||||
self.assertEqual(n, 2)
|
||||
output.seek(0)
|
||||
self.assertEqual(output.read(), data)
|
||||
|
||||
|
Reference in New Issue
Block a user