mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
438 lines
20 KiB
Python
438 lines
20 KiB
Python
# Copyright 2009 by Cymon J. Cox. All rights reserved.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Command line wrapper for the multiple alignment programme MAFFT."""
|
|
|
|
from Bio.Application import _Argument
|
|
from Bio.Application import _Option
|
|
from Bio.Application import _Switch
|
|
from Bio.Application import AbstractCommandline
|
|
|
|
|
|
class MafftCommandline(AbstractCommandline):
|
|
"""Command line wrapper for the multiple alignment program MAFFT.
|
|
|
|
http://align.bmr.kyushu-u.ac.jp/mafft/software/
|
|
|
|
Notes
|
|
-----
|
|
Last checked against version: MAFFT v6.717b (2009/12/03)
|
|
|
|
References
|
|
----------
|
|
Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
|
|
multiple ncRNA alignment by incorporating structural information into
|
|
a MAFFT-based framework (describes RNA structural alignment methods)
|
|
|
|
Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
|
|
developments in the MAFFT multiple sequence alignment program
|
|
(outlines version 6)
|
|
|
|
Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
|
|
algorithm to build an approximate tree from a large number of
|
|
unaligned sequences (describes the PartTree algorithm)
|
|
|
|
Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
|
|
version 5: improvement in accuracy of multiple sequence alignment
|
|
(describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
|
|
strategies)
|
|
|
|
Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
|
|
|
|
Examples
|
|
--------
|
|
>>> from Bio.Align.Applications import MafftCommandline
|
|
>>> mafft_exe = "/opt/local/mafft"
|
|
>>> in_file = "../Doc/examples/opuntia.fasta"
|
|
>>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
|
|
>>> print(mafft_cline)
|
|
/opt/local/mafft ../Doc/examples/opuntia.fasta
|
|
|
|
If the mafft binary is on the path (typically the case on a Unix style
|
|
operating system) then you don't need to supply the executable location:
|
|
|
|
>>> from Bio.Align.Applications import MafftCommandline
|
|
>>> in_file = "../Doc/examples/opuntia.fasta"
|
|
>>> mafft_cline = MafftCommandline(input=in_file)
|
|
>>> print(mafft_cline)
|
|
mafft ../Doc/examples/opuntia.fasta
|
|
|
|
You would typically run the command line with mafft_cline() or via
|
|
the Python subprocess module, as described in the Biopython tutorial.
|
|
|
|
Note that MAFFT will write the alignment to stdout, which you may
|
|
want to save to a file and then parse, e.g.::
|
|
|
|
stdout, stderr = mafft_cline()
|
|
with open("aligned.fasta", "w") as handle:
|
|
handle.write(stdout)
|
|
from Bio import AlignIO
|
|
align = AlignIO.read("aligned.fasta", "fasta")
|
|
|
|
Alternatively, to parse the output with AlignIO directly you can
|
|
use StringIO to turn the string into a handle::
|
|
|
|
stdout, stderr = mafft_cline()
|
|
from io import StringIO
|
|
from Bio import AlignIO
|
|
align = AlignIO.read(StringIO(stdout), "fasta")
|
|
|
|
"""
|
|
|
|
def __init__(self, cmd="mafft", **kwargs):
|
|
"""Initialize the class."""
|
|
BLOSUM_MATRICES = ["30", "45", "62", "80"]
|
|
self.parameters = [
|
|
# **** Algorithm ****
|
|
# Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
|
|
# i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
|
|
_Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
|
|
# Distance is calculated based on the number of shared 6mers. Default: on
|
|
_Switch(
|
|
["--6merpair", "6merpair", "sixmerpair"],
|
|
"Distance is calculated based on the number of shared "
|
|
"6mers. Default: on",
|
|
),
|
|
# All pairwise alignments are computed with the Needleman-Wunsch
|
|
# algorithm. More accurate but slower than --6merpair. Suitable for a
|
|
# set of globally alignable sequences. Applicable to up to ~200
|
|
# sequences. A combination with --maxiterate 1000 is recommended (G-
|
|
# INS-i). Default: off (6mer distance is used)
|
|
_Switch(
|
|
["--globalpair", "globalpair"],
|
|
"All pairwise alignments are computed with the "
|
|
"Needleman-Wunsch algorithm. Default: off",
|
|
),
|
|
# All pairwise alignments are computed with the Smith-Waterman
|
|
# algorithm. More accurate but slower than --6merpair. Suitable for a
|
|
# set of locally alignable sequences. Applicable to up to ~200
|
|
# sequences. A combination with --maxiterate 1000 is recommended (L-
|
|
# INS-i). Default: off (6mer distance is used)
|
|
_Switch(
|
|
["--localpair", "localpair"],
|
|
"All pairwise alignments are computed with the "
|
|
"Smith-Waterman algorithm. Default: off",
|
|
),
|
|
# All pairwise alignments are computed with a local algorithm with
|
|
# the generalized affine gap cost (Altschul 1998). More accurate but
|
|
# slower than --6merpair. Suitable when large internal gaps are
|
|
# expected. Applicable to up to ~200 sequences. A combination with --
|
|
# maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
|
|
# distance is used)
|
|
_Switch(
|
|
["--genafpair", "genafpair"],
|
|
"All pairwise alignments are computed with a local "
|
|
"algorithm with the generalized affine gap cost "
|
|
"(Altschul 1998). Default: off",
|
|
),
|
|
# All pairwise alignments are computed with FASTA (Pearson and Lipman
|
|
# 1988). FASTA is required. Default: off (6mer distance is used)
|
|
_Switch(
|
|
["--fastapair", "fastapair"],
|
|
"All pairwise alignments are computed with FASTA "
|
|
"(Pearson and Lipman 1988). Default: off",
|
|
),
|
|
# Weighting factor for the consistency term calculated from pairwise
|
|
# alignments. Valid when either of --blobalpair, --localpair, --
|
|
# genafpair, --fastapair or --blastpair is selected. Default: 2.7
|
|
_Option(
|
|
["--weighti", "weighti"],
|
|
"Weighting factor for the consistency term calculated "
|
|
"from pairwise alignments. Default: 2.7",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Guide tree is built number times in the progressive stage. Valid
|
|
# with 6mer distance. Default: 2
|
|
_Option(
|
|
["--retree", "retree"],
|
|
"Guide tree is built number times in the progressive "
|
|
"stage. Valid with 6mer distance. Default: 2",
|
|
checker_function=lambda x: isinstance(x, int),
|
|
equate=False,
|
|
),
|
|
# Number cycles of iterative refinement are performed. Default: 0
|
|
_Option(
|
|
["--maxiterate", "maxiterate"],
|
|
"Number cycles of iterative refinement are performed. Default: 0",
|
|
checker_function=lambda x: isinstance(x, int),
|
|
equate=False,
|
|
),
|
|
# Number of threads to use. Default: 1
|
|
_Option(
|
|
["--thread", "thread"],
|
|
"Number of threads to use. Default: 1",
|
|
checker_function=lambda x: isinstance(x, int),
|
|
equate=False,
|
|
),
|
|
# Use FFT approximation in group-to-group alignment. Default: on
|
|
_Switch(
|
|
["--fft", "fft"],
|
|
"Use FFT approximation in group-to-group alignment. Default: on",
|
|
),
|
|
# Do not use FFT approximation in group-to-group alignment. Default:
|
|
# off
|
|
_Switch(
|
|
["--nofft", "nofft"],
|
|
"Do not use FFT approximation in group-to-group "
|
|
"alignment. Default: off",
|
|
),
|
|
# Alignment score is not checked in the iterative refinement stage.
|
|
# Default: off (score is checked)
|
|
_Switch(
|
|
["--noscore", "noscore"],
|
|
"Alignment score is not checked in the iterative "
|
|
"refinement stage. Default: off (score is checked)",
|
|
),
|
|
# Use the Myers-Miller (1988) algorithm. Default: automatically
|
|
# turned on when the alignment length exceeds 10,000 (aa/nt).
|
|
_Switch(
|
|
["--memsave", "memsave"],
|
|
"Use the Myers-Miller (1988) algorithm. Default: "
|
|
"automatically turned on when the alignment length "
|
|
"exceeds 10,000 (aa/nt).",
|
|
),
|
|
# Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
|
|
# the 6mer distance. Recommended for a large number (> ~10,000) of
|
|
# sequences are input. Default: off
|
|
_Switch(
|
|
["--parttree", "parttree"],
|
|
"Use a fast tree-building method with the 6mer "
|
|
"distance. Default: off",
|
|
),
|
|
# The PartTree algorithm is used with distances based on DP. Slightly
|
|
# more accurate and slower than --parttree. Recommended for a large
|
|
# number (> ~10,000) of sequences are input. Default: off
|
|
_Switch(
|
|
["--dpparttree", "dpparttree"],
|
|
"The PartTree algorithm is used with distances "
|
|
"based on DP. Default: off",
|
|
),
|
|
# The PartTree algorithm is used with distances based on FASTA.
|
|
# Slightly more accurate and slower than --parttree. Recommended for
|
|
# a large number (> ~10,000) of sequences are input. FASTA is
|
|
# required. Default: off
|
|
_Switch(
|
|
["--fastaparttree", "fastaparttree"],
|
|
"The PartTree algorithm is used with distances based "
|
|
"on FASTA. Default: off",
|
|
),
|
|
# The number of partitions in the PartTree algorithm. Default: 50
|
|
_Option(
|
|
["--partsize", "partsize"],
|
|
"The number of partitions in the PartTree algorithm. Default: 50",
|
|
checker_function=lambda x: isinstance(x, int),
|
|
equate=False,
|
|
),
|
|
# Do not make alignment larger than number sequences. Valid only with
|
|
# the --*parttree options. Default: the number of input sequences
|
|
_Switch(
|
|
["--groupsize", "groupsize"],
|
|
"Do not make alignment larger than number sequences. "
|
|
"Default: the number of input sequences",
|
|
),
|
|
# Adjust direction according to the first sequence
|
|
# Mafft V6 beta function
|
|
_Switch(
|
|
["--adjustdirection", "adjustdirection"],
|
|
"Adjust direction according to the first sequence. Default off.",
|
|
),
|
|
# Adjust direction according to the first sequence
|
|
# for highly diverged data; very slow
|
|
# Mafft V6 beta function
|
|
_Switch(
|
|
["--adjustdirectionaccurately", "adjustdirectionaccurately"],
|
|
"Adjust direction according to the first sequence,"
|
|
"for highly diverged data; very slow"
|
|
"Default off.",
|
|
),
|
|
# **** Parameter ****
|
|
# Gap opening penalty at group-to-group alignment. Default: 1.53
|
|
_Option(
|
|
["--op", "op"],
|
|
"Gap opening penalty at group-to-group alignment. Default: 1.53",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Offset value, which works like gap extension penalty, for group-to-
|
|
# group alignment. Default: 0.123
|
|
_Option(
|
|
["--ep", "ep"],
|
|
"Offset value, which works like gap extension penalty, "
|
|
"for group-to- group alignment. Default: 0.123",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Gap opening penalty at local pairwise alignment. Valid when the --
|
|
# localpair or --genafpair option is selected. Default: -2.00
|
|
_Option(
|
|
["--lop", "lop"],
|
|
"Gap opening penalty at local pairwise alignment. Default: 0.123",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Offset value at local pairwise alignment. Valid when the --
|
|
# localpair or --genafpair option is selected. Default: 0.1
|
|
_Option(
|
|
["--lep", "lep"],
|
|
"Offset value at local pairwise alignment. Default: 0.1",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Gap extension penalty at local pairwise alignment. Valid when the -
|
|
# -localpair or --genafpair option is selected. Default: -0.1
|
|
_Option(
|
|
["--lexp", "lexp"],
|
|
"Gap extension penalty at local pairwise alignment. Default: -0.1",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Gap opening penalty to skip the alignment. Valid when the --
|
|
# genafpair option is selected. Default: -6.00
|
|
_Option(
|
|
["--LOP", "LOP"],
|
|
"Gap opening penalty to skip the alignment. Default: -6.00",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# Gap extension penalty to skip the alignment. Valid when the --
|
|
# genafpair option is selected. Default: 0.00
|
|
_Option(
|
|
["--LEXP", "LEXP"],
|
|
"Gap extension penalty to skip the alignment. Default: 0.00",
|
|
checker_function=lambda x: isinstance(x, float),
|
|
equate=False,
|
|
),
|
|
# BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
|
|
# number=30, 45, 62 or 80. Default: 62
|
|
_Option(
|
|
["--bl", "bl"],
|
|
"BLOSUM number matrix is used. Default: 62",
|
|
checker_function=lambda x: x in BLOSUM_MATRICES,
|
|
equate=False,
|
|
),
|
|
# JTT PAM number (Jones et al. 1992) matrix is used. number>0.
|
|
# Default: BLOSUM62
|
|
_Option(
|
|
["--jtt", "jtt"],
|
|
"JTT PAM number (Jones et al. 1992) matrix is used. "
|
|
"number>0. Default: BLOSUM62",
|
|
equate=False,
|
|
),
|
|
# Transmembrane PAM number (Jones et al. 1994) matrix is used.
|
|
# number>0. Default: BLOSUM62
|
|
_Option(
|
|
["--tm", "tm"],
|
|
"Transmembrane PAM number (Jones et al. 1994) "
|
|
"matrix is used. number>0. Default: BLOSUM62",
|
|
filename=True, # to ensure spaced inputs are quoted
|
|
equate=False,
|
|
),
|
|
# Use a user-defined AA scoring matrix. The format of matrixfile is
|
|
# the same to that of BLAST. Ignored when nucleotide sequences are
|
|
# input. Default: BLOSUM62
|
|
_Option(
|
|
["--aamatrix", "aamatrix"],
|
|
"Use a user-defined AA scoring matrix. Default: BLOSUM62",
|
|
filename=True, # to ensure spaced inputs are quoted
|
|
equate=False,
|
|
),
|
|
# Incorporate the AA/nuc composition information into the scoring
|
|
# matrix. Default: off
|
|
_Switch(
|
|
["--fmodel", "fmodel"],
|
|
"Incorporate the AA/nuc composition information into "
|
|
"the scoring matrix (True) or not (False, default)",
|
|
),
|
|
# **** Output ****
|
|
# Name length for CLUSTAL and PHYLIP format output
|
|
_Option(
|
|
["--namelength", "namelength"],
|
|
"""Name length in CLUSTAL and PHYLIP output.
|
|
|
|
MAFFT v6.847 (2011) added --namelength for use with
|
|
the --clustalout option for CLUSTAL output.
|
|
|
|
MAFFT v7.024 (2013) added support for this with the
|
|
--phylipout option for PHYLIP output (default 10).
|
|
""",
|
|
checker_function=lambda x: isinstance(x, int),
|
|
equate=False,
|
|
),
|
|
# Output format: clustal format. Default: off (fasta format)
|
|
_Switch(
|
|
["--clustalout", "clustalout"],
|
|
"Output format: clustal (True) or fasta (False, default)",
|
|
),
|
|
# Output format: phylip format.
|
|
# Added in beta with v6.847, fixed in v6.850 (2011)
|
|
_Switch(
|
|
["--phylipout", "phylipout"],
|
|
"Output format: phylip (True), or fasta (False, default)",
|
|
),
|
|
# Output order: same as input. Default: on
|
|
_Switch(
|
|
["--inputorder", "inputorder"],
|
|
"Output order: same as input (True, default) or alignment "
|
|
"based (False)",
|
|
),
|
|
# Output order: aligned. Default: off (inputorder)
|
|
_Switch(
|
|
["--reorder", "reorder"],
|
|
"Output order: aligned (True) or in input order (False, default)",
|
|
),
|
|
# Guide tree is output to the input.tree file. Default: off
|
|
_Switch(
|
|
["--treeout", "treeout"],
|
|
"Guide tree is output to the input.tree file (True) or "
|
|
"not (False, default)",
|
|
),
|
|
# Do not report progress. Default: off
|
|
_Switch(
|
|
["--quiet", "quiet"],
|
|
"Do not report progress (True) or not (False, default).",
|
|
),
|
|
# **** Input ****
|
|
# Assume the sequences are nucleotide. Default: auto
|
|
_Switch(
|
|
["--nuc", "nuc"],
|
|
"Assume the sequences are nucleotide (True/False). Default: auto",
|
|
),
|
|
# Assume the sequences are amino acid. Default: auto
|
|
_Switch(
|
|
["--amino", "amino"],
|
|
"Assume the sequences are amino acid (True/False). Default: auto",
|
|
),
|
|
# MAFFT has multiple --seed commands where the unaligned input is
|
|
# aligned to the seed alignment. There can be multiple seeds in the
|
|
# form: "mafft --seed align1 --seed align2 [etc] input"
|
|
# Effectively for n number of seed alignments.
|
|
# TODO - Can we use class _ArgumentList here?
|
|
_Option(
|
|
["--seed", "seed"],
|
|
"Seed alignments given in alignment_n (fasta format) "
|
|
"are aligned with sequences in input.",
|
|
filename=True,
|
|
equate=False,
|
|
),
|
|
# The input (must be FASTA format)
|
|
_Argument(["input"], "Input file name", filename=True, is_required=True),
|
|
# mafft-profile takes a second alignment input as an argument:
|
|
# mafft-profile align1 align2
|
|
_Argument(
|
|
["input1"],
|
|
"Second input file name for the mafft-profile command",
|
|
filename=True,
|
|
),
|
|
]
|
|
AbstractCommandline.__init__(self, cmd, **kwargs)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from Bio._utils import run_doctest
|
|
|
|
run_doctest()
|