# biopython/Bio/Align/Applications/_Clustalw.py

# Copyright 2009 by Cymon J. Cox.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program Clustal W."""

import os

from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


class ClustalwCommandline(AbstractCommandline):
    """Command line wrapper for clustalw (version one or two).

    http://www.clustal.org/

    Notes
    -----
    Last checked against versions: 1.83 and 2.1

    Every parameter is registered under four spellings (``-name``, ``-NAME``,
    ``NAME`` and ``name``). Options that carry a ``checker_function`` have
    their value validated by that function; the guide tree inputs
    (``usetree``, ``usetree1``, ``usetree2``) must name an existing file.

    References
    ----------
    Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
    McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
    Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
    Bioinformatics, 23, 2947-2948.

    Examples
    --------
    >>> from Bio.Align.Applications import ClustalwCommandline
    >>> in_file = "unaligned.fasta"
    >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
    >>> print(clustalw_cline)
    clustalw2 -infile=unaligned.fasta

    You would typically run the command line with clustalw_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    # TODO - Should we default to cmd="clustalw2" now?
    def __init__(self, cmd="clustalw", **kwargs):
        """Initialize the class.

        Builds the list of supported Clustal W parameters in
        ``self.parameters`` and then delegates to
        ``AbstractCommandline.__init__``.

        Arguments:
         - cmd - name (or path) of the executable, defaults to "clustalw".
         - kwargs - forwarded unchanged to ``AbstractCommandline.__init__``
           (e.g. ``infile="unaligned.fasta"`` as in the class example).

        """
        self.parameters = [
            _Option(
                ["-infile", "-INFILE", "INFILE", "infile"],
                "Input sequences.",
                filename=True,
            ),
            _Option(
                ["-profile1", "-PROFILE1", "PROFILE1", "profile1"],
                "Profiles (old alignment).",
                filename=True,
            ),
            _Option(
                ["-profile2", "-PROFILE2", "PROFILE2", "profile2"],
                "Profiles (old alignment).",
                filename=True,
            ),
            # ################# VERBS (do things) #############################
            _Switch(
                ["-options", "-OPTIONS", "OPTIONS", "options"],
                "List the command line parameters",
            ),
            _Switch(
                ["-help", "-HELP", "HELP", "help"], "Outline the command line params."
            ),
            _Switch(
                ["-check", "-CHECK", "CHECK", "check"],
                "Outline the command line params.",
            ),
            _Switch(
                ["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"],
                "Output full help content.",
            ),
            _Switch(
                ["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment."
            ),
            _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."),
            _Switch(
                ["-pim", "-PIM", "PIM", "pim"],
                "Output percent identity matrix (while calculating the tree).",
            ),
            _Option(
                ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"],
                "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Switch(
                ["-convert", "-CONVERT", "CONVERT", "convert"],
                "Output the input sequences in a different file format.",
            ),
            # #################### PARAMETERS (set things) #########################
            # ***General settings:****
            # Makes no sense in biopython
            # _Option(["-interactive", "-INTERACTIVE", "INTERACTIVE", "interactive"],
            #        [],
            #        lambda x: 0, # Does not take value
            #        False,
            #        "read command line, then enter normal interactive menus",
            #        False),
            _Switch(
                ["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"],
                "Use FAST algorithm for the alignment guide tree",
            ),
            _Option(
                ["-type", "-TYPE", "TYPE", "type"],
                "PROTEIN or DNA sequences",
                checker_function=lambda x: x in ["PROTEIN", "DNA", "protein", "dna"],
            ),
            _Switch(
                ["-negative", "-NEGATIVE", "NEGATIVE", "negative"],
                "Protein alignment with negative values in matrix",
            ),
            _Option(
                ["-outfile", "-OUTFILE", "OUTFILE", "outfile"],
                "Output sequence alignment file name",
                filename=True,
            ),
            _Option(
                ["-output", "-OUTPUT", "OUTPUT", "output"],
                "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA",
                checker_function=lambda x: x
                in [
                    "CLUSTAL",
                    "GCG",
                    "GDE",
                    "PHYLIP",
                    "PIR",
                    "NEXUS",
                    "FASTA",
                    "clustal",
                    "gcg",
                    "gde",
                    "phylip",
                    "pir",
                    "nexus",
                    "fasta",
                ],
            ),
            _Option(
                ["-outorder", "-OUTORDER", "OUTORDER", "outorder"],
                "Output taxon order: INPUT or ALIGNED",
                checker_function=lambda x: x
                in ["INPUT", "input", "ALIGNED", "aligned"],
            ),
            _Option(
                ["-case", "-CASE", "CASE", "case"],
                "LOWER or UPPER (for GDE output only)",
                checker_function=lambda x: x in ["UPPER", "upper", "LOWER", "lower"],
            ),
            _Option(
                ["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"],
                "OFF or ON (for Clustal output only)",
                checker_function=lambda x: x in ["ON", "on", "OFF", "off"],
            ),
            _Option(
                ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"],
                "OFF or ON (NEW- for all output formats)",
                checker_function=lambda x: x in ["ON", "on", "OFF", "off"],
            ),
            _Option(
                ["-range", "-RANGE", "RANGE", "range"],
                "Sequence range to write starting m to m+n. "
                "Input as string eg. '24,200'",
            ),
            _Option(
                ["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"],
                "Maximum allowed input sequence length",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Switch(
                ["-quiet", "-QUIET", "QUIET", "quiet"],
                "Reduce console output to minimum",
            ),
            _Option(
                ["-stats", "-STATS", "STATS", "stats"],
                "Log some alignment statistics to file",
                filename=True,
            ),
            # ***Fast Pairwise Alignments:***
            _Option(
                ["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"],
                "Word size",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"],
                "Number of best diags.",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-window", "-WINDOW", "WINDOW", "window"],
                "Window around best diags.",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"],
                "Gap penalty",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-score", "-SCORE", "SCORE", "score"],
                "Either: PERCENT or ABSOLUTE",
                checker_function=lambda x: x
                in ["percent", "PERCENT", "absolute", "ABSOLUTE"],
            ),
            # ***Slow Pairwise Alignments:***
            # The matrix options accept either a built-in matrix name or the
            # path of an existing matrix file.
            _Option(
                ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"],
                "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                checker_function=lambda x: (
                    x
                    in [
                        "BLOSUM",
                        "PAM",
                        "GONNET",
                        "ID",
                        "blosum",
                        "pam",
                        "gonnet",
                        "id",
                    ]
                    or os.path.exists(x)
                ),
                filename=True,
            ),
            _Option(
                ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"],
                "DNA weight matrix=IUB, CLUSTALW or filename",
                checker_function=lambda x: (
                    x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x)
                ),
                filename=True,
            ),
            _Option(
                ["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"],
                "Gap opening penalty",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"],
                "Gap extension penalty",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            # ***Multiple Alignments:***
            _Option(
                ["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
                "Output file name for newly created guide tree",
                filename=True,
            ),
            _Option(
                ["-usetree", "-USETREE", "USETREE", "usetree"],
                "File name of guide tree",
                # The guide tree is an input file, so it must already exist.
                # Pass os.path.exists itself: wrapping it in a lambda that
                # merely returns the function would accept every value.
                checker_function=os.path.exists,
                filename=True,
            ),
            _Option(
                ["-matrix", "-MATRIX", "MATRIX", "matrix"],
                "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                checker_function=lambda x: (
                    x
                    in [
                        "BLOSUM",
                        "PAM",
                        "GONNET",
                        "ID",
                        "blosum",
                        "pam",
                        "gonnet",
                        "id",
                    ]
                    or os.path.exists(x)
                ),
                filename=True,
            ),
            _Option(
                ["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"],
                "DNA weight matrix=IUB, CLUSTALW or filename",
                checker_function=lambda x: (
                    x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x)
                ),
                filename=True,
            ),
            _Option(
                ["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"],
                "Gap opening penalty",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-gapext", "-GAPEXT", "GAPEXT", "gapext"],
                "Gap extension penalty",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Switch(
                ["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"],
                "No end gap separation pen.",
            ),
            _Option(
                ["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"],
                "Gap separation pen. range",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Switch(
                ["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off"
            ),
            _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"),
            # NOTE(review): the ClustalW help appears to list this parameter as
            # "-HGAPRESIDUES=" (i.e. taking a residue list), yet it is modelled
            # here as a value-less switch. Left unchanged for backward
            # compatibility - confirm against the ClustalW documentation.
            _Switch(
                ["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"],
                "List hydrophilic res.",
            ),
            _Option(
                ["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"],
                "% ident. for delay",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            # Already handled in General Settings section, but appears a second
            # time under Multiple Alignments in the help
            # _Option(["-type", "-TYPE", "TYPE", "type"],
            #        "PROTEIN or DNA",
            #        checker_function=lambda x: x in ["PROTEIN", "DNA",
            #                                         "protein", "dna"]),
            _Option(
                ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"],
                "Transitions weighting",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-iteration", "-ITERATION", "ITERATION", "iteration"],
                "NONE or TREE or ALIGNMENT",
                checker_function=lambda x: x
                in ["NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"],
            ),
            _Option(
                ["-numiter", "-NUMITER", "NUMITER", "numiter"],
                "maximum number of iterations to perform",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Switch(
                ["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"],
                "Disable sequence weighting",
            ),
            # ***Profile Alignments:***
            _Switch(
                ["-profile", "-PROFILE", "PROFILE", "profile"],
                "Merge two alignments by profile alignment",
            ),
            _Option(
                ["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"],
                "Output file name for new guide tree of profile1",
                filename=True,
            ),
            _Option(
                ["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"],
                "Output file for new guide tree of profile2",
                filename=True,
            ),
            _Option(
                ["-usetree1", "-USETREE1", "USETREE1", "usetree1"],
                "File name of guide tree for profile1",
                # Input file: must already exist (see -usetree).
                checker_function=os.path.exists,
                filename=True,
            ),
            _Option(
                ["-usetree2", "-USETREE2", "USETREE2", "usetree2"],
                "File name of guide tree for profile2",
                # Input file: must already exist (see -usetree).
                checker_function=os.path.exists,
                filename=True,
            ),
            # ***Sequence to Profile Alignments:***
            _Switch(
                ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"],
                "Sequentially add profile2 sequences to profile1 alignment",
            ),
            # These are already handled in the Multiple Alignments section,
            # but appear a second time here in the help.
            # _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
            #        "File for new guide tree",
            #        filename=True),
            # _Option(["-usetree", "-USETREE", "USETREE", "usetree"],
            #        "File for old guide tree",
            #        checker_function=os.path.exists,
            #        filename=True),
            # ***Structure Alignments:***
            _Switch(
                ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"],
                "Do not use secondary structure-gap penalty mask for profile 1",
            ),
            _Switch(
                ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"],
                "Do not use secondary structure-gap penalty mask for profile 2",
            ),
            _Option(
                ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"],
                "STRUCTURE or MASK or BOTH or NONE output in alignment file",
                checker_function=lambda x: x
                in [
                    "STRUCTURE",
                    "MASK",
                    "BOTH",
                    "NONE",
                    "structure",
                    "mask",
                    "both",
                    "none",
                ],
            ),
            _Option(
                ["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"],
                "Gap penalty for helix core residues",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"],
                "gap penalty for strand core residues",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"],
                "Gap penalty for loop regions",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"],
                "Gap penalty for structure termini",
                checker_function=lambda x: (isinstance(x, (float, int))),
            ),
            _Option(
                ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"],
                "Number of residues inside helix to be treated as terminal",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Option(
                ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"],
                "Number of residues outside helix to be treated as terminal",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Option(
                ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"],
                "Number of residues inside strand to be treated as terminal",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Option(
                ["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"],
                "Number of residues outside strand to be treated as terminal",
                checker_function=lambda x: isinstance(x, int),
            ),
            # ***Trees:***
            _Option(
                ["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"],
                "nj OR phylip OR dist OR nexus",
                checker_function=lambda x: x
                in ["NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus"],
            ),
            _Option(
                ["-seed", "-SEED", "SEED", "seed"],
                "Seed number for bootstraps.",
                checker_function=lambda x: isinstance(x, int),
            ),
            _Switch(
                ["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction."
            ),
            _Switch(
                ["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"],
                "Ignore positions with gaps.",
            ),
            _Option(
                ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"],
                "Node OR branch position of bootstrap values in tree display",
                checker_function=lambda x: x in ["NODE", "BRANCH", "node", "branch"],
            ),
            _Option(
                ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"],
                "NJ or UPGMA",
                checker_function=lambda x: x in ["NJ", "UPGMA", "nj", "upgma"],
            ),
        ]
        # self.parameters must be populated before the base initializer runs,
        # as it receives the keyword arguments naming those parameters.
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module's
    # docstrings via Biopython's helper.
    from Bio._utils import run_doctest as _run_module_doctests

    _run_module_doctests()