diff --git a/Bio/Align/Applications/_ClustalOmega.py b/Bio/Align/Applications/_ClustalOmega.py deleted file mode 100644 index f85b3e7c0..000000000 --- a/Bio/Align/Applications/_ClustalOmega.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright 2011 by Andreas Wilm. All rights reserved. -# Based on ClustalW wrapper copyright 2009 by Cymon J. Cox. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program Clustal Omega.""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class ClustalOmegaCommandline(AbstractCommandline): - """Command line wrapper for clustal omega. - - http://www.clustal.org/omega - - Notes - ----- - Last checked against version: 1.2.0 - - References - ---------- - Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R, - McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011). - Fast, scalable generation of high-quality protein multiple - sequence alignments using Clustal Omega. - Molecular Systems Biology 7:539 https://doi.org/10.1038/msb.2011.75 - - Examples - -------- - >>> from Bio.Align.Applications import ClustalOmegaCommandline - >>> in_file = "unaligned.fasta" - >>> out_file = "aligned.fasta" - >>> clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True) - >>> print(clustalomega_cline) - clustalo -i unaligned.fasta -o aligned.fasta --auto -v - - You would typically run the command line with clustalomega_cline() or via - the Python subprocess module, as described in the Biopython tutorial. 
- - """ - - def __init__(self, cmd="clustalo", **kwargs): - """Initialize the class.""" - # order parameters in the same order as clustalo --help - self.parameters = [ - # Sequence Input - _Option( - ["-i", "--in", "--infile", "infile"], - "Multiple sequence input file", - filename=True, - equate=False, - ), - _Option( - ["--hmm-in", "HMM input", "hmm_input"], - "HMM input files", - filename=True, - equate=False, - ), - _Switch(["--dealign", "dealign"], "Dealign input sequences"), - _Option( - ["--profile1", "--p1", "profile1"], - "Pre-aligned multiple sequence file (aligned columns will be kept fix).", - filename=True, - equate=False, - ), - _Option( - ["--profile2", "--p2", "profile2"], - "Pre-aligned multiple sequence file (aligned columns will be kept fix).", - filename=True, - equate=False, - ), - _Option( - ["-t", "--seqtype", "seqtype"], - "{Protein, RNA, DNA} Force a sequence type (default: auto).", - equate=False, - checker_function=lambda x: x - in ["protein", "rna", "dna", "Protein", "RNA", "DNA", "PROTEIN"], - ), - _Switch( - ["--is-profile", "isprofile"], - "disable check if profile, force profile (default no)", - ), - _Option( - ["--infmt", "infmt"], - """Forced sequence input file format (default: auto) - - Allowed values: a2m, fa[sta], clu[stal], msf, phy[lip], selex, st[ockholm], vie[nna] - """, - equate=False, - checker_function=lambda x: x - in [ - "a2m", - "fa", - "fasta", - "clu", - "clustal", - "msf", - "phy", - "phylip", - "selex", - "st", - "stockholm", - "vie", - "vienna", - ], - ), - # Clustering - _Option( - ["--distmat-in", "distmat_in"], - "Pairwise distance matrix input file (skips distance computation).", - filename=True, - equate=False, - ), - _Option( - ["--distmat-out", "distmat_out"], - "Pairwise distance matrix output file.", - filename=True, - equate=False, - ), - _Option( - ["--guidetree-in", "guidetree_in"], - "Guide tree input file (skips distance computation and guide-tree clustering step).", - filename=True, - equate=False, 
- ), - _Option( - ["--guidetree-out", "guidetree_out"], - "Guide tree output file.", - filename=True, - equate=False, - ), - _Switch( - ["--full", "distmat_full"], - "Use full distance matrix for guide-tree calculation (slow; mBed is default)", - ), - _Switch( - ["--full-iter", "distmat_full_iter"], - "Use full distance matrix for guide-tree calculation during iteration (mBed is default)", - ), - _Option( - ["--cluster-size", "clustersize"], - "soft maximum of sequences in sub-clusters", - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["--clustering-out", "clusteringout"], - "Clustering output file", - filename=True, - ), - _Switch( - ["--use-kimura", "usekimura"], - "use Kimura distance correction for aligned sequences (default no)", - ), - _Switch( - ["--percent-id", "percentid"], - "convert distances into percent identities (default no)", - ), - # Alignment Output - _Option( - ["-o", "--out", "--outfile", "outfile"], - "Multiple sequence alignment output file (default: stdout).", - filename=True, - equate=False, - ), - _Option( - ["--outfmt", "outfmt"], - "MSA output file format:" - " a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]" - " (default: fasta).", - equate=False, - checker_function=lambda x: x - in [ - "a2m", - "fa", - "fasta", - "clu", - "clustal", - "msf", - "phy", - "phylip", - "selex", - "st", - "stockholm", - "vie", - "vienna", - ], - ), - _Switch( - ["--residuenumber", "--resno", "residuenumber"], - "in Clustal format print residue numbers (default no)", - ), - _Option( - ["--wrap", "wrap"], - "number of residues before line-wrap in output", - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["--output-order", "outputorder"], - "MSA output order like in input/guide-tree", - checker_function=lambda x: x in ["input-order", "tree-order"], - ), - # Iteration - _Option( - ["--iterations", "--iter", "iterations"], - "Number of (combined guide-tree/HMM) iterations", - equate=False, - 
checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["--max-guidetree-iterations", "max_guidetree_iterations"], - "Maximum number of guidetree iterations", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["--max-hmm-iterations", "max_hmm_iterations"], - "Maximum number of HMM iterations", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - # Limits (will exit early, if exceeded): - _Option( - ["--maxnumseq", "maxnumseq"], - "Maximum allowed number of sequences", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["--maxseqlen", "maxseqlen"], - "Maximum allowed sequence length", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - # Miscellaneous: - _Switch( - ["--auto", "auto"], - "Set options automatically (might overwrite some of your options)", - ), - _Option( - ["--threads", "threads"], - "Number of processors to use", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-l", "--log", "log"], - "Log all non-essential output to this file.", - filename=True, - equate=False, - ), - _Switch(["-h", "--help", "help"], "Print help and exit."), - _Switch(["-v", "--verbose", "verbose"], "Verbose output"), - _Switch(["--version", "version"], "Print version information and exit"), - _Switch( - ["--long-version", "long_version"], - "Print long version information and exit", - ), - _Switch(["--force", "force"], "Force file overwriting."), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Clustalw.py b/Bio/Align/Applications/_Clustalw.py deleted file mode 100644 index 4349212f0..000000000 --- a/Bio/Align/Applications/_Clustalw.py +++ /dev/null @@ -1,488 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. 
-# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program Clustal W.""" - -import os - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class ClustalwCommandline(AbstractCommandline): - """Command line wrapper for clustalw (version one or two). - - http://www.clustal.org/ - - Notes - ----- - Last checked against versions: 1.83 and 2.1 - - References - ---------- - Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA, - McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD, - Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0. - Bioinformatics, 23, 2947-2948. - - Examples - -------- - >>> from Bio.Align.Applications import ClustalwCommandline - >>> in_file = "unaligned.fasta" - >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file) - >>> print(clustalw_cline) - clustalw2 -infile=unaligned.fasta - - You would typically run the command line with clustalw_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - # TODO - Should we default to cmd="clustalw2" now? 
- def __init__(self, cmd="clustalw", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-infile", "-INFILE", "INFILE", "infile"], - "Input sequences.", - filename=True, - ), - _Option( - ["-profile1", "-PROFILE1", "PROFILE1", "profile1"], - "Profiles (old alignment).", - filename=True, - ), - _Option( - ["-profile2", "-PROFILE2", "PROFILE2", "profile2"], - "Profiles (old alignment).", - filename=True, - ), - # ################# VERBS (do things) ############################# - _Switch( - ["-options", "-OPTIONS", "OPTIONS", "options"], - "List the command line parameters", - ), - _Switch( - ["-help", "-HELP", "HELP", "help"], "Outline the command line params." - ), - _Switch( - ["-check", "-CHECK", "CHECK", "check"], - "Outline the command line params.", - ), - _Switch( - ["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"], - "Output full help content.", - ), - _Switch( - ["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment." - ), - _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."), - _Switch( - ["-pim", "-PIM", "PIM", "pim"], - "Output percent identity matrix (while calculating the tree).", - ), - _Option( - ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"], - "Bootstrap a NJ tree (n= number of bootstraps; def. 
= 1000).", - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-convert", "-CONVERT", "CONVERT", "convert"], - "Output the input sequences in a different file format.", - ), - # #################### PARAMETERS (set things) ######################### - # ***General settings:**** - # Makes no sense in biopython - # _Option(["-interactive", "-INTERACTIVE", "INTERACTIVE", "interactive"], - # [], - # lambda x: 0, # Does not take value - # False, - # "read command line, then enter normal interactive menus", - # False), - _Switch( - ["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"], - "Use FAST algorithm for the alignment guide tree", - ), - _Option( - ["-type", "-TYPE", "TYPE", "type"], - "PROTEIN or DNA sequences", - checker_function=lambda x: x in ["PROTEIN", "DNA", "protein", "dna"], - ), - _Switch( - ["-negative", "-NEGATIVE", "NEGATIVE", "negative"], - "Protein alignment with negative values in matrix", - ), - _Option( - ["-outfile", "-OUTFILE", "OUTFILE", "outfile"], - "Output sequence alignment file name", - filename=True, - ), - _Option( - ["-output", "-OUTPUT", "OUTPUT", "output"], - "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA", - checker_function=lambda x: x - in [ - "CLUSTAL", - "GCG", - "GDE", - "PHYLIP", - "PIR", - "NEXUS", - "FASTA", - "clustal", - "gcg", - "gde", - "phylip", - "pir", - "nexus", - "fasta", - ], - ), - _Option( - ["-outorder", "-OUTORDER", "OUTORDER", "outorder"], - "Output taxon order: INPUT or ALIGNED", - checker_function=lambda x: x - in ["INPUT", "input", "ALIGNED", "aligned"], - ), - _Option( - ["-case", "-CASE", "CASE", "case"], - "LOWER or UPPER (for GDE output only)", - checker_function=lambda x: x in ["UPPER", "upper", "LOWER", "lower"], - ), - _Option( - ["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"], - "OFF or ON (for Clustal output only)", - checker_function=lambda x: x in ["ON", "on", "OFF", "off"], - ), - _Option( - ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"], - 
"OFF or ON (NEW- for all output formats)", - checker_function=lambda x: x in ["ON", "on", "OFF", "off"], - ), - _Option( - ["-range", "-RANGE", "RANGE", "range"], - "Sequence range to write starting m to m+n. " - "Input as string eg. '24,200'", - ), - _Option( - ["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"], - "Maximum allowed input sequence length", - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-quiet", "-QUIET", "QUIET", "quiet"], - "Reduce console output to minimum", - ), - _Option( - ["-stats", "-STATS", "STATS", "stats"], - "Log some alignment statistics to file", - filename=True, - ), - # ***Fast Pairwise Alignments:*** - _Option( - ["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"], - "Word size", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"], - "Number of best diags.", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-window", "-WINDOW", "WINDOW", "window"], - "Window around best diags.", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"], - "Gap penalty", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-score", "-SCORE", "SCORE", "score"], - "Either: PERCENT or ABSOLUTE", - checker_function=lambda x: x - in ["percent", "PERCENT", "absolute", "ABSOLUTE"], - ), - # ***Slow Pairwise Alignments:*** - _Option( - ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"], - "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", - checker_function=lambda x: ( - x - in [ - "BLOSUM", - "PAM", - "GONNET", - "ID", - "blosum", - "pam", - "gonnet", - "id", - ] - or os.path.exists(x) - ), - filename=True, - ), - _Option( - ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"], - "DNA weight matrix=IUB, CLUSTALW or filename", - checker_function=lambda x: ( - x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x) - 
), - filename=True, - ), - _Option( - ["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"], - "Gap opening penalty", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"], - "Gap extension penalty", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - # ***Multiple Alignments:*** - _Option( - ["-newtree", "-NEWTREE", "NEWTREE", "newtree"], - "Output file name for newly created guide tree", - filename=True, - ), - _Option( - ["-usetree", "-USETREE", "USETREE", "usetree"], - "File name of guide tree", - checker_function=lambda x: os.path.exists, - filename=True, - ), - _Option( - ["-matrix", "-MATRIX", "MATRIX", "matrix"], - "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", - checker_function=lambda x: ( - x - in [ - "BLOSUM", - "PAM", - "GONNET", - "ID", - "blosum", - "pam", - "gonnet", - "id", - ] - or os.path.exists(x) - ), - filename=True, - ), - _Option( - ["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"], - "DNA weight matrix=IUB, CLUSTALW or filename", - checker_function=lambda x: ( - x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x) - ), - filename=True, - ), - _Option( - ["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"], - "Gap opening penalty", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-gapext", "-GAPEXT", "GAPEXT", "gapext"], - "Gap extension penalty", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Switch( - ["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"], - "No end gap separation pen.", - ), - _Option( - ["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"], - "Gap separation pen. 
range", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Switch( - ["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off" - ), - _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"), - _Switch( - ["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"], - "List hydrophilic res.", - ), - _Option( - ["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"], - "% ident. for delay", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - # Already handled in General Settings section, but appears a second - # time under Multiple Alignments in the help - # _Option(["-type", "-TYPE", "TYPE", "type"], - # "PROTEIN or DNA", - # checker_function=lambda x: x in ["PROTEIN", "DNA", - # "protein", "dna"]), - _Option( - ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"], - "Transitions weighting", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-iteration", "-ITERATION", "ITERATION", "iteration"], - "NONE or TREE or ALIGNMENT", - checker_function=lambda x: x - in ["NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"], - ), - _Option( - ["-numiter", "-NUMITER", "NUMITER", "numiter"], - "maximum number of iterations to perform", - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"], - "Disable sequence weighting", - ), - # ***Profile Alignments:*** - _Switch( - ["-profile", "-PROFILE", "PROFILE", "profile"], - "Merge two alignments by profile alignment", - ), - _Option( - ["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"], - "Output file name for new guide tree of profile1", - filename=True, - ), - _Option( - ["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"], - "Output file for new guide tree of profile2", - filename=True, - ), - _Option( - ["-usetree1", "-USETREE1", "USETREE1", "usetree1"], - "File name of guide tree for profile1", - checker_function=lambda x: os.path.exists, - filename=True, - ), - 
_Option( - ["-usetree2", "-USETREE2", "USETREE2", "usetree2"], - "File name of guide tree for profile2", - checker_function=lambda x: os.path.exists, - filename=True, - ), - # ***Sequence to Profile Alignments:*** - _Switch( - ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"], - "Sequentially add profile2 sequences to profile1 alignment", - ), - # These are already handled in the Multiple Alignments section, - # but appear a second time here in the help. - # _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"], - # "File for new guide tree", - # filename=True), - # _Option(["-usetree", "-USETREE", "USETREE", "usetree"], - # "File for old guide tree", - # checker_function=lambda x: os.path.exists, - # filename=True), - # ***Structure Alignments:*** - _Switch( - ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"], - "Do not use secondary structure-gap penalty mask for profile 1", - ), - _Switch( - ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"], - "Do not use secondary structure-gap penalty mask for profile 2", - ), - _Option( - ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"], - "STRUCTURE or MASK or BOTH or NONE output in alignment file", - checker_function=lambda x: x - in [ - "STRUCTURE", - "MASK", - "BOTH", - "NONE", - "structure", - "mask", - "both", - "none", - ], - ), - _Option( - ["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"], - "Gap penalty for helix core residues", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"], - "gap penalty for strand core residues", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"], - "Gap penalty for loop regions", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Option( - ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"], - "Gap penalty for structure termini", - checker_function=lambda x: (isinstance(x, (float, 
int))), - ), - _Option( - ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"], - "Number of residues inside helix to be treated as terminal", - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"], - "Number of residues outside helix to be treated as terminal", - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"], - "Number of residues inside strand to be treated as terminal", - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"], - "Number of residues outside strand to be treated as terminal", - checker_function=lambda x: isinstance(x, int), - ), - # ***Trees:*** - _Option( - ["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"], - "nj OR phylip OR dist OR nexus", - checker_function=lambda x: x - in ["NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus"], - ), - _Option( - ["-seed", "-SEED", "SEED", "seed"], - "Seed number for bootstraps.", - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction." 
- ), - _Switch( - ["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"], - "Ignore positions with gaps.", - ), - _Option( - ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"], - "Node OR branch position of bootstrap values in tree display", - checker_function=lambda x: x in ["NODE", "BRANCH", "node", "branch"], - ), - _Option( - ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"], - "NJ or UPGMA", - checker_function=lambda x: x in ["NJ", "UPGMA", "nj", "upgma"], - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Dialign.py b/Bio/Align/Applications/_Dialign.py deleted file mode 100644 index afa1de8d7..000000000 --- a/Bio/Align/Applications/_Dialign.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program DIALIGN2-2.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class DialignCommandline(AbstractCommandline): - """Command line wrapper for the multiple alignment program DIALIGN2-2. - - http://bibiserv.techfak.uni-bielefeld.de/dialign/welcome.html - - Notes - ----- - Last checked against version: 2.2 - - References - ---------- - B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence - Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36. 
- - Examples - -------- - To align a FASTA file (unaligned.fasta) with the output files names - aligned.* including a FASTA output file (aligned.fa), use: - - >>> from Bio.Align.Applications import DialignCommandline - >>> dialign_cline = DialignCommandline(input="unaligned.fasta", - ... fn="aligned", fa=True) - >>> print(dialign_cline) - dialign2-2 -fa -fn aligned unaligned.fasta - - You would typically run the command line with dialign_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="dialign2-2", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _Switch( - ["-afc", "afc"], - r"Creates additional output file '\*.afc' " - "containing data of all fragments considered " - "for alignment WARNING: this file can be HUGE !", - ), - _Switch( - ["-afc_v", "afc_v"], - "Like '-afc' but verbose: fragments are explicitly " - "printed. WARNING: this file can be EVEN BIGGER !", - ), - _Switch( - ["-anc", "anc"], - "Anchored alignment. Requires a file .anc " - "containing anchor points.", - ), - _Switch( - ["-cs", "cs"], - "If segments are translated, not only the 'Watson " - "strand' but also the 'Crick strand' is looked at.", - ), - _Switch(["-cw", "cw"], "Additional output file in CLUSTAL W format."), - _Switch( - ["-ds", "ds"], - "'dna alignment speed up' - non-translated nucleic acid " - "fragments are taken into account only if they start " - "with at least two matches. 
Speeds up DNA alignment at " - "the expense of sensitivity.", - ), - _Switch(["-fa", "fa"], "Additional output file in FASTA format."), - _Switch( - ["-ff", "ff"], - r"Creates file \*.frg containing information about all " - "fragments that are part of the respective optimal " - "pairwise alignmnets plus information about " - "consistency in the multiple alignment", - ), - _Option( - ["-fn", "fn"], - "Output files are named ..", - equate=False, - ), - _Switch( - ["-fop", "fop"], - r"Creates file \*.fop containing coordinates of all " - "fragments that are part of the respective pairwise alignments.", - ), - _Switch( - ["-fsm", "fsm"], - r"Creates file \*.fsm containing coordinates of all " - "fragments that are part of the final alignment", - ), - _Switch( - ["-iw", "iw"], - "Overlap weights switched off (by default, overlap " - "weights are used if up to 35 sequences are aligned). " - "This option speeds up the alignment but may lead " - "to reduced alignment quality.", - ), - _Switch( - ["-lgs", "lgs"], - "'long genomic sequences' - combines the following " - "options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, " - "-fop, -ff, -cs, -ds, -pst ", - ), - _Switch( - ["-lgs_t", "lgs_t"], - "Like '-lgs' but with all segment pairs assessed " - "at the peptide level (rather than 'mixed alignments' " - "as with the '-lgs' option). Therefore faster than " - "-lgs but not very sensitive for non-coding regions.", - ), - _Option( - ["-lmax", "lmax"], - "Maximum fragment length = x (default: x = 40 or " - "x = 120 for 'translated' fragments). 
Shorter x " - "speeds up the program but may affect alignment quality.", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Switch( - ["-lo", "lo"], - r"(Long Output) Additional file \*.log with information " - "about fragments selected for pairwise alignment and " - "about consistency in multi-alignment procedure.", - ), - _Switch( - ["-ma", "ma"], - "'mixed alignments' consisting of P-fragments and " - "N-fragments if nucleic acid sequences are aligned.", - ), - _Switch( - ["-mask", "mask"], - "Residues not belonging to selected fragments are " - r"replaced by '\*' characters in output alignment " - "(rather than being printed in lower-case characters)", - ), - _Switch( - ["-mat", "mat"], - r"Creates file \*mat with substitution counts derived " - "from the fragments that have been selected for alignment.", - ), - _Switch( - ["-mat_thr", "mat_thr"], - "Like '-mat' but only fragments with weight score " - "> t are considered", - ), - _Switch( - ["-max_link", "max_link"], - "'maximum linkage' clustering used to construct " - "sequence tree (instead of UPGMA).", - ), - _Switch(["-min_link", "min_link"], "'minimum linkage' clustering used."), - _Option(["-mot", "mot"], "'motif' option.", equate=False), - _Switch(["-msf", "msf"], "Separate output file in MSF format."), - _Switch( - ["-n", "n"], - "Input sequences are nucleic acid sequences. " - "No translation of fragments.", - ), - _Switch( - ["-nt", "nt"], - "Input sequences are nucleic acid sequences and " - "'nucleic acid segments' are translated to 'peptide " - "segments'.", - ), - _Switch( - ["-nta", "nta"], - "'no textual alignment' - textual alignment suppressed. " - "This option makes sense if other output files are of " - "interest -- e.g. 
the fragment files created with -ff, " - "-fop, -fsm or -lo.", - ), - _Switch( - ["-o", "o"], - "Fast version, resulting alignments may be slightly different.", - ), - _Switch( - ["-ow", "ow"], - "Overlap weights enforced (By default, overlap weights " - "are used only if up to 35 sequences are aligned since " - "calculating overlap weights is time consuming).", - ), - _Switch( - ["-pst", "pst"], - r"'print status'. Creates and updates a file \*.sta with " - "information about the current status of the program " - "run. This option is recommended if large data sets " - "are aligned since it allows the user to estimate the " - "remaining running time.", - ), - _Switch( - ["-smin", "smin"], - "Minimum similarity value for first residue pair " - "(or codon pair) in fragments. Speeds up protein " - "alignment or alignment of translated DNA fragments " - "at the expense of sensitivity.", - ), - _Option( - ["-stars", "stars"], - r"Maximum number of '\*' characters indicating degree " - "of local similarity among sequences. By default, no " - "stars are used but numbers between 0 and 9, instead.", - checker_function=lambda x: x in range(10), - equate=False, - ), - _Switch(["-stdo", "stdo"], "Results written to standard output."), - _Switch( - ["-ta", "ta"], - "Standard textual alignment printed (overrides " - "suppression of textual alignments in special " - "options, e.g. -lgs)", - ), - _Option( - ["-thr", "thr"], - "Threshold T = x.", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Switch( - ["-xfr", "xfr"], - "'exclude fragments' - list of fragments can be " - "specified that are NOT considered for pairwise alignment", - ), - _Argument( - ["input"], - "Input file name. 
Must be FASTA format", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_MSAProbs.py b/Bio/Align/Applications/_MSAProbs.py deleted file mode 100644 index 259d714ae..000000000 --- a/Bio/Align/Applications/_MSAProbs.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2013 by Christian Brueffer. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple sequence alignment program MSAProbs.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class MSAProbsCommandline(AbstractCommandline): - """Command line wrapper for MSAProbs. - - http://msaprobs.sourceforge.net - - Notes - ----- - Last checked against version: 0.9.7 - - References - ---------- - Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple - sequence alignment based on pair hidden Markov models and partition - function posterior probabilities". Bioinformatics, 2010, 26(16): 1958 -1964 - - Examples - -------- - >>> from Bio.Align.Applications import MSAProbsCommandline - >>> in_file = "unaligned.fasta" - >>> out_file = "aligned.cla" - >>> cline = MSAProbsCommandline(infile=in_file, outfile=out_file, clustalw=True) - >>> print(cline) - msaprobs -o aligned.cla -clustalw unaligned.fasta - - You would typically run the command line with cline() or via - the Python subprocess module, as described in the Biopython tutorial. 
- - """ - - def __init__(self, cmd="msaprobs", **kwargs): - """Initialize the class.""" - # order of parameters is the same as in msaprobs -help - self.parameters = [ - _Option( - ["-o", "--outfile", "outfile"], - "specify the output file name (STDOUT by default)", - filename=True, - equate=False, - ), - _Option( - ["-num_threads", "numthreads"], - "specify the number of threads used, and otherwise detect automatically", - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-clustalw", "clustalw"], - "use CLUSTALW output format instead of FASTA format", - ), - _Option( - ["-c", "consistency"], - "use 0 <= REPS <= 5 (default: 2) passes of consistency transformation", - checker_function=lambda x: isinstance(x, int) and 0 <= x <= 5, - ), - _Option( - ["-ir", "--iterative-refinement", "iterative_refinement"], - "use 0 <= REPS <= 1000 (default: 10) passes of iterative-refinement", - checker_function=lambda x: isinstance(x, int) and 0 <= x <= 1000, - ), - _Switch(["-v", "verbose"], "report progress while aligning (default: off)"), - _Option( - ["-annot", "annot"], - "write annotation for multiple alignment to FILENAME", - filename=True, - ), - _Switch( - ["-a", "--alignment-order", "alignment_order"], - "print sequences in alignment order rather than input order (default: off)", - ), - _Option(["-version", "version"], "print out version of MSAPROBS"), - _Argument(["infile"], "Multiple sequence input file", filename=True), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Mafft.py b/Bio/Align/Applications/_Mafft.py deleted file mode 100644 index 3ad1022a3..000000000 --- a/Bio/Align/Applications/_Mafft.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. 
-# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment programme MAFFT.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class MafftCommandline(AbstractCommandline): - """Command line wrapper for the multiple alignment program MAFFT. - - http://align.bmr.kyushu-u.ac.jp/mafft/software/ - - Notes - ----- - Last checked against version: MAFFT v6.717b (2009/12/03) - - References - ---------- - Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of - multiple ncRNA alignment by incorporating structural information into - a MAFFT-based framework (describes RNA structural alignment methods) - - Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent - developments in the MAFFT multiple sequence alignment program - (outlines version 6) - - Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an - algorithm to build an approximate tree from a large number of - unaligned sequences (describes the PartTree algorithm) - - Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT - version 5: improvement in accuracy of multiple sequence alignment - (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i - strategies) - - Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 
30:3059-3066, 2002) - - Examples - -------- - >>> from Bio.Align.Applications import MafftCommandline - >>> mafft_exe = "/opt/local/mafft" - >>> in_file = "../Doc/examples/opuntia.fasta" - >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file) - >>> print(mafft_cline) - /opt/local/mafft ../Doc/examples/opuntia.fasta - - If the mafft binary is on the path (typically the case on a Unix style - operating system) then you don't need to supply the executable location: - - >>> from Bio.Align.Applications import MafftCommandline - >>> in_file = "../Doc/examples/opuntia.fasta" - >>> mafft_cline = MafftCommandline(input=in_file) - >>> print(mafft_cline) - mafft ../Doc/examples/opuntia.fasta - - You would typically run the command line with mafft_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - Note that MAFFT will write the alignment to stdout, which you may - want to save to a file and then parse, e.g.:: - - stdout, stderr = mafft_cline() - with open("aligned.fasta", "w") as handle: - handle.write(stdout) - from Bio import AlignIO - align = AlignIO.read("aligned.fasta", "fasta") - - Alternatively, to parse the output with AlignIO directly you can - use StringIO to turn the string into a handle:: - - stdout, stderr = mafft_cline() - from io import StringIO - from Bio import AlignIO - align = AlignIO.read(StringIO(stdout), "fasta") - - """ - - def __init__(self, cmd="mafft", **kwargs): - """Initialize the class.""" - BLOSUM_MATRICES = ["30", "45", "62", "80"] - self.parameters = [ - # **** Algorithm **** - # Automatically selects an appropriate strategy from L-INS-i, FFT-NS- - # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) - _Switch(["--auto", "auto"], "Automatically select strategy. Default off."), - # Distance is calculated based on the number of shared 6mers. Default: on - _Switch( - ["--6merpair", "6merpair", "sixmerpair"], - "Distance is calculated based on the number of shared " - "6mers. 
Default: on", - ), - # All pairwise alignments are computed with the Needleman-Wunsch - # algorithm. More accurate but slower than --6merpair. Suitable for a - # set of globally alignable sequences. Applicable to up to ~200 - # sequences. A combination with --maxiterate 1000 is recommended (G- - # INS-i). Default: off (6mer distance is used) - _Switch( - ["--globalpair", "globalpair"], - "All pairwise alignments are computed with the " - "Needleman-Wunsch algorithm. Default: off", - ), - # All pairwise alignments are computed with the Smith-Waterman - # algorithm. More accurate but slower than --6merpair. Suitable for a - # set of locally alignable sequences. Applicable to up to ~200 - # sequences. A combination with --maxiterate 1000 is recommended (L- - # INS-i). Default: off (6mer distance is used) - _Switch( - ["--localpair", "localpair"], - "All pairwise alignments are computed with the " - "Smith-Waterman algorithm. Default: off", - ), - # All pairwise alignments are computed with a local algorithm with - # the generalized affine gap cost (Altschul 1998). More accurate but - # slower than --6merpair. Suitable when large internal gaps are - # expected. Applicable to up to ~200 sequences. A combination with -- - # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer - # distance is used) - _Switch( - ["--genafpair", "genafpair"], - "All pairwise alignments are computed with a local " - "algorithm with the generalized affine gap cost " - "(Altschul 1998). Default: off", - ), - # All pairwise alignments are computed with FASTA (Pearson and Lipman - # 1988). FASTA is required. Default: off (6mer distance is used) - _Switch( - ["--fastapair", "fastapair"], - "All pairwise alignments are computed with FASTA " - "(Pearson and Lipman 1988). Default: off", - ), - # Weighting factor for the consistency term calculated from pairwise - # alignments. Valid when either of --blobalpair, --localpair, -- - # genafpair, --fastapair or --blastpair is selected. 
Default: 2.7 - _Option( - ["--weighti", "weighti"], - "Weighting factor for the consistency term calculated " - "from pairwise alignments. Default: 2.7", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Guide tree is built number times in the progressive stage. Valid - # with 6mer distance. Default: 2 - _Option( - ["--retree", "retree"], - "Guide tree is built number times in the progressive " - "stage. Valid with 6mer distance. Default: 2", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Number cycles of iterative refinement are performed. Default: 0 - _Option( - ["--maxiterate", "maxiterate"], - "Number cycles of iterative refinement are performed. Default: 0", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Number of threads to use. Default: 1 - _Option( - ["--thread", "thread"], - "Number of threads to use. Default: 1", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Use FFT approximation in group-to-group alignment. Default: on - _Switch( - ["--fft", "fft"], - "Use FFT approximation in group-to-group alignment. Default: on", - ), - # Do not use FFT approximation in group-to-group alignment. Default: - # off - _Switch( - ["--nofft", "nofft"], - "Do not use FFT approximation in group-to-group " - "alignment. Default: off", - ), - # Alignment score is not checked in the iterative refinement stage. - # Default: off (score is checked) - _Switch( - ["--noscore", "noscore"], - "Alignment score is not checked in the iterative " - "refinement stage. Default: off (score is checked)", - ), - # Use the Myers-Miller (1988) algorithm. Default: automatically - # turned on when the alignment length exceeds 10,000 (aa/nt). - _Switch( - ["--memsave", "memsave"], - "Use the Myers-Miller (1988) algorithm. 
Default: " - "automatically turned on when the alignment length " - "exceeds 10,000 (aa/nt).", - ), - # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with - # the 6mer distance. Recommended for a large number (> ~10,000) of - # sequences are input. Default: off - _Switch( - ["--parttree", "parttree"], - "Use a fast tree-building method with the 6mer " - "distance. Default: off", - ), - # The PartTree algorithm is used with distances based on DP. Slightly - # more accurate and slower than --parttree. Recommended for a large - # number (> ~10,000) of sequences are input. Default: off - _Switch( - ["--dpparttree", "dpparttree"], - "The PartTree algorithm is used with distances " - "based on DP. Default: off", - ), - # The PartTree algorithm is used with distances based on FASTA. - # Slightly more accurate and slower than --parttree. Recommended for - # a large number (> ~10,000) of sequences are input. FASTA is - # required. Default: off - _Switch( - ["--fastaparttree", "fastaparttree"], - "The PartTree algorithm is used with distances based " - "on FASTA. Default: off", - ), - # The number of partitions in the PartTree algorithm. Default: 50 - _Option( - ["--partsize", "partsize"], - "The number of partitions in the PartTree algorithm. Default: 50", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Do not make alignment larger than number sequences. Valid only with - # the --*parttree options. Default: the number of input sequences - _Switch( - ["--groupsize", "groupsize"], - "Do not make alignment larger than number sequences. " - "Default: the number of input sequences", - ), - # Adjust direction according to the first sequence - # Mafft V6 beta function - _Switch( - ["--adjustdirection", "adjustdirection"], - "Adjust direction according to the first sequence. 
Default off.", - ), - # Adjust direction according to the first sequence - # for highly diverged data; very slow - # Mafft V6 beta function - _Switch( - ["--adjustdirectionaccurately", "adjustdirectionaccurately"], - "Adjust direction according to the first sequence," - "for highly diverged data; very slow" - "Default off.", - ), - # **** Parameter **** - # Gap opening penalty at group-to-group alignment. Default: 1.53 - _Option( - ["--op", "op"], - "Gap opening penalty at group-to-group alignment. Default: 1.53", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Offset value, which works like gap extension penalty, for group-to- - # group alignment. Default: 0.123 - _Option( - ["--ep", "ep"], - "Offset value, which works like gap extension penalty, " - "for group-to- group alignment. Default: 0.123", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Gap opening penalty at local pairwise alignment. Valid when the -- - # localpair or --genafpair option is selected. Default: -2.00 - _Option( - ["--lop", "lop"], - "Gap opening penalty at local pairwise alignment. Default: 0.123", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Offset value at local pairwise alignment. Valid when the -- - # localpair or --genafpair option is selected. Default: 0.1 - _Option( - ["--lep", "lep"], - "Offset value at local pairwise alignment. Default: 0.1", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Gap extension penalty at local pairwise alignment. Valid when the - - # -localpair or --genafpair option is selected. Default: -0.1 - _Option( - ["--lexp", "lexp"], - "Gap extension penalty at local pairwise alignment. Default: -0.1", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Gap opening penalty to skip the alignment. Valid when the -- - # genafpair option is selected. 
Default: -6.00 - _Option( - ["--LOP", "LOP"], - "Gap opening penalty to skip the alignment. Default: -6.00", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # Gap extension penalty to skip the alignment. Valid when the -- - # genafpair option is selected. Default: 0.00 - _Option( - ["--LEXP", "LEXP"], - "Gap extension penalty to skip the alignment. Default: 0.00", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. - # number=30, 45, 62 or 80. Default: 62 - _Option( - ["--bl", "bl"], - "BLOSUM number matrix is used. Default: 62", - checker_function=lambda x: x in BLOSUM_MATRICES, - equate=False, - ), - # JTT PAM number (Jones et al. 1992) matrix is used. number>0. - # Default: BLOSUM62 - _Option( - ["--jtt", "jtt"], - "JTT PAM number (Jones et al. 1992) matrix is used. " - "number>0. Default: BLOSUM62", - equate=False, - ), - # Transmembrane PAM number (Jones et al. 1994) matrix is used. - # number>0. Default: BLOSUM62 - _Option( - ["--tm", "tm"], - "Transmembrane PAM number (Jones et al. 1994) " - "matrix is used. number>0. Default: BLOSUM62", - filename=True, # to ensure spaced inputs are quoted - equate=False, - ), - # Use a user-defined AA scoring matrix. The format of matrixfile is - # the same to that of BLAST. Ignored when nucleotide sequences are - # input. Default: BLOSUM62 - _Option( - ["--aamatrix", "aamatrix"], - "Use a user-defined AA scoring matrix. Default: BLOSUM62", - filename=True, # to ensure spaced inputs are quoted - equate=False, - ), - # Incorporate the AA/nuc composition information into the scoring - # matrix. 
Default: off - _Switch( - ["--fmodel", "fmodel"], - "Incorporate the AA/nuc composition information into " - "the scoring matrix (True) or not (False, default)", - ), - # **** Output **** - # Name length for CLUSTAL and PHYLIP format output - _Option( - ["--namelength", "namelength"], - """Name length in CLUSTAL and PHYLIP output. - - MAFFT v6.847 (2011) added --namelength for use with - the --clustalout option for CLUSTAL output. - - MAFFT v7.024 (2013) added support for this with the - --phylipout option for PHYLIP output (default 10). - """, - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Output format: clustal format. Default: off (fasta format) - _Switch( - ["--clustalout", "clustalout"], - "Output format: clustal (True) or fasta (False, default)", - ), - # Output format: phylip format. - # Added in beta with v6.847, fixed in v6.850 (2011) - _Switch( - ["--phylipout", "phylipout"], - "Output format: phylip (True), or fasta (False, default)", - ), - # Output order: same as input. Default: on - _Switch( - ["--inputorder", "inputorder"], - "Output order: same as input (True, default) or alignment " - "based (False)", - ), - # Output order: aligned. Default: off (inputorder) - _Switch( - ["--reorder", "reorder"], - "Output order: aligned (True) or in input order (False, default)", - ), - # Guide tree is output to the input.tree file. Default: off - _Switch( - ["--treeout", "treeout"], - "Guide tree is output to the input.tree file (True) or " - "not (False, default)", - ), - # Do not report progress. Default: off - _Switch( - ["--quiet", "quiet"], - "Do not report progress (True) or not (False, default).", - ), - # **** Input **** - # Assume the sequences are nucleotide. Default: auto - _Switch( - ["--nuc", "nuc"], - "Assume the sequences are nucleotide (True/False). Default: auto", - ), - # Assume the sequences are amino acid. Default: auto - _Switch( - ["--amino", "amino"], - "Assume the sequences are amino acid (True/False). 
Default: auto", - ), - # MAFFT has multiple --seed commands where the unaligned input is - # aligned to the seed alignment. There can be multiple seeds in the - # form: "mafft --seed align1 --seed align2 [etc] input" - # Effectively for n number of seed alignments. - # TODO - Can we use class _ArgumentList here? - _Option( - ["--seed", "seed"], - "Seed alignments given in alignment_n (fasta format) " - "are aligned with sequences in input.", - filename=True, - equate=False, - ), - # The input (must be FASTA format) - _Argument(["input"], "Input file name", filename=True, is_required=True), - # mafft-profile takes a second alignment input as an argument: - # mafft-profile align1 align2 - _Argument( - ["input1"], - "Second input file name for the mafft-profile command", - filename=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Muscle.py b/Bio/Align/Applications/_Muscle.py deleted file mode 100644 index f4f72e12e..000000000 --- a/Bio/Align/Applications/_Muscle.py +++ /dev/null @@ -1,686 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program MUSCLE.""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class MuscleCommandline(AbstractCommandline): - r"""Command line wrapper for the multiple alignment program MUSCLE. - - http://www.drive5.com/muscle/ - - Notes - ----- - Last checked against version: 3.7, briefly against 3.8 - - References - ---------- - Edgar, Robert C. 
(2004), MUSCLE: multiple sequence alignment with high - accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. - - Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with - reduced time and space complexity. BMC Bioinformatics 5(1): 113. - - Examples - -------- - >>> from Bio.Align.Applications import MuscleCommandline - >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" - >>> in_file = r"C:\My Documents\unaligned.fasta" - >>> out_file = r"C:\My Documents\aligned.fasta" - >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) - >>> print(muscle_cline) - "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" - - You would typically run the command line with muscle_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="muscle", **kwargs): - """Initialize the class.""" - CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] - DISTANCE_MEASURES_ITER1 = [ - "kmer6_6", - "kmer20_3", - "kmer20_4", - "kbit20_3", - "kmer4_6", - ] - DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [ - "pctid_kimura", - "pctid_log", - ] - OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] - TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] - - # The mucleotide arguments for the sequence type parameter in MUSCLE (-seqtype) - # were updated at somepoint in MUSCLE version 3.8. Prior to the update - # 'nucleo' was used for nucleotide. This has been updated to 'rna' and 'dna'. 'nucleo' kept for - # backwards compatibility with older MUSCLE versions. 
- SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"] - WEIGHTING_SCHEMES = [ - "none", - "clustalw", - "henikoff", - "henikoffpb", - "gsc", - "threeway", - ] - self.parameters = [ - # Can't use "in" as the final alias as this - # is a reserved word in python: - _Option( - ["-in", "in", "input"], "Input filename", filename=True, equate=False - ), - _Option(["-out", "out"], "Output filename", filename=True, equate=False), - _Switch( - ["-diags", "diags"], "Find diagonals (faster for similar sequences)" - ), - _Switch(["-profile", "profile"], "Perform a profile alignment"), - _Option( - ["-in1", "in1"], - "First input filename for profile alignment", - filename=True, - equate=False, - ), - _Option( - ["-in2", "in2"], - "Second input filename for a profile alignment", - filename=True, - equate=False, - ), - # anchorspacing Integer 32 Minimum spacing - # between anchor cols - _Option( - ["-anchorspacing", "anchorspacing"], - "Minimum spacing between anchor columns", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # center Floating point [1] Center parameter. - # Should be negative. - _Option( - ["-center", "center"], - "Center parameter - should be negative", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # cluster1 upgma upgmb Clustering method. - _Option( - ["-cluster1", "cluster1"], - "Clustering method used in iteration 1", - checker_function=lambda x: x in CLUSTERING_ALGORITHMS, - equate=False, - ), - # cluster2 upgmb cluster1 is used - # neighborjoining in iteration 1 and - # 2, cluster2 in - # later iterations. - _Option( - ["-cluster2", "cluster2"], - "Clustering method used in iteration 2", - checker_function=lambda x: x in CLUSTERING_ALGORITHMS, - equate=False, - ), - # diaglength Integer 24 Minimum length of - # diagonal. 
- _Option( - ["-diaglength", "diaglength"], - "Minimum length of diagonal", - checker_function=lambda x: isinstance(x, int), - equate=True, - ), - # diagmargin Integer 5 Discard this many - # positions at ends - # of diagonal. - _Option( - ["-diagmargin", "diagmargin"], - "Discard this many positions at ends of diagonal", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # distance1 kmer6_6 Kmer6_6(amino) or Distance measure - # kmer20_3 Kmer4_6(nucleo) for iteration 1 - # kmer20_4 - # kbit20_3 - # kmer4_6 - _Option( - ["-distance1", "distance1"], - "Distance measure for iteration 1", - checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, - equate=False, - ), - # distance2 kmer6_6 pctid_kimura Distance measure - # kmer20_3 for iterations - # kmer20_4 2, 3 ... - # kbit20_3 - # pctid_kimura - # pctid_log - _Option( - ["-distance2", "distance2"], - "Distance measure for iteration 2", - checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, - equate=False, - ), - # gapextend Floating point [1] The gap extend score - _Option( - ["-gapextend", "gapextend"], - "Gap extension penalty", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # gapopen Floating point [1] The gap open score - # Must be negative. - _Option( - ["-gapopen", "gapopen"], - "Gap open score - negative number", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # hydro Integer 5 Window size for - # determining whether - # a region is - # hydrophobic. - _Option( - ["-hydro", "hydro"], - "Window size for hydrophobic region", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # hydrofactor Floating point 1.2 Multiplier for gap - # open/close - # penalties in - # hydrophobic regions - _Option( - ["-hydrofactor", "hydrofactor"], - "Multiplier for gap penalties in hydrophobic regions", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # log File name None. 
Log file name - # (delete existing - # file). - _Option(["-log", "log"], "Log file name", filename=True, equate=False), - # loga File name None. Log file name - # (append to existing - # file). - _Option( - ["-loga", "loga"], - "Log file name (append to existing file)", - filename=True, - equate=False, - ), - # matrix File name None. File name for - # substitution matrix - # in NCBI or WU-BLAST - # format. If you - # specify your own - # matrix, you should - # also specify: - # -gapopen - # -gapextend - # -center 0.0 - _Option( - ["-matrix", "matrix"], - "path to NCBI or WU-BLAST format protein substitution " - "matrix - also set -gapopen, -gapextend and -center", - filename=True, - equate=False, - ), - # diagbreak Integer 1 Maximum distance - # between two - # diagonals that - # allows them to - # merge into one - # diagonal. - _Option( - ["-diagbreak", "diagbreak"], - "Maximum distance between two diagonals that allows " - "them to merge into one diagonal", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8 - "Deprecated in v3.8, use -diagbreak instead.", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # maxhours Floating point None. Maximum time to - # run in hours. The - # actual time may - # exceed requested - # limit by a few - # minutes. Decimals - # are allowed, so 1.5 - # means one hour and - # 30 minutes. - _Option( - ["-maxhours", "maxhours"], - "Maximum time to run in hours", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # maxiters Integer 1, 2 ... 16 Maximum number of - # iterations. - _Option( - ["-maxiters", "maxiters"], - "Maximum number of iterations", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # maxtrees Integer 1 Maximum number of - # new trees to build - # in iteration 2. 
- _Option( - ["-maxtrees", "maxtrees"], - "Maximum number of trees to build in iteration 2", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # minbestcolscore Floating point [1] Minimum score a - # column must have to - # be an anchor. - _Option( - ["-minbestcolscore", "minbestcolscore"], - "Minimum score a column must have to be an anchor", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # minsmoothscore Floating point [1] Minimum smoothed - # score a column must - # have to be an - # anchor. - _Option( - ["-minsmoothscore", "minsmoothscore"], - "Minimum smoothed score a column must have to be an anchor", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # objscore sp spm Objective score - # ps used by tree - # dp dependent - # xp refinement. - # spf sp=sum-of-pairs - # spm score. (dimer - # approximation) - # spm=sp for < 100 - # seqs, otherwise spf - # dp=dynamic - # programming score. - # ps=average profile- - # sequence score. - # xp=cross profile - # score. - _Option( - ["-objscore", "objscore"], - "Objective score used by tree dependent refinement", - checker_function=lambda x: x in OBJECTIVE_SCORES, - equate=False, - ), - # refinewindow Integer 200 Length of window - # for -refinew. - _Option( - ["-refinewindow", "refinewindow"], - "Length of window for -refinew", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # root1 pseudo pseudo Method used to root - _Option( - ["-root1", "root1"], - "Method used to root tree in iteration 1", - checker_function=lambda x: x in TREE_ROOT_METHODS, - equate=False, - ), - # root2 midlongestspan tree; root1 is - # minavgleafdist used in iteration 1 - # and 2, root2 in - # later iterations. - _Option( - ["-root2", "root2"], - "Method used to root tree in iteration 2", - checker_function=lambda x: x in TREE_ROOT_METHODS, - equate=False, - ), - # scorefile File name None File name where to - # write a score file. 
- # This contains one - # line for each column - # in the alignment. - # The line contains - # the letters in the - # column followed by - # the average BLOSUM62 - # score over pairs of - # letters in the - # column. - _Option( - ["-scorefile", "scorefile"], - "Score file name, contains one line for each column" - " in the alignment with average BLOSUM62 score", - filename=True, - equate=False, - ), - # seqtype protein auto Sequence type. - # dna (MUSCLE version > 3.8) - # rna (MUSCLE version > 3.8) - # auto - # nucleo (only valid for MUSCLE versions < 3.8) - _Option( - ["-seqtype", "seqtype"], - "Sequence type", - checker_function=lambda x: x in SEQUENCE_TYPES, - equate=False, - ), - # smoothscoreceil Floating point [1] Maximum value of - # column score for - # smoothing purposes. - _Option( - ["-smoothscoreceil", "smoothscoreceil"], - "Maximum value of column score for smoothing", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # smoothwindow Integer 7 Window used for - # anchor column - # smoothing. - _Option( - ["-smoothwindow", "smoothwindow"], - "Window used for anchor column smoothing", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # spscore File name Compute SP - # objective score of - # multiple alignment. - _Option( - ["-spscore", "spscore"], - "Compute SP objective score of multiple alignment", - filename=True, - equate=False, - ), - # SUEFF Floating point value 0.1 Constant used in - # between 0 and 1. UPGMB clustering. - # Determines the - # relative fraction - # of average linkage - # (SUEFF) vs. nearest - # neighbor linkage - # (1 SUEFF). 
- _Option( - ["-sueff", "sueff"], - "Constant used in UPGMB clustering", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - # tree1 File name None Save tree - _Option( - ["-tree1", "tree1"], "Save Newick tree from iteration 1", equate=False - ), - # tree2 first or second - # iteration to given - # file in Newick - # (Phylip-compatible) - # format. - _Option( - ["-tree2", "tree2"], "Save Newick tree from iteration 2", equate=False - ), - # usetree File name None Use given tree as - # guide tree. Must by - # in Newick - # (Phyip-compatible) - # format. - _Option( - ["-usetree", "usetree"], - "Use given Newick tree as guide tree", - filename=True, - equate=False, - ), - # weight1 none clustalw Sequence weighting - _Option( - ["-weight1", "weight1"], - "Weighting scheme used in iteration 1", - checker_function=lambda x: x in WEIGHTING_SCHEMES, - equate=False, - ), - # weight2 henikoff scheme. - # henikoffpb weight1 is used in - # gsc iterations 1 and 2. - # clustalw weight2 is used for - # threeway tree-dependent - # refinement. - # none=all sequences - # have equal weight. - # henikoff=Henikoff & - # Henikoff weighting - # scheme. - # henikoffpb=Modified - # Henikoff scheme as - # used in PSI-BLAST. - # clustalw=CLUSTALW - # method. - # threeway=Gotoh - # three-way method. - _Option( - ["-weight2", "weight2"], - "Weighting scheme used in iteration 2", - checker_function=lambda x: x in WEIGHTING_SCHEMES, - equate=False, - ), - # ################### FORMATS #################################### - # Multiple formats can be specified on the command line - # If -msf appears it will be used regardless of other formats - # specified. If -clw appears (and not -msf), clustalw format will - # be used regardless of other formats specified. If both -clw and - # -clwstrict are specified -clwstrict will be used regardless of - # other formats specified. If -fasta is specified and not -msf, - # -clw, or clwstrict, fasta will be used. 
If -fasta and -html are - # specified -fasta will be used. Only if -html is specified alone - # will html be used. I kid ye not. - # clw no Write output in CLUSTALW format - # (default is FASTA). - _Switch( - ["-clw", "clw"], - "Write output in CLUSTALW format (with a MUSCLE header)", - ), - # clwstrict no Write output in CLUSTALW format with - # the "CLUSTAL W (1.81)" header rather - # than the MUSCLE version. This is - # useful when a post-processing step is - # picky about the file header. - _Switch( - ["-clwstrict", "clwstrict"], - "Write output in CLUSTALW format with version 1.81 header", - ), - # fasta yes Write output in FASTA format. - # Alternatives include clw, - # clwstrict, msf and html. - _Switch(["-fasta", "fasta"], "Write output in FASTA format"), - # html no Write output in HTML format (default - # is FASTA). - _Switch(["-html", "html"], "Write output in HTML format"), - # msf no Write output in MSF format (default - # is FASTA). - _Switch(["-msf", "msf"], "Write output in MSF format"), - # Phylip interleaved - undocumented as of 3.7 - _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"), - # Phylip sequential - undocumented as of 3.7 - _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"), - # ################# Additional specified output files ######### - _Option( - ["-phyiout", "phyiout"], - "Write PHYLIP interleaved output to specified filename", - filename=True, - equate=False, - ), - _Option( - ["-physout", "physout"], - "Write PHYLIP sequential format to specified filename", - filename=True, - equate=False, - ), - _Option( - ["-htmlout", "htmlout"], - "Write HTML output to specified filename", - filename=True, - equate=False, - ), - _Option( - ["-clwout", "clwout"], - "Write CLUSTALW output (with MUSCLE header) to specified filename", - filename=True, - equate=False, - ), - _Option( - ["-clwstrictout", "clwstrictout"], - "Write CLUSTALW output (with version 1.81 header) to " - "specified filename", - 
filename=True, - equate=False, - ), - _Option( - ["-msfout", "msfout"], - "Write MSF format output to specified filename", - filename=True, - equate=False, - ), - _Option( - ["-fastaout", "fastaout"], - "Write FASTA format output to specified filename", - filename=True, - equate=False, - ), - # ############# END FORMATS ################################### - # anchors yes Use anchor optimization in tree - # dependent refinement iterations. - _Switch( - ["-anchors", "anchors"], - "Use anchor optimisation in tree dependent refinement iterations", - ), - # noanchors no Disable anchor optimization. Default - # is anchors. - _Switch( - ["-noanchors", "noanchors"], - "Do not use anchor optimisation in tree dependent " - "refinement iterations", - ), - # brenner no Use Steven Brenner's method for - # computing the root alignment. - _Switch( - ["-brenner", "brenner"], "Use Steve Brenner's root alignment method" - ), - # cluster no Perform fast clustering of input - # sequences. Use the tree1 option to - # save the tree. - _Switch( - ["-cluster", "cluster"], - "Perform fast clustering of input sequences, " - "use -tree1 to save tree", - ), - # dimer no Use dimer approximation for the - # SP score (faster, less accurate). - _Switch( - ["-dimer", "dimer"], - "Use faster (slightly less accurate) dimer approximation" - "for the SP score", - ), - # group yes Group similar sequences together - # in the output. This is the default. - # See also stable. - _Switch(["-group", "group"], "Group similar sequences in output"), - # ############# log-expectation profile score #################### - # One of either -le, -sp, or -sv - # - # According to the doc, spn is default and the only option for - # nucleotides: this doesn't appear to be true. -le, -sp, and -sv - # can be used and produce numerically different logs - # (what is going on?) - # - # spn fails on proteins - # le maybe Use log-expectation profile score - # (VTML240). Alternatives are to use sp - # or sv. 
This is the default for amino - # acid sequences. - _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"), - # sv no Use sum-of-pairs profile score - # (VTML240). Default is le. - _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"), - # sp no Use sum-of-pairs protein profile - # score (PAM200). Default is le. - _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"), - # spn maybe Use sum-of-pairs nucleotide profile - # score (BLASTZ parameters). This is - # the only option for nucleotides, - # and is therefore the default. - _Switch( - ["-spn", "spn"], "Use sum-of-pairs protein nucleotide profile score" - ), - # ########## END log-expectation profile score ################### - # quiet no Do not display progress messages. - _Switch(["-quiet", "quiet"], "Do not display progress messages"), - # refine no Input file is already aligned, skip - # first two iterations and begin tree - # dependent refinement. - _Switch(["-refine", "refine"], "Only do tree dependent refinement"), - # refinew no Refine an alignment by dividing it - # into non-overlapping windows and - # re-aligning each window. Typically - # used for whole-genome nucleotide - # alignments. - _Switch( - ["-refinew", "refinew"], - "Only do tree dependent refinement using sliding window approach", - ), - # core yes in muscle, Do not catch exceptions. - # no in muscled. - _Switch(["-core", "core"], "Do not catch exceptions"), - # nocore no in muscle, Catch exceptions and give an - # yes in muscled. error message if possible. - _Switch(["-nocore", "nocore"], "Catch exceptions"), - # stable no Preserve input order of sequences - # in output file. Default is to group - # sequences by similarity (group). - _Switch( - ["-stable", "stable"], - "Do not group similar sequences in output (not supported in v3.8)", - ), - # termgaps4 yes Use 4-way test for treatment of - # terminal gaps. - # (Cannot be disabled in this version). 
- # - # termgapsfull no Terminal gaps penalized with - # full penalty. [1] Not fully - # supported in this version - # - # termgapshalf yes Terminal gaps penalized with - # half penalty. [1] Not fully - # supported in this version - # - # termgapshalflonger no Terminal gaps penalized with - # half penalty if gap relative - # to longer sequence, otherwise with - # full penalty. [1] Not fully - # supported in this version - # - # verbose no Write parameter settings and - # progress messages to log file. - _Switch(["-verbose", "verbose"], "Write parameter settings and progress"), - # version no Write version string to - # stdout and exit - _Switch(["-version", "version"], "Write version string to stdout and exit"), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Prank.py b/Bio/Align/Applications/_Prank.py deleted file mode 100644 index 3f4da6220..000000000 --- a/Bio/Align/Applications/_Prank.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program PRANK.""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class PrankCommandline(AbstractCommandline): - """Command line wrapper for the multiple alignment program PRANK. - - http://www.ebi.ac.uk/goldman-srv/prank/prank/ - - Notes - ----- - Last checked against version: 081202 - - References - ---------- - Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive - multiple alignment of sequences with insertions. 
Proceedings of - the National Academy of Sciences, 102: 10557--10562. - - Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement - prevents errors in sequence alignment and evolutionary analysis. - Science, 320: 1632. - - Examples - -------- - To align a FASTA file (unaligned.fasta) with the output in aligned - FASTA format with the output filename starting with "aligned" (you - can't pick the filename explicitly), no tree output and no XML output, - use: - - >>> from Bio.Align.Applications import PrankCommandline - >>> prank_cline = PrankCommandline(d="unaligned.fasta", - ... o="aligned", # prefix only! - ... f=8, # FASTA output - ... notree=True, noxml=True) - >>> print(prank_cline) - prank -d=unaligned.fasta -o=aligned -f=8 -noxml -notree - - You would typically run the command line with prank_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="prank", **kwargs): - """Initialize the class.""" - OUTPUT_FORMAT_VALUES = list(range(1, 18)) - self.parameters = [ - # ################# input/output parameters: ################## - # -d=sequence_file - _Option(["-d", "d"], "Input filename", filename=True, is_required=True), - # -t=tree_file [default: no tree, generate approximate NJ tree] - _Option(["-t", "t"], "Input guide tree filename", filename=True), - # -tree="tree_string" [tree in newick format; in double quotes] - _Option(["-tree", "tree"], "Input guide tree as Newick string"), - # -m=model_file [default: HKY2/WAG] - _Option( - ["-m", "m"], "User-defined alignment model filename. Default: HKY2/WAG" - ), - # -o=output_file [default: 'output'] - _Option( - ["-o", "o"], - "Output filenames prefix. Default: 'output'\n " - "Will write: output.?.fas (depending on requested " - "format), output.?.xml and output.?.dnd", - filename=True, - ), - # -f=output_format [default: 8] - _Option( - ["-f", "f"], - "Output alignment format. Default: 8 FASTA\n" - "Option are:\n" - "1. IG/Stanford 8. 
Pearson/Fasta\n" - "2. GenBank/GB 11. Phylip3.2\n" - "3. NBRF 12. Phylip\n" - "4. EMBL 14. PIR/CODATA\n" - "6. DNAStrider 15. MSF\n" - "7. Fitch 17. PAUP/NEXUS", - checker_function=lambda x: x in OUTPUT_FORMAT_VALUES, - ), - _Switch( - ["-noxml", "noxml"], - "Do not output XML files (PRANK versions earlier than v.120626)", - ), - _Switch( - ["-notree", "notree"], - "Do not output dnd tree files (PRANK versions earlier than v.120626)", - ), - _Switch( - ["-showxml", "showxml"], "Output XML files (PRANK v.120626 and later)" - ), - _Switch( - ["-showtree", "showtree"], - "Output dnd tree files (PRANK v.120626 and later)", - ), - _Switch(["-shortnames", "shortnames"], "Truncate names at first space"), - _Switch(["-quiet", "quiet"], "Reduce verbosity"), - # ###################### model parameters: ###################### - # +F [force insertions to be always skipped] - # -F [equivalent] - _Switch( - ["-F", "+F", "F"], "Force insertions to be always skipped: same as +F" - ), - # -dots [show insertion gaps as dots] - _Switch(["-dots", "dots"], "Show insertion gaps as dots"), - # -gaprate=# [gap opening rate; default: dna 0.025 / prot 0.0025] - _Option( - ["-gaprate", "gaprate"], - "Gap opening rate. Default: dna 0.025 prot 0.0025", - checker_function=lambda x: isinstance(x, float), - ), - # -gapext=# [gap extension probability; default: dna 0.5 / prot 0.5] - _Option( - ["-gapext", "gapext"], - "Gap extension probability. Default: dna 0.5 / prot 0.5", - checker_function=lambda x: isinstance(x, float), - ), - # -dnafreqs=#,#,#,# [ACGT; default: empirical] - _Option( - ["-dnafreqs", "dnafreqs"], - "DNA frequencies - 'A,C,G,T'. eg '25,25,25,25' as a quote " - "surrounded string value. Default: empirical", - checker_function=lambda x: isinstance(x, bytes), - ), - # -kappa=# [ts/tv rate ratio; default:2] - _Option( - ["-kappa", "kappa"], - "Transition/transversion ratio. 
Default: 2", - checker_function=lambda x: isinstance(x, int), - ), - # -rho=# [pur/pyr rate ratio; default:1] - _Option( - ["-rho", "rho"], - "Purine/pyrimidine ratio. Default: 1", - checker_function=lambda x: isinstance(x, int), - ), - # -codon [for DNA: use empirical codon model] - _Switch(["-codon", "codon"], "Codon aware alignment or not"), - # -termgap [penalise terminal gaps normally] - _Switch(["-termgap", "termgap"], "Penalise terminal gaps normally"), - # ############### other parameters: ################################ - # -nopost [do not compute posterior support; default: compute] - _Switch( - ["-nopost", "nopost"], - "Do not compute posterior support. Default: compute", - ), - # -pwdist=# [expected pairwise distance for computing guidetree; - # default: dna 0.25 / prot 0.5] - _Option( - ["-pwdist", "pwdist"], - "Expected pairwise distance for computing guidetree. " - "Default: dna 0.25 / prot 0.5", - checker_function=lambda x: isinstance(x, float), - ), - _Switch( - ["-once", "once"], "Run only once. 
Default: twice if no guidetree given" - ), - _Switch(["-twice", "twice"], "Always run twice"), - _Switch(["-skipins", "skipins"], "Skip insertions in posterior support"), - _Switch( - ["-uselogs", "uselogs"], - "Slower but should work for a greater number of sequences", - ), - _Switch(["-writeanc", "writeanc"], "Output ancestral sequences"), - _Switch( - ["-printnodes", "printnodes"], "Output each node; mostly for debugging" - ), - # -matresize=# [matrix resizing multiplier] - # Doesn't specify type but Float and Int work - _Option( - ["-matresize", "matresize"], - "Matrix resizing multiplier", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - # -matinitsize=# [matrix initial size multiplier] - # Doesn't specify type but Float and Int work - _Option( - ["-matinitsize", "matinitsize"], - "Matrix initial size multiplier", - checker_function=lambda x: (isinstance(x, (float, int))), - ), - _Switch(["-longseq", "longseq"], "Save space in pairwise alignments"), - _Switch(["-pwgenomic", "pwgenomic"], "Do pairwise alignment, no guidetree"), - # -pwgenomicdist=# [distance for pairwise alignment; default: 0.3] - _Option( - ["-pwgenomicdist", "pwgenomicdist"], - "Distance for pairwise alignment. Default: 0.3", - checker_function=lambda x: isinstance(x, float), - ), - # -scalebranches=# [scale branch lengths; default: dna 1 / prot 2] - _Option( - ["-scalebranches", "scalebranches"], - "Scale branch lengths. 
Default: dna 1 / prot 2", - checker_function=lambda x: isinstance(x, int), - ), - # -fixedbranches=# [use fixed branch lengths] - # Assume looking for a float - _Option( - ["-fixedbranches", "fixedbranches"], - "Use fixed branch lengths of input value", - checker_function=lambda x: isinstance(x, float), - ), - # -maxbranches=# [set maximum branch length] - # Assume looking for a float - _Option( - ["-maxbranches", "maxbranches"], - "Use maximum branch lengths of input value", - checker_function=lambda x: isinstance(x, float), - ), - # -realbranches [disable branch length truncation] - _Switch( - ["-realbranches", "realbranches"], "Disable branch length truncation" - ), - _Switch(["-translate", "translate"], "Translate to protein"), - _Switch( - ["-mttranslate", "mttranslate"], "Translate to protein using mt table" - ), - # ##################### other: #################### - _Switch( - ["-convert", "convert"], - "Convert input alignment to new format. Do not perform alignment", - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_Probcons.py b/Bio/Align/Applications/_Probcons.py deleted file mode 100644 index 41ae07e88..000000000 --- a/Bio/Align/Applications/_Probcons.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. 
-"""Command line wrapper for the multiple alignment program PROBCONS.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class ProbconsCommandline(AbstractCommandline): - """Command line wrapper for the multiple alignment program PROBCONS. - - http://probcons.stanford.edu/ - - Notes - ----- - Last checked against version: 1.12 - - References - ---------- - Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005. - PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment. - Genome Research 15: 330-340. - - Examples - -------- - To align a FASTA file (unaligned.fasta) with the output in ClustalW - format, and otherwise default settings, use: - - >>> from Bio.Align.Applications import ProbconsCommandline - >>> probcons_cline = ProbconsCommandline(input="unaligned.fasta", - ... clustalw=True) - >>> print(probcons_cline) - probcons -clustalw unaligned.fasta - - You would typically run the command line with probcons_cline() or via - the Python subprocess module, as described in the Biopython tutorial. 
- - Note that PROBCONS will write the alignment to stdout, which you may - want to save to a file and then parse, e.g.:: - - stdout, stderr = probcons_cline() - with open("aligned.aln", "w") as handle: - handle.write(stdout) - from Bio import AlignIO - align = AlignIO.read("aligned.fasta", "clustalw") - - Alternatively, to parse the output with AlignIO directly you can - use StringIO to turn the string into a handle:: - - stdout, stderr = probcons_cline() - from io import StringIO - from Bio import AlignIO - align = AlignIO.read(StringIO(stdout), "clustalw") - - """ - - def __init__(self, cmd="probcons", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Note that some options cannot be assigned via properties using the - # original documented option (because hyphens are not valid for names in - # python), e.g cmdline.pre-training = 3 will not work - # In these cases the shortened option name should be used - # cmdline.pre = 3 - _Switch( - ["-clustalw", "clustalw"], "Use CLUSTALW output format instead of MFA" - ), - _Option( - ["-c", "c", "--consistency", "consistency"], - "Use 0 <= REPS <= 5 (default: 2) passes of consistency transformation", - checker_function=lambda x: x in range(6), - equate=False, - ), - _Option( - ["-ir", "--iterative-refinement", "iterative-refinement", "ir"], - "Use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement", - checker_function=lambda x: x in range(1001), - equate=False, - ), - _Option( - ["-pre", "--pre-training", "pre-training", "pre"], - "Use 0 <= REPS <= 20 (default: 0) rounds of pretraining", - checker_function=lambda x: x in range(21), - equate=False, - ), - _Switch(["-pairs", "pairs"], "Generate all-pairs pairwise alignments"), - _Switch( - ["-viterbi", "viterbi"], - "Use Viterbi algorithm to generate all pairs " - "(automatically enables -pairs)", - ), - _Switch( - ["-verbose", "verbose"], "Report progress while aligning (default: off)" - ), - _Option( - ["-annot", "annot"], - "Write annotation 
for multiple alignment to FILENAME", - equate=False, - ), - _Option( - ["-t", "t", "--train", "train"], - "Compute EM transition probabilities, store in FILENAME " - "(default: no training)", - equate=False, - ), - _Switch( - ["-e", "e", "--emissions", "emissions"], - "Also reestimate emission probabilities (default: off)", - ), - _Option( - ["-p", "p", "--paramfile", "paramfile"], - "Read parameters from FILENAME", - equate=False, - ), - _Switch( - ["-a", "--alignment-order", "alignment-order", "a"], - "Print sequences in alignment order rather than input " - "order (default: off)", - ), - # Input file name - _Argument( - ["input"], - "Input file name. Must be multiple FASTA alignment (MFA) format", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/_TCoffee.py b/Bio/Align/Applications/_TCoffee.py deleted file mode 100644 index 6dd4dbce5..000000000 --- a/Bio/Align/Applications/_TCoffee.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2009 by Cymon J. Cox and Brad Chapman. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the multiple alignment program TCOFFEE.""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class TCoffeeCommandline(AbstractCommandline): - """Commandline object for the TCoffee alignment program. - - http://www.tcoffee.org/Projects_home_page/t_coffee_home_page.html - - The T-Coffee command line tool has a lot of switches and options. - This wrapper implements a VERY limited number of options - if you - would like to help improve it please get in touch. 
- - Notes - ----- - Last checked against: Version_6.92 - - References - ---------- - T-Coffee: A novel method for multiple sequence alignments. - Notredame, Higgins, Heringa, JMB,302(205-217) 2000 - - Examples - -------- - To align a FASTA file (unaligned.fasta) with the output in ClustalW - format (file aligned.aln), and otherwise default settings, use: - - >>> from Bio.Align.Applications import TCoffeeCommandline - >>> tcoffee_cline = TCoffeeCommandline(infile="unaligned.fasta", - ... output="clustalw", - ... outfile="aligned.aln") - >>> print(tcoffee_cline) - t_coffee -output clustalw -infile unaligned.fasta -outfile aligned.aln - - You would typically run the command line with tcoffee_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - SEQ_TYPES = ["dna", "protein", "dna_protein"] - - def __init__(self, cmd="t_coffee", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-output", "output"], - """Specify the output type. - - One (or more separated by a comma) of: - 'clustalw_aln', 'clustalw', 'gcg', 'msf_aln', - 'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq' - """, - equate=False, - ), - _Option( - ["-infile", "infile"], - "Specify the input file.", - filename=True, - is_required=True, - equate=False, - ), - # Indicates the name of the alignment output by t_coffee. If the - # default is used, the alignment is named .aln - _Option( - ["-outfile", "outfile"], - "Specify the output file. 
Default: .aln", - filename=True, - equate=False, - ), - _Switch( - ["-convert", "convert"], "Specify you want to perform a file conversion" - ), - _Option( - ["-type", "type"], - "Specify the type of sequence being aligned", - checker_function=lambda x: x in self.SEQ_TYPES, - equate=False, - ), - _Option( - ["-outorder", "outorder"], - "Specify the order of sequence to output" - "Either 'input', 'aligned' or of " - "Fasta file with sequence order", - equate=False, - ), - _Option( - ["-matrix", "matrix"], - "Specify the filename of the substitution matrix to use. " - "Default: blosum62mt", - equate=False, - ), - _Option( - ["-gapopen", "gapopen"], - "Indicates the penalty applied for opening a gap (negative integer)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-gapext", "gapext"], - "Indicates the penalty applied for extending a gap (negative integer)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Switch(["-quiet", "quiet"], "Turn off log output"), - _Option( - ["-mode", "mode"], - "Specifies a special mode: genome, quickaln, dali, 3dcoffee", - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Align/Applications/__init__.py b/Bio/Align/Applications/__init__.py deleted file mode 100644 index 2f7e9679c..000000000 --- a/Bio/Align/Applications/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2009 by Peter Cock & Cymon J. Cox. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Alignment command line tool wrappers (OBSOLETE). 
- -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from ._ClustalOmega import ClustalOmegaCommandline -from ._Clustalw import ClustalwCommandline -from ._Dialign import DialignCommandline -from ._Mafft import MafftCommandline -from ._MSAProbs import MSAProbsCommandline -from ._Muscle import MuscleCommandline -from ._Prank import PrankCommandline -from ._Probcons import ProbconsCommandline -from ._TCoffee import TCoffeeCommandline - -# Make this explicit, then they show up in the API docs -__all__ = ( - "MuscleCommandline", - "ClustalwCommandline", - "ClustalOmegaCommandline", - "PrankCommandline", - "MafftCommandline", - "DialignCommandline", - "ProbconsCommandline", - "TCoffeeCommandline", - "MSAProbsCommandline", -) diff --git a/Bio/Application/__init__.py b/Bio/Application/__init__.py deleted file mode 100644 index 47d2a8a88..000000000 --- a/Bio/Application/__init__.py +++ /dev/null @@ -1,855 +0,0 @@ -# Copyright 2001-2004 Brad Chapman. -# Revisions copyright 2009-2013 by Peter Cock. -# All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""General mechanisms to access applications in Biopython (DEPRECATED). - -This module is not intended for direct use. It provides the basic objects which -are subclassed by our command line wrappers, such as: - - - Bio.Align.Applications - - Bio.Blast.Applications - - Bio.Emboss.Applications - - Bio.Sequencing.Applications - -These modules provide wrapper classes for command line tools to help you -construct command line strings by setting the values of each parameter. -The finished command line strings are then normally invoked via the built-in -Python module subprocess. 
- -Due to the on going maintenance burden or keeping command line application -wrappers up to date, we have decided to deprecate and eventually remove them. -We instead now recommend building your command line and invoking it directly -with the subprocess module. -""" - -import os -import platform -import re -import subprocess -import sys -import warnings - -from Bio import BiopythonDeprecationWarning - -warnings.warn( - """\ -The Bio.Application modules and modules relying on it have been deprecated. - -Due to the on going maintenance burden of keeping command line application -wrappers up to date, we have decided to deprecate and eventually remove these -modules. - -We instead now recommend building your command line and invoking it directly -with the subprocess module.""", - BiopythonDeprecationWarning, -) - - -# Use this regular expression to test the property names are going to -# be valid as Python properties or arguments -_re_prop_name = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$") -assert _re_prop_name.match("t") -assert _re_prop_name.match("test") -assert _re_prop_name.match("_test") is None # we don't want private names -assert _re_prop_name.match("-test") is None -assert _re_prop_name.match("any-hyphen") is None -assert _re_prop_name.match("underscore_ok") -assert _re_prop_name.match("test_name") -assert _re_prop_name.match("test2") -# These are reserved names in Python itself, -_reserved_names = [ - "and", - "del", - "from", - "not", - "while", - "as", - "elif", - "global", - "or", - "with", - "assert", - "else", - "if", - "pass", - "yield", - "break", - "except", - "import", - "print", - "class", - "exec", - "in", - "raise", - "continue", - "finally", - "is", - "return", - "def", - "for", - "lambda", - "try", -] -# These are reserved names due to the way the wrappers work -_local_reserved_names = ["set_parameter"] - - -class ApplicationError(subprocess.CalledProcessError): - """Raised when an application returns a non-zero exit status (OBSOLETE). 
- - The exit status will be stored in the returncode attribute, similarly - the command line string used in the cmd attribute, and (if captured) - stdout and stderr as strings. - - This exception is a subclass of subprocess.CalledProcessError. - - >>> err = ApplicationError(-11, "helloworld", "", "Some error text") - >>> err.returncode, err.cmd, err.stdout, err.stderr - (-11, 'helloworld', '', 'Some error text') - >>> print(err) - Non-zero return code -11 from 'helloworld', message 'Some error text' - - """ - - def __init__(self, returncode, cmd, stdout="", stderr=""): - """Initialize the class.""" - self.returncode = returncode - self.cmd = cmd - self.stdout = stdout - self.stderr = stderr - - def __str__(self): - """Format the error as a string.""" - # get first line of any stderr message - try: - msg = self.stderr.lstrip().split("\n", 1)[0].rstrip() - except Exception: # TODO, ValueError? AttributeError? - msg = "" - if msg: - return "Non-zero return code %d from %r, message %r" % ( - self.returncode, - self.cmd, - msg, - ) - else: - return "Non-zero return code %d from %r" % (self.returncode, self.cmd) - - def __repr__(self): - """Represent the error as a string.""" - return "ApplicationError(%i, %s, %s, %s)" % ( - self.returncode, - self.cmd, - self.stdout, - self.stderr, - ) - - -class AbstractCommandline: - r"""Generic interface for constructing command line strings (OBSOLETE). - - This class shouldn't be called directly; it should be subclassed to - provide an implementation for a specific application. - - For a usage example we'll show one of the EMBOSS wrappers. You can set - options when creating the wrapper object using keyword arguments - or - later using their corresponding properties: - - >>> from Bio.Emboss.Applications import WaterCommandline - >>> cline = WaterCommandline(gapopen=10, gapextend=0.5) - >>> cline - WaterCommandline(cmd='water', gapopen=10, gapextend=0.5) - - You can instead manipulate the parameters via their properties, e.g. 
- - >>> cline.gapopen - 10 - >>> cline.gapopen = 20 - >>> cline - WaterCommandline(cmd='water', gapopen=20, gapextend=0.5) - - You can clear a parameter you have already added by 'deleting' the - corresponding property: - - >>> del cline.gapopen - >>> cline.gapopen - >>> cline - WaterCommandline(cmd='water', gapextend=0.5) - - Once you have set the parameters you need, you can turn the object into - a string (e.g. to log the command): - - >>> str(cline) - Traceback (most recent call last): - ... - ValueError: You must either set outfile (output filename), or enable filter or stdout (output to stdout). - - In this case the wrapper knows certain arguments are required to construct - a valid command line for the tool. For a complete example, - - >>> from Bio.Emboss.Applications import WaterCommandline - >>> water_cmd = WaterCommandline(gapopen=10, gapextend=0.5) - >>> water_cmd.asequence = "asis:ACCCGGGCGCGGT" - >>> water_cmd.bsequence = "asis:ACCCGAGCGCGGT" - >>> water_cmd.outfile = "temp_water.txt" - >>> print(water_cmd) - water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 - >>> water_cmd - WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5) - - You would typically run the command line via a standard Python operating - system call using the subprocess module for full control. For the simple - case where you just want to run the command and get the output: - - stdout, stderr = water_cmd() - - Note that by default we assume the underlying tool is installed on the - system $PATH environment variable. This is normal under Linux/Unix, but - may need to be done manually under Windows. Alternatively, you can specify - the full path to the binary as the first argument (cmd): - - >>> from Bio.Emboss.Applications import WaterCommandline - >>> water_cmd = WaterCommandline(r"C:\Program Files\EMBOSS\water.exe", - ... 
gapopen=10, gapextend=0.5, - ... asequence="asis:ACCCGGGCGCGGT", - ... bsequence="asis:ACCCGAGCGCGGT", - ... outfile="temp_water.txt") - >>> print(water_cmd) - "C:\Program Files\EMBOSS\water.exe" -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 - - Notice that since the path name includes a space it has automatically - been quoted. - - """ - - # TODO - Replace the above example since EMBOSS doesn't work properly - # if installed into a folder with a space like "C:\Program Files\EMBOSS" - # - # Note the call example above is not a doctest as we can't handle EMBOSS - # (or any other tool) being missing in the unit tests. - - parameters = None # will be a list defined in subclasses - - def __init__(self, cmd, **kwargs): - """Create a new instance of a command line wrapper object.""" - # Init method - should be subclassed! - # - # The subclass methods should look like this: - # - # def __init__(self, cmd="muscle", **kwargs): - # self.parameters = [...] - # AbstractCommandline.__init__(self, cmd, **kwargs) - # - # i.e. There should have an optional argument "cmd" to set the location - # of the executable (with a sensible default which should work if the - # command is on the path on Unix), and keyword arguments. It should - # then define a list of parameters, all objects derived from the base - # class _AbstractParameter. - # - # The keyword arguments should be any valid parameter name, and will - # be used to set the associated parameter. 
- self.program_name = cmd - try: - parameters = self.parameters - except AttributeError: - raise AttributeError( - "Subclass should have defined self.parameters" - ) from None - # Create properties for each parameter at run time - aliases = set() - for p in parameters: - if not p.names: - if not isinstance(p, _StaticArgument): - raise TypeError(f"Expected {p!r} to be of type _StaticArgument") - continue - for name in p.names: - if name in aliases: - raise ValueError(f"Parameter alias {name} multiply defined") - aliases.add(name) - name = p.names[-1] - if _re_prop_name.match(name) is None: - raise ValueError( - "Final parameter name %r cannot be used as " - "an argument or property name in python" % name - ) - if name in _reserved_names: - raise ValueError( - "Final parameter name %r cannot be used as " - "an argument or property name because it is " - "a reserved word in python" % name - ) - if name in _local_reserved_names: - raise ValueError( - "Final parameter name %r cannot be used as " - "an argument or property name due to the " - "way the AbstractCommandline class works" % name - ) - - # Beware of binding-versus-assignment confusion issues - def getter(name): - return lambda x: x._get_parameter(name) - - def setter(name): - return lambda x, value: x.set_parameter(name, value) - - def deleter(name): - return lambda x: x._clear_parameter(name) - - doc = p.description - if isinstance(p, _Switch): - doc += ( - "\n\nThis property controls the addition of the %s " - "switch, treat this property as a boolean." % p.names[0] - ) - else: - doc += ( - "\n\nThis controls the addition of the %s parameter " - "and its associated value. Set this property to the " - "argument value required." % p.names[0] - ) - prop = property(getter(name), setter(name), deleter(name), doc) - setattr(self.__class__, name, prop) # magic! 
- for key, value in kwargs.items(): - self.set_parameter(key, value) - - def _validate(self): - """Make sure the required parameters have been set (PRIVATE). - - No return value - it either works or raises a ValueError. - - This is a separate method (called from __str__) so that subclasses may - override it. - """ - for p in self.parameters: - # Check for missing required parameters: - if p.is_required and not (p.is_set): - raise ValueError(f"Parameter {p.names[-1]} is not set.") - # Also repeat the parameter validation here, just in case? - - def __str__(self): - """Make the commandline string with the currently set options. - - e.g. - - >>> from Bio.Emboss.Applications import WaterCommandline - >>> cline = WaterCommandline(gapopen=10, gapextend=0.5) - >>> cline.asequence = "asis:ACCCGGGCGCGGT" - >>> cline.bsequence = "asis:ACCCGAGCGCGGT" - >>> cline.outfile = "temp_water.txt" - >>> print(cline) - water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 - >>> str(cline) - 'water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5' - """ - self._validate() - commandline = f"{_escape_filename(self.program_name)} " - for parameter in self.parameters: - if parameter.is_set: - # This will include a trailing space: - commandline += str(parameter) - return commandline.strip() # remove trailing space - - def __repr__(self): - """Return a representation of the command line object for debugging. - - e.g. 
- - >>> from Bio.Emboss.Applications import WaterCommandline - >>> cline = WaterCommandline(gapopen=10, gapextend=0.5) - >>> cline.asequence = "asis:ACCCGGGCGCGGT" - >>> cline.bsequence = "asis:ACCCGAGCGCGGT" - >>> cline.outfile = "temp_water.txt" - >>> print(cline) - water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 - >>> cline - WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5) - """ - answer = f"{self.__class__.__name__}(cmd={self.program_name!r}" - for parameter in self.parameters: - if parameter.is_set: - if isinstance(parameter, _Switch): - answer += f", {parameter.names[-1]}=True" - else: - answer += f", {parameter.names[-1]}={parameter.value!r}" - answer += ")" - return answer - - def _get_parameter(self, name): - """Get a commandline option value (PRIVATE).""" - for parameter in self.parameters: - if name in parameter.names: - if isinstance(parameter, _Switch): - return parameter.is_set - else: - return parameter.value - raise ValueError(f"Option name {name} was not found.") - - def _clear_parameter(self, name): - """Reset or clear a commandline option value (PRIVATE).""" - cleared_option = False - for parameter in self.parameters: - if name in parameter.names: - parameter.value = None - parameter.is_set = False - cleared_option = True - if not cleared_option: - raise ValueError(f"Option name {name} was not found.") - - def set_parameter(self, name, value=None): - """Set a commandline option for a program (OBSOLETE). - - Every parameter is available via a property and as a named - keyword when creating the instance. Using either of these is - preferred to this legacy set_parameter method which is now - OBSOLETE, and likely to be DEPRECATED and later REMOVED in - future releases. 
- """ - set_option = False - for parameter in self.parameters: - if name in parameter.names: - if isinstance(parameter, _Switch): - if value is None: - import warnings - - warnings.warn( - "For a switch type argument like %s, " - "we expect a boolean. None is treated " - "as FALSE!" % parameter.names[-1] - ) - parameter.is_set = bool(value) - set_option = True - else: - if value is not None: - self._check_value(value, name, parameter.checker_function) - parameter.value = value - parameter.is_set = True - set_option = True - if not set_option: - raise ValueError(f"Option name {name} was not found.") - - def _check_value(self, value, name, check_function): - """Check whether the given value is valid (PRIVATE). - - No return value - it either works or raises a ValueError. - - This uses the passed function 'check_function', which can either - return a [0, 1] (bad, good) value or raise an error. Either way - this function will raise an error if the value is not valid, or - finish silently otherwise. - """ - if check_function is not None: - is_good = check_function(value) # May raise an exception - if is_good not in [0, 1, True, False]: - raise ValueError( - f"Result of check_function: {is_good!r} is of an unexpected value" - ) - if not is_good: - raise ValueError( - f"Invalid parameter value {value!r} for parameter {name}" - ) - - def __setattr__(self, name, value): - """Set attribute name to value (PRIVATE). - - This code implements a workaround for a user interface issue. - Without this __setattr__ attribute-based assignment of parameters - will silently accept invalid parameters, leading to known instances - of the user assuming that parameters for the application are set, - when they are not. - - >>> from Bio.Emboss.Applications import WaterCommandline - >>> cline = WaterCommandline(gapopen=10, gapextend=0.5, stdout=True) - >>> cline.asequence = "a.fasta" - >>> cline.bsequence = "b.fasta" - >>> cline.csequence = "c.fasta" - Traceback (most recent call last): - ... 
- ValueError: Option name csequence was not found. - >>> print(cline) - water -stdout -asequence=a.fasta -bsequence=b.fasta -gapopen=10 -gapextend=0.5 - - This workaround uses a whitelist of object attributes, and sets the - object attribute list as normal, for these. Other attributes are - assumed to be parameters, and passed to the self.set_parameter method - for validation and assignment. - """ - if name in ["parameters", "program_name"]: # Allowed attributes - self.__dict__[name] = value - else: - self.set_parameter(name, value) # treat as a parameter - - def __call__(self, stdin=None, stdout=True, stderr=True, cwd=None, env=None): - """Execute command, wait for it to finish, return (stdout, stderr). - - Runs the command line tool and waits for it to finish. If it returns - a non-zero error level, an exception is raised. Otherwise two strings - are returned containing stdout and stderr. - - The optional stdin argument should be a string of data which will be - passed to the tool as standard input. - - The optional stdout and stderr argument may be filenames (string), - but otherwise are treated as a booleans, and control if the output - should be captured as strings (True, default), or ignored by sending - it to /dev/null to avoid wasting memory (False). If sent to a file - or ignored, then empty string(s) are returned. - - The optional cwd argument is a string giving the working directory - to run the command from. See Python's subprocess module documentation - for more details. - - The optional env argument is a dictionary setting the environment - variables to be used in the new process. By default the current - process' environment variables are used. See Python's subprocess - module documentation for more details. 
- - Default example usage:: - - from Bio.Emboss.Applications import WaterCommandline - water_cmd = WaterCommandline(gapopen=10, gapextend=0.5, - stdout=True, auto=True, - asequence="a.fasta", bsequence="b.fasta") - print("About to run: %s" % water_cmd) - std_output, err_output = water_cmd() - - This functionality is similar to subprocess.check_output(). In general - if you require more control over running the command, use subprocess - directly. - - When the program called returns a non-zero error level, a custom - ApplicationError exception is raised. This includes any stdout and - stderr strings captured as attributes of the exception object, since - they may be useful for diagnosing what went wrong. - """ - if not stdout: - stdout_arg = open(os.devnull, "w") - elif isinstance(stdout, str): - stdout_arg = open(stdout, "w") - else: - stdout_arg = subprocess.PIPE - - if not stderr: - stderr_arg = open(os.devnull, "w") - elif isinstance(stderr, str): - if stdout == stderr: - stderr_arg = stdout_arg # Write both to the same file - else: - stderr_arg = open(stderr, "w") - else: - stderr_arg = subprocess.PIPE - - # We may not need to supply any piped input, but we setup the - # standard input pipe anyway as a work around for a python - # bug if this is called from a Windows GUI program. For - # details, see http://bugs.python.org/issue1124861 - # - # Using universal newlines is important on Python 3, this - # gives unicode handles rather than bytes handles. 
- - # Windows 7, 8, 8.1 and 10 want shell = True - if sys.platform != "win32": - use_shell = True - else: - win_ver = platform.win32_ver()[0] - if win_ver in ["7", "8", "post2012Server", "10"]: - use_shell = True - else: - use_shell = False - child_process = subprocess.Popen( - str(self), - stdin=subprocess.PIPE, - stdout=stdout_arg, - stderr=stderr_arg, - universal_newlines=True, - cwd=cwd, - env=env, - shell=use_shell, - ) - # Use .communicate as can get deadlocks with .wait(), see Bug 2804 - stdout_str, stderr_str = child_process.communicate(stdin) - if not stdout: - assert not stdout_str, stdout_str - if not stderr: - assert not stderr_str, stderr_str - return_code = child_process.returncode - - # Particularly important to close handles on Jython and PyPy - # (where garbage collection is less predictable) and on Windows - # (where cannot delete files with an open handle): - if not stdout or isinstance(stdout, str): - # We opened /dev/null or a file - stdout_arg.close() - if not stderr or (isinstance(stderr, str) and stdout != stderr): - # We opened /dev/null or a file - stderr_arg.close() - - if return_code: - raise ApplicationError(return_code, str(self), stdout_str, stderr_str) - return stdout_str, stderr_str - - -class _AbstractParameter: - """A class to hold information about a parameter for a commandline. - - Do not use this directly, instead use one of the subclasses. - """ - - def __init__(self): - raise NotImplementedError - - def __str__(self): - raise NotImplementedError - - -class _Option(_AbstractParameter): - """Represent an option that can be set for a program. - - This holds UNIXish options like --append=yes and -a yes, - where a value (here "yes") is generally expected. - - For UNIXish options like -kimura in clustalw which don't - take a value, use the _Switch object instead. 
- - Attributes: - - names -- a list of string names (typically two entries) by which - the parameter can be set via the legacy set_parameter method - (eg ["-a", "--append", "append"]). The first name in list is used - when building the command line. The last name in the list is a - "human readable" name describing the option in one word. This - must be a valid Python identifier as it is used as the property - name and as a keyword argument, and should therefore follow PEP8 - naming. - - description -- a description of the option. This is used as - the property docstring. - - filename -- True if this argument is a filename (or other argument - that should be quoted) and should be automatically quoted if it - contains spaces. - - checker_function -- a reference to a function that will determine - if a given value is valid for this parameter. This function can either - raise an error when given a bad value, or return a [0, 1] decision on - whether the value is correct. - - equate -- should an equals sign be inserted if a value is used? - - is_required -- a flag to indicate if the parameter must be set for - the program to be run. - - is_set -- if the parameter has been set - - value -- the value of a parameter - - """ - - def __init__( - self, - names, - description, - filename=False, - checker_function=None, - is_required=False, - equate=True, - ): - self.names = names - if not isinstance(description, str): - raise TypeError(f"Should be a string: {description!r} for {names[-1]}") - # Note 'filename' is for any string with spaces that needs quoting - self.is_filename = filename - self.checker_function = checker_function - self.description = description - self.equate = equate - self.is_required = is_required - - self.is_set = False - self.value = None - - def __str__(self): - """Return the value of this option for the commandline. - - Includes a trailing space. 
- """ - # Note: Before equate was handled explicitly, the old - # code would do either "--name " or "--name=value ", - # or " -name " or " -name value ". This choice is now - # now made explicitly when setting up the option. - if self.value is None: - return f"{self.names[0]} " - if self.is_filename: - v = _escape_filename(self.value) - else: - v = str(self.value) - if self.equate: - return f"{self.names[0]}={v} " - else: - return f"{self.names[0]} {v} " - - -class _Switch(_AbstractParameter): - """Represent an optional argument switch for a program. - - This holds UNIXish options like -kimura in clustalw which don't - take a value, they are either included in the command string - or omitted. - - Attributes: - - names -- a list of string names (typically two entries) by which - the parameter can be set via the legacy set_parameter method - (eg ["-a", "--append", "append"]). The first name in list is used - when building the command line. The last name in the list is a - "human readable" name describing the option in one word. This - must be a valid Python identifier as it is used as the property - name and as a keyword argument, and should therefore follow PEP8 - naming. - - description -- a description of the option. This is used as - the property docstring. - - is_set -- if the parameter has been set - - NOTE - There is no value attribute, see is_set instead, - - """ - - def __init__(self, names, description): - self.names = names - self.description = description - self.is_set = False - self.is_required = False - - def __str__(self): - """Return the value of this option for the commandline. - - Includes a trailing space. - """ - assert not hasattr(self, "value") - if self.is_set: - return f"{self.names[0]} " - else: - return "" - - -class _Argument(_AbstractParameter): - """Represent an argument on a commandline. - - The names argument should be a list containing one string. 
- This must be a valid Python identifier as it is used as the - property name and as a keyword argument, and should therefore - follow PEP8 naming. - """ - - def __init__( - self, - names, - description, - filename=False, - checker_function=None, - is_required=False, - ): - # if len(names) != 1: - # raise ValueError("The names argument to _Argument should be a " - # "single entry list with a PEP8 property name.") - self.names = names - if not isinstance(description, str): - raise TypeError(f"Should be a string: {description!r} for {names[-1]}") - # Note 'filename' is for any string with spaces that needs quoting - self.is_filename = filename - self.checker_function = checker_function - self.description = description - self.is_required = is_required - self.is_set = False - self.value = None - - def __str__(self): - if self.value is None: - return " " - elif self.is_filename: - return f"{_escape_filename(self.value)} " - else: - return f"{self.value} " - - -class _ArgumentList(_Argument): - """Represent a variable list of arguments on a command line, e.g. multiple filenames.""" - - # TODO - Option to require at least one value? e.g. min/max count? - - def __str__(self): - if not isinstance(self.value, list): - raise TypeError("Arguments should be a list") - if not self.value: - raise ValueError("Requires at least one filename") - # A trailing space is required so that parameters following the last filename - # do not appear merged. - # e.g.: samtools cat in1.bam in2.bam-o out.sam [without trailing space][Incorrect] - # samtools cat in1.bam in2.bam -o out.sam [with trailing space][Correct] - if self.is_filename: - return " ".join(_escape_filename(v) for v in self.value) + " " - else: - return " ".join(self.value) + " " - - -class _StaticArgument(_AbstractParameter): - """Represent a static (read only) argument on a commandline. - - This is not intended to be exposed as a named argument or - property of a command line wrapper object. 
- """ - - def __init__(self, value): - self.names = [] - self.is_required = False - self.is_set = True - self.value = value - - def __str__(self): - return f"{self.value} " - - -def _escape_filename(filename): - """Escape filenames with spaces by adding quotes (PRIVATE). - - Note this will not add quotes if they are already included: - - >>> print((_escape_filename('example with spaces'))) - "example with spaces" - >>> print((_escape_filename('"example with spaces"'))) - "example with spaces" - >>> print((_escape_filename(1))) - 1 - - Note the function is more generic than the name suggests, since it - is used to add quotes around any string arguments containing spaces. - """ - # Is adding the following helpful - # if os.path.isfile(filename): - # # On Windows, if the file exists, we can ask for - # # its alternative short name (DOS style 8.3 format) - # # which has no spaces in it. Note that this name - # # is not portable between machines, or even folder! - # try: - # import win32api - # short = win32api.GetShortPathName(filename) - # assert os.path.isfile(short) - # return short - # except ImportError: - # pass - if not isinstance(filename, str): - # for example the NCBI BLAST+ -outfmt argument can be an integer - return filename - if " " not in filename: - return filename - # We'll just quote it - works on Windows, Mac OS X etc - if filename.startswith('"') and filename.endswith('"'): - # Its already quoted - return filename - else: - return f'"{filename}"' - - -def _test(): - """Run the Bio.Application module's doctests (PRIVATE).""" - import doctest - - doctest.testmod(verbose=1) - - -if __name__ == "__main__": - # Run the doctests - _test() diff --git a/Bio/Blast/Applications.py b/Bio/Blast/Applications.py deleted file mode 100644 index 7db2afc67..000000000 --- a/Bio/Blast/Applications.py +++ /dev/null @@ -1,1604 +0,0 @@ -# Copyright 2001 Brad Chapman. -# Revisions copyright 2009-2010 by Peter Cock. -# Revisions copyright 2010 by Phillip Garland. 
-# All rights reserved. -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Definitions for interacting with BLAST related applications (OBSOLETE). - -Wrappers for the new NCBI BLAST+ tools (written in C++): - - - NcbiblastpCommandline - Protein-Protein BLAST - - NcbiblastnCommandline - Nucleotide-Nucleotide BLAST - - NcbiblastxCommandline - Translated Query-Protein Subject BLAST - - NcbitblastnCommandline - Protein Query-Translated Subject BLAST - - NcbitblastxCommandline - Translated Query-Protein Subject BLAST - - NcbipsiblastCommandline - Position-Specific Initiated BLAST - - NcbirpsblastCommandline - Reverse Position Specific BLAST - - NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST - - NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast - - NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats - - NcbimakeblastdbCommandline - Application to create BLAST databases - -For further details, see: - -Camacho et al. BLAST+: architecture and applications -BMC Bioinformatics 2009, 10:421 -https://doi.org/10.1186/1471-2105-10-421 - -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class _NcbibaseblastCommandline(AbstractCommandline): - """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). - - This is provided for subclassing, it deals with shared options - common to all the BLAST tools (blastn, rpsblast, rpsblast, etc - AND blast_formatter). 
- """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - # Core: - _Switch( - ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments." - ), - _Switch( - ["-help", "help"], - "Print USAGE, DESCRIPTION and ARGUMENTS description; " - "ignore other arguments.", - ), - _Switch( - ["-version", "version"], - "Print version number; ignore other arguments.", - ), - # Output configuration options - _Option( - ["-out", "out"], - "Output file for alignment.", - filename=True, - equate=False, - ), - # Formatting options: - _Option( - ["-outfmt", "outfmt"], - "Alignment view. Typically an integer 0-14 but for some " - "formats can be named columns like '6 qseqid sseqid'. " - "Use 5 for XML output (differs from classic BLAST which " - "used 7 for XML).", - filename=True, # to ensure spaced inputs are quoted - equate=False, - ), - # TODO - Document and test the column options - _Switch(["-show_gis", "show_gis"], "Show NCBI GIs in deflines?"), - _Option( - ["-num_descriptions", "num_descriptions"], - "Number of database sequences to show one-line descriptions for.\n\n" - "Integer argument (at least zero). Default is 500. " - "See also num_alignments.", - equate=False, - ), - _Option( - ["-num_alignments", "num_alignments"], - "Number of database sequences to show num_alignments for.\n\n" - "Integer argument (at least zero). Default is 200. " - "See also num_alignments.", - equate=False, - ), - _Option( - ["-line_length", "line_length"], - "Line length for formatting alignments " - "(integer, at least 1, default 60).\n\n" - "Not applicable for outfmt > 4. Added in BLAST+ 2.2.30.", - equate=False, - ), - _Switch( - ["-html", "html"], "Produce HTML output? See also the outfmt option." 
- ), - # Miscellaneous options - _Switch( - ["-parse_deflines", "parse_deflines"], - "Should the query and subject defline(s) be parsed?", - ), - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! - self.parameters = extra_parameters - AbstractCommandline.__init__(self, cmd, **kwargs) - - def _validate_incompatibilities(self, incompatibles): - """Validate parameters for incompatibilities (PRIVATE). - - Used by the _validate method. - """ - for a in incompatibles: - if self._get_parameter(a): - for b in incompatibles[a]: - if self._get_parameter(b): - raise ValueError(f"Options {a} and {b} are incompatible.") - - -class _NcbiblastCommandline(_NcbibaseblastCommandline): - """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). - - This is provided for subclassing, it deals with shared options - common to all the BLAST tools (blastn, rpsblast, rpsblast, etc). - """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - # Input query options: - _Option( - ["-query", "query"], - "The sequence to search with.", - filename=True, - equate=False, - ), # Should this be required? - _Option( - ["-query_loc", "query_loc"], - "Location on the query sequence (Format: start-stop).", - equate=False, - ), - # General search options: - _Option(["-db", "db"], "The database to BLAST against.", equate=False), - _Option(["-evalue", "evalue"], "Expectation value cutoff.", equate=False), - _Option( - ["-word_size", "word_size"], - "Word size for wordfinder algorithm.\n\nInteger. 
Minimum 2.", - equate=False, - ), - # BLAST-2-Sequences options: - # - see subclass - # Formatting options: - # - see baseclass - # Query filtering options - _Option( - ["-soft_masking", "soft_masking"], - "Apply filtering locations as soft masks (Boolean, Default = true).", - equate=False, - ), - _Switch( - ["-lcase_masking", "lcase_masking"], - "Use lower case filtering in query and subject sequence(s)?", - ), - # Restrict search or results - _Option( - ["-gilist", "gilist"], - "Restrict search of database to list of GI's.\n\n" - "Incompatible with: negative_gilist, seqidlist, negative_seqidlist, " - "remote, subject, subject_loc", - filename=True, - equate=False, - ), - _Option( - ["-negative_gilist", "negative_gilist"], - "Restrict search of database to everything except the listed GIs.\n\n" - "Incompatible with: gilist, seqidlist, remote, subject, subject_loc", - filename=True, - equate=False, - ), - _Option( - ["-seqidlist", "seqidlist"], - "Restrict search of database to list of SeqID's.\n\n" - "Incompatible with: gilist, negative_gilist, remote, subject, " - "subject_loc", - filename=True, - equate=False, - ), - _Option( - ["-negative_seqidlist", "negative_seqidlist"], - "Restrict search of database to everything except listed SeqID's.\n\n" - "Incompatible with: gilist, seqidlist, remote, subject, subject_loc", - filename=True, - equate=False, - ), - _Option( - ["-entrez_query", "entrez_query"], - "Restrict search with the given Entrez query (requires remote).", - equate=False, - ), - _Option( - ["-qcov_hsp_perc", "qcov_hsp_perc"], - "Percent query coverage per hsp (float, 0 to 100).\n\n" - "Added in BLAST+ 2.2.30.", - equate=False, - ), - _Option( - ["-max_target_seqs", "max_target_seqs"], - "Maximum number of aligned sequences to keep (integer, at least one).", - equate=False, - ), - # Statistical options - _Option( - ["-dbsize", "dbsize"], - "Effective length of the database (integer).", - equate=False, - ), - _Option( - ["-searchsp", "searchsp"], - 
"Effective length of the search space (integer).", - equate=False, - ), - _Option( - ["-max_hsps_per_subject", "max_hsps_per_subject"], - "Override max number of HSPs per subject saved for ungapped searches " - "(integer).", - equate=False, - ), - _Option( - ["-max_hsps", "max_hsps"], - "Set max number of HSPs saved per subject sequence\n\n" - "Ddefault 0 means no limit.", - equate=False, - ), - _Switch(["-sum_statistics", "sum_statistics"], "Use sum statistics."), - # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch? - _Option( - ["-sum_stats", "sum_stats"], - "Use sum statistics (boolean).\n\nAdded in BLAST+ 2.2.30.", - equate=False, - ), - # Extension options - _Option( - ["-xdrop_ungap", "xdrop_ungap"], - "X-dropoff value (in bits) for ungapped extensions (float).", - equate=False, - ), - _Option( - ["-xdrop_gap", "xdrop_gap"], - "X-dropoff value (in bits) for preliminary gapped extensions (float).", - equate=False, - ), - _Option( - ["-xdrop_gap_final", "xdrop_gap_final"], - "X-dropoff value (in bits) for final gapped alignment (float).", - equate=False, - ), - _Option( - ["-window_size", "window_size"], - "Multiple hits window size, use 0 to specify 1-hit algorithm " - "(integer).", - equate=False, - ), - # Search strategy options - _Option( - ["-import_search_strategy", "import_search_strategy"], - "Search strategy to use.\n\n" - "Incompatible with: export_search_strategy", - filename=True, - equate=False, - ), - _Option( - ["-export_search_strategy", "export_search_strategy"], - "File name to record the search strategy used.\n\n" - "Incompatible with: import_search_strategy", - filename=True, - equate=False, - ), - # Miscellaneous options - _Option( - ["-num_threads", "num_threads"], - "Number of threads to use in the BLAST search.\n\n" - "Integer, at least one. Default is one. 
Incompatible with: remote", - equate=False, - ), - _Switch( - ["-remote", "remote"], - "Execute search remotely?\n\n" - "Incompatible with: gilist, negative_gilist, subject_loc, " - "num_threads, ...", - ), - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! - self.parameters = extra_parameters - _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = { - "remote": ["gilist", "negative_gilist", "num_threads"], - "import_search_strategy": ["export_search_strategy"], - "gilist": ["negative_gilist"], - "seqidlist": ["gilist", "negative_gilist", "remote"], - } - self._validate_incompatibilities(incompatibles) - if self.entrez_query and not self.remote: - raise ValueError("Option entrez_query requires remote option.") - AbstractCommandline._validate(self) - - -class _Ncbiblast2SeqCommandline(_NcbiblastCommandline): - """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). - - This is provided for subclassing, it deals with shared options - common to all the BLAST tools supporting two-sequence BLAST - (blastn, psiblast, etc) but not rpsblast or rpstblastn. 
- """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - # General search options: - _Option( - ["-gapopen", "gapopen"], "Cost to open a gap (integer).", equate=False - ), - _Option( - ["-gapextend", "gapextend"], - "Cost to extend a gap (integer).", - equate=False, - ), - # BLAST-2-Sequences options: - _Option( - ["-subject", "subject"], - "Subject sequence(s) to search.\n\n" - "Incompatible with: db, gilist, seqidlist, negative_gilist, " - "negative_seqidlist, db_soft_mask, db_hard_mask\n\n" - "See also subject_loc.", - filename=True, - equate=False, - ), - _Option( - ["-subject_loc", "subject_loc"], - "Location on the subject sequence (Format: start-stop).\n\n" - "Incompatible with: db, gilist, seqidlist, negative_gilist, " - "negative_seqidlist, db_soft_mask, db_hard_mask, remote.\n\n" - "See also subject.", - equate=False, - ), - # Restrict search or results: - _Option( - ["-culling_limit", "culling_limit"], - "Hit culling limit (integer).\n\n" - "If the query range of a hit is enveloped by that of at " - "least this many higher-scoring hits, delete the hit.\n\n" - "Incompatible with: best_hit_overhang, best_hit_score_edge.", - equate=False, - ), - _Option( - ["-best_hit_overhang", "best_hit_overhang"], - "Best Hit algorithm overhang value (float, recommended value: 0.1)\n\n" - "Float between 0.0 and 0.5 inclusive. " - "Incompatible with: culling_limit.", - equate=False, - ), - _Option( - ["-best_hit_score_edge", "best_hit_score_edge"], - "Best Hit algorithm score edge value (float).\n\n" - "Float between 0.0 and 0.5 inclusive. Recommended value: 0.1\n\n" - "Incompatible with: culling_limit.", - equate=False, - ), - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! 
- self.parameters = extra_parameters - _NcbiblastCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = { - "subject_loc": ["db", "gilist", "negative_gilist", "seqidlist", "remote"], - "culling_limit": ["best_hit_overhang", "best_hit_score_edge"], - "subject": ["db", "gilist", "negative_gilist", "seqidlist"], - } - self._validate_incompatibilities(incompatibles) - _NcbiblastCommandline._validate(self) - - -class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline): - """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). - - This is provided for subclassing, it deals with shared options - common to the main BLAST tools blastp, blastn, blastx, tblastx, tblastn - but not psiblast, rpsblast or rpstblastn. - """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - # Restrict search or results: - _Option( - ["-db_soft_mask", "db_soft_mask"], - "Filtering algorithm for soft masking (integer).\n\n" - "Filtering algorithm ID to apply to BLAST database as soft masking. " - "Incompatible with: db_hard_mask, subject, subject_loc", - equate=False, - ), - _Option( - ["-db_hard_mask", "db_hard_mask"], - "Filtering algorithm for hard masking (integer).\n\n" - "Filtering algorithm ID to apply to BLAST database as hard masking. " - "Incompatible with: db_soft_mask, subject, subject_loc", - equate=False, - ), - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! 
- self.parameters = extra_parameters - _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = { - "db_soft_mask": ["db_hard_mask", "subject", "subject_loc"], - "db_hard_mask": ["db_soft_mask", "subject", "subject_loc"], - } - self._validate_incompatibilities(incompatibles) - _Ncbiblast2SeqCommandline._validate(self) - - -class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline): - """Create a commandline for the NCBI BLAST+ program blastp (for proteins). - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastall tool with separate tools for each of the searches. - This wrapper therefore replaces BlastallCommandline with option -p blastp. - - >>> from Bio.Blast.Applications import NcbiblastpCommandline - >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr", - ... evalue=0.001, remote=True, ungapped=True) - >>> cline - NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True) - >>> print(cline) - blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="blastp", **kwargs): - """Initialize the class.""" - self.parameters = [ - # General search options: - _Option( - ["-task", "task"], - "Task to execute (string, blastp (default), blastp-fast or blastp-short).", - checker_function=lambda value: value - in ["blastp", "blastp-fast", "blastp-short"], - equate=False, - ), - _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics (string, default 2, i.e. 
True).\n\n" - "0, F or f: no composition-based statistics\n\n" - "2, T or t, D or d : Composition-based score adjustment as in " - "Bioinformatics 21:902-911, 2005, conditioned on sequence " - "properties\n\n" - "Note that tblastn also supports values of 1 and 3.", - checker_function=lambda value: value in "0Ft2TtDd", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable\n' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Extension options: - _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), - # Miscellaneous options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - ] - _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) - - -class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline): - """Wrapper for the NCBI BLAST+ program blastn (for nucleotides). - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastall tool with separate tools for each of the searches. - This wrapper therefore replaces BlastallCommandline with option -p blastn. - - For example, to run a search against the "nt" nucleotide database using the - FASTA nucleotide file "m_code.fasta" as the query, with an expectation value - cut off of 0.001, saving the output to a file in XML format: - - >>> from Bio.Blast.Applications import NcbiblastnCommandline - >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus", - ... 
evalue=0.001, out="m_cold.xml", outfmt=5) - >>> cline - NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus') - >>> print(cline) - blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="blastn", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Input query options: - _Option( - ["-strand", "strand"], - "Query strand(s) to search against database/subject.\n\n" - 'Values allowed are "both" (default), "minus", "plus".', - checker_function=lambda value: value in ["both", "minus", "plus"], - equate=False, - ), - # General search options: - _Option( - ["-task", "task"], - "Task to execute (string, default 'megablast')\n\n" - "Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' " - "(the default), or 'vecscreen'.", - checker_function=lambda value: value - in ["blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen"], - equate=False, - ), - _Option( - ["-penalty", "penalty"], - "Penalty for a nucleotide mismatch (integer, at most zero).", - equate=False, - ), - _Option( - ["-reward", "reward"], - "Reward for a nucleotide match (integer, at least zero).", - equate=False, - ), - _Option( - ["-use_index", "use_index"], - "Use MegaBLAST database index (Boolean, Default = False)", - equate=False, - ), - _Option( - ["-index_name", "index_name"], - "MegaBLAST database index name.", - equate=False, - ), - # Query filtering options: - _Option( - ["-dust", "dust"], - "Filter query sequence with DUST (string).\n\n" - "Format: 'yes', 'level window linker', or 'no' to disable.\n\n" - "Default = '20 64 1'.", - equate=False, - ), - _Option( - ["-filtering_db", "filtering_db"], - "BLAST database containing filtering elements (i.e. 
repeats).", - equate=False, - ), - _Option( - ["-window_masker_taxid", "window_masker_taxid"], - "Enable WindowMasker filtering using a Taxonomic ID (integer).", - equate=False, - ), - _Option( - ["-window_masker_db", "window_masker_db"], - "Enable WindowMasker filtering using this repeats database (string).", - equate=False, - ), - # Restrict search or results: - _Option( - ["-perc_identity", "perc_identity"], - "Percent identity (real, 0 to 100 inclusive).", - equate=False, - ), - # Discontiguous MegaBLAST options - _Option( - ["-template_type", "template_type"], - "Discontiguous MegaBLAST template type (string).\n\n" - "Allowed values: 'coding', 'coding_and_optimal' or 'optimal'.\n" - "Requires: template_length.", - checker_function=lambda value: value - in ["coding", "coding_and_optimal", "optimal"], - equate=False, - ), - _Option( - ["-template_length", "template_length"], - "Discontiguous MegaBLAST template length (integer).\n\n" - "Allowed values: 16, 18, 21.\n\n" - "Requires: template_type.", - checker_function=lambda value: value in [16, 18, 21, "16", "18", "21"], - equate=False, - ), - # Extension options: - _Switch( - ["-no_greedy", "no_greedy"], - "Use non-greedy dynamic programming extension", - ), - _Option( - ["-min_raw_gapped_score", "min_raw_gapped_score"], - "Minimum raw gapped score to keep an alignment in the " - "preliminary gapped and traceback stages (integer).", - equate=False, - ), - _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), - _Option( - ["-off_diagonal_range", "off_diagonal_range"], - "Number of off-diagonals to search for the 2nd hit (integer).\n\n" - "Expects a positive integer, or 0 (default) to turn off." 
- "Added in BLAST 2.2.23+", - equate=False, - ), - ] - _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - if (self.template_type and not self.template_length) or ( - self.template_length and not self.template_type - ): - raise ValueError( - "Options template_type and template_type require each other." - ) - _NcbiblastMain2SeqCommandline._validate(self) - - -class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline): - """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database). - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastall tool with separate tools for each of the searches. - This wrapper therefore replaces BlastallCommandline with option -p blastx. - - >>> from Bio.Blast.Applications import NcbiblastxCommandline - >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001) - >>> cline - NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001) - >>> print(cline) - blastx -query m_cold.fasta -db nr -evalue 0.001 - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. 
- """ - - def __init__(self, cmd="blastx", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Input query options: - _Option( - ["-task", "task"], - "Task to execute (string, blastx (default) or blastx-fast).", - checker_function=lambda value: value in ["blastx", "blastx-fast"], - equate=False, - ), - _Option( - ["-strand", "strand"], - "Query strand(s) to search against database/subject.\n\n" - 'Values allowed are "both" (default), "minus", "plus".', - checker_function=lambda value: value in ["both", "minus", "plus"], - equate=False, - ), - # Input query options: - _Option( - ["-query_gencode", "query_gencode"], - "Genetic code to use to translate query (integer, default 1).", - equate=False, - ), - # General search options: - _Option( - ["-frame_shift_penalty", "frame_shift_penalty"], - "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n" - "This was removed in BLAST 2.2.27+", - equate=False, - ), - _Option( - ["-max_intron_length", "max_intron_length"], - "Maximum intron length (integer).\n\n" - "Length of the largest intron allowed in a translated nucleotide " - "sequence when linking multiple distinct alignments (a negative " - "value disables linking). 
Default zero.", - equate=False, - ), - _Option( - ["-matrix", "matrix"], - "Scoring matrix name (default BLOSUM62).", - equate=False, - ), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics for blastp, blastx, or tblastn.\n\n" - "D or d: default (equivalent to 2 )\n\n" - "0 or F or f: no composition-based statistics\n\n" - "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n" - "2 or T or t : Composition-based score adjustment as in " - "Bioinformatics 21:902-911, 2005, conditioned on sequence " - "properties\n\n" - "3: Composition-based score adjustment as in Bioinformatics " - "21:902-911, 2005, unconditionally.\n\n" - "For programs other than tblastn, must either be absent or be " - "D, F or 0\n\n" - "Default = 2.", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable.' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Extension options: - _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - ] - _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) - - -class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline): - """Wrapper for the NCBI BLAST+ program tblastn. - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastall tool with separate tools for each of the searches. - This wrapper therefore replaces BlastallCommandline with option -p tblastn. 
- - >>> from Bio.Blast.Applications import NcbitblastnCommandline - >>> cline = NcbitblastnCommandline(help=True) - >>> cline - NcbitblastnCommandline(cmd='tblastn', help=True) - >>> print(cline) - tblastn -help - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="tblastn", **kwargs): - """Initialize the class.""" - self.parameters = [ - # General search options: - _Option( - ["-task", "task"], - "Task to execute (string, tblastn (default) or tblastn-fast).", - checker_function=lambda value: value in ["tblastn", "tblastn-fast"], - equate=False, - ), - _Option( - ["-db_gencode", "db_gencode"], - "Genetic code to use to translate query (integer, default 1).", - equate=False, - ), - _Option( - ["-frame_shift_penalty", "frame_shift_penalty"], - "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n" - "This was removed in BLAST 2.2.27+", - equate=False, - ), - _Option( - ["-max_intron_length", "max_intron_length"], - "Maximum intron length (integer).\n\n" - "Length of the largest intron allowed in a translated nucleotide " - "sequence when linking multiple distinct alignments (a negative " - "value disables linking). Default zero.", - equate=False, - ), - _Option( - ["-matrix", "matrix"], - "Scoring matrix name (default BLOSUM62).", - equate=False, - ), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics (string, default 2, i.e. 
True).\n\n" - "0, F or f: no composition-based statistics\n\n" - "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n" - "2, T or t, D or d : Composition-based score adjustment as in " - "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" - "3: Composition-based score adjustment as in Bioinformatics 21:902-911, " - "2005, unconditionally\n\n" - "Note that only tblastn supports values of 1 and 3.", - checker_function=lambda value: value in "0Ft12TtDd3", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable.\n\n' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Extension options: - _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), - # Miscellaneous options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - # PSI-TBLASTN options: - _Option( - ["-in_pssm", "in_pssm"], - "PSI-BLAST checkpoint file.\n\nIncompatible with: remote, query", - filename=True, - equate=False, - ), - ] - _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) - - -class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline): - """Wrapper for the NCBI BLAST+ program tblastx. - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastall tool with separate tools for each of the searches. - This wrapper therefore replaces BlastallCommandline with option -p tblastx. - - >>> from Bio.Blast.Applications import NcbitblastxCommandline - >>> cline = NcbitblastxCommandline(help=True) - >>> cline - NcbitblastxCommandline(cmd='tblastx', help=True) - >>> print(cline) - tblastx -help - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. 
- """ - - def __init__(self, cmd="tblastx", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Input query options: - _Option( - ["-strand", "strand"], - "Query strand(s) to search against database/subject.\n\n" - 'Values allowed are "both" (default), "minus", "plus".', - checker_function=lambda value: value in ["both", "minus", "plus"], - equate=False, - ), - # Input query options: - _Option( - ["-query_gencode", "query_gencode"], - "Genetic code to use to translate query (integer, default 1).", - equate=False, - ), - # General search options: - _Option( - ["-db_gencode", "db_gencode"], - "Genetic code to use to translate query (integer, default 1).", - equate=False, - ), - _Option( - ["-max_intron_length", "max_intron_length"], - "Maximum intron length (integer).\n\n" - "Length of the largest intron allowed in a translated nucleotide " - "sequence when linking multiple distinct alignments (a negative " - "value disables linking). Default zero.", - equate=False, - ), - _Option( - ["-matrix", "matrix"], - "Scoring matrix name (default BLOSUM62).", - equate=False, - ), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable.\n\n' - 'Default is "12 2.2 2.5"', - equate=False, - ), - ] - _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) - - -class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline): - """Wrapper for the NCBI BLAST+ program psiblast. - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old blastpgp tool with a similar tool psiblast. This wrapper - therefore replaces BlastpgpCommandline, the wrapper for blastpgp. 
- - >>> from Bio.Blast.Applications import NcbipsiblastCommandline - >>> cline = NcbipsiblastCommandline(help=True) - >>> cline - NcbipsiblastCommandline(cmd='psiblast', help=True) - >>> print(cline) - psiblast -help - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="psiblast", **kwargs): - """Initialize the class.""" - self.parameters = [ - # General search options: - _Option( - ["-matrix", "matrix"], - "Scoring matrix name (default BLOSUM62).", - equate=False, - ), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics (string, default 2, i.e. True).\n\n" - "0, F or f: no composition-based statistics\n\n" - "2, T or t, D or d : Composition-based score adjustment as in " - "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" - "Note that tblastn also supports values of 1 and 3.", - checker_function=lambda value: value in "0Ft2TtDd", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable. ' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Extension options: - _Option( - ["-gap_trigger", "gap_trigger"], - "Number of bits to trigger gapping (float, default 22).", - equate=False, - ), - # Miscellaneous options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - # PSI-BLAST options: - _Option( - ["-num_iterations", "num_iterations"], - "Number of iterations to perform (integer, at least one).\n\n" - "Default is one. 
Incompatible with: remote", - equate=False, - ), - _Option( - ["-out_pssm", "out_pssm"], - "File name to store checkpoint file.", - filename=True, - equate=False, - ), - _Option( - ["-out_ascii_pssm", "out_ascii_pssm"], - "File name to store ASCII version of PSSM.", - filename=True, - equate=False, - ), - _Switch( - ["-save_pssm_after_last_round", "save_pssm_after_last_round"], - "Save PSSM after the last database search.", - ), - _Switch( - ["-save_each_pssm", "save_each_pssm"], - "Save PSSM after each iteration\n\n" - "File name is given in -save_pssm or -save_ascii_pssm options.", - ), - _Option( - ["-in_msa", "in_msa"], - "File name of multiple sequence alignment to restart PSI-BLAST.\n\n" - "Incompatible with: in_pssm, query", - filename=True, - equate=False, - ), - _Option( - ["-msa_master_idx", "msa_master_idx"], - "Index of sequence to use as master in MSA.\n\n" - "Index (1-based) of sequence to use as the master in the multiple " - "sequence alignment. If not specified, the first sequence is used.", - equate=False, - ), - _Option( - ["-in_pssm", "in_pssm"], - "PSI-BLAST checkpoint file.\n\n" - "Incompatible with: in_msa, query, phi_pattern", - filename=True, - equate=False, - ), - # PSSM engine options: - _Option( - ["-pseudocount", "pseudocount"], - "Pseudo-count value used when constructing PSSM.\n\n" - "Integer. 
Default is zero.", - equate=False, - ), - _Option( - ["-inclusion_ethresh", "inclusion_ethresh"], - "E-value inclusion threshold for pairwise alignments (float, default 0.002).", - equate=False, - ), - _Switch( - ["-ignore_msa_master", "ignore_msa_master"], - "Ignore the master sequence when creating PSSM.\n\n" - "Requires: in_msa\n" - "Incompatible with: msa_master_idx, in_pssm, query, query_loc, " - "phi_pattern", - ), - # PHI-BLAST options: - _Option( - ["-phi_pattern", "phi_pattern"], - "File name containing pattern to search.\n\n" - "Incompatible with: in_pssm", - filename=True, - equate=False, - ), - ] - _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = { - "num_iterations": ["remote"], - "in_msa": ["in_pssm", "query"], - "in_pssm": ["in_msa", "query", "phi_pattern"], - "ignore_msa_master": [ - "msa_master_idx", - "in_pssm", - "query", - "query_loc", - "phi_pattern", - ], - } - self._validate_incompatibilities(incompatibles) - _Ncbiblast2SeqCommandline._validate(self) - - -class NcbirpsblastCommandline(_NcbiblastCommandline): - """Wrapper for the NCBI BLAST+ program rpsblast. - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old rpsblast tool with a similar tool of the same name. This - wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast. - - >>> from Bio.Blast.Applications import NcbirpsblastCommandline - >>> cline = NcbirpsblastCommandline(help=True) - >>> cline - NcbirpsblastCommandline(cmd='rpsblast', help=True) - >>> print(cline) - rpsblast -help - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. 
- """ - - def __init__(self, cmd="rpsblast", **kwargs): - """Initialize the class.""" - # TODO - remove the -word_size argument as per BLAST+ 2.2.30 - # (BLAST team say it should never have been included, since - # the word size is set when building the domain database.) - # This likely means reviewing the class hierarchy again. - self.parameters = [ - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable.' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Restrict search or results: - _Option( - ["-culling_limit", "culling_limit"], - "Hit culling limit (integer).\n\n" - "If the query range of a hit is enveloped by that of at " - "least this many higher-scoring hits, delete the hit. " - "Incompatible with: best_hit_overhang, best_hit_score_edge.", - equate=False, - ), - _Option( - ["-best_hit_overhang", "best_hit_overhang"], - "Best Hit algorithm overhang value (recommended value: 0.1).\n\n" - "Float between 0.0 and 0.5 inclusive. " - "Incompatible with: culling_limit.", - equate=False, - ), - _Option( - ["-best_hit_score_edge", "best_hit_score_edge"], - "Best Hit algorithm score edge value (recommended value: 0.1).\n\n" - "Float between 0.0 and 0.5 inclusive. 
" - "Incompatible with: culling_limit.", - equate=False, - ), - # General search options: - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics.\n\n" - "D or d: default (equivalent to 0)\n\n" - "0 or F or f: Simplified Composition-based statistics as in " - "Bioinformatics 15:1000-1011, 1999\n\n" - "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, " - "2001\n\n" - "Default = 0.", - checker_function=lambda value: value in "Dd0Ff1Tt", - equate=False, - ), - # Misc options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - ] - _NcbiblastCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]} - self._validate_incompatibilities(incompatibles) - _NcbiblastCommandline._validate(self) - - -class NcbirpstblastnCommandline(_NcbiblastCommandline): - """Wrapper for the NCBI BLAST+ program rpstblastn. - - With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI - replaced the old rpsblast tool with a similar tool of the same name, and a - separate tool rpstblastn for Translated Reverse Position Specific BLAST. - - >>> from Bio.Blast.Applications import NcbirpstblastnCommandline - >>> cline = NcbirpstblastnCommandline(help=True) - >>> cline - NcbirpstblastnCommandline(cmd='rpstblastn', help=True) - >>> print(cline) - rpstblastn -help - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="rpstblastn", **kwargs): - """Initialize the class.""" - # TODO - remove the -word_size argument as per BLAST+ 2.2.30 - # (BLAST team say it should never have been included, since - # the word size is set when building the domain database.) - # This likely means reviewing the class hierarchy again. 
- self.parameters = [ - # Input query options: - _Option( - ["-strand", "strand"], - "Query strand(s) to search against database/subject.\n\n" - 'Values allowed are "both" (default), "minus", "plus".', - checker_function=lambda value: value in ["both", "minus", "plus"], - equate=False, - ), - # Input query options: - _Option( - ["-query_gencode", "query_gencode"], - "Genetic code to use to translate query (integer, default 1).", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable. ' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # General search options: - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics.\n\n" - "D or d: default (equivalent to 0)\n\n" - "0 or F or f: Simplified Composition-based statistics as in " - "Bioinformatics 15:1000-1011, 1999\n\n" - "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, " - "2001\n\n" - "Default = 0.", - checker_function=lambda value: value in "Dd0Ff1Tt", - equate=False, - ), - # Extension options: - _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), - # Miscellaneous options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - ] - _NcbiblastCommandline.__init__(self, cmd, **kwargs) - - -class NcbiblastformatterCommandline(_NcbibaseblastCommandline): - """Wrapper for the NCBI BLAST+ program blast_formatter. - - With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++ - instead of C), the NCBI added the ASN.1 output format option to all the - search tools, and extended the blast_formatter to support this as input. - - The blast_formatter command allows you to convert the ASN.1 output into - the other output formats (XML, tabular, plain text, HTML). 
- - >>> from Bio.Blast.Applications import NcbiblastformatterCommandline - >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml") - >>> cline - NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn') - >>> print(cline) - blast_formatter -out example.xml -outfmt 5 -archive example.asn - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - - Note that this wrapper is for the version of blast_formatter from BLAST - 2.2.24+ (or later) which is when the NCBI first announced the inclusion - this tool. There was actually an early version in BLAST 2.2.23+ (and - possibly in older releases) but this did not have the -archive option - (instead -rid is a mandatory argument), and is not supported by this - wrapper. - """ - - def __init__(self, cmd="blast_formatter", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Input options - _Option( - ["-rid", "rid"], - "BLAST Request ID (RID), not compatible with archive arg.", - equate=False, - ), - _Option( - ["-archive", "archive"], - "Archive file of results, not compatible with rid arg.", - filename=True, - equate=False, - ), - # Restrict search or results - _Option( - ["-max_target_seqs", "max_target_seqs"], - "Maximum number of aligned sequences to keep.", - checker_function=lambda value: value >= 1, - equate=False, - ), - ] - _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) - - def _validate(self): - incompatibles = {"rid": ["archive"]} - self._validate_incompatibilities(incompatibles) - _NcbibaseblastCommandline._validate(self) - - -class NcbideltablastCommandline(_Ncbiblast2SeqCommandline): - """Create a commandline for the NCBI BLAST+ program deltablast (for proteins). - - This is a wrapper for the deltablast command line command included in - the NCBI BLAST+ software (not present in the original BLAST). 
- - >>> from Bio.Blast.Applications import NcbideltablastCommandline - >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr", - ... evalue=0.001, remote=True) - >>> cline - NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True) - >>> print(cline) - deltablast -query rosemary.pro -db nr -evalue 0.001 -remote - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="deltablast", **kwargs): - """Initialize the class.""" - self.parameters = [ - # General search options: - _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), - _Option( - ["-threshold", "threshold"], - "Minimum score for words to be added to the BLAST lookup table (float).", - equate=False, - ), - _Option( - ["-comp_based_stats", "comp_based_stats"], - "Use composition-based statistics (string, default 2, i.e. True).\n\n" - "0, F or f: no composition-based statistics.\n\n" - "2, T or t, D or d : Composition-based score adjustment as in " - "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" - "Note that tblastn also supports values of 1 and 3.", - checker_function=lambda value: value in "0Ft2TtDd", - equate=False, - ), - # Query filtering options: - _Option( - ["-seg", "seg"], - "Filter query sequence with SEG (string).\n\n" - 'Format: "yes", "window locut hicut", or "no" to disable. ' - 'Default is "12 2.2 2.5"', - equate=False, - ), - # Extension options: - _Option( - ["-gap_trigger", "gap_trigger"], - "Number of bits to trigger gapping. Default = 22.", - equate=False, - ), - # Miscellaneous options: - _Switch( - ["-use_sw_tback", "use_sw_tback"], - "Compute locally optimal Smith-Waterman alignments?", - ), - # PSI-BLAST options - _Option( - ["-num_iterations", "num_iterations"], - "Number of iterations to perform. 
(integer >=1, Default is 1).\n\n" - "Incompatible with: remote", - equate=False, - ), - _Option( - ["-out_pssm", "out_pssm"], - "File name to store checkpoint file.", - filename=True, - equate=False, - ), - _Option( - ["-out_ascii_pssm", "out_ascii_pssm"], - "File name to store ASCII version of PSSM.", - filename=True, - equate=False, - ), - _Switch( - ["-save_pssm_after_last_round", "save_pssm_after_last_round"], - "Save PSSM after the last database search.", - ), - _Switch( - ["-save_each_pssm", "save_each_pssm"], - "Save PSSM after each iteration.\n\n" - "File name is given in -save_pssm or -save_ascii_pssm options.", - ), - # PSSM engine options - _Option( - ["-pseudocount", "pseudocount"], - "Pseudo-count value used when constructing PSSM (integer, default 0).", - equate=False, - ), - _Option( - ["-domain_inclusion_ethresh", "domain_inclusion_ethresh"], - "E-value inclusion threshold for alignments with conserved domains.\n\n" - "(float, Default is 0.05)", - equate=False, - ), - _Option( - ["-inclusion_ethresh", "inclusion_ethresh"], - "Pairwise alignment e-value inclusion threshold (float, default 0.002).", - equate=False, - ), - # DELTA-BLAST options - _Option( - ["-rpsdb", "rpsdb"], - "BLAST domain database name (dtring, Default = 'cdd_delta').", - equate=False, - ), - _Switch( - ["-show_domain_hits", "show_domain_hits"], - "Show domain hits?\n\nIncompatible with: remote, subject", - ), - ] - _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) - - -class NcbimakeblastdbCommandline(AbstractCommandline): - """Wrapper for the NCBI BLAST+ program makeblastdb. - - This is a wrapper for the NCBI BLAST+ makeblastdb application - to create BLAST databases. By default, this creates a blast database - with the same name as the input file. The default output location - is the same directory as the input. - - >>> from Bio.Blast.Applications import NcbimakeblastdbCommandline - >>> cline = NcbimakeblastdbCommandline(dbtype="prot", - ... 
input_file="NC_005816.faa") - >>> cline - NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa') - >>> print(cline) - makeblastdb -dbtype prot -in NC_005816.faa - - You would typically run the command line with cline() or via the Python - subprocess module, as described in the Biopython tutorial. - """ - - def __init__(self, cmd="makeblastdb", **kwargs): - """Initialize the class.""" - self.parameters = [ - # Basic input options - _Switch( - ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments." - ), - _Switch( - ["-help", "help"], - "Print USAGE, DESCRIPTION and ARGUMENTS description; " - "ignore other arguments.", - ), - _Switch( - ["-version", "version"], - "Print version number; ignore other arguments.", - ), - # Output configuration options - _Option( - ["-out", "out"], - "Output file for alignment.", - filename=True, - equate=False, - ), - # makeblastdb specific options - _Option( - ["-blastdb_version", "blastdb_version"], - "Version of BLAST database to be created. " - "Tip: use BLAST database version 4 on 32 bit CPU. " - "Default = 5", - equate=False, - checker_function=lambda x: x == 4 or x == 5, - ), - _Option( - ["-dbtype", "dbtype"], - "Molecule type of target db ('nucl' or 'prot').", - equate=False, - is_required=True, - checker_function=lambda x: x == "nucl" or x == "prot", - ), - _Option( - ["-in", "input_file"], - "Input file/database name.", - filename=True, - equate=False, - ), - _Option( - ["-input_type", "input_type"], - "Type of the data specified in input_file.\n\n" - "Default = 'fasta'. 
Added in BLAST 2.2.26.", - filename=False, - equate=False, - checker_function=self._input_type_checker, - ), - _Option( - ["-title", "title"], - "Title for BLAST database.", - filename=False, - equate=False, - ), - _Switch( - ["-parse_seqids", "parse_seqids"], - "Option to parse seqid for FASTA input if set.\n\n" - "For all other input types, seqids are parsed automatically", - ), - _Switch( - ["-hash_index", "hash_index"], "Create index of sequence hash values." - ), - _Option( - ["-mask_data", "mask_data"], - "Comma-separated list of input files containing masking " - "data as produced by NCBI masking applications " - "(e.g. dustmasker, segmasker, windowmasker).", - filename=True, - equate=False, - ), - _Option( - ["-mask_id", "mask_id"], - "Comma-separated list of strings to uniquely identify the " - "masking algorithm.", - filename=False, - equate=False, - ), - _Option( - ["-mask_desc", "mask_desc"], - "Comma-separated list of free form strings to describe " - "the masking algorithm details.", - filename=False, - equate=False, - ), - _Switch(["-gi_mask", "gi_mask"], "Create GI indexed masking data."), - _Option( - ["-gi_mask_name", "gi_mask_name"], - "Comma-separated list of masking data output files.", - filename=False, - equate=False, - ), - _Option( - ["-max_file_sz", "max_file_sz"], - "Maximum file size for BLAST database files. 
Default = '1GB'.", - filename=False, - equate=False, - ), - _Option( - ["-logfile", "logfile"], - "File to which the program log should be redirected.", - filename=True, - equate=False, - ), - _Option( - ["-taxid", "taxid"], - "Taxonomy ID to assign to all sequences.", - filename=False, - equate=False, - checker_function=lambda x: type(x)(int(x)) == x, - ), - _Option( - ["-taxid_map", "taxid_map"], - "Text file mapping sequence IDs to taxonomy IDs.\n\n" - "Format: ", - filename=True, - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - def _input_type_checker(self, command): - return command in ("asn1_bin", "asn1_txt", "blastdb", "fasta") - - def _validate(self): - incompatibles = { - "mask_id": ["gi_mask"], - "gi_mask": ["mask_id"], - "taxid": ["taxid_map"], - } - - # Copied from _NcbibaseblastCommandline class above. - # Code repeated here for python2 and 3 compatibility, - # because this is not a _NcbibaseblastCommandline subclass. - for a in incompatibles: - if self._get_parameter(a): - for b in incompatibles[a]: - if self._get_parameter(b): - raise ValueError(f"Options {a} and {b} are incompatible.") - - if self.mask_id and not self.mask_data: - raise ValueError("Option mask_id requires mask_data to be set.") - if self.mask_desc and not self.mask_id: - raise ValueError("Option mask_desc requires mask_id to be set.") - if self.gi_mask and not self.parse_seqids: - raise ValueError("Option gi_mask requires parse_seqids to be set.") - if self.gi_mask_name and not (self.mask_data and self.gi_mask): - raise ValueError( - "Option gi_mask_name requires mask_data and gi_mask to be set." 
- ) - if self.taxid_map and not self.parse_seqids: - raise ValueError("Option taxid_map requires parse_seqids to be set.") - AbstractCommandline._validate(self) - - -def _test(): - """Run the Bio.Blast.Applications module's doctests (PRIVATE).""" - import doctest - - doctest.testmod(verbose=1) - - -if __name__ == "__main__": - # Run the doctests - _test() diff --git a/Bio/Emboss/Applications.py b/Bio/Emboss/Applications.py deleted file mode 100644 index 5d0e63b43..000000000 --- a/Bio/Emboss/Applications.py +++ /dev/null @@ -1,1218 +0,0 @@ -# Copyright 2001-2009 Brad Chapman. -# Revisions copyright 2009-2016 by Peter Cock. -# Revisions copyright 2009 by David Winter. -# Revisions copyright 2009-2010 by Leighton Pritchard. -# All rights reserved. -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Code to interact with and run various EMBOSS programs (OBSOLETE). - -These classes follow the AbstractCommandline interfaces for running -programs. - -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class _EmbossMinimalCommandLine(AbstractCommandline): - """Base Commandline object for EMBOSS wrappers (PRIVATE). - - This is provided for subclassing, it deals with shared options - common to all the EMBOSS tools: - - Attributes: - - auto Turn off prompts - - stdout Write standard output - - filter Read standard input, write standard output - - options Prompt for standard and additional values - - debug Write debug output to program.dbg - - verbose Report some/full command line options - - help Report command line options. 
More - information on associated and general - qualifiers can be found with -help -verbose - - warning Report warnings - - error Report errors - - fatal Report fatal errors - - die Report dying program messages - - """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - _Switch( - ["-auto", "auto"], - "Turn off prompts.\n\n" - "Automatic mode disables prompting, so we recommend you set this " - "argument all the time when calling an EMBOSS tool from Biopython.", - ), - _Switch(["-stdout", "stdout"], "Write standard output."), - _Switch( - ["-filter", "filter"], "Read standard input, write standard output." - ), - _Switch( - ["-options", "options"], - "Prompt for standard and additional values.\n\n" - "If you are calling an EMBOSS tool from within Biopython, " - "we DO NOT recommend using this option.", - ), - _Switch(["-debug", "debug"], "Write debug output to program.dbg."), - _Switch(["-verbose", "verbose"], "Report some/full command line options"), - _Switch( - ["-help", "help"], - "Report command line options.\n\n" - "More information on associated and general qualifiers " - "can be found with -help -verbose", - ), - _Switch(["-warning", "warning"], "Report warnings."), - _Switch(["-error", "error"], "Report errors."), - _Switch(["-die", "die"], "Report dying program messages."), - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! - self.parameters = extra_parameters - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class _EmbossCommandLine(_EmbossMinimalCommandLine): - """Base Commandline object for EMBOSS wrappers (PRIVATE). 
- - This is provided for subclassing, it deals with shared options - common to all the EMBOSS tools plus: - - - outfile Output filename - - """ - - def __init__(self, cmd=None, **kwargs): - assert cmd is not None - extra_parameters = [ - _Option(["-outfile", "outfile"], "Output filename", filename=True) - ] - try: - # Insert extra parameters - at the start just in case there - # are any arguments which must come last: - self.parameters = extra_parameters + self.parameters - except AttributeError: - # Should we raise an error? The subclass should have set this up! - self.parameters = extra_parameters - _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs) - - def _validate(self): - # Check the outfile, filter, or stdout option has been set. - # We can't simply do this via the required flag for the outfile - # output - this seems the simplest solution. - if not (self.outfile or self.filter or self.stdout): - raise ValueError( - "You must either set outfile (output filename), " - "or enable filter or stdout (output to stdout)." - ) - return _EmbossMinimalCommandLine._validate(self) - - -class Primer3Commandline(_EmbossCommandLine): - """Commandline object for the Primer3 interface from EMBOSS. - - The precise set of supported arguments depends on your version of EMBOSS. - This version accepts arguments current at EMBOSS 6.1.0: - - >>> cline = Primer3Commandline(sequence="mysequence.fas", auto=True, hybridprobe=True) - >>> cline.explainflag = True - >>> cline.osizeopt=20 - >>> cline.psizeopt=200 - >>> cline.outfile = "myresults.out" - >>> cline.bogusparameter = 1967 # Invalid parameter - Traceback (most recent call last): - ... - ValueError: Option name bogusparameter was not found. 
- >>> print(cline) - eprimer3 -auto -outfile=myresults.out -sequence=mysequence.fas -hybridprobe=True -psizeopt=200 -osizeopt=20 -explainflag=True - - """ - - def __init__(self, cmd="eprimer3", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "Sequence to choose primers from.", - is_required=True, - ), - _Option(["-task", "task"], "Tell eprimer3 what task to perform."), - _Option( - ["-hybridprobe", "hybridprobe"], - "Find an internal oligo to use as a hyb probe.", - ), - _Option( - ["-numreturn", "numreturn"], "Maximum number of primer pairs to return." - ), - _Option( - ["-includedregion", "includedregion"], - "Subregion of the sequence in which to pick primers.", - ), - _Option(["-target", "target"], "Sequence to target for flanking primers."), - _Option( - ["-excludedregion", "excludedregion"], - "Regions to exclude from primer picking.", - ), - _Option( - ["-forwardinput", "forwardinput"], - "Sequence of a forward primer to check.", - ), - _Option( - ["-reverseinput", "reverseinput"], - "Sequence of a reverse primer to check.", - ), - _Option( - ["-gcclamp", "gcclamp"], - "The required number of Gs and Cs at the 3' of each primer.", - ), - _Option(["-osize", "osize"], "Optimum length of a primer oligo."), - _Option(["-minsize", "minsize"], "Minimum length of a primer oligo."), - _Option(["-maxsize", "maxsize"], "Maximum length of a primer oligo."), - _Option( - ["-otm", "otm"], - "Melting temperature for primer oligo (OBSOLETE).\n\n" - "Option replaced in EMBOSS 6.6.0 by -opttm", - ), - _Option( - ["-opttm", "opttm"], - "Optimum melting temperature for a primer oligo.\n\n" - "Option added in EMBOSS 6.6.0, replacing -otm", - ), - _Option( - ["-mintm", "mintm"], "Minimum melting temperature for a primer oligo." - ), - _Option( - ["-maxtm", "maxtm"], "Maximum melting temperature for a primer oligo." 
- ), - _Option( - ["-maxdifftm", "maxdifftm"], - "Maximum difference in melting temperatures between " - "forward and reverse primers.", - ), - _Option(["-ogcpercent", "ogcpercent"], "Optimum GC% for a primer."), - _Option(["-mingc", "mingc"], "Minimum GC% for a primer."), - _Option(["-maxgc", "maxgc"], "Maximum GC% for a primer."), - _Option( - ["-saltconc", "saltconc"], "Millimolar salt concentration in the PCR." - ), - _Option( - ["-dnaconc", "dnaconc"], - "Nanomolar concentration of annealing oligos in the PCR.", - ), - _Option( - ["-maxpolyx", "maxpolyx"], - "Maximum allowable mononucleotide repeat length in a primer.", - ), - # Primer length: - _Option(["-psizeopt", "psizeopt"], "Optimum size for the PCR product."), - _Option( - ["-prange", "prange"], "Acceptable range of length for the PCR product." - ), - # Primer temperature: - _Option( - ["-ptmopt", "ptmopt"], - "Optimum melting temperature for the PCR product.", - ), - _Option( - ["-ptmmin", "ptmmin"], - "Minimum allowed melting temperature for the amplicon.", - ), - _Option( - ["-ptmmax", "ptmmax"], - "Maximum allowed melting temperature for the amplicon.", - ), - # Note to self, should be -oexcludedregion not -oexcluderegion - _Option( - ["-oexcludedregion", "oexcludedregion"], - "Do not pick internal oligos in this region.", - ), - _Option(["-oligoinput", "oligoinput"], "Sequence of the internal oligo."), - # Oligo length: - _Option(["-osizeopt", "osizeopt"], "Optimum length of internal oligo."), - _Option(["-ominsize", "ominsize"], "Minimum length of internal oligo."), - _Option(["-omaxsize", "omaxsize"], "Maximum length of internal oligo."), - # Oligo GC temperature: - _Option( - ["-otmopt", "otmopt"], "Optimum melting temperature of internal oligo." - ), - _Option( - ["-otmmin", "otmmin"], "Minimum melting temperature of internal oligo." - ), - _Option( - ["-otmmax", "otmmax"], "Maximum melting temperature of internal oligo." 
- ), - # Oligo GC percent: - _Option(["-ogcopt", "ogcopt"], "Optimum GC% for internal oligo."), - _Option(["-ogcmin", "ogcmin"], "Minimum GC% for internal oligo."), - _Option(["-ogcmax", "ogcmax"], "Maximum GC% for internal oligo."), - # Oligo salt concentration: - _Option( - ["-osaltconc", "osaltconc"], - "Millimolar concentration of salt in the hybridisation.", - ), - _Option( - ["-odnaconc", "odnaconc"], - "Nanomolar concentration of internal oligo in the hybridisation.", - ), - # Oligo self complementarity - _Option( - ["-oanyself", "oanyself"], - "Maximum allowable alignment score for self-complementarity.", - ), - _Option( - ["-oendself", "oendself"], - "Max 3'-anchored self-complementarity global alignment score.", - ), - _Option( - ["-opolyxmax", "opolyxmax"], - "Maximum length of mononucleotide repeat in internal oligo.", - ), - _Option( - ["-mispriminglibraryfile", "mispriminglibraryfile"], - "File containing library of sequences to avoid amplifying", - ), - _Option( - ["-maxmispriming", "maxmispriming"], - "Maximum allowed similarity of primers to sequences in " - "library specified by -mispriminglibrary", - ), - _Option( - ["-omishybmax", "omishybmax"], - "Maximum alignment score for hybridisation of internal oligo to " - "library specified by -mishyblibraryfile.", - ), - _Option( - ["-mishyblibraryfile", "mishyblibraryfile"], - "Library file of seqs to avoid internal oligo hybridisation.", - ), - _Option( - ["-explainflag", "explainflag"], - "Produce output tags with eprimer3 statistics", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class PrimerSearchCommandline(_EmbossCommandLine): - """Commandline object for the primersearch program from EMBOSS.""" - - def __init__(self, cmd="primersearch", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-seqall", "-sequences", "sequences", "seqall"], - "Sequence to look for the primer pairs in.", - is_required=True, - ), - # When this wrapper was written primersearch 
used -sequences - # as the argument name. Since at least EMBOSS 5.0 (and - # perhaps earlier) this has been -seqall instead. - _Option( - ["-infile", "-primers", "primers", "infile"], - "File containing the primer pairs to search for.", - filename=True, - is_required=True, - ), - # When this wrapper was written primersearch used -primers - # as the argument name. Since at least EMBOSS 5.0 (and - # perhaps earlier) this has been -infile instead. - _Option( - ["-mismatchpercent", "mismatchpercent"], - "Allowed percentage mismatch (any integer value, default 0).", - is_required=True, - ), - _Option( - ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" - ), - _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FDNADistCommandline(_EmbossCommandLine): - """Commandline object for the fdnadist program from EMBOSS. - - fdnadist is an EMBOSS wrapper for the PHYLIP program dnadist for - calculating distance matrices from DNA sequence files. - """ - - def __init__(self, cmd="fdnadist", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "seq file to use (phylip)", - filename=True, - is_required=True, - ), - _Option(["-method", "method"], "sub. 
model [f,k,j,l,s]", is_required=True), - _Option(["-gamma", "gamma"], "gamma [g, i,n]"), - _Option(["-ncategories", "ncategories"], "number of rate categories (1-9)"), - _Option(["-rate", "rate"], "rate for each category"), - _Option( - ["-categories", "categories"], "File of substitution rate categories" - ), - _Option(["-weights", "weights"], "weights file"), - _Option( - ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" - ), - _Option(["-invarfrac", "invarfrac"], "proportoin of invariant sites"), - _Option(["-ttratio", "ttratio"], "ts/tv ratio"), - _Option(["-freqsfrom", "freqsfrom"], "use empirical base freqs"), - _Option(["-basefreq", "basefreq"], "specify basefreqs"), - _Option(["-lower", "lower"], "lower triangle matrix (y/N)"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FTreeDistCommandline(_EmbossCommandLine): - """Commandline object for the ftreedist program from EMBOSS. - - ftreedist is an EMBOSS wrapper for the PHYLIP program treedist used for - calculating distance measures between phylogentic trees. - """ - - def __init__(self, cmd="ftreedist", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-intreefile", "intreefile"], - "tree file to score (phylip)", - filename=True, - is_required=True, - ), - _Option(["-dtype", "dtype"], "distance type ([S]ymetric, [b]ranch score)"), - _Option( - ["-pairing", "pairing"], - "tree pairing method ([A]djacent pairs, all [p]ossible pairs)", - ), - _Option(["-style", "style"], "output style - [V]erbose, [f]ill, [s]parse"), - _Option(["-noroot", "noroot"], "treat trees as rooted [N/y]"), - _Option( - ["-outgrno", "outgrno"], - "which taxon to root the trees with (starts from 0)", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FNeighborCommandline(_EmbossCommandLine): - """Commandline object for the fneighbor program from EMBOSS. 
- - fneighbor is an EMBOSS wrapper for the PHYLIP program neighbor used for - calculating neighbor-joining or UPGMA trees from distance matrices. - """ - - def __init__(self, cmd="fneighbor", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-datafile", "datafile"], - "dist file to use (phylip)", - filename=True, - is_required=True, - ), - _Option( - ["-matrixtype", "matrixtype"], - "is matrix square (S), upper (U) or lower (L)", - ), - _Option(["-treetype", "treetype"], "nj or UPGMA tree (n/u)"), - _Option(["-outgrno", "outgrno"], "taxon to use as OG"), - _Option(["-jumble", "jumble"], "randommise input order (Y/n)"), - _Option(["-seed", "seed"], "provide a random seed"), - _Option(["-trout", "trout"], "write tree (Y/n)"), - _Option(["-outtreefile", "outtreefile"], "filename for output tree"), - _Option(["-progress", "progress"], "print progress (Y/n)"), - _Option(["-treeprint", "treeprint"], "print tree (Y/n)"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FSeqBootCommandline(_EmbossCommandLine): - """Commandline object for the fseqboot program from EMBOSS. - - fseqboot is an EMBOSS wrapper for the PHYLIP program seqboot used to - pseudo-sample alignment files. 
- """ - - def __init__(self, cmd="fseqboot", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "seq file to sample (phylip)", - filename=True, - is_required=True, - ), - _Option(["-categories", "catergories"], "file of input categories"), - _Option(["-weights", "weights"], " weights file"), - _Option(["-test", "test"], "specify operation, default is bootstrap"), - _Option(["-regular", "regular"], "absolute number to resample"), - _Option(["-fracsample", "fracsample"], "fraction to resample"), - _Option( - ["-rewriteformat", "rewriteformat"], - "output format ([P]hyilp, [n]exus, [x]ml", - ), - _Option(["-seqtype", "seqtype"], "output format ([D]na, [p]rotein, [r]na"), - _Option(["-blocksize", "blocksize"], "print progress (Y/n)"), - _Option(["-reps", "reps"], "how many replicates, defaults to 100)"), - _Option( - ["-justweights", "jusweights"], - "what to write out [D]atasets of just [w]eights", - ), - _Option(["-seed", "seed"], "specify random seed"), - _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FDNAParsCommandline(_EmbossCommandLine): - """Commandline object for the fdnapars program from EMBOSS. - - fdnapars is an EMBOSS version of the PHYLIP program dnapars, for - estimating trees from DNA sequences using parsiomny. Calling this command - without providing a value for the option "-intreefile" will invoke - "interactive mode" (and as a result fail if called with subprocess) if - "-auto" is not set to true. 
- """ - - def __init__(self, cmd="fdnapars", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "seq file to use (phylip)", - filename=True, - is_required=True, - ), - _Option(["-intreefile", "intreefile"], "Phylip tree file"), - _Option(["-weights", "weights"], "weights file"), - _Option(["-maxtrees", "maxtrees"], "max trees to save during run"), - _Option(["-thorough", "thorough"], "more thorough search (Y/n)"), - _Option(["-rearrange", "rearrange"], "Rearrange on just 1 best tree (Y/n)"), - _Option( - ["-transversion", "transversion"], "Use tranversion parsimony (y/N)" - ), - _Option( - ["-njumble", "njumble"], - "number of times to randomise input order (default is 0)", - ), - _Option(["-seed", "seed"], "provide random seed"), - _Option(["-outgrno", "outgrno"], "Specify outgroup"), - _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), - _Option(["-threshold", "threshold"], "Threshold value"), - _Option(["-trout", "trout"], "Write trees to file (Y/n)"), - _Option(["-outtreefile", "outtreefile"], "filename for output tree"), - _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FProtParsCommandline(_EmbossCommandLine): - """Commandline object for the fdnapars program from EMBOSS. - - fprotpars is an EMBOSS version of the PHYLIP program protpars, for - estimating trees from protein sequences using parsiomny. Calling this - command without providing a value for the option "-intreefile" will invoke - "interactive mode" (and as a result fail if called with subprocess) if - "-auto" is not set to true. 
- """ - - def __init__(self, cmd="fprotpars", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "seq file to use (phylip)", - filename=True, - is_required=True, - ), - _Option(["-intreefile", "intreefile"], "Phylip tree file to score"), - _Option( - ["-outtreefile", "outtreefile"], - "phylip tree output file", - filename=True, - is_required=True, - ), - _Option(["-weights", "weights"], "weights file"), - _Option(["-whichcode", "whichcode"], "which genetic code, [U,M,V,F,Y]]"), - _Option( - ["-njumble", "njumble"], - "number of times to randomise input order (default is 0)", - ), - _Option(["-seed", "seed"], "provide random seed"), - _Option(["-outgrno", "outgrno"], "Specify outgroup"), - _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), - _Option(["-threshold", "threshold"], "Threshold value"), - _Option(["-trout", "trout"], "Write trees to file (Y/n)"), - _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FProtDistCommandline(_EmbossCommandLine): - """Commandline object for the fprotdist program from EMBOSS. - - fprotdist is an EMBOSS wrapper for the PHYLIP program protdist used to - estimate trees from protein sequences using parsimony - """ - - def __init__(self, cmd="fprotdist", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "seq file to use (phylip)", - filename=True, - is_required=True, - ), - _Option(["-ncategories", "ncategories"], "number of rate categories (1-9)"), - _Option(["-rate", "rate"], "rate for each category"), - _Option(["-catergories", "catergories"], "file of rates"), - _Option(["-weights", "weights"], "weights file"), - _Option(["-method", "method"], "sub. 
model [j,h,d,k,s,c]"), - _Option(["-gamma", "gamma"], "gamma [g, i,c]"), - _Option( - ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" - ), - _Option( - ["-invarcoefficient", "invarcoefficient"], - "float for variation of substitution rate among sites", - ), - _Option(["-aacateg", "aacateg"], "Choose the category to use [G,C,H]"), - _Option(["-whichcode", "whichcode"], "genetic code [c,m,v,f,y]"), - _Option(["-ease", "ease"], "Pob change category (float between -0 and 1)"), - _Option(["-ttratio", "ttratio"], "Transition/transversion ratio (0-1)"), - _Option( - ["-basefreq", "basefreq"], "DNA base frequencies (space separated list)" - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FConsenseCommandline(_EmbossCommandLine): - """Commandline object for the fconsense program from EMBOSS. - - fconsense is an EMBOSS wrapper for the PHYLIP program consense used to - calculate consensus trees. - """ - - def __init__(self, cmd="fconsense", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-intreefile", "intreefile"], - "file with phylip trees to make consensus from", - filename=True, - is_required=True, - ), - _Option(["-method", "method"], "consensus method [s, mr, MRE, ml]"), - _Option( - ["-mlfrac", "mlfrac"], - "cut-off freq for branch to appear in consensus (0.5-1.0)", - ), - _Option(["-root", "root"], "treat trees as rooted (YES, no)"), - _Option(["-outgrno", "outgrno"], "OTU to use as outgroup (starts from 0)"), - _Option(["-trout", "trout"], "treat trees as rooted (YES, no)"), - _Option( - ["-outtreefile", "outtreefile"], "Phylip tree output file (optional)" - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class WaterCommandline(_EmbossCommandLine): - """Commandline object for the water program from EMBOSS.""" - - def __init__(self, cmd="water", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "First sequence to align", - 
filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Second sequence to align", - filename=True, - is_required=True, - ), - _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), - _Option( - ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True - ), - _Option(["-datafile", "datafile"], "Matrix file", filename=True), - _Switch( - ["-nobrief", "nobrief"], "Display extended identity and similarity" - ), - _Switch(["-brief", "brief"], "Display brief identity and similarity"), - _Option( - ["-similarity", "similarity"], "Display percent identity and similarity" - ), - _Option( - ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" - ), - _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), - _Option( - ["-aformat", "aformat"], - "Display output in a different specified output format", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class NeedleCommandline(_EmbossCommandLine): - """Commandline object for the needle program from EMBOSS.""" - - def __init__(self, cmd="needle", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "First sequence to align", - filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Second sequence to align", - filename=True, - is_required=True, - ), - _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), - _Option( - ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True - ), - _Option(["-datafile", "datafile"], "Matrix file", filename=True), - _Option(["-endweight", "endweight"], "Apply And gap penalties"), - _Option( - ["-endopen", "endopen"], - "The score taken away when an end gap is created.", - ), - _Option( - ["-endextend", "endextend"], - "The score added to the end gap penalty for each base or " - "residue in the end gap.", - ), - _Switch( - ["-nobrief", "nobrief"], "Display extended identity and 
similarity" - ), - _Switch(["-brief", "brief"], "Display brief identity and similarity"), - _Option( - ["-similarity", "similarity"], "Display percent identity and similarity" - ), - _Option( - ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" - ), - _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), - _Option( - ["-aformat", "aformat"], - "Display output in a different specified output format", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class NeedleallCommandline(_EmbossCommandLine): - """Commandline object for the needleall program from EMBOSS.""" - - def __init__(self, cmd="needleall", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "First sequence to align", - filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Second sequence to align", - filename=True, - is_required=True, - ), - _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), - _Option( - ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True - ), - _Option(["-datafile", "datafile"], "Matrix file", filename=True), - _Option( - ["-minscore", "minscore"], - "Exclude alignments with scores below this threshold score.", - ), - _Option(["-errorfile", "errorfile"], "Error file to be written to."), - _Option(["-endweight", "endweight"], "Apply And gap penalties"), - _Option( - ["-endopen", "endopen"], - "The score taken away when an end gap is created.", - ), - _Option( - ["-endextend", "endextend"], - "The score added to the end gap penalty for each base or " - "residue in the end gap.", - ), - _Switch( - ["-nobrief", "nobrief"], "Display extended identity and similarity" - ), - _Switch(["-brief", "brief"], "Display brief identity and similarity"), - _Option( - ["-similarity", "similarity"], "Display percent identity and similarity" - ), - _Option( - ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" - ), 
- _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), - _Option( - ["-aformat", "aformat"], - "Display output in a different specified output format", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class StretcherCommandline(_EmbossCommandLine): - """Commandline object for the stretcher program from EMBOSS.""" - - def __init__(self, cmd="stretcher", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "First sequence to align", - filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Second sequence to align", - filename=True, - is_required=True, - ), - _Option( - ["-gapopen", "gapopen"], - "Gap open penalty", - is_required=True, - checker_function=lambda value: isinstance(value, int), - ), - _Option( - ["-gapextend", "gapextend"], - "Gap extension penalty", - is_required=True, - checker_function=lambda value: isinstance(value, int), - ), - _Option(["-datafile", "datafile"], "Matrix file", filename=True), - _Option( - ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" - ), - _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), - _Option( - ["-aformat", "aformat"], - "Display output in a different specified output format", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FuzznucCommandline(_EmbossCommandLine): - """Commandline object for the fuzznuc program from EMBOSS.""" - - def __init__(self, cmd="fuzznuc", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Sequence database USA", is_required=True - ), - _Option( - ["-pattern", "pattern"], - "Search pattern, using standard IUPAC one-letter codes", - is_required=True, - ), - _Option(["-pmismatch", "pmismatch"], "Number of mismatches"), - _Option(["-complement", "complement"], "Search complementary strand"), - _Option(["-rformat", "rformat"], "Specify the report format to output in."), 
- ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class FuzzproCommandline(_EmbossCommandLine): - """Commandline object for the fuzzpro program from EMBOSS.""" - - def __init__(self, cmd="fuzzpro", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Sequence database USA", is_required=True - ), - _Option( - ["-pattern", "pattern"], - "Search pattern, using standard IUPAC one-letter codes", - is_required=True, - ), - _Option(["-pmismatch", "pmismatch"], "Number of mismatches"), - _Option(["-rformat", "rformat"], "Specify the report format to output in."), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class Est2GenomeCommandline(_EmbossCommandLine): - """Commandline object for the est2genome program from EMBOSS.""" - - def __init__(self, cmd="est2genome", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option(["-est", "est"], "EST sequence(s)", is_required=True), - _Option(["-genome", "genome"], "Genomic sequence", is_required=True), - _Option(["-match", "match"], "Score for matching two bases"), - _Option(["-mismatch", "mismatch"], "Cost for mismatching two bases"), - _Option( - ["-gappenalty", "gappenalty"], - "Cost for deleting a single base in either sequence, " - "excluding introns", - ), - _Option( - ["-intronpenalty", "intronpenalty"], - "Cost for an intron, independent of length.", - ), - _Option( - ["-splicepenalty", "splicepenalty"], - "Cost for an intron, independent of length " - "and starting/ending on donor-acceptor sites", - ), - _Option( - ["-minscore", "minscore"], - "Exclude alignments with scores below this threshold score.", - ), - _Option( - ["-reverse", "reverse"], "Reverse the orientation of the EST sequence" - ), - _Option(["-splice", "splice"], "Use donor and acceptor splice sites."), - _Option( - ["-mode", "mode"], - "This determines the comparison mode. 
'both', 'forward', or 'reverse'", - ), - _Option( - ["-best", "best"], - "You can print out all comparisons instead of just the best", - ), - _Option(["-space", "space"], "for linear-space recursion."), - _Option(["-shuffle", "shuffle"], "Shuffle"), - _Option(["-seed", "seed"], "Random number seed"), - _Option(["-align", "align"], "Show the alignment."), - _Option(["-width", "width"], "Alignment width"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class ETandemCommandline(_EmbossCommandLine): - """Commandline object for the etandem program from EMBOSS.""" - - def __init__(self, cmd="etandem", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Sequence", filename=True, is_required=True - ), - _Option( - ["-minrepeat", "minrepeat"], "Minimum repeat size", is_required=True - ), - _Option( - ["-maxrepeat", "maxrepeat"], "Maximum repeat size", is_required=True - ), - _Option(["-threshold", "threshold"], "Threshold score"), - _Option(["-mismatch", "mismatch"], "Allow N as a mismatch"), - _Option(["-uniform", "uniform"], "Allow uniform consensus"), - _Option(["-rformat", "rformat"], "Output report format"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class EInvertedCommandline(_EmbossCommandLine): - """Commandline object for the einverted program from EMBOSS.""" - - def __init__(self, cmd="einverted", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Sequence", filename=True, is_required=True - ), - _Option(["-gap", "gap"], "Gap penalty", filename=True, is_required=True), - _Option( - ["-threshold", "threshold"], "Minimum score threshold", is_required=True - ), - _Option(["-match", "match"], "Match score", is_required=True), - _Option(["-mismatch", "mismatch"], "Mismatch score", is_required=True), - _Option( - ["-maxrepeat", "maxrepeat"], - "Maximum separation between the start and end of repeat", - ), - ] - 
_EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class PalindromeCommandline(_EmbossCommandLine): - """Commandline object for the palindrome program from EMBOSS.""" - - def __init__(self, cmd="palindrome", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Sequence", filename=True, is_required=True - ), - _Option( - ["-minpallen", "minpallen"], - "Minimum palindrome length", - is_required=True, - ), - _Option( - ["-maxpallen", "maxpallen"], - "Maximum palindrome length", - is_required=True, - ), - _Option( - ["-gaplimit", "gaplimit"], - "Maximum gap between repeats", - is_required=True, - ), - _Option( - ["-nummismatches", "nummismatches"], - "Number of mismatches allowed", - is_required=True, - ), - _Option( - ["-overlap", "overlap"], "Report overlapping matches", is_required=True - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class TranalignCommandline(_EmbossCommandLine): - """Commandline object for the tranalign program from EMBOSS.""" - - def __init__(self, cmd="tranalign", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "Nucleotide sequences to be aligned.", - filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Protein sequence alignment", - filename=True, - is_required=True, - ), - _Option( - ["-outseq", "outseq"], - "Output sequence file.", - filename=True, - is_required=True, - ), - _Option(["-table", "table"], "Code to use"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class DiffseqCommandline(_EmbossCommandLine): - """Commandline object for the diffseq program from EMBOSS.""" - - def __init__(self, cmd="diffseq", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-asequence", "asequence"], - "First sequence to compare", - filename=True, - is_required=True, - ), - _Option( - ["-bsequence", "bsequence"], - "Second sequence to compare", - 
filename=True, - is_required=True, - ), - _Option( - ["-wordsize", "wordsize"], - "Word size to use for comparisons (10 default)", - is_required=True, - ), - _Option( - ["-aoutfeat", "aoutfeat"], - "File for output of first sequence's features", - filename=True, - is_required=True, - ), - _Option( - ["-boutfeat", "boutfeat"], - "File for output of second sequence's features", - filename=True, - is_required=True, - ), - _Option(["-rformat", "rformat"], "Output report file format"), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -class IepCommandline(_EmbossCommandLine): - """Commandline for EMBOSS iep: calculated isoelectric point and charge. - - Examples - -------- - >>> from Bio.Emboss.Applications import IepCommandline - >>> iep_cline = IepCommandline(sequence="proteins.faa", - ... outfile="proteins.txt") - >>> print(iep_cline) - iep -outfile=proteins.txt -sequence=proteins.faa - - You would typically run the command line with iep_cline() or via the - Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="iep", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "Protein sequence(s) filename", - filename=True, - is_required=True, - ), - _Option( - ["-amino", "amino"], - """Number of N-termini - - Integer 0 (default) or more. - """, - ), - _Option( - ["-carboxyl", "carboxyl"], - """Number of C-termini - - Integer 0 (default) or more. - """, - ), - _Option( - ["-lysinemodified", "lysinemodified"], - """Number of modified lysines - - Integer 0 (default) or more. - """, - ), - _Option( - ["-disulphides", "disulphides"], - """Number of disulphide bridges - - Integer 0 (default) or more. - """, - ), - # Should we implement the -termini switch as well? 
- _Option( - ["-notermini", "notermini"], - "Exclude (True) or include (False) charge at N and C terminus.", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -# seqret uses -outseq, not -outfile, so use the base class: -class SeqretCommandline(_EmbossMinimalCommandLine): - """Commandline object for the seqret program from EMBOSS. - - This tool allows you to interconvert between different sequence file - formats (e.g. GenBank to FASTA). Combining Biopython's Bio.SeqIO module - with seqret using a suitable intermediate file format can allow you to - read/write to an even wider range of file formats. - - This wrapper currently only supports the core functionality, things like - feature tables (in EMBOSS 6.1.0 onwards) are not yet included. - """ - - def __init__(self, cmd="seqret", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], "Input sequence(s) filename", filename=True - ), - _Option(["-outseq", "outseq"], "Output sequence file.", filename=True), - _Option( - ["-sformat", "sformat"], - "Input sequence(s) format (e.g. fasta, genbank)", - ), - _Option( - ["-osformat", "osformat"], - "Output sequence(s) format (e.g. fasta, genbank)", - ), - ] - _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs) - - def _validate(self): - # Check the outfile, filter, or stdout option has been set. - # We can't simply do this via the required flag for the outfile - # output - this seems the simplest solution. - if not (self.outseq or self.filter or self.stdout): - raise ValueError( - "You must either set outfile (output filename), " - "or enable filter or stdout (output to stdout)." - ) - if not (self.sequence or self.filter or self.stdint): - raise ValueError( - "You must either set sequence (input filename), " - "or enable filter or stdin (input from stdin)." 
- ) - return _EmbossMinimalCommandLine._validate(self) - - -class SeqmatchallCommandline(_EmbossCommandLine): - """Commandline object for the seqmatchall program from EMBOSS. - - e.g. - >>> cline = SeqmatchallCommandline(sequence="opuntia.fasta", outfile="opuntia.txt") - >>> cline.auto = True - >>> cline.wordsize = 18 - >>> cline.aformat = "pair" - >>> print(cline) - seqmatchall -auto -outfile=opuntia.txt -sequence=opuntia.fasta -wordsize=18 -aformat=pair - - """ - - def __init__(self, cmd="seqmatchall", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-sequence", "sequence"], - "Readable set of sequences", - filename=True, - is_required=True, - ), - _Option( - ["-wordsize", "wordsize"], "Word size (Integer 2 or more, default 4)" - ), - _Option( - ["-aformat", "aformat"], - "Display output in a different specified output format", - ), - ] - _EmbossCommandLine.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Phylo/Applications/_Fasttree.py b/Bio/Phylo/Applications/_Fasttree.py deleted file mode 100644 index d86fa20e2..000000000 --- a/Bio/Phylo/Applications/_Fasttree.py +++ /dev/null @@ -1,600 +0,0 @@ -# Copyright 2013 by Nate Sutton. -# Based on code in _Phyml.py by Eric Talevich. -# All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. 
-"""Command-line wrapper for tree inference program Fasttree.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -def _is_int(x): - """Test whether the argument can be serialized as an integer (PRIVATE).""" - return isinstance(x, int) or str(x).isdigit() - - -def _is_numeric(x): - """Test whether the argument can be serialized as a number (PRIVATE).""" - try: - float(str(x)) - return True - except ValueError: - return False - - -class FastTreeCommandline(AbstractCommandline): - r"""Command-line wrapper for FastTree. - - Only the ``input`` and ``out`` parameters are mandatory. - - From the terminal command line use ``fasttree.exe -help`` or ``fasttree.exe -expert`` - for more explanation of usage options. - - Homepage: http://www.microbesonline.org/fasttree/ - - References - ---------- - Price, M.N., Dehal, P.S., and Arkin, A.P. (2010) FastTree 2 -- Approximately - Maximum-Likelihood Trees for Large Alignments. PLoS ONE, 5(3):e9490. - https://doi.org/10.1371/journal.pone.0009490. - - Examples - -------- - This is an example on Windows:: - - import _Fasttree - fasttree_exe = r"C:\FasttreeWin32\fasttree.exe" - cmd = _Fasttree.FastTreeCommandline(fasttree_exe, - ... input=r'C:\Input\ExampleAlignment.fsa', - ... out=r'C:\Output\ExampleTree.tree') - print(cmd) - out, err = cmd() - print(out) - print(err) - - """ - - def __init__(self, cmd="fasttree", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Switch( - ["-nt", "nt"], - "By default FastTree expects protein alignments, use -nt for nucleotides", - ), - _Option( - ["-n", "n"], - """-n -- read N multiple alignments in. - - This only works with phylip interleaved format. For example, you can - use it with the output from phylip's seqboot. If you use -n, FastTree - will write 1 tree per line to standard output. 
- """, - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-quote", "quote"], - """-quote -- add quotes to sequence names in output. - - Quote sequence names in the output and allow spaces, commas, - parentheses, and colons in them but not ' characters (fasta files only). - """, - ), - _Option( - ["-pseudo", "pseudo"], - """-pseudo [weight] -- Pseudocounts are used with sequence distance estimation. - - Use pseudocounts to estimate distances between sequences with little or no - overlap. (Off by default.) Recommended if analyzing the alignment has - sequences with little or no overlap. - If the weight is not specified, it is 1.0 - """, - checker_function=_is_numeric, - equate=False, - ), - _Option( - ["-boot", "boot"], - """Specify the number of resamples for support values. - - Support value options: - By default, FastTree computes local support values by resampling the site - likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, - it will compute minimum-evolution bootstrap supports instead - In either case, the support values are proportions ranging from 0 to 1 - - Use -nosupport to turn off support values or -boot 100 to use just 100 resamples. - """, - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-nosupport", "nosupport"], - """Turn off support values. - - Support value options: - By default, FastTree computes local support values by resampling the site - likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, - it will compute minimum-evolution bootstrap supports instead - In either case, the support values are proportions ranging from 0 to 1 - - Use -nosupport to turn off support values or -boot 100 to use just 100 resamples. - """, - ), - _Option( - ["-intree", "intree"], - """-intree newickfile -- read the starting tree in from newickfile. - - Any branch lengths in the starting trees are ignored. - -intree with -n will read a separate starting tree for each alignment. 
- """, - filename=True, - equate=False, - ), - _Option( - ["-intree1", "intree1"], - "intree1 newickfile -- read the same starting tree for each alignment.", - filename=True, - equate=False, - ), - _Switch( - ["-quiet", "quiet"], - """-quiet -- do not write to standard error during normal operation - - (no progress indicator, no options summary, no likelihood values, etc.) - """, - ), - _Switch( - ["-nopr", "nopr"], - "-nopr -- do not write the progress indicator to stderr.", - ), - _Option( - ["-nni", "nni"], - """Set the rounds of minimum-evolution nearest-neighbor interchanges - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs. - """, - checker_function=_is_int, - equate=False, - ), - _Option( - ["-spr", "spr"], - """Set the rounds of subtree-prune-regraft moves - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - """, - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-noml", "noml"], - """Deactivate min-evo NNIs and SPRs. - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. 
- Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -noml to turn off both min-evo NNIs and SPRs (useful if refining - an approximately maximum-likelihood tree with further NNIs). - """, - ), - _Switch( - ["-mllen", "mllen"], - """Optimize branch lengths on a fixed topology. - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -mllen to optimize branch lengths without ML NNIs - Use -mllen -nome with -intree to optimize branch lengths on a fixed topology. - """, - ), - _Switch( - ["-nome", "nome"], - """Changes support values calculation to a minimum-evolution bootstrap method. - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -mllen to optimize branch lengths without ML NNIs - Use -mllen -nome with -intree to optimize branch lengths on a fixed topology - - Support value options: - By default, FastTree computes local support values by resampling the site - likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, - it will compute minimum-evolution bootstrap supports instead - In either case, the support values are proportions ranging from 0 to 1. - """, - ), - _Option( - ["-mlnni", "mlnni"], - """Set the number of rounds of maximum-likelihood NNIs. 
- - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -mlnni to set the number of rounds of maximum-likelihood NNIs. - """, - checker_function=_is_int, - equate=False, - ), - _Option( - ["-mlacc", "mlacc"], - """Option for optimization of branches at each NNI. - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI, - and to optimize all 5 branches in 2 or 3 rounds. - """, - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-slownni", "slownni"], - """Turn off heuristics to avoid constant subtrees with NNIs. - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - Use -slownni to turn off heuristics to avoid constant subtrees - (affects both ML and ME NNIs). - """, - ), - _Switch( - ["-wag", "wag"], - """Maximum likelihood model options. 
- - Whelan-And-Goldman 2001 model instead of (default) - Jones-Taylor-Thorton 1992 model (a.a. only) - """, - ), - _Switch( - ["-gtr", "gtr"], - """Maximum likelihood model options. - - Use generalized time-reversible instead of (default) - Jukes-Cantor (nt only) - """, - ), - _Option( - ["-cat", "cat"], - """Maximum likelihood model options. - - Specify the number of rate categories of sites (default 20).""", - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-nocat", "nocat"], - "Maximum likelihood model options: No CAT model (just 1 category)", - ), - _Switch( - ["-gamma", "gamma"], - """Report the likelihood under the discrete gamma model. - - Maximum likelihood model options: - -gamma -- after the final round of optimizing branch lengths with the CAT model, - report the likelihood under the discrete gamma model with the same - number of categories. FastTree uses the same branch lengths but - optimizes the gamma shape parameter and the scale of the lengths. - The final tree will have rescaled lengths. Used with -log, this - also generates per-site likelihoods for use with CONSEL, see - GammaLogToPaup.pl and documentation on the FastTree web site. - """, - ), - _Switch( - ["-slow", "slow"], - """Use an exhaustive search. - - Searching for the best join: - By default, FastTree combines the 'visible set' of fast neighbor-joining with - local hill-climbing as in relaxed neighbor-joining - -slow -- exhaustive search (like NJ or BIONJ, but different gap handling) - -slow takes half an hour instead of 8 seconds for 1,250 proteins - """, - ), - _Switch( - ["-fastest", "fastest"], - """Search the visible set (the top hit for each node) only. 
- - Searching for the best join: - By default, FastTree combines the 'visible set' of fast neighbor-joining with - local hill-climbing as in relaxed neighbor-joining - -fastest -- search the visible set (the top hit for each node) only - Unlike the original fast neighbor-joining, -fastest updates visible(C) - after joining A and B if join(AB,C) is better than join(C,visible(C)) - -fastest also updates out-distances in a very lazy way, - -fastest sets -2nd on as well, use -fastest -no2nd to avoid this - """, - ), - _Switch( - ["-2nd", "second"], - """Turn 2nd-level top hits heuristic on. - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - Use -notop (or -slow) to turn this feature off - and compare all leaves to each other, - and all new joined nodes to each other - - -2nd or -no2nd to turn 2nd-level top hits heuristic on or off - This reduces memory usage and running time but may lead to - marginal reductions in tree quality. - (By default, -fastest turns on -2nd.) - """, - ), - _Switch( - ["-no2nd", "no2nd"], - """Turn 2nd-level top hits heuristic off. - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - Use -notop (or -slow) to turn this feature off - and compare all leaves to each other, - and all new joined nodes to each other - - -2nd or -no2nd to turn 2nd-level top hits heuristic on or off - This reduces memory usage and running time but may lead to - marginal reductions in tree quality. - (By default, -fastest turns on -2nd.) - """, - ), - _Option( - ["-seed", "seed"], - """Use -seed to initialize the random number generator. - - Support value options: - By default, FastTree computes local support values by resampling the site - likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, - it will compute minimum-evolution bootstrap supports instead - In either case, the support values are proportions ranging from 0 to 1. 
- """, - checker_function=_is_int, - equate=False, - ), - _Switch( - ["-top", "top"], - """Top-hit list to speed up search - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - Use -notop (or -slow) to turn this feature off - and compare all leaves to each other, - and all new joined nodes to each other. - """, - ), - _Switch( - ["-notop", "notop"], - """Turn off top-hit list to speed up search - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - Use -notop (or -slow) to turn this feature off - and compare all leaves to each other, - and all new joined nodes to each other. - """, - ), - _Option( - ["-topm", "topm"], - """Change the top hits calculation method - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - -topm 1.0 -- set the top-hit list size to parameter*sqrt(N) - FastTree estimates the top m hits of a leaf from the - top 2*m hits of a 'close' neighbor, where close is - defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m), - and updates the top-hits as joins proceed. - """, - checker_function=_is_numeric, - equate=False, - ), - _Option( - ["-close", "close"], - """Modify the close heuristic for the top-hit list - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - -close 0.75 -- modify the close heuristic, lower is more conservative. - """, - checker_function=_is_numeric, - equate=False, - ), - _Option( - ["-refresh", "refresh"], - """Parameter for conditions that joined nodes are compared to other nodes - - Top-hit heuristics: - By default, FastTree uses a top-hit list to speed up search - -refresh 0.8 -- compare a joined node to all other nodes if its - top-hit list is less than 80% of the desired length, - or if the age of the top-hit list is log2(m) or greater. 
- """, - checker_function=_is_numeric, - equate=False, - ), - _Option( - ["-matrix", "matrix"], - """Specify a matrix for nucleotide or amino acid distances - - Distances: - Default: For protein sequences, log-corrected distances and an - amino acid dissimilarity matrix derived from BLOSUM45 - or for nucleotide sequences, Jukes-Cantor distances - To specify a different matrix, use -matrix FilePrefix or -nomatrix - """, - filename=True, - equate=False, - ), - _Switch( - ["-nomatrix", "nomatrix"], - """Specify that no matrix should be used for nucleotide or amino acid distances - - Distances: - Default: For protein sequences, log-corrected distances and an - amino acid dissimilarity matrix derived from BLOSUM45 - or for nucleotide sequences, Jukes-Cantor distances - To specify a different matrix, use -matrix FilePrefix or -nomatrix - """, - ), - _Switch( - ["-nj", "nj"], - "Join options: regular (unweighted) neighbor-joining (default)", - ), - _Switch( - ["-bionj", "bionj"], - """Join options: weighted joins as in BIONJ. - - FastTree will also weight joins during NNIs. - """, - ), - _Option( - ["-gtrrates", "gtrrates"], "-gtrrates ac ag at cg ct gt", equate=False - ), - _Option(["-gtrfreq", "gtrfreq"], "-gtrfreq A C G T", equate=False), - _Option( - ["-constraints", "constraints"], - """Specifies an alignment file for use with constrained topology searching - - Constrained topology search options: - -constraints alignmentfile -- an alignment with values of 0, 1, and - - Not all sequences need be present. A column of 0s and 1s defines a - constrained split. Some constraints may be violated - (see 'violating constraints:' in standard error). - """, - filename=True, - equate=False, - ), - _Option( - ["-constraintWeight", "constraintWeight"], - """Weight strength of constraints in topology searching. - - Constrained topology search options: - -constraintWeight -- how strongly to weight the constraints. 
A value of 1 - means a penalty of 1 in tree length for violating a constraint - Default: 100.0 - """, - checker_function=_is_numeric, - equate=False, - ), - _Option( - ["-log", "log"], - """Create log files of data such as intermediate trees and per-site rates - - -log logfile -- save intermediate trees so you can extract - the trees and restart long-running jobs if they crash - -log also reports the per-site rates (1 means slowest category). - """, - filename=True, - equate=False, - ), - _Option( - ["-makematrix", "makematrix"], - "-makematrix [alignment]", - filename=True, - equate=False, - ), - _Switch( - ["-rawdist", "rawdist"], - """Turn off or adjust log-correction in AA or NT distances. - - Use -rawdist to turn the log-correction off or to use - %different instead of Jukes-Cantor in AA or NT distances - - Distances: - Default: For protein sequences, log-corrected distances and an - amino acid dissimilarity matrix derived from BLOSUM45 - or for nucleotide sequences, Jukes-Cantor distances - To specify a different matrix, use -matrix FilePrefix or -nomatrix - """, - ), - _Option( - ["-sprlength", "sprlength"], - """Set maximum SPR move length in topology refinement (default 10). - - Topology refinement: - By default, FastTree tries to improve the tree with up to 4*log2(N) - rounds of minimum-evolution nearest-neighbor interchanges (NNI), - where N is the number of unique sequences, 2 rounds of - subtree-prune-regraft (SPR) moves (also min. evo.), and - up to 2*log(N) rounds of maximum-likelihood NNIs. - Use -nni to set the number of rounds of min. evo. NNIs, - and -spr to set the rounds of SPRs. - """, - checker_function=_is_int, - equate=False, - ), - _Switch(["-help", "help"], "Show the help."), - _Switch(["-expert", "expert"], "Show the expert level help."), - _Option( - ["-out", "out"], - """Enter - - The path to a Newick Tree output file needs to be specified. 
- """, - filename=True, - equate=False, - ), - _Argument( - ["input"], - """Enter - - An input file of sequence alignments in fasta or phylip format - is needed. By default FastTree expects protein - alignments, use -nt for nucleotides. - """, - filename=True, - is_required=True, - ), - ] - - AbstractCommandline.__init__(self, cmd, **kwargs) diff --git a/Bio/Phylo/Applications/_Phyml.py b/Bio/Phylo/Applications/_Phyml.py deleted file mode 100644 index 6430d495c..000000000 --- a/Bio/Phylo/Applications/_Phyml.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright 2011 by Eric Talevich. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command-line wrapper for the tree inference program PhyML.""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class PhymlCommandline(AbstractCommandline): - """Command-line wrapper for the tree inference program PhyML. - - Homepage: http://www.atgc-montpellier.fr/phyml - - References - ---------- - Guindon S, Gascuel O. - A simple, fast, and accurate algorithm to estimate large phylogenies by maximum - likelihood. - Systematic Biology, 2003 Oct;52(5):696-704. - PubMed PMID: 14530136. - - Guindon S, Dufayard JF, Lefort V, Anisimova M, Hordijk W, Gascuel O. - New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies: Assessing - the Performance of PhyML 3.0. - Systematic Biology, 2010 59(3):307-21. 
- - """ - - def __init__(self, cmd="phyml", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-i", "--input", "input"], - "PHYLIP format input nucleotide or amino-acid sequence filenam.", - filename=True, - is_required=True, - equate=False, - ), - _Option( - ["-d", "--datatype", "datatype"], - "Datatype 'nt' for nucleotide (default) or 'aa' for amino-acids.", - checker_function=lambda x: x in ("nt", "aa"), - equate=False, - ), - _Switch( - ["-q", "--sequential", "sequential"], - "Changes interleaved format (default) to sequential format.", - ), - _Option( - ["-n", "--multiple", "multiple"], - "Number of data sets to analyse (integer).", - checker_function=(lambda x: isinstance(x, int) or x.isdigit()), - equate=False, - ), - _Switch( - ["-p", "--pars", "pars"], - """Use a minimum parsimony starting tree. - - This option is taken into account when the '-u' option is absent - and when tree topology modifications are to be done. - """, - ), - _Option( - ["-b", "--bootstrap", "bootstrap"], - r"""Number of bootstrap replicates, if value is > 0. - - Otherwise: - - 0: neither approximate likelihood ratio test nor bootstrap - values are computed. - - -1: approximate likelihood ratio test returning aLRT statistics. - - -2: approximate likelihood ratio test returning Chi2-based - parametric branch supports. - - -4: SH-like branch supports alone. - """, - equate=False, - ), - _Option( - ["-m", "--model", "model"], - """Substitution model name. - - Nucleotide-based models: - - HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR | custom - - For the custom option, a string of six digits identifies the - model. For instance, 000000 corresponds to F81 (or JC69, - provided the distribution of nucleotide frequencies is uniform). - 012345 corresponds to GTR. This option can be used for encoding - any model that is a nested within GTR. 
- - Amino-acid based models: - - LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV | - CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb | custom - """, - checker_function=( - lambda x: x - in ( - # Nucleotide models: - "HKY85", - "JC69", - "K80", - "F81", - "F84", - "TN93", - "GTR", - # Amino acid models: - "LG", - "WAG", - "JTT", - "MtREV", - "Dayhoff", - "DCMut", - "RtREV", - "CpREV", - "VT", - "Blosum62", - "MtMam", - "MtArt", - "HIVw", - "HIVb", - ) - or isinstance(x, int) - ), - equate=False, - ), - _Option( - ["-f", "frequencies"], - """Character frequencies. - - -f e, m, or "fA fC fG fT" - - e : Empirical frequencies, determined as follows : - - - Nucleotide sequences: (Empirical) the equilibrium base - frequencies are estimated by counting the occurrence - of the different bases in the alignment. - - Amino-acid sequences: (Empirical) the equilibrium - amino-acid frequencies are estimated by counting the - occurrence of the different amino-acids in the alignment. - - m : ML/model-based frequencies, determined as follows : - - - Nucleotide sequences: (ML) the equilibrium base - frequencies are estimated using maximum likelihood - - Amino-acid sequences: (Model) the equilibrium amino-acid - frequencies are estimated using the frequencies defined by - the substitution model. - - "fA fC fG fT" : only valid for nucleotide-based models. - fA, fC, fG and fT are floating-point numbers that correspond - to the frequencies of A, C, G and T, respectively. - """, - filename=True, # ensure ".25 .25 .25 .25" stays quoted - equate=False, - ), - _Option( - ["-t", "--ts/tv", "ts_tv_ratio"], - """Transition/transversion ratio. (DNA sequences only.) - - Can be a fixed positive value (ex:4.0) or e to get the - maximum-likelihood estimate. - """, - equate=False, - ), - _Option( - ["-v", "--pinv", "prop_invar"], - """Proportion of invariable sites. - - Can be a fixed value in the range [0,1], or 'e' to get the - maximum-likelihood estimate. 
- """, - equate=False, - ), - _Option( - ["-c", "--nclasses", "nclasses"], - """Number of relative substitution rate categories. - - Default 1. Must be a positive integer. - """, - equate=False, - ), - _Option( - ["-a", "--alpha", "alpha"], - """Distribution of the gamma distribution shape parameter. - - Can be a fixed positive value, or 'e' to get the - maximum-likelihood estimate. - """, - equate=False, - ), - _Option( - ["-s", "--search", "search"], - """Tree topology search operation option. - - Can be one of: - - NNI : default, fast - - SPR : a bit slower than NNI - - BEST : best of NNI and SPR search - """, - checker_function=lambda x: x in ("NNI", "SPR", "BEST"), - equate=False, - ), - # alt name: user_tree_file - _Option( - ["-u", "--inputtree", "input_tree"], - "Starting tree filename. The tree must be in Newick format.", - filename=True, - equate=False, - ), - _Option( - ["-o", "optimize"], - r"""Specific parameter optimisation. - - tlr : tree topology (t), branch length (l) and - rate parameters (r) are optimised. - - tl : tree topology and branch length are optimised. - - lr : branch length and rate parameters are optimised. - - l : branch length are optimised. - - r : rate parameters are optimised. - - n : no parameter is optimised. - """, - equate=False, - ), - _Switch( - ["--rand_start", "rand_start"], - """Sets the initial tree to random. - - Only valid if SPR searches are to be performed. - """, - ), - _Option( - ["--n_rand_starts", "n_rand_starts"], - """Number of initial random trees to be used. - - Only valid if SPR searches are to be performed. - """, - equate=False, - ), - _Option( - ["--r_seed", "r_seed"], - """Seed used to initiate the random number generator. - - Must be an integer. 
- """, - equate=False, - ), - _Switch( - ["--print_site_lnl", "print_site_lnl"], - r"Print the likelihood for each site in file \*_phyml_lk.txt.", - ), - _Switch( - ["--print_trace", "print_trace"], - r""" - Print each phylogeny explored during the tree search process - in file \*_phyml_trace.txt.""", - ), - _Option( - ["--run_id", "run_id"], - """Append the given string at the end of each PhyML output file. - - This option may be useful when running simulations involving - PhyML. - """, - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - # XXX should this always be set to True? - _Switch( - ["--quiet", "quiet"], - "No interactive questions (for running in batch mode).", - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) diff --git a/Bio/Phylo/Applications/_Raxml.py b/Bio/Phylo/Applications/_Raxml.py deleted file mode 100644 index a43a93e4d..000000000 --- a/Bio/Phylo/Applications/_Raxml.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2012 by Eric Talevich. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command-line wrapper for the tree inference program RAxML. - -Derived from the help page for RAxML version 7.3 by Alexandros Stamatakis, but -should work for any version 7.X (and probably earlier for most options). -""" - -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class RaxmlCommandline(AbstractCommandline): - """Command-line wrapper for the tree inference program RAxML. - - The required parameters are 'sequences' (-s), 'model' (-m) and 'name' (-n). - The parameter 'parsimony_seed' (-p) must also be set for RAxML, but if you - do not specify it, this wrapper will set the seed to 10000 for you. - - References - ---------- - Stamatakis A. 
- RAxML-VI-HPC: Maximum Likelihood-based Phylogenetic Analyses with - Thousands of Taxa and Mixed Models. - Bioinformatics 2006, 22(21):2688-2690. - - Homepage: http://sco.h-its.org/exelixis/software.html - - Examples - -------- - >>> from Bio.Phylo.Applications import RaxmlCommandline - >>> raxml_cline = RaxmlCommandline(sequences="Tests/Phylip/interlaced2.phy", - ... model="PROTCATWAG", name="interlaced2") - >>> print(raxml_cline) - raxmlHPC -m PROTCATWAG -n interlaced2 -p 10000 -s Tests/Phylip/interlaced2.phy - - You would typically run the command line with raxml_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="raxmlHPC", **kwargs): - """Initialize the class.""" - self.parameters = [ - _Option( - ["-a", "weight_filename"], - "Name of a column weight file to assign individual weights " - "to each column of the alignment. Those weights must be " - "integers separated by any type and number of whitespaces " - "within a separate file.", - filename=True, - equate=False, - ), - _Option( - ["-b", "bootstrap_seed"], "Random seed for bootstrapping.", equate=False - ), - _Option( - ["-c", "num_categories"], - "Number of distinct rate categories for RAxML when " - "evolution model is set to GTRCAT or GTRMIX." - "Individual per-site rates are categorized into this " - "many rate categories to accelerate computations. " - "Default: 25.", - equate=False, - ), - _Switch( - ["-d", "random_starting_tree"], - "Start ML optimization from random starting tree.", - ), - _Option( - ["-e", "epsilon"], - "Set model optimization precision in log likelihood units " - "for final optimization of tree topology under MIX/MIXI " - "or GAMMA/GAMMAI." 
- "Default: 0.1 for models not using proportion of " - "invariant sites estimate; 0.001 for models using " - "proportion of invariant sites estimate.", - equate=False, - ), - _Option( - ["-E", "exclude_filename"], - "An exclude file name, containing a specification of " - "alignment positions you wish to exclude. Format is " - "similar to Nexus, the file shall contain entries like " - "'100-200 300-400'; to exclude a single column write, " - "e.g., '100-100'. If you use a mixed model, an " - "appropriately adapted model file will be written.", - filename=True, - equate=False, - ), - _Option( - ["-f", "algorithm"], - r""" - Select algorithm: - - a: Rapid Bootstrap analysis and search for best-scoring ML - tree in one program run. - - b: Draw bipartition information on a tree provided with '-t' - based on multiple trees (e.g. form a bootstrap) in a file - specified by '-z'. - - c: Check if the alignment can be properly read by RAxML. - - d: New rapid hill-climbing (DEFAULT). - - e: Optimize model+branch lengths for given input tree under - GAMMA/GAMMAI only. - - g: Compute per site log Likelihoods for one or more trees - passed via '-z' and write them to a file that can be read - by CONSEL. - - h: Compute log likelihood test (SH-test) between best tree - passed via '-t' and a bunch of other trees passed via '-z'. - - i: Perform a really thorough bootstrap, refinement of final - bootstrap tree under GAMMA and a more exhaustive algorithm. - - j: Generate a bunch of bootstrapped alignment files from an - original alignment file. - - m: Compare bipartitions between two bunches of trees passed - via '-t' and '-z' respectively. This will return the - Pearson correlation between all bipartitions found in the - two tree files. A file called - RAxML_bipartitionFrequencies.outputFileName will be - printed that contains the pair-wise bipartition - frequencies of the two sets. 
- - n: Compute the log likelihood score of all trees contained - in a tree file provided by '-z' under GAMMA or - GAMMA+P-Invar. - - o: Old and slower rapid hill-climbing. - - p: Perform pure stepwise MP addition of new sequences to an - incomplete starting tree. - - s: Split up a multi-gene partitioned alignment into the - respective subalignments. - - t: Do randomized tree searches on one fixed starting tree. - - w: Compute ELW test on a bunch of trees passed via '-z'. - - x: Compute pair-wise ML distances, ML model parameters will - be estimated on an MP starting tree or a user-defined - tree passed via '-t', only allowed for GAMMA-based models - of rate heterogeneity. - """, - checker_function=(lambda x: isinstance(x, str) and len(x) == 1), - equate=False, - ), - _Option( - ["-g", "grouping_constraint"], - "File name of a multifurcating constraint tree. " - "this tree does not need to be comprehensive, i.e. " - "contain all taxa.", - filename=True, - equate=False, - ), - _Option( - ["-i", "rearrangements"], - "Initial rearrangement setting for the subsequent " - "application of topological changes phase.", - equate=False, - ), - _Switch( - ["-j", "checkpoints"], - "Write checkpoints (intermediate tree topologies).", - ), - _Switch( - ["-k", "bootstrap_branch_lengths"], - "Print bootstrapped trees with branch lengths. " - "The bootstraps will run a bit longer, because model " - "parameters will be optimized at the end of each run. " - "Use with CATMIX/PROTMIX or GAMMA/GAMMAI.", - ), - _Option( - ["-l", "cluster_threshold"], - "Threshold for sequence similarity clustering. " - "RAxML will then print out an alignment to a file " - "called sequenceFileName.reducedBy.threshold that " - "only contains sequences <= the specified threshold " - "that must be between 0.0 and 1.0. RAxML uses the " - "QT-clustering algorithm to perform this task. 
" - "In addition, a file called " - "RAxML_reducedList.outputFileName will be written " - "that contains clustering information.", - equate=False, - ), - _Option( - ["-L", "cluster_threshold_fast"], - "Same functionality as '-l', but uses a less " - "exhaustive and thus faster clustering algorithm. " - "This is intended for very large datasets with more " - "than 20,000-30,000 sequences.", - equate=False, - ), - _Option( - ["-m", "model"], - r"""Model of Nucleotide or Amino Acid Substitution: - - NUCLEOTIDES: - - GTRCAT : GTR + Optimization of substitution rates + Optimization of site-specific - evolutionary rates which are categorized into numberOfCategories distinct - rate categories for greater computational efficiency - if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program - will use GTRMIX instead - - GTRGAMMA : GTR + Optimization of substitution rates + GAMMA model of rate - heterogeneity (alpha parameter will be estimated) - - GTRMIX : Inference of the tree under GTRCAT - and thereafter evaluation of the final tree topology under GTRGAMMA - - GTRCAT_GAMMA : Inference of the tree with site-specific evolutionary rates. - However, here rates are categorized using the 4 discrete GAMMA rates. - Evaluation of the final tree topology under GTRGAMMA - - GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of invariable sites - - GTRMIXI : Same as GTRMIX, but with estimate of proportion of invariable sites - - GTRCAT_GAMMAI : Same as GTRCAT_GAMMA, but with estimate of proportion of invariable sites - - AMINO ACIDS: - - PROTCATmatrixName[F] : specified AA matrix + Optimization of substitution rates + Optimization of site-specific - evolutionary rates which are categorized into numberOfCategories distinct - rate categories for greater computational efficiency - if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program - will use PROTMIX... 
instead - - PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of substitution rates + GAMMA model of rate - heterogeneity (alpha parameter will be estimated) - - PROTMIXmatrixName[F] : Inference of the tree under specified AA matrix + CAT - and thereafter evaluation of the final tree topology under specified AA matrix + GAMMA - - PROTCAT_GAMMAmatrixName[F] : Inference of the tree under specified AA matrix and site-specific evolutionary rates. - However, here rates are categorized using the 4 discrete GAMMA rates. - Evaluation of the final tree topology under specified AA matrix + GAMMA - - PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but with estimate of proportion of invariable sites - - PROTMIXImatrixName[F] : Same as PROTMIXmatrixName[F], but with estimate of proportion of invariable sites - - PROTCAT_GAMMAImatrixName[F] : Same as PROTCAT_GAMMAmatrixName[F], but with estimate of proportion of invariable sites - - Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV, CPREV, VT, BLOSUM62, MTMAM, GTR - With the optional 'F' appendix you can specify if you want to use empirical base frequencies - Please not that for mixed models you can in addition specify the per-gene AA model in - the mixed model file (see manual for details) - """, - equate=False, - ), - _Switch( - ["-M", "partition_branch_lengths"], - "Switch on estimation of individual per-partition " - "branch lengths. Only has effect when used in " - "combination with 'partition_filename' ('-q'). " - "Branch lengths for individual partitions will be " - "printed to separate files. A weighted average of the " - "branch lengths is computed by using the respective " - "partition lengths. ", - ), - _Option( - ["-n", "name"], - "Name used in the output files.", - filename=True, - equate=False, - ), - _Option( - ["-o", "outgroup"], - "Name of a single outgroup or a comma-separated list " - "of outgroups, eg '-o Rat' or '-o Rat,Mouse'. 
In case " - "that multiple outgroups are not monophyletic the " - "first name in the list will be selected as outgroup. " - "Don't leave spaces between taxon names!", - checker_function=lambda x: len(x.split()) == 1, - equate=False, - ), - _Option( - ["-q", "partition_filename"], - "File name containing the assignment of models to " - "alignment partitions for multiple models of " - "substitution. For the syntax of this file please " - "consult the RAxML manual.", - filename=True, - equate=False, - ), - _Option( - ["-p", "parsimony_seed"], - "Random number seed for the parsimony inferences. " - "This allows you to reproduce your results and will " - "help developers debug the program. This option HAS " - "NO EFFECT in the parallel MPI version.", - equate=False, - ), - _Option( - ["-P", "protein_model"], - "File name of a user-defined AA (Protein) substitution " - "model. This file must contain 420 entries, the first " - "400 being the AA substitution rates (this must be a " - "symmetric matrix) and the last 20 are the empirical " - "base frequencies.", - filename=True, - equate=False, - ), - _Option( - ["-r", "binary_constraint"], - "File name of a binary constraint tree. " - "This tree does not need to be comprehensive, i.e. " - "contain all taxa.", - filename=True, - equate=False, - ), - _Option( - ["-s", "sequences"], - "Name of the alignment data file, in PHYLIP format.", - filename=True, - equate=False, - ), - _Option( - ["-t", "starting_tree"], - "File name of a user starting tree, in Newick format.", - filename=True, - equate=False, - ), - _Option( - ["-T", "threads"], - "Number of threads to run. " - "PTHREADS VERSION ONLY! " - "Make sure to set this at most the number of CPUs " - "you have on your machine, otherwise, there will be " - "a huge performance decrease!", - equate=False, - ), - _Option( - ["-u", "num_bootstrap_searches"], - "Number of multiple bootstrap searches per replicate. " - "Use this to obtain better ML trees for each " - "replicate. 
Default: 1 ML search per bootstrap " - "replicate.", - equate=False, - ), - _Switch(["-v", "version"], "Display version information."), - _Option( - ["-w", "working_dir"], - "Name of the working directory where RAxML will " - "write its output files. Default: current directory.", - filename=True, - equate=False, - ), - _Option( - ["-x", "rapid_bootstrap_seed"], - "Random seed for rapid bootstrapping.", - equate=False, - ), - _Switch( - ["-y", "parsimony"], - "Only compute a parsimony starting tree, then exit.", - ), - _Option( - ["-z", "bipartition_filename"], - "Name of a file containing multiple trees, e.g. from " - "a bootstrap run, that shall be used to draw " - "bipartition values onto a tree provided with '-t'. " - "It can also be used to compute per-site log " - "likelihoods in combination with '-f g', and to read " - "a bunch of trees for a couple of other options " - "('-f h', '-f m', '-f n').", - filename=True, - equate=False, - ), - _Option( - ["-N", "-#", "num_replicates"], - "Number of alternative runs on distinct starting trees. " - "In combination with the '-b' option, this will invoke a " - "multiple bootstrap analysis. " - "DEFAULT: 1 single analysis." - "Note that '-N' has been added as an alternative since " - "'-#' sometimes caused problems with certain MPI job " - "submission systems, since '-#' is often used to start " - "comments. ", - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - # ENH: enforce -s, -n and -m - if not self.parsimony_seed: - self.parsimony_seed = 10000 - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Phylo/Applications/__init__.py b/Bio/Phylo/Applications/__init__.py deleted file mode 100644 index b99fde3ba..000000000 --- a/Bio/Phylo/Applications/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2011 by Eric Talevich. All rights reserved. 
-# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. - -"""Phylogenetics command line tool wrappers (OBSOLETE). - -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from ._Fasttree import FastTreeCommandline -from ._Phyml import PhymlCommandline -from ._Raxml import RaxmlCommandline - -# Make this explicit, then they show up in the API docs -__all__ = ("PhymlCommandline", "RaxmlCommandline", "FastTreeCommandline") diff --git a/Bio/PopGen/GenePop/Controller.py b/Bio/PopGen/GenePop/Controller.py deleted file mode 100644 index 0739f9181..000000000 --- a/Bio/PopGen/GenePop/Controller.py +++ /dev/null @@ -1,948 +0,0 @@ -# Copyright 2009 by Tiago Antao . All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. 
- -"""Module to control GenePop.""" - -import os -import re -import shutil -import tempfile - -from Bio.Application import _Argument -from Bio.Application import AbstractCommandline - - -def _gp_float(tok): - """Get a float from a token, if it fails, returns the string (PRIVATE).""" - try: - return float(tok) - except ValueError: - return str(tok) - - -def _gp_int(tok): - """Get a int from a token, if it fails, returns the string (PRIVATE).""" - try: - return int(tok) - except ValueError: - return str(tok) - - -def _read_allele_freq_table(f): - line = f.readline() - while " --" not in line: - if line == "": - raise StopIteration - if "No data" in line: - return None, None - line = f.readline() - alleles = [x for x in f.readline().rstrip().split(" ") if x != ""] - alleles = [_gp_int(x) for x in alleles] - line = f.readline().rstrip() - table = [] - while line != "": - parts = [x for x in line.split(" ") if x != ""] - try: - table.append( - (parts[0], [_gp_float(x) for x in parts[1:-1]], _gp_int(parts[-1])) - ) - except ValueError: - table.append((parts[0], [None] * len(alleles), 0)) - line = f.readline().rstrip() - return alleles, table - - -def _read_table(f, funs): - table = [] - line = f.readline().rstrip() - while "---" not in line: - line = f.readline().rstrip() - line = f.readline().rstrip() - while "===" not in line and "---" not in line and line != "": - toks = [x for x in line.split(" ") if x != ""] - parts = [] - for i, tok in enumerate(toks): - try: - parts.append(funs[i](tok)) - except ValueError: - parts.append(tok) # Could not cast - table.append(tuple(parts)) - line = f.readline().rstrip() - return table - - -def _read_triangle_matrix(f): - matrix = [] - line = f.readline().rstrip() - while line != "": - matrix.append([_gp_float(x) for x in [y for y in line.split(" ") if y != ""]]) - line = f.readline().rstrip() - return matrix - - -def _read_headed_triangle_matrix(f): - matrix = {} - header = f.readline().rstrip() - if "---" in header or "===" in 
header: - header = f.readline().rstrip() - nlines = len([x for x in header.split(" ") if x != ""]) - 1 - for line_pop in range(nlines): - line = f.readline().rstrip() - vals = [x for x in line.split(" ")[1:] if x != ""] - clean_vals = [] - for val in vals: - try: - clean_vals.append(_gp_float(val)) - except ValueError: - clean_vals.append(None) - for col_pop, clean_val in enumerate(clean_vals): - matrix[(line_pop + 1, col_pop)] = clean_val - return matrix - - -def _hw_func(stream, is_locus, has_fisher=False): - line = stream.readline() - if is_locus: - hook = "Locus " - else: - hook = "Pop : " - while line != "": - if line.lstrip().startswith(hook): - stream.readline() - stream.readline() - stream.readline() - table = _read_table( - stream, [str, _gp_float, _gp_float, _gp_float, _gp_float, _gp_int, str] - ) - # loci might mean pop if hook="Locus " - loci = {} - for entry in table: - if len(entry) < 4: - loci[entry[0]] = None - else: - locus, p, se, fis_wc, fis_rh, steps = entry[:-1] - if se == "-": - se = None - loci[locus] = p, se, fis_wc, fis_rh, steps - return loci - line = stream.readline() - # self.done = True - raise StopIteration - - -class _FileIterator: - """Return an iterator which crawls over a stream of lines with a function (PRIVATE). 
- - The generator function is expected to yield a tuple, while - consuming input - """ - - def __init__(self, func, fname, handle=None): - self.func = func - if handle is None: - self.stream = open(fname) - else: - # For special cases where calling code wants to - # seek into the file before starting: - self.stream = handle - self.fname = fname - self.done = False - - def __iter__(self): - if self.done: - self.done = True - raise StopIteration - return self - - def __next__(self): - return self.func(self) - - def __del__(self): - self.stream.close() - os.remove(self.fname) - - -class _GenePopCommandline(AbstractCommandline): - """Return a Command Line Wrapper for GenePop (PRIVATE).""" - - def __init__(self, genepop_dir=None, cmd="Genepop", **kwargs): - self.parameters = [ - _Argument(["command"], "GenePop option to be called", is_required=True), - _Argument(["mode"], "Should always be batch", is_required=True), - _Argument(["input"], "Input file", is_required=True), - _Argument(["Dememorization"], "Dememorization step"), - _Argument(["BatchNumber"], "Number of MCMC batches"), - _Argument(["BatchLength"], "Length of MCMC chains"), - _Argument(["HWtests"], "Enumeration or MCMC"), - _Argument(["IsolBDstatistic"], "IBD statistic (a or e)"), - _Argument(["MinimalDistance"], "Minimal IBD distance"), - _Argument(["GeographicScale"], "Log or Linear"), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - self.set_parameter("mode", "Mode=Batch") - - def set_menu(self, option_list): - """Set the menu option. - - Example set_menu([6,1]) = get all F statistics (menu 6.1) - """ - self.set_parameter( - "command", "MenuOptions=" + ".".join(str(x) for x in option_list) - ) - - def set_input(self, fname): - """Set the input file name.""" - self.set_parameter("input", "InputFile=" + fname) - - -class GenePopController: - """Define a class to interface with the GenePop program.""" - - def __init__(self, genepop_dir=None): - """Initialize the controller. 
- - genepop_dir is the directory where GenePop is. - - The binary should be called Genepop (capital G) - """ - self.controller = _GenePopCommandline(genepop_dir) - - def _get_opts(self, dememorization, batches, iterations, enum_test=None): - opts = {} - opts["Dememorization"] = dememorization - opts["BatchNumber"] = batches - opts["BatchLength"] = iterations - if enum_test is not None: - if enum_test is True: - opts["HWtests"] = "Enumeration" - else: - opts["HWtests"] = "MCMC" - return opts - - def _run_genepop(self, extensions, option, fname, opts=None): - if opts is None: - opts = {} - cwd = os.getcwd() - temp_dir = tempfile.mkdtemp() - os.chdir(temp_dir) - self.controller.set_menu(option) - if os.path.isabs(fname): - self.controller.set_input(fname) - else: - self.controller.set_input(cwd + os.sep + fname) - for opt in opts: - self.controller.set_parameter(opt, opt + "=" + str(opts[opt])) - self.controller() # checks error level is zero - os.chdir(cwd) - shutil.rmtree(temp_dir) - - def _test_pop_hz_both( - self, - fname, - type, - ext, - enum_test=True, - dememorization=10000, - batches=20, - iterations=5000, - ): - """Use Hardy-Weinberg test for heterozygote deficiency/excess (PRIVATE). - - Returns a population iterator containing a dictionary where - dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps). - - Some loci have a None if the info is not available. - SE might be none (for enumerations). - """ - opts = self._get_opts(dememorization, batches, iterations, enum_test) - self._run_genepop([ext], [1, type], fname, opts) - - def hw_func(self): - return _hw_func(self.stream, False) - - return _FileIterator(hw_func, fname + ext) - - def _test_global_hz_both( - self, - fname, - type, - ext, - enum_test=True, - dememorization=10000, - batches=20, - iterations=5000, - ): - """Use Global Hardy-Weinberg test for heterozygote deficiency/excess (PRIVATE). - - Returns a triple with: - - A list per population containing (pop_name, P-val, SE, switches). 
- Some pops have a None if the info is not available. - SE might be none (for enumerations). - - A list per loci containing (locus_name, P-val, SE, switches). - Some loci have a None if the info is not available. - SE might be none (for enumerations). - - Overall results (P-val, SE, switches). - - """ - opts = self._get_opts(dememorization, batches, iterations, enum_test) - self._run_genepop([ext], [1, type], fname, opts) - - def hw_pop_func(self): - return _read_table(self.stream, [str, _gp_float, _gp_float, _gp_float]) - - with open(fname + ext) as f1: - line = f1.readline() - while "by population" not in line: - line = f1.readline() - pop_p = _read_table(f1, [str, _gp_float, _gp_float, _gp_float]) - with open(fname + ext) as f2: - line = f2.readline() - while "by locus" not in line: - line = f2.readline() - loc_p = _read_table(f2, [str, _gp_float, _gp_float, _gp_float]) - with open(fname + ext) as f: - line = f.readline() - while "all locus" not in line: - line = f.readline() - f.readline() - f.readline() - f.readline() - f.readline() - line = f.readline().rstrip() - p, se, switches = tuple( - _gp_float(x) for x in [y for y in line.split(" ") if y != ""] - ) - return pop_p, loc_p, (p, se, switches) - - # 1.1 - def test_pop_hz_deficiency( - self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000 - ): - """Use Hardy-Weinberg test for heterozygote deficiency. - - Returns a population iterator containing a dictionary where - dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps). - - Some loci have a None if the info is not available. - SE might be none (for enumerations). - """ - return self._test_pop_hz_both( - fname, 1, ".D", enum_test, dememorization, batches, iterations - ) - - # 1.2 - def test_pop_hz_excess( - self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000 - ): - """Use Hardy-Weinberg test for heterozygote deficiency. 
- - Returns a population iterator containing a dictionary where - dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps). - - Some loci have a None if the info is not available. - SE might be none (for enumerations). - """ - return self._test_pop_hz_both( - fname, 2, ".E", enum_test, dememorization, batches, iterations - ) - - # 1.3 P file - def test_pop_hz_prob( - self, - fname, - ext, - enum_test=False, - dememorization=10000, - batches=20, - iterations=5000, - ): - """Use Hardy-Weinberg test based on probability. - - Returns 2 iterators and a final tuple: - - 1. Returns a loci iterator containing: - - A dictionary[pop_pos]=(P-val, SE, Fis-WC, Fis-RH, steps). - Some pops have a None if the info is not available. - SE might be none (for enumerations). - - Result of Fisher's test (Chi2, deg freedom, prob). - 2. Returns a population iterator containing: - - A dictionary[locus]=(P-val, SE, Fis-WC, Fis-RH, steps). - Some loci have a None if the info is not available. - SE might be none (for enumerations). - - Result of Fisher's test (Chi2, deg freedom, prob). - 3. Final tuple (Chi2, deg freedom, prob). - - """ - opts = self._get_opts(dememorization, batches, iterations, enum_test) - self._run_genepop([ext], [1, 3], fname, opts) - - def hw_prob_loci_func(self): - return _hw_func(self.stream, True, True) - - def hw_prob_pop_func(self): - return _hw_func(self.stream, False, True) - - shutil.copyfile(fname + ".P", fname + ".P2") - - return ( - _FileIterator(hw_prob_loci_func, fname + ".P"), - _FileIterator(hw_prob_pop_func, fname + ".P2"), - ) - - # 1.4 - def test_global_hz_deficiency( - self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000 - ): - """Use Global Hardy-Weinberg test for heterozygote deficiency. - - Returns a triple with: - - An list per population containing (pop_name, P-val, SE, switches). - Some pops have a None if the info is not available. - SE might be none (for enumerations). 
- - An list per loci containing (locus_name, P-val, SE, switches). - Some loci have a None if the info is not available. - SE might be none (for enumerations). - - Overall results (P-val, SE, switches). - - """ - return self._test_global_hz_both( - fname, 4, ".DG", enum_test, dememorization, batches, iterations - ) - - # 1.5 - def test_global_hz_excess( - self, fname, enum_test=True, dememorization=10000, batches=20, iterations=5000 - ): - """Use Global Hardy-Weinberg test for heterozygote excess. - - Returns a triple with: - - A list per population containing (pop_name, P-val, SE, switches). - Some pops have a None if the info is not available. - SE might be none (for enumerations). - - A list per loci containing (locus_name, P-val, SE, switches). - Some loci have a None if the info is not available. - SE might be none (for enumerations). - - Overall results (P-val, SE, switches) - - """ - return self._test_global_hz_both( - fname, 5, ".EG", enum_test, dememorization, batches, iterations - ) - - # 2.1 - def test_ld(self, fname, dememorization=10000, batches=20, iterations=5000): - """Test for linkage disequilibrium on each pair of loci in each population.""" - opts = self._get_opts(dememorization, batches, iterations) - self._run_genepop([".DIS"], [2, 1], fname, opts) - - def ld_pop_func(self): - current_pop = None - line = self.stream.readline().rstrip() - if line == "": - self.done = True - raise StopIteration - toks = [x for x in line.split(" ") if x != ""] - pop, locus1, locus2 = toks[0], toks[1], toks[2] - if not hasattr(self, "start_locus1"): - start_locus1, start_locus2 = locus1, locus2 - current_pop = -1 - if locus1 == start_locus1 and locus2 == start_locus2: - current_pop += 1 - if toks[3] == "No": - return current_pop, pop, (locus1, locus2), None - p, se, switches = _gp_float(toks[3]), _gp_float(toks[4]), _gp_int(toks[5]) - return current_pop, pop, (locus1, locus2), (p, se, switches) - - def ld_func(self): - line = self.stream.readline().rstrip() - if 
line == "": - self.done = True - raise StopIteration - toks = [x for x in line.split(" ") if x != ""] - locus1, locus2 = toks[0], toks[2] - try: - chi2, df, p = _gp_float(toks[3]), _gp_int(toks[4]), _gp_float(toks[5]) - except ValueError: - return (locus1, locus2), None - return (locus1, locus2), (chi2, df, p) - - f1 = open(fname + ".DIS") - line = f1.readline() - while "----" not in line: - line = f1.readline() - shutil.copyfile(fname + ".DIS", fname + ".DI2") - f2 = open(fname + ".DI2") - line = f2.readline() - while "Locus pair" not in line: - line = f2.readline() - while "----" not in line: - line = f2.readline() - return ( - _FileIterator(ld_pop_func, fname + ".DIS", f1), - _FileIterator(ld_func, fname + ".DI2", f2), - ) - - # 2.2 - def create_contingency_tables(self, fname): - """Provision for creating Genotypic contingency tables.""" - raise NotImplementedError - - # 3.1 PR/GE files - def test_genic_diff_all( - self, fname, dememorization=10000, batches=20, iterations=5000 - ): - """Provision for Genic differentiation for all populations.""" - raise NotImplementedError - - # 3.2 PR2/GE2 files - def test_genic_diff_pair( - self, fname, dememorization=10000, batches=20, iterations=5000 - ): - """Provision for Genic differentiation for all population pairs.""" - raise NotImplementedError - - # 3.3 G files - def test_genotypic_diff_all( - self, fname, dememorization=10000, batches=20, iterations=5000 - ): - """Provision for Genotypic differentiation for all populations.""" - raise NotImplementedError - - # 3.4 2G2 files - def test_genotypic_diff_pair( - self, fname, dememorization=10000, batches=20, iterations=5000 - ): - """Provision for Genotypic differentiation for all population pairs.""" - raise NotImplementedError - - # 4 - def estimate_nm(self, fname): - """Estimate the Number of Migrants. 
- - Parameters: - - fname - file name - - Returns - - Mean sample size - - Mean frequency of private alleles - - Number of migrants for Ne=10 - - Number of migrants for Ne=25 - - Number of migrants for Ne=50 - - Number of migrants after correcting for expected size - - """ - self._run_genepop(["PRI"], [4], fname) - with open(fname + ".PRI") as f: - lines = f.readlines() # Small file, it is ok - for line in lines: - m = re.search("Mean sample size: ([.0-9]+)", line) - if m is not None: - mean_sample_size = _gp_float(m.group(1)) - m = re.search(r"Mean frequency of private alleles p\(1\)= ([.0-9]+)", line) - if m is not None: - mean_priv_alleles = _gp_float(m.group(1)) - m = re.search("N=10: ([.0-9]+)", line) - if m is not None: - mig10 = _gp_float(m.group(1)) - m = re.search("N=25: ([.0-9]+)", line) - if m is not None: - mig25 = _gp_float(m.group(1)) - m = re.search("N=50: ([.0-9]+)", line) - if m is not None: - mig50 = _gp_float(m.group(1)) - m = re.search("for size= ([.0-9]+)", line) - if m is not None: - mig_corrected = _gp_float(m.group(1)) - os.remove(fname + ".PRI") - return mean_sample_size, mean_priv_alleles, mig10, mig25, mig50, mig_corrected - - # 5.1 - def calc_allele_genotype_freqs(self, fname): - """Calculate allele and genotype frequencies per locus and per sample. 
- - Parameters: - - fname - file name - - Returns tuple with 2 elements: - - Population iterator with - - - population name - - Locus dictionary with key = locus name and content tuple as - Genotype List with - (Allele1, Allele2, observed, expected) - (expected homozygotes, observed hm, - expected heterozygotes, observed ht) - Allele frequency/Fis dictionary with allele as key and - (count, frequency, Fis Weir & Cockerham) - - Totals as a pair - - count - - Fis Weir & Cockerham, - - Fis Robertson & Hill - - - Locus iterator with - - - Locus name - - allele list - - Population list with a triple - - - population name - - list of allele frequencies in the same order as allele list above - - number of genes - - Will create a file called fname.INF - - """ - self._run_genepop(["INF"], [5, 1], fname) - # First pass, general information - # num_loci = None - # num_pops = None - # with open(fname + ".INF") as f: - # line = f.readline() - # while (num_loci is None or num_pops is None) and line != '': - # m = re.search("Number of populations detected : ([0-9+])", l) - # if m is not None: - # num_pops = _gp_int(m.group(1)) - # m = re.search("Number of loci detected : ([0-9+])", l) - # if m is not None: - # num_loci = _gp_int(m.group(1)) - # line = f.readline() - - def pop_parser(self): - if hasattr(self, "old_line"): - line = self.old_line - del self.old_line - else: - line = self.stream.readline() - loci_content = {} - while line != "": - line = line.rstrip() - if "Tables of allelic frequencies for each locus" in line: - return self.curr_pop, loci_content - match = re.match(".*Pop: (.+) Locus: (.+)", line) - if match is not None: - pop = match.group(1).rstrip() - locus = match.group(2) - if not hasattr(self, "first_locus"): - self.first_locus = locus - if hasattr(self, "curr_pop"): - if self.first_locus == locus: - old_pop = self.curr_pop - # self.curr_pop = pop - self.old_line = line - del self.first_locus - del self.curr_pop - return old_pop, loci_content - self.curr_pop = 
pop - else: - line = self.stream.readline() - continue - geno_list = [] - line = self.stream.readline() - if "No data" in line: - continue - - while "Genotypes Obs." not in line: - line = self.stream.readline() - - while line != "\n": - m2 = re.match(" +([0-9]+) , ([0-9]+) *([0-9]+) *(.+)", line) - if m2 is not None: - geno_list.append( - ( - _gp_int(m2.group(1)), - _gp_int(m2.group(2)), - _gp_int(m2.group(3)), - _gp_float(m2.group(4)), - ) - ) - else: - line = self.stream.readline() - continue - line = self.stream.readline() - - while "Expected number of ho" not in line: - line = self.stream.readline() - expHo = _gp_float(line[38:]) - line = self.stream.readline() - obsHo = _gp_int(line[38:]) - line = self.stream.readline() - expHe = _gp_float(line[38:]) - line = self.stream.readline() - obsHe = _gp_int(line[38:]) - line = self.stream.readline() - - while "Sample count" not in line: - line = self.stream.readline() - line = self.stream.readline() - freq_fis = {} - overall_fis = None - while "----" not in line: - vals = [x for x in line.rstrip().split(" ") if x != ""] - if vals[0] == "Tot": - overall_fis = ( - _gp_int(vals[1]), - _gp_float(vals[2]), - _gp_float(vals[3]), - ) - else: - freq_fis[_gp_int(vals[0])] = ( - _gp_int(vals[1]), - _gp_float(vals[2]), - _gp_float(vals[3]), - ) - line = self.stream.readline() - loci_content[locus] = ( - geno_list, - (expHo, obsHo, expHe, obsHe), - freq_fis, - overall_fis, - ) - self.done = True - raise StopIteration - - def locus_parser(self): - line = self.stream.readline() - while line != "": - line = line.rstrip() - match = re.match(" Locus: (.+)", line) - if match is not None: - locus = match.group(1) - alleles, table = _read_allele_freq_table(self.stream) - return locus, alleles, table - line = self.stream.readline() - self.done = True - raise StopIteration - - shutil.copyfile(fname + ".INF", fname + ".IN2") - pop_iter = _FileIterator(pop_parser, fname + ".INF") - locus_iter = _FileIterator(locus_parser, fname + ".IN2") - 
return (pop_iter, locus_iter) - - def _calc_diversities_fis(self, fname, ext): - self._run_genepop([ext], [5, 2], fname) - with open(fname + ext) as f: - line = f.readline() - while line != "": - line = line.rstrip() - if line.startswith( - "Statistics per sample over all loci with at least two individuals typed" - ): - avg_fis = _read_table(f, [str, _gp_float, _gp_float, _gp_float]) - avg_Qintra = _read_table(f, [str, _gp_float]) - line = f.readline() - - def fis_func(self): - line = self.stream.readline() - while line != "": - line = line.rstrip() - m = re.search("Locus: (.+)", line) - if m is not None: - locus = m.group(1) - self.stream.readline() - if "No complete" in self.stream.readline(): - return locus, None - self.stream.readline() - fis_table = _read_table( - self.stream, [str, _gp_float, _gp_float, _gp_float] - ) - self.stream.readline() - avg_qinter, avg_fis = tuple( - _gp_float(x) - for x in [ - y for y in self.stream.readline().split(" ") if y != "" - ] - ) - return locus, fis_table, avg_qinter, avg_fis - line = self.stream.readline() - self.done = True - raise StopIteration - - return _FileIterator(fis_func, fname + ext), avg_fis, avg_Qintra - - # 5.2 - def calc_diversities_fis_with_identity(self, fname): - """Compute identity-base Gene diversities and Fis.""" - return self._calc_diversities_fis(fname, ".DIV") - - # 5.3 - def calc_diversities_fis_with_size(self, fname): - """Provision to Computer Allele size-based Gene diversities and Fis.""" - raise NotImplementedError - - # 6.1 Less genotype frequencies - def calc_fst_all(self, fname): - """Execute GenePop and gets Fst/Fis/Fit (all populations). - - Parameters: - - fname - file name - - Returns: - - (multiLocusFis, multiLocusFst, multiLocus Fit), - - Iterator of tuples - (Locus name, Fis, Fst, Fit, Qintra, Qinter) - - Will create a file called ``fname.FST``. - - This does not return the genotype frequencies. 
- - """ - self._run_genepop([".FST"], [6, 1], fname) - with open(fname + ".FST") as f: - line = f.readline() - while line != "": - if line.startswith(" All:"): - toks = [x for x in line.rstrip().split(" ") if x != ""] - try: - allFis = _gp_float(toks[1]) - except ValueError: - allFis = None - try: - allFst = _gp_float(toks[2]) - except ValueError: - allFst = None - try: - allFit = _gp_float(toks[3]) - except ValueError: - allFit = None - line = f.readline() - - def proc(self): - if hasattr(self, "last_line"): - line = self.last_line - del self.last_line - else: - line = self.stream.readline() - locus = None - fis = None - fst = None - fit = None - qintra = None - qinter = None - while line != "": - line = line.rstrip() - if line.startswith(" Locus:"): - if locus is not None: - self.last_line = line - return locus, fis, fst, fit, qintra, qinter - else: - locus = line.split(":")[1].lstrip() - elif line.startswith("Fis^="): - fis = _gp_float(line.split(" ")[1]) - elif line.startswith("Fst^="): - fst = _gp_float(line.split(" ")[1]) - elif line.startswith("Fit^="): - fit = _gp_float(line.split(" ")[1]) - elif line.startswith("1-Qintra^="): - qintra = _gp_float(line.split(" ")[1]) - elif line.startswith("1-Qinter^="): - qinter = _gp_float(line.split(" ")[1]) - return locus, fis, fst, fit, qintra, qinter - line = self.stream.readline() - if locus is not None: - return locus, fis, fst, fit, qintra, qinter - self.stream.close() - self.done = True - raise StopIteration - - return (allFis, allFst, allFit), _FileIterator(proc, fname + ".FST") - - # 6.2 - def calc_fst_pair(self, fname): - """Estimate spatial structure from Allele identity for all population pairs.""" - self._run_genepop([".ST2", ".MIG"], [6, 2], fname) - with open(fname + ".ST2") as f: - line = f.readline() - while line != "": - line = line.rstrip() - if line.startswith("Estimates for all loci"): - avg_fst = _read_headed_triangle_matrix(f) - line = f.readline() - - def loci_func(self): - line = 
self.stream.readline() - while line != "": - line = line.rstrip() - m = re.search(" Locus: (.+)", line) - if m is not None: - locus = m.group(1) - matrix = _read_headed_triangle_matrix(self.stream) - return locus, matrix - line = self.stream.readline() - self.done = True - raise StopIteration - - os.remove(fname + ".MIG") - return _FileIterator(loci_func, fname + ".ST2"), avg_fst - - # 6.3 - def calc_rho_all(self, fname): - """Provision for estimating spatial structure from Allele size for all populations.""" - raise NotImplementedError - - # 6.4 - def calc_rho_pair(self, fname): - """Provision for estimating spatial structure from Allele size for all population pairs.""" - raise NotImplementedError - - def _calc_ibd(self, fname, sub, stat="a", scale="Log", min_dist=0.00001): - """Calculate isolation by distance statistics (PRIVATE).""" - self._run_genepop( - [".GRA", ".MIG", ".ISO"], - [6, sub], - fname, - opts={ - "MinimalDistance": min_dist, - "GeographicScale": scale, - "IsolBDstatistic": stat, - }, - ) - with open(fname + ".ISO") as f: - f.readline() - f.readline() - f.readline() - f.readline() - estimate = _read_triangle_matrix(f) - f.readline() - f.readline() - distance = _read_triangle_matrix(f) - f.readline() - match = re.match("a = (.+), b = (.+)", f.readline().rstrip()) - a = _gp_float(match.group(1)) - b = _gp_float(match.group(2)) - f.readline() - f.readline() - match = re.match(" b=(.+)", f.readline().rstrip()) - bb = _gp_float(match.group(1)) - match = re.match(r".*\[(.+) ; (.+)\]", f.readline().rstrip()) - bblow = _gp_float(match.group(1)) - bbhigh = _gp_float(match.group(2)) - os.remove(fname + ".MIG") - os.remove(fname + ".GRA") - os.remove(fname + ".ISO") - return estimate, distance, (a, b), (bb, bblow, bbhigh) - - # 6.5 - def calc_ibd_diplo(self, fname, stat="a", scale="Log", min_dist=0.00001): - """Calculate isolation by distance statistics for diploid data. - - See _calc_ibd for parameter details. 
- - Note that each pop can only have a single individual and - the individual name has to be the sample coordinates. - """ - return self._calc_ibd(fname, 5, stat, scale, min_dist) - - # 6.6 - def calc_ibd_haplo(self, fname, stat="a", scale="Log", min_dist=0.00001): - """Calculate isolation by distance statistics for haploid data. - - See _calc_ibd for parameter details. - - Note that each pop can only have a single individual and - the individual name has to be the sample coordinates. - """ - return self._calc_ibd(fname, 6, stat, scale, min_dist) diff --git a/Bio/PopGen/GenePop/EasyController.py b/Bio/PopGen/GenePop/EasyController.py deleted file mode 100644 index c650b8e2d..000000000 --- a/Bio/PopGen/GenePop/EasyController.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright 2009 by Tiago Antao . All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. - -"""Control GenePop through an easier interface. - -This interface is less efficient than the standard GenePopControler - -""" - -from Bio.PopGen import GenePop - -from .Controller import GenePopController - - -class EasyController: - """Define a class for an easier interface with the GenePop program.""" - - def __init__(self, fname, genepop_dir=None): - """Initialize the controller. - - genepop_dir is the directory where GenePop is. - - The binary should be called Genepop (capital G) - """ - self._fname = fname - self._controller = GenePopController(genepop_dir) - self.__fst_pair_locus = {} # More caches like this needed! - self.__allele_frequency = {} # More caches like this needed! 
- - def get_basic_info(self): - """Obtain the population list and loci list from the file.""" - with open(self._fname) as f: - rec = GenePop.read(f) - return rec.pop_list, rec.loci_list - - # 1.3 - def test_hw_pop(self, pop_pos, test_type="probability"): - """Perform Hardy-Weinberg test on the given position.""" - if test_type == "deficiency": - hw_res = self._controller.test_pop_hz_deficiency(self._fname) - elif test_type == "excess": - hw_res = self._controller.test_pop_hz_excess(self._fname) - else: - loci_res, hw_res, fisher_full = self._controller.test_pop_hz_prob( - self._fname, ".P" - ) - for i in range(pop_pos - 1): - next(hw_res) - return next(hw_res) - - # 1.4 - def test_hw_global( - self, - test_type="deficiency", - enum_test=True, - dememorization=10000, - batches=20, - iterations=5000, - ): - """Perform Hardy-Weinberg global Heterozygote test.""" - if test_type == "deficiency": - pop_res, loc_res, all = self._controller.test_global_hz_deficiency( - self._fname, enum_test, dememorization, batches, iterations - ) - else: - pop_res, loc_res, all = self._controller.test_global_hz_excess( - self._fname, enum_test, dememorization, batches, iterations - ) - return list(pop_res), list(loc_res), all - - # 2.1 - def test_ld_all_pair( - self, locus1, locus2, dememorization=10000, batches=20, iterations=5000 - ): - """Test for linkage disequilibrium for each pair of loci in each population.""" - all_ld = self._controller.test_ld( - self._fname, dememorization, batches, iterations - )[1] - for ld_case in all_ld: - (l1, l2), result = ld_case - if (l1 == locus1 and l2 == locus2) or (l1 == locus2 and l2 == locus1): - return result - - def estimate_nm(self): - """Estimate Nm. Just a simple bridge.""" - return self._controller.estimate_nm(self._fname) - - def get_heterozygosity_info(self, pop_pos, locus_name): - """Return the heterozygosity info for a certain locus on a population. 
- - Returns (Expected homozygotes, observed homozygotes, - Expected heterozygotes, observed heterozygotes) - """ - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = geno_freqs - pops = list(pop_iter) - return pops[pop_pos][1][locus_name][1] - - def get_genotype_count(self, pop_pos, locus_name): - """Return the genotype counts for a certain population and locus.""" - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = geno_freqs - pop_iter = list(pop_iter) - return pop_iter[pop_pos][1][locus_name][0] - - def get_fis(self, pop_pos, locus_name): - """Return the Fis for a certain population and locus. - - Below CW means Cockerham and Weir and RH means Robertson and Hill. - - Returns a pair: - - - dictionary [allele] = (repetition count, frequency, Fis CW ) - with information for each allele - - a triple with total number of alleles, Fis CW, Fis RH - - """ - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = geno_freqs - pops = list(pop_iter) - return pops[pop_pos][1][locus_name][2:] - - def get_alleles(self, pop_pos, locus_name): - """Return the alleles for a certain population and locus.""" - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = geno_freqs - pop_iter = list(pop_iter) - return list(pop_iter[pop_pos][1][locus_name][2].keys()) - - def get_alleles_all_pops(self, locus_name): - """Return the alleles for a certain population and locus.""" - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = geno_freqs - for locus_info in loc_iter: - if locus_info[0] == locus_name: - return locus_info[1] - - def get_allele_frequency(self, pop_pos, locus_name): - """Calculate the allele frequency for a certain locus on a population.""" - if len(self.__allele_frequency) == 0: - geno_freqs = self._controller.calc_allele_genotype_freqs(self._fname) - pop_iter, loc_iter = 
geno_freqs - for locus_info in loc_iter: - if locus_info[0] is None: - self.__allele_frequency[locus_info[0]] = None, None - else: - self.__allele_frequency[locus_info[0]] = locus_info[1:] - info = self.__allele_frequency[locus_name] - pop_name, freqs, total = info[1][pop_pos] - allele_freq = {} - alleles = info[0] - for i, allele in enumerate(alleles): - allele_freq[allele] = freqs[i] - return total, allele_freq - - def get_multilocus_f_stats(self): - """Return the multilocus F stats. - - Explain averaging. - Returns Fis(CW), Fst, Fit - """ - return self._controller.calc_fst_all(self._fname)[0] - - def get_f_stats(self, locus_name): - """Return F stats for a locus. - - Returns Fis(CW), Fst, Fit, Qintra, Qinter - """ - loci_iter = self._controller.calc_fst_all(self._fname)[1] - for name, fis, fst, fit, qintra, qinter in loci_iter: - if name == locus_name: - return fis, fst, fit, qintra, qinter - - def get_avg_fis(self): - """Calculate identity-base average Fis.""" - return self._controller.calc_diversities_fis_with_identity(self._fname)[1] - - def get_avg_fst_pair(self): - """Calculate Allele size-base average Fis for all population pairs.""" - return self._controller.calc_fst_pair(self._fname)[1] - - def get_avg_fst_pair_locus(self, locus): - """Calculate Allele size-base average Fis for all population pairs of the given locus.""" - if len(self.__fst_pair_locus) == 0: - iter = self._controller.calc_fst_pair(self._fname)[0] - for locus_info in iter: - self.__fst_pair_locus[locus_info[0]] = locus_info[1] - return self.__fst_pair_locus[locus] - - def calc_ibd(self, is_diplo=True, stat="a", scale="Log", min_dist=0.00001): - """Calculate isolation by distance statistics for Diploid or Haploid.""" - if is_diplo: - return self._controller.calc_ibd_diplo(self._fname, stat, scale, min_dist) - else: - return self._controller.calc_ibd_haplo(self._fname, stat, scale, min_dist) diff --git a/Bio/Sequencing/Applications/_Novoalign.py b/Bio/Sequencing/Applications/_Novoalign.py 
deleted file mode 100644 index 7f1e236fd..000000000 --- a/Bio/Sequencing/Applications/_Novoalign.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright 2009 by Osvaldo Zagordi. All rights reserved. -# Revisions copyright 2010 by Peter Cock. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for the short read aligner Novoalign by Novocraft.""" - -from Bio.Application import _Option -from Bio.Application import AbstractCommandline - - -class NovoalignCommandline(AbstractCommandline): - """Command line wrapper for novoalign by Novocraft. - - See www.novocraft.com - novoalign is a short read alignment program. - - Examples - -------- - >>> from Bio.Sequencing.Applications import NovoalignCommandline - >>> novoalign_cline = NovoalignCommandline(database='some_db', - ... readfile='some_seq.txt') - >>> print(novoalign_cline) - novoalign -d some_db -f some_seq.txt - - As with all the Biopython application wrappers, you can also add or - change options after creating the object: - - >>> novoalign_cline.format = 'PRBnSEQ' - >>> novoalign_cline.r_method='0.99' # limited valid values - >>> novoalign_cline.fragment = '250 20' # must be given as a string - >>> novoalign_cline.miRNA = 100 - >>> print(novoalign_cline) - novoalign -d some_db -f some_seq.txt -F PRBnSEQ -r 0.99 -i 250 20 -m 100 - - You would typically run the command line with novoalign_cline() or via - the Python subprocess module, as described in the Biopython tutorial. 
- - Last checked against version: 2.05.04 - - """ - - def __init__(self, cmd="novoalign", **kwargs): - """Initialize the class.""" - READ_FORMAT = ["FA", "SLXFQ", "STDFQ", "ILMFQ", "PRB", "PRBnSEQ"] - REPORT_FORMAT = ["Native", "Pairwise", "SAM"] - REPEAT_METHOD = ["None", "Random", "All", "Exhaustive", "0.99"] - - self.parameters = [ - _Option( - ["-d", "database"], "database filename", filename=True, equate=False - ), - _Option(["-f", "readfile"], "read file", filename=True, equate=False), - _Option( - ["-F", "format"], - f"Format of read files.\n\nAllowed values: {', '.join(READ_FORMAT)}", - checker_function=lambda x: x in READ_FORMAT, - equate=False, - ), - # Alignment scoring options - _Option( - ["-t", "threshold"], - "Threshold for alignment score", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-g", "gap_open"], - "Gap opening penalty [default: 40]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-x", "gap_extend"], - "Gap extend penalty [default: 15]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-u", "unconverted"], - "Experimental: unconverted cytosines penalty in bisulfite mode\n\n" - "Default: no penalty", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Quality control and read filtering - _Option( - ["-l", "good_bases"], - "Minimum number of good quality bases [default: log(N_g, 4) + 5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-h", "homopolymer"], - "Homopolymer read filter [default: 20; disable: negative value]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Read preprocessing options - _Option( - ["-a", "adapter3"], - "Strips a 3' adapter sequence prior to alignment.\n\n" - "With paired ends two adapters can be specified", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - _Option( - ["-n", "truncate"], - "Truncate 
to specific length before alignment", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-s", "trimming"], - "If fail to align, trim by s bases until they map or become shorter than l.\n\n" - "Ddefault: 2", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-5", "adapter5"], - "Strips a 5' adapter sequence.\n\n" - "Similar to -a (adaptor3), but on the 5' end.", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - # Reporting options - _Option( - ["-o", "report"], - "Specifies the report format.\n\nAllowed values: %s\nDefault: Native" - % ", ".join(REPORT_FORMAT), - checker_function=lambda x: x in REPORT_FORMAT, - equate=False, - ), - _Option( - ["-Q", "quality"], - "Lower threshold for an alignment to be reported [default: 0]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-R", "repeats"], - "If score difference is higher, report repeats.\n\n" - "Otherwise -r read method applies [default: 5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-r", "r_method"], - "Methods to report reads with multiple matches.\n\n" - "Allowed values: %s\n" - "'All' and 'Exhaustive' accept limits." 
% ", ".join(REPEAT_METHOD), - checker_function=lambda x: x.split()[0] in REPEAT_METHOD, - equate=False, - ), - _Option( - ["-e", "recorded"], - "Alignments recorded with score equal to the best.\n\n" - "Default: 1000 in default read method, otherwise no limit.", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-q", "qual_digits"], - "Decimal digits for quality scores [default: 0]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Paired end options - _Option( - ["-i", "fragment"], - "Fragment length (2 reads + insert) and standard deviation [default: 250 30]", - checker_function=lambda x: len(x.split()) == 2, - equate=False, - ), - _Option( - ["-v", "variation"], - "Structural variation penalty [default: 70]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # miRNA mode - _Option( - ["-m", "miRNA"], - "Sets miRNA mode and optionally sets a value for the region scanned [default: off]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Multithreading - _Option( - ["-c", "cores"], - "Number of threads, disabled on free versions [default: number of cores]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Quality calibrations - _Option( - ["-k", "read_cal"], - "Read quality calibration from file (mismatch counts)", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - _Option( - ["-K", "write_cal"], - "Accumulate mismatch counts and write to file", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Sequencing/Applications/__init__.py b/Bio/Sequencing/Applications/__init__.py deleted file mode 100644 index 206391b2f..000000000 --- a/Bio/Sequencing/Applications/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2009 by Osvaldo Zagordi. 
All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. - -"""Sequencing related command line application wrappers (OBSOLETE). - -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from ._bwa import BwaAlignCommandline -from ._bwa import BwaBwaswCommandline -from ._bwa import BwaIndexCommandline -from ._bwa import BwaMemCommandline -from ._bwa import BwaSampeCommandline -from ._bwa import BwaSamseCommandline -from ._Novoalign import NovoalignCommandline -from ._samtools import SamtoolsCalmdCommandline -from ._samtools import SamtoolsCatCommandline -from ._samtools import SamtoolsFaidxCommandline -from ._samtools import SamtoolsFixmateCommandline -from ._samtools import SamtoolsIdxstatsCommandline -from ._samtools import SamtoolsIndexCommandline -from ._samtools import SamtoolsMergeCommandline -from ._samtools import SamtoolsMpileupCommandline -from ._samtools import SamtoolsPhaseCommandline -from ._samtools import SamtoolsReheaderCommandline -from ._samtools import SamtoolsRmdupCommandline -from ._samtools import SamtoolsTargetcutCommandline -from ._samtools import SamtoolsVersion0xSortCommandline -from ._samtools import SamtoolsVersion0xSortCommandline as SamtoolsSortCommandline -from ._samtools import SamtoolsVersion1xSortCommandline -from ._samtools import SamtoolsViewCommandline - -# Make this explicit, then they show up in the API docs -__all__ = ( - "BwaIndexCommandline", - "BwaAlignCommandline", - "BwaSamseCommandline", - "BwaSampeCommandline", - "BwaBwaswCommandline", - "BwaMemCommandline", - "NovoalignCommandline", - "SamtoolsViewCommandline", - "SamtoolsCalmdCommandline", - "SamtoolsCatCommandline", - "SamtoolsFaidxCommandline", - 
"SamtoolsFixmateCommandline", - "SamtoolsIdxstatsCommandline", - "SamtoolsIndexCommandline", - "SamtoolsMergeCommandline", - "SamtoolsMpileupCommandline", - "SamtoolsPhaseCommandline", - "SamtoolsReheaderCommandline", - "SamtoolsRmdupCommandline", - "SamtoolsSortCommandline", - "SamtoolsVersion0xSortCommandline", - "SamtoolsVersion1xSortCommandline", - "SamtoolsTargetcutCommandline", -) diff --git a/Bio/Sequencing/Applications/_bwa.py b/Bio/Sequencing/Applications/_bwa.py deleted file mode 100644 index 6a8df32f1..000000000 --- a/Bio/Sequencing/Applications/_bwa.py +++ /dev/null @@ -1,643 +0,0 @@ -# Copyright 2013 Saket Choudhary. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. -"""Command line wrapper for bwa.""" - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _StaticArgument -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class BwaIndexCommandline(AbstractCommandline): - """Command line wrapper for Burrows Wheeler Aligner (BWA) index. - - Index database sequences in the FASTA format, equivalent to:: - - $ bwa index [-p prefix] [-a algoType] [-c] - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. - - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaIndexCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") - >>> print(index_cmd) - bwa index -a bwtsw /path/to/reference_genome.fasta - - You would typically run the command using index_cmd() or via the - Python subprocess module, as described in the Biopython tutorial. 
- - """ - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("index"), - _Option( - ["-a", "a", "algorithm"], - """Algorithm for constructing BWT index. - - Available options are: - - is: IS linear-time algorithm for constructing suffix array. - It requires 5.37N memory where N is the size of the database. - IS is moderately fast, but does not work with database larger - than 2GB. IS is the default algorithm due to its simplicity. - - bwtsw: Algorithm implemented in BWT-SW. This method works with the - whole human genome, but it does not work with database - smaller than 10MB and it is usually slower than IS.""", - checker_function=lambda x: x in ["is", "bwtsw"], - equate=False, - is_required=True, - ), - _Option( - ["-p", "p", "prefix"], - "Prefix of the output database [same as db filename]", - equate=False, - is_required=False, - ), - _Argument(["infile"], "Input file name", filename=True, is_required=True), - _Switch( - ["-c", "c"], - "Build color-space index. The input fasta should be in nucleotide space.", - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class BwaAlignCommandline(AbstractCommandline): - """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. - - Run a BWA alignment, equivalent to:: - - $ bwa aln [...] > - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. - - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaAlignCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> read_file = "/path/to/read_1.fq" - >>> output_sai_file = "/path/to/read_1.sai" - >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) - >>> print(align_cmd) - bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq - - You would typically run the command line using align_cmd(stdout=output_sai_file) - or via the Python subprocess module, as described in the Biopython tutorial. 
- - """ - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("aln"), - _Argument( - ["reference"], "Reference file name", filename=True, is_required=True - ), - _Argument(["read_file"], "Read file name", filename=True, is_required=True), - _Option( - ["-n", "n"], - "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", - checker_function=lambda x: isinstance(x, (int, float)), - equate=False, - ), - _Option( - ["-o", "o"], - "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", - checker_function=lambda x: isinstance(x, (int, float)), - equate=False, - ), - _Option( - ["-e", "e"], - "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-d", "d"], - "Disallow a long deletion within INT bp towards the 3-end [16]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-i", "i"], - "Disallow an indel within INT bp towards the ends [5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-l", "l"], - """Take the first INT subsequence as seed. - - If INT is larger than the query sequence, seeding will be disabled. - For long reads, this option is typically ranged from 25 to 35 for - -k 2. 
[inf]""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-k", "k"], - "Maximum edit distance in the seed [2]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-t", "t"], - "Number of threads (multi-threading mode) [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-M", "M"], - "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-O", "O"], - "Gap open penalty [11]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-E", "E"], - "Gap extension penalty [4]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-R", "R"], - """Proceed with suboptimal alignments if there are no more than INT equally best hits. - - This option only affects paired-end mapping. Increasing this threshold helps - to improve the pairing accuracy at the cost of speed, especially for short - reads (~32bp).""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-q", "q"], - r"""Parameter for read trimming [0]. - - BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l > - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. - - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaSamseCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> read_file = "/path/to/read_1.fq" - >>> sai_file = "/path/to/read_1.sai" - >>> output_sam_file = "/path/to/read_1.sam" - >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, - ... 
read_file=read_file, sai_file=sai_file) - >>> print(samse_cmd) - bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq - - You would typically run the command line using samse_cmd(stdout=output_sam_file) - or via the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("samse"), - _Argument( - ["reference"], "Reference file name", filename=True, is_required=True - ), - _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), - _Argument( - ["read_file"], "Read file name", filename=True, is_required=True - ), - _Option( - ["-n", "n"], - """Maximum number of alignments to output in the XA tag for reads paired properly. - - If a read has more than INT hits, the XA tag will not be written. [3]""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-r", "r"], - "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class BwaSampeCommandline(AbstractCommandline): - r"""Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. - - Generate alignments in the SAM format given paired-end reads. - Equivalent to:: - - $ bwa sampe [...] > - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. 
- - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaSampeCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> read_file1 = "/path/to/read_1.fq" - >>> read_file2 = "/path/to/read_2.fq" - >>> sai_file1 = "/path/to/read_1.sai" - >>> sai_file2 = "/path/to/read_2.sai" - >>> output_sam_file = "/path/to/output.sam" - >>> read_group = r"@RG\tID:foo\tSM:bar" # BWA will turn backslash-t into tab - >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, - ... sai_file1=sai_file1, sai_file2=sai_file2, - ... read_file1=read_file1, read_file2=read_file2, - ... r=read_group) - >>> print(sampe_cmd) - bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG\tID:foo\tSM:bar - - You would typically run the command line using sampe_cmd(stdout=output_sam_file) - or via the Python subprocess module, as described in the Biopython tutorial. - - """ - - # TODO - Should the read group have a raw tab in it, or \t? - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("sampe"), - _Argument( - ["reference"], "Reference file name", filename=True, is_required=True - ), - _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), - _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), - _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), - _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), - _Option( - ["-a", "a"], - """Maximum insert size for a read pair to be considered being mapped properly [500]. - - Since 0.4.5, this option is only used when there are not enough - good alignments to infer the distribution of insert sizes.""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-o", "o"], - """Maximum occurrences of a read for pairing [100000]. 
- - A read with more occurrences will be treated as a single-end read. - Reducing this parameter helps faster pairing.""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-n", "n"], - """Maximum number of alignments to output in the XA tag for reads paired properly [3]. - - If a read has more than INT hits, the XA tag will not be written.""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-N", "N"], - """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. - - If a read has more than INT hits, the XA tag will not be written.""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-r", "r"], - "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class BwaBwaswCommandline(AbstractCommandline): - """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. - - Align query sequences from FASTQ files. Equivalent to:: - - $ bwa bwasw [...] - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. - - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaBwaswCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> read_file = "/path/to/read_1.fq" - >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) - >>> print(bwasw_cmd) - bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq - - You would typically run the command line using bwasw_cmd() or via the - Python subprocess module, as described in the Biopython tutorial. 
- - """ - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("bwasw"), - _Argument( - ["reference"], "Reference file name", filename=True, is_required=True - ), - _Argument(["read_file"], "Read file", filename=True, is_required=True), - _Argument(["mate_file"], "Mate file", filename=True, is_required=False), - _Option( - ["-a", "a"], - "Score of a match [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-b", "b"], - "Mismatch penalty [3]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-q", "q"], - "Gap open penalty [5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-r", "r"], - "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-t", "t"], - "Number of threads in the multi-threading mode [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-w", "w"], - "Band width in the banded alignment [33]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-T", "T"], - "Minimum score threshold divided by a [37]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-c", "c"], - """Coefficient for threshold adjustment according to query length [5.5]. - - Given an l-long query, the threshold for a hit to be retained is - a*max{T,c*log(l)}.""", - checker_function=lambda x: isinstance(x, float), - equate=False, - ), - _Option( - ["-z", "z"], - "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-s", "s"], - """Maximum SA interval size for initiating a seed [3]. 
- - Higher -s increases accuracy at the cost of speed.""", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-N", "N"], - "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class BwaMemCommandline(AbstractCommandline): - """Command line wrapper for Burrows Wheeler Aligner (BWA) mem. - - Run a BWA-MEM alignment, with single- or paired-end reads, equivalent to:: - - $ bwa mem [...] > - - See http://bio-bwa.sourceforge.net/bwa.shtml for details. - - Examples - -------- - >>> from Bio.Sequencing.Applications import BwaMemCommandline - >>> reference_genome = "/path/to/reference_genome.fasta" - >>> read_file = "/path/to/read_1.fq" - >>> output_sam_file = "/path/to/output.sam" - >>> align_cmd = BwaMemCommandline(reference=reference_genome, read_file1=read_file) - >>> print(align_cmd) - bwa mem /path/to/reference_genome.fasta /path/to/read_1.fq - - You would typically run the command line using align_cmd(stdout=output_sam_file) - or via the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="bwa", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("mem"), - _Argument( - ["reference"], "Reference file name", filename=True, is_required=True - ), - _Argument( - ["read_file1"], "Read 1 file name", filename=True, is_required=True - ), - _Argument( - ["read_file2"], "Read 2 file name", filename=True, is_required=False - ), - _Option( - ["-t", "t"], - "Number of threads [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-k", "k"], - "Minimum seed length. Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. 
[19]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-w", "w"], - "Band width. Essentially, gaps longer than INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-d", "d"], - r"Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above \|i-j\|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-r", "r"], - "Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]", - checker_function=lambda x: isinstance(x, (int, float)), - equate=False, - ), - _Option( - ["-c", "c"], - "Discard a MEM if it has more than INT occurrence in the genome. This is an insensitive parameter. [10000]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-A", "A"], - "Matching score. [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-B", "B"], - "Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-O", "O"], - "Gap open penalty. [6]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-E", "E"], - "Gap extension penalty. 
A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [1]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-L", "L"], - "Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [5]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-U", "U"], - "Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [9] ", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-R", "R"], - "Complete read group header line. 't' can be used in STR and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\tID:foo\tSM:bar'. [null]", - checker_function=lambda x: isinstance(x, str), - equate=False, - ), - _Option( - ["-T", "T"], - "Don't output alignment with score lower than INT. This option only affects output. [30]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-v", "v"], - "Control the verbose level of the output. This option has not been fully supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. 
[3]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Switch( - ["-P", "P"], - "In the paired-end mode, perform SW to rescue missing hits only but do not try to find hits that fit a proper pair.", - ), - _Switch( - ["-p", "p"], - "Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.", - ), - _Switch( - ["-a", "a"], - "Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments.", - ), - _Switch( - ["-C", "C"], - "Append FASTA/Q comment to SAM output. This option can be used to transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.", - ), - _Switch( - ["-H", "H"], - "Use hard clipping 'H' in the SAM output. This option may dramatically reduce the redundancy of output when mapping long contig or BAC sequences.", - ), - _Switch( - ["-M", "M"], - "Mark shorter split hits as secondary (for Picard compatibility).", - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/Sequencing/Applications/_samtools.py b/Bio/Sequencing/Applications/_samtools.py deleted file mode 100644 index 0c561e9fa..000000000 --- a/Bio/Sequencing/Applications/_samtools.py +++ /dev/null @@ -1,1037 +0,0 @@ -# Copyright 2014 Saket Choudhary. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. 
-"""Command line wrapper for samtools.""" -# Last Checked with samtools [0.1.20 and 1.2] -# TODO samtools 1.x has additional options over 0.x which -# are missing from this wrapper - -from Bio.Application import _Argument -from Bio.Application import _ArgumentList -from Bio.Application import _Option -from Bio.Application import _StaticArgument -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class SamtoolsViewCommandline(AbstractCommandline): - """Command line wrapper for samtools view. - - Extract/print all or sub alignments in SAM or BAM format, equivalent to:: - - $ samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] - [-F skipFlag] [-q minMapQ] [-l library] [-r readGroup] - [-R rgFile] | [region1 [...]] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsViewCommandline - >>> input_file = "/path/to/sam_or_bam_file" - >>> samtools_view_cmd = SamtoolsViewCommandline(input_file=input_file) - >>> print(samtools_view_cmd) - samtools view /path/to/sam_or_bam_file - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("view"), - _Switch(["-b", "b"], "Output in the BAM format"), - _Switch( - ["-c", "c"], - """Instead of printing the alignments, only count them and - print the total number. - - All filter options, such as '-f', '-F' and '-q', - are taken into account""", - ), - _Switch(["-h", "h"], "Include the header in the output"), - _Switch( - ["-u", "u"], - """Output uncompressed BAM. - - This option saves time spent on compression/decompression - and is thus preferred when the output is piped to - another samtools command""", - ), - _Switch(["-H", "H"], "Output the header only"), - _Switch( - ["-S", "S"], - """Input is in SAM. 
- If @SQ header lines are absent, - the '-t' option is required.""", - ), - _Option( - ["-t", "t"], - """This file is TAB-delimited. - Each line must contain the reference name and the - length of the reference, one line for each - distinct reference; additional fields are ignored. - - This file also defines the order of the reference - sequences in sorting. - If you run 'samtools faidx ', - the resultant index file .fai can be used - as this file.""", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-o", "o"], - "Output file", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-f", "f"], - """Only output alignments with all bits in - INT present in the FLAG field""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-F", "F"], - "Skip alignments with bits present in INT", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-q", "q"], - "Skip alignments with MAPQ smaller than INT", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-r", "r"], - "Only output reads in read group STR", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-R", "R"], - "Output reads in read groups listed in FILE", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-l", "l"], - "Only output reads in library STR", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Switch( - ["-1", "fast_bam"], - "Use zlib compression level 1 to compress the output", - ), - _Argument( - ["input", "input_file"], - "Input File Name", - filename=True, - is_required=True, - ), - _Argument(["region"], "Region", is_required=False), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsMpileupCommandline(AbstractCommandline): - """Command line wrapper for samtools mpileup. 
- - Generate BCF or pileup for one or multiple BAM files, equivalent to:: - - $ samtools mpileup [-EBug] [-C capQcoef] [-r reg] [-f in.fa] - [-l list] [-M capMapQ] [-Q minBaseQ] - [-q minMapQ] in.bam [in2.bam [...]] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsMpileupCommandline - >>> input = ["/path/to/sam_or_bam_file"] - >>> samtools_mpileup_cmd = SamtoolsMpileupCommandline(input_file=input) - >>> print(samtools_mpileup_cmd) - samtools mpileup /path/to/sam_or_bam_file - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("mpileup"), - _Switch( - ["-E", "E"], - """Extended BAQ computation. - This option helps sensitivity especially - for MNPs, but may hurt specificity a little bit""", - ), - _Switch( - ["-B", "B"], - """Disable probabilistic realignment for the - computation of base alignment quality (BAQ). - - BAQ is the Phred-scaled probability of a read base being - misaligned. - Applying this option greatly helps to reduce false SNPs - caused by misalignments""", - ), - _Switch( - ["-g", "g"], - """Compute genotype likelihoods and output them in the - binary call format (BCF)""", - ), - _Switch( - ["-u", "u"], - """Similar to -g except that the output is - uncompressed BCF, which is preferred for piping""", - ), - _Option( - ["-C", "C"], - """Coefficient for downgrading mapping quality for - reads containing excessive mismatches. - - Given a read with a phred-scaled probability q of - being generated from the mapped position, - the new mapping quality is about sqrt((INT-q)/INT)*INT. 
- A zero value disables this functionality; - if enabled, the recommended value for BWA is 50""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-r", "r"], - "Only generate pileup in region STR", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-f", "f"], - """The faidx-indexed reference file in the FASTA format. - - The file can be optionally compressed by razip""", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-l", "l"], - """BED or position list file containing a list of regions - or sites where pileup or BCF should be generated""", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-M", "M"], - "Cap Mapping Quality at M", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-q", "q"], - "Minimum mapping quality for an alignment to be used", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-Q", "Q"], - "Minimum base quality for a base to be considered", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Switch( - ["-6", "illumina_13"], - "Assume the quality is in the Illumina 1.3+ encoding", - ), - _Switch( - ["-A", "A"], "Do not skip anomalous read pairs in variant calling." - ), - _Option( - ["-b", "b"], - "List of input BAM files, one file per line", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-d", "d"], - "At a position, read maximally INT reads per input BAM", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Switch(["-D", "D"], "Output per-sample read depth"), - _Switch( - ["-S", "S"], - """Output per-sample Phred-scaled - strand bias P-value""", - ), - _Option( - ["-e", "e"], - """Phred-scaled gap extension sequencing error probability. 
- - Reducing INT leads to longer indels""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-h", "h"], - """Coefficient for modeling homopolymer errors. - - Given an l-long homopolymer run, the sequencing error - of an indel of size s is modeled as INT*s/l""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Switch(["-I", "I"], "Do not perform INDEL calling"), - _Option( - ["-L", "L"], - """Skip INDEL calling if the average per-sample - depth is above INT""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-o", "o"], - """Phred-scaled gap open sequencing error probability. - - Reducing INT leads to more indel calls.""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-p", "p"], - """Comma delimited list of platforms (determined by @RG-PL) - from which indel candidates are obtained. - - It is recommended to collect indel candidates from - sequencing technologies that have low indel error rate - such as ILLUMINA""", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _ArgumentList( - ["input_file"], - "Input File for generating mpileup", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsReheaderCommandline(AbstractCommandline): - """Command line wrapper for samtools reheader. - - Replace the header in in.bam with the header - in in.header.sam, equivalent to:: - - $ samtools reheader - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsReheaderCommandline - >>> input_header = "/path/to/header_sam_file" - >>> input_bam = "/path/to/input_bam_file" - >>> reheader_cmd = SamtoolsReheaderCommandline(input_header=input_header, - ... 
input_bam=input_bam) - >>> print(reheader_cmd) - samtools reheader /path/to/header_sam_file /path/to/input_bam_file - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("reheader"), - _Argument( - ["input_header", "header_sam", "sam_file"], - "Sam file with header", - filename=True, - is_required=True, - ), - _Argument( - ["input_bam", "input_file", "bam_file"], - "BAM file for writing header to", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsCatCommandline(AbstractCommandline): - """Command line wrapper for samtools cat. - - Concatenate BAMs, equivalent to:: - - $ samtools cat [-h header.sam] [-o out.bam] [ ... ] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsCatCommandline - >>> input_bam1 = "/path/to/input_bam1" - >>> input_bam2 = "/path/to/input_bam2" - >>> input_bams = [input_bam1, input_bam2] - >>> samtools_cat_cmd = SamtoolsCatCommandline(input_bam=input_bams) - >>> print(samtools_cat_cmd) - samtools cat /path/to/input_bam1 /path/to/input_bam2 - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("cat"), - _Option( - ["-h", "h"], - "Header SAM file", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-o", "o"], - "Output SAM file", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _ArgumentList( - ["input", "input_bam", "bams"], - "Input BAM files", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsVersion0xSortCommandline(AbstractCommandline): - """Command line wrapper for samtools version 0.1.x sort. 
- - Concatenate BAMs, equivalent to:: - - $ samtools sort [-no] [-m maxMem] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsVersion0xSortCommandline - >>> input_bam = "/path/to/input_bam" - >>> out_prefix = "/path/to/out_prefix" - >>> samtools_sort_cmd = SamtoolsVersion0xSortCommandline(input=input_bam, out_prefix=out_prefix) - >>> print(samtools_sort_cmd) - samtools sort /path/to/input_bam /path/to/out_prefix - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - - # options for version samtools 0.0.19 - self.parameters = [ - _StaticArgument("sort"), - _Switch( - ["-o", "o"], - """Output the final alignment - to the standard output""", - ), - _Switch( - ["-n", "n"], - """Sort by read names rather - than by chromosomal coordinates""", - ), - _Option( - ["-m", "m"], - "Approximately the maximum required memory", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Argument(["input"], "Input BAM file", filename=True, is_required=True), - _Argument(["out_prefix"], "Output prefix", filename=True, is_required=True), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsVersion1xSortCommandline(AbstractCommandline): - """Command line wrapper for samtools version 1.3.x sort. 
- - Concatenate BAMs, equivalent to:: - - $ samtools sort [-n] [-T FREFIX] [-o file] [-I INT] [-m maxMem] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsVersion1xSortCommandline - >>> input_bam = "/path/to/input_bam" - >>> FREFIX = "/path/to/out_prefix" - >>> file_name = "/path/to/out_file" - >>> samtools_sort_cmd = SamtoolsVersion1xSortCommandline(input=input_bam, T=FREFIX, o=file_name) - >>> print(samtools_sort_cmd) - samtools sort -o /path/to/out_file -T /path/to/out_prefix /path/to/input_bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - - # options for version samtools 1.3.1 - self.parameters = [ - _StaticArgument("sort"), - _Switch( - ["-n", "n"], - """Sort by read names rather - than by chromosomal coordinates""", - ), - _Option( - ["-o", "o"], - """(file) Write the final sorted output to FILE, - rather than to standard output""", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-O", "O"], - """(FORMAT) Write the final output as sam, bam, or cram""", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-T", "T"], - """(PREFIX) Write temporary files to PREFIX.nnnn.bam, or if the specified PREFIX - is an existing directory, to PREFIX/samtools.mmm.mmm.tmp.nnnn.bam, - where mmm is unique to this invocation of the sort command""", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-I", "I"], - """(INT) Set the desired compression level for the final output file, - ranging from 0 (uncompressed) or 1 (fastest but minimal compression) - to 9 (best compression but slowest to write), similarly to gzip(1)'s compression level setting.""", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-m", "m"], - "Approximately the maximum required memory", - equate=False, - 
checker_function=lambda x: isinstance(x, int), - ), - _Argument( - ["input"], "Input SAM/BAM/CRAM file", filename=True, is_required=True - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsMergeCommandline(AbstractCommandline): - """Command line wrapper for samtools merge. - - Merge multiple sorted alignments, equivalent to:: - - $ samtools merge [-nur1f] [-h inh.sam] [-R reg] - [...] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsMergeCommandline - >>> out_bam = "/path/to/out_bam" - >>> in_bam = ["/path/to/input_bam1", "/path/to/input_bam2"] - >>> merge_cmd = SamtoolsMergeCommandline(out_bam=out_bam, - ... input_bam=in_bam) - >>> print(merge_cmd) - samtools merge /path/to/out_bam /path/to/input_bam1 /path/to/input_bam2 - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("merge"), - _Switch( - ["-n", "n"], - """The input alignments are sorted by read names - rather than by chromosomal coordinates""", - ), - _Switch( - ["-r", "r"], - """Attach an RG tag to each alignment. 
- The tag value is inferred from file names""", - ), - _Switch(["-u", "u"], "Uncompressed BAM output"), - _Switch( - ["-1", "fast_bam"], - """Use zlib compression level 1 - to compress the output""", - ), - _Switch( - ["-f", "f"], - """Force to overwrite the - output file if present""", - ), - _Option( - ["-h", "h"], - """Use the lines of FILE as '@' - headers to be copied to out.bam""", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-R", "R"], - "Merge files in the specified region indicated by STR", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Argument( - ["output_bam", "out_bam", "out", "output"], - "Output BAM file", - filename=True, - is_required=True, - ), - _ArgumentList( - ["input_bam", "in_bam", "input", "bam"], - "Input BAM", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsIndexCommandline(AbstractCommandline): - """Command line wrapper for samtools index. - - Index sorted alignment for fast random access, equivalent to:: - - $ samtools index - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsIndexCommandline - >>> input = "/path/to/aln_bam" - >>> samtools_index_cmd = SamtoolsIndexCommandline(input_bam=input) - >>> print(samtools_index_cmd) - samtools index /path/to/aln_bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("index"), - _Argument(["input", "in_bam", "input_bam"], "BAM file to be indexed"), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsIdxstatsCommandline(AbstractCommandline): - """Command line wrapper for samtools idxstats. 
- - Retrieve and print stats in the index file, equivalent to:: - - $ samtools idxstats - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsIdxstatsCommandline - >>> input = "/path/to/aln_bam" - >>> samtools_idxstats_cmd = SamtoolsIdxstatsCommandline(input_bam=input) - >>> print(samtools_idxstats_cmd) - samtools idxstats /path/to/aln_bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("idxstats"), - _Argument(["input", "in_bam", "input_bam"], "BAM file to be indexed"), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsFaidxCommandline(AbstractCommandline): - """Command line wrapper for samtools faidx. - - Retrieve and print stats in the index file, equivalent to:: - - $ samtools faidx [region1 [...]] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsFaidxCommandline - >>> reference = "/path/to/reference.fasta" - >>> samtools_faidx_cmd = SamtoolsFaidxCommandline(reference=reference) - >>> print(samtools_faidx_cmd) - samtools faidx /path/to/reference.fasta - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("faidx"), - _Argument( - ["reference", "reference_fasta", "ref"], - "Reference FASTA to be indexed", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsFixmateCommandline(AbstractCommandline): - """Command line wrapper for samtools fixmate. 
- - Fill in mate coordinates, ISIZE and mate related - flags from a name-sorted alignment, equivalent to:: - - $ samtools fixmate - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsFixmateCommandline - >>> in_bam = "/path/to/in.nameSrt.bam" - >>> out_bam = "/path/to/out.bam" - >>> fixmate_cmd = SamtoolsFixmateCommandline(input_bam=in_bam, - ... out_bam=out_bam) - >>> print(fixmate_cmd) - samtools fixmate /path/to/in.nameSrt.bam /path/to/out.bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("fixmate"), - _Argument( - ["in_bam", "sorted_bam", "input_bam", "input", "input_file"], - "Name Sorted Alignment File ", - filename=True, - is_required=True, - ), - _Argument( - ["out_bam", "output_bam", "output", "output_file"], - "Output file", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsRmdupCommandline(AbstractCommandline): - """Command line wrapper for samtools rmdup. - - Remove potential PCR duplicates, equivalent to:: - - $ samtools rmdup [-sS] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsRmdupCommandline - >>> input_sorted_bam = "/path/to/input.srt.bam" - >>> out_bam = "/path/to/out.bam" - >>> rmdup_cmd = SamtoolsRmdupCommandline(input_bam=input_sorted_bam, - ... out_bam=out_bam) - >>> print(rmdup_cmd) - samtools rmdup /path/to/input.srt.bam /path/to/out.bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("rmdup"), - _Switch( - ["-s", "s"], - """Remove duplicates for single-end reads. 
- - By default, the command works for paired-end - reads only""", - ), - _Switch( - ["-S", "S"], - """Treat paired-end reads - as single-end reads""", - ), - _Argument( - ["in_bam", "sorted_bam", "input_bam", "input", "input_file"], - "Name Sorted Alignment File ", - filename=True, - is_required=True, - ), - _Argument( - ["out_bam", "output_bam", "output", "output_file"], - "Output file", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsCalmdCommandline(AbstractCommandline): - """Command line wrapper for samtools calmd. - - Generate the MD tag, equivalent to:: - - $ samtools calmd [-EeubSr] [-C capQcoef] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsCalmdCommandline - >>> input_bam = "/path/to/aln.bam" - >>> reference_fasta = "/path/to/reference.fasta" - >>> calmd_cmd = SamtoolsCalmdCommandline(input_bam=input_bam, - ... reference=reference_fasta) - >>> print(calmd_cmd) - samtools calmd /path/to/aln.bam /path/to/reference.fasta - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("calmd"), - _Switch( - ["-E", "E"], - """Extended BAQ calculation. - This option trades specificity for sensitivity, - though the effect is minor.""", - ), - _Switch( - ["-e", "e"], - """Convert the read base to = if it is - identical to the aligned reference base. 
- - Indel caller does not support the = bases - at the moment.""", - ), - _Switch(["-u", "u"], "Output uncompressed BAM"), - _Switch(["-b", "b"], "Output compressed BAM "), - _Switch(["-S", "S"], "The input is SAM with header lines "), - _Switch( - ["-r", "r"], - """Compute the BQ tag (without -A) - or cap base quality by BAQ (with -A).""", - ), - _Switch( - ["-A", "A"], - """When used jointly with -r this option overwrites - the original base quality""", - ), - _Option( - ["-C", "C"], - """Coefficient to cap mapping quality - of poorly mapped reads. - - See the pileup command for details.""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Argument( - ["input", "input_file", "in_bam", "infile", "input_bam"], - "Input BAM", - filename=True, - is_required=True, - ), - _Argument( - ["reference", "reference_fasta", "ref"], - "Reference FASTA to be indexed", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsTargetcutCommandline(AbstractCommandline): - """Command line wrapper for samtools targetcut. 
- - This command identifies target regions by examining the continuity - of read depth, computes haploid consensus sequences of targets - and outputs a SAM with each sequence corresponding to a target, - equivalent to:: - - $ samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] - [-1 em1] [-2 em2] [-f ref] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsTargetcutCommandline - >>> input_bam = "/path/to/aln.bam" - >>> samtools_targetcut_cmd = SamtoolsTargetcutCommandline(input_bam=input_bam) - >>> print(samtools_targetcut_cmd) - samtools targetcut /path/to/aln.bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("targetcut"), - _Option( - ["-Q", "Q"], - "Minimum Base Quality ", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-i", "i"], - "Insertion Penalty", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-f", "f"], - "Reference Filename", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-0", "em0"], - "em0", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-1", "em1"], - "em1", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Option( - ["-2", "em2"], - "em2", - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Argument( - ["input", "input_bam", "in_bam"], - "Input file", - filename=True, - is_required=True, - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class SamtoolsPhaseCommandline(AbstractCommandline): - """Command line wrapper for samtools phase. 
- - Call and phase heterozygous SNPs, equivalent to:: - - $ samtools phase [-AF] [-k len] [-b prefix] - [-q minLOD] [-Q minBaseQ] - - See http://samtools.sourceforge.net/samtools.shtml for more details - - Examples - -------- - >>> from Bio.Sequencing.Applications import SamtoolsPhaseCommandline - >>> input_bam = "/path/to/in.bam" - >>> samtools_phase_cmd = SamtoolsPhaseCommandline(input_bam=input_bam) - >>> print(samtools_phase_cmd) - samtools phase /path/to/in.bam - - """ - - def __init__(self, cmd="samtools", **kwargs): - """Initialize the class.""" - self.program_name = cmd - self.parameters = [ - _StaticArgument("phase"), - _Argument( - ["input", "input_bam", "in_bam"], - "Input file", - filename=True, - is_required=True, - ), - _Switch(["-A", "A"], "Drop reads with ambiguous phase"), - _Option( - ["-b", "b"], - "Prefix of BAM output", - filename=True, - equate=False, - checker_function=lambda x: isinstance(x, str), - ), - _Switch(["-F", "F"], "Do not attempt to fix chimeric reads"), - _Option( - ["-k", "k"], - "Maximum length for local phasing", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-q", "q"], - """Minimum Phred-scaled LOD to - call a heterozygote""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - _Option( - ["-Q", "Q"], - """Minimum base quality to be - used in het calling""", - equate=False, - checker_function=lambda x: isinstance(x, int), - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/Bio/motifs/applications/__init__.py b/Bio/motifs/applications/__init__.py deleted file mode 100644 index 083255470..000000000 --- a/Bio/motifs/applications/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2009 by Bartek Wilczynski. All rights reserved. -# Revisions copyright 2009 by Peter Cock. 
-# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. - -"""Motif command line tool wrappers (OBSOLETE). - -We have decided to remove this module in future, and instead recommend -building your command and invoking it via the subprocess module directly. -""" - -from ._xxmotif import XXmotifCommandline diff --git a/Bio/motifs/applications/_xxmotif.py b/Bio/motifs/applications/_xxmotif.py deleted file mode 100644 index b5b956d98..000000000 --- a/Bio/motifs/applications/_xxmotif.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2012 by Christian Brueffer. All rights reserved. -# -# This file is part of the Biopython distribution and governed by your -# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". -# Please see the LICENSE file that should have been included as part of this -# package. - -"""Command line wrapper for the motif finding program XXmotif.""" - -import os - -from Bio.Application import _Argument -from Bio.Application import _Option -from Bio.Application import _Switch -from Bio.Application import AbstractCommandline - - -class XXmotifCommandline(AbstractCommandline): - """Command line wrapper for XXmotif. - - http://xxmotif.genzentrum.lmu.de/ - - Notes - ----- - Last checked against version: 1.3 - - References - ---------- - Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive, - weight matriX-based motif discovery in nucleotide sequences, - Nucleic Acids Res. 40: W104-W109 (2012). - - Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value - based regulatory motif discovery using positional weight matrices, - Genome Res. 
23: 181–194 (2013) - - Examples - -------- - >>> from Bio.motifs.applications import XXmotifCommandline - >>> out_dir = "results" - >>> in_file = "sequences.fasta" - >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True) - >>> print(xxmotif_cline) - XXmotif results sequences.fasta --revcomp - - You would typically run the command line with xxmotif_cline() or via - the Python subprocess module, as described in the Biopython tutorial. - - """ - - def __init__(self, cmd="XXmotif", **kwargs): - """Initialize the class.""" - # order of parameters is the same as in XXmotif --help - _valid_alphabet = set("ACGTNX") - - self.parameters = [ - _Argument( - ["outdir", "OUTDIR"], - "output directory for all results", - filename=True, - is_required=True, - # XXmotif currently does not accept spaces in the outdir name - checker_function=lambda x: " " not in x, - ), - _Argument( - ["seqfile", "SEQFILE"], - "file name with sequences from positive set in FASTA format", - filename=True, - is_required=True, - # XXmotif currently only accepts a pure filename - checker_function=lambda x: os.path.split(x)[0] == "", - ), - # Options - _Option( - ["--negSet", "negSet", "NEGSET", "negset"], - "sequence set which has to be used as a reference set", - filename=True, - equate=False, - ), - _Switch( - ["--zoops", "ZOOPS", "zoops"], - "use zero-or-one occurrence per sequence model (DEFAULT)", - ), - _Switch( - ["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model" - ), - _Switch( - ["--oops", "OOPS", "oops"], "use one occurrence per sequence model" - ), - _Switch( - ["--revcomp", "REVCOMP", "revcomp"], - "search in reverse complement of sequences as well (DEFAULT: NO)", - ), - _Option( - [ - "--background-model-order", - "background-model-order", - "BACKGROUND-MODEL-ORDER", - "background_model_order", - ], - "order of background distribution (DEFAULT: 2, 8(--negset) )", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - 
_Option( - ["--pseudo", "PSEUDO", "pseudo"], - "percentage of pseudocounts used (DEFAULT: 10)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["-g", "--gaps", "GAPS", "gaps"], - "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)", - checker_function=lambda x: x in [0 - 3], - equate=False, - ), - _Option( - ["--type", "TYPE", "type"], - "defines what kind of start seeds are used (DEFAULT: ALL)" - "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM", - checker_function=lambda x: x - in [ - "ALL", - "all", - "FIVEMERS", - "fivemers", - "PALINDROME", - "palindrome", - "TANDEM", - "tandem", - "NOPALINDROME", - "nopalindrome", - "NOTANDEM", - "notandem", - ], - equate=False, - ), - _Option( - [ - "--merge-motif-threshold", - "merge-motif-threshold", - "MERGE-MOTIF-THRESHOLD", - "merge_motif_threshold", - ], - "defines the similarity threshold for merging motifs (DEFAULT: HIGH)" - "possible modes: LOW, MEDIUM, HIGH", - checker_function=lambda x: x - in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"], - equate=False, - ), - _Switch( - [ - "--no-pwm-length-optimization", - "no-pwm-length-optimization", - "NO-PWM-LENGTH-OPTIMIZATION", - "no_pwm_length_optimization", - ], - "do not optimize length during iterations (runtime advantages)", - ), - _Option( - [ - "--max-match-positions", - "max-match-positions", - "MAX-MATCH-POSITIONS", - "max_match_positions", - ], - "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Switch( - ["--batch", "BATCH", "batch"], - "suppress progress bars (reduce output size for batch jobs)", - ), - _Option( - ["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"], - "maximum number of sequences from the positive set used [DEFAULT: all]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # does not make sense in biopython 
- # _Switch(["--help", "help", "HELP"], - # "print this help page"), - _Option( - ["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"], - "inspect extensions and refinement of a given seed (DEFAULT: not used)", - checker_function=lambda x: any((c in _valid_alphabet) for c in x), - equate=False, - ), - # Using conservation information - _Option( - ["--format", "FORMAT", "format"], - "defines what kind of format the input sequences have (DEFAULT: FASTA)", - checker_function=lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"], - equate=False, - ), - _Option( - [ - "--maxMultipleSequences", - "maxMultipleSequences", - "MAXMULTIPLESEQUENCES", - "maxmultiplesequences", - ], - "maximum number of sequences used in an alignment [DEFAULT: all]", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Using localization information - _Switch( - ["--localization", "LOCALIZATION", "localization"], - "use localization information to calculate combined P-values" - "(sequences should have all the same length)", - ), - _Option( - ["--downstream", "DOWNSTREAM", "downstream"], - "number of residues in positive set downstream of anchor point (DEFAULT: 0)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # Start with self defined motif - _Option( - ["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"], - "Start motif (IUPAC characters)", - checker_function=lambda x: any((c in _valid_alphabet) for c in x), - equate=False, - ), - _Option( - ["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"], - "profile file", - filename=True, - equate=False, - ), - _Option( - ["--startRegion", "startRegion", "STARTREGION", "startregion"], - "expected start position for motif occurrences relative to anchor point (--localization)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - _Option( - ["--endRegion", "endRegion", "ENDREGION", "endregion"], - "expected end position for motif occurrences 
relative to anchor point (--localization)", - checker_function=lambda x: isinstance(x, int), - equate=False, - ), - # XXmotif wrapper options - _Switch( - ["--XXmasker", "masker"], - "mask the input sequences for homology, repeats and low complexity regions", - ), - _Switch( - ["--XXmasker-pos", "maskerpos"], - "mask only the positive set for homology, repeats and low complexity regions", - ), - _Switch( - ["--no-graphics", "nographics"], "run XXmotif without graphical output" - ), - ] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -if __name__ == "__main__": - from Bio._utils import run_doctest - - run_doctest() diff --git a/DEPRECATED.rst b/DEPRECATED.rst index c556c8495..3aad8eb9e 100644 --- a/DEPRECATED.rst +++ b/DEPRECATED.rst @@ -180,8 +180,9 @@ Bio.Data.PDBData instead. Bio.Application and the command line wrappers using it ------------------------------------------------------ -Declared obsolete in release 1.79, and deprecated in release 1.82. Please use -the standard library subprocess module directly instead. +Declared obsolete in release 1.79, deprecated in release 1.82, and removed +in release 1.85. Please use the standard library subprocess module directly +instead. Bio.Index --------- @@ -495,7 +496,8 @@ NCBI "legacy" BLAST tool wrappers FastacmdCommandline, BlastallCommandline, BlastpgpCommandline and RpsBlastCommandline were declared obsolete in Release 1.53, deprecated in Release 1.61, and removed in Release 1.64, having been replaced with wrappers for the new NCBI BLAST+ tools (e.g. -NcbiblastpCommandline and NcbipsiblastCommandline). +NcbiblastpCommandline and NcbipsiblastCommandline). This module was removed +in release 1.85 as it relied on Bio.Application, which was being removed. 
Bio.Blast.ParseBlastTable ------------------------- diff --git a/Tests/test_Application.py b/Tests/test_Application.py deleted file mode 100644 index fcddb7b57..000000000 --- a/Tests/test_Application.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2013 by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Bio.Application related tests for command line application wrappers. - -This is intended to check generic things like argument parsing, and -stdin/stdout/stderr handling. -""" - -import os -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import _Argument - from Bio.Application import AbstractCommandline - - -class EchoApp(AbstractCommandline): - """Minimal command line wrapper for echo command.""" - - def __init__(self, cmd="echo", **kwargs): - """Initialize wrapper for echo command.""" - self.parameters = [_Argument(["text"], "Text to echo")] - AbstractCommandline.__init__(self, cmd, **kwargs) - - -class TestApp(unittest.TestCase): - def test_echo(self): - cline = EchoApp(text="Hello World") - stdout, stderr = cline() - self.assertEqual(stderr, "") - self.assertEqual(stdout, "Hello World\n") - - def test_echo_capture_both(self): - cline = EchoApp(text="Hello World") - stdout, stderr = cline(stdout=True, stderr=True) - self.assertEqual(stderr, "") - self.assertEqual(stdout, "Hello World\n") - - def test_echo_capture_stdout(self): - cline = EchoApp(text="Hello World") - stdout, stderr = cline(stdout=True, stderr=False) - self.assertIsNone(stderr) - self.assertEqual(stdout, "Hello World\n") - - def test_echo_capture_stderr(self): - cline = EchoApp(text="Hello World") - stdout, stderr = cline(stdout=False, stderr=True) - self.assertEqual(stderr, "") - 
self.assertIsNone(stdout) - - def test_echo_capture_neither(self): - cline = EchoApp(text="Hello World") - stdout, stderr = cline(stdout=False, stderr=False) - self.assertIsNone(stderr) - self.assertIsNone(stdout) - - def test_echo_file_stdout(self): - cline = EchoApp(text="Hello World") - tmp = "echo_stdout.tmp" - if os.path.isfile(tmp): - os.remove(tmp) - stdout, stderr = cline(stdout=tmp) - self.assertEqual(stderr, "") - self.assertIsNone(stdout) - self.assertTrue(os.path.isfile(tmp)) - with open(tmp) as handle: - contents = handle.read() - self.assertEqual(contents, "Hello World\n") - os.remove(tmp) - - def test_echo_file_stderr(self): - cline = EchoApp(text="Hello World") - tmp = "echo_stderr.tmp" - if os.path.isfile(tmp): - os.remove(tmp) - stdout, stderr = cline(stderr=tmp) - self.assertIsNone(stderr) - self.assertEqual(stdout, "Hello World\n") - self.assertTrue(os.path.isfile(tmp)) - with open(tmp) as handle: - contents = handle.read() - self.assertEqual(contents, "") - os.remove(tmp) - - def test_echo_file_same(self): - cline = EchoApp(text="Hello World") - tmp = "echo_stdout_stderr.tmp" - if os.path.isfile(tmp): - os.remove(tmp) - stdout, stderr = cline(stdout=tmp, stderr=tmp) - self.assertIsNone(stderr) - self.assertIsNone(stdout) - self.assertTrue(os.path.isfile(tmp)) - with open(tmp) as handle: - contents = handle.read() - self.assertEqual(contents, "Hello World\n") # stdout + stderr - os.remove(tmp) - - def test_echo_file_both(self): - cline = EchoApp(text="Hello World") - tmp = "echo_stdout.tmp" - if os.path.isfile(tmp): - os.remove(tmp) - tmp2 = "echo_stderr.tmp" - if os.path.isfile(tmp2): - os.remove(tmp2) - stdout, stderr = cline(stdout=tmp, stderr=tmp2) - self.assertIsNone(stderr) - self.assertIsNone(stdout) - self.assertTrue(os.path.isfile(tmp), tmp) - with open(tmp) as handle: - contents = handle.read() - self.assertEqual(contents, "Hello World\n") # stdout - os.remove(tmp) - self.assertTrue(os.path.isfile(tmp2), tmp2) - with open(tmp2) as 
handle: - contents = handle.read() - self.assertEqual(contents, "") # stderr - os.remove(tmp2) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_BWA_tool.py b/Tests/test_BWA_tool.py deleted file mode 100644 index c7ac74c36..000000000 --- a/Tests/test_BWA_tool.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright 2013 by Saket Choudhary. Based on test_Clustalw_tool.py by Peter -# Cock . -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Tests for calling BWA.""" - -import os -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - # TODO from Bio.Sequencing.Applications import BwaBwaswCommandline - from Bio.Sequencing.Applications import BwaAlignCommandline - from Bio.Sequencing.Applications import BwaIndexCommandline - from Bio.Sequencing.Applications import BwaMemCommandline - from Bio.Sequencing.Applications import BwaSampeCommandline - from Bio.Sequencing.Applications import BwaSamseCommandline - - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -bwa_exe = None -if sys.platform == "win32": - # TODO - Check the path? - try: - # This can vary depending on the Windows language. 
- prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files" - likely_dirs = ["bwa", "bwa-0.6.2", ""] - likely_exes = ["bwa"] - for folder in likely_dirs: - if os.path.isdir(os.path.join(prog_files, folder)): - for filename in likely_exes: - if os.path.isfile(os.path.join(prog_files, folder, filename)): - bwa_exe = os.path.join(prog_files, folder, filename) - break - if bwa_exe: - break -else: - from subprocess import getoutput - - output = getoutput("bwa") - - # Since "not found" may be in another language, try and be sure this is - # really the bwa tool's output - bwa_found = False - if "not found" not in output and "not recognized" not in output: - if "bwa" in output and "alignment via Burrows-Wheeler transformation" in output: - bwa_exe = "bwa" - skip_aln_tests = False - aln_output = getoutput("bwa aln") - if "unrecognized" in aln_output: - skip_aln_tests = True - print("'bwa aln' is unrecognized, skipping aln/samse/sampe tests") - -if not bwa_exe: - raise MissingExternalDependencyError( - "Install bwa and correctly set" - " the file path to the program if" - " you want to use it from Biopython" - ) - - -class BwaTestCase(unittest.TestCase): - """Class for implementing BWA test cases.""" - - def setUp(self): - self.reference_file = "BWA/human_g1k_v37_truncated.fasta" - self.reference_extensions = ["amb", "ann", "bwt", "pac", "sa"] - self.infile1 = "BWA/HNSCC1_1_truncated.fastq" - self.infile2 = "BWA/HNSCC1_2_truncated.fastq" - self.saifile1 = "BWA/1.sai" - self.saifile2 = "BWA/2.sai" - self.samfile1 = "BWA/1.sam" - self.samfile2 = "BWA/2.sam" - self.samfile = "BWA/out.sam" - self.files_to_clean = [ - self.saifile1, - self.saifile2, - self.samfile1, - self.samfile2, - self.samfile, - ] - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - for extension in self.reference_extensions: - index_file = self.reference_file + "." 
+ extension - if os.path.exists(index_file): - os.remove(index_file) - - def test_index(self): - """Test for creating index files for the reference genome fasta file.""" - cmdline = BwaIndexCommandline(bwa_exe) - cmdline.set_parameter("infile", self.reference_file) - cmdline.set_parameter("algorithm", "bwtsw") - stdout, stderr = cmdline() - for extension in self.reference_extensions: - index_file = self.reference_file + "." + extension - self.assertTrue( - os.path.exists(index_file), f"Index File {index_file} not found" - ) - self.assertIn( - "Finished constructing BWT", - str(stdout) + str(stderr), - f"FASTA indexing failed:\n{cmdline}\nStdout:{stdout}\nStderr:{stderr}\n", - ) - - def do_aln(self, in_file, out_file): - """Test for generating sai files given the reference and read file.""" - cmdline = BwaAlignCommandline(bwa_exe) - cmdline.set_parameter("reference", self.reference_file) - cmdline.read_file = in_file - self.assertTrue(os.path.isfile(in_file)) - stdout, stderr = cmdline(stdout=out_file) - self.assertNotIn( - "fail to locate the index", - str(stderr) + str(stdout), - "Error aligning sequence to reference:\n%s\nStdout:%s\nStderr:%s\n" - % (cmdline, stdout, stderr), - ) - - def create_fasta_index(self): - """Test for generating index for fasta file. - - BWA requires an indexed fasta for each alignment operation. - This should be called to create an index before any alignment - operation. 
- """ - cmdline = BwaIndexCommandline(bwa_exe) - cmdline.set_parameter("infile", self.reference_file) - cmdline.set_parameter("algorithm", "bwtsw") - stdout, stderr = cmdline() - - if not skip_aln_tests: - - def test_samse(self): - """Test for single end sequencing.""" - self.create_fasta_index() - self.do_aln(self.infile1, self.saifile1) - cmdline = BwaSamseCommandline(bwa_exe) - cmdline.set_parameter("reference", self.reference_file) - cmdline.set_parameter("read_file", self.infile1) - cmdline.set_parameter("sai_file", self.saifile1) - stdout, stderr = cmdline(stdout=self.samfile1) - - with open(self.samfile1) as handle: - headline = handle.readline() - self.assertTrue( - headline.startswith("@SQ"), - f"Error generating sam files:\n{cmdline}\nOutput starts:{headline}", - ) - - def test_sampe(self): - """Test for generating samfile by paired end sequencing.""" - self.create_fasta_index() - - # Generate sai files from paired end data - self.do_aln(self.infile1, self.saifile1) - self.do_aln(self.infile2, self.saifile2) - - cmdline = BwaSampeCommandline(bwa_exe) - cmdline.set_parameter("reference", self.reference_file) - cmdline.set_parameter("sai_file1", self.saifile1) - cmdline.set_parameter("sai_file2", self.saifile2) - cmdline.set_parameter("read_file1", self.infile1) - cmdline.set_parameter("read_file2", self.infile2) - stdout, stderr = cmdline(stdout=self.samfile) - - with open(self.samfile) as handle: - headline = handle.readline() - self.assertTrue( - headline.startswith("@SQ"), - f"Error generating sam files:\n{cmdline}\nOutput starts:{headline}", - ) - - def test_mem(self): - """Test for generating samfile by paired end sequencing using BWA-MEM.""" - self.create_fasta_index() - - cmdline = BwaMemCommandline(bwa_exe) - cmdline.set_parameter("reference", self.reference_file) - cmdline.set_parameter("read_file1", self.infile1) - cmdline.set_parameter("read_file2", self.infile2) - stdout, stderr = cmdline(stdout=self.samfile) - - with open(self.samfile) as 
handle: - headline = handle.readline() - self.assertTrue( - headline.startswith("@SQ"), - f"Error generating sam files:\n{cmdline}\nOutput starts:{headline}", - ) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_ClustalOmega_tool.py b/Tests/test_ClustalOmega_tool.py deleted file mode 100644 index 2a3245014..000000000 --- a/Tests/test_ClustalOmega_tool.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright 2008-2011 by Peter Cock. All rights reserved. -# Revisions copyright 2012 by Christian Brueffer. All rights reserved. -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Tests for ClustalOmega tool.""" - -import os -import unittest -import warnings -from subprocess import getoutput - -from Bio import Align -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import ClustalOmegaCommandline - from Bio.Application import ApplicationError - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -clustalo_exe = None -try: - output = getoutput("clustalo --help") - if output.startswith("Clustal Omega"): - clustalo_exe = "clustalo" -except FileNotFoundError: - pass - -if not clustalo_exe: - raise MissingExternalDependencyError( - "Install clustalo if you want to use Clustal Omega from Biopython." 
- ) - - -class ClustalOmegaTestCase(unittest.TestCase): - def setUp(self): - self.files_to_clean = set() - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - - def standard_test_procedure(self, cline): - """Shared test procedure used by all tests.""" - # Overwrite existing files. - cline.force = True - - # Mark output files for later cleanup. - self.add_file_to_clean(cline.outfile) - if cline.guidetree_out: - self.add_file_to_clean(cline.guidetree_out) - - input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta")) - self.assertEqual(str(eval(repr(cline))), str(cline)) - output, error = cline() - self.assertTrue(not output or output.strip().startswith("CLUSTAL")) - - # Test if ClustalOmega executed successfully. - self.assertTrue( - error.strip() == "" - or error.startswith( - ( - "WARNING: Sequence type is DNA.", - "WARNING: DNA alignment is still experimental.", - ) - ) - ) - - # TODO - Try and parse this with Bio.Nexus? 
- if cline.guidetree_out: - self.assertTrue(os.path.isfile(cline.guidetree_out)) - - def add_file_to_clean(self, filename): - """Add a file for deferred removal by the tearDown routine.""" - self.files_to_clean.add(filename) - - -################################################################# - - -class ClustalOmegaTestErrorConditions(ClustalOmegaTestCase): - def test_empty_file(self): - """Test an empty file.""" - input_file = "does_not_exist.fasta" - self.assertFalse(os.path.isfile(input_file)) - cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file) - try: - stdout, stderr = cline() - except ApplicationError as err: - message = str(err) - self.assertTrue( - "Cannot open sequence file" in message - or "Cannot open input file" in message - or "Non-zero return code" in message, - message, - ) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_single_sequence(self): - """Test an input file containing a single sequence.""" - input_file = "Fasta/f001" - self.assertTrue(os.path.isfile(input_file)) - self.assertEqual(len(list(SeqIO.parse(input_file, "fasta"))), 1) - cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file) - try: - stdout, stderr = cline() - except ApplicationError as err: - self.assertIn("contains 1 sequence, nothing to align", str(err)) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_invalid_format(self): - """Test an input file in an invalid format.""" - input_file = "Medline/pubmed_result1.txt" - self.assertTrue(os.path.isfile(input_file)) - cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file) - with self.assertRaises(ApplicationError) as cm: - stdout, stderr = cline() - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - err = str(cm.exception) - # Ideally we'd catch the return code and raise the specific - # error for "invalid format". 
- self.assertIn("Can't determine format of sequence file", err) - - -################################################################# - - -class ClustalOmegaTestNormalConditions(ClustalOmegaTestCase): - def test_simple_fasta(self): - """Test a simple fasta file.""" - input_file = "Registry/seqs.fasta" - output_file = "temp_test.aln" - - cline = ClustalOmegaCommandline( - clustalo_exe, infile=input_file, outfile=output_file, outfmt="clustal" - ) - - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - self.assertEqual( - str(alignment), - """\ -gi|134891 0 GATCCCTACCCTTNCCGTTGGTCTCTNTCGCTGACTCGAGGCACCTAACATCCATTCACA - 0 ---------..-........|......|....|......|..............|.---- -gi|129628 0 ---------MP-VVVVASSKGGAGKSTTAVVLGTELAHKGVPVTMLDCDPNRSLTI---- - -gi|134891 60 CCCAACACAGGCCAGCGACTTCTGGGGCTCAGCCACAGACATGGTTTGTNACTNTTGAGC - 60 -----.|.||.......|....|-------------------......|.......||.. -gi|129628 46 -----WANAGEVPENITALSDVT-------------------ESSIVKTIKQHDVDGAVV - -gi|134891 120 TTCTGTTCCTAGAGAATCCTAGAGGCTTGATTGGCCCAGGCTGCTGTNTGTNCTGGAGG- - 120 ...--------..|.|......|..............|...|..............|..- -gi|129628 82 IVD--------LEGVASRMVSRAISQADLVLIPMRPKALDATIGAQSLQLIAEEEEAIDR - -gi|134891 179 -CAAAGAATCCCTACCTCCTAGGGGTGAAAGGAAATNAAAATGGAAAGTTCTTGTAGCGC - 180 -.|.|...|....|.......|........|------------...........||.... 
-gi|129628 134 KIAHAVVFTMVSPAIRSHEYTGIKASLIENG------------VEIIEPPLVERTAYSAL - -gi|134891 238 AAGGCCTGACATGGGTAGCTGCTCAATAAATGCTAGTNTGTTATTTC 285 - 240 ...|..........|..........|.|-----.|.....|.|..-- 287 -gi|129628 182 FQFGGNLHSMKSKQGNMAAAIENAEAFA-----MAIFKKLTEALR-- 222 -""", - ) - self.assertEqual( - alignment.column_annotations["clustal_consensus"], - " * * * * * * ** * * * ** * * * * * * * * * * * * ** * * * * * * * ", - ) - - def test_properties(self): - """Test setting options via properties.""" - input_file = "Registry/seqs.fasta" - output_file = "temp_test.aln" - - cline = ClustalOmegaCommandline(clustalo_exe) - cline.infile = input_file - cline.outfile = output_file - cline.outfmt = "clustal" - - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - self.assertEqual( - str(alignment), - """\ -gi|134891 0 GATCCCTACCCTTNCCGTTGGTCTCTNTCGCTGACTCGAGGCACCTAACATCCATTCACA - 0 ---------..-........|......|....|......|..............|.---- -gi|129628 0 ---------MP-VVVVASSKGGAGKSTTAVVLGTELAHKGVPVTMLDCDPNRSLTI---- - -gi|134891 60 CCCAACACAGGCCAGCGACTTCTGGGGCTCAGCCACAGACATGGTTTGTNACTNTTGAGC - 60 -----.|.||.......|....|-------------------......|.......||.. -gi|129628 46 -----WANAGEVPENITALSDVT-------------------ESSIVKTIKQHDVDGAVV - -gi|134891 120 TTCTGTTCCTAGAGAATCCTAGAGGCTTGATTGGCCCAGGCTGCTGTNTGTNCTGGAGG- - 120 ...--------..|.|......|..............|...|..............|..- -gi|129628 82 IVD--------LEGVASRMVSRAISQADLVLIPMRPKALDATIGAQSLQLIAEEEEAIDR - -gi|134891 179 -CAAAGAATCCCTACCTCCTAGGGGTGAAAGGAAATNAAAATGGAAAGTTCTTGTAGCGC - 180 -.|.|...|....|.......|........|------------...........||.... 
-gi|129628 134 KIAHAVVFTMVSPAIRSHEYTGIKASLIENG------------VEIIEPPLVERTAYSAL - -gi|134891 238 AAGGCCTGACATGGGTAGCTGCTCAATAAATGCTAGTNTGTTATTTC 285 - 240 ...|..........|..........|.|-----.|.....|.|..-- 287 -gi|129628 182 FQFGGNLHSMKSKQGNMAAAIENAEAFA-----MAIFKKLTEALR-- 222 -""", - ) - self.assertEqual( - alignment.column_annotations["clustal_consensus"], - " * * * * * * ** * * * ** * * * * * * * * * * * * ** * * * * * * * ", - ) - - def test_input_filename_with_space(self): - """Test an input filename containing a space.""" - input_file = "Clustalw/temp horses.fasta" - with open(input_file, "w") as handle: - SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta") - output_file = "temp_test.aln" - - cline = ClustalOmegaCommandline( - clustalo_exe, infile=input_file, outfile=output_file, outfmt="clustal" - ) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - self.assertEqual( - str(alignment), - """\ -A 0 -CACACACAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40 -B 0 -CACACAACAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40 -C 0 -CACAACAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40 -D 0 -CAACAAAACAAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40 -E 0 -CAACAAAAACAAAAAAAACAAAAAAAAAAAAAAAAAAAAA 40 -F 0 ACAAAAAAAACACACAAAACAAAAAAAAAAAAAAAAAAAA- 40 -G 0 ACAAAAAAAACACAACAAACAAAAAAAAAAAAAAAAAAAA- 40 -H 0 ACAAAAAAAACAACAAAAACAAAAAAAAAAAAAAAAAAAA- 40 -I 0 ACAAAAAAAAACAAAACAACAAAAAAAAAAAAAAAAAAAA- 40 -J 0 ACAAAAAAAAACAAAAACACAAAAAAAAAAAAAAAAAAAA- 40 -""", - ) - self.assertEqual( - alignment.column_annotations["clustal_consensus"], - " ** ********************** ", - ) - - def test_output_filename_with_spaces(self): - """Test an output filename containing spaces.""" - input_file = "Registry/seqs.fasta" - output_file = "temp with spaces.aln" - - cline = ClustalOmegaCommandline( - clustalo_exe, infile=input_file, outfile=output_file, outfmt="clustal" - ) - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, 
"clustal") - self.assertEqual( - str(alignment), - """\ -gi|134891 0 GATCCCTACCCTTNCCGTTGGTCTCTNTCGCTGACTCGAGGCACCTAACATCCATTCACA - 0 ---------..-........|......|....|......|..............|.---- -gi|129628 0 ---------MP-VVVVASSKGGAGKSTTAVVLGTELAHKGVPVTMLDCDPNRSLTI---- - -gi|134891 60 CCCAACACAGGCCAGCGACTTCTGGGGCTCAGCCACAGACATGGTTTGTNACTNTTGAGC - 60 -----.|.||.......|....|-------------------......|.......||.. -gi|129628 46 -----WANAGEVPENITALSDVT-------------------ESSIVKTIKQHDVDGAVV - -gi|134891 120 TTCTGTTCCTAGAGAATCCTAGAGGCTTGATTGGCCCAGGCTGCTGTNTGTNCTGGAGG- - 120 ...--------..|.|......|..............|...|..............|..- -gi|129628 82 IVD--------LEGVASRMVSRAISQADLVLIPMRPKALDATIGAQSLQLIAEEEEAIDR - -gi|134891 179 -CAAAGAATCCCTACCTCCTAGGGGTGAAAGGAAATNAAAATGGAAAGTTCTTGTAGCGC - 180 -.|.|...|....|.......|........|------------...........||.... -gi|129628 134 KIAHAVVFTMVSPAIRSHEYTGIKASLIENG------------VEIIEPPLVERTAYSAL - -gi|134891 238 AAGGCCTGACATGGGTAGCTGCTCAATAAATGCTAGTNTGTTATTTC 285 - 240 ...|..........|..........|.|-----.|.....|.|..-- 287 -gi|129628 182 FQFGGNLHSMKSKQGNMAAAIENAEAFA-----MAIFKKLTEALR-- 222 -""", - ) - self.assertEqual( - alignment.column_annotations["clustal_consensus"], - " * * * * * * ** * * * ** * * * * * * * * * * * * ** * * * * * * * ", - ) - - def test_large_fasta_file(self): - """Test a large fasta input file.""" - # Create a large input file by converting another example file - # (See Bug 2804, this will produce so much output on stdout that - # subprocess could suffer a deadlock and hang). Using all the - # records should show the deadlock but is very slow - just thirty - # seems to lockup on Mac OS X, even 20 on Linux (without the fix). 
- input_file = "temp_cw_prot.fasta" - records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40] - with open(input_file, "w") as handle: - SeqIO.write(records, handle, "fasta") - del handle, records - output_file = "temp_cw_prot.aln" - - cline = ClustalOmegaCommandline( - clustalo_exe, infile=input_file, outfile=output_file, outfmt="clustal" - ) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - - def test_newtree_files(self): - """Test requesting a guide tree.""" - input_file = "Fasta/f002" - output_file = "temp_test.aln" - newtree_file = "temp_test.dnd" - alignment_text = """\ -gi|134891 0 CGGACCAGACGGACACAGGGAGAAGCTAGTTTCTTTCATGTGATTGANATNATGACTCTA -gi|134891 0 ---------CGGAGCCAGCGAGCATAT--------------------------------- -gi|159293 0 ------------------------------------------------------------ - -gi|134891 60 CTCCTAAAAGGGAAAAANCAATATCCTTGTTTACAGAAGAGAAACAAACAAGCCCCACTC -gi|134891 18 ----------------------------------------------------GCTGCATG -gi|159293 0 --------------------------------------------GATCAAATCTGCACTG - -gi|134891 120 AGCTCAGTCACAGGAGAGANCACAGAAAGTCTTAGGATCATGANCTCTGAA-AAAAAGAG -gi|134891 26 -------------------------AGGACCTTTCTATCTTACATTATGGC-TGGGAATC -gi|159293 16 TGTCTACATATAGGAAAGGTCCTGGTGTGTGCTAATGTTCCCAATGCAGGACTTGAGGAA - -gi|134891 179 AAACCTTATCTTTNCTTTGTGGTTCCTTTAAACACACTCACACACACTTGGTCAGAGATG -gi|134891 60 TTACTCTTTCATCTG-------ATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATC -gi|159293 76 GAGCTCTGTTATATGTTTCCATTTCTCTTTATCAAAGATAACCAAACCTTATGGCCCTT- - -gi|134891 239 CTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGC----------AGAGGGAC -gi|134891 113 CTGGTTTTACAGATGTGAAACTT----TCAAGAGATTTACTGACTTTCCTAGAATA---- -gi|159293 135 ---ATAACAATGGAGGCACTGGCTGCCTCTTAATTTTCAATCATGGACCTAAAGAAGTAC - -gi|134891 289 GTTTGA--GTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTT -gi|134891 165 --------GT--------------TTCTCTACTGGAAACCTGATGCTTTTATAAGCCATT -gi|159293 192 
TCTGAAGGGTCTCAACAATGCCAGGTGGGGACAGATATACTCAGAGATTATCCAGGTCTG - -gi|134891 347 ATGTNAAGCNTGACAACACCAGGCAGGTATGAGAGGA-AAGCAAGGCCCGTCCATNGCTG -gi|134891 203 GTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTT------C -gi|159293 252 CCTCCCAGCGAGCC-----------TGGA------GT-ACACCAGACCCTCCTAGAGAAA - -gi|134891 406 TCCGTACNCTTACGGNTTGCTTGTNGGAGNCATTTNGGTATTGTTTGTTGTAANANCCAA -gi|134891 257 TCCTAGGTAATGAGTAGTGCTGTTCATATTACTNT-------AAGTTCTATAGCATACTT -gi|159293 294 TCTGTT------------------------------------ATAATTTACCACCCACTT - -gi|134891 466 AANGGGCTTTGGNNTGGNAAAA----GGGCAGANNGGGGGGGTTGGTGTNGTTTTTTGG- -gi|134891 310 GCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTT-----TGCCCTT -gi|159293 318 ATCCACCTTTAAACTTGGGGAA----GGNNGCN------TTTCAAATTAAATTTAATCNT - -gi|134891 521 GGGGANNNTTTNGATTTGG-------TNCCGGGNTTTNGTTTNCCNCGGNACCGGNTTTT -gi|134891 365 TTG-GGTTTNTTNTTGGTAA--ANNNTTCCCGGGTGGGGGNGGTNNNGAAA--------- -gi|159293 368 NGGGGGNTTTTAAACTTTAACCCTTTTNCCNTTNTNGGGGTNGGNANTTGNCCCCNTTAA - -gi|134891 574 GGTTGGGGNCCATTTNTGNGGGGCNTTGGNGTTNCNTTNCCCNNNTNNGANTGGTTTNA -gi|134891 413 ----------------------------------------------------------- -gi|159293 428 AGGGGGNNCCCCT-NCNNGGGGGAATAA-AACAA----------NTTNNTTT--TTT-- - -gi|134891 633 -gi|134891 413 -gi|159293 471 -""" - clustal_consensus = " * * * * * * * * * ** ** * * * * * * * * * * * ** * * * * * * * ** * * * * ** * * ** * * **** * * * * * * * * * ** * * " - - cline = ClustalOmegaCommandline( - clustalo_exe, - infile=input_file, - outfile=output_file, - guidetree_out=newtree_file, - outfmt="clustal", - ) - - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - self.assertEqual(str(alignment), alignment_text) - self.assertEqual( - alignment.column_annotations["clustal_consensus"], clustal_consensus - ) - - cline.guidetree_out = "temp with space.dnd" - self.standard_test_procedure(cline) - alignment = Align.read(cline.outfile, "clustal") - self.assertEqual(str(alignment), alignment_text) - self.assertEqual( 
- alignment.column_annotations["clustal_consensus"], clustal_consensus - ) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Clustalw_tool.py b/Tests/test_Clustalw_tool.py deleted file mode 100644 index 62343183e..000000000 --- a/Tests/test_Clustalw_tool.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright 2008-2011 by Peter Cock. All rights reserved. -# Revisions copyright 2012 by Christian Brueffer. All rights reserved. -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -# TODO - Clean up the extra files created by clustalw? e.g. *.dnd -# and *.aln where we have not requested an explicit name? -"""Tests for Clustalw tool.""" - -import os -import sys -import unittest -import warnings - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import ClustalwCommandline - from Bio.Application import ApplicationError - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -clustalw_exe = None -if sys.platform == "win32": - # TODO - Check the path? - try: - # This can vary depending on the Windows language. - prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files" - - # Note that EBI's clustalw2 installer, e.g. clustalw-2.0.10-win.msi - # uses C:\Program Files\ClustalW2\clustalw2.exe so we should check - # for that. 
- # - # Some users doing a manual install have reported using - # C:\Program Files\clustalw.exe - # - # Older installers might use something like this, - # C:\Program Files\Clustalw\clustalw.exe - # - # One particular case is www.tc.cornell.edu currently provide a - # clustalw1.83 installer which uses the following long location: - # C:\Program Files\CTCBioApps\clustalw\v1.83\clustalw1.83.exe - likely_dirs = [ - "ClustalW2", - "", - "Clustal", - "Clustalw", - "Clustalw183", - "Clustalw1.83", - r"CTCBioApps\clustalw\v1.83", - ] - likely_exes = ["clustalw2.exe", "clustalw.exe", "clustalw1.83.exe"] - for folder in likely_dirs: - if os.path.isdir(os.path.join(prog_files, folder)): - for filename in likely_exes: - if os.path.isfile(os.path.join(prog_files, folder, filename)): - clustalw_exe = os.path.join(prog_files, folder, filename) - break - if clustalw_exe: - break -else: - from subprocess import getoutput - - # Note that clustalw 1.83 and clustalw 2.1 don't obey the --version - # command, but this does cause them to quit cleanly. Otherwise they prompt - # the user for input (causing a lock up). - output = getoutput("clustalw2 --version") - # Since "not found" may be in another language, try and be sure this is - # really the clustalw tool's output - if "not found" not in output and "not recognized" not in output: - if "CLUSTAL" in output and "Multiple Sequence Alignments" in output: - clustalw_exe = "clustalw2" - if not clustalw_exe: - output = getoutput("clustalw --version") - if "not found" not in output and "not recognized" not in output: - if "CLUSTAL" in output and "Multiple Sequence Alignments" in output: - clustalw_exe = "clustalw" - -if not clustalw_exe: - raise MissingExternalDependencyError( - "Install clustalw or clustalw2 if you want to use it from Biopython." 
- ) - - -class ClustalWTestCase(unittest.TestCase): - """Class implementing common functions for ClustalW tests.""" - - def setUp(self): - self.files_to_clean = set() - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - - def standard_test_procedure(self, cline): - """Shared test procedure used by all tests.""" - self.assertEqual(str(eval(repr(cline))), str(cline)) - input_records = SeqIO.to_dict( - SeqIO.parse(cline.infile, "fasta"), lambda rec: rec.id.replace(":", "_") - ) # noqa: E731 - - # Determine name of tree file - if cline.newtree: - tree_file = cline.newtree - else: - # Clustalw will name it based on the input file - tree_file = os.path.splitext(cline.infile)[0] + ".dnd" - - # Mark generated files for later removal - self.add_file_to_clean(cline.outfile) - self.add_file_to_clean(tree_file) - - output, error = cline() - self.assertTrue(output.strip().startswith("CLUSTAL")) - self.assertEqual(error.strip(), "") - - # Check the output... - align = AlignIO.read(cline.outfile, "clustal") - # The length of the alignment will depend on the version of clustalw - # (clustalw 2.1 and clustalw 1.83 are certainly different). - output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal")) - self.assertCountEqual(input_records.keys(), output_records.keys()) - for record in align: - self.assertEqual(record.seq, output_records[record.id].seq) - self.assertEqual( - str(record.seq).replace("-", ""), input_records[record.id].seq - ) - - # Check the DND file was created. - # TODO - Try and parse this with Bio.Nexus? 
- self.assertTrue(os.path.isfile(tree_file)) - - def add_file_to_clean(self, filename): - """Add a file for deferred removal by the tearDown routine.""" - self.files_to_clean.add(filename) - - -class ClustalWTestErrorConditions(ClustalWTestCase): - """Test general error conditions.""" - - def test_empty_file(self): - """Test a non-existing input file.""" - input_file = "does_not_exist.fasta" - self.assertFalse(os.path.isfile(input_file)) - cline = ClustalwCommandline(clustalw_exe, infile=input_file) - - try: - stdout, stderr = cline() - except ApplicationError as err: - message = str(err) - self.assertTrue( - "Cannot open sequence file" in message - or "Cannot open input file" in message - or "Non-zero return code " in message, - message, - ) - else: - self.fail("expected an ApplicationError") - - def test_single_sequence(self): - """Test an input file containing a single sequence.""" - input_file = "Fasta/f001" - self.assertTrue(os.path.isfile(input_file)) - self.assertEqual(len(list(SeqIO.parse(input_file, "fasta"))), 1) - cline = ClustalwCommandline(clustalw_exe, infile=input_file) - - try: - stdout, stderr = cline() - # Zero return code is a possible bug in clustalw 2.1? - self.assertIn("cannot do multiple alignment", (stdout + stderr)) - except ApplicationError as err: - # Good, non-zero return code indicating an error in clustalw - # e.g. 
Using clustalw 1.83 get: - # Command 'clustalw -infile=Fasta/f001' returned non-zero exit status 4 - pass - - if os.path.isfile(input_file + ".aln"): - # Clustalw 2.1 made an empty aln file, clustalw 1.83 did not - self.add_file_to_clean(input_file + ".aln") - - def test_invalid_sequence(self): - """Test an input file containing an invalid sequence.""" - input_file = "Medline/pubmed_result1.txt" - self.assertTrue(os.path.isfile(input_file)) - cline = ClustalwCommandline(clustalw_exe, infile=input_file) - - with self.assertRaises(ApplicationError) as cm: - stdout, stderr = cline() - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - err = str(cm.exception) - # Ideally we'd catch the return code and raise the specific - # error for "invalid format", rather than just notice there - # is not output file. - # Note: - # Python 2.3 on Windows gave (0, 'Error') - # Python 2.5 on Windows gives [Errno 0] Error - self.assertTrue( - "invalid format" in err - or "not produced" in err - or "No sequences in file" in err - or "Non-zero return code " in err - ) - - -class ClustalWTestNormalConditions(ClustalWTestCase): - """Tests for normal conditions.""" - - def test_properties(self): - """Test passing options via properties.""" - cline = ClustalwCommandline(clustalw_exe) - cline.infile = "Fasta/f002" - cline.outfile = "temp_test.aln" - cline.align = True - - self.standard_test_procedure(cline) - - def test_simple_fasta(self): - """Test a simple fasta input file.""" - input_file = "Fasta/f002" - output_file = "temp_test.aln" - cline = ClustalwCommandline( - clustalw_exe, infile=input_file, outfile=output_file - ) - - self.standard_test_procedure(cline) - - def test_newtree(self): - """Test newtree files.""" - input_file = "Registry/seqs.fasta" - output_file = "temp_test.aln" - newtree_file = "temp_test.dnd" - cline = ClustalwCommandline( - clustalw_exe, - infile=input_file, - outfile=output_file, - newtree=newtree_file, - align=True, - ) - - 
self.standard_test_procedure(cline) - cline.newtree = "temp with space.dnd" - self.standard_test_procedure(cline) - - def test_large_input_file(self): - """Test a large input file.""" - # Create a large input file by converting another example file - # (See Bug 2804, this will produce so much output on stdout that - # subprocess could suffer a deadlock and hang). Using all the - # records should show the deadlock but is very slow - just thirty - # seems to lockup on Mac OS X, even 20 on Linux (without the fix). - input_file = "temp_cw_prot.fasta" - records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40] - with open(input_file, "w") as handle: - SeqIO.write(records, handle, "fasta") - del records - output_file = "temp_cw_prot.aln" - - cline = ClustalwCommandline( - clustalw_exe, infile=input_file, outfile=output_file - ) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - def test_input_filename_with_space(self): - """Test an input filename containing a space.""" - input_file = "Clustalw/temp horses.fasta" - with open(input_file, "w") as handle: - SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta") - output_file = "temp with space.aln" - - cline = ClustalwCommandline( - clustalw_exe, infile=input_file, outfile=output_file - ) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - def test_output_filename_with_spaces(self): - """Test an output filename containing spaces.""" - input_file = "GFF/multi.fna" - output_file = "temp with space.aln" - cline = ClustalwCommandline( - clustalw_exe, infile=input_file, outfile=output_file - ) - - self.standard_test_procedure(cline) - - -class ClustalWTestVersionTwoSpecific(ClustalWTestCase): - """Tests specific to ClustalW2.""" - - def test_statistics(self): - """Test a statistics file.""" - if clustalw_exe == "clustalw2": - input_file = "Fasta/f002" - output_file = "temp_test.aln" - statistics_file = "temp_stats.txt" - cline = ClustalwCommandline( 
- clustalw_exe, - infile=input_file, - outfile=output_file, - stats=statistics_file, - ) - - self.add_file_to_clean(statistics_file) - self.standard_test_procedure(cline) - self.assertTrue(os.path.isfile(statistics_file)) - else: - print("Skipping ClustalW2 specific test.") - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Dialign_tool.py b/Tests/test_Dialign_tool.py deleted file mode 100644 index 3201e59f2..000000000 --- a/Tests/test_Dialign_tool.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# Revisions 2009 copyright by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. -"""Unittests for Bio.Align.Applications interface for DIALIGN2-2.""" - -import os -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import DialignCommandline - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -dialign_exe = None -if sys.platform == "win32": - raise MissingExternalDependencyError("DIALIGN2-2 not available on Windows") -else: - from subprocess import getoutput - - output = getoutput("dialign2-2") - if "not found" not in output and "not recognized" not in output: - if "dialign2-2" in output.lower(): - dialign_exe = "dialign2-2" - if "DIALIGN2_DIR" not in os.environ: - raise MissingExternalDependencyError( - "Environment variable DIALIGN2_DIR for DIALIGN2-2 missing." - ) - if not os.path.isdir(os.environ["DIALIGN2_DIR"]): - raise MissingExternalDependencyError( - "Environment variable DIALIGN2_DIR for DIALIGN2-2 is not a valid directory." 
- ) - if not os.path.isfile(os.path.join(os.environ["DIALIGN2_DIR"], "BLOSUM")): - raise MissingExternalDependencyError( - "Environment variable DIALIGN2_DIR directory missing BLOSUM file." - ) - # TODO - check for tp400_dna, tp400_prot and tp400_trans too? - -if not dialign_exe: - raise MissingExternalDependencyError( - "Install DIALIGN2-2 if you want to use the Bio.Align.Applications wrapper." - ) - - -class DialignApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/f002" - # Standard output file - self.outfile1 = "Fasta/f002.ali" - # MSF output - self.outfile2 = "Fasta/f002.ms" - - def tearDown(self): - if os.path.isfile(self.outfile1): - os.remove(self.outfile1) - if os.path.isfile(self.outfile2): - os.remove(self.outfile2) - - def test_Dialign_simple(self): - """Simple round-trip through app with infile.""" - # Test using keyword arguments: - cmdline = DialignCommandline(dialign_exe, input=self.infile1) - self.assertEqual(str(cmdline), dialign_exe + " Fasta/f002") - stdout, stderr = cmdline() - self.assertEqual(stderr, "") - self.assertEqual(stdout, "") - self.assertTrue(os.path.exists(self.outfile1)) - - def test_Dialign_simple_with_options(self): - """Simple round-trip through app with infile and options.""" - cmdline = DialignCommandline(dialign_exe) - cmdline.set_parameter("input", self.infile1) - cmdline.set_parameter("-max_link", True) - cmdline.set_parameter("stars", 4) - self.assertEqual(str(cmdline), dialign_exe + " -max_link -stars 4 Fasta/f002") - stdout, stderr = cmdline() - self.assertEqual(stderr, "") - self.assertEqual(stdout, "") - self.assertTrue(os.path.exists(self.outfile1)) - - def test_Dialign_simple_with_MSF_output(self): - """Simple round-trip through app with infile, output MSF.""" - cmdline = DialignCommandline(dialign_exe) - # Test with properties - cmdline.input = self.infile1 - cmdline.msf = True - self.assertEqual(str(cmdline), dialign_exe + " -msf Fasta/f002") - stdout, stderr = cmdline() - 
self.assertEqual(stderr, "") - self.assertEqual(stdout, "") - self.assertTrue(os.path.exists(self.outfile1)) - self.assertTrue(os.path.exists(self.outfile2)) - - def test_Dialign_complex_command_line(self): - """Round-trip through app with complex command line.""" - cmdline = DialignCommandline(dialign_exe) - cmdline.set_parameter("input", self.infile1) - cmdline.set_parameter("-nt", True) - cmdline.set_parameter("-thr", 4) - cmdline.set_parameter("stars", 9) - cmdline.set_parameter("-ow", True) - cmdline.set_parameter("mask", True) - cmdline.set_parameter("-cs", True) - self.assertEqual( - str(cmdline), dialign_exe + " -cs -mask -nt -ow -stars 9 -thr 4 Fasta/f002" - ) - stdout, stderr = cmdline() - self.assertEqual(stderr, "") - self.assertTrue(os.path.exists(self.outfile1)) - self.assertTrue(stdout.startswith(" e_len = 633")) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Emboss.py b/Tests/test_Emboss.py deleted file mode 100644 index c3f0823f8..000000000 --- a/Tests/test_Emboss.py +++ /dev/null @@ -1,968 +0,0 @@ -# Copyright 2009 by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
-"""Runs a few EMBOSS tools to check our wrappers and parsers.""" - -import os -import subprocess -import sys -import unittest -import warnings -from io import StringIO - -from Bio import BiopythonDeprecationWarning - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import _escape_filename - from Bio.Emboss.Applications import NeedleCommandline - from Bio.Emboss.Applications import SeqmatchallCommandline - from Bio.Emboss.Applications import SeqretCommandline - from Bio.Emboss.Applications import WaterCommandline - -from Bio import AlignIO -from Bio import MissingExternalDependencyError -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Seq import translate -from Bio.SeqRecord import SeqRecord - -# ############################################################### - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -exes_wanted = ["water", "needle", "seqret", "transeq", "seqmatchall", "embossversion"] -exes = {} # Dictionary mapping from names to exe locations - -if "EMBOSS_ROOT" in os.environ: - # Windows default installation path is C:\mEMBOSS which contains the exes. - # EMBOSS also sets an environment variable which we will check for. - path = os.environ["EMBOSS_ROOT"] - if os.path.isdir(path): - for name in exes_wanted: - if os.path.isfile(os.path.join(path, name + ".exe")): - exes[name] = os.path.join(path, name + ".exe") - del name - else: - raise MissingExternalDependencyError( - f"$EMBOSS_ROOT={path!r} which does not exist!" 
- ) - del path -if sys.platform != "win32": - from subprocess import getoutput - - for name in exes_wanted: - # This will "just work" if installed on the path as normal on Unix - output = getoutput(f"{name} -help") - if "not found" not in output and "not recognized" not in output: - exes[name] = name - del output - del name - -if len(exes) < len(exes_wanted): - raise MissingExternalDependencyError( - "Install EMBOSS if you want to use Bio.Emboss." - ) - - -def get_emboss_version(): - """Return a tuple of three ints, e.g. (6,1,0).""" - # Windows and Unix versions of EMBOSS seem to differ in - # which lines go to stdout and stderr - so merge them. - child = subprocess.Popen( - _escape_filename(exes["embossversion"]), - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdout, stderr = child.communicate() - child.stdout.close() # This is both stdout and stderr - del child - assert stderr is None # Send to stdout instead - for line in stdout.split("\n"): - if line.strip() == "Report the current EMBOSS version number": - # e.g. - # $ embossversion - # Report the current EMBOSS version number - # 6.5.7.0 - pass - elif line.strip() == "Reports the current EMBOSS version number": - # e.g. - # $ embossversion - # Reports the current EMBOSS version number - # 6.3.1 - pass - elif line.startswith("Writes the current EMBOSS version number"): - pass - elif line.count(".") == 2: - return tuple(int(v) for v in line.strip().split(".")) - elif line.count(".") == 3: - # e.g. I installed mEMBOSS-6.2.0.1-setup.exe - # which reports 6.2.0.1 - for this return (6,2,0) - return tuple(int(v) for v in line.strip().split("."))[:3] - else: - # Either we can't understand the output, or this is really - # an error message not caught earlier (e.g. not in English) - raise MissingExternalDependencyError( - f"Install EMBOSS if you want to use Bio.Emboss ({line})." - ) - # In case there was no output at all... 
- raise MissingExternalDependencyError("Could not get EMBOSS version") - - -# To avoid confusing known errors from old versions of EMBOSS ... -emboss_version = get_emboss_version() -if emboss_version < (6, 1, 0): - raise MissingExternalDependencyError("Test requires EMBOSS 6.1.0 patch 3 or later.") - -################################################################# - - -# Top level function as this makes it easier to use for debugging: -def emboss_piped_SeqIO_convert(records, old_format, new_format): - """Run seqret, returns records (as a generator).""" - # Setup, this assumes for all the format names used - # Biopython and EMBOSS names are consistent! - cline = SeqretCommandline( - exes["seqret"], - sformat=old_format, - osformat=new_format, - auto=True, # no prompting - filter=True, - ) - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - SeqIO.write(records, child.stdin, old_format) - child.stdin.close() - child.stderr.close() - records = SeqIO.parse(child.stdout, new_format) - yield from records - child.stdout.close() - - -# Top level function as this makes it easier to use for debugging: -def emboss_piped_AlignIO_convert(alignments, old_format, new_format): - """Run seqret, returns alignments (as a generator).""" - # Setup, this assumes for all the format names used - # Biopython and EMBOSS names are consistent! 
- cline = SeqretCommandline( - exes["seqret"], - sformat=old_format, - osformat=new_format, - auto=True, # no prompting - filter=True, - ) - # Run the tool, - with subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) as child: - AlignIO.write(alignments, child.stdin, old_format) - child.stdin.close() - aligns = list(AlignIO.parse(child.stdout, new_format)) - return aligns - - -class SeqRetTests(unittest.TestCase): - """Base class providing SeqRecord comparison method.""" - - def compare_records(self, old_records, new_records, msg=None): - self.assertEqual(len(old_records), len(new_records), msg) - for old, new in zip(old_records, new_records): - # Note the name matching is a bit fuzzy, e.g. truncation and - # no spaces in PHYLIP files. - self.assertTrue( - (old.id in new.id) - or (new.id in old.id) - or (old.id.replace(" ", "_") == new.id.replace(" ", "_")) - or (old.name == new.name), - msg, - ) - self.assertEqual(len(old.seq), len(new.seq), msg) - if old.seq.upper() != new.seq.upper(): - raise Exception - if str(old.seq).replace("X", "N") == str(new.seq): - self.fail(f"{msg}: X -> N (protein forced into nucleotide?)") - else: - self.assertEqual(old.seq, new.seq, msg) - if old.features and new.features: - self.assertEqual(len(old.features), len(new.features), msg) - # TODO - check annotation - - -class SeqRetSeqIOTests(SeqRetTests): - """Check EMBOSS seqret against Bio.SeqIO for converting files.""" - - def tearDown(self): - clean_up() - - def check_SeqIO_to_EMBOSS(self, in_filename, in_format, skip_formats=()): - """Check SeqIO writes files seqret can read back.""" - records = list(SeqIO.parse(in_filename, in_format)) - for temp_format in ["genbank", "embl", "fasta"]: - if temp_format in skip_formats: - continue - new_records = list( - emboss_piped_SeqIO_convert(records, temp_format, "fasta") - ) - msg = f"converting {in_filename} from 
{in_format} to {temp_format}" - self.compare_records(records, new_records, msg) - - def check_EMBOSS_to_SeqIO(self, filename, old_format, skip_formats=()): - """Check SeqIO can read read seqret's conversion output.""" - # TODO: Why can't we read EMBOSS's swiss output? - self.assertTrue(os.path.isfile(filename)) - old_records = list(SeqIO.parse(filename, old_format)) - for new_format in ["genbank", "fasta", "pir", "embl", "ig"]: - if new_format in skip_formats: - continue - cline = SeqretCommandline( - exes["seqret"], - sequence=filename, - sformat=old_format, - osformat=new_format, - auto=True, # no prompting - stdout=True, - ) - # Run the tool, - with subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) as child: - child.stdin.close() - new_records = list(SeqIO.parse(child.stdout, new_format)) - msg = f"converting {filename} from {old_format} to {new_format}" - self.compare_records(old_records, new_records, msg) - - def check_SeqIO_with_EMBOSS(self, filename, old_format, skip_formats=()): - # Check EMBOSS can read Bio.SeqIO output... - self.check_SeqIO_to_EMBOSS(filename, old_format, skip_formats) - # Check Bio.SeqIO can read EMBOSS seqret output... 
- self.check_EMBOSS_to_SeqIO(filename, old_format, skip_formats) - - def test_abi(self): - """Check SeqIO agrees with EMBOSS' Abi to FASTQ conversion.""" - # This lets use check the id, sequence, and quality scores - for filename in ["Abi/3730.ab1", "Abi/empty.ab1"]: - old = SeqIO.read(filename, "abi") - cline = SeqretCommandline( - exes["seqret"], - sequence=filename, - sformat="abi", - osformat="fastq-sanger", - auto=True, # no prompting - stdout=True, - ) - # Run the tool, - with subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) as child: - child.stdin.close() - new = SeqIO.read(child.stdout, "fastq-sanger") - if emboss_version == (6, 4, 0) and new.id == "EMBOSS_001": - # Avoid bug in EMBOSS 6.4.0 (patch forthcoming) - pass - else: - self.assertEqual(old.id, new.id) - self.assertEqual(old.seq, new.seq) - if emboss_version < (6, 3, 0) and new.letter_annotations[ - "phred_quality" - ] == [1] * len(old): - # Apparent bug in EMBOSS 6.2.0.1 on Windows - pass - else: - self.assertEqual(old.letter_annotations, new.letter_annotations) - - def test_genbank(self): - """Check SeqIO & EMBOSS reading each other's conversions of a GenBank file.""" - self.check_SeqIO_with_EMBOSS("GenBank/cor6_6.gb", "genbank") - - def test_genbank2(self): - """Check SeqIO & EMBOSS reading each other's conversions of another GenBank file.""" - self.check_SeqIO_with_EMBOSS("GenBank/NC_000932.gb", "genbank") - - def test_embl(self): - """Check SeqIO & EMBOSS reading each other's conversions of an EMBL file.""" - self.check_SeqIO_with_EMBOSS("EMBL/U87107.embl", "embl") - - def test_ig(self): - """Check SeqIO & EMBOSS reading each other's conversions of an ig file.""" - # NOTE - EMBOSS considers "genbank" to be for nucleotides only, - # and will turn "X" into "N" for GenBank output. 
- self.check_SeqIO_to_EMBOSS( - "IntelliGenetics/VIF_mase-pro.txt", "ig", skip_formats=["genbank", "embl"] - ) - # TODO - What does a % in an ig sequence mean? - # e.g. "IntelliGenetics/vpu_nucaligned.txt" - # and "IntelliGenetics/TAT_mase_nuc.txt" - # EMBOSS seems to ignore them. - - def test_pir(self): - """Check SeqIO & EMBOSS reading each other's conversions of a PIR file.""" - # Skip genbank here, EMBOSS mangles the LOCUS line: - self.check_SeqIO_with_EMBOSS( - "NBRF/clustalw.pir", "pir", skip_formats=["genbank"] - ) - # Skip EMBL here, EMBOSS mangles the ID line - # Skip GenBank, EMBOSS 6.0.1 on Windows won't output proteins as GenBank - self.check_SeqIO_with_EMBOSS( - "NBRF/DMB_prot.pir", "pir", skip_formats=["embl", "genbank"] - ) - - def test_clustalw(self): - """Check SeqIO & EMBOSS reading each other's conversions of a Clustalw file.""" - self.check_SeqIO_with_EMBOSS( - "Clustalw/hedgehog.aln", "clustal", skip_formats=["embl", "genbank"] - ) - self.check_SeqIO_with_EMBOSS( - "Clustalw/opuntia.aln", "clustal", skip_formats=["embl", "genbank"] - ) - - -class SeqRetAlignIOTests(SeqRetTests): - """Check EMBOSS seqret against Bio.AlignIO for converting files.""" - - def tearDown(self): - clean_up() - - def compare_alignments(self, old_list, new_list, msg=None): - self.assertEqual(len(old_list), len(new_list), msg) - for old, new in zip(old_list, new_list): - self.compare_records(old, new, msg) - - def check_EMBOSS_to_AlignIO(self, filename, old_format, skip_formats=()): - """Check AlignIO can read seqret's conversion of the file.""" - self.assertTrue(os.path.isfile(filename), filename) - old_aligns = list(AlignIO.parse(filename, old_format)) - formats = ["clustal", "phylip", "ig", "msf"] - if len(old_aligns) == 1: - formats.extend(["fasta", "nexus"]) - for new_format in formats: - if new_format in skip_formats: - continue - cline = SeqretCommandline( - exes["seqret"], - sequence=filename, - sformat=old_format, - osformat=new_format, - auto=True, # no 
prompting - stdout=True, - ) - # Run the tool, - with subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) as child: - child.stdin.close() - new_aligns = list(AlignIO.parse(child.stdout, new_format)) - msg = f"converting {filename} from {old_format} to {new_format}" - self.compare_alignments(old_aligns, new_aligns, msg) - - def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=()): - """Check Bio.AlignIO can write files seqret can read.""" - old_aligns = list(AlignIO.parse(in_filename, in_format)) - - formats = ["clustal", "phylip"] - if len(old_aligns) == 1: - formats.extend(["fasta", "nexus"]) - for temp_format in formats: - if temp_format in skip_formats: - continue - # PHYLIP is a simple format which explicitly supports - # multiple alignments (unlike FASTA). - try: - new_aligns = list( - emboss_piped_AlignIO_convert(old_aligns, temp_format, "phylip") - ) - except ValueError as e: - self.assertIn( - str(e), - ( - "Need the molecule type to be defined", - "Repeated name 'AT3G20900.' (originally 'AT3G20900.1-SEQ'), possibly due to truncation", - "Repeated name 'gi|1377497' (originally 'gi|13774975|gb|AAK39115.1|AF35'), possibly due to truncation", - "Repeated name 'gi_1393639' (originally 'gi_13936397_dbj_BAB47195.'), possibly due to truncation", - ), - ) - continue - msg = f"converting {in_filename} from {in_format} to {temp_format}" - self.compare_alignments(old_aligns, new_aligns, msg) - - def check_AlignIO_with_EMBOSS(self, filename, old_format, skip_formats=()): - # Check EMBOSS can read Bio.AlignIO output... - self.check_AlignIO_to_EMBOSS(filename, old_format, skip_formats) - # Check Bio.AlignIO can read EMBOSS seqret output... 
- self.check_EMBOSS_to_AlignIO(filename, old_format, skip_formats) - - def test_align_clustalw(self): - """Check AlignIO & EMBOSS reading each other's conversions of a ClustalW file.""" - self.check_AlignIO_with_EMBOSS("Clustalw/hedgehog.aln", "clustal") - self.check_AlignIO_with_EMBOSS("Clustalw/opuntia.aln", "clustal") - self.check_AlignIO_with_EMBOSS( - "Clustalw/odd_consensus.aln", "clustal", skip_formats=["nexus"] - ) # TODO - why not nexus? - self.check_AlignIO_with_EMBOSS("Clustalw/protein.aln", "clustal") - self.check_AlignIO_with_EMBOSS("Clustalw/promals3d.aln", "clustal") - - def test_clustalw(self): - """Check AlignIO & EMBOSS reading each other's conversions of a PHYLIP file.""" - self.check_AlignIO_with_EMBOSS("Phylip/horses.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/hennigian.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/reference_dna.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/reference_dna2.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/interlaced.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/interlaced2.phy", "phylip") - self.check_AlignIO_with_EMBOSS("Phylip/random.phy", "phylip") - - -class PairwiseAlignmentTests(unittest.TestCase): - """Run pairwise alignments with water and needle, and parse them.""" - - def tearDown(self): - clean_up() - - def pairwise_alignment_check(self, query_seq, targets, alignments, local=True): - """Check pairwise alignment data is sane.""" - # The datasets should be small, so making iterators into lists is OK - targets = list(targets) - alignments = list(alignments) - self.assertEqual(len(targets), len(alignments)) - for target, alignment in zip(targets, alignments): - self.assertEqual(len(alignment), 2) - # self.assertEqual(target.id, alignment[1].id) #too strict - msg = f"{alignment[1].id} vs {target.id} or {target.name}" - self.assertTrue( - alignment[1].id in target.id or alignment[1].id in target.name, msg=msg - ) - if local: - # Local alignment - 
self.assertIn(str(alignment[0].seq).replace("-", ""), query_seq) - self.assertIn( - str(alignment[1].seq).replace("-", "").upper(), target.seq.upper() - ) - else: - # Global alignment - self.assertEqual(query_seq, str(alignment[0].seq).replace("-", "")) - self.assertEqual( - target.seq.upper(), str(alignment[1].seq).replace("-", "").upper() - ) - return True - - def run_water(self, cline): - # Run the tool, - stdout, stderr = cline() - self.assertTrue( - stderr.strip().startswith("Smith-Waterman local alignment"), stderr - ) - if cline.outfile: - self.assertEqual(stdout.strip(), "") - self.assertTrue( - os.path.isfile(cline.outfile), - f"Missing output file {cline.outfile!r} from:\n{cline}", - ) - else: - # Don't use this yet... could return stdout handle instead? - return stdout - - def test_water_file(self): - """Run water with the asis trick, output to a file.""" - # Setup, try a mixture of keyword arguments and later additions: - cline = WaterCommandline(cmd=exes["water"], gapopen="10", gapextend="0.5") - # Try using both human readable names, and the literal ones: - cline.set_parameter("asequence", "asis:ACCCGGGCGCGGT") - cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT") - # Try using a property set here: - cline.outfile = "Emboss/temp with space.water" - self.assertEqual(str(eval(repr(cline))), str(cline)) - # Run the tool, - self.run_water(cline) - # Check we can parse the output... 
- align = AlignIO.read(cline.outfile, "emboss") - self.assertEqual(len(align), 2) - self.assertEqual(align[0].seq, "ACCCGGGCGCGGT") - self.assertEqual(align[1].seq, "ACCCGAGCGCGGT") - # Clean up, - os.remove(cline.outfile) - - def test_water_piped(self): - """Run water with asis trick, output piped to stdout.""" - cline = WaterCommandline( - cmd=exes["water"], - asequence="asis:ACCCGGGCGCGGT", - bsequence="asis:ACCCGAGCGCGGT", - gapopen=10, - gapextend=0.5, - auto=True, - filter=True, - ) - self.assertEqual( - str(cline), - exes["water"] - + " -auto -filter" - + " -asequence=asis:ACCCGGGCGCGGT" - + " -bsequence=asis:ACCCGAGCGCGGT" - + " -gapopen=10 -gapextend=0.5", - ) - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - child.stdin.close() - # Check we could read its output - align = AlignIO.read(child.stdout, "emboss") - self.assertEqual(len(align), 2) - self.assertEqual(align[0].seq, "ACCCGGGCGCGGT") - self.assertEqual(align[1].seq, "ACCCGAGCGCGGT") - # Check no error output: - self.assertEqual(child.stderr.read(), "") - self.assertEqual(0, child.wait()) - child.stdout.close() - child.stderr.close() - - def test_needle_file(self): - """Run needle with the asis trick, output to a file.""" - # Setup, - cline = NeedleCommandline(cmd=exes["needle"]) - cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT") - cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT") - cline.set_parameter("-gapopen", "10") - cline.set_parameter("-gapextend", "0.5") - # EMBOSS would guess this, but let's be explicit: - cline.set_parameter("-snucleotide", "True") - cline.set_parameter("-outfile", "Emboss/temp with space.needle") - self.assertEqual(str(eval(repr(cline))), str(cline)) - # Run the tool, - stdout, stderr = cline() - # Check it worked, - self.assertTrue( - stderr.strip().startswith("Needleman-Wunsch global alignment"), stderr - 
) - self.assertEqual(stdout.strip(), "") - filename = cline.outfile - self.assertTrue( - os.path.isfile(filename), - f"Missing output file {filename!r} from:\n{cline}", - ) - # Check we can parse the output... - align = AlignIO.read(filename, "emboss") - self.assertEqual(len(align), 2) - self.assertEqual(align[0].seq, "ACCCGGGCGCGGT") - self.assertEqual(align[1].seq, "ACCCGAGCGCGGT") - # Clean up, - os.remove(filename) - - def test_needle_piped(self): - """Run needle with asis trick, output piped to stdout.""" - cline = NeedleCommandline( - cmd=exes["needle"], - asequence="asis:ACCCGGGCGCGGT", - bsequence="asis:ACCCGAGCGCGGT", - gapopen=10, - gapextend=0.5, - auto=True, - filter=True, - ) - self.assertEqual( - str(cline), - exes["needle"] - + " -auto -filter" - + " -asequence=asis:ACCCGGGCGCGGT" - + " -bsequence=asis:ACCCGAGCGCGGT" - + " -gapopen=10 -gapextend=0.5", - ) - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - child.stdin.close() - # Check we could read its output - align = AlignIO.read(child.stdout, "emboss") - self.assertEqual(len(align), 2) - self.assertEqual(align[0].seq, "ACCCGGGCGCGGT") - self.assertEqual(align[1].seq, "ACCCGAGCGCGGT") - # Check no error output: - self.assertEqual(child.stderr.read(), "") - self.assertEqual(0, child.wait()) - child.stdout.close() - child.stderr.close() - - def test_water_file2(self): - """Run water with the asis trick and nucleotide FASTA file, output to a file.""" - # Setup, - query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG" - out_file = "Emboss/temp_test2.water" - in_file = "Fasta/f002" - self.assertTrue(os.path.isfile(in_file)) - if os.path.isfile(out_file): - os.remove(out_file) - cline = WaterCommandline(cmd=exes["water"]) - cline.set_parameter("-asequence", 
f"asis:{query}") - cline.set_parameter("-bsequence", in_file) - cline.set_parameter("-gapopen", "10") - cline.set_parameter("-gapextend", "0.5") - cline.set_parameter("-outfile", out_file) - self.assertEqual(str(eval(repr(cline))), str(cline)) - # Run the tool, - self.run_water(cline) - # Check we can parse the output and it is sensible... - self.pairwise_alignment_check( - query, - SeqIO.parse(in_file, "fasta"), - AlignIO.parse(out_file, "emboss"), - local=True, - ) - # Clean up, - os.remove(out_file) - - def test_water_file3(self): - """Run water with the asis trick and GenBank file, output to a file.""" - # Setup, - query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA" - out_file = "Emboss/temp_test3.water" - in_file = "GenBank/cor6_6.gb" - self.assertTrue(os.path.isfile(in_file)) - if os.path.isfile(out_file): - os.remove(out_file) - cline = WaterCommandline(cmd=exes["water"]) - cline.set_parameter("asequence", f"asis:{query}") - cline.set_parameter("bsequence", in_file) - # TODO - Tell water this is a GenBank file! - cline.set_parameter("gapopen", "1") - cline.set_parameter("gapextend", "0.5") - cline.set_parameter("outfile", out_file) - self.assertEqual(str(eval(repr(cline))), str(cline)) - # Run the tool, - self.run_water(cline) - # Check we can parse the output and it is sensible... 
- self.pairwise_alignment_check( - query, - SeqIO.parse(in_file, "genbank"), - AlignIO.parse(out_file, "emboss"), - local=True, - ) - # Clean up, - os.remove(out_file) - - def test_water_file4(self): - """Run water with the asis trick and SwissProt file, output to a file.""" - # Setup, - query = "DVCTGKALCDPVTQNIKTYPVKIENLRVMI" - out_file = "Emboss/temp_test4.water" - in_file = "SwissProt/P0A186.txt" - self.assertTrue(os.path.isfile(in_file)) - if os.path.isfile(out_file): - os.remove(out_file) - cline = WaterCommandline(cmd=exes["water"]) - cline.set_parameter("-asequence", f"asis:{query}") - cline.set_parameter("-bsequence", in_file) - # EMBOSS should work this out, but let's be explicit: - cline.set_parameter("-sprotein", True) - # TODO - Tell water this is a SwissProt file! - cline.set_parameter("-gapopen", "20") - cline.set_parameter("-gapextend", "5") - cline.set_parameter("-outfile", out_file) - self.assertEqual(str(eval(repr(cline))), str(cline)) - # Run the tool, - self.run_water(cline) - # Check we can parse the output and it is sensible... - self.pairwise_alignment_check( - query, - SeqIO.parse(in_file, "swiss"), - AlignIO.parse(out_file, "emboss"), - local=True, - ) - # Clean up, - os.remove(out_file) - - def test_needle_piped2(self): - """Run needle with asis trick, and nucleotide FASTA file, output piped to stdout.""" - # TODO - Support needle in Bio.Emboss.Applications - # (ideally with the -auto and -filter arguments) - # Setup, - query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAA" - cline = exes["needle"] - cline += " -asequence asis:" + query - cline += " -bsequence Fasta/f002" - cline += " -auto" # no prompting - cline += " -filter" # use stdout - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - child.stdin.close() - # Check we can parse the output and it is sensible... 
- self.pairwise_alignment_check( - query, - SeqIO.parse("Fasta/f002", "fasta"), - AlignIO.parse(child.stdout, "emboss"), - local=False, - ) - # Check no error output: - self.assertEqual(child.stderr.read(), "") - self.assertEqual(0, child.wait()) - child.stdout.close() - child.stderr.close() - - def test_water_needs_output(self): - """Run water without output file or stdout/filter should give error.""" - cline = WaterCommandline( - cmd=exes["water"], - asequence="asis:ACCCGGGCGCGGT", - bsequence="asis:ACCCGAGCGCGGT", - gapopen=10, - gapextend=0.5, - auto=True, - ) - self.assertTrue(cline.auto) - self.assertTrue(not cline.stdout) - self.assertTrue(not cline.filter) - self.assertIsNone(cline.outfile) - self.assertRaises(ValueError, str, cline) - - def test_needle_needs_output(self): - """Run needle without output file or stdout/filter should give error.""" - cline = NeedleCommandline( - cmd=exes["needle"], - asequence="asis:ACCCGGGCGCGGT", - bsequence="asis:ACCCGAGCGCGGT", - gapopen=10, - gapextend=0.5, - auto=True, - ) - self.assertTrue(cline.auto) - self.assertTrue(not cline.stdout) - self.assertTrue(not cline.filter) - self.assertIsNone(cline.outfile) - self.assertRaises(ValueError, str, cline) - - def test_seqtmatchall_piped(self): - """Run seqmatchall with pair output piped to stdout.""" - cline = SeqmatchallCommandline( - cmd=exes["seqmatchall"], - sequence="Fasta/f002", - aformat="pair", - wordsize=9, - auto=True, - stdout=True, - ) - self.assertEqual( - str(cline), - exes["seqmatchall"] - + " -auto -stdout" - + " -sequence=Fasta/f002" - + " -wordsize=9 -aformat=pair", - ) - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - child.stdin.close() - # Check we could read its output - for align in AlignIO.parse(child.stdout, "emboss"): - self.assertEqual(len(align), 2) - 
self.assertEqual(align.get_alignment_length(), 9) - # Check no error output: - self.assertEqual(child.stderr.read(), "") - self.assertEqual(0, child.wait()) - child.stdout.close() - child.stderr.close() - - -class TranslationTests(unittest.TestCase): - """Run pairwise alignments with water and needle, and parse them.""" - - def tearDown(self): - clean_up() - - def test_simple(self): - """Run transeq vs Bio.Seq for simple translations (including alt tables).""" - examples = [ - Seq("ACGTGACTGACGTAGCATGCCACTAGG"), - # Unamibguous TA? codons: - Seq("TAATACTATTAG"), - # Most of the ambiguous TA? codons: - Seq("TANTARTAYTAMTAKTAHTABTADTAV"), - # Problem cases, - # - # Seq("TAW"), - # W = A or T, but EMBOSS does TAW -> X - # TAA -> Y, TAT ->Y, so in Biopython TAW -> Y - # - # Seq("TAS"), - # S = C or G, but EMBOSS does TAS -> Y - # TAG -> *, TAC ->Y, so in Biopython TAS -> X (Y or *) - # - # Seq("AAS"), - # On table 9, EMBOSS gives N, we give X. - # S = C or G, so according to my reading of - # table 9 on the NCBI page, AAC=N, AAG=K - # suggesting this is a bug in EMBOSS. - # - Seq("ACGGGGGGGGTAAGTGGTGTGTGTGTAGT"), - ] - - for sequence in examples: - # EMBOSS treats spare residues differently... avoid this issue - if len(sequence) % 3 != 0: - sequence = sequence[: -(len(sequence) % 3)] - self.assertEqual(len(sequence) % 3, 0) - self.assertGreater(len(sequence), 0) - self.check(sequence) - - def check_emboss_translate(self, sequence, table=None, frame=None): - """Call transeq, returns protein sequence as string.""" - # TODO - Support transeq in Bio.Emboss.Applications? - # (doesn't seem worthwhile as Biopython can do translations) - - # Setup, - cline = exes["transeq"] - - if len(sequence) < 100: - filename = None - cline += f" -sequence asis:{sequence}" - else: - # There are limits on command line string lengths... - # use a temp file instead. 
- filename = "Emboss/temp_transeq.txt" - SeqIO.write(SeqRecord(sequence, id="Test"), filename, "fasta") - cline += f" -sequence {filename}" - - cline += " -auto" # no prompting - cline += " -filter" # use stdout - if table is not None: - cline += f" -table {table!s}" - if frame is not None: - cline += f" -frame {frame!s}" - # Run the tool, - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - out, err = child.communicate() - - msg = f"cline='{cline}'" - # Check no error output: - self.assertEqual(err, "", msg=msg) - - # Check we could read its output - record = SeqIO.read(StringIO(out), "fasta") - - result = child.wait() - self.assertEqual(result, 0, msg=msg) - - if filename: - os.remove(filename) - self.assertTrue(record.id.startswith("Test"), msg=msg) - else: - self.assertTrue(record.id.startswith("asis"), msg=msg) - - translation = record.seq - if table is None: - table = 1 - self.assertEqual(translation, sequence.translate(table)) - self.assertEqual(translation, translate(sequence, table)) - self.assertEqual(translation, translate(str(sequence), table)) - # More details... - for i, amino in enumerate(translation): - codon = sequence[i * 3 : i * 3 + 3] - msg = f"codon {codon}, table {table}" - self.assertEqual(amino, codon.translate(table), msg=msg) - - def check(self, sequence): - """Compare our translation to EMBOSS's using all tables. - - Takes a Seq object (and a filename containing it). - """ - self.check_emboss_translate(sequence) - - for table in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23]: - self.check_emboss_translate(sequence, table) - - def translate_all_codons(self, letters): - sequence = Seq( - "".join(c1 + c3 + c3 for c1 in letters for c2 in letters for c3 in letters) - ) - self.check(sequence) - - # def test_all_ambig_dna_codons(self): - # """transeq vs Bio.Seq on ambiguous DNA codons (inc. 
alt tables).""" - # self.translate_all_codons(ambiguous_dna_letters) - - def test_all_unambig_dna_codons(self): - """Run transeq vs Bio.Seq on unambiguous DNA codons (inc. alt tables).""" - self.translate_all_codons("ATCGatcg") - - def test_all_unambig_rna_codons(self): - """Run transeq vs Bio.Seq on unambiguous RNA codons (inc. alt tables).""" - self.translate_all_codons("AUCGaucg") - - def test_mixed_unambig_rna_codons(self): - """Run transeq vs Bio.Seq on unambiguous DNA/RNA codons (inc. alt tables).""" - self.translate_all_codons("ATUCGatucg") - - -def clean_up(): - """Fallback clean up method to remove temp files.""" - for filename in os.listdir("Emboss"): - if filename.startswith("temp_"): - try: - os.remove(filename) - except Exception: # TODO - Which exceptions? - pass - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) - clean_up() diff --git a/Tests/test_EmbossPhylipNew.py b/Tests/test_EmbossPhylipNew.py deleted file mode 100644 index e11b81562..000000000 --- a/Tests/test_EmbossPhylipNew.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright 2009 by David Winter. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Tests for EmbossPhylipNew module.""" - -import os -import sys -import unittest -import warnings - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio.Nexus import Trees # One day we should use planned TreeIO module - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Emboss.Applications import FConsenseCommandline - from Bio.Emboss.Applications import FDNADistCommandline - from Bio.Emboss.Applications import FDNAParsCommandline - from Bio.Emboss.Applications import FNeighborCommandline - from Bio.Emboss.Applications import FProtDistCommandline - from Bio.Emboss.Applications import FProtParsCommandline - from Bio.Emboss.Applications import FSeqBootCommandline - from Bio.Emboss.Applications import FTreeDistCommandline - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -exes_wanted = [ - "fdnadist", - "fneighbor", - "fprotdist", - "fprotpars", - "fconsense", - "fseqboot", - "ftreedist", - "fdnapars", -] -exes = {} # Dictionary mapping from names to exe locations - -if "EMBOSS_ROOT" in os.environ: - # Windows default installation path is C:\mEMBOSS which contains the exes. - # EMBOSS also sets an environment variable which we will check for. 
- path = os.environ["EMBOSS_ROOT"] - if os.path.isdir(path): - for name in exes_wanted: - if os.path.isfile(os.path.join(path, name + ".exe")): - exes[name] = os.path.join(path, name + ".exe") - del path, name -if sys.platform != "win32": - from subprocess import getoutput - - for name in exes_wanted: - # This will "just work" if installed on the path as normal on Unix - output = getoutput(f"{name} -help") - if "not found" not in output and "not recognized" not in output: - exes[name] = name - del output - del name - -if len(exes) < len(exes_wanted): - raise MissingExternalDependencyError( - "Install the Emboss package 'PhylipNew' if you want to use the " - "Bio.Emboss.Applications wrappers for phylogenetic tools." - ) - -# ######################################################################### - - -# A few top level functions that are called repeatedly in the test cases -def clean_up(): - """Delete tests files (to be used as tearDown() function in test fixtures).""" - for filename in ["test_file", "Phylip/opuntia.phy", "Phylip/hedgehog.phy"]: - if os.path.isfile(filename): - os.remove(filename) - - -def parse_trees(filename): - """Parse trees. - - Helper function until we have Bio.Phylo on trunk. - """ - # TODO - Can this be removed now? 
- with open("test_file") as handle: - data = handle.read() - for tree_str in data.split(";\n"): - if tree_str: - yield Trees.Tree(tree_str + ";") - - -class DistanceTests(unittest.TestCase): - """Tests for calculating distance based phylogenetic trees with phylip.""" - - def tearDown(self): - clean_up() - - test_taxa = [ - "Archaeohip", - "Calippus", - "Hypohippus", - "M._secundu", - "Merychippu", - "Mesohippus", - "Nannipus", - "Neohippari", - "Parahippus", - "Pliohippus", - ] - - def distances_from_alignment(self, filename, DNA=True): - """Check we can make a distance matrix from a given alignment.""" - self.assertTrue(os.path.isfile(filename), f"Missing {filename}") - if DNA: - cline = FDNADistCommandline( - exes["fdnadist"], - method="j", - sequence=filename, - outfile="test_file", - auto=True, - ) - else: - cline = FProtDistCommandline( - exes["fprotdist"], - method="j", - sequence=filename, - outfile="test_file", - auto=True, - ) - stdout, strerr = cline() - # biopython can't grok distance matrices, so we'll just check it exists - self.assertTrue(os.path.isfile("test_file")) - - def tree_from_distances(self, filename): - """Check we can estimate a tree from a distance matrix.""" - self.assertTrue(os.path.isfile(filename), f"Missing {filename}") - cline = FNeighborCommandline( - exes["fneighbor"], - datafile=filename, - outtreefile="test_file", - auto=True, - filter=True, - ) - stdout, stderr = cline() - for tree in parse_trees("test_file"): - tree_taxa = [t.replace(" ", "_") for t in tree.get_taxa()] - self.assertEqual(self.test_taxa, sorted(tree_taxa)) - - def test_distances_from_phylip_DNA(self): - """Calculate a distance matrix from an phylip alignment.""" - self.distances_from_alignment("Phylip/horses.phy") - - def test_distances_from_AlignIO_DNA(self): - """Calculate a distance matrix from an alignment written by AlignIO.""" - n = AlignIO.convert( - "Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip" - ) - self.assertEqual(n, 1) - 
self.distances_from_alignment("Phylip/opuntia.phy") - - # def test_distances_from_bootstrapped_phylip_DNA(self): - # """Calculate a set of distance matrices from phylip alignments""" - # self.distances_from_alignment("Phylip/bs_horses.phy") - - # fprotdist tests - def test_distances_from_protein_phylip(self): - """Calculate a distance matrix from phylip protein alignment.""" - self.distances_from_alignment("Phylip/interlaced.phy", DNA=False) - - def test_distances_from_protein_AlignIO(self): - """Calculate distance matrix from an AlignIO written protein alignment.""" - n = AlignIO.convert( - "Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip" - ) - self.assertEqual(n, 1) - self.distances_from_alignment("Phylip/hedgehog.phy", DNA=False) - - # def test_distances_from_bootstrapped_phylip_protein(self): - # """Calculate distance matrices from a bootstrapped protein alignment""" - # self.distances_from_alignment("Clustalw/bs_interlaced.phy", DNA=False) - - # fneighbor tests - # def test_tree_from_distances(self): - # """Estimate tree from distance matrix and parse it.""" - # self.tree_from_distances("Phylip/horses.fdnadist") - - # This one won't work because of a bug in EMBOSS 6.0.1 - # def test_tree_from_bootstrapped_distances(self): - # """Estimate tree from bootstrapped distance matrix and parse it""" - # self.tree_from_distances("Phylip/bs_horses.fdnadist") - - -class ParsimonyTests(unittest.TestCase): - """Tests for estimating parsimony based phylogenetic trees with phylip.""" - - def tearDown(self): - clean_up() - - def parsimony_tree(self, filename, format, DNA=True): - """Estimate a parsimony tree from an alignment.""" - self.assertTrue(os.path.isfile(filename), f"Missing {filename}") - if DNA: - cline = FDNAParsCommandline( - exes["fdnapars"], - sequence=filename, - outtreefile="test_file", - auto=True, - stdout=True, - ) - else: - cline = FProtParsCommandline( - exes["fprotpars"], - sequence=filename, - outtreefile="test_file", - auto=True, - 
stdout=True, - ) - stdout, stderr = cline() - with open(filename) as handle: - a_taxa = [ - s.name.replace(" ", "_") for s in next(AlignIO.parse(handle, format)) - ] - for tree in parse_trees("test_file"): - t_taxa = [t.replace(" ", "_") for t in tree.get_taxa()] - self.assertEqual(sorted(a_taxa), sorted(t_taxa)) - - # fdnapars tests - # def test_parsimony_tree_from_phylip_DNA(self): - # """Make a parsimony tree from a phylip DNA alignment""" - # self.parsimony_tree("Phylip/horses.phy", "phylip") - - def test_parsimony_tree_from_AlignIO_DNA(self): - """Make a parsimony tree from an alignment written with AlignIO.""" - n = AlignIO.convert( - "Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip" - ) - self.assertEqual(n, 1) - self.parsimony_tree("Phylip/opuntia.phy", "phylip") - - # def test_parsimony_bootstrapped_phylip_DNA(self): - # """Make a parsimony tree from a bootstrapped phylip DNA alignment""" - # self.parsimony_tree("Phylip/bs_horses.phy", "phylip") - - # fprotpars tests - # def test_parsimony_tree_from_phylip_protein(self): - # """Make a parsimony tree from a phylip DNA alignment""" - # self.parsimony_tree("Phylip/interlaced.phy", "phylip", DNA=False) - - def test_parsimony_from_AlignIO_protein(self): - """Make a parsimony tree from protein alignment written with AlignIO.""" - n = AlignIO.convert( - "Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip" - ) - self.parsimony_tree("Phylip/interlaced.phy", "phylip", DNA=False) - - # def test_parsimony_tree_bootstrapped_phylip_protein(self): - # """Make a parsimony tree from a phylip DNA alignment""" - # self.parsimony_tree("Phylip/bs_interlaced.phy", "phylip", DNA=False) - - -class BootstrapTests(unittest.TestCase): - """Tests for pseudosampling alignments with fseqboot.""" - - def tearDown(self): - clean_up() - - def check_bootstrap(self, filename, format, align_type="d"): - """Check we can use fseqboot to pseudosample an alignment. 
- - The align_type type argument is passed to the commandline object to - set the output format to use (from [D]na,[p]rotein and [r]na ) - """ - self.assertTrue(os.path.isfile(filename), f"Missing {filename}") - cline = FSeqBootCommandline( - exes["fseqboot"], - sequence=filename, - outfile="test_file", - seqtype=align_type, - reps=2, - auto=True, - filter=True, - ) - stdout, stderr = cline() - # the resultant file should have 2 alignments... - with open("test_file") as handle: - bs = list(AlignIO.parse(handle, format)) - self.assertEqual(len(bs), 2) - # ..and each name in the original alignment... - with open(filename) as handle: - a_names = [s.name.replace(" ", "_") for s in AlignIO.read(handle, format)] - # ...should be in each alignment in the bootstrapped file - for a in bs: - self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a]) - - def test_bootstrap_phylip_DNA(self): - """Pseudosample a phylip DNA alignment.""" - self.check_bootstrap("Phylip/horses.phy", "phylip") - - def test_bootstrap_AlignIO_DNA(self): - """Pseudosample a phylip DNA alignment written with AlignIO.""" - n = AlignIO.convert( - "Clustalw/opuntia.aln", "clustal", "Phylip/opuntia.phy", "phylip" - ) - self.assertEqual(n, 1) - self.check_bootstrap("Phylip/opuntia.phy", "phylip") - - def test_bootstrap_phylip_protein(self): - """Pseudosample a phylip protein alignment.""" - self.check_bootstrap("Phylip/interlaced.phy", "phylip", "p") - - def test_bootstrap_AlignIO_protein(self): - """Pseudosample a phylip protein alignment written with AlignIO.""" - n = AlignIO.convert( - "Clustalw/hedgehog.aln", "clustal", "Phylip/hedgehog.phy", "phylip" - ) - self.check_bootstrap("Phylip/hedgehog.phy", "phylip", "p") - - -class TreeComparisonTests(unittest.TestCase): - """Tests for comparing phylogenetic trees with phylip tools.""" - - def tearDown(self): - clean_up() - - def test_fconsense(self): - """Calculate a consensus tree with fconsense.""" - cline = FConsenseCommandline( - exes["fconsense"], 
- intreefile="Phylip/horses.tree", - outtreefile="test_file", - auto=True, - filter=True, - ) - stdout, stderr = cline() - # Split the next and get_taxa into two steps to help 2to3 work - tree1 = next(parse_trees("test_file")) - taxa1 = tree1.get_taxa() - for tree in parse_trees("Phylip/horses.tree"): - taxa2 = tree.get_taxa() - self.assertEqual(sorted(taxa1), sorted(taxa2)) - - def test_ftreedist(self): - """Calculate the distance between trees with ftreedist.""" - cline = FTreeDistCommandline( - exes["ftreedist"], - intreefile="Phylip/horses.tree", - outfile="test_file", - auto=True, - filter=True, - ) - stdout, stderr = cline() - self.assertTrue(os.path.isfile("test_file")) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) - clean_up() diff --git a/Tests/test_Fasttree_tool.py b/Tests/test_Fasttree_tool.py deleted file mode 100644 index 2eccaa36f..000000000 --- a/Tests/test_Fasttree_tool.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2013 by Nate Sutton. All rights reserved. -# Based on test_Clustalw_tool.py by Peter Cock. -# Example code used from Biopython's Phylo cookbook by Eric Talevich. -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Tests for Fasttree tool.""" - -import itertools -import os -import sys -import unittest -import warnings -from io import StringIO - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import Phylo -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import ApplicationError - from Bio.Phylo.Applications import _Fasttree - from Bio.Phylo.Applications import FastTreeCommandline - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -fasttree_exe = None -if sys.platform == "win32": - try: - # This can vary depending on the Windows language. - prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files (x86)" - - # A default fasttree file path of "C:\Program Files (x86)\Fasttree.exe" - # was chosen here but users can alter the path according to where - # fasttree is located on their systems - - likely_dirs = ["", "FastTree"] - likely_exes = ["FastTree.exe"] - for folder in likely_dirs: - if os.path.isdir(os.path.join(prog_files, folder)): - for filename in likely_exes: - if os.path.isfile(os.path.join(prog_files, folder, filename)): - fasttree_exe = os.path.join(prog_files, folder, filename) - break - if fasttree_exe: - break -else: - from subprocess import getoutput - - # Website uses 'FastTree', Nate's system had 'fasttree' - likely_exes = ["FastTree", "fasttree"] - for filename in likely_exes: - # Checking the -help argument - output = getoutput(f"{filename} -help") - # Since "is not recognized" may be in another language, try and be sure this - # is really the fasttree tool's output - if ( - "is not recognized" not in output - and "protein_alignment" in output - and "nucleotide_alignment" in output - ): - fasttree_exe = filename - break - -if not fasttree_exe: - raise 
MissingExternalDependencyError( - "Install FastTree and correctly set the file path to the program " - "if you want to use it from Biopython." - ) - - -class FastTreeTestCase(unittest.TestCase): - def check(self, path, length): - input_records = SeqIO.to_dict(SeqIO.parse(path, "fasta")) - self.assertEqual(len(input_records), length) - # Any filenames with spaces should get escaped with quotes - # automatically. - # Using keyword arguments here. - cline = _Fasttree.FastTreeCommandline(fasttree_exe, input=path, nt=True) - self.assertEqual(str(eval(repr(cline))), str(cline)) - out, err = cline() - self.assertTrue(err.strip().startswith("FastTree")) - tree = Phylo.read(StringIO(out), "newick") - - names = {} - for clade in tree.find_clades(): - if clade.name: - self.assertNotIn(clade.name, names) - names[clade.name] = clade - - self.assertGreater(len(names), 0) - - def terminal_neighbor_dists(self): - """Return a list of distances between adjacent terminals.""" - - def generate_pairs(self): - pairs = itertools.tee(self) - next(pairs[1]) # Advance second iterator one step - return zip(pairs[0], pairs[1]) - - return [ - self.distance(*i) - for i in generate_pairs(self.find_clades(terminal=True)) - ] - - for dist in terminal_neighbor_dists(tree): - self.assertGreater(dist, 0.0) - - def test_normal(self): - self.check("Quality/example.fasta", 3) - - def test_filename_spaces(self): - path = "Clustalw/temp horses.fasta" # note spaces in filename - records = SeqIO.parse("Phylip/hennigian.phy", "phylip") - with open(path, "w") as handle: - length = SeqIO.write(records, handle, "fasta") - self.assertEqual(length, 10) - self.check(path, length) - - def test_invalid(self): - path = "Medline/pubmed_result1.txt" - cline = FastTreeCommandline(fasttree_exe, input=path) - with self.assertRaises(ApplicationError) as cm: - stdout, stderr = cline() - message = str(cm.exception) - self.assertTrue( - "invalid format" in message - or "not produced" in message - or "No sequences in file" in 
message - or "Error parsing header line:" in message - or "Non-zero return code " in message, - msg=f"Unknown ApplicationError raised: {message}", - ) - - def test_single(self): - path = "Fasta/f001" - records = list(SeqIO.parse(path, "fasta")) - self.assertEqual(len(records), 1) - cline = FastTreeCommandline(fasttree_exe, input=path) - stdout, stderr = cline() - self.assertIn("Unique: 1/1", stderr) - - def test_empty(self): - path = "does_not_exist.fasta" - cline = FastTreeCommandline(fasttree_exe, input=path) - with self.assertRaises(ApplicationError) as cm: - stdout, stderr = cline() - message = str(cm.exception) - self.assertTrue( - "Cannot open sequence file" in message - or "Cannot open sequence file" in message - or f"Cannot read {path}" in message - or "Non-zero return code " in message, - msg=f"Unknown ApplicationError raised: {message}", - ) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_MSAProbs_tool.py b/Tests/test_MSAProbs_tool.py deleted file mode 100644 index 35ec691e8..000000000 --- a/Tests/test_MSAProbs_tool.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2013 by Christian Brueffer. All rights reserved. -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Tests for MSAProbs tool.""" - -import os -import sys -import unittest -import warnings -from subprocess import getoutput - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import MSAProbsCommandline - from Bio.Application import ApplicationError - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -msaprobs_exe = None -try: - output = getoutput("msaprobs -version") - if output.startswith("MSAPROBS version"): - msaprobs_exe = "msaprobs" -except FileNotFoundError: - pass - -if not msaprobs_exe: - raise MissingExternalDependencyError( - "Install msaprobs if you want to use MSAProbs from Biopython." - ) - - -class MSAProbsTestCase(unittest.TestCase): - def setUp(self): - self.files_to_clean = set() - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - - def standard_test_procedure(self, cline): - """Shared testing procedure used by all tests.""" - # Mark output files for later cleanup. 
- self.add_file_to_clean(cline.outfile) - - input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta")) - self.assertEqual(str(eval(repr(cline))), str(cline)) - output, error = cline() - - def add_file_to_clean(self, filename): - """Add a file for deferred removal by the tearDown routine.""" - self.files_to_clean.add(filename) - - -################################################################# - - -class MSAProbsTestErrorConditions(MSAProbsTestCase): - def test_empty_file(self): - """Test an empty file.""" - input_file = "does_not_exist.fasta" - self.assertFalse(os.path.isfile(input_file)) - cline = MSAProbsCommandline(msaprobs_exe, infile=input_file) - try: - stdout, stderr = cline() - except ApplicationError as err: - self.assertTrue( - "Cannot open sequence file" in str(err) - or "Cannot open input file" in str(err) - or "Non-zero return code " in str(err), - str(err), - ) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_single_sequence(self): - """Test an input file containing a single sequence.""" - input_file = "Fasta/f001" - self.assertTrue(os.path.isfile(input_file)) - self.assertEqual(len(list(SeqIO.parse(input_file, "fasta"))), 1) - cline = MSAProbsCommandline(msaprobs_exe, infile=input_file) - try: - stdout, stderr = cline() - except ApplicationError as err: - if sys.platform == "win32": - expected = 0xC0000005 - elif sys.platform == "darwin": - expected = -11 - else: - expected = 139 # TODO: Check return codes on various other platforms - self.assertEqual(expected, err.returncode) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_invalid_format(self): - """Test an input file in an invalid format.""" - input_file = "Medline/pubmed_result1.txt" - self.assertTrue(os.path.isfile(input_file)) - cline = MSAProbsCommandline(msaprobs_exe, infile=input_file) - try: - stdout, stderr = cline() - except ApplicationError as err: - self.assertEqual(err.returncode, 1) - else: - 
self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - -################################################################# - - -class MSAProbsTestNormalConditions(MSAProbsTestCase): - def test_simple_fasta(self): - """Test a simple fasta file.""" - input_file = "Registry/seqs.fasta" - output_file = "temp_test.aln" - - cline = MSAProbsCommandline( - msaprobs_exe, infile=input_file, outfile=output_file, clustalw=True - ) - - self.standard_test_procedure(cline) - - def test_properties(self): - """Test setting options via properties.""" - input_file = "Registry/seqs.fasta" - output_file = "temp_test.aln" - - cline = MSAProbsCommandline(msaprobs_exe) - cline.infile = input_file - cline.outfile = output_file - cline.clustalw = True - - self.standard_test_procedure(cline) - - def test_input_filename_with_space(self): - """Test an input filename containing a space.""" - input_file = "Clustalw/temp horses.fasta" - with open(input_file, "w") as handle: - SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta") - output_file = "temp_test.aln" - - cline = MSAProbsCommandline( - msaprobs_exe, infile=input_file, outfile=output_file, clustalw=True - ) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - def test_output_filename_with_spaces(self): - """Test an output filename containing spaces.""" - input_file = "Registry/seqs.fasta" - output_file = "temp with spaces.aln" - - cline = MSAProbsCommandline( - msaprobs_exe, infile=input_file, outfile=output_file, clustalw=True - ) - - self.standard_test_procedure(cline) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Mafft_tool.py b/Tests/test_Mafft_tool.py deleted file mode 100644 index 94e143e86..000000000 --- a/Tests/test_Mafft_tool.py +++ /dev/null @@ -1,191 +0,0 @@ -# This code is part of the Biopython distribution and governed by its -# license. 
Please see the LICENSE file that should have been included -# as part of this package. -"""Unittests for Bio.Align.Applications interface for MAFFT.""" - -import os -import subprocess -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import MafftCommandline - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -mafft_exe = None -if sys.platform == "win32": - raise MissingExternalDependencyError( - "Testing with MAFFT not implemented on Windows yet" - ) -else: - from subprocess import getoutput - - output = getoutput("mafft -help") - if "not found" not in output and "not recognized" not in output: - if "MAFFT" in output: - mafft_exe = "mafft" -if not mafft_exe: - raise MissingExternalDependencyError( - "Install MAFFT if you want to use the Bio.Align.Applications wrapper." - ) - - -def check_mafft_version(mafft_exe): - child = subprocess.Popen( - f"{mafft_exe} --help", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - output = stdoutdata + "\n" + stderrdata - return_code = child.returncode - del child - if ( - "correctly installed?" in output - or "mafft binaries have to be installed" in output - ): - raise MissingExternalDependencyError( - "MAFFT does not seem to be correctly installed." - ) - - # e.g. "MAFFT version 5.732 (2005/09/14)\n" - # e.g. 
" MAFFT v6.717b (2009/12/03)\n" - for marker in ["MAFFT version", "MAFFT v"]: - index = output.find(marker) - if index == -1: - continue - version = output[index + len(marker) :].strip().split(None, 1)[0] - major = int(version.split(".", 1)[0]) - if major < 6: - raise MissingExternalDependencyError( - f"Test requires MAFFT v6 or later (found {version})." - ) - return (major, version) - raise MissingExternalDependencyError("Couldn't determine MAFFT version.") - - -# This also checks it actually runs! -version_major, version_string = check_mafft_version(mafft_exe) - - -class MafftApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/f002" - - def tearDown(self): - if os.path.isfile("Fasta/f002.tree"): - os.remove("Fasta/f002.tree") - - def test_Mafft_simple(self): - """Simple round-trip through app with infile, result passed to stdout.""" - # Use a keyword argument at init, - cmdline = MafftCommandline(mafft_exe, input=self.infile1) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdoutdata, stderrdata = cmdline() - self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680")) - # Used to get "Progressive alignment ..." but in v7.245 - # became "Progressive alignment 1/2..." and "Progressive alignment 2/2..." - self.assertTrue( - ("Progressive alignment ..." 
in stderrdata) - or ("Progressive alignment 1/" in stderrdata), - stderrdata, - ) - self.assertNotIn("$#=0", stderrdata) - - def test_Mafft_with_options(self): - """Simple round-trip through app with infile and options, result passed to stdout.""" - cmdline = MafftCommandline(mafft_exe) - cmdline.set_parameter("input", self.infile1) - cmdline.set_parameter("maxiterate", 100) - cmdline.set_parameter("--localpair", True) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdoutdata, stderrdata = cmdline() - self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680")) - self.assertNotIn("$#=0", stderrdata) - - def test_Mafft_with_Clustalw_output(self): - """Simple round-trip through app with clustal output.""" - cmdline = MafftCommandline(mafft_exe) - # Use some properties: - cmdline.input = self.infile1 - cmdline.clustalout = True - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdoutdata, stderrdata = cmdline() - # e.g. "CLUSTAL format alignment by MAFFT ..." - # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)" - self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata) - self.assertNotIn("$#=0", stderrdata) - - if version_major >= 7: - - def test_Mafft_with_PHYLIP_output(self): - """Simple round-trip through app with PHYLIP output.""" - cmdline = MafftCommandline(mafft_exe, input=self.infile1, phylipout=True) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdoutdata, stderrdata = cmdline() - # e.g. 
" 3 706\n" or " 3 681" but allow some variation in the column count - self.assertTrue( - stdoutdata.startswith((" 3 68", " 3 69", " 3 70")), - stdoutdata, - ) - self.assertIn("gi|1348912 ", stdoutdata, stdoutdata) - self.assertNotIn("gi|1348912|gb|G26680|G26680", stdoutdata, stdoutdata) - self.assertNotIn("$#=0", stderrdata) - - def test_Mafft_with_PHYLIP_namelength(self): - """Check PHYLIP with --namelength.""" - cmdline = MafftCommandline( - mafft_exe, input=self.infile1, phylipout=True, namelength=50 - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdoutdata, stderrdata = cmdline() - # e.g. " 3 706\n" or " 3 681" but allow some variation in the column count - self.assertTrue( - stdoutdata.startswith((" 3 68", " 3 69", " 3 70")), - stdoutdata, - ) - self.assertIn("gi|1348912|gb|G26680|G26680", stdoutdata, stdoutdata) - self.assertNotIn("$#=0", stderrdata) - - def test_Mafft_with_complex_command_line(self): - """Round-trip with complex command line.""" - cmdline = MafftCommandline(mafft_exe) - cmdline.set_parameter("input", self.infile1) - cmdline.set_parameter("--localpair", True) - cmdline.set_parameter("--weighti", 4.2) - cmdline.set_parameter("retree", 5) - cmdline.set_parameter("maxiterate", 200) - cmdline.set_parameter("--nofft", True) - cmdline.set_parameter("op", 2.04) - cmdline.set_parameter("--ep", 0.51) - cmdline.set_parameter("--lop", 0.233) - cmdline.set_parameter("lep", 0.2) - cmdline.set_parameter("--reorder", True) - cmdline.set_parameter("--treeout", True) - cmdline.set_parameter("nuc", True) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - self.assertEqual( - str(cmdline), - mafft_exe - + " --localpair --weighti 4.2 --retree 5 " - + "--maxiterate 200 --nofft --op 2.04 --ep 0.51" - + " --lop 0.233 --lep 0.2 --reorder --treeout" - + " --nuc Fasta/f002", - ) - stdoutdata, stderrdata = cmdline() - self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680")) - self.assertNotIn("$#=0", stderrdata) - - -if __name__ 
== "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Muscle_tool.py b/Tests/test_Muscle_tool.py deleted file mode 100644 index 90c587e15..000000000 --- a/Tests/test_Muscle_tool.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2009-2013 by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Tests for Muscle tool.""" - -import os -import subprocess -import sys -import unittest -import warnings - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import MuscleCommandline - from Bio.Application import _escape_filename - - -################################################################# - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -muscle_exe = None -if sys.platform == "win32": - try: - # This can vary depending on the Windows language. - prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files" - # For Windows, MUSCLE just comes as a zip file which contains the - # a Muscle directory with the muscle.exe file plus a readme etc, - # which the user could put anywhere. We'll try a few sensible - # locations under Program Files... and then the full path. 
- likely_dirs = [ - "", # Current dir - prog_files, - os.path.join(prog_files, "Muscle3.6"), - os.path.join(prog_files, "Muscle3.7"), - os.path.join(prog_files, "Muscle3.8"), - os.path.join(prog_files, "Muscle3.9"), - os.path.join(prog_files, "Muscle"), - ] + sys.path - for folder in likely_dirs: - if os.path.isdir(folder): - if os.path.isfile(os.path.join(folder, "muscle.exe")): - muscle_exe = os.path.join(folder, "muscle.exe") - break - if muscle_exe: - break -else: - from subprocess import getoutput - - output = getoutput("muscle -version") - # Since "not found" may be in another language, try and be sure this is - # really the MUSCLE tool's output - if "not found" not in output and "not recognized" not in output: - if "MUSCLE" in output and "Edgar" in output: - muscle_exe = "muscle" - -if not muscle_exe: - raise MissingExternalDependencyError( - "Install MUSCLE if you want to use the Bio.Align.Applications wrapper." - ) - -################################################################# - - -class MuscleApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/f002" - self.infile2 = "Fasta/fa01" - self.infile3 = "Fasta/f001" - self.outfile1 = "Fasta/temp align out1.fa" # with spaces! 
- self.outfile2 = "Fasta/temp_align_out2.fa" - self.outfile3 = "Fasta/temp_align_out3.fa" - self.outfile4 = "Fasta/temp_align_out4.fa" - - def tearDown(self): - if os.path.isfile(self.outfile1): - os.remove(self.outfile1) - if os.path.isfile(self.outfile2): - os.remove(self.outfile2) - if os.path.isfile(self.outfile3): - os.remove(self.outfile3) - if os.path.isfile(self.outfile4): - os.remove(self.outfile4) - - def test_Muscle_simple(self): - """Simple round-trip through app just infile and outfile.""" - cmdline = MuscleCommandline(muscle_exe, input=self.infile1, out=self.outfile1) - self.assertEqual( - str(cmdline), - _escape_filename(muscle_exe) - + ' -in Fasta/f002 -out "Fasta/temp align out1.fa"', - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - output, error = cmdline() - self.assertEqual(output, "") - self.assertNotIn("ERROR", error) - - def test_Muscle_with_options(self): - """Round-trip through app with a switch and valued option.""" - cmdline = MuscleCommandline(muscle_exe) - cmdline.set_parameter("input", self.infile1) # "input" is alias for "in" - cmdline.set_parameter("out", self.outfile2) - # Use property: - cmdline.objscore = "sp" - cmdline.noanchors = True - self.assertEqual( - str(cmdline), - _escape_filename(muscle_exe) - + " -in Fasta/f002 -out Fasta/temp_align_out2.fa -objscore sp -noanchors", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - output, error = cmdline() - self.assertEqual(output, "") - self.assertNotIn("ERROR", error) - self.assertTrue(error.strip().startswith("MUSCLE"), output) - - def test_Muscle_profile_simple(self): - """Simple round-trip through app doing a profile alignment.""" - cmdline = MuscleCommandline(muscle_exe) - cmdline.set_parameter("out", self.outfile3) - cmdline.set_parameter("profile", True) - cmdline.set_parameter("in1", self.infile2) - cmdline.set_parameter("in2", self.infile3) - self.assertEqual( - str(cmdline), - _escape_filename(muscle_exe) - + " -out Fasta/temp_align_out3.fa 
-profile -in1 Fasta/fa01 -in2 Fasta/f001", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - output, error = cmdline() - self.assertEqual(output, "") - self.assertNotIn("ERROR", error) - self.assertTrue(error.strip().startswith("MUSCLE"), output) - - def test_Muscle_profile_with_options(self): - """Profile alignment, and switch and valued options.""" - # Using some keyword arguments, note -stable isn't supported in v3.8 - cmdline = MuscleCommandline( - muscle_exe, - out=self.outfile4, - in1=self.infile2, - in2=self.infile3, - profile=True, - stable=True, - cluster1="neighborjoining", - ) - self.assertEqual( - str(cmdline), - _escape_filename(muscle_exe) - + " -out Fasta/temp_align_out4.fa -profile -in1 Fasta/fa01 -in2 Fasta/f001" - + " -cluster1 neighborjoining -stable", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - """ - #TODO - Why doesn't this work with MUSCLE 3.6 on the Mac? - #It may be another bug fixed in MUSCLE 3.7 ... - result, stdout, stderr = generic_run(cmdline) - #NOTE: generic_run has been removed from Biopython - self.assertEqual(result.return_code, 0) - self.assertEqual(stdout.read(), "") - self.assertNotIn("ERROR", stderr.read()) - self.assertEqual(str(result._cl), str(cmdline)) - """ - - -class SimpleAlignTest(unittest.TestCase): - """Simple MUSCLE tests.""" - - """ - #FASTA output seems broken on Muscle 3.6 (on the Mac). - def test_simple_fasta(self): - input_file = "Fasta/f002" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file,"fasta")) - #Prepare the command... 
- cmdline = MuscleCommandline(muscle_exe) - cmdline.set_parameter("in", input_file) - #Preserve input record order (makes checking output easier) - cmdline.set_parameter("stable") - #Set some others options just to test them - cmdline.set_parameter("maxiters", 2) - self.assertEqual(str(cmdline).rstrip(), "muscle -in Fasta/f002 -maxiters 2 -stable") - result, out_handle, err_handle = generic_run(cmdline) - #NOTE: generic_run has been removed from Biopython - print(err_handle.read()) - print(out_handle.read()) - align = AlignIO.read(out_handle, "fasta") - self.assertEqual(len(records),len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-",""), old.seq) - """ - - def test_simple_msf(self): - """Simple muscle call using MSF output.""" - input_file = "Fasta/f002" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file, "fasta")) - records.sort(key=lambda rec: rec.id) # noqa: E731 - cmdline = MuscleCommandline(muscle_exe, input=input_file, msf=True) - self.assertEqual( - str(cmdline).rstrip(), _escape_filename(muscle_exe) + " -in Fasta/f002 -msf" - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - child = subprocess.Popen( - str(cmdline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - # Didn't use -quiet so there should be progress reports on stderr, - align = AlignIO.read(child.stdout, "msf") - align.sort() # by record.id - self.assertTrue(child.stderr.read().strip().startswith("MUSCLE")) - return_code = child.wait() - self.assertEqual(return_code, 0) - child.stdout.close() - child.stderr.close() - del child - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-", ""), old.seq) - - def test_simple_clustal(self): - """Simple muscle call using Clustal output with a MUSCLE 
header.""" - input_file = "Fasta/f002" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file, "fasta")) - records.sort(key=lambda rec: rec.id) # noqa: E731 - # Prepare the command... use Clustal output (with a MUSCLE header) - cmdline = MuscleCommandline(muscle_exe, input=input_file, clw=True) - self.assertEqual( - str(cmdline).rstrip(), _escape_filename(muscle_exe) + " -in Fasta/f002 -clw" - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - child = subprocess.Popen( - str(cmdline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - # Didn't use -quiet so there should be progress reports on stderr, - align = AlignIO.read(child.stdout, "clustal") - align.sort() # by record.id - self.assertTrue(child.stderr.read().strip().startswith("MUSCLE")) - return_code = child.wait() - self.assertEqual(return_code, 0) - child.stdout.close() - child.stderr.close() - del child - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-", ""), old.seq) - - def test_simple_clustal_strict(self): - """Simple muscle call using strict Clustal output.""" - input_file = "Fasta/f002" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file, "fasta")) - records.sort(key=lambda rec: rec.id) # noqa: E731 - # Prepare the command... - cmdline = MuscleCommandline(muscle_exe) - cmdline.set_parameter("in", input_file) - # Use clustal output (with a CLUSTAL header) - cmdline.set_parameter("clwstrict", True) # Default None treated as False! 
- self.assertEqual( - str(cmdline).rstrip(), - _escape_filename(muscle_exe) + " -in Fasta/f002 -clwstrict", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - child = subprocess.Popen( - str(cmdline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - # Didn't use -quiet so there should be progress reports on stderr, - align = AlignIO.read(child.stdout, "clustal") - align.sort() - self.assertTrue(child.stderr.read().strip().startswith("MUSCLE")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-", ""), old.seq) - return_code = child.wait() - self.assertEqual(return_code, 0) - child.stdout.close() - child.stderr.close() - del child - - def test_long(self): - """Simple muscle call using long file.""" - # Create a large input file by converting some of another example file - temp_large_fasta_file = "temp_cw_prot.fasta" - records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40] - SeqIO.write(records, temp_large_fasta_file, "fasta") - # Prepare the command... - cmdline = MuscleCommandline(muscle_exe) - cmdline.set_parameter("in", temp_large_fasta_file) - # Use fast options - cmdline.set_parameter("maxiters", 1) - cmdline.set_parameter("diags", True) # Default None treated as False! - # Use clustal output - cmdline.set_parameter("clwstrict", True) # Default None treated as False! - # Shouldn't need this, but just to make sure it is accepted - cmdline.set_parameter("maxhours", 0.1) - # No progress reports to stderr - cmdline.set_parameter("quiet", True) # Default None treated as False! 
- self.assertEqual( - str(cmdline).rstrip(), - _escape_filename(muscle_exe) - + " -in temp_cw_prot.fasta -diags -maxhours 0.1" - + " -maxiters 1 -clwstrict -quiet", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - child = subprocess.Popen( - str(cmdline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - align = AlignIO.read(child.stdout, "clustal") - align.sort() - records.sort(key=lambda rec: rec.id) # noqa: E731 - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-", ""), old.seq) - # See if quiet worked: - self.assertEqual("", child.stderr.read().strip()) - return_code = child.wait() - self.assertEqual(return_code, 0) - child.stdout.close() - child.stderr.close() - del child - os.remove(temp_large_fasta_file) - - def test_using_stdin(self): - """Simple alignment using stdin.""" - input_file = "Fasta/f002" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file, "fasta")) - # Prepare the command... use Clustal output (with a MUSCLE header) - cline = MuscleCommandline(muscle_exe, clw=True) - self.assertEqual(str(cline).rstrip(), _escape_filename(muscle_exe) + " -clw") - self.assertEqual(str(eval(repr(cline))), str(cline)) - child = subprocess.Popen( - str(cline), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - SeqIO.write(records, child.stdin, "fasta") - child.stdin.close() - # Alignment will now run... 
- align = AlignIO.read(child.stdout, "clustal") - align.sort() - records.sort(key=lambda rec: rec.id) # noqa: E731 - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual(str(new.seq).replace("-", ""), old.seq) - self.assertEqual(0, child.wait()) - child.stdout.close() - child.stderr.close() - del child - - def test_with_multiple_output_formats(self): - """Simple muscle call with multiple output formats.""" - input_file = "Fasta/f002" - output_html = "temp_f002.html" - output_clwstrict = "temp_f002.clw" - self.assertTrue(os.path.isfile(input_file)) - records = list(SeqIO.parse(input_file, "fasta")) - records.sort(key=lambda rec: rec.id) # noqa: E731 - # Prepare the command... use Clustal output (with a MUSCLE header) - cmdline = MuscleCommandline( - muscle_exe, - input=input_file, - clw=True, - htmlout=output_html, - clwstrictout=output_clwstrict, - ) - self.assertEqual( - str(cmdline).rstrip(), - _escape_filename(muscle_exe) - + " -in Fasta/f002 -clw -htmlout temp_f002.html" - + " -clwstrictout temp_f002.clw", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - child = subprocess.Popen( - str(cmdline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - # Clustalw on stdout: - align = AlignIO.read(child.stdout, "clustal") - align.sort() - # Didn't use -quiet so there should be progress reports on stderr, - self.assertTrue(child.stderr.read().strip().startswith("MUSCLE")) - return_code = child.wait() - self.assertEqual(return_code, 0) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - child.stdout.close() - child.stderr.close() - del child - handle = open(output_html) - html = handle.read().strip().upper() - handle.close() - self.assertTrue(html.startswith("")) - # ClustalW strict: - align = AlignIO.read(output_clwstrict, "clustal") - 
align.sort() - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - os.remove(output_html) - os.remove(output_clwstrict) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_NCBI_BLAST_tools.py b/Tests/test_NCBI_BLAST_tools.py deleted file mode 100644 index d999466b0..000000000 --- a/Tests/test_NCBI_BLAST_tools.py +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright 2009-2013 by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. -# -# This unit test attempts to locate the blastall executable and the nr -# database. - -"""Tests for NCBI BLAST tools module.""" - -import os -import os.path -import re -import subprocess -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import _escape_filename - from Bio.Blast import Applications - -# TODO - On windows, can we use the ncbi.ini file? -wanted = [ - "blastx", - "blastp", - "blastn", - "tblastn", - "tblastx", - "rpsblast+", # For Debian - "rpsblast", - "rpstblastn", - "psiblast", - "blast_formatter", - "deltablast", - "makeblastdb", -] -exe_names = {} - -if sys.platform == "win32": - # The Windows 32 bit BLAST 2.2.22+ installer does add itself to the path, - # and by default installs to C:\Program Files\NCBI\BLAST-2.2.22+\bin - # To keep things simple, assume BLAST+ is on the path on Windows. 
- # - # On Windows the environment variable name isn't case sensitive, - # but must split on ";" not ":" - likely_dirs = os.environ.get("PATH", "").split(";") -else: - likely_dirs = os.environ.get("PATH", "").split(":") - -for folder in likely_dirs: - if not os.path.isdir(folder): - continue - # Loop over copy as will remove entries from wanted: - for name in wanted[:]: - if sys.platform == "win32": - exe_name = os.path.join(folder, name + ".exe") - else: - exe_name = os.path.join(folder, name) - if not os.path.isfile(exe_name): - continue - # To tell the old and new rpsblast apart (since I have both on - # my path and the old blast has priority), try -h as a parameter. - # This should also reject WU-BLAST (since it doesn't like -h). - child = subprocess.Popen( - exe_name + " -h", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - output, error = child.communicate() - if child.returncode == 0 and "ERROR: Invalid argument: -h" not in output: - # Special case, blast_formatter from BLAST 2.2.23+ (i.e. BLAST+) - # has mandatory argument -rid, but no -archive. We don't support it. 
- if name == "blast_formatter" and " -archive " not in output: - continue - exe_names[name] = exe_name - wanted.remove(name) # can stop search for this now - # else: - # print("Rejecting %r" % exe_name) - del exe_name, name - -# To avoid the name clash with legacy BLAST, Debian introduced rpsblast+ alias -if "rpsblast+" in wanted: - wanted.remove("rpsblast+") -if "rpsblast+" in exe_names: - exe_names["rpsblast"] = exe_names["rpsblast+"] - del exe_names["rpsblast+"] - -# We can cope with blast_formatter being missing, only added in BLAST 2.2.24+ -# We can cope with deltablast being missing, only added in BLAST 2.2.26+ -optional = ["blast_formatter", "deltablast"] -if len(set(exe_names).difference(optional)) < len(set(wanted).difference(optional)): - raise MissingExternalDependencyError( - "Install the NCBI BLAST+ command line tools if you want to use the " - "Bio.Blast.Applications wrapper." - ) - - -class Pairwise(unittest.TestCase): - def test_blastp(self): - """Pairwise BLASTP search.""" - global exe_names - cline = Applications.NcbiblastpCommandline( - exe_names["blastp"], - query="Fasta/rose.pro", - subject="GenBank/NC_005816.faa", - evalue=1, - ) - self.assertEqual( - str(cline), - _escape_filename(exe_names["blastp"]) - + " -query Fasta/rose.pro -evalue 1" - + " -subject GenBank/NC_005816.faa", - ) - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - self.assertEqual( - return_code, 0, "Got error code %i back from:\n%s" % (return_code, cline) - ) - # Used to get 10 matches from 10 pairwise searches, - # as of NCBI BLAST+ 2.3.0 only get 1 Query= line: - if stdoutdata.count("Query= ") == 10: - if stdoutdata.count("***** No hits found *****") == 7: - # This happens with BLAST 2.2.26+ which is potentially a bug - pass - else: - self.assertEqual(9, stdoutdata.count("***** No 
hits found *****")) - else: - # Assume this is NCBI BLAST+ 2.3.0 or later, - self.assertEqual(1, stdoutdata.count("Query= ")) - self.assertEqual(0, stdoutdata.count("***** No hits found *****")) - - def test_blastn(self): - """Pairwise BLASTN search.""" - global exe_names - cline = Applications.NcbiblastnCommandline( - exe_names["blastn"], - query="GenBank/NC_005816.ffn", - subject="GenBank/NC_005816.fna", - evalue="0.000001", - ) - self.assertEqual( - str(cline), - _escape_filename(exe_names["blastn"]) - + " -query GenBank/NC_005816.ffn -evalue 0.000001" - + " -subject GenBank/NC_005816.fna", - ) - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - self.assertEqual( - return_code, 0, "Got error code %i back from:\n%s" % (return_code, cline) - ) - self.assertEqual(10, stdoutdata.count("Query= ")) - self.assertEqual(0, stdoutdata.count("***** No hits found *****")) - # TODO - Parse it? - - def test_tblastn(self): - """Pairwise TBLASTN search.""" - global exe_names - cline = Applications.NcbitblastnCommandline( - exe_names["tblastn"], - query="GenBank/NC_005816.faa", - subject="GenBank/NC_005816.fna", - evalue="1e-6", - ) - self.assertEqual( - str(cline), - _escape_filename(exe_names["tblastn"]) - + " -query GenBank/NC_005816.faa -evalue 1e-6" - + " -subject GenBank/NC_005816.fna", - ) - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - self.assertEqual( - return_code, 0, "Got error code %i back from:\n%s" % (return_code, cline) - ) - self.assertEqual(10, stdoutdata.count("Query= ")) - self.assertEqual(0, stdoutdata.count("***** No hits found *****")) - # TODO - Parse it? 
- - -class BlastDB(unittest.TestCase): - def test_requires_dbtype(self): - """Check that dbtype throws error if not set.""" - global exe_names - cline = Applications.NcbimakeblastdbCommandline( - exe_names["makeblastdb"], input_file="GenBank/NC_005816.faa" - ) - with self.assertRaises(ValueError): - str(cline) - - def test_fasta_db_prot(self): - """Test makeblastdb wrapper with protein database.""" - global exe_names - cline = Applications.NcbimakeblastdbCommandline( - exe_names["makeblastdb"], - input_file="GenBank/NC_005816.faa", - dbtype="prot", - hash_index=True, - max_file_sz="20MB", - parse_seqids=True, - taxid=10, - ) - - self.assertEqual( - str(cline), - _escape_filename(exe_names["makeblastdb"]) - + " -dbtype prot -in GenBank/NC_005816.faa" - " -parse_seqids -hash_index -max_file_sz 20MB" - " -taxid 10", - ) - - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phd")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phi")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phr")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.pin")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.pog")) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.faa.psd") - or os.path.isfile("GenBank/NC_005816.faa.pnd") - ) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.faa.psi") - or os.path.isfile("GenBank/NC_005816.faa.pni") - ) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.psq")) - - def test_fasta_db_prot_legacy(self): - """Test makeblastdb wrapper with protein database legacy, version 4.""" - global exe_names - cline = Applications.NcbimakeblastdbCommandline( - exe_names["makeblastdb"], - blastdb_version=4, - input_file="GenBank/NC_005816.faa", - dbtype="prot", - hash_index=True, - 
max_file_sz="20MB", - parse_seqids=True, - taxid=10, - ) - - self.assertEqual( - str(cline), - _escape_filename(exe_names["makeblastdb"]) + " -blastdb_version 4" - " -dbtype prot -in GenBank/NC_005816.faa" - " -parse_seqids -hash_index -max_file_sz 20MB" - " -taxid 10", - ) - - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phd")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phi")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.phr")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.pin")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.pog")) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.faa.psd") - or os.path.isfile("GenBank/NC_005816.faa.pnd") - ) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.faa.psi") - or os.path.isfile("GenBank/NC_005816.faa.pni") - ) - self.assertTrue(os.path.isfile("GenBank/NC_005816.faa.psq")) - - def test_fasta_db_nucl(self): - """Test makeblastdb wrapper with nucleotide database.""" - global exe_names - cline = Applications.NcbimakeblastdbCommandline( - exe_names["makeblastdb"], - input_file="GenBank/NC_005816.fna", - dbtype="nucl", - hash_index=True, - max_file_sz="20MB", - parse_seqids=True, - taxid=10, - ) - - self.assertEqual( - str(cline), - _escape_filename(exe_names["makeblastdb"]) - + " -dbtype nucl -in GenBank/NC_005816.fna" - " -parse_seqids -hash_index -max_file_sz 20MB" - " -taxid 10", - ) - - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - return_code = child.returncode - - self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nhd")) - 
self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nhi")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nhr")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nin")) - self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nog")) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.fna.nsd") - or os.path.isfile("GenBank/NC_005816.fna.nnd") - ) - self.assertTrue( - os.path.isfile("GenBank/NC_005816.fna.nsi") - or os.path.isfile("GenBank/NC_005816.fna.nni") - ) - self.assertTrue(os.path.isfile("GenBank/NC_005816.fna.nsq")) - - # makeblastdb makes files in the same dir as the input, clean these up - def tearDown(self): - blastdb_matcher_prot = re.compile(r"NC_005816\.faa\.p.+") - for file in os.listdir("GenBank/"): - if blastdb_matcher_prot.match(file): - path = os.path.join("GenBank/", file) - os.remove(path) - - blastdb_matcher_nucl = re.compile(r"NC_005816\.fna\.n.+") - for file in os.listdir("GenBank/"): - if blastdb_matcher_nucl.match(file): - path = os.path.join("GenBank/", file) - os.remove(path) - - -class CheckCompleteArgList(unittest.TestCase): - def check(self, exe_name, wrapper): - global exe_names - exe = exe_names[exe_name] - # dbtype must be set to initialize NcbimakeblastdbCommandline - if exe_name == "makeblastdb": - cline = wrapper(exe, h=True, dbtype="prot") - else: - cline = wrapper(exe, h=True) - names = {parameter.names[0] for parameter in cline.parameters} - - child = subprocess.Popen( - str(cline), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=(sys.platform != "win32"), - ) - stdoutdata, stderrdata = child.communicate() - self.assertEqual(stderrdata, "", f"{cline}\n{stderrdata}") - names_in_tool = set() - while stdoutdata: - index = stdoutdata.find("[") - if index == -1: - break - stdoutdata = stdoutdata[index + 1 :] - index = stdoutdata.find("]") - assert index != -1 - name = stdoutdata[:index] - if " " in name: - name = name.split(None, 1)[0] - names_in_tool.add(name) - stdoutdata 
= stdoutdata[index + 1 :] - - # An almost trivial example to test any validation - if "-query" in names: - cline = wrapper(exe, query="dummy") - elif "-archive" in names: - cline = wrapper(exe, archive="dummy") - str(cline) - - def test_blastx(self): - """Check all blastx arguments are supported.""" - self.check("blastx", Applications.NcbiblastxCommandline) - - def test_blastp(self): - """Check all blastp arguments are supported.""" - self.check("blastp", Applications.NcbiblastpCommandline) - - def test_blastn(self): - """Check all blastn arguments are supported.""" - self.check("blastn", Applications.NcbiblastnCommandline) - - def test_tblastx(self): - """Check all tblastx arguments are supported.""" - self.check("tblastx", Applications.NcbitblastxCommandline) - - def test_tblastn(self): - """Check all tblastn arguments are supported.""" - self.check("tblastn", Applications.NcbitblastnCommandline) - - def test_psiblast(self): - """Check all psiblast arguments are supported.""" - self.check("psiblast", Applications.NcbipsiblastCommandline) - - def test_rpsblast(self): - """Check all rpsblast arguments are supported.""" - self.check("rpsblast", Applications.NcbirpsblastCommandline) - - def test_rpstblastn(self): - """Check all rpstblastn arguments are supported.""" - self.check("rpstblastn", Applications.NcbirpstblastnCommandline) - - def test_makeblastdb(self): - """Check all makeblastdb arguments are supported.""" - self.check("makeblastdb", Applications.NcbimakeblastdbCommandline) - - if "blast_formatter" in exe_names: - - def test_blast_formatter(self): - """Check all blast_formatter arguments are supported.""" - self.check("blast_formatter", Applications.NcbiblastformatterCommandline) - - if "deltablast" in exe_names: - - def test_deltablast(self): - """Check all deltablast arguments are supported.""" - self.check("deltablast", Applications.NcbideltablastCommandline) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - 
unittest.main(testRunner=runner) diff --git a/Tests/test_PopGen_GenePop.py b/Tests/test_PopGen_GenePop.py deleted file mode 100644 index f50dda96a..000000000 --- a/Tests/test_PopGen_GenePop.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2009 by Tiago Antao . All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Test GenePop.""" - -import os -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.PopGen.GenePop.Controller import GenePopController - -# Tests genepop related code. Note: this case requires genepop -# test_PopGen_GenePop_nodepend tests code that does not require genepop - -found = False -for path in os.environ["PATH"].split(os.pathsep): - try: - for filename in os.listdir(path): - if filename.startswith("Genepop"): - found = True - except OSError: - pass # Path doesn't exist - correct to pass -if not found: - raise MissingExternalDependencyError( - "Install GenePop if you want to use Bio.PopGen.GenePop." 
- ) - - -class AppTest(unittest.TestCase): - """Tests genepop execution via biopython.""" - - def test_allele_genotype_frequencies(self): - """Test genepop execution on basic allele and genotype frequencies.""" - ctrl = GenePopController() - path = os.path.join("PopGen", "big.gen") - pop_iter, locus_iter = ctrl.calc_allele_genotype_freqs(path) - # print("%s %s" % (pop, loci)) - # for popc in pop_iter: - # pop_name, loci_content = popc - # print(pop_name) - # for locus in loci_content: - # geno_list, hets, freq_fis = loci_content[locus] - # print(locus) - # print(hets) - # print(freq_fis) - # print(geno_list) - # print("") - - def test_calc_diversities_fis_with_identity(self): - """Test calculations of diversities.""" - ctrl = GenePopController() - path = os.path.join("PopGen", "big.gen") - iter, avg_fis, avg_Qintra = ctrl.calc_diversities_fis_with_identity(path) - liter = list(iter) - self.assertEqual(len(liter), 37) - self.assertEqual(liter[0][0], "Locus1") - self.assertEqual(len(avg_fis), 10) - self.assertEqual(len(avg_Qintra), 10) - - def test_estimate_nm(self): - """Test Nm estimation.""" - ctrl = GenePopController() - path = os.path.join("PopGen", "big.gen") - ( - mean_sample_size, - mean_priv_alleles, - mig10, - mig25, - mig50, - mig_corrected, - ) = ctrl.estimate_nm(path) - self.assertAlmostEqual(mean_sample_size, 28.0) - self.assertAlmostEqual(mean_priv_alleles, 0.016129) - self.assertAlmostEqual(mig10, 52.5578) - self.assertAlmostEqual(mig25, 15.3006) - self.assertAlmostEqual(mig50, 8.94583) - self.assertAlmostEqual(mig_corrected, 13.6612) - - def test_fst_all(self): - """Test genepop execution on all fst.""" - ctrl = GenePopController() - path = os.path.join("PopGen", "c2line.gen") - (allFis, allFst, allFit), itr = ctrl.calc_fst_all(path) - results = list(itr) - self.assertEqual(len(results), 3) - self.assertEqual(results[0][0], "136255903") - self.assertAlmostEqual(results[1][3], 0.335846) - - def test_haploidy(self): - """Test haploidy.""" - ctrl = 
GenePopController() - path = os.path.join("PopGen", "haplo.gen") - (allFis, allFst, allFit), itr = ctrl.calc_fst_all(path) - litr = list(itr) - self.assertNotIsInstance(allFst, int) - self.assertEqual(len(litr), 37) - self.assertEqual(litr[36][0], "Locus37") - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_PopGen_GenePop_EasyController.py b/Tests/test_PopGen_GenePop_EasyController.py deleted file mode 100644 index 796775695..000000000 --- a/Tests/test_PopGen_GenePop_EasyController.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2009 by Tiago Antao . All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -"""Tests for GenePop easy-controller.""" - -import os -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.PopGen.GenePop.EasyController import EasyController - -# Tests genepop related code for easy controller. Note: this requires genepop -# test_PopGen_GenePop_nodepend tests code that does not require genepop - -found = False -for path in os.environ["PATH"].split(os.pathsep): - try: - for filename in os.listdir(path): - if filename.startswith("Genepop"): - found = True - except OSError: - pass # Path doesn't exist - correct to pass -if not found: - raise MissingExternalDependencyError( - "Install GenePop if you want to use Bio.PopGen.GenePop." - ) - - -cur_dir = os.path.abspath(".") # Tests directory - - -class AppTest(unittest.TestCase): - """Tests genepop execution via biopython using EasyController.""" - - def setUp(self): - """Change working directory.""" - # Genepop likes to be on the directory where the file is. 
- os.chdir("PopGen") - self.ctrl = EasyController("big.gen") - - def tearDown(self): - """Restore working directory.""" - os.chdir(cur_dir) - - def test_basic_info(self): - """Test basic info.""" - pops, loci = self.ctrl.get_basic_info() - self.assertEqual(len(pops), 10) - self.assertEqual(len(loci), 37) - - def test_get_heterozygosity_info(self): - """Test heterozygosity info.""" - hz_info = self.ctrl.get_heterozygosity_info(0, "Locus2") - self.assertEqual(hz_info[1], 24) - self.assertEqual(hz_info[3], 7) - - def test_get_alleles(self): - """Test get alleles.""" - # Returns keys of a dict, so order is Python implementation dependent - self.assertCountEqual(self.ctrl.get_alleles(0, "Locus3"), [3, 20]) - - def test_get_alleles_all_pops(self): - """Test get alleles for all populations.""" - self.assertEqual(self.ctrl.get_alleles_all_pops("Locus4"), [1, 3]) - - def test_get_fis(self): - """Test get Fis.""" - alleles, overall = self.ctrl.get_fis(0, "Locus2") - self.assertEqual(alleles[3][0], 55) - self.assertEqual(overall[0], 62) - - def test_get_allele_frequency(self): - """Test allele frequency.""" - tot_genes, alleles = self.ctrl.get_allele_frequency(0, "Locus2") - self.assertEqual(tot_genes, 62) - self.assertLess(abs(alleles[20] - 0.113), 0.05) - - def test_get_genotype_count(self): - """Test genotype count.""" - self.assertEqual(len(self.ctrl.get_genotype_count(0, "Locus2")), 3) - - def test_estimate_nm(self): - """Test Nm estimation.""" - nms = self.ctrl.estimate_nm() - self.assertEqual(nms[0], 28.0) - - def test_hwe_excess(self): - """Test Hardy-Weinberg Equilibrium.""" - hwe_excess = self.ctrl.test_hw_pop(0, "excess") - self.assertEqual(hwe_excess["Locus1"], (0.4955, None, -0.16, -0.1623, 5)) - - # These tests are frequently failing, possibly due to a Genepop problem. 
- # def test_get_avg_fst_pair_locus(self): - # """Test get average Fst for pairwise pops on a locus.""" - # self.assertEqual(len(self.ctrl.get_avg_fst_pair_locus("Locus4")), 45) - # - # def test_get_avg_fst_pair(self): - # """Test get pairwise Fst.""" - # pop_fis = self.ctrl.get_avg_fst_pair() - # self.assertEqual(len(pop_fis), 45) - - def test_get_avg_fis(self): - """Test average Fis.""" - self.ctrl.get_avg_fis() - - def test_get_multilocus_f_stats(self): - """Test multilocus F stats.""" - mf = self.ctrl.get_multilocus_f_stats() - self.assertEqual(len(mf), 3) - self.assertLess(mf[0], 0.1) - - def test_get_f_stats(self): - """Test F stats.""" - fs = self.ctrl.get_f_stats("Locus2") - self.assertEqual(len(fs), 5) - self.assertLess(fs[0], 0) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Prank_tool.py b/Tests/test_Prank_tool.py deleted file mode 100644 index 4ed3af46d..000000000 --- a/Tests/test_Prank_tool.py +++ /dev/null @@ -1,229 +0,0 @@ -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. -"""Unittests for Bio.Align.Applications interface for PRANK.""" - -import os -import sys -import unittest -import warnings - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO -from Bio.Nexus.Nexus import NexusError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import PrankCommandline - from Bio.Application import _escape_filename - - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -prank_exe = None -if sys.platform == "win32": - try: - # This can vary depending on the Windows language. 
- prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files" - # For Windows, PRANK just comes as a zip file which contains the - # prank.exe file which the user could put anywhere. We'll try a few - # sensible locations under Program Files... and then the full path. - likely_dirs = [ - "", # Current dir - prog_files, - os.path.join(prog_files, "Prank"), - ] + sys.path - for folder in likely_dirs: - if os.path.isdir(folder): - if os.path.isfile(os.path.join(folder, "prank.exe")): - prank_exe = os.path.join(folder, "prank.exe") - break - if prank_exe: - break -else: - from subprocess import getoutput - - output = getoutput("prank") - if "not found" not in output and "not recognized" not in output: - if "prank" in output.lower(): - prank_exe = "prank" -if not prank_exe: - raise MissingExternalDependencyError( - "Install PRANK if you want to use the Bio.Align.Applications wrapper." - ) - - -class PrankApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/fa01" - - def tearDown(self): - """Remove generated files. - - output.1.dnd output.1.fas output.1.xml output.2.dnd output.2.fas output.2.xml - """ - if os.path.isfile("output.1.dnd"): - os.remove("output.1.dnd") - if os.path.isfile("output.1.fas"): - os.remove("output.1.fas") - if os.path.isfile("output.1.xml"): - os.remove("output.1.xml") - if os.path.isfile("output.2.dnd"): - os.remove("output.2.dnd") - if os.path.isfile("output.2.fas"): - os.remove("output.2.fas") - if os.path.isfile("output.2.xml"): - os.remove("output.2.xml") - if os.path.isfile("output.1.nex"): - os.remove("output.1.nex") - if os.path.isfile("output.2.nex"): - os.remove("output.2.nex") - - def test_Prank_simple(self): - """Simple round-trip through app with infile. - - output.?.??? 
files written to cwd - no way to redirect - """ - cmdline = PrankCommandline(prank_exe) - cmdline.set_parameter("d", self.infile1) - self.assertEqual(str(cmdline), _escape_filename(prank_exe) + " -d=Fasta/fa01") - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - output, error = cmdline() - self.assertEqual(error, "") - self.assertIn("Total time", output) - - def test_Prank_simple_with_NEXUS_output(self): - """Simple round-trip through app with infile, output in NEXUS. - - output.?.??? files written to cwd - no way to redirect - """ - records = list(SeqIO.parse(self.infile1, "fasta")) - # Try using keyword argument, - cmdline = PrankCommandline(prank_exe, d=self.infile1) - # Try using a property, - cmdline.d = self.infile1 - cmdline.f = 17 # NEXUS format - cmdline.set_parameter("dots", True) - self.assertEqual( - str(cmdline), _escape_filename(prank_exe) + " -d=Fasta/fa01 -f=17 -dots" - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdout, stderr = cmdline() - self.assertIn("Total time", stdout) - self.assertEqual(stderr, "") - try: - if os.path.isfile("output.best.nex"): - # Prank v.130820 and perhaps earlier use ".best.*" output names - nex_fname = "output.best.nex" - elif os.path.isfile("output.2.nex"): - # Older Prank versions use ".2.*" output names - nex_fname = "output.2.nex" - else: - raise RuntimeError("Can't find PRANK's NEXUS output (*.nex)") - align = AlignIO.read(nex_fname, "nexus") - for old, new in zip(records, align): - # Old versions of Prank reduced name to 9 chars - self.assertTrue(old.id == new.id or old.id[:9] == new.id) - # infile1 has alignment gaps in it - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - except NexusError: - # See bug 3119, - # Bio.Nexus can't parse output from prank v100701 (1 July 2010) - pass - - def test_Prank_complex_command_line(self): - """Round-trip with complex command line.""" - cmdline = PrankCommandline(prank_exe) - cmdline.set_parameter("d", 
self.infile1) - cmdline.set_parameter("-gaprate", 0.321) - cmdline.set_parameter("gapext", 0.6) - cmdline.set_parameter("-dots", 1) # i.e. True - # Try using a property: - cmdline.kappa = 3 - cmdline.skipins = True - cmdline.set_parameter("-once", True) - cmdline.realbranches = True - self.assertEqual( - str(cmdline), - _escape_filename(prank_exe) - + " -d=Fasta/fa01" - + " -dots -gaprate=0.321 -gapext=0.6 -kappa=3" - + " -once -skipins -realbranches", - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdout, stderr = cmdline() - self.assertIn("Total time", stdout) - - -class PrankConversion(unittest.TestCase): - def setUp(self): - # As these reads are all 36, it can be seen as pre-aligned: - self.input = "Quality/example.fasta" - self.output = "temp with space" # prefix, PRANK will pick extensions - - def conversion(self, prank_number, prank_ext, format): - """Get PRANK to do a conversion, and check it with SeqIO.""" - filename = f"{self.output}.{prank_ext}" - if os.path.isfile(filename): - os.remove(filename) - cmdline = PrankCommandline( - prank_exe, - d=self.input, - convert=True, - f=prank_number, - o=f'"{self.output}"', - ) - self.assertEqual( - str(cmdline), - _escape_filename(prank_exe) - + f' -d={self.input} -o="{self.output}" -f={prank_number} -convert', - ) - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - message, error = cmdline() - self.assertIn("PRANK", message) - self.assertIn((f"converting '{self.input}' to '{filename}'"), message, message) - self.assertEqual(error, "") - self.assertTrue(os.path.isfile(filename)) - old = AlignIO.read(self.input, "fasta") - # Hack... 
- if format == "phylip": - for record in old: - record.id = record.id[:10] - new = AlignIO.read(filename, format) - self.assertEqual(len(old), len(new)) - for old_r, new_r in zip(old, new): - self.assertEqual(old_r.id, new_r.id) - self.assertEqual(old_r.seq, new_r.seq) - os.remove(filename) - - def test_convert_to_fasta(self): - """Convert FASTA to FASTA format.""" - self.conversion(8, "fas", "fasta") - - # Prank v.100701 seems to output an invalid file here... - # def test_convert_to_phylip32(self): - # """Convert FASTA to PHYLIP 3.2 format.""" - # self.conversion(11, "phy", "phylip") - - def test_convert_to_phylip(self): - """Convert FASTA to PHYLIP format.""" - self.conversion(12, "phy", "phylip") - - # PRANK truncated the record names in the matrix block. An error? - # def test_convert_to_paup_nexus(self): - # """Convert FASTA to PAUP/NEXUS.""" - # self.conversion(17, "nex", "nexus") - - # We don't support format 18, PAML - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_Probcons_tool.py b/Tests/test_Probcons_tool.py deleted file mode 100644 index d94834b6d..000000000 --- a/Tests/test_Probcons_tool.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
-"""Unittests for Bio.Align.Applications interface for PROBCONS.""" - -import os -import sys -import unittest -import warnings -from io import StringIO - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import ProbconsCommandline - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -probcons_exe = None -if sys.platform == "win32": - raise MissingExternalDependencyError("PROBCONS not available on Windows") -else: - from subprocess import getoutput - - output = getoutput("probcons") - if "not found" not in output and "not recognized" not in output: - if "probcons" in output.lower(): - probcons_exe = "probcons" - -if not probcons_exe: - raise MissingExternalDependencyError( - "Install PROBCONS if you want to use the Bio.Align.Applications wrapper." 
- ) - - -class ProbconsApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/fa01" - self.annotation_outfile = "Fasta/probcons_annot.out" - - def tearDown(self): - if os.path.isfile(self.annotation_outfile): - os.remove(self.annotation_outfile) - - def test_Probcons_alignment_fasta(self): - """Round-trip through app and read fasta alignment from stdout.""" - cmdline = ProbconsCommandline(probcons_exe, input=self.infile1) - self.assertEqual(str(cmdline), probcons_exe + " Fasta/fa01") - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdout, stderr = cmdline() - self.assertTrue(stderr.startswith("\nPROBCONS")) - align = AlignIO.read(StringIO(stdout), "fasta") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_Probcons_alignment_clustalw(self): - """Round-trip through app and read clustalw alignment from stdout.""" - cmdline = ProbconsCommandline(probcons_exe) - cmdline.set_parameter("input", "Fasta/fa01") - cmdline.clustalw = True - self.assertEqual(str(cmdline), probcons_exe + " -clustalw Fasta/fa01") - self.assertEqual(str(eval(repr(cmdline))), str(cmdline)) - stdout, stderr = cmdline() - self.assertTrue(stderr.strip().startswith("PROBCONS")) - align = AlignIO.read(StringIO(stdout), "clustal") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_Probcons_complex_commandline(self): - """Round-trip through app with complex command line and output file.""" - cmdline = ProbconsCommandline(probcons_exe, pre=1) - cmdline.set_parameter("input", "Fasta/fa01") - cmdline.consistency = 4 - 
cmdline.set_parameter("--iterative-refinement", 222) - cmdline.set_parameter("a", True) - cmdline.annot = self.annotation_outfile - self.assertEqual( - str(cmdline), - probcons_exe - + " -c 4 -ir 222 -pre 1 -annot Fasta/probcons_annot.out -a Fasta/fa01", - ) - stdout, stderr = cmdline() - self.assertTrue(stderr.startswith("\nPROBCONS")) - self.assertTrue(stdout.startswith(">AK1H_ECOLI/1-378")) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_TCoffee_tool.py b/Tests/test_TCoffee_tool.py deleted file mode 100644 index 20ab1c08a..000000000 --- a/Tests/test_TCoffee_tool.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2009 by Cymon J. Cox. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. -"""Unittests for Bio.Align.Applications interface for TCOFFEE.""" - -import os -import sys -import unittest -import warnings - -from Bio import AlignIO -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Align.Applications import TCoffeeCommandline - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -t_coffee_exe = None -if sys.platform == "win32": - raise MissingExternalDependencyError("Testing TCOFFEE on Windows not supported yet") -else: - from subprocess import getoutput - - output = getoutput("t_coffee -version") - if "not found" not in output and "not recognized" not in output: - if "t_coffee" in output.lower() or "t-coffee" in output.lower(): - t_coffee_exe = "t_coffee" - -if not t_coffee_exe: - raise MissingExternalDependencyError( - "Install TCOFFEE if you want to use the Bio.Align.Applications wrapper." 
- ) - - -class TCoffeeApplication(unittest.TestCase): - def setUp(self): - self.infile1 = "Fasta/fa01" - # TODO: Use a temp dir for the output files: - self.outfile1 = "fa01.aln" - self.outfile2 = "fa01.html" # Written by default when no output set - self.outfile3 = "Fasta/tc_out.pir" - self.outfile4 = "Fasta/tc_out.aln" - self.outfile5 = "Fasta/tc_out.phy" - self.outfile6 = "Fasta/tc_out.msf" - - def tearDown(self): - if os.path.isfile(self.outfile1): - os.remove(self.outfile1) - if os.path.isfile(self.outfile2): - os.remove(self.outfile2) - if os.path.isfile(self.outfile3): - os.remove(self.outfile3) - if os.path.isfile(self.outfile4): - os.remove(self.outfile4) - if os.path.isfile(self.outfile5): - os.remove(self.outfile5) - - def test_TCoffee_fasta(self): - """Round-trip through app and read clustal alignment from file.""" - cmdline = TCoffeeCommandline(t_coffee_exe, infile=self.infile1) - self.assertEqual(str(cmdline), t_coffee_exe + " -infile Fasta/fa01") - stdout, stderr = cmdline() - self.assertTrue(stderr.strip().startswith("PROGRAM: T-COFFEE")) - align = AlignIO.read(self.outfile1, "clustal") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_TCoffee_pir(self): - """Round-trip through app and read pir alignment from file.""" - cmdline = TCoffeeCommandline(t_coffee_exe, quiet=True) - cmdline.infile = self.infile1 - cmdline.outfile = self.outfile3 - cmdline.output = "pir_aln" - self.assertEqual( - str(cmdline), - t_coffee_exe - + " -output pir_aln -infile Fasta/fa01 -outfile Fasta/tc_out.pir -quiet", - ) - stdout, stderr = cmdline() - # Can get warnings in stderr output - self.assertNotIn("error", stderr.lower(), stderr) - align = AlignIO.read(self.outfile3, "pir") - records = list(SeqIO.parse(self.infile1, "fasta")) - 
self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_TCoffee_clustalw(self): - """Round-trip through app and read clustalw alignment from file.""" - cmdline = TCoffeeCommandline(t_coffee_exe, gapopen=-2) - cmdline.infile = self.infile1 - cmdline.outfile = self.outfile4 - cmdline.set_parameter("output", "clustalw_aln") - cmdline.outorder = "input" - cmdline.set_parameter("gapext", -5) - cmdline.type = "protein" - self.assertEqual( - str(cmdline), - t_coffee_exe - + " -output clustalw_aln -infile Fasta/fa01 -outfile Fasta/tc_out.aln " - "-type protein -outorder input -gapopen -2 -gapext -5", - ) - stdout, stderr = cmdline() - self.assertTrue(stderr.strip().startswith("PROGRAM: T-COFFEE")) - align = AlignIO.read(self.outfile4, "clustal") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_TCoffee_phylip(self): - """Round-trip through app and read PHYLIP alignment from file.""" - cmdline = TCoffeeCommandline( - t_coffee_exe, - infile=self.infile1, - outfile=self.outfile5, - quiet=True, - output="phylip_aln", - ) - self.assertEqual( - str(cmdline), - t_coffee_exe + " -output phylip_aln " - "-infile Fasta/fa01 -outfile Fasta/tc_out.phy -quiet", - ) - stdout, stderr = cmdline() - # Can get warnings in stderr output - self.assertNotIn("error", stderr.lower(), stderr) - align = AlignIO.read(self.outfile5, "phylip") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - # TCoffee does strict 10 character truncation as per original PHYLIP - self.assertEqual(old.id[:10], new.id[:10]) - self.assertEqual( - 
str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - def test_TCoffee_msf(self): - """Round-trip through app and read GCG MSF alignment from file.""" - cmdline = TCoffeeCommandline( - t_coffee_exe, - infile=self.infile1, - outfile=self.outfile6, - quiet=True, - output="msf_aln", - ) - self.assertEqual( - str(cmdline), - t_coffee_exe - + " -output msf_aln -infile Fasta/fa01 -outfile Fasta/tc_out.msf -quiet", - ) - stdout, stderr = cmdline() - # Can get warnings in stderr output - self.assertNotIn("error", stderr.lower(), stderr) - align = AlignIO.read(self.outfile6, "msf") - records = list(SeqIO.parse(self.infile1, "fasta")) - self.assertEqual(len(records), len(align)) - for old, new in zip(records, align): - self.assertEqual(old.id, new.id) - self.assertEqual( - str(new.seq).replace("-", ""), str(old.seq).replace("-", "") - ) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_XXmotif_tool.py b/Tests/test_XXmotif_tool.py deleted file mode 100644 index 9f6832a5a..000000000 --- a/Tests/test_XXmotif_tool.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2012 by Christian Brueffer. All rights reserved. -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Tests for XXmotif tool.""" - -import glob -import os -import shutil -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import SeqIO - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import ApplicationError - from Bio.motifs.applications import XXmotifCommandline - - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -xxmotif_exe = None -if sys.platform == "win32": - # TODO - raise MissingExternalDependencyError( - "Testing this on Windows is not implemented yet" - ) -else: - from subprocess import getoutput - - output = getoutput("XXmotif") - if output.find("== XXmotif version") != -1: - xxmotif_exe = "XXmotif" - -if not xxmotif_exe: - raise MissingExternalDependencyError( - "Install XXmotif if you want to use XXmotif from Biopython." - ) - - -class XXmotifTestCase(unittest.TestCase): - def setUp(self): - self.out_dir = "xxmotif-temp" - self.files_to_clean = set() - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - - if os.path.isdir(self.out_dir): - shutil.rmtree(self.out_dir) - - def standard_test_procedure(self, cline): - """Shared test procedure used by all tests.""" - output, error = cline() - - self.assertTrue(os.path.isdir(self.out_dir)) - self.assertTrue(glob.glob(os.path.join(self.out_dir, "*.meme"))) - self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_MotifFile.txt"))) - self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_Pvals.txt"))) - self.assertTrue(glob.glob(os.path.join(self.out_dir, "*.pwm"))) - self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_sequence.txt"))) - - # TODO - # Parsing the MEME file would be nice, but unfortunately the - # MEME parser does not like what XXmotif produces yet. 
- - def copy_and_mark_for_cleanup(self, path): - """Copy file to working directory and marks it for removal. - - XXmotif currently only handles a canonical filename as input, no paths. - This method copies the specified file in the specified path to the - current working directory and marks it for removal. - """ - filename = os.path.split(path)[1] - - shutil.copyfile(path, filename) - self.add_file_to_clean(filename) - - return filename - - def add_file_to_clean(self, filename): - """Add a file for deferred removal by the tearDown routine.""" - self.files_to_clean.add(filename) - - -class XXmotifTestErrorConditions(XXmotifTestCase): - def test_empty_file(self): - """Test a non-existing input file.""" - input_file = "does_not_exist.fasta" - self.assertFalse(os.path.isfile(input_file)) - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - try: - stdout, stderr = cline() - except ApplicationError as err: - self.assertEqual(err.returncode, 255) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_invalid_format(self): - """Test an input file in an invalid format.""" - input_file = self.copy_and_mark_for_cleanup("Medline/pubmed_result1.txt") - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - try: - stdout, stderr = cline() - except ApplicationError as err: - self.assertEqual(err.returncode, 255) - else: - self.fail(f"Should have failed, returned:\n{stdout}\n{stderr}") - - def test_output_directory_with_space(self): - """Test an output directory containing a space.""" - temp_out_dir = "xxmotif test" - input_file = self.copy_and_mark_for_cleanup("Fasta/f002") - - try: - XXmotifCommandline(outdir=temp_out_dir, seqfile=input_file) - except ValueError: - pass - else: - self.fail("expected ValueError") - - -class XXmotifTestNormalConditions(XXmotifTestCase): - def test_fasta_one_sequence(self): - """Test a fasta input file containing only one sequence.""" - record = 
list(SeqIO.parse("Registry/seqs.fasta", "fasta"))[0] - input_file = "seq.fasta" - with open(input_file, "w") as handle: - SeqIO.write(record, handle, "fasta") - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - def test_properties(self): - """Test setting options via properties.""" - input_file = self.copy_and_mark_for_cleanup("Fasta/f002") - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - cline.revcomp = True - cline.pseudo = 20 - cline.startmotif = "ACGGGT" - - self.standard_test_procedure(cline) - - def test_large_fasta_file(self): - """Test a large fasta input file.""" - records = list(SeqIO.parse("NBRF/B_nuc.pir", "pir")) - input_file = "temp_b_nuc.fasta" - with open(input_file, "w") as handle: - SeqIO.write(records, handle, "fasta") - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - def test_input_filename_with_space(self): - """Test an input filename containing a space.""" - records = SeqIO.parse("Phylip/hennigian.phy", "phylip") - input_file = "temp horses.fasta" - with open(input_file, "w") as handle: - SeqIO.write(records, handle, "fasta") - - cline = XXmotifCommandline(outdir=self.out_dir, seqfile=input_file) - - self.add_file_to_clean(input_file) - self.standard_test_procedure(cline) - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_phyml_tool.py b/Tests/test_phyml_tool.py deleted file mode 100644 index 5384f5cbb..000000000 --- a/Tests/test_phyml_tool.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (C) 2012 by Eric Talevich. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Unit tests for Bio.Phylo.Applications wrappers.""" - -import os -import sys -import unittest -import warnings -from subprocess import getoutput - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import Phylo - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Phylo.Applications import PhymlCommandline - - -# Try to avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -phyml_exe = None -exe_name = "PhyML-3.1_win32.exe" if sys.platform == "win32" else "phyml" - -output = getoutput(exe_name + " --version") -# Looks like this: -# . This is PhyML version 20120412. -if "20" in output and "PhyML" in output: - phyml_exe = exe_name - -if not phyml_exe: - raise MissingExternalDependencyError( - "Couldn't find the PhyML software. Install PhyML 3.0 or later if you want " - "to use the Bio.Phylo.Applications wrapper." - ) - - -# Example Phylip file with 4 aligned protein sequences -EX_PHYLIP = "Phylip/interlaced2.phy" - - -class AppTests(unittest.TestCase): - """Tests for application wrappers.""" - - def test_phyml(self): - """Run PhyML using the wrapper.""" - # Stabilize phyml tests by running in single threaded mode by default. - # Note: PHYMLCPUS environment is specific to Debian and derivatives. 
- if not os.getenv("PHYMLCPUS"): - os.putenv("PHYMLCPUS", "1") - cmd = PhymlCommandline(phyml_exe, input=EX_PHYLIP, datatype="aa") - # Smoke test - try: - out, err = cmd() - self.assertGreater(len(out), 0) - self.assertEqual(len(err), 0) - # Check the output tree - outfname = EX_PHYLIP + "_phyml_tree.txt" - if not os.path.isfile(outfname): - # NB: Briefly, PhyML dropped the .txt suffix (#919) - outfname = outfname[:-4] - tree = Phylo.read(outfname, "newick") - self.assertEqual(tree.count_terminals(), 4) - except Exception as exc: - self.fail(f"PhyML wrapper error: {exc}") - finally: - # Clean up generated files - for suffix in [ - "_phyml_tree.txt", - "_phyml_tree", - "_phyml_stats.txt", - "_phyml_stats", - ]: - fname = EX_PHYLIP + suffix - if os.path.isfile(fname): - os.remove(fname) - - -# --------------------------------------------------------- - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_raxml_tool.py b/Tests/test_raxml_tool.py deleted file mode 100644 index 9e5d15c2c..000000000 --- a/Tests/test_raxml_tool.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (C) 2012 by Eric Talevich. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. 
- -"""Unit tests for Bio.Phylo.Applications wrappers.""" - -import os -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError -from Bio import Phylo - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Phylo.Applications import RaxmlCommandline - - -raxml_exe = None -try: - from subprocess import getoutput - - output = getoutput("raxmlHPC -v") - if "not found" not in output and "not recognized" not in output: - if "This is RAxML" in output: - raxml_exe = "raxmlHPC" -except FileNotFoundError: - pass - -if not raxml_exe: - raise MissingExternalDependencyError( - "Install RAxML (binary raxmlHPC) if you want" - " to test the Bio.Phylo.Applications wrapper." - ) - -# Example Phylip file with 4 aligned protein sequences -EX_PHYLIP = "Phylip/interlaced2.phy" - - -class AppTests(unittest.TestCase): - """Tests for application wrappers.""" - - def test_raxml(self): - """Run RAxML using the wrapper.""" - cmd = RaxmlCommandline( - raxml_exe, sequences=EX_PHYLIP, model="PROTCATWAG", name="test" - ) - # The parsimony seed should be set automatically - self.assertIn("-p", str(cmd)) - # Smoke test - try: - out, err = cmd() - self.assertGreater(len(out), 0) - self.assertEqual(len(err), 0) - # Check the output tree - tree = Phylo.read("RAxML_result.test", "newick") - self.assertEqual(tree.count_terminals(), 4) - finally: - # Remove RAxML-generated files, or RAxML will complain bitterly - # during the next run - for fname in [ - "RAxML_info.test", - "RAxML_log.test", - "RAxML_parsimonyTree.test", - "RAxML_result.test", - # Present in 7.2.X+ but not 7.0.4: - "RAxML_bestTree.test", - ]: - if os.path.isfile(fname): - os.remove(fname) - - -# --------------------------------------------------------- - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/Tests/test_samtools_tool.py 
b/Tests/test_samtools_tool.py deleted file mode 100644 index 198beb141..000000000 --- a/Tests/test_samtools_tool.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright 2014 by Saket Choudhary. Based on test_Clustalw_tool.py by Peter -# Cock . -# -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. - -# Last Checked with samtools [0.1.18 (r982:295)] - -"""Tests for samtools tool.""" - -import os -import sys -import unittest -import warnings - -from Bio import BiopythonDeprecationWarning -from Bio import MissingExternalDependencyError - -with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=BiopythonDeprecationWarning) - from Bio.Application import ApplicationError - from Bio.Sequencing.Applications import SamtoolsCalmdCommandline - from Bio.Sequencing.Applications import SamtoolsCatCommandline - from Bio.Sequencing.Applications import SamtoolsFaidxCommandline - from Bio.Sequencing.Applications import SamtoolsIdxstatsCommandline - from Bio.Sequencing.Applications import SamtoolsIndexCommandline - from Bio.Sequencing.Applications import SamtoolsMergeCommandline - from Bio.Sequencing.Applications import SamtoolsMpileupCommandline - from Bio.Sequencing.Applications import SamtoolsSortCommandline - from Bio.Sequencing.Applications import SamtoolsVersion1xSortCommandline - from Bio.Sequencing.Applications import SamtoolsViewCommandline - -# TODO from Bio.Sequencing.Applications import SamtoolsPhaseCommandline -# TODO from Bio.Sequencing.Applications import SamtoolsReheaderCommandline -# TODO from Bio.Sequencing.Applications import SamtoolsRmdupCommandline -# TODO from Bio.Sequencing.Applications import SamtoolsTargetcutCommandline -# TODO from Bio.Sequencing.Applications import SamtoolsFixmateCommandline -################################################################# -SamtoolsVersion0xSortCommandline = SamtoolsSortCommandline -# Try to 
avoid problems when the OS is in another language -os.environ["LANG"] = "C" - -samtools_exe = None -if sys.platform == "win32": - # TODO - Check the path? - try: - # This can vary depending on the Windows language. - prog_files = os.environ["PROGRAMFILES"] - except KeyError: - prog_files = r"C:\Program Files" - # By default tries C:\Program Files\samtools\samtools.exe - # or C:\Program Files\samtools.exe was chosen - likely_dirs = ["samtools", ""] - likely_exes = ["samtools.exe"] - for folder in likely_dirs: - if os.path.isdir(os.path.join(prog_files, folder)): - for filename in likely_exes: - if os.path.isfile(os.path.join(prog_files, folder, filename)): - samtools_exe = os.path.join(prog_files, folder, filename) - break - if samtools_exe: - break -else: - from subprocess import getoutput - - output = getoutput("samtools") - - # Since "not found" may be in another language, try and be sure this is - # really the samtools tool's output - if ( - "not found" not in output - and "samtools (Tools for alignments in the SAM format)" in output - ): - samtools_exe = "samtools" - -if not samtools_exe: - raise MissingExternalDependencyError( - "Install samtools and correctly set the file path to " - "the program if you want to use it from Biopython" - ) - - -class SamtoolsTestCase(unittest.TestCase): - """Class for implementing Samtools test cases.""" - - def setUp(self): - self.files_to_clean = set() - self.samfile1 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "sam1.sam" - ) - self.reference = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "BWA", - "human_g1k_v37_truncated.fasta", - ) - self.referenceindexfile = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "BWA", - "human_g1k_v37_truncated.fasta.fai", - ) - self.samfile2 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "sam2.sam" - ) - self.bamfile1 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1.bam" - ) - 
self.bamfile2 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam2.bam" - ) - self.outsamfile = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "out.sam" - ) - self.outbamfile = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "out.bam" - ) - self.bamindexfile1 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1.bam.bai" - ) - self.sortedbamfile1 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam1_sorted.bam" - ) - self.sortedbamfile2 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "SamBam", "bam2_sorted.bam" - ) - self.files_to_clean = [ - self.referenceindexfile, - self.bamindexfile1, - self.outbamfile, - ] - - def tearDown(self): - for filename in self.files_to_clean: - if os.path.isfile(filename): - os.remove(filename) - - def test_view(self): - """Test for samtools view.""" - cmdline = SamtoolsViewCommandline(samtools_exe) - cmdline.set_parameter("input_file", self.bamfile1) - stdout_bam, stderr_bam = cmdline() - self.assertTrue( - stderr_bam.startswith(""), - f"SAM file viewing failed: \n{cmdline}\nStdout:{stdout_bam}", - ) - cmdline.set_parameter("input_file", self.samfile1) - cmdline.set_parameter("S", True) - stdout_sam, stderr_sam = cmdline() - self.assertTrue( - stdout_sam.startswith("HWI-1KL120:88:D0LRBACXX:1:1101:1780:2146"), - f"SAM file viewing failed:\n{cmdline}\nStderr:{stderr_sam}", - ) - - def create_fasta_index(self): - """Create index for reference fasta sequence.""" - cmdline = SamtoolsFaidxCommandline(samtools_exe) - cmdline.set_parameter("reference", self.reference) - stdout, stderr = cmdline() - - def create_bam_index(self, input_bam): - """Create index of an input bam file.""" - cmdline = SamtoolsIndexCommandline(samtools_exe) - cmdline.set_parameter("input_bam", input_bam) - stdout, stderr = cmdline() - - def test_faidx(self): - cmdline = SamtoolsFaidxCommandline(samtools_exe) - 
cmdline.set_parameter("reference", self.reference) - stdout, stderr = cmdline() - self.assertFalse(stderr, f"Samtools faidx failed:\n{cmdline}\nStderr:{stderr}") - self.assertTrue(os.path.isfile(self.referenceindexfile)) - - def test_calmd(self): - """Test for samtools calmd.""" - self.create_fasta_index() - cmdline = SamtoolsCalmdCommandline(samtools_exe) - cmdline.set_parameter("reference", self.reference) - cmdline.set_parameter("input_bam", self.bamfile1) - # If there is no index file for the reference - # samtools calmd creates one at the time of calling - - if os.path.exists(self.referenceindexfile): - # print("exists") - stderr_calmd_expected = "" - else: - # print("doesn't exist") - stderr_calmd_expected = "[fai_load] build FASTA index.\n" - stdout, stderr = cmdline() - self.assertEqual(stderr, stderr_calmd_expected) - - def test_cat(self): - cmdline = SamtoolsCatCommandline(samtools_exe) - cmdline.set_parameter("o", self.outbamfile) - cmdline.set_parameter("input_bam", [self.bamfile1, self.bamfile2]) - stdout, stderr = cmdline() - self.assertEqual(stderr, "") - - # TODO: def test_fixmate(self): - - def test_sort(self): - cmdline = SamtoolsVersion0xSortCommandline(samtools_exe) - cmdline.set_parameter("input", self.bamfile1) - cmdline.set_parameter("out_prefix", "SamBam/out") - - try: - stdout, stderr = cmdline() - except ApplicationError as err: - if ( - "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files" - in str(err) - ): - cmdline = SamtoolsVersion1xSortCommandline(samtools_exe) - cmdline.set_parameter("input", self.bamfile1) - cmdline.set_parameter("-T", "out") - cmdline.set_parameter("-o", "out.bam") - try: - stdout, stderr = cmdline() - except ApplicationError: - raise - else: - raise - self.assertFalse(stderr, f"Samtools sort failed:\n{cmdline}\nStderr:{stderr}") - - def test_index(self): - cmdline = SamtoolsIndexCommandline(samtools_exe) - cmdline.set_parameter("input_bam", self.bamfile1) - stdout, stderr = cmdline() - 
self.assertFalse(stderr, f"Samtools index failed:\n{cmdline}\nStderr:{stderr}") - self.assertTrue(os.path.exists(self.bamindexfile1)) - - def test_idxstats(self): - self.create_bam_index(self.bamfile1) - cmdline = SamtoolsIdxstatsCommandline(samtools_exe) - cmdline.set_parameter("input_bam", self.bamfile1) - stdout, stderr = cmdline() - self.assertFalse( - stderr, f"Samtools idxstats failed:\n{cmdline}\nStderr:{stderr}" - ) - - def test_merge(self): - cmdline = SamtoolsMergeCommandline(samtools_exe) - cmdline.set_parameter("input_bam", [self.bamfile1, self.bamfile2]) - cmdline.set_parameter("out_bam", self.outbamfile) - cmdline.set_parameter("f", True) # Overwrite out.bam if it exists - stdout, stderr = cmdline() - # Worked up to v1.2, then there was a regression failing with message - # but as of v1.3 expect a warning: [W::bam_merge_core2] No @HD tag found. - self.assertTrue( - not stderr or stderr.strip() == "[W::bam_merge_core2] No @HD tag found.", - f"Samtools merge failed:\n{cmdline}\nStderr:{stderr}", - ) - self.assertTrue(os.path.exists(self.outbamfile)) - - def test_mpileup(self): - cmdline = SamtoolsMpileupCommandline(samtools_exe) - cmdline.set_parameter("input_file", [self.bamfile1]) - stdout, stderr = cmdline() - self.assertNotIn("[bam_pileup_core]", stdout) - - def test_mpileup_list(self): - cmdline = SamtoolsMpileupCommandline(samtools_exe) - cmdline.set_parameter("input_file", [self.sortedbamfile1, self.sortedbamfile2]) - stdout, stderr = cmdline() - self.assertNotIn("[bam_pileup_core]", stdout) - - # TODO: def test_phase(self): - # TODO: def test_reheader(self): - # TODO: def test_rmdup(self): - # TODO: def test_targetcut(self): - - -if __name__ == "__main__": - runner = unittest.TextTestRunner(verbosity=2) - unittest.main(testRunner=runner) diff --git a/setup.py b/setup.py index a0a4f7910..813876160 100644 --- a/setup.py +++ b/setup.py @@ -126,11 +126,9 @@ PACKAGES = [ "Bio", "Bio.Affy", "Bio.Align", - "Bio.Align.Applications", 
"Bio.Align.substitution_matrices", "Bio.AlignIO", "Bio.Alphabet", - "Bio.Application", "Bio.Blast", "Bio.CAPS", "Bio.Cluster", @@ -154,7 +152,6 @@ PACKAGES = [ "Bio.KEGG.KGML", "Bio.Medline", "Bio.motifs", - "Bio.motifs.applications", "Bio.motifs.jaspar", "Bio.Nexus", "Bio.NMR", @@ -176,12 +173,10 @@ PACKAGES = [ "Bio.SeqIO", "Bio.SeqUtils", "Bio.Sequencing", - "Bio.Sequencing.Applications", "Bio.SVDSuperimposer", "Bio.SwissProt", "Bio.TogoWS", "Bio.Phylo", - "Bio.Phylo.Applications", "Bio.Phylo.PAML", "Bio.UniGene", "Bio.UniProt",